use lynx and eliminate noise in top_words

This commit is contained in:
Morgan 2019-10-16 22:18:13 +02:00
parent 27919298d9
commit ff41f4f2e3

View File

@ -3,6 +3,27 @@ set -o errexit
set -o nounset set -o nounset
set -o pipefail set -o pipefail
# Shared filter pipeline, stored as text and appended to a producer command
# before being run through eval. It starts with "|", so it is only meaningful
# as the tail of a pipeline. Stages:
#   1. drop bracketed spans such as "[1]" or "[link text]" -- each span is
#      matched on its own ([^]]*), so words between two spans on one line
#      are kept
#   2. drop http:// and https:// URLs
#   3. emit every run of three or more ASCII letters on its own line
#   4. lowercase
#   5. discard words listed in stopwords.txt (resolved relative to the
#      current working directory)
#   6. count occurrences and sort by ascending count
cmd_args=" | sed -e 's/\[[^]]*\]//g' -e 's!http\(s\)\{0,1\}://[^[:space:]]*!!g' \
  | \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
  | tr '[:upper:]' '[:lower:]' \
  | \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
  | sort \
  | uniq -c \
  | sort -n"
#######################################
# Fetch a page and run it through the pipeline stored in cmd_args.
# Uses a lynx text dump when lynx is available, otherwise plain curl output.
# Globals:   cmd_args (read) - pipeline text beginning with "|"
#            url (read)      - fallback URL when no argument is passed
# Arguments: $1 - URL to fetch
# Outputs:   whatever the cmd_args pipeline writes to stdout
# Returns:   1 if no URL is available, otherwise the pipeline's status
#######################################
fetch_page_content() {
  local target="${1:-${url:-}}"
  local -a fetch_cmd

  if [[ -z "$target" ]]; then
    printf 'fetch_page_content: missing URL\n' >&2
    return 1
  fi

  # command -v stays silent when lynx is absent (type would print an error).
  if command -v lynx > /dev/null 2>&1; then
    # NOTE(review): -hiddenlinks is passed twice with different values, kept
    # exactly as before; confirm which setting is actually intended.
    fetch_cmd=(lynx --dump -nolist -hiddenlinks=ignore -nonumbers -hiddenlinks=merge "$target")
  else
    fetch_cmd=(curl "$target")
  fi

  # Only the cmd_args text is parsed by eval. The fetch command is expanded
  # from the array at eval time, so the URL always stays one literal word and
  # can never be interpreted as shell syntax.
  eval '"${fetch_cmd[@]}"'" ${cmd_args}"
}
fetch_top_words() { fetch_top_words() {
local url local url
local basedir local basedir
@ -19,20 +40,11 @@ fetch_top_words() {
fi fi
if [[ "${url:-}" = "" ]]; then if [[ "${url:-}" = "" ]]; then
\grep --only-matching --extended-regexp '[a-zA-Z]{3,}' "$basedir/README.md" \ cmd_bin="\cat "$basedir/README.md""
| tr '[:upper:]' '[:lower:]' \ cmd="$cmd_bin $cmd_args"
| \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \ eval "$cmd"
| sort \
| uniq -c \
| sort -n
else else
curl "$url" \ fetch_page_content "$url"
| \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
| tr '[:upper:]' '[:lower:]' \
| \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
| sort \
| uniq -c \
| sort -n
fi fi
} }
fetch_top_words "$@" fetch_top_words "$@"