From ff41f4f2e308a82c42dc992cfc1eaadaf79f4546 Mon Sep 17 00:00:00 2001
From: Morgan
Date: Wed, 16 Oct 2019 22:18:13 +0200
Subject: [PATCH] use lynx and eliminate noise in top_words

Render URLs as text with `lynx --dump` when lynx is installed, falling
back to curl otherwise, and strip bracketed text and http(s) URLs before
counting words.

The filter pipeline shared by the README and URL paths lives in
count_words(), which reads stdin. No command string is assembled or
passed to eval, so a URL containing '&', ';' or spaces is handed to
lynx/curl as a single argument instead of being re-parsed by the shell.
---
Note: hand-edited from the original format-patch output. Hunk counts and
the diffstat match the content below; the post-image blob id on the
"index" line was not recomputed, and the leading whitespace of context and
removed lines is reconstructed (2-space indent). If the target file indents
differently, apply with `git apply --ignore-whitespace`.

 top_words.sh | 47 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/top_words.sh b/top_words.sh
index 866a82c..a654269 100755
--- a/top_words.sh
+++ b/top_words.sh
@@ -3,6 +3,38 @@ set -o errexit
 set -o nounset
 set -o pipefail
 
+# Reads text on stdin and prints one "<count> <word>" line per distinct word,
+# sorted by ascending count. Bracketed text and http(s) URLs are stripped
+# first, only runs of 3+ ASCII letters count as words, and words listed in
+# stopwords.txt (resolved against the current directory) are dropped.
+count_words() {
+  sed -e 's/\[.*\]//g' \
+    | sed 's!http\(s\)\{0,1\}://[^[:space:]]*!!g' \
+    | \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
+    | tr '[:upper:]' '[:lower:]' \
+    | \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
+    | sort \
+    | uniq -c \
+    | sort -n
+}
+
+# Writes the word counts for the page at URL $1 to stdout. The page is
+# rendered as text with lynx when it is installed; otherwise the raw curl
+# response is counted. The URL is always passed as a single quoted argument,
+# so characters such as '&' and ';' in it are never interpreted by the shell.
+fetch_page_content() {
+  local url="$1"
+
+  if command -v lynx > /dev/null; then
+    # NOTE(review): -hiddenlinks is passed twice (ignore, then merge);
+    # confirm which value is intended and drop the other.
+    lynx --dump -nolist -hiddenlinks=ignore -nonumbers -hiddenlinks=merge "$url" \
+      | count_words
+  else
+    curl "$url" | count_words
+  fi
+}
+
 fetch_top_words() {
   local url
   local basedir
@@ -19,20 +51,9 @@ fetch_top_words() {
   fi
 
   if [[ "${url:-}" = "" ]]; then
-    \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' "$basedir/README.md" \
-      | tr '[:upper:]' '[:lower:]' \
-      | \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
-      | sort \
-      | uniq -c \
-      | sort -n
+    count_words < "$basedir/README.md"
   else
-    curl "$url" \
-      | \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
-      | tr '[:upper:]' '[:lower:]' \
-      | \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
-      | sort \
-      | uniq -c \
-      | sort -n
+    fetch_page_content "$url"
   fi
 }
 fetch_top_words "$@"