use lynx and eliminate noise in top_words

2019-10-16 22:18:13 +02:00
parent 27919298d9
commit ff41f4f2e3
1 changed files with 25 additions and 13 deletions
--- a/top_words.sh
+++ b/top_words.sh
@@ -3,6 +3,27 @@ set -o errexit
 # Strict mode: referencing an unset variable is an error, and a pipeline
 # fails when any of its stages fails.
 set -o nounset
 set -o pipefail
 # Shared word-counting pipeline, stored as text and executed through eval.
 # The value starts with "|", so it must be appended to a command that writes
 # the text to analyse on stdout. The backslash-newlines inside the double
 # quotes are line continuations, so the resulting value is a single line.
 # Stages, in order:
 #   1. sed   - delete bracketed text; ".*" is greedy, so everything from the
 #              first "[" to the last "]" on a line is removed.
 #              NOTE(review): this also drops words between two separate
 #              bracket pairs on one line - confirm that is intended.
 #   2. sed   - delete http:// and https:// URLs up to the next whitespace.
 #   3. grep  - emit every run of three or more ASCII letters on its own line.
 #   4. tr    - convert to lower case.
 #   5. grep  - drop words listed in stopwords.txt (fixed-string, whole-word
 #              match); the file is looked up relative to the current
 #              working directory.
 #   6. sort | uniq -c | sort -n - count each word and order by ascending count.
 # The leading backslash in "\grep" suppresses alias expansion.
 cmd_args=" | sed -e 's/\[.*\]//g' \
  | sed 's!http\(s\)\{0,1\}://[^[:space:]]*!!g' \
  | \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
  | tr '[:upper:]' '[:lower:]' \
  | \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
  | sort \
  | uniq -c \
  | sort -n"
 #######################################
 # Download a page and pipe its text through the word-counting pipeline.
 # Uses lynx (rendered text dump) when it is available, otherwise curl
 # (raw response body).
 # Globals:
 #   cmd_args (read) - pipeline suffix beginning with "|", run via eval
 #   url      (read) - fallback when no argument is given, for callers that
 #                     relied on the variable being visible from their scope
 # Arguments:
 #   $1 - URL of the page to fetch (optional if $url is set)
 # Outputs:
 #   Whatever the cmd_args pipeline writes to stdout.
 # Returns:
 #   1 when no URL is available; otherwise the status of the pipeline.
 #######################################
 fetch_page_content() {
   # Prefer the explicit argument; fall back to the caller's $url.
   local page_url="${1:-${url:-}}"
   local -a fetch_cmd

   if [[ -z "$page_url" ]]; then
     printf 'fetch_page_content: missing URL\n' >&2
     return 1
   fi

   # command -v stays silent when lynx is not installed.
   if command -v lynx > /dev/null 2>&1; then
     # NOTE(review): -hiddenlinks is passed twice (ignore, then merge), kept
     # exactly as before; confirm which value lynx honours and drop the other.
     fetch_cmd=(lynx --dump -nolist -hiddenlinks=ignore -nonumbers -hiddenlinks=merge "$page_url")
   else
     fetch_cmd=(curl "$page_url")
   fi

   # Only the trusted pipeline text in cmd_args is parsed by eval. The fetch
   # command is expanded from the array at eval time, so characters such as
   # "&", ";" or spaces inside the URL are passed through literally instead
   # of being interpreted as shell syntax.
   eval '"${fetch_cmd[@]}"'" ${cmd_args}"
 }
 fetch_top_words() {
  local url
  local basedir
@@ -19,20 +40,11 @@ fetch_top_words() {
  fi
  if [[ "${url:-}" = "" ]]; then
-    \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' "$basedir/README.md" \
+    cmd_bin="\cat "$basedir/README.md""
-    | tr '[:upper:]' '[:lower:]' \
+    cmd="$cmd_bin $cmd_args"
-    | \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
+    eval "$cmd"
    | sort \
    | uniq -c \
    | sort -n
  else
-    curl "$url" \
+    fetch_page_content "$url"
    | \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
    | tr '[:upper:]' '[:lower:]' \
    | \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
    | sort \
    | uniq -c \
    | sort -n
  fi
 }
 # Script entry point: forward every command-line argument to fetch_top_words.
 fetch_top_words "$@"