From ee13f58a35b6df347be439a96ce3ef856c8304d1 Mon Sep 17 00:00:00 2001
From: Morgan
Date: Tue, 22 Oct 2019 08:16:21 +0200
Subject: [PATCH] improve helper for computing top words

---
 top_words.sh | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/top_words.sh b/top_words.sh
index 4236c90..366959f 100755
--- a/top_words.sh
+++ b/top_words.sh
@@ -8,6 +8,7 @@ sed_bin="LANG=C LC_CTYPE=C sed" # workaround for the illegal byte sequence on Ma
 # and https://unix.stackexchange.com/questions/141420/tr-complains-of-illegal-byte-sequence
 cmd_args=" | $sed_bin -e 's/\[.*\]//g' \
   | $sed_bin 's!http\(s\)\{0,1\}://[^[:space:]]*!!g' \
+  | $sed_bin -e 's/\(BUTTON\)//g' \
   | \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
   | tr '[:upper:]' '[:lower:]' \
   | \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
@@ -17,12 +18,18 @@ cmd_args=" | $sed_bin -e 's/\[.*\]//g' \
 
 fetch_page_content() {
   if type "lynx" > /dev/null; then
-    cmd_bin="lynx --dump -nolist -hiddenlinks=ignore -nonumbers -hiddenlinks=merge "$url""
+    cmd_bin="lynx --dump -nolist -hiddenlinks=ignore -nonumbers -hiddenlinks=merge $url"
     cmd="$cmd_bin $cmd_args"
     eval "$cmd"
   else
-    cmd_bin="curl "$url" "
-    cmd="$cmd_bin $cmd_args"
+    nohtml_args=" | \grep -oEi \">([^<>]*)<\" \
+    | \sed $'s,\x1b\\[[0-9;]*[a-zA-Z],,g' \
+    | \sed -E 's/>]//g' \
+    | \sed 's/{.*}//g' \
+    | awk 1 ORS=' '"
+    cmd_bin="curl $url"
+    cmd="$cmd_bin $nohtml_args $cmd_args"
     eval "$cmd"
   fi
 }