improve helper for computing top words
This commit is contained in:
parent
091adb5e82
commit
ee13f58a35
13
top_words.sh
13
top_words.sh
@@ -8,6 +8,7 @@ sed_bin="LANG=C LC_CTYPE=C sed" # workaround for the illegal byte sequence on Ma
|
|||||||
# and https://unix.stackexchange.com/questions/141420/tr-complains-of-illegal-byte-sequence
|
# and https://unix.stackexchange.com/questions/141420/tr-complains-of-illegal-byte-sequence
|
||||||
cmd_args=" | $sed_bin -e 's/\[.*\]//g' \
|
cmd_args=" | $sed_bin -e 's/\[.*\]//g' \
|
||||||
| $sed_bin 's!http\(s\)\{0,1\}://[^[:space:]]*!!g' \
|
| $sed_bin 's!http\(s\)\{0,1\}://[^[:space:]]*!!g' \
|
||||||
|
| $sed_bin -e 's/\(BUTTON\)//g' \
|
||||||
| \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
|
| \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
|
||||||
| tr '[:upper:]' '[:lower:]' \
|
| tr '[:upper:]' '[:lower:]' \
|
||||||
| \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
|
| \grep --invert-match --word-regexp --fixed-strings --file=stopwords.txt \
|
||||||
@@ -17,12 +18,18 @@ cmd_args=" | $sed_bin -e 's/\[.*\]//g' \
|
|||||||
|
|
||||||
fetch_page_content() {
|
fetch_page_content() {
|
||||||
if type "lynx" > /dev/null; then
|
if type "lynx" > /dev/null; then
|
||||||
cmd_bin="lynx --dump -nolist -hiddenlinks=ignore -nonumbers -hiddenlinks=merge "$url""
|
cmd_bin="lynx --dump -nolist -hiddenlinks=ignore -nonumbers -hiddenlinks=merge $url"
|
||||||
cmd="$cmd_bin $cmd_args"
|
cmd="$cmd_bin $cmd_args"
|
||||||
eval "$cmd"
|
eval "$cmd"
|
||||||
else
|
else
|
||||||
cmd_bin="curl "$url" "
|
nohtml_args=" | \grep -oEi \">([^<>]*)<\" \
|
||||||
cmd="$cmd_bin $cmd_args"
|
| \sed $'s,\x1b\\[[0-9;]*[a-zA-Z],,g' \
|
||||||
|
| \sed -E 's/><//' \
|
||||||
|
| \sed 's/[<>]//g' \
|
||||||
|
| \sed 's/{.*}//g' \
|
||||||
|
| awk 1 ORS=' '"
|
||||||
|
cmd_bin="curl $url"
|
||||||
|
cmd="$cmd_bin $nohtml_args $cmd_args"
|
||||||
eval "$cmd"
|
eval "$cmd"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user