bookmarks/top_words.sh

61 lines
1.6 KiB
Bash
Raw Normal View History

2018-09-13 18:58:56 +00:00
#!/usr/bin/env bash
#
# Prints word frequencies (ascending by count) for the text of a web page,
# or for the README.md next to this script when no URL is given.
#
# Usage: top_words.sh [url]

set -o errexit
set -o nounset
set -o pipefail

# Workaround for sed's "illegal byte sequence" error on macOS.
# See https://stackoverflow.com/questions/11287564/getting-sed-error-illegal-byte-sequence-in-bash
# and https://unix.stackexchange.com/questions/141420/tr-complains-of-illegal-byte-sequence
sed_bin="LANG=C LC_CTYPE=C sed"

# The stop-word list is looked up next to the script (like README.md), so the
# script works from any working directory. The path is %q-quoted because
# cmd_args is executed through eval.
stopwords_file="$(printf '%q' "$(dirname "$0")/stopwords.txt")"

# Tail of the word-counting pipeline. It is a string that starts with " | "
# and is appended to a text-producing command before being eval'd.
# Stages, in order:
#   1. drop bracketed text ([...], greedy per line)
#   2. drop http/https URLs
#   3. drop the word BUTTON (presumably lynx's form-button placeholder)
#   4. keep only runs of 3+ ASCII letters, one per line
#   5. lower-case everything
#   6. remove stop words
#   7. count occurrences and sort by count, ascending
cmd_args=" | $sed_bin -e 's/\[.*\]//g' \
  | $sed_bin 's!http\(s\)\{0,1\}://[^[:space:]]*!!g' \
  | $sed_bin -e 's/\(BUTTON\)//g' \
  | \grep --only-matching --extended-regexp '[a-zA-Z]{3,}' \
  | tr '[:upper:]' '[:lower:]' \
  | \grep --invert-match --word-regexp --fixed-strings --file=$stopwords_file \
  | sort \
  | uniq -c \
  | sort -n"
#######################################
# Fetches a web page as text and feeds it through the word-count pipeline.
# Uses lynx when it is available; otherwise downloads the page with curl and
# strips the markup with grep/sed/awk.
# Globals:
#   cmd_args (read) - pipeline tail, a string starting with " | "
#   url (read)      - fallback URL when no argument is passed
# Arguments:
#   $1 - URL of the page to analyse
# Outputs:
#   Result of the pipeline on stdout.
# Returns:
#   2 when no URL is available, otherwise the status of the pipeline.
#######################################
fetch_page_content() {
  # Prefer the explicit argument; fall back to the caller's `url` variable,
  # which is how earlier versions of this function received the URL.
  local page_url="${1:-${url:-}}"
  local quoted_url fetch_cmd strip_html

  if [[ -z "$page_url" ]]; then
    printf 'fetch_page_content: missing URL\n' >&2
    return 2
  fi

  # The command line is assembled as a string and eval'd, so the URL must be
  # shell-quoted: characters such as & ? ; or spaces would otherwise be
  # interpreted by the shell.
  quoted_url="$(printf '%q' "$page_url")"

  if command -v lynx > /dev/null; then
    # NOTE(review): -hiddenlinks is passed twice with different values, as in
    # the original command line; confirm which one lynx honours.
    fetch_cmd="lynx --dump -nolist -hiddenlinks=ignore -nonumbers -hiddenlinks=merge $quoted_url"
    eval "$fetch_cmd $cmd_args"
  else
    # Poor man's HTML-to-text:
    #   - keep only the text found between a '>' and the next '<'
    #   - strip ANSI escape sequences
    #   - drop empty '><' pairs and any remaining angle brackets
    #   - drop {...} blocks
    #   - join all lines into a single space-separated line
    strip_html=" | \grep -oEi \">([^<>]*)<\" \
      | \sed $'s,\x1b\\[[0-9;]*[a-zA-Z],,g' \
      | \sed -E 's/><//' \
      | \sed 's/[<>]//g' \
      | \sed 's/{.*}//g' \
      | awk 1 ORS=' '"
    fetch_cmd="curl $quoted_url"
    eval "$fetch_cmd $strip_html $cmd_args"
  fi
}
2018-09-13 19:14:23 +00:00
#######################################
# Prints the word frequencies of a web page, or of the README.md located
# next to this script when no URL is given.
# Globals:
#   cmd_args (read) - pipeline tail, a string starting with " | "
#   PATH (written)  - on macOS, the GNU grep directory is prepended
# Arguments:
#   $1 - optional URL of the page to analyse
# Outputs:
#   Result of the word-count pipeline on stdout.
#######################################
fetch_top_words() {
  local url
  local basedir
  local gnubin_dir
  local readme_cmd

  url="${1:-}"
  basedir=$(dirname "$0")

  if [[ $(uname -s) == "Darwin" ]]; then
    # The pipeline uses GNU grep long options, so put Homebrew's GNU grep
    # first in PATH and install it when its directory is missing.
    # NOTE(review): this is the /usr/local Homebrew prefix; other prefixes
    # are not detected.
    gnubin_dir="/usr/local/opt/grep/libexec/gnubin"
    if [[ ! -d "$gnubin_dir" ]]; then
      brew install grep
    fi
    export PATH="$gnubin_dir:$PATH"
  fi

  if [[ -z "$url" ]]; then
    # %q-quote the path: the command is eval'd, so a script directory with
    # spaces or shell metacharacters would otherwise be split or expanded.
    readme_cmd="\cat $(printf '%q' "$basedir/README.md")"
    eval "$readme_cmd $cmd_args"
  else
    fetch_page_content "$url"
  fi
}
# Script entry point: pass all command-line arguments through unchanged.
fetch_top_words "$@"