## Search a number of large Firefox and Brave bookmark files.
# The arguments are the search keywords; a bookmark is listed only when every keyword matches.
# Keywords cannot contain blanks.
#
# For Firefox you need to set browser.bookmarks.autoExportHTML to true;
# it takes two restarts before this produces results.
# Implementation to run: sed, awk or bash (the bash variant takes too long).
appl='awk'
# Directory holding the large Firefox and Brave bookmark files.
# While this is left empty, the file globs used by the arms resolve against "/".
bkdir=''
case "$appl" in
sed)
# sed implementation (GNU sed: -r, the /i flag and \n in regexes are used).
# The keyword list is fed to sed as the very first input line through process
# substitution, followed by the bookmark files.
# NOTE: `echo $*` is left unquoted on purpose: word splitting guarantees exactly
# one blank between keywords, which the regexes below rely on.
# "$bkdir" is quoted so that a directory name containing blanks still works.
#
# Brave pass: the hold buffer carries "keywords \n" plus a sliding window of
# three input lines. When the middle line of the window is a "type": "url" line,
# every keyword found in the window is stripped from the front of the buffer;
# if none is left (buffer starts with \n) the first and third window lines are
# printed. The text of input line 5 itself is overwritten by the first `g`.
# takes 2.8 s for david icke politics life
# I add a blank at the end of the keyword list
# to allow for a homogeneous processing
# when I have to search over several newlines in one pass
sed -nr '
1{s/$/ /; h} # keywords are placed in the hold buffer, with a blank after each of them
2,4H # three lines are put in the hold buffer, with \ns between them
5{
g
:line
/\n.*\n *"type": "url", *\n/{
:keys
s/(^[^ \n]+) ([^\n]*\n.*\1.*)/\2/i
# substitution occurs only when the first keyword in the buffer is found in the lines
t keys
# if all keywords have been removed then format for print
s/^\n *([^\n]*\n)[^\n]*\n *([^\n]*)/\1 \2\n/p
# when "type": "url" is found, I can safely skip three lines
g
s/(^[^\n]*)\n.*/\1/ # I remove the three current lines
N; N; N # I read three more lines
h
b line
}
g # I retrieve the keywords and the three lines once more
s/([^\n]*\n)[^\n]*\n(.*)/\1\2/ # I remove the first line from the combination
N # I add a new line
h # I memorize the new keywords plus three lines combination
b line
}' <(echo $*) "$bkdir"/*brave*.html
# Firefox pass: the keyword line (with a trailing blank) stays in the hold
# buffer; each following line is appended to a copy of it and, if it contains
# HREF, the keywords found in it are stripped one by one. A line is printed only
# when no keyword is left; the print pattern requires HREF, TAGS and the link
# text in that order, so bookmarks without a TAGS attribute are not listed.
# takes 29 s for david icke politics life
sed -nr '
{s/$/ /; h}
{
:line
N
/HREF/ {
:keys # verify keys
# all keys have been verified already if and only if there is a ^\n and then it can not fit
s/(^[^ \n]+) (.*\n.*\1.*)/\2/i
t keys
s/^\n.*(HREF="[^"]*").*(TAGS="[^"]*").*(>[^>]*<)\/A>/\1\n \2\n \3\n/p
}
g
b line
}' <(echo $*) "$bkdir"/*firefox*.html
# Earlier, slower experiments are kept below for reference only (disabled).
# # takes 54 s for david icke politics life
# # I put the blank separated keyword list at the beginning of the buffer,
# # such that I can select the keyword before I use it as a back reference
# # bash guarantees no more than one blank between keywords
#
# sed -nr '
#
# 1h # save the keywords
#
# 2,${/HREF/{
# H; g; s/\n.*\n/\n/; h # this eliminates the previous line from h
# # on line 2 there is nothing to eliminate
# :l # loop
# s/(^[^ ]+)( |\n)((.*\n)?.*\1.*)/\3/i # this eliminates \n when the last keyword matches
# t l
#
# # format the selected line for output
# /\n/! s/.*(HREF="[^"]*").*(TAGS="[^"]*").*(>[^>]*<)\/A>/\1\n \2\n \3\n/p
# }}' <(echo $*) $bkdir/*firefox*.html
# # takes too long, no results in the first 7m
# # because it has to discover the keyword in the line by trying all substrings in the line
# # it works, verified on a very small test file:
# # the script is ./testsed.sh, the test file is ./test.html
#
# sed -nr '
#
# 1h
#
# 2,${/HREF/{
# G
# :l # loop
# # s/(\2.*\n)([^ ]+)( |$)(.*)/\1\4/
# # invalid back reference, because \2 is used before it is selected
# s/(([^ ]+).*\n)\2( |$)(.*)/\1\4/ # the line before the replacement string remains unchanged
# t l
#
# /\n$/p # this is not formatted
# }}' <(echo $*) $bkdir/*firefox*.html
;;
awk)
# awk implementation. Both programs need GNU awk: IGNORECASE and the
# three-argument form of match() are gawk extensions.
# The keywords are passed in as one blank-separated string (myargs) and split
# into myarray; each keyword is used as a case-insensitive regular expression.
# "$bkdir" is quoted so that a directory name containing blanks still works.
#
# Brave pass: keep the last three lines; on a "url" line whose second
# predecessor is a "name" line, print the name and url lines when every keyword
# matches the text "<name line> <url line>".
## takes .14s for david icke politics life
awk -v myargs="$*" '
  BEGIN { IGNORECASE = 1; split(myargs, myarray) }

  { prev3 = prev2; prev2 = prev1; prev1 = $0 }

  /^ *"url"/ {
    if (prev3 ~ /^ *"name"/) {
      found = 1
      for (i in myarray) {
        if ((prev3 " " prev1) !~ myarray[i]) { found = 0; break }
      }
      if (found) {
        # drop the indentation in front of the opening quote
        gsub(/^ *"/, "\"", prev3)
        gsub(/^ *"/, "\"", prev1)
        print prev3 "\n " prev1 "\n"
      }
    }
  }
' "$bkdir"/*brave*.html
# Firefox pass: every line containing HREF is tested as a whole against all
# keywords; on success the link text, the TAGS attribute (if present) and the
# HREF attribute are printed.
## takes .40s for david icke politics life
awk -v myargs="$*" '
  BEGIN { IGNORECASE = 1; split(myargs, myarray) }

  /HREF/ {
    found = 1
    for (i in myarray) {
      if ($0 !~ myarray[i]) { found = 0; break }
    }
    if (found) {
      match($0, /HREF="[^"]*"/, url)
      match($0, /TAGS="[^"]*"/, tags)
      match($0, />[^>]*<\/A>/, name)
      print name[0]
      if (tags[0]) print " " tags[0]
      print " " url[0] "\n"
    }
  }
' "$bkdir"/*firefox*.html
;;
bash)
# Pure-bash implementation (slow: several processes are spawned per bookmark).
# Each positional parameter is a keyword, used as a grep basic regular
# expression; a bookmark is printed only when every keyword matches.
#
# Brave pass: the inner grep pipeline yields alternating "name"/"url" lines,
# which are read in pairs; field 4 of a '"'-separated line is the value.
# Matching is case-insensitive, like in the Firefox loop and the sed/awk arms.
# takes 1m 1s for david icke politics life
while IFS= read -r; do
  name=$(printf '%s\n' "$REPLY" | cut -d\" -f4)
  IFS= read -r
  url=$(printf '%s\n' "$REPLY" | cut -d\" -f4)
  found=true
  for s; do
    if ! printf '%s\n' "$name $url" | grep -iq -- "$s"; then
      found=false
      break
    fi
  done
  # printf '%s' keeps backslashes in the bookmark data literal (echo -e would not)
  if [ "$found" = true ]; then
    printf '%s\n %s\n\n' "$name" "$url"
  fi
done < <(grep -hB2 '^ *"url"' "$bkdir"/*brave*.html | grep '^ *\("name"\|"url"\)')
# Firefox pass: for every line containing HREF, extract the HREF attribute,
# the TAGS attribute and the link text, then test the keywords against their
# concatenation.
# takes 2m 31s for david icke politics life
while IFS= read -r; do
  url=$(printf '%s\n' "$REPLY" | grep -o 'HREF="[^"]*"')
  tags=$(printf '%s\n' "$REPLY" | grep -o 'TAGS="[^"]*"')
  name=$(printf '%s\n' "$REPLY" | grep -o '>[^>]*</A>' | grep -o '^.*<')
  found=true
  for s; do
    if ! printf '%s\n' "$name$tags$url" | grep -iq -- "$s"; then
      found=false
      break
    fi
  done
  if [ "$found" = true ]; then
    printf '%s\n %s\n %s\n\n' "$name" "$tags" "$url"
  fi
done < <(grep -h HREF "$bkdir"/*firefox*.html)
;;
esac
## Search a number of large Firefox and Brave bookmark files.
# The arguments are the search keywords; a bookmark is listed only when every keyword matches.
# Keywords cannot contain blanks.
#
# For Firefox you need to set browser.bookmarks.autoExportHTML to true;
# it takes two restarts before this produces results.
# Implementation to run: sed, awk or bash (the bash variant takes too long).
appl='awk'
# Directory holding the large Firefox and Brave bookmark files.
# While this is left empty, the file globs used by the arms resolve against "/".
bkdir=''
case "$appl" in
sed)
# sed implementation (GNU sed: -r, the /i flag and \n in regexes are used).
# The keyword list is fed to sed as the very first input line through process
# substitution, followed by the bookmark files.
# NOTE: `echo $*` is left unquoted on purpose: word splitting guarantees exactly
# one blank between keywords, which the regexes below rely on.
# "$bkdir" is quoted so that a directory name containing blanks still works.
#
# Brave pass: the hold buffer carries "keywords \n" plus a sliding window of
# three input lines. When the middle line of the window is a "type": "url" line,
# every keyword found in the window is stripped from the front of the buffer;
# if none is left (buffer starts with \n) the first and third window lines are
# printed. The text of input line 5 itself is overwritten by the first `g`.
# takes 2.8 s for david icke politics life
# I add a blank at the end of the keyword list
# to allow for a homogeneous processing
# when I have to search over several newlines in one pass
sed -nr '
1{s/$/ /; h} # keywords are placed in the hold buffer, with a blank after each of them
2,4H # three lines are put in the hold buffer, with \ns between them
5{
g
:line
/\n.*\n *"type": "url", *\n/{
:keys
s/(^[^ \n]+) ([^\n]*\n.*\1.*)/\2/i
# substitution occurs only when the first keyword in the buffer is found in the lines
t keys
# if all keywords have been removed then format for print
s/^\n *([^\n]*\n)[^\n]*\n *([^\n]*)/\1 \2\n/p
# when "type": "url" is found, I can safely skip three lines
g
s/(^[^\n]*)\n.*/\1/ # I remove the three current lines
N; N; N # I read three more lines
h
b line
}
g # I retrieve the keywords and the three lines once more
s/([^\n]*\n)[^\n]*\n(.*)/\1\2/ # I remove the first line from the combination
N # I add a new line
h # I memorize the new keywords plus three lines combination
b line
}' <(echo $*) "$bkdir"/*brave*.html
# Firefox pass: the keyword line (with a trailing blank) stays in the hold
# buffer; each following line is appended to a copy of it and, if it contains
# HREF, the keywords found in it are stripped one by one. A line is printed only
# when no keyword is left; the print pattern requires HREF, TAGS and the link
# text in that order, so bookmarks without a TAGS attribute are not listed.
# takes 29 s for david icke politics life
sed -nr '
{s/$/ /; h}
{
:line
N
/HREF/ {
:keys # verify keys
# all keys have been verified already if and only if there is a ^\n and then it can not fit
s/(^[^ \n]+) (.*\n.*\1.*)/\2/i
t keys
s/^\n.*(HREF="[^"]*").*(TAGS="[^"]*").*(>[^>]*<)\/A>/\1\n \2\n \3\n/p
}
g
b line
}' <(echo $*) "$bkdir"/*firefox*.html
# Earlier, slower experiments are kept below for reference only (disabled).
# # takes 54 s for david icke politics life
# # I put the blank separated keyword list at the beginning of the buffer,
# # such that I can select the keyword before I use it as a back reference
# # bash guarantees no more than one blank between keywords
#
# sed -nr '
#
# 1h # save the keywords
#
# 2,${/HREF/{
# H; g; s/\n.*\n/\n/; h # this eliminates the previous line from h
# # on line 2 there is nothing to eliminate
# :l # loop
# s/(^[^ ]+)( |\n)((.*\n)?.*\1.*)/\3/i # this eliminates \n when the last keyword matches
# t l
#
# # format the selected line for output
# /\n/! s/.*(HREF="[^"]*").*(TAGS="[^"]*").*(>[^>]*<)\/A>/\1\n \2\n \3\n/p
# }}' <(echo $*) $bkdir/*firefox*.html
# # takes too long, no results in the first 7m
# # because it has to discover the keyword in the line by trying all substrings in the line
# # it works, verified on a very small test file:
# # the script is ./testsed.sh, the test file is ./test.html
#
# sed -nr '
#
# 1h
#
# 2,${/HREF/{
# G
# :l # loop
# # s/(\2.*\n)([^ ]+)( |$)(.*)/\1\4/
# # invalid back reference, because \2 is used before it is selected
# s/(([^ ]+).*\n)\2( |$)(.*)/\1\4/ # the line before the replacement string remains unchanged
# t l
#
# /\n$/p # this is not formatted
# }}' <(echo $*) $bkdir/*firefox*.html
;;
awk)
# awk implementation. Both programs need GNU awk: IGNORECASE and the
# three-argument form of match() are gawk extensions.
# The keywords are passed in as one blank-separated string (myargs) and split
# into myarray; each keyword is used as a case-insensitive regular expression.
# "$bkdir" is quoted so that a directory name containing blanks still works.
#
# Brave pass: keep the last three lines; on a "url" line whose second
# predecessor is a "name" line, print the name and url lines when every keyword
# matches the text "<name line> <url line>".
## takes .14s for david icke politics life
awk -v myargs="$*" '
  BEGIN { IGNORECASE = 1; split(myargs, myarray) }

  { prev3 = prev2; prev2 = prev1; prev1 = $0 }

  /^ *"url"/ {
    if (prev3 ~ /^ *"name"/) {
      found = 1
      for (i in myarray) {
        if ((prev3 " " prev1) !~ myarray[i]) { found = 0; break }
      }
      if (found) {
        # drop the indentation in front of the opening quote
        gsub(/^ *"/, "\"", prev3)
        gsub(/^ *"/, "\"", prev1)
        print prev3 "\n " prev1 "\n"
      }
    }
  }
' "$bkdir"/*brave*.html
# Firefox pass: every line containing HREF is tested as a whole against all
# keywords; on success the link text, the TAGS attribute (if present) and the
# HREF attribute are printed.
## takes .40s for david icke politics life
awk -v myargs="$*" '
  BEGIN { IGNORECASE = 1; split(myargs, myarray) }

  /HREF/ {
    found = 1
    for (i in myarray) {
      if ($0 !~ myarray[i]) { found = 0; break }
    }
    if (found) {
      match($0, /HREF="[^"]*"/, url)
      match($0, /TAGS="[^"]*"/, tags)
      match($0, />[^>]*<\/A>/, name)
      print name[0]
      if (tags[0]) print " " tags[0]
      print " " url[0] "\n"
    }
  }
' "$bkdir"/*firefox*.html
;;
bash)
# Pure-bash implementation (slow: several processes are spawned per bookmark).
# Each positional parameter is a keyword, used as a grep basic regular
# expression; a bookmark is printed only when every keyword matches.
#
# Brave pass: the inner grep pipeline yields alternating "name"/"url" lines,
# which are read in pairs; field 4 of a '"'-separated line is the value.
# Matching is case-insensitive, like in the Firefox loop and the sed/awk arms.
# takes 1m 1s for david icke politics life
while IFS= read -r; do
  name=$(printf '%s\n' "$REPLY" | cut -d\" -f4)
  IFS= read -r
  url=$(printf '%s\n' "$REPLY" | cut -d\" -f4)
  found=true
  for s; do
    if ! printf '%s\n' "$name $url" | grep -iq -- "$s"; then
      found=false
      break
    fi
  done
  # printf '%s' keeps backslashes in the bookmark data literal (echo -e would not)
  if [ "$found" = true ]; then
    printf '%s\n %s\n\n' "$name" "$url"
  fi
done < <(grep -hB2 '^ *"url"' "$bkdir"/*brave*.html | grep '^ *\("name"\|"url"\)')
# Firefox pass: for every line containing HREF, extract the HREF attribute,
# the TAGS attribute and the link text, then test the keywords against their
# concatenation.
# takes 2m 31s for david icke politics life
while IFS= read -r; do
  url=$(printf '%s\n' "$REPLY" | grep -o 'HREF="[^"]*"')
  tags=$(printf '%s\n' "$REPLY" | grep -o 'TAGS="[^"]*"')
  name=$(printf '%s\n' "$REPLY" | grep -o '>[^>]*</A>' | grep -o '^.*<')
  found=true
  for s; do
    if ! printf '%s\n' "$name$tags$url" | grep -iq -- "$s"; then
      found=false
      break
    fi
  done
  if [ "$found" = true ]; then
    printf '%s\n %s\n %s\n\n' "$name" "$tags" "$url"
  fi
done < <(grep -h HREF "$bkdir"/*firefox*.html)
;;
esac