doru001 - ## I use this script to search in a number of large...

2023-11-16 12:08:00 -

## I use this script to search in a number of large firefox and brave bookmarks files

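# usage: pass the search keywords as arguments; a bookmark is printed only when all of them match
# keywords cannot contain blanks (the awk and bash variants treat them as regular expressions)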
#
# you need to set browser.bookmarks.autoExportHTML to true in firefox
# you need two restarts for this to produce results

# Search back end: one of sed, awk or bash (bash is by far the slowest).
appl='awk'

# Directory in which you put the large firefox and brave bookmarks files.
bkdir=''

case $appl in

sed)

# Brave, sed variant (measured at 2.8 s for: david icke politics life).
#
# Input: the keyword list arrives as line 1 through the process substitution,
# followed by the bookmarks files. The program keeps "keywords + a sliding
# window of three lines" in the hold buffer. Whenever the window contains a
# `"type": "url",` line followed by another line, it strips the leading
# keyword from the buffer for as long as that keyword occurs
# (case-insensitively) somewhere in the window. Only when every keyword has
# been stripped does the buffer start with a newline, and then the first and
# third window lines are printed.
#
# A blank is appended to the keyword list so that every keyword, including
# the last one, is followed by exactly one blank; this allows a homogeneous
# treatment while searching over several newlines in one pass.
#
# `echo $*` is kept unquoted deliberately: word splitting collapses any run of
# blanks, so the keywords reach sed separated by single blanks, which is what
# the program expects. NOTE: this also means a keyword containing glob
# characters may be expanded against the current directory.
# "$bkdir" is quoted so that a directory name containing blanks or glob
# characters still expands to the intended files.
# shellcheck disable=SC2048
sed -nr '

1{s/$/ /; h} # keywords are placed in the hold buffer, with a blank after each of them
2,4H # three lines are put in the hold buffer, with \ns between them

5{
g
:line
/\n.*\n *"type": "url", *\n/{
:keys
s/(^[^ \n]+) ([^\n]*\n.*\1.*)/\2/i
# substitution occurs only when the first keyword in the buffer is found in the lines
t keys

# if all keywords have been removed then format for print
s/^\n *([^\n]*\n)[^\n]*\n *([^\n]*)/\1 \2\n/p

# when "type": "url" is found, I can safely skip three lines
g
s/(^[^\n]*)\n.*/\1/ # I remove the three current lines
N; N; N # I read three more lines
h
b line
}

g # I retrieve the keywords and the three lines once more
s/([^\n]*\n)[^\n]*\n(.*)/\1\2/ # I remove the first line from the combination
N # I add a new line
h # I memorize the new keywords plus three lines combination
b line
}' <(echo $*) "$bkdir"/*brave*.html

# Firefox, sed variant (measured at 29 s for: david icke politics life).
#
# The keyword list (line 1, with a trailing blank appended) is saved in the
# hold buffer. The program then loops inside a single sed cycle: N appends the
# next input line to the keywords, and for a line containing HREF the leading
# keyword is stripped for as long as it occurs (case-insensitively) in that
# line. When no keyword is left the buffer starts with a newline, and the
# HREF, TAGS and link text are printed on separate lines. `g` then restores
# the keyword list before the next line is read.
#
# The print expression requires a TAGS attribute after HREF, so a matching
# line without TAGS prints nothing in this variant.
#
# `echo $*` is kept unquoted deliberately: word splitting leaves exactly one
# blank between keywords, which the program relies on. NOTE: a keyword
# containing glob characters may be expanded against the current directory.
# "$bkdir" is quoted so that a directory name containing blanks or glob
# characters still expands to the intended files.
# shellcheck disable=SC2048
sed -nr '

{s/$/ /; h}

{
:line
N
/HREF/ {
:keys # verify keys
# all keys have been verified already if and only if there is a ^\n and then it can not fit
s/(^[^ \n]+) (.*\n.*\1.*)/\2/i
t keys

s/^\n.*(HREF="[^"]*").*(TAGS="[^"]*").*(>[^>]*<)\/A>/\1\n \2\n \3\n/p
}

g

b line

}' <(echo $*) "$bkdir"/*firefox*.html

# # takes 54 s for david icke politics life
# # I put the blank separated keyword list at the beginning of the buffer,
# # such that I can select the keyword before I use it as a back reference
# # bash guarantees no more than one blank between keywords
#
# sed -nr '
#
# 1h # save the keywords
#
# 2,${/HREF/{
# H; g; s/\n.*\n/\n/; h # this eliminates the previous line from h
# # on line 2 there is nothing to eliminate
# :l # loop
# s/(^[^ ]+)( |\n)((.*\n)?.*\1.*)/\3/i # this eliminates \n when the last keyword matches
# t l
#
# # format the selected line for output
# /\n/! s/.*(HREF="[^"]*").*(TAGS="[^"]*").*(>[^>]*<)\/A>/\1\n \2\n \3\n/p
# }}' <(echo $*) $bkdir/*firefox*.html

# # takes too long, no results in the first 7m
# # because it has to discover the keyword in the line by trying all substrings in the line
# # it works, verified on a very small test file:
# # the script is ./testsed.sh, the test file is ./test.html
#
# sed -nr '
#
# 1h
#
# 2,${/HREF/{
# G
# :l # loop
# # s/(\2.*\n)([^ ]+)( |$)(.*)/\1\4/
# # invalid back reference, because \2 is used before it is selected
# s/(([^ ]+).*\n)\2( |$)(.*)/\1\4/ # the line before the replacement string remains unchanged
# t l
#
# /\n$/p # this is not formatted
# }}' <(echo $*) $bkdir/*firefox*.html

;;

awk)

## Brave, awk variant (measured at .14s for: david icke politics life)
#
# Prints every bookmark whose "name" line or "url" line matches all keywords.
# An entry is recognised by a line starting with "url" whose second
# predecessor starts with "name"; the line in between is not inspected.
#
# Keywords are handed over through the environment rather than with -v,
# because awk processes backslash escapes in -v values, which would mangle a
# keyword such as foo\.com before it is used as a regular expression.
# IGNORECASE is a gawk feature; other awks silently match case-sensitively.
# With no keywords at all, every bookmark is printed.
keywords="$*" awk '
  BEGIN {
    IGNORECASE = 1
    split(ENVIRON["keywords"], wanted)
  }

  # remember the current line and the two lines before it
  { above2 = above1; above1 = current; current = $0 }

  /^ *"url"/ && above2 ~ /^ *"name"/ {
    haystack = above2 " " current
    for (k in wanted)
      if (haystack !~ wanted[k]) next   # one keyword missing: skip the entry

    # drop the indentation in front of the opening quote before printing
    name = above2
    url = current
    sub(/^ *"/, "\"", name)
    sub(/^ *"/, "\"", url)
    print name "\n " url "\n"
  }
' "$bkdir"/*brave*.html

## Firefox, awk variant (measured at .40s for: david icke politics life)
#
# For every line containing HREF, extracts the link text, the TAGS attribute
# and the HREF attribute, and prints them when all keywords match.
#
# Keywords are matched against those three fields only, as the bash variant
# does, so that other attributes on the same line cannot produce false hits.
# Keywords are handed over through the environment rather than with -v,
# because awk processes backslash escapes in -v values, which would mangle a
# keyword such as foo\.com before it is used as a regular expression.
# Requires gawk: IGNORECASE and match() with an array argument are gawk
# extensions. With no keywords at all, every bookmark is printed.
keywords="$*" awk '
  BEGIN {
    IGNORECASE = 1
    split(ENVIRON["keywords"], wanted)
  }

  /HREF/ {
    # each match() clears its array first, so a missing attribute leaves
    # element 0 empty
    match($0, /HREF="[^"]*"/, url)
    match($0, /TAGS="[^"]*"/, tags)
    match($0, />[^>]*<\/A>/, name)

    haystack = name[0] " " tags[0] " " url[0]
    for (k in wanted)
      if (haystack !~ wanted[k]) next   # one keyword missing: skip the line

    print name[0]
    if (tags[0]) print " " tags[0]
    print " " url[0] "\n"
  }
' "$bkdir"/*firefox*.html

;;

bash)

# Brave, bash variant (measured at 1m 1s for: david icke politics life).
#
# The first grep emits every "url" line together with the two lines above it;
# the second keeps only the "name" and "url" lines, so the loop can consume
# them in pairs. A bookmark is printed when every keyword (each script
# argument, used as a grep pattern) matches its name or its url, ignoring
# case like the other variants do.
#
# The previous filter pattern contained a literal `$` before "name" and an
# end-of-line anchor after "url", so it selected neither kind of line.
while IFS= read -r name_line && IFS= read -r url_line; do
  # the value is the 4th double-quote-delimited field of:  "key": "value"
  IFS='"' read -r _ _ _ name _ <<<"$name_line"
  IFS='"' read -r _ _ _ url _ <<<"$url_line"

  found=true
  for s; do
    if ! grep -iq -- "$s" <<<"$name $url"; then
      found=false
      break
    fi
  done

  if [[ $found == true ]]; then
    # printf keeps backslashes in names and urls literal, unlike echo -e
    printf '%s\n %s\n\n' "$name" "$url"
  fi
done < <(grep -hB2 '^ *"url"' "$bkdir"/*brave*.html \
  | grep '^ *"name"\|^ *"url"')

# Firefox, bash variant (measured at 2m 31s for: david icke politics life).
#
# For every line containing HREF, extracts the HREF attribute, the TAGS
# attribute and the link text (kept as `>text<`), and prints them when every
# keyword (each script argument, used as a grep pattern) matches their
# concatenation, ignoring case.
#
# read -r and printf keep backslashes literal; grep -h suppresses the file
# name prefix when several bookmarks files match; `--` protects keywords that
# start with a dash; quoting keeps urls containing ? or * from being treated
# as glob patterns.
while IFS= read -r line; do
  url=$(grep -o 'HREF="[^"]*"' <<<"$line")
  tags=$(grep -o 'TAGS="[^"]*"' <<<"$line")
  name=$(grep -o '>[^>]*</A>' <<<"$line" | grep -o '^.*<')

  found=true
  for s; do
    if ! grep -iq -- "$s" <<<"${name}${tags}${url}"; then
      found=false
      break
    fi
  done

  if [[ $found == true ]]; then
    printf '%s\n %s\n %s\n\n' "$name" "$tags" "$url"
  fi
done < <(grep -h HREF "$bkdir"/*firefox*.html)

esac

## I use this script to search in a number of large firefox and brave bookmarks files # arguments give search keywords # keywords cannot contain blanks # # you need to set browser.bookmarks.autoExportHTML to true in firefox # you need two restarts for this to produce results appl=awk # sed, awk or bash (bash takes too long) bkdir= # here you put the large firefox and brave bookmarks files case $appl in sed) # takes 2.8 s for david icke politics life # I add a blank at the end of the keyword list # to allow for a homogeneous processing # when I have to search over several newlines in one pass sed -nr ' 1{s/$/ /; h} # keywords are placed in the hold buffer, with a blank after each of them 2,4H # three lines are put in the hold buffer, with \ns between them 5{ g :line /\n.*\n *"type": "url", *\n/{ :keys s/(^[^ \n]+) ([^\n]*\n.*\1.*)/\2/i # substitution occurs only when the first keyword in the buffer is found in the lines t keys # if all keywords have been removed then format for print s/^\n *([^\n]*\n)[^\n]*\n *([^\n]*)/\1 \2\n/p # when "type": "url" is found, I can safely skip three lines g s/(^[^\n]*)\n.*/\1/ # I remove the three current lines N; N; N # I read three more lines h b line } g # I retrieve the keywords and the three lines once more s/([^\n]*\n)[^\n]*\n(.*)/\1\2/ # I remove the first line from the combination N # I add a new line h # I memorize the new keywords plus three lines combination b line }' <(echo $*) $bkdir/*brave*.html # takes 29 s for david icke politics life sed -nr ' {s/$/ /; h} { :line N /HREF/ { :keys # verify keys # all keys have been verified already if and only if there is a ^\n and then it can not fit s/(^[^ \n]+) (.*\n.*\1.*)/\2/i t keys s/^\n.*(HREF="[^"]*").*(TAGS="[^"]*").*(>[^>]*<)\/A>/\1\n \2\n \3\n/p } g b line }' <(echo $*) $bkdir/*firefox*.html # # takes 54 s for david icke politics life # # I put the blank separated keyword list at the beginning of the buffer, # # such that I can select the keyword before I use it as a back 
reference # # bash guarantees no more than one blank between keywords # # sed -nr ' # # 1h # save the keywords # # 2,${/HREF/{ # H; g; s/\n.*\n/\n/; h # this eliminates the previous line from h # # on line 2 there is nothing to eliminate # :l # loop # s/(^[^ ]+)( |\n)((.*\n)?.*\1.*)/\3/i # this eliminates \n when the last keyword matches # t l # # # format the selected line for output # /\n/! s/.*(HREF="[^"]*").*(TAGS="[^"]*").*(>[^>]*<)\/A>/\1\n \2\n \3\n/p # }}' <(echo $*) $bkdir/*firefox*.html # # takes too long, no results in the first 7m # # because it has to discover the keyword in the line by trying all substrings in the line # # it works, verified on a very small test file: # # the script is ./testsed.sh, the test file is ./test.html # # sed -nr ' # # 1h # # 2,${/HREF/{ # G # :l # loop # # s/(\2.*\n)([^ ]+)( |$)(.*)/\1\4/ # # invalid back reference, because \2 is used before it is selected # s/(([^ ]+).*\n)\2( |$)(.*)/\1\4/ # the line before the replacement string remains unchanged # t l # # /\n$/p # this is not formatted # }}' <(echo $*) $bkdir/*firefox*.html ;; awk) ## takes .14s for david icke politics life awk -v myargs="$*" ' BEGIN{IGNORECASE=1; split(myargs, myarray)} {prev3=prev2; prev2=prev1; prev1=$0} /^ *"url"/{ if (prev3 ~ /^ *"name"/) { found = 1 for (i in myarray) { if (prev3 " " prev1 !~ myarray[i]) {found = 0; break}} if (found) {gsub(/^ *"/, "\"", prev3); gsub(/^ *"/, "\"", prev1) print prev3 "\n " prev1 "\n"}}} ' \ $bkdir/*brave*.html ## takes .40s for david icke politics life awk -v myargs="$*" ' BEGIN{IGNORECASE=1; split(myargs, myarray)} /HREF/{ found = 1 for (i in myarray) { if ($0 !~ myarray[i]) {found = 0; break}} if (found) {match($0, /HREF="[^"]*"/, url); match($0, /TAGS="[^"]*"/, tags); match($0, />[^>]*<\/A>/, name); print name[0] if (tags[0]) print " " tags[0] print " " url[0] "\n"}} ' \ $bkdir/*firefox*.html ;; bash) # takes 1m 1s for david icke politics life while read; do name="$(echo "${REPLY}" | cut -d\" -f4)" read 
url="$(echo "${REPLY}" | cut -d\" -f4)" found=true for s; do echo "$name" $url | grep -q "$s" (($?)) && { found=false; break; }; done [ $found = true ] && echo -e "$name\n $url\n"; done \ < <(grep -hB2 '^ *"url"' $bkdir/*brave*.html | grep '^ *$"name"\|"url"$') # takes 2m 31s for david icke politics life while read; do url=$(echo "$REPLY" | grep -o 'HREF="[^"]*"') tags=$(echo "$REPLY" | grep -o 'TAGS="[^"]*"') name=$(echo "$REPLY" | grep -o '>[^>]*</A>' | grep -o '^.*<') found=true for s; do echo "$name""$tags"$url | grep -iq "$s" (($?)) && { found=false; break; }; done [ $found = true ] && echo -e "$name\n $tags\n $url\n" ; done \ < <(grep HREF $bkdir/*firefox*.html) esac

1 Comments 0 Shares 1313 Views