@ -2,6 +2,8 @@ from bs4 import BeautifulSoup
import os
import urllib . parse as urlparse
from urllib . parse import parse_qs
import re
from bs4 import NavigableString
# NOTE(review): presumably the names of URL query arguments (tracking /
# referrer parameters) that should be dropped from result links -- the code
# consuming this list is not visible here, so confirm against its usage.
SKIP_ARGS = [ ' ref_src ' , ' utm ' ]
@ -34,6 +36,45 @@ SITE_ALTS = {
}
def bold_search_terms(response: str, query: str) -> BeautifulSoup:
    """Wraps all search terms in bold tags (<b>). If any terms are wrapped
    in quotes, only that exact phrase will be made bold.

    Matching is case-insensitive and the matched text keeps the casing it
    has in the document. Each occurrence is wrapped exactly once, using real
    <b> elements inserted into the tree (not markup embedded in a text node,
    which would be entity-escaped when the soup is serialized).

    Text inside <script>/<style> elements and non-text nodes such as
    comments are left untouched, since adding markup there would corrupt
    them. Terms that are empty once punctuation is stripped are ignored.

    Args:
        response: The initial response body for the query
        query: The original search query

    Returns:
        BeautifulSoup: modified soup object with bold items
    """
    soup = BeautifulSoup(response, 'html.parser')

    # Split all words out of query, grouping the ones wrapped in quotes.
    # Terms are de-duplicated case-insensitively so that a repeated word
    # cannot be wrapped more than once.
    terms = {}
    for word in re.split(r'\s+(?=[^"]*(?:"[^"]*"[^"]*)*$)', query):
        word = re.sub(r'[^A-Za-z0-9 ]+', '', word).strip()
        # An empty term would compile to a pattern that matches everywhere
        if word:
            terms.setdefault(word.lower(), word)

    if not terms:
        return soup

    # One combined pattern, longest term first, so a quoted phrase wins over
    # a shorter term it contains and every node is rewritten in one pass.
    pattern = re.compile(
        '|'.join(re.escape(term)
                 for term in sorted(terms.values(), key=len, reverse=True)),
        re.I)

    # find_all returns a list snapshot, so the tree can be edited in the loop
    for nav_str in soup.find_all(text=pattern):
        parent = nav_str.parent
        if parent is None or parent.name in ('script', 'style'):
            continue

        # Exact type check on purpose: comments, doctypes, CDATA (and, in
        # newer bs4 releases, script/style/template contents) are
        # NavigableString subclasses and must not be rewritten.
        if type(nav_str) is not NavigableString:
            continue

        text = str(nav_str)
        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            if match.start() > cursor:
                pieces.append(NavigableString(text[cursor:match.start()]))
            bold = soup.new_tag('b')
            # group(0) is the text as it appears in the document, so the
            # original casing is preserved
            bold.string = match.group(0)
            pieces.append(bold)
            cursor = match.end()
        if cursor < len(text):
            pieces.append(NavigableString(text[cursor:]))

        # Swap the original text node for the plain/bold sequence
        for piece in pieces:
            nav_str.insert_before(piece)
        nav_str.extract()

    return soup
def has_ad_content ( element : str ) - > bool :
""" Inspects an HTML element for ad related content