@ -43,7 +43,8 @@ class Douban(Metadata):
# Provider id and display string ("豆瓣" is the Chinese name of Douban).
__id__ = " douban "
DESCRIPTION = " 豆瓣 "
META_URL = " https://book.douban.com/ "
# NOTE(review): SEARCH_URL is assigned twice in this span; the later
# assignment is the one that takes effect — confirm which is intended.
SEARCH_URL = " https://www.douban.com/j/search "
# Requested by _get_book_id_list_from_json, which reads the reply with r.json().
SEARCH_JSON_URL = " https://www.douban.com/j/search "
# Requested by _get_book_id_list_from_html, which parses the reply as HTML.
SEARCH_URL = " https://www.douban.com/search "
# Captures the digits that follow "sid:" (up to the comma) as named group "id".
ID_PATTERN = re . compile ( r " sid: (?P<id> \ d+), " )
# "作者" = author, "译者" = translator; tested against info-label text in
# _parse_single_book to decide where author names are collected from.
AUTHORS_PATTERN = re . compile ( r " 作者|译者 " )
@ -52,6 +53,7 @@ class Douban(Metadata):
# Label patterns tested against info-label text in _parse_single_book.
# "出版年" = year of publication.
PUBLISHED_DATE_PATTERN = re . compile ( r " 出版年 " )
# "丛书" = book series.
SERIES_PATTERN = re . compile ( r " 丛书 " )
# "统一书号" = unified book number; the matched label text is used as the
# key of match.identifiers.
IDENTIFIERS_PATTERN = re . compile ( r " ISBN|统一书号 " )
# Captures the quoted value assigned to "criteria"; consumed by _get_tags.
CRITERIA_PATTERN = re . compile ( " criteria = ' (.+) ' " )
# Selects the <span property="v:itemreviewed"> whose text becomes match.title.
# NOTE(review): the name is spelled "TITTLE"; it is referenced under that
# spelling in _parse_single_book, so renaming needs a coordinated change.
TITTLE_XPATH = " //span[@property= ' v:itemreviewed ' ] "
# Selects <a class="nbg"> elements: their href is used as the cover in
# _parse_single_book and their onclick attribute is scanned for book ids in
# _get_book_id_list_from_html.
COVER_XPATH = " //a[@class= ' nbg ' ] "
@ -63,56 +65,90 @@ class Douban(Metadata):
# HTTP session through which the methods below issue their GET requests
# (they call self.session.get).
session = requests . Session ( )
# Replaces the session's header mapping with a single browser-style
# user-agent entry.
# NOTE(review): the value line appears twice in this span — confirm that
# only one copy is meant to remain.
session . headers = {
' user-agent ' :
' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56 ' ,
' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56 ' ,
}
def search(self,
           query: str,
           generic_cover: str = "",
           locale: str = "en") -> List[MetaRecord]:
    """Search Douban for ``query`` and return one record per parsed book.

    The query is rewritten as its title tokens joined with ``+`` (when
    ``get_title_tokens`` yields any), book ids are looked up through
    ``_get_book_id_list_from_html`` and every id is parsed concurrently by
    ``_parse_single_book``.

    Args:
        query: Free-text search string.
        generic_cover: Fallback cover passed through to
            ``_parse_single_book``.
        locale: Accepted for interface compatibility; not used here.

    Returns:
        The truthy results of ``_parse_single_book`` in the order the ids
        were returned by the lookup. An empty list is returned when the
        provider is inactive, the lookup fails or nothing is found.
    """
    if not self.active:
        return []

    log.debug(f"start searching {query} on douban")
    if title_tokens := list(
            self.get_title_tokens(query, strip_joiners=False)):
        query = "+".join(title_tokens)

    try:
        book_id_list = self._get_book_id_list_from_html(query)
    except Exception as e:
        # Best effort: a failed lookup yields no results. An empty list
        # (rather than None) keeps the declared return type honest.
        log.warning(e)
        return []

    if not book_id_list:
        log.debug("No search results in Douban")
        return []

    with futures.ThreadPoolExecutor(
            max_workers=5, thread_name_prefix='douban') as executor:
        pending = [
            executor.submit(self._parse_single_book, book_id, generic_cover)
            for book_id in book_id_list
        ]
    # Leaving the ``with`` block waits for every worker, so each future is
    # finished here and can be read in submission order.

    records = []
    for future in pending:
        try:
            # Read the result once; a page that fails to parse must not
            # discard the records of the other books.
            record = future.result()
        except Exception as e:
            log.warning(e)
            continue
        if record:
            records.append(record)
    return records
def _parse_single_book (
self , id : str , generic_cover : str = " "
) - > Optional [ MetaRecord ] :
def _get_book_id_list_from_html(self, query: str) -> List[str]:
    """Return up to ten book ids scraped from the HTML search page.

    Requests ``SEARCH_URL`` with ``cat=1001`` and ``q=query``, selects the
    elements matching ``COVER_XPATH`` and extracts the ``id`` group of
    ``ID_PATTERN`` from each element's ``onclick`` attribute.

    Args:
        query: Search string sent as the ``q`` parameter.

    Returns:
        The ids found among the first ten matching elements, in document
        order. An empty list when the request fails, the page cannot be
        parsed or no id is found.
    """
    try:
        r = self.session.get(self.SEARCH_URL,
                             params={
                                 "cat": 1001,
                                 "q": query
                             })
        r.raise_for_status()
    except Exception as e:
        log.warning(e)
        return []

    try:
        html = etree.HTML(r.content.decode("utf8"))
    except Exception as e:
        # Undecodable or unparsable content is handled like a failed request.
        log.warning(e)
        return []
    if html is None:
        # NOTE(review): presumably what the parser yields for an empty
        # document — confirm against the lxml version in use.
        return []

    book_ids = []
    for item in html.xpath(self.COVER_XPATH)[:10]:
        # Elements without an onclick attribute give None; search "" instead
        # so the regex does not raise TypeError.
        found = self.ID_PATTERN.search(item.get("onclick") or "")
        if found:
            book_ids.append(found.group("id"))
    return book_ids
def _get_book_id_list_from_json ( self , query : str ) - > List [ str ] :
try :
r = self . session . get ( self . SEARCH_JSON_URL ,
params = {
" cat " : 1001 ,
" q " : query
} )
r . raise_for_status ( )
except Exception as e :
log . warning ( e )
return [ ]
results = r . json ( )
if results [ " total " ] == 0 :
return [ ]
return [
self . ID_PATTERN . search ( item ) . group ( " id " )
for item in results [ " items " ] [ : 10 ] if self . ID_PATTERN . search ( item )
]
def _parse_single_book ( self ,
id : str ,
generic_cover : str = " " ) - > Optional [ MetaRecord ] :
url = f " https://book.douban.com/subject/ { id } / "
log . debug ( f " start parsing { url } " )
try :
r = self . session . get ( url )
@ -136,7 +172,8 @@ class Douban(Metadata):
html = etree . HTML ( r . content . decode ( " utf8 " ) )
match . title = html . xpath ( self . TITTLE_XPATH ) [ 0 ] . text
match . cover = html . xpath ( self . COVER_XPATH ) [ 0 ] . attrib [ " href " ] or generic_cover
match . cover = html . xpath (
self . COVER_XPATH ) [ 0 ] . attrib [ " href " ] or generic_cover
try :
rating_num = float ( html . xpath ( self . RATING_XPATH ) [ 0 ] . text . strip ( ) )
except Exception :
@ -146,35 +183,39 @@ class Douban(Metadata):
tag_elements = html . xpath ( self . TAGS_XPATH )
if len ( tag_elements ) :
match . tags = [ tag_element . text for tag_element in tag_elements ]
else :
match . tags = self . _get_tags ( html . text )
description_element = html . xpath ( self . DESCRIPTION_XPATH )
if len ( description_element ) :
match . description = html2text ( etree . tostring (
description_element[ - 1 ] , encoding = " utf8 " ) . decode ( " utf8 " ) )
match . description = html2text (
etree. tostring ( description_element[ - 1 ] ) . decode ( " utf8 " ) )
info = html . xpath ( self . INFO_XPATH )
for element in info :
text = element . text
if self . AUTHORS_PATTERN . search ( text ) :
nex t = element . getnext ( )
while next is not None and nex t. tag != " br " :
match . authors . append ( nex t. text )
next = nex t. getnext ( )
next_elemen t = element . getnext ( )
while next_element is not None and next_elemen t. tag != " br " :
match . authors . append ( next_elemen t. text )
next_element = next_elemen t. getnext ( )
elif self . PUBLISHER_PATTERN . search ( text ) :
match . publisher = element . tail . strip ( )
if publisher := element . tail . strip ( ) :
match . publisher = publisher
else :
match . publisher = element . getnext ( ) . text
elif self . SUBTITLE_PATTERN . search ( text ) :
match . title = f ' { match . title } : ' + element . tail . strip ( )
match . title = f ' { match . title } : { element . tail . strip ( ) } '
elif self . PUBLISHED_DATE_PATTERN . search ( text ) :
match . publishedDate = self . _clean_date ( element . tail . strip ( ) )
elif self . S UBTITLE _PATTERN. search ( text ) :
elif self . S ERIES _PATTERN. search ( text ) :
match . series = element . getnext ( ) . text
elif i_type := self . IDENTIFIERS_PATTERN . search ( text ) :
match . identifiers [ i_type . group ( ) ] = element . tail . strip ( )
return match
def _clean_date ( self , date : str ) - > str :
"""
Clean up the date string to be in the format YYYY - MM - DD
@ -194,13 +235,24 @@ class Douban(Metadata):
if date [ i ] . isdigit ( ) :
digit . append ( date [ i ] )
elif digit :
ls . append ( " " . join ( digit ) if len ( digit ) == 2 else f " 0 { digit [ 0 ] } " )
ls . append ( " " . join ( digit ) if len ( digit ) ==
2 else f " 0 { digit [ 0 ] } " )
digit = [ ]
if digit :
ls . append ( " " . join ( digit ) if len ( digit ) == 2 else f " 0 { digit [ 0 ] } " )
ls . append ( " " . join ( digit ) if len ( digit ) ==
2 else f " 0 { digit [ 0 ] } " )
moon = ls [ 0 ]
if len ( ls ) > 1 :
day = ls [ 1 ]
if len ( ls ) > 1 :
day = ls [ 1 ]
return f " { year } - { moon } - { day } "
def _get_tags ( self , text : str ) - > List [ str ] :
tags = [ ]
if criteria := self . CRITERIA_PATTERN . search ( text ) :
tags . extend (
item . replace ( ' 7: ' , ' ' ) for item in criteria . group ( ) . split ( ' | ' )
if item . startswith ( ' 7: ' ) )
return tags