>",
html,
)
h2t = HTML2Text()
h2t.body_width = 0
h2t.single_line_break = True
h2t.emphasis_mark = "*"
return h2t.handle(html)
class LubimyCzytac(Metadata):
    """Metadata provider that scrapes book information from lubimyczytac.pl.

    A search fetches the site's search-result page, extracts the basic
    record (id, title, authors, url) of every hit and then downloads each
    book's detail page to fill in series, tags, publisher, rating,
    description, cover, languages and identifiers.
    """

    __name__ = "LubimyCzytac.pl"
    __id__ = "lubimyczytac"

    BASE_URL = "https://lubimyczytac.pl"
    # Seconds to wait for the remote site before a request is abandoned.
    REQUEST_TIMEOUT = 15

    # XPath expressions used on the search-result page.
    BOOK_SEARCH_RESULT_XPATH = (
        "*//div[@class='listSearch']//div[@class='authorAllBooks__single']"
    )
    SINGLE_BOOK_RESULT_XPATH = ".//div[contains(@class,'authorAllBooks__singleText')]"
    TITLE_PATH = "/div/a[contains(@class,'authorAllBooks__singleTextTitle')]"
    TITLE_TEXT_PATH = f"{TITLE_PATH}//text()"
    URL_PATH = f"{TITLE_PATH}/@href"
    AUTHORS_PATH = "/div/a[contains(@href,'autor')]//text()"

    # XPath expressions used on a single book's detail page.
    SIBLINGS = "/following-sibling::dd"
    CONTAINER = "//section[@class='container book']"
    PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
    LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
    DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
    SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]"
    DETAILS = "//div[@id='book-details']"
    # Deliberately unterminated fragment: the two constants below append the
    # rest of the title text and close the predicate.
    PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
    FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
    FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
    TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"
    RATING = "//meta[@property='books:rating:value']/@content"
    COVER = "//meta[@property='og:image']/@content"
    SUMMARY = "//script[@type='application/ld+json']//text()"

    def search(self, query, __):
        """Search the site for ``query`` and return fully populated records.

        Args:
            query: Title text to search for.
            __: Unused positional argument kept for interface compatibility.

        Returns:
            ``None`` when the provider is not active, otherwise a list of
            result dictionaries (empty when nothing searchable remains of
            the query or the site returned no hits).
        """
        if not self.active:
            return None
        search_url = self._prepare_query(title=query)
        if not search_url:
            # The title reduced to no tokens; an empty URL would make
            # requests.get raise instead of yielding "no results".
            return []
        result = requests.get(search_url, timeout=self.REQUEST_TIMEOUT)
        root = fromstring(result.text)
        matches = self._parse_search_results(root=root)
        return [self._parse_single_book(match=match) for match in matches]

    def _prepare_query(self, title: str) -> str:
        """Build the search URL for ``title``.

        Returns:
            The complete search URL, or an empty string when no usable
            token is left after cleaning the title.
        """
        # Remove characters that are not wanted in the search phrase.
        title = re.sub(r"[?()/]", "", title)
        title = title.replace("_", " ")
        # Keep only the part in front of the first quotation mark / ",,".
        title = title.split('"')[0].split(",,")[0]

        tokens = [
            quote(token.encode("utf-8"))
            for token in self.get_title_tokens(
                title, strip_joiners=False, strip_subtitle=True
            )
        ]
        if not tokens:
            return ""
        phrase = "%20".join(tokens)
        return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={phrase}"

    def _parse_search_results(self, root) -> List[Dict]:
        """Extract the basic record of every hit on a search-result page.

        Args:
            root: Parsed document of the search-result page.

        Returns:
            A list of dictionaries with the keys ``id``, ``title``,
            ``authors`` and ``url``. Hits that lack a title, a link or
            authors are skipped.
        """
        single_book = LubimyCzytac.SINGLE_BOOK_RESULT_XPATH
        title_xpath = f"{single_book}{LubimyCzytac.TITLE_TEXT_PATH}"
        url_xpath = f"{single_book}{LubimyCzytac.URL_PATH}"
        authors_xpath = f"{single_book}{LubimyCzytac.AUTHORS_PATH}"
        book_url_prefix = f"{LubimyCzytac.BASE_URL}/ksiazka/"

        matches = []
        for result in root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH):
            title = result.xpath(title_xpath)
            book_url = result.xpath(url_xpath)
            authors = result.xpath(authors_xpath)
            if not title or not book_url or not authors:
                continue
            title = title[0].strip()
            book_url = LubimyCzytac.BASE_URL + book_url[0]
            # The id is the first path segment after "/ksiazka/".
            book_id = book_url.replace(book_url_prefix, "").split("/")[0]
            matches.append(
                {"id": book_id, "title": title, "authors": authors, "url": book_url}
            )
        return matches

    def _parse_single_book(self, match: Dict) -> Dict:
        """Download a book's detail page and add its metadata to ``match``.

        Args:
            match: Record produced by ``_parse_search_results``; it is
                updated in place.

        Returns:
            The same ``match`` dictionary, now fully populated.
        """
        url = match.get("url")
        result = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        root = fromstring(result.text)
        match["series"], match["series_index"] = self._parse_series(root=root)
        match["tags"] = self._parse_tags(root=root)
        match["publisher"] = self._parse_publisher(root=root)
        match["publishedDate"] = self._parse_from_summary(
            root=root, attribute_name="datePublished"
        )
        match["rating"] = self._parse_rating(root=root)
        match["description"] = self._parse_description(root=root)
        match["cover"] = self._parse_cover(root=root)
        match["source"] = {
            "id": self.__id__,
            "description": self.__name__,
            "link": LubimyCzytac.BASE_URL,
        }
        match["languages"] = self._parse_languages(root=root)
        match["identifiers"] = {
            "isbn": self._parse_isbn(root=root),
            "lubimyczytac": match["id"],
        }
        return match

    def _parse_cover(self, root):
        """Return the cover image URL from the ``og:image`` meta tag, or ``None``."""
        cover_nodes = root.xpath(LubimyCzytac.COVER)
        return cover_nodes[0] if cover_nodes else None

    def _parse_publisher(self, root):
        """Return the publisher name, or ``None`` when the page has none."""
        publisher = root.xpath(LubimyCzytac.PUBLISHER)
        return publisher[0] if publisher else None

    def _parse_languages(self, root):
        """Return the list of language names detected on the page.

        Only Polish and English are recognised; when neither is found the
        result defaults to ``['Polish']``.
        """
        languages = []
        lang = root.xpath(LubimyCzytac.LANGUAGES)
        if lang:
            lang = lang[0].strip()
            if "polski" in lang:
                languages.append("Polish")
            if "angielski" in lang:
                languages.append("English")
        return languages or ["Polish"]

    def _parse_series(self, root):
        """Return ``(series_name, series_index)`` for the book.

        The series link text is expected to look like ``"Name (tom 3)"``.
        A series without a volume marker yields index ``0``; a page
        without any series yields ``(None, None)``.
        """
        series_texts = root.xpath(f"{LubimyCzytac.SERIES}/text()")
        if not series_texts:
            return (None, None)

        series_text = series_texts[0].replace("\n", "").strip()
        if " (tom " not in series_text:
            # Series is present but carries no volume number.
            return (series_text, 0)

        series_name, index_text = series_text.split(" (tom ", 1)
        index_text = index_text.replace(" ", "").replace(")", "")
        # A bundle of volumes is listed as a range, e.g. "1-3"; use the
        # first volume of the range.
        index_text = index_text.split("-", 1)[0]

        series_index = 0
        if index_text.replace(".", "").isdigit():
            try:
                series_index = get_int_or_float(index_text)
            except (TypeError, ValueError):
                # NOTE(review): assumes get_int_or_float signals malformed
                # input (e.g. "1.2.3") with one of these - confirm.
                series_index = 0
        return (series_name, series_index)

    def _parse_tags(self, root):
        """Return the genre tags from the breadcrumb navigation, or ``None``."""
        tags = root.xpath(LubimyCzytac.TAGS)
        if not tags:
            return None
        # Drop the comma in front of "itd." inside a tag name.
        return [tag.replace(", itd.", " itd.") for tag in tags]

    def _parse_from_summary(self, root, attribute_name: str):
        """Read one attribute from the page's JSON-LD summary block.

        Args:
            root: Parsed document of the book page.
            attribute_name: Key to look up in the JSON-LD object.

        Returns:
            The value (stripped when it is a string), or ``None`` when the
            block is missing, is not valid JSON, or lacks the key.
        """
        summary_nodes = root.xpath(LubimyCzytac.SUMMARY)
        if not summary_nodes:
            return None
        try:
            data = json.loads(summary_nodes[0])
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            return None
        if not isinstance(data, dict):
            return None
        value = data.get(attribute_name)
        return value.strip() if isinstance(value, str) else value

    def _parse_rating(self, root):
        """Return the rating halved and rounded to an integer, or ``None``."""
        rating_node = root.xpath(LubimyCzytac.RATING)
        if not rating_node:
            return None
        try:
            # The site may use a decimal comma.
            return round(float(rating_node[0].replace(",", ".")) / 2)
        except ValueError:
            return None

    def _parse_date(self, root, xpath="first_publish"):
        """Return a first-publication date found on the page.

        Args:
            root: Parsed document of the book page.
            xpath: ``"first_publish"`` for the original first edition or
                ``"first_publish_pl"`` for the first Polish edition.

        Returns:
            The parsed ``datetime``, or ``None`` when the date is absent or
            cannot be parsed.

        Raises:
            ValueError: If ``xpath`` is not one of the supported keys.
        """
        from dateutil import parser

        options = {
            "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
            "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
        }
        path = options.get(xpath)
        if path is None:
            raise ValueError(f"Unknown date selector: {xpath!r}")
        data = root.xpath(path)
        if not data:
            return None
        try:
            return parser.parse(data[0].strip())
        except (ValueError, OverflowError):
            # An unparsable date must not abort the whole record.
            return None

    def _parse_isbn(self, root):
        """Return the ISBN from the ``books:isbn`` meta tag, or ``None``."""
        isbn_nodes = root.xpath('//meta[@property="books:isbn"]/@content')
        return isbn_nodes[0] if isbn_nodes else None

    def _parse_description(self, root):
        """Build the description text for the book.

        The main description block is preferred; when it is missing the
        ``og:description`` meta tag is used instead. The page count and
        the first-publication dates are appended when available.
        """
        comments = ""
        description_node = root.xpath(LubimyCzytac.DESCRIPTION)
        if description_node:
            # Remove source-attribution paragraphs before serialising.
            for source_node in root.xpath('//p[@class="source"]'):
                source_node.getparent().remove(source_node)
            comments = tostring(description_node[0], method="html")
            comments = sanitize_comments_html(comments)
        else:
            # Fall back to the og:description meta tag.
            description_node = root.xpath('//meta[@property="og:description"]/@content')
            if description_node:
                comments = description_node[0]
                comments = sanitize_comments_html(comments)

        # NOTE(review): the three literals below originally contained a raw
        # line break, which is invalid in a single-quoted string; it is
        # written as an explicit "\n" here - confirm whether HTML markup
        # was intended instead.
        pages = self._parse_from_summary(root=root, attribute_name="numberOfPages")
        if pages:
            comments += f"Książka ma {pages} stron(y).\n"

        first_publish_date = self._parse_date(root=root)
        if first_publish_date:
            formatted = first_publish_date.strftime("%d.%m.%Y")
            comments += f"Data pierwszego wydania: {formatted}\n"

        first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl")
        if first_publish_date_pl:
            formatted_pl = first_publish_date_pl.strftime("%d.%m.%Y")
            comments += f"Data pierwszego wydania w Polsce: {formatted_pl}\n"

        return comments

    def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
        """Yield the search tokens of ``title``.

        Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py.

        Args:
            title: Title text to tokenise.
            strip_joiners: When true, drop the tokens "a", "and", "the"
                and "&".
            strip_subtitle: When true, remove bracketed parts and anything
                following "/", ":" or a backslash, unless that would leave
                fewer than two characters.

        Yields:
            Non-empty tokens with surrounding quotes removed.
        """
        # strip sub-titles
        if strip_subtitle:
            subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)")
            stripped = subtitle.sub("", title)
            if len(stripped) > 1:
                title = stripped

        title_patterns = [
            (re.compile(pat, re.IGNORECASE), repl)
            for pat, repl in [
                # Remove things like: (2010) (Omnibus) etc.
                (
                    r"[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]",
                    "",
                ),
                # Remove any bracketed string that contains "edition" or
                # the abbreviation "ed." (the dot is literal).
                (r"[({\[].*?(edition|ed\.).*?[\]})]", ""),
                # Remove commas used as separators in numbers
                (r"(\d+),(\d+)", r"\1\2"),
                # Remove hyphens only if they have whitespace before them
                (r"(\s-)", " "),
                # Replace other special chars with a space; the CJK
                # brackets and curly quotes belong inside the class.
                (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
            ]
        ]

        for pat, repl in title_patterns:
            title = pat.sub(repl, title)

        joiners = ("a", "and", "the", "&")
        for token in title.split():
            token = token.strip().strip('"').strip("'")
            if token and (not strip_joiners or token.lower() not in joiners):
                yield token