# -*- coding: utf-8 -*-

# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
#   Copyright (C) 2021 OzzieIsaacs
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import json
import re
from typing import Dict, List
from urllib.parse import quote

import requests
from lxml.html import fromstring, tostring

from cps.services.Metadata import Metadata


def get_int_or_float(v):
    # Return an int when the value is a whole number, otherwise a float,
    # e.g. "5" -> 5 but "5.5" -> 5.5.
    number_as_float = float(v)
    number_as_int = int(number_as_float)
    return number_as_int if number_as_float == number_as_int else number_as_float


def strip_accents(s):
    # Transliterate accented (mostly Polish and Hungarian) characters to
    # their ASCII equivalents.
    if s is None:
        return s
    else:
        symbols = (
            "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
            "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
        )
        tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)])
        return s.translate(tr)  # .lower()


def sanitize_comments_html(html):
    from markdown2 import Markdown

    # Round-trip the description through Markdown to strip unwanted markup.
    text = html2text(html)
    md = Markdown()
    html = md.convert(text)
    return html


def html2text(html):
    from html2text import HTML2Text

    # Replace <u> tags with <span>, as <u> becomes emphasis in html2text.
    if isinstance(html, bytes):
        html = html.decode("utf-8")
    html = re.sub(
        r"<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>",
        r"<\g<solidus>span\g<rest>>",
        html,
    )
    h2t = HTML2Text()
    h2t.body_width = 0
    h2t.single_line_break = True
    h2t.emphasis_mark = "*"
    return h2t.handle(html)


class LubimyCzytac(Metadata):
    __name__ = "LubimyCzytac.pl"
    __id__ = "lubimyczytac"

    BASE_URL = "https://lubimyczytac.pl"

    BOOK_SEARCH_RESULT_XPATH = (
        "*//div[@class='listSearch']//div[@class='authorAllBooks__single']"
    )
    SINGLE_BOOK_RESULT_XPATH = ".//div[contains(@class,'authorAllBooks__singleText')]"
    TITLE_PATH = "/div/a[contains(@class,'authorAllBooks__singleTextTitle')]"
    TITLE_TEXT_PATH = f"{TITLE_PATH}//text()"
    URL_PATH = f"{TITLE_PATH}/@href"
    AUTHORS_PATH = "/div/a[contains(@href,'autor')]//text()"

    SIBLINGS = "/following-sibling::dd"

    CONTAINER = "//section[@class='container book']"
    PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
    LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
    DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
    SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]"

    DETAILS = "//div[@id='book-details']"
    PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
    FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
    FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
    TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"

    RATING = "//meta[@property='books:rating:value']/@content"
    COVER = "//meta[@property='og:image']/@content"
    SUMMARY = "//script[@type='application/ld+json']//text()"

    def search(self, query, __):
        if self.active:
            result = requests.get(self._prepare_query(title=query))
            root = fromstring(result.text)
            matches = self._parse_search_results(root=root)
            if matches:
                for ind, match in enumerate(matches):
                    matches[ind] = self._parse_single_book(match=match)
            return matches
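
    # Illustration of the URL shape _prepare_query() builds (a traced example,
    # not verified against the live site): for the title
    # "Stary człowiek i morze" the title tokens are percent-encoded (UTF-8)
    # and joined with %20, giving
    # https://lubimyczytac.pl/szukaj/ksiazki?phrase=Stary%20cz%C5%82owiek%20i%20morze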
str: query = "" characters_to_remove = "\?()\/" pattern = "[" + characters_to_remove + "]" title = re.sub(pattern, "", title) title = title.replace("_", " ") if '"' in title or ",," in title: title = title.split('"')[0].split(",,")[0] if "/" in title: title_tokens = [ token for token in title.lower().split(" ") if len(token) > 1 ] else: title_tokens = list( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True) ) if title_tokens: tokens = [quote(t.encode("utf-8")) for t in title_tokens] query = query + "%20".join(tokens) if not query: return "" return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}" def _parse_search_results(self, root) -> List[Dict]: matches = [] results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH) for result in results: title = result.xpath( f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.TITLE_TEXT_PATH}" ) book_url = result.xpath( f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}" ) authors = result.xpath( f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.AUTHORS_PATH}" ) if not title or not book_url or not authors: continue title = title[0].strip() book_url = LubimyCzytac.BASE_URL + book_url[0] book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split( "/" )[0] matches.append( {"id": book_id, "title": title, "authors": authors, "url": book_url} ) return matches def _parse_single_book(self, match: Dict) -> Dict: url = match.get("url") result = requests.get(url) root = fromstring(result.text) match["series"], match["series_index"] = self._parse_series(root=root) match["tags"] = self._parse_tags(root=root) match["publisher"] = self._parse_publisher(root=root) match["publishedDate"] = self._parse_from_summary( root=root, attribute_name="datePublished" ) match["rating"] = self._parse_rating(root=root) match["description"] = self._parse_description(root=root) match["cover"] = self._parse_cover(root=root) match["source"] = { "id": self.__id__, "description": self.__name__, "link": LubimyCzytac.BASE_URL, } match['languages'] = self._parse_languages(root=root) match["identifiers"] = { "isbn": self._parse_isbn(root=root), "lubimyczytac": match["id"], } return match def _parse_cover(self, root): imgcol_node = root.xpath('//meta[@property="og:image"]/@content') if imgcol_node: img_url = imgcol_node[0] return img_url def _parse_publisher(self, root): publisher = root.xpath(LubimyCzytac.PUBLISHER) if publisher: return publisher[0] else: return None def _parse_languages(self, root): lang = root.xpath(LubimyCzytac.LANGUAGES) languages = list() if lang: lang = lang[0].strip() if "polski" in lang: languages.append("Polish") if "angielski" in lang: languages.append("English") if not languages: return ['Polish'] return languages def _parse_series(self, root): try: series_node = root.xpath(LubimyCzytac.SERIES) if series_node: series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()") if series_lst: series_txt = series_lst else: series_txt = None else: return (None, None) if series_txt: ser_string = [series_txt[0].replace("\n", "").strip()] ser_nazwa = ser_string for ser in ser_string: if "tom " in ser: ser_info = ser.split(" (tom ", 1) ser_nazwa = ser.split(" (tom ")[0] break if ser_info: series_index_unicode = ser_info[1] series_index_string = str( series_index_unicode.replace(" ", "").replace(")", "") ) # Sprawdzamy, czy cykl nie jest kompletem/pakietem tomów, np. 
1-3 if "-" in series_index_string: series_index_string_temp = series_index_string.split("-", 1) series_index_string = series_index_string_temp[0] if series_index_string.replace(".", "").isdigit() is True: series_index = get_int_or_float(series_index_string) else: series_index = 0 else: series_index = 0 series = ser_nazwa return (series, series_index) except: return (None, None) def _parse_tags(self, root): tags = None try: tags_from_genre = root.xpath(LubimyCzytac.TAGS) if tags_from_genre: tags = tags_from_genre tags = [w.replace(", itd.", " itd.") for w in tags] return tags else: return None except: return tags def _parse_from_summary(self, root, attribute_name: str) -> str: data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0]) value = data.get(attribute_name) return value.strip() if value is not None else value def _parse_rating(self, root): rating_node = root.xpath(LubimyCzytac.RATING) if rating_node: rating_value = round(float((rating_node[0]).replace(",", ".")) / 2) return rating_value return None def _parse_date(self, root, xpath="first_publish"): options = { "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE, "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL, } path = options.get(xpath) from dateutil import parser data = root.xpath(path) if data: first_pub_date = data[0].strip() return parser.parse(first_pub_date) return None def _parse_isbn(self, root): isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0] return isbn_node def _parse_description(self, root): comments = "" description_node = root.xpath(LubimyCzytac.DESCRIPTION) if description_node: for zrodla in root.xpath('//p[@class="source"]'): zrodla.getparent().remove(zrodla) comments = tostring(description_node[0], method="html") comments = sanitize_comments_html(comments) else: # try description_node = root.xpath('//meta[@property="og:description"]/@content') if description_node: comments = description_node[0] comments = sanitize_comments_html(comments) pages = self._parse_from_summary(root=root, attribute_name="numberOfPages") if pages: comments += f'

Książka ma {pages} stron(y).

' first_publish_date = self._parse_date(root=root) if first_publish_date: comments += f'

Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}

' first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl") if first_publish_date_pl: comments += f'

Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}

' return comments def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False): """ Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py. """ # strip sub-titles if strip_subtitle: subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)") if len(subtitle.sub("", title)) > 1: title = subtitle.sub("", title) title_patterns = [ (re.compile(pat, re.IGNORECASE), repl) for pat, repl in [ # Remove things like: (2010) (Omnibus) etc. ( r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]", "", ), # Remove any strings that contain the substring edition inside # parentheses (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""), # Remove commas used a separators in numbers (r"(\d+),(\d+)", r"\1\2"), # Remove hyphens only if they have whitespace before them (r"(\s-)", " "), # Replace other special chars with a space (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "), ] ] for pat, repl in title_patterns: title = pat.sub(repl, title) tokens = title.split() for token in tokens: token = token.strip().strip('"').strip("'") if token and ( not strip_joiners or token.lower() not in ("a", "and", "the", "&") ): yield token
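

# A minimal smoke test for this provider; a sketch only. It assumes that the
# cps.services.Metadata base class sets ``self.active`` to True in its
# constructor and that lubimyczytac.pl is reachable over the network; neither
# assumption is verified here.
if __name__ == "__main__":
    provider = LubimyCzytac()
    for book in provider.search("Lalka", None) or []:
        print(book["title"], "-", book["url"])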