From 920acaca99879055821744470a0e85f6ee51ec03 Mon Sep 17 00:00:00 2001 From: collerek Date: Sat, 11 Dec 2021 01:06:04 +0100 Subject: [PATCH 1/6] everything working to refactor --- .gitignore | 1 + cps/metadata_provider/lubimyczytac.py | 373 ++++++++++++++++++++++++++ cps/static/js/get_meta.js | 46 +++- requirements.txt | 2 + 4 files changed, 415 insertions(+), 7 deletions(-) create mode 100644 cps/metadata_provider/lubimyczytac.py diff --git a/.gitignore b/.gitignore index 614e9936..989c7811 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ __pycache__/ .python-version env/ venv/ +p38venv/ eggs/ dist/ executable/ diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py new file mode 100644 index 00000000..aab50bb6 --- /dev/null +++ b/cps/metadata_provider/lubimyczytac.py @@ -0,0 +1,373 @@ +# -*- coding: utf-8 -*- + +# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web) +# Copyright (C) 2021 OzzieIsaacs +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +import json +import re +from typing import Dict, List +from urllib.parse import quote + +import requests +from cps.services.Metadata import Metadata +from lxml.html import fromstring, tostring + + +def get_int_or_float(v): + number_as_float = float(v) + number_as_int = int(number_as_float) + return number_as_int if number_as_float == number_as_int else number_as_float + + +def strip_accents(s): + if s is None: + return s + else: + symbols = ( + "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ", + "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ", + ) + tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)]) + return s.translate(tr) # .lower() + + +def sanitize_comments_html(html): + from markdown2 import Markdown + + text = html2text(html) + md = Markdown() + html = md.convert(text) + return html + + +def html2text(html): + from html2text import HTML2Text + import re + + # replace <u> tags with <span> as <u> becomes emphasis in html2text + if isinstance(html, bytes): + html = html.decode("utf-8") + html = re.sub( + r"<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>", + r"<\g<solidus>span\g<rest>>", + html, + ) + h2t = HTML2Text() + h2t.body_width = 0 + h2t.single_line_break = True + h2t.emphasis_mark = "*" + return h2t.handle(html) + + +class LubimyCzytac(Metadata): + __name__ = "LubimyCzytac.pl" + __id__ = "lubimyczytac" + + BASE_URL = "https://lubimyczytac.pl" + + BOOK_SEARCH_RESULT_XPATH = ( + "*//div[@class='listSearch']//div[@class='authorAllBooks__single']" + ) + SINGLE_BOOK_RESULT_XPATH = ".//div[contains(@class,'authorAllBooks__singleText')]" + TITLE_PATH = "/div/a[contains(@class,'authorAllBooks__singleTextTitle')]" + TITLE_TEXT_PATH = f"{TITLE_PATH}//text()" + URL_PATH = f"{TITLE_PATH}/@href" + AUTHORS_PATH = "/div/a[contains(@href,'autor')]//text()" + + SIBLINGS = "/following-sibling::dd" + + CONTAINER = "//section[@class='container book']" + PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()" + LANGUAGES = 
f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()" + DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']" + SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]" + + DETAILS = "//div[@id='book-details']" + PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania" + FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()" + FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()" + TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()" + RATING = "//meta[@property='books:rating:value']/@content" + COVER = "//meta[@property='og:image']/@content" + + SUMMARY = "//script[@type='application/ld+json']//text()" + + def search(self, query, __): + if self.active: + result = requests.get(self._prepare_query(title=query)) + root = fromstring(result.text) + matches = self._parse_search_results(root=root) + if matches: + for ind, match in enumerate(matches): + matches[ind] = self._parse_single_book(match=match) + return matches + + def _prepare_query(self, title: str) -> str: + query = "" + characters_to_remove = "\?()\/" + pattern = "[" + characters_to_remove + "]" + title = re.sub(pattern, "", title) + title = title.replace("_", " ") + if '"' in title or ",," in title: + title = title.split('"')[0].split(",,")[0] + + if "/" in title: + title_tokens = [ + token for token in title.lower().split(" ") if len(token) > 1 + ] + else: + title_tokens = list( + self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True) + ) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = query + "%20".join(tokens) + if not query: + return "" + return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}" + + def _parse_search_results(self, root) -> List[Dict]: + matches = [] + results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH) + for result in results: + title = result.xpath( + f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" + f"{LubimyCzytac.TITLE_TEXT_PATH}" + ) + book_url = result.xpath( + f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}" + ) + authors = result.xpath( + f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" + f"{LubimyCzytac.AUTHORS_PATH}" + ) + + if not title or not book_url or not authors: + continue + title = title[0].strip() + book_url = LubimyCzytac.BASE_URL + book_url[0] + book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split( + "/" + )[0] + matches.append( + {"id": book_id, "title": title, "authors": authors, "url": book_url} + ) + return matches + + def _parse_single_book(self, match: Dict) -> Dict: + url = match.get("url") + result = requests.get(url) + root = fromstring(result.text) + match["series"], match["series_index"] = self._parse_series(root=root) + match["tags"] = self._parse_tags(root=root) + match["publisher"] = self._parse_publisher(root=root) + match["publishedDate"] = self._parse_from_summary( + root=root, attribute_name="datePublished" + ) + match["rating"] = self._parse_rating(root=root) + match["description"] = self._parse_description(root=root) + match["cover"] = self._parse_cover(root=root) + match["source"] = { + "id": self.__id__, + "description": self.__name__, + "link": LubimyCzytac.BASE_URL, + } + match['languages'] = self._parse_languages(root=root) + match["identifiers"] = { + "isbn": self._parse_isbn(root=root), + "lubimyczytac": match["id"], + } + return match + + def _parse_cover(self, root): + imgcol_node = root.xpath('//meta[@property="og:image"]/@content') + if imgcol_node: + 
img_url = imgcol_node[0] + return img_url + + def _parse_publisher(self, root): + publisher = root.xpath(LubimyCzytac.PUBLISHER) + if publisher: + return publisher[0] + else: + return None + + def _parse_languages(self, root): + lang = root.xpath(LubimyCzytac.LANGUAGES) + languages = list() + if lang: + lang = lang[0].strip() + if "polski" in lang: + languages.append("Polish") + if "angielski" in lang: + languages.append("English") + if not languages: + return ['Polish'] + return languages + + def _parse_series(self, root): + try: + series_node = root.xpath(LubimyCzytac.SERIES) + if series_node: + series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()") + if series_lst: + series_txt = series_lst + else: + series_txt = None + else: + return (None, None) + + if series_txt: + ser_string = [series_txt[0].replace("\n", "").strip()] + ser_nazwa = ser_string + for ser in ser_string: + if "tom " in ser: + ser_info = ser.split(" (tom ", 1) + ser_nazwa = ser.split(" (tom ")[0] + break + + if ser_info: + series_index_unicode = ser_info[1] + series_index_string = str( + series_index_unicode.replace(" ", "").replace(")", "") + ) + # Sprawdzamy, czy cykl nie jest kompletem/pakietem tomów, np. 1-3 + if "-" in series_index_string: + series_index_string_temp = series_index_string.split("-", 1) + series_index_string = series_index_string_temp[0] + if series_index_string.replace(".", "").isdigit() is True: + series_index = get_int_or_float(series_index_string) + else: + series_index = 0 + else: + series_index = 0 + series = ser_nazwa + return (series, series_index) + except: + return (None, None) + + def _parse_tags(self, root): + tags = None + try: + tags_from_genre = root.xpath(LubimyCzytac.TAGS) + if tags_from_genre: + tags = tags_from_genre + tags = [w.replace(", itd.", " itd.") for w in tags] + return tags + else: + return None + except: + return tags + + def _parse_from_summary(self, root, attribute_name: str) -> str: + data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0]) + value = data.get(attribute_name) + return value.strip() if value is not None else value + + def _parse_rating(self, root): + rating_node = root.xpath(LubimyCzytac.RATING) + if rating_node: + rating_value = round(float((rating_node[0]).replace(",", ".")) / 2) + return rating_value + return None + + def _parse_date(self, root, xpath="first_publish"): + options = { + "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE, + "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL, + } + path = options.get(xpath) + from dateutil import parser + + data = root.xpath(path) + if data: + first_pub_date = data[0].strip() + return parser.parse(first_pub_date) + return None + + def _parse_isbn(self, root): + isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0] + return isbn_node + + def _parse_description(self, root): + comments = "" + description_node = root.xpath(LubimyCzytac.DESCRIPTION) + if description_node: + for zrodla in root.xpath('//p[@class="source"]'): + zrodla.getparent().remove(zrodla) + comments = tostring(description_node[0], method="html") + comments = sanitize_comments_html(comments) + + else: + # try + description_node = root.xpath('//meta[@property="og:description"]/@content') + if description_node: + comments = description_node[0] + comments = sanitize_comments_html(comments) + + pages = self._parse_from_summary(root=root, attribute_name="numberOfPages") + if pages: + comments += f'

<p>Książka ma {pages} stron(y).</p>

' + + first_publish_date = self._parse_date(root=root) + if first_publish_date: + comments += f'

<p>Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}</p>

' + + first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl") + if first_publish_date_pl: + comments += f'

<p>Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}</p>

' + + return comments + + def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False): + """ + Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py. + """ + # strip sub-titles + if strip_subtitle: + subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)") + if len(subtitle.sub("", title)) > 1: + title = subtitle.sub("", title) + + title_patterns = [ + (re.compile(pat, re.IGNORECASE), repl) + for pat, repl in [ + # Remove things like: (2010) (Omnibus) etc. + ( + r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]", + "", + ), + # Remove any strings that contain the substring edition inside + # parentheses + (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""), + # Remove commas used a separators in numbers + (r"(\d+),(\d+)", r"\1\2"), + # Remove hyphens only if they have whitespace before them + (r"(\s-)", " "), + # Replace other special chars with a space + (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "), + ] + ] + + for pat, repl in title_patterns: + title = pat.sub(repl, title) + + tokens = title.split() + for token in tokens: + token = token.strip().strip('"').strip("'") + if token and ( + not strip_joiners or token.lower() not in ("a", "and", "the", "&") + ): + yield token diff --git a/cps/static/js/get_meta.js b/cps/static/js/get_meta.js index 51ab740d..a8643065 100644 --- a/cps/static/js/get_meta.js +++ b/cps/static/js/get_meta.js @@ -26,19 +26,26 @@ $(function () { ) }; - function populateForm (book) { - tinymce.get("description").setContent(book.description); - var uniqueTags = $.map($("#tags").val().split(","), $.trim); - if ( uniqueTags.length == 1 && uniqueTags[0] == "") { - uniqueTags = []; + function getUniqueValues(attribute_name, book){ + var presentArray = $.map($("#"+attribute_name).val().split(","), $.trim); + if ( presentArray.length === 1 && presentArray[0] === "") { + presentArray = []; } - $.each(book.tags, function(i, el) { - if ($.inArray(el, uniqueTags) === -1) uniqueTags.push(el); + $.each(book[attribute_name], function(i, el) { + if ($.inArray(el, presentArray) === -1) presentArray.push(el); }); + return presentArray + } + + function populateForm (book) { + tinymce.get("description").setContent(book.description); + var uniqueTags = getUniqueValues('tags', book) + var uniqueLanguages = getUniqueValues('languages', book) var ampSeparatedAuthors = (book.authors || []).join(" & "); $("#bookAuthor").val(ampSeparatedAuthors); $("#book_title").val(book.title); $("#tags").val(uniqueTags.join(", ")); + $("#languages").val(uniqueLanguages.join(", ")); $("#rating").data("rating").setValue(Math.round(book.rating)); if(book.cover !== null){ $(".cover img").attr("src", book.cover); @@ -48,7 +55,32 @@ $(function () { $("#publisher").val(book.publisher); if (typeof book.series !== "undefined") { $("#series").val(book.series); + $("#series_index").val(book.series_index); } + if (typeof book.identifiers !== "undefined") { + populateIdentifiers(book.identifiers) + } + } + + function populateIdentifiers(identifiers){ + for (const property in identifiers) { + console.log(`${property}: ${identifiers[property]}`); + if ($('input[name="identifier-type-'+property+'"]').length) { + $('input[name="identifier-val-'+property+'"]').val(identifiers[property]) + } + else { + addIdentifier(property, identifiers[property]) + } + } + } + + function addIdentifier(name, value){ + var line = '<tr>'; + line += '<td><input type="text" class="form-control" name="identifier-type-'+ name +'" required="required" placeholder="' + _("Identifier Type") + '" value="'+ name +'"></td>'; + line += '<td><input type="text" class="form-control" name="identifier-val-'+ name +'" required="required" placeholder="' + _("Identifier Value") + '" value="'+ value +'"></td>'; + line += '<td><a class="btn btn-default" onclick="removeIdentifierLine(this)">'+_("Remove")+'</a></td>'; + line += '</tr>'; + 
$("#identifier-table").append(line); } function doSearch (keyword) { diff --git a/requirements.txt b/requirements.txt index 1db961fe..d1f58a8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,5 @@ Wand>=0.4.4,<0.7.0 unidecode>=0.04.19,<1.3.0 lxml>=3.8.0,<4.7.0 flask-wtf>=0.14.2,<1.1.0 +markdown2==2.4.2 +html2text==2020.1.16 From d55626d4452bf15849b3ead2266a2ca89f8d9c8d Mon Sep 17 00:00:00 2001 From: collerek Date: Mon, 13 Dec 2021 01:23:03 +0100 Subject: [PATCH 2/6] refactor and cleaning --- cps/metadata_provider/comicvine.py | 4 +- cps/metadata_provider/google.py | 12 +- cps/metadata_provider/lubimyczytac.py | 440 +++++++++++++------------- cps/services/Metadata.py | 33 +- requirements.txt | 1 + 5 files changed, 264 insertions(+), 226 deletions(-) diff --git a/cps/metadata_provider/comicvine.py b/cps/metadata_provider/comicvine.py index 8f496608..195e68f8 100644 --- a/cps/metadata_provider/comicvine.py +++ b/cps/metadata_provider/comicvine.py @@ -26,7 +26,7 @@ class ComicVine(Metadata): __name__ = "ComicVine" __id__ = "comicvine" - def search(self, query, __): + def search(self, query, generic_cover=""): val = list() apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6" if self.active: @@ -52,7 +52,7 @@ class ComicVine(Metadata): v['tags'] = ["Comics", seriesTitle] v['rating'] = 0 v['series'] = seriesTitle - v['cover'] = r['image'].get('original_url') + v['cover'] = r['image'].get('original_url', generic_cover) v['source'] = { "id": self.__id__, "description": "ComicVine Books", diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py index f3d02d8e..8be8ad74 100644 --- a/cps/metadata_provider/google.py +++ b/cps/metadata_provider/google.py @@ -17,19 +17,20 @@ # along with this program. If not, see . # Google Books api document: https://developers.google.com/books/docs/v1/using - - import requests + from cps.services.Metadata import Metadata + class Google(Metadata): __name__ = "Google" __id__ = "google" + BASE_URL = "https://www.googleapis.com/books/v1/volumes?q=" - def search(self, query, __): + def search(self, query, generic_cover=""): if self.active: val = list() - result = requests.get("https://www.googleapis.com/books/v1/volumes?q="+query.replace(" ","+")) + result = requests.get(Google.BASE_URL + query.replace(" ","+")) for r in result.json()['items']: v = dict() v['id'] = r['id'] @@ -43,7 +44,8 @@ class Google(Metadata): if r['volumeInfo'].get('imageLinks'): v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://") else: - v['cover'] = "/../../../static/generic_cover.jpg" + # v['cover'] = "/../../../static/generic_cover.jpg" + v['cover'] = generic_cover v['source'] = { "id": self.__id__, "description": "Google Books", diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py index aab50bb6..ee66d1b4 100644 --- a/cps/metadata_provider/lubimyczytac.py +++ b/cps/metadata_provider/lubimyczytac.py @@ -15,47 +15,47 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+import datetime import json import re -from typing import Dict, List +from typing import Dict, Generator, List, Optional, Tuple, Union from urllib.parse import quote import requests -from cps.services.Metadata import Metadata -from lxml.html import fromstring, tostring +from dateutil import parser +from html2text import HTML2Text +from lxml.html import HtmlElement, fromstring, tostring +from markdown2 import Markdown + +from cps.services.Metadata import MetaRecord, Metadata -def get_int_or_float(v): - number_as_float = float(v) +SYMBOLS_TO_TRANSLATE = ( + "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ", + "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ", +) +SYMBOL_TRANSLATION_MAP = dict( + [(ord(a), ord(b)) for (a, b) in zip(*SYMBOLS_TO_TRANSLATE)] +) + + +def get_int_or_float(value: str) -> Union[int, float]: + number_as_float = float(value) number_as_int = int(number_as_float) return number_as_int if number_as_float == number_as_int else number_as_float -def strip_accents(s): - if s is None: - return s - else: - symbols = ( - "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ", - "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ", - ) - tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)]) - return s.translate(tr) # .lower() - +def strip_accents(s: Optional[str]) -> Optional[str]: + return s.translate(SYMBOL_TRANSLATION_MAP) if s is not None else s -def sanitize_comments_html(html): - from markdown2 import Markdown +def sanitize_comments_html(html: str) -> str: text = html2text(html) md = Markdown() html = md.convert(text) return html -def html2text(html): - from html2text import HTML2Text - import re - +def html2text(html: str) -> str: # replace <u> tags with <span> as <u> becomes emphasis in html2text if isinstance(html, bytes): html = html.decode("utf-8") @@ -92,26 +92,36 @@ class LubimyCzytac(Metadata): PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()" LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()" DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']" - SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]" + SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]/text()" DETAILS = "//div[@id='book-details']" PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania" FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()" FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()" TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()" + RATING = "//meta[@property='books:rating:value']/@content" COVER = "//meta[@property='og:image']/@content" + ISBN = "//meta[@property='books:isbn']/@content" + META_TITLE = "//meta[@property='og:description']/@content" SUMMARY = "//script[@type='application/ld+json']//text()" - def search(self, query, __): + def search(self, query: str, generic_cover: str = "") -> Optional[List]: if self.active: result = requests.get(self._prepare_query(title=query)) root = fromstring(result.text) - matches = self._parse_search_results(root=root) + lc_parser = LubimyCzytacParser(root=root, metadata=self) + matches = lc_parser.parse_search_results() if matches: - for ind, match in enumerate(matches): - matches[ind] = self._parse_single_book(match=match) + final_matches = [] + for match in matches: + response = requests.get(match.get("url")) + match = lc_parser.parse_single_book( + match=match, response=response, generic_cover=generic_cover + ) + final_matches.append(match) + return final_matches return matches def _prepare_query(self, title: str) -> str: @@ -128,9 +138,7 
@@ class LubimyCzytac(Metadata): token for token in title.lower().split(" ") if len(token) > 1 ] else: - title_tokens = list( - self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True) - ) + title_tokens = list(self.get_title_tokens(title, strip_joiners=False)) if title_tokens: tokens = [quote(t.encode("utf-8")) for t in title_tokens] query = query + "%20".join(tokens) @@ -138,236 +146,232 @@ class LubimyCzytac(Metadata): return "" return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}" - def _parse_search_results(self, root) -> List[Dict]: + @staticmethod + def get_title_tokens( + title: str, strip_joiners: bool = True + ) -> Generator[str, None, None]: + """ + Taken from calibre source code + """ + title_patterns = [ + (re.compile(pat, re.IGNORECASE), repl) + for pat, repl in [ + # Remove things like: (2010) (Omnibus) etc. + ( + r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|" + r"audiobook|audio\scd|paperback|turtleback|" + r"mass\s*market|edition|ed\.)[\])}]", + "", + ), + # Remove any strings that contain the substring edition inside + # parentheses + (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""), + # Remove commas used a separators in numbers + (r"(\d+),(\d+)", r"\1\2"), + # Remove hyphens only if they have whitespace before them + (r"(\s-)", " "), + # Replace other special chars with a space + (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "), + ] + ] + + for pat, repl in title_patterns: + title = pat.sub(repl, title) + + tokens = title.split() + for token in tokens: + token = token.strip().strip('"').strip("'") + if token and ( + not strip_joiners or token.lower() not in ("a", "and", "the", "&") + ): + yield token + + +class LubimyCzytacParser: + PAGES_TEMPLATE = "

<p>Książka ma {0} stron(y).</p>

" + PUBLISH_DATE_TEMPLATE = "

<p>Data pierwszego wydania: {0}</p>

" + PUBLISH_DATE_PL_TEMPLATE = ( + "

<p>Data pierwszego wydania w Polsce: {0}</p>

" + ) + + def __init__(self, root: HtmlElement, metadata: Metadata) -> None: + self.root = root + self.metadata = metadata + + def parse_search_results(self) -> List[Dict]: matches = [] - results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH) + results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH) for result in results: - title = result.xpath( - f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" - f"{LubimyCzytac.TITLE_TEXT_PATH}" + title = self._parse_xpath_node( + root=result, + xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" + f"{LubimyCzytac.TITLE_TEXT_PATH}", ) - book_url = result.xpath( - f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}" + + book_url = self._parse_xpath_node( + root=result, + xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" + f"{LubimyCzytac.URL_PATH}", ) - authors = result.xpath( - f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" - f"{LubimyCzytac.AUTHORS_PATH}" + authors = self._parse_xpath_node( + root=result, + xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" + f"{LubimyCzytac.AUTHORS_PATH}", + take_first=False, ) - - if not title or not book_url or not authors: + if not all([title, book_url, authors]): continue - title = title[0].strip() - book_url = LubimyCzytac.BASE_URL + book_url[0] - book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split( - "/" - )[0] matches.append( - {"id": book_id, "title": title, "authors": authors, "url": book_url} + { + "id": book_url.replace(f"/ksiazka/", "").split("/")[0], + "title": title, + "authors": [strip_accents(author) for author in authors], + "url": LubimyCzytac.BASE_URL + book_url, + } ) return matches - def _parse_single_book(self, match: Dict) -> Dict: - url = match.get("url") - result = requests.get(url) - root = fromstring(result.text) - match["series"], match["series_index"] = self._parse_series(root=root) - match["tags"] = self._parse_tags(root=root) - match["publisher"] = self._parse_publisher(root=root) + def parse_single_book( + self, match: Dict, response, generic_cover: str + ) -> MetaRecord: + self.root = fromstring(response.text) + match["series"], match["series_index"] = self._parse_series() + match["tags"] = self._parse_tags() + match["publisher"] = self._parse_publisher() match["publishedDate"] = self._parse_from_summary( - root=root, attribute_name="datePublished" + attribute_name="datePublished" ) - match["rating"] = self._parse_rating(root=root) - match["description"] = self._parse_description(root=root) - match["cover"] = self._parse_cover(root=root) + match["rating"] = self._parse_rating() + match["description"] = self._parse_description() + match["cover"] = self._parse_cover(generic_cover=generic_cover) match["source"] = { - "id": self.__id__, - "description": self.__name__, + "id": self.metadata.__id__, + "description": self.metadata.__name__, "link": LubimyCzytac.BASE_URL, } - match['languages'] = self._parse_languages(root=root) + match["languages"] = self._parse_languages() match["identifiers"] = { - "isbn": self._parse_isbn(root=root), + "isbn": self._parse_isbn(), "lubimyczytac": match["id"], } return match - def _parse_cover(self, root): - imgcol_node = root.xpath('//meta[@property="og:image"]/@content') - if imgcol_node: - img_url = imgcol_node[0] - return img_url - - def _parse_publisher(self, root): - publisher = root.xpath(LubimyCzytac.PUBLISHER) - if publisher: - return publisher[0] - else: + def _parse_xpath_node( + self, + xpath: str, + root: HtmlElement = None, + take_first: bool = True, + strip_element: bool = True, + ) -> Optional[Union[str, 
List[str]]]: + root = root if root is not None else self.root + node = root.xpath(xpath) + if not node: return None + return ( + (node[0].strip() if strip_element else node[0]) + if take_first + else [x.strip() for x in node] + ) + + def _parse_cover(self, generic_cover) -> Optional[str]: + return ( + self._parse_xpath_node(xpath=LubimyCzytac.COVER, take_first=True) + or generic_cover + ) + + def _parse_publisher(self) -> Optional[str]: + return self._parse_xpath_node(xpath=LubimyCzytac.PUBLISHER, take_first=True) - def _parse_languages(self, root): - lang = root.xpath(LubimyCzytac.LANGUAGES) + def _parse_languages(self) -> List[str]: languages = list() + lang = self._parse_xpath_node(xpath=LubimyCzytac.LANGUAGES, take_first=True) if lang: - lang = lang[0].strip() if "polski" in lang: languages.append("Polish") if "angielski" in lang: languages.append("English") - if not languages: - return ['Polish'] return languages - def _parse_series(self, root): - try: - series_node = root.xpath(LubimyCzytac.SERIES) - if series_node: - series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()") - if series_lst: - series_txt = series_lst - else: - series_txt = None - else: - return (None, None) - - if series_txt: - ser_string = [series_txt[0].replace("\n", "").strip()] - ser_nazwa = ser_string - for ser in ser_string: - if "tom " in ser: - ser_info = ser.split(" (tom ", 1) - ser_nazwa = ser.split(" (tom ")[0] - break - - if ser_info: - series_index_unicode = ser_info[1] - series_index_string = str( - series_index_unicode.replace(" ", "").replace(")", "") - ) - # Sprawdzamy, czy cykl nie jest kompletem/pakietem tomów, np. 1-3 - if "-" in series_index_string: - series_index_string_temp = series_index_string.split("-", 1) - series_index_string = series_index_string_temp[0] - if series_index_string.replace(".", "").isdigit() is True: - series_index = get_int_or_float(series_index_string) - else: - series_index = 0 - else: - series_index = 0 - series = ser_nazwa - return (series, series_index) - except: - return (None, None) - - def _parse_tags(self, root): - tags = None - try: - tags_from_genre = root.xpath(LubimyCzytac.TAGS) - if tags_from_genre: - tags = tags_from_genre - tags = [w.replace(", itd.", " itd.") for w in tags] - return tags - else: - return None - except: - return tags - - def _parse_from_summary(self, root, attribute_name: str) -> str: - data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0]) - value = data.get(attribute_name) + def _parse_series(self) -> Tuple[Optional[str], Optional[Union[float, int]]]: + series_index = 0 + series = self._parse_xpath_node(xpath=LubimyCzytac.SERIES, take_first=True) + if series: + if "tom " in series: + series_name, series_info = series.split(" (tom ", 1) + series_info = series_info.replace(" ", "").replace(")", "") + # Check if book is not a bundle, i.e. 
chapter 1-3 + if "-" in series_info: + series_info = series_info.split("-", 1)[0] + if series_info.replace(".", "").isdigit() is True: + series_index = get_int_or_float(series_info) + return series_name, series_index + return None, None + + def _parse_tags(self) -> List[str]: + tags = self._parse_xpath_node(xpath=LubimyCzytac.TAGS, take_first=False) + return [ + strip_accents(w.replace(", itd.", " itd.")) + for w in tags + if isinstance(w, str) + ] + + def _parse_from_summary(self, attribute_name: str) -> Optional[str]: + value = None + summary_text = self._parse_xpath_node(xpath=LubimyCzytac.SUMMARY) + if summary_text: + data = json.loads(summary_text) + value = data.get(attribute_name) return value.strip() if value is not None else value - def _parse_rating(self, root): - rating_node = root.xpath(LubimyCzytac.RATING) - if rating_node: - rating_value = round(float((rating_node[0]).replace(",", ".")) / 2) - return rating_value - return None + def _parse_rating(self) -> Optional[str]: + rating = self._parse_xpath_node(xpath=LubimyCzytac.RATING) + return round(float(rating.replace(",", ".")) / 2) if rating else rating - def _parse_date(self, root, xpath="first_publish"): + def _parse_date(self, xpath="first_publish") -> Optional[datetime.datetime]: options = { "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE, "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL, } - path = options.get(xpath) - from dateutil import parser - - data = root.xpath(path) - if data: - first_pub_date = data[0].strip() - return parser.parse(first_pub_date) - return None - - def _parse_isbn(self, root): - isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0] - return isbn_node - - def _parse_description(self, root): - comments = "" - description_node = root.xpath(LubimyCzytac.DESCRIPTION) - if description_node: - for zrodla in root.xpath('//p[@class="source"]'): - zrodla.getparent().remove(zrodla) - comments = tostring(description_node[0], method="html") - comments = sanitize_comments_html(comments) + date = self._parse_xpath_node(xpath=options.get(xpath)) + return parser.parse(date) if date else None - else: - # try - description_node = root.xpath('//meta[@property="og:description"]/@content') - if description_node: - comments = description_node[0] - comments = sanitize_comments_html(comments) + def _parse_isbn(self) -> Optional[str]: + return self._parse_xpath_node(xpath=LubimyCzytac.ISBN) + + def _parse_description(self) -> str: + description = "" + description_node = self._parse_xpath_node( + xpath=LubimyCzytac.DESCRIPTION, strip_element=False + ) + if description_node is not None: + for source in self.root.xpath('//p[@class="source"]'): + source.getparent().remove(source) + description = tostring(description_node, method="html") + description = sanitize_comments_html(description) - pages = self._parse_from_summary(root=root, attribute_name="numberOfPages") + else: + description_node = self._parse_xpath_node(xpath=LubimyCzytac.META_TITLE) + if description_node is not None: + description = description_node + description = sanitize_comments_html(description) + description = self._add_extra_info_to_description(description=description) + return description + + def _add_extra_info_to_description(self, description: str) -> str: + pages = self._parse_from_summary(attribute_name="numberOfPages") if pages: - comments += f'

<p>Książka ma {pages} stron(y).</p>

' + description += LubimyCzytacParser.PAGES_TEMPLATE.format(pages) - first_publish_date = self._parse_date(root=root) + first_publish_date = self._parse_date() if first_publish_date: - comments += f'

<p>Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}</p>

' + description += LubimyCzytacParser.PUBLISH_DATE_TEMPLATE.format( + first_publish_date.strftime("%d.%m.%Y") + ) - first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl") + first_publish_date_pl = self._parse_date(xpath="first_publish_pl") if first_publish_date_pl: - comments += f'

<p>Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}</p>

' - - return comments - - def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False): - """ - Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py. - """ - # strip sub-titles - if strip_subtitle: - subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)") - if len(subtitle.sub("", title)) > 1: - title = subtitle.sub("", title) - - title_patterns = [ - (re.compile(pat, re.IGNORECASE), repl) - for pat, repl in [ - # Remove things like: (2010) (Omnibus) etc. - ( - r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]", - "", - ), - # Remove any strings that contain the substring edition inside - # parentheses - (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""), - # Remove commas used a separators in numbers - (r"(\d+),(\d+)", r"\1\2"), - # Remove hyphens only if they have whitespace before them - (r"(\s-)", " "), - # Replace other special chars with a space - (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "), - ] - ] - - for pat, repl in title_patterns: - title = pat.sub(repl, title) + description += LubimyCzytacParser.PUBLISH_DATE_PL_TEMPLATE.format( + first_publish_date_pl.strftime("%d.%m.%Y") + ) - tokens = title.split() - for token in tokens: - token = token.strip().strip('"').strip("'") - if token and ( - not strip_joiners or token.lower() not in ("a", "and", "the", "&") - ): - yield token + return description diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py index d6e4e7d5..17a9e38e 100644 --- a/cps/services/Metadata.py +++ b/cps/services/Metadata.py @@ -15,13 +15,44 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import abc +from typing import Dict, List, Optional, TypedDict, Union -class Metadata(): +class Metadata: __name__ = "Generic" + __id__ = "generic" def __init__(self): self.active = True def set_status(self, state): self.active = state + + @abc.abstractmethod + def search(self, query: str, generic_cover: str): + pass + + +class MetaSourceInfo(TypedDict): + id: str + description: str + link: str + + +class MetaRecord(TypedDict): + id: Union[str, int] + title: str + authors: List[str] + url: str + cover: str + series: Optional[str] + series_index: Optional[Union[int, float]] + tags: Optional[List[str]] + publisher: Optional[str] + publishedDate: Optional[str] + rating: Optional[int] + description: Optional[str] + source: MetaSourceInfo + languages: Optional[List[str]] + identifiers: Dict[str, Union[str, int]] diff --git a/requirements.txt b/requirements.txt index d1f58a8d..d09c2157 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ lxml>=3.8.0,<4.7.0 flask-wtf>=0.14.2,<1.1.0 markdown2==2.4.2 html2text==2020.1.16 +python-dateutil==2.8.2 From 362fdc57166e778dd8f08f326adbaaa6b0bf3d5d Mon Sep 17 00:00:00 2001 From: collerek Date: Mon, 13 Dec 2021 02:14:53 +0100 Subject: [PATCH 3/6] run lubimyczytac detail pages in threadpool --- cps/metadata_provider/lubimyczytac.py | 60 ++++---------------- cps/search_metadata.py | 79 +++++++++++++++------------ cps/services/Metadata.py | 45 ++++++++++++++- 3 files changed, 98 insertions(+), 86 deletions(-) diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py index ee66d1b4..1d4e18e1 100644 --- a/cps/metadata_provider/lubimyczytac.py +++ b/cps/metadata_provider/lubimyczytac.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- - # This file is part of the Calibre-Web 
(https://github.com/janeczku/calibre-web) # Copyright (C) 2021 OzzieIsaacs # @@ -18,7 +17,8 @@ import datetime import json import re -from typing import Dict, Generator, List, Optional, Tuple, Union +from multiprocessing.pool import ThreadPool +from typing import Dict, List, Optional, Tuple, Union from urllib.parse import quote import requests @@ -114,13 +114,14 @@ class LubimyCzytac(Metadata): lc_parser = LubimyCzytacParser(root=root, metadata=self) matches = lc_parser.parse_search_results() if matches: - final_matches = [] - for match in matches: - response = requests.get(match.get("url")) - match = lc_parser.parse_single_book( - match=match, response=response, generic_cover=generic_cover + with ThreadPool(processes=10) as pool: + final_matches = pool.starmap( + lc_parser.parse_single_book, + [ + (match, generic_cover) + for match in matches + ], ) - final_matches.append(match) return final_matches return matches @@ -146,46 +147,6 @@ class LubimyCzytac(Metadata): return "" return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}" - @staticmethod - def get_title_tokens( - title: str, strip_joiners: bool = True - ) -> Generator[str, None, None]: - """ - Taken from calibre source code - """ - title_patterns = [ - (re.compile(pat, re.IGNORECASE), repl) - for pat, repl in [ - # Remove things like: (2010) (Omnibus) etc. - ( - r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|" - r"audiobook|audio\scd|paperback|turtleback|" - r"mass\s*market|edition|ed\.)[\])}]", - "", - ), - # Remove any strings that contain the substring edition inside - # parentheses - (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""), - # Remove commas used a separators in numbers - (r"(\d+),(\d+)", r"\1\2"), - # Remove hyphens only if they have whitespace before them - (r"(\s-)", " "), - # Replace other special chars with a space - (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "), - ] - ] - - for pat, repl in title_patterns: - title = pat.sub(repl, title) - - tokens = title.split() - for token in tokens: - token = token.strip().strip('"').strip("'") - if token and ( - not strip_joiners or token.lower() not in ("a", "and", "the", "&") - ): - yield token - class LubimyCzytacParser: PAGES_TEMPLATE = "

<p>Książka ma {0} stron(y).</p>

" @@ -232,8 +193,9 @@ class LubimyCzytacParser: return matches def parse_single_book( - self, match: Dict, response, generic_cover: str + self, match: Dict, generic_cover: str ) -> MetaRecord: + response = requests.get(match.get("url")) self.root = fromstring(response.text) match["series"], match["series_index"] = self._parse_series() match["tags"] = self._parse_tags() diff --git a/cps/search_metadata.py b/cps/search_metadata.py index e837fe21..7d9b6e05 100644 --- a/cps/search_metadata.py +++ b/cps/search_metadata.py @@ -16,25 +16,23 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import os -import json +import concurrent.futures import importlib -import sys import inspect -import datetime -import concurrent.futures +import json +import os +import sys -from flask import Blueprint, request, Response, url_for +from flask import Blueprint, Response, request, url_for from flask_login import current_user from flask_login import login_required +from sqlalchemy.exc import InvalidRequestError, OperationalError from sqlalchemy.orm.attributes import flag_modified -from sqlalchemy.exc import OperationalError, InvalidRequestError -from . import constants, logger, ub from cps.services.Metadata import Metadata +from . import constants, logger, ub - -meta = Blueprint('metadata', __name__) +meta = Blueprint("metadata", __name__) log = logger.create() @@ -42,7 +40,7 @@ new_list = list() meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider") modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider")) for f in modules: - if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith('__init__.py'): + if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith("__init__.py"): a = os.path.basename(f)[:-3] try: importlib.import_module("cps.metadata_provider." + a) @@ -51,34 +49,46 @@ for f in modules: log.error("Import error for metadata source: {}".format(a)) pass + def list_classes(provider_list): classes = list() for element in provider_list: - for name, obj in inspect.getmembers(sys.modules["cps.metadata_provider." + element]): - if inspect.isclass(obj) and name != "Metadata" and issubclass(obj, Metadata): + for name, obj in inspect.getmembers( + sys.modules["cps.metadata_provider." 
+ element] + ): + if ( + inspect.isclass(obj) + and name != "Metadata" + and issubclass(obj, Metadata) + ): classes.append(obj()) return classes + cl = list_classes(new_list) + @meta.route("/metadata/provider") @login_required def metadata_provider(): - active = current_user.view_settings.get('metadata', {}) + active = current_user.view_settings.get("metadata", {}) provider = list() for c in cl: ac = active.get(c.__id__, True) - provider.append({"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__}) - return Response(json.dumps(provider), mimetype='application/json') + provider.append( + {"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__} + ) + return Response(json.dumps(provider), mimetype="application/json") -@meta.route("/metadata/provider", methods=['POST']) -@meta.route("/metadata/provider/", methods=['POST']) + +@meta.route("/metadata/provider", methods=["POST"]) +@meta.route("/metadata/provider/", methods=["POST"]) @login_required def metadata_change_active_provider(prov_name): new_state = request.get_json() - active = current_user.view_settings.get('metadata', {}) - active[new_state['id']] = new_state['value'] - current_user.view_settings['metadata'] = active + active = current_user.view_settings.get("metadata", {}) + active[new_state["id"]] = new_state["value"] + current_user.view_settings["metadata"] = active try: try: flag_modified(current_user, "view_settings") @@ -91,27 +101,26 @@ def metadata_change_active_provider(prov_name): if "initial" in new_state and prov_name: for c in cl: if c.__id__ == prov_name: - data = c.search(new_state.get('query', "")) + data = c.search(new_state.get("query", "")) break - return Response(json.dumps(data), mimetype='application/json') + return Response(json.dumps(data), mimetype="application/json") return "" -@meta.route("/metadata/search", methods=['POST']) + +@meta.route("/metadata/search", methods=["POST"]) @login_required def metadata_search(): - query = request.form.to_dict().get('query') + query = request.form.to_dict().get("query") data = list() - active = current_user.view_settings.get('metadata', {}) + active = current_user.view_settings.get("metadata", {}) if query: - static_cover = url_for('static', filename='generic_cover.jpg') + static_cover = url_for("static", filename="generic_cover.jpg") with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - meta = {executor.submit(c.search, query, static_cover): c for c in cl if active.get(c.__id__, True)} + meta = { + executor.submit(c.search, query, static_cover): c + for c in cl + if active.get(c.__id__, True) + } for future in concurrent.futures.as_completed(meta): data.extend(future.result()) - return Response(json.dumps(data), mimetype='application/json') - - - - - - + return Response(json.dumps(data), mimetype="application/json") diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py index 17a9e38e..1464411a 100644 --- a/cps/services/Metadata.py +++ b/cps/services/Metadata.py @@ -16,7 +16,8 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
import abc -from typing import Dict, List, Optional, TypedDict, Union +import re +from typing import Dict, Generator, List, Optional, TypedDict, Union class Metadata: @@ -30,9 +31,49 @@ class Metadata: self.active = state @abc.abstractmethod - def search(self, query: str, generic_cover: str): + def search(self, query: str, generic_cover: str = ""): pass + @staticmethod + def get_title_tokens( + title: str, strip_joiners: bool = True + ) -> Generator[str, None, None]: + """ + Taken from calibre source code + """ + title_patterns = [ + (re.compile(pat, re.IGNORECASE), repl) + for pat, repl in [ + # Remove things like: (2010) (Omnibus) etc. + ( + r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|" + r"audiobook|audio\scd|paperback|turtleback|" + r"mass\s*market|edition|ed\.)[\])}]", + "", + ), + # Remove any strings that contain the substring edition inside + # parentheses + (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""), + # Remove commas used a separators in numbers + (r"(\d+),(\d+)", r"\1\2"), + # Remove hyphens only if they have whitespace before them + (r"(\s-)", " "), + # Replace other special chars with a space + (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "), + ] + ] + + for pat, repl in title_patterns: + title = pat.sub(repl, title) + + tokens = title.split() + for token in tokens: + token = token.strip().strip('"').strip("'") + if token and ( + not strip_joiners or token.lower() not in ("a", "and", "the", "&") + ): + yield token + class MetaSourceInfo(TypedDict): id: str From d64589914fdf69ce78111c2a5d29a967f7a881e3 Mon Sep 17 00:00:00 2001 From: collerek Date: Mon, 13 Dec 2021 15:14:19 +0100 Subject: [PATCH 4/6] add series, languages and isbn to google provider --- cps/metadata_provider/google.py | 104 +++++++++++++++++++------- cps/metadata_provider/lubimyczytac.py | 24 +++--- cps/search_metadata.py | 5 +- cps/services/Metadata.py | 52 ++++++------- 4 files changed, 119 insertions(+), 66 deletions(-) diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py index 8be8ad74..1074fe3d 100644 --- a/cps/metadata_provider/google.py +++ b/cps/metadata_provider/google.py @@ -17,41 +17,93 @@ # along with this program. If not, see . 
# Google Books api document: https://developers.google.com/books/docs/v1/using +from typing import Dict, List, Optional +from urllib.parse import quote + import requests -from cps.services.Metadata import Metadata +from cps.isoLanguages import get_lang3, get_language_name +from cps.services.Metadata import MetaRecord, Metadata class Google(Metadata): __name__ = "Google" __id__ = "google" - BASE_URL = "https://www.googleapis.com/books/v1/volumes?q=" + DESCRIPTION = "Google Books" + META_URL = "https://books.google.com/" + BOOK_URL = "https://books.google.com/books?id=" + SEARCH_URL = "https://www.googleapis.com/books/v1/volumes?q=" + ISBN_TYPE = "ISBN_13" - def search(self, query, generic_cover=""): + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: if self.active: val = list() - result = requests.get(Google.BASE_URL + query.replace(" ","+")) - for r in result.json()['items']: - v = dict() - v['id'] = r['id'] - v['title'] = r['volumeInfo']['title'] - v['authors'] = r['volumeInfo'].get('authors', []) - v['description'] = r['volumeInfo'].get('description', "") - v['publisher'] = r['volumeInfo'].get('publisher', "") - v['publishedDate'] = r['volumeInfo'].get('publishedDate', "") - v['tags'] = r['volumeInfo'].get('categories', []) - v['rating'] = r['volumeInfo'].get('averageRating', 0) - if r['volumeInfo'].get('imageLinks'): - v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://") - else: - # v['cover'] = "/../../../static/generic_cover.jpg" - v['cover'] = generic_cover - v['source'] = { - "id": self.__id__, - "description": "Google Books", - "link": "https://books.google.com/"} - v['url'] = "https://books.google.com/books?id=" + r['id'] - val.append(v) - return val + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = "+".join(tokens) + results = requests.get(Google.SEARCH_URL + query) + for result in results.json()["items"]: + val.append( + self._parse_search_result( + result=result, generic_cover=generic_cover, locale=locale + ) + ) + return val + + def _parse_search_result( + self, result: Dict, generic_cover: str, locale: str + ) -> MetaRecord: + match = dict() + match["id"] = result["id"] + match["title"] = result["volumeInfo"]["title"] + match["authors"] = result["volumeInfo"].get("authors", []) + match["url"] = Google.BOOK_URL + result["id"] + match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover) + match["description"] = result["volumeInfo"].get("description", "") + match["languages"] = self._parse_languages(result=result, locale=locale) + match["publisher"] = result["volumeInfo"].get("publisher", "") + match["publishedDate"] = result["volumeInfo"].get("publishedDate", "") + match["rating"] = result["volumeInfo"].get("averageRating", 0) + match["series"], match["series_index"] = "", 1 + match["tags"] = result["volumeInfo"].get("categories", []) + + match["source"] = { + "id": self.__id__, + "description": Google.DESCRIPTION, + "link": Google.META_URL, + } + + match["identifiers"] = { + "google": match.get("id"), + } + match = self._parse_isbn(result=result, match=match) + return match + + @staticmethod + def _parse_isbn(result: Dict, match: Dict) -> Dict: + identifiers = result["volumeInfo"].get("industryIdentifiers", []) + for identifier in identifiers: + if identifier.get("type") == Google.ISBN_TYPE: + match["identifiers"]["isbn"] = identifier.get("identifier") 
+ break + return match + @staticmethod + def _parse_cover(result: Dict, generic_cover: str) -> str: + if result["volumeInfo"].get("imageLinks"): + cover_url = result["volumeInfo"]["imageLinks"]["thumbnail"] + return cover_url.replace("http://", "https://") + return generic_cover + @staticmethod + def _parse_languages(result: Dict, locale: str) -> List[str]: + language_iso2 = result.get("language", "") + languages = ( + [get_language_name(locale, get_lang3(language_iso2))] + if language_iso2 + else [] + ) + return languages diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py index 1d4e18e1..fd9ca4a7 100644 --- a/cps/metadata_provider/lubimyczytac.py +++ b/cps/metadata_provider/lubimyczytac.py @@ -107,7 +107,9 @@ class LubimyCzytac(Metadata): SUMMARY = "//script[@type='application/ld+json']//text()" - def search(self, query: str, generic_cover: str = "") -> Optional[List]: + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: if self.active: result = requests.get(self._prepare_query(title=query)) root = fromstring(result.text) @@ -117,10 +119,7 @@ class LubimyCzytac(Metadata): with ThreadPool(processes=10) as pool: final_matches = pool.starmap( lc_parser.parse_single_book, - [ - (match, generic_cover) - for match in matches - ], + [(match, generic_cover) for match in matches], ) return final_matches return matches @@ -192,26 +191,25 @@ class LubimyCzytacParser: ) return matches - def parse_single_book( - self, match: Dict, generic_cover: str - ) -> MetaRecord: + def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord: response = requests.get(match.get("url")) self.root = fromstring(response.text) - match["series"], match["series_index"] = self._parse_series() - match["tags"] = self._parse_tags() + match["cover"] = self._parse_cover(generic_cover=generic_cover) + match["description"] = self._parse_description() + match["languages"] = self._parse_languages() match["publisher"] = self._parse_publisher() match["publishedDate"] = self._parse_from_summary( attribute_name="datePublished" ) match["rating"] = self._parse_rating() - match["description"] = self._parse_description() - match["cover"] = self._parse_cover(generic_cover=generic_cover) + match["series"], match["series_index"] = self._parse_series() + match["tags"] = self._parse_tags() + match["source"] = { "id": self.metadata.__id__, "description": self.metadata.__name__, "link": LubimyCzytac.BASE_URL, } - match["languages"] = self._parse_languages() match["identifiers"] = { "isbn": self._parse_isbn(), "lubimyczytac": match["id"], diff --git a/cps/search_metadata.py b/cps/search_metadata.py index 7d9b6e05..a128f9ac 100644 --- a/cps/search_metadata.py +++ b/cps/search_metadata.py @@ -30,7 +30,7 @@ from sqlalchemy.exc import InvalidRequestError, OperationalError from sqlalchemy.orm.attributes import flag_modified from cps.services.Metadata import Metadata -from . import constants, logger, ub +from . 
import constants, get_locale, logger, ub meta = Blueprint("metadata", __name__) @@ -113,11 +113,12 @@ def metadata_search(): query = request.form.to_dict().get("query") data = list() active = current_user.view_settings.get("metadata", {}) + locale = get_locale() if query: static_cover = url_for("static", filename="generic_cover.jpg") with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: meta = { - executor.submit(c.search, query, static_cover): c + executor.submit(c.search, query, static_cover, locale): c for c in cl if active.get(c.__id__, True) } diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py index 1464411a..09fc70ce 100644 --- a/cps/services/Metadata.py +++ b/cps/services/Metadata.py @@ -20,6 +20,30 @@ import re from typing import Dict, Generator, List, Optional, TypedDict, Union +class MetaSourceInfo(TypedDict): + id: str + description: str + link: str + + +class MetaRecord(TypedDict): + id: Union[str, int] + title: str + authors: List[str] + url: str + cover: str + series: Optional[str] + series_index: Optional[Union[int, float]] + tags: Optional[List[str]] + publisher: Optional[str] + publishedDate: Optional[str] + rating: Optional[int] + description: Optional[str] + source: MetaSourceInfo + languages: Optional[List[str]] + identifiers: Dict[str, Union[str, int]] + + class Metadata: __name__ = "Generic" __id__ = "generic" @@ -31,7 +55,9 @@ class Metadata: self.active = state @abc.abstractmethod - def search(self, query: str, generic_cover: str = ""): + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: pass @staticmethod @@ -73,27 +99,3 @@ class Metadata: not strip_joiners or token.lower() not in ("a", "and", "the", "&") ): yield token - - -class MetaSourceInfo(TypedDict): - id: str - description: str - link: str - - -class MetaRecord(TypedDict): - id: Union[str, int] - title: str - authors: List[str] - url: str - cover: str - series: Optional[str] - series_index: Optional[Union[int, float]] - tags: Optional[List[str]] - publisher: Optional[str] - publishedDate: Optional[str] - rating: Optional[int] - description: Optional[str] - source: MetaSourceInfo - languages: Optional[List[str]] - identifiers: Dict[str, Union[str, int]] From 51bf35c2e41a16032e1250a8cac252195116a147 Mon Sep 17 00:00:00 2001 From: collerek Date: Mon, 13 Dec 2021 17:21:41 +0100 Subject: [PATCH 5/6] unify scholar --- cps/metadata_provider/comicvine.py | 87 ++++++++++++++++----------- cps/metadata_provider/google.py | 49 ++++++++------- cps/metadata_provider/lubimyczytac.py | 57 +++++++++--------- cps/metadata_provider/scholar.py | 66 +++++++++++--------- cps/search_metadata.py | 15 +++-- cps/services/Metadata.py | 32 ++++++---- optional-requirements.txt | 3 + requirements.txt | 3 - 8 files changed, 172 insertions(+), 140 deletions(-) diff --git a/cps/metadata_provider/comicvine.py b/cps/metadata_provider/comicvine.py index 195e68f8..56618d4b 100644 --- a/cps/metadata_provider/comicvine.py +++ b/cps/metadata_provider/comicvine.py @@ -17,49 +17,68 @@ # along with this program. If not, see . 
# ComicVine api document: https://comicvine.gamespot.com/api/documentation +from typing import Dict, List, Optional +from urllib.parse import quote import requests -from cps.services.Metadata import Metadata +from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata class ComicVine(Metadata): __name__ = "ComicVine" __id__ = "comicvine" + DESCRIPTION = "ComicVine Books" + META_URL = "https://comicvine.gamespot.com/" + API_KEY = "57558043c53943d5d1e96a9ad425b0eb85532ee6" + BASE_URL = ( + f"https://comicvine.gamespot.com/api/search?api_key={API_KEY}" + f"&resources=issue&query=" + ) + QUERY_PARAMS = "&sort=name:desc&format=json" + HEADERS = {"User-Agent": "Not Evil Browser"} - def search(self, query, generic_cover=""): + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: val = list() - apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6" if self.active: - headers = { - 'User-Agent': 'Not Evil Browser' - } - - result = requests.get("https://comicvine.gamespot.com/api/search?api_key=" - + apikey + "&resources=issue&query=" + query + "&sort=name:desc&format=json", headers=headers) - for r in result.json()['results']: - seriesTitle = r['volume'].get('name', "") - if r.get('store_date'): - dateFomers = r.get('store_date') - else: - dateFomers = r.get('date_added') - v = dict() - v['id'] = r['id'] - v['title'] = seriesTitle + " #" + r.get('issue_number', "0") + " - " + ( r.get('name', "") or "") - v['authors'] = r.get('authors', []) - v['description'] = r.get('description', "") - v['publisher'] = "" - v['publishedDate'] = dateFomers - v['tags'] = ["Comics", seriesTitle] - v['rating'] = 0 - v['series'] = seriesTitle - v['cover'] = r['image'].get('original_url', generic_cover) - v['source'] = { - "id": self.__id__, - "description": "ComicVine Books", - "link": "https://comicvine.gamespot.com/" - } - v['url'] = r.get('site_detail_url', "") - val.append(v) + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = "%20".join(tokens) + result = requests.get( + f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}", + headers=ComicVine.HEADERS, + ) + for result in result.json()["results"]: + match = self._parse_search_result( + result=result, generic_cover=generic_cover, locale=locale + ) + val.append(match) return val - + def _parse_search_result( + self, result: Dict, generic_cover: str, locale: str + ) -> MetaRecord: + series = result["volume"].get("name", "") + series_index = result.get("issue_number", 0) + issue_name = result.get("name", "") + match = MetaRecord( + id=result["id"], + title=f"{series}#{series_index} - {issue_name}", + authors=result.get("authors", []), + url=result.get("site_detail_url", ""), + source=MetaSourceInfo( + id=self.__id__, + description=ComicVine.DESCRIPTION, + link=ComicVine.META_URL, + ), + series=series, + ) + match.cover = result["image"].get("original_url", generic_cover) + match.description = result.get("description", "") + match.publishedDate = result.get("store_date", result.get("date_added")) + match.series_index = series_index + match.tags = ["Comics", series] + match.identifiers = {"comicvine": match.id} + return match diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py index 1074fe3d..5ac3e7ee 100644 --- a/cps/metadata_provider/google.py +++ b/cps/metadata_provider/google.py @@ -23,7 +23,7 @@ from urllib.parse import quote import requests from cps.isoLanguages 
diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py
index 1074fe3d..5ac3e7ee 100644
--- a/cps/metadata_provider/google.py
+++ b/cps/metadata_provider/google.py
@@ -23,7 +23,7 @@ from urllib.parse import quote
 import requests
 
 from cps.isoLanguages import get_lang3, get_language_name
-from cps.services.Metadata import MetaRecord, Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 
 
 class Google(Metadata):
@@ -56,38 +56,37 @@ class Google(Metadata):
     def _parse_search_result(
         self, result: Dict, generic_cover: str, locale: str
     ) -> MetaRecord:
-        match = dict()
-        match["id"] = result["id"]
-        match["title"] = result["volumeInfo"]["title"]
-        match["authors"] = result["volumeInfo"].get("authors", [])
-        match["url"] = Google.BOOK_URL + result["id"]
-        match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover)
-        match["description"] = result["volumeInfo"].get("description", "")
-        match["languages"] = self._parse_languages(result=result, locale=locale)
-        match["publisher"] = result["volumeInfo"].get("publisher", "")
-        match["publishedDate"] = result["volumeInfo"].get("publishedDate", "")
-        match["rating"] = result["volumeInfo"].get("averageRating", 0)
-        match["series"], match["series_index"] = "", 1
-        match["tags"] = result["volumeInfo"].get("categories", [])
+        match = MetaRecord(
+            id=result["id"],
+            title=result["volumeInfo"]["title"],
+            authors=result["volumeInfo"].get("authors", []),
+            url=Google.BOOK_URL + result["id"],
+            source=MetaSourceInfo(
+                id=self.__id__,
+                description=Google.DESCRIPTION,
+                link=Google.META_URL,
+            ),
+        )
 
-        match["source"] = {
-            "id": self.__id__,
-            "description": Google.DESCRIPTION,
-            "link": Google.META_URL,
-        }
+        match.cover = self._parse_cover(result=result, generic_cover=generic_cover)
+        match.description = result["volumeInfo"].get("description", "")
+        match.languages = self._parse_languages(result=result, locale=locale)
+        match.publisher = result["volumeInfo"].get("publisher", "")
+        match.publishedDate = result["volumeInfo"].get("publishedDate", "")
+        match.rating = result["volumeInfo"].get("averageRating", 0)
+        match.series, match.series_index = "", 1
+        match.tags = result["volumeInfo"].get("categories", [])
 
-        match["identifiers"] = {
-            "google": match.get("id"),
-        }
+        match.identifiers = {"google": match.id}
         match = self._parse_isbn(result=result, match=match)
         return match
 
     @staticmethod
-    def _parse_isbn(result: Dict, match: Dict) -> Dict:
+    def _parse_isbn(result: Dict, match: MetaRecord) -> MetaRecord:
         identifiers = result["volumeInfo"].get("industryIdentifiers", [])
         for identifier in identifiers:
             if identifier.get("type") == Google.ISBN_TYPE:
-                match["identifiers"]["isbn"] = identifier.get("identifier")
+                match.identifiers["isbn"] = identifier.get("identifier")
                 break
         return match
 
@@ -100,7 +99,7 @@ class Google(Metadata):
 
     @staticmethod
     def _parse_languages(result: Dict, locale: str) -> List[str]:
-        language_iso2 = result.get("language", "")
+        language_iso2 = result["volumeInfo"].get("language", "")
        languages = (
             [get_language_name(locale, get_lang3(language_iso2))]
             if language_iso2
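The last hunk is a genuine bug fix, not just cleanup: the Google Books API reports the ISO 639-1 language code inside volumeInfo, not at the top level of the search result, so the old result.get("language", "") always came back empty. A sketch of the shape the corrected lookup expects (payload abridged and illustrative):

```python
result = {
    "id": "zyTCAlFPjgYC",
    "volumeInfo": {
        "title": "The Google Story",
        "language": "en",  # the corrected code reads the code from here
    },
}

language_iso2 = result["volumeInfo"].get("language", "")  # "en" instead of ""
```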
diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py
index fd9ca4a7..4f6aca1e 100644
--- a/cps/metadata_provider/lubimyczytac.py
+++ b/cps/metadata_provider/lubimyczytac.py
@@ -27,7 +27,7 @@ from html2text import HTML2Text
 from lxml.html import HtmlElement, fromstring, tostring
 from markdown2 import Markdown
 
-from cps.services.Metadata import MetaRecord, Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 
 SYMBOLS_TO_TRANSLATE = (
     "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
@@ -158,61 +158,60 @@ class LubimyCzytacParser:
         self.root = root
         self.metadata = metadata
 
-    def parse_search_results(self) -> List[Dict]:
+    def parse_search_results(self) -> List[MetaRecord]:
         matches = []
         results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
         for result in results:
             title = self._parse_xpath_node(
                 root=result,
                 xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                      f"{LubimyCzytac.TITLE_TEXT_PATH}",
+                f"{LubimyCzytac.TITLE_TEXT_PATH}",
             )
             book_url = self._parse_xpath_node(
                 root=result,
                 xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                      f"{LubimyCzytac.URL_PATH}",
+                f"{LubimyCzytac.URL_PATH}",
             )
             authors = self._parse_xpath_node(
                 root=result,
                 xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                      f"{LubimyCzytac.AUTHORS_PATH}",
+                f"{LubimyCzytac.AUTHORS_PATH}",
                 take_first=False,
             )
             if not all([title, book_url, authors]):
                 continue
             matches.append(
-                {
-                    "id": book_url.replace(f"/ksiazka/", "").split("/")[0],
-                    "title": title,
-                    "authors": [strip_accents(author) for author in authors],
-                    "url": LubimyCzytac.BASE_URL + book_url,
-                }
+                MetaRecord(
+                    id=book_url.replace(f"/ksiazka/", "").split("/")[0],
+                    title=title,
+                    authors=[strip_accents(author) for author in authors],
+                    url=LubimyCzytac.BASE_URL + book_url,
+                    source=MetaSourceInfo(
+                        id=self.metadata.__id__,
+                        description=self.metadata.__name__,
+                        link=LubimyCzytac.BASE_URL,
+                    )
+                )
             )
         return matches
 
-    def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord:
-        response = requests.get(match.get("url"))
+    def parse_single_book(self, match: MetaRecord, generic_cover: str) -> MetaRecord:
+        response = requests.get(match.url)
         self.root = fromstring(response.text)
-        match["cover"] = self._parse_cover(generic_cover=generic_cover)
-        match["description"] = self._parse_description()
-        match["languages"] = self._parse_languages()
-        match["publisher"] = self._parse_publisher()
-        match["publishedDate"] = self._parse_from_summary(
+        match.cover = self._parse_cover(generic_cover=generic_cover)
+        match.description = self._parse_description()
+        match.languages = self._parse_languages()
+        match.publisher = self._parse_publisher()
+        match.publishedDate = self._parse_from_summary(
             attribute_name="datePublished"
         )
-        match["rating"] = self._parse_rating()
-        match["series"], match["series_index"] = self._parse_series()
-        match["tags"] = self._parse_tags()
-
-        match["source"] = {
-            "id": self.metadata.__id__,
-            "description": self.metadata.__name__,
-            "link": LubimyCzytac.BASE_URL,
-        }
-        match["identifiers"] = {
+        match.rating = self._parse_rating()
+        match.series, match.series_index = self._parse_series()
+        match.tags = self._parse_tags()
+        match.identifiers = {
             "isbn": self._parse_isbn(),
-            "lubimyczytac": match["id"],
+            "lubimyczytac": match.id,
         }
         return match
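For context on the id= argument above: the search page returns relative links of the form /ksiazka/<id>/<slug>, and the id is recovered by stripping the prefix and taking the first remaining path segment. A tiny sketch (the sample URL is made up):

```python
book_url = "/ksiazka/4882748/wiedzmin-ostatnie-zyczenie"

book_id = book_url.replace("/ksiazka/", "").split("/")[0]
assert book_id == "4882748"
```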
diff --git a/cps/metadata_provider/scholar.py b/cps/metadata_provider/scholar.py
index 6e13c768..0becaef0 100644
--- a/cps/metadata_provider/scholar.py
+++ b/cps/metadata_provider/scholar.py
@@ -15,47 +15,53 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
+import itertools
+from typing import Dict, List, Optional
+from urllib.parse import quote
 
 from scholarly import scholarly
 
-from cps.services.Metadata import Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 
 
 class scholar(Metadata):
     __name__ = "Google Scholar"
     __id__ = "googlescholar"
+    META_URL = "https://scholar.google.com/"
 
-    def search(self, query, generic_cover=""):
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
         val = list()
         if self.active:
-            scholar_gen = scholarly.search_pubs(' '.join(query.split('+')))
-            i = 0
-            for publication in scholar_gen:
-                v = dict()
-                v['id'] = "1234"  # publication['bib'].get('title')
-                v['title'] = publication['bib'].get('title')
-                v['authors'] = publication['bib'].get('author', [])
-                v['description'] = publication['bib'].get('abstract', "")
-                v['publisher'] = publication['bib'].get('venue', "")
-                if publication['bib'].get('pub_year'):
-                    v['publishedDate'] = publication['bib'].get('pub_year')+"-01-01"
-                else:
-                    v['publishedDate'] = ""
-                v['tags'] = ""
-                v['ratings'] = 0
-                v['series'] = ""
-                v['cover'] = generic_cover
-                v['url'] = publication.get('pub_url') or publication.get('eprint_url') or "",
-                v['source'] = {
-                    "id": self.__id__,
-                    "description": "Google Scholar",
-                    "link": "https://scholar.google.com/"
-                }
-                val.append(v)
-                i += 1
-                if (i >= 10):
-                    break
+            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
+            if title_tokens:
+                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
+                query = " ".join(tokens)
+            scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
+            for result in scholar_gen:
+                match = self._parse_search_result(
+                    result=result, generic_cover=generic_cover, locale=locale
+                )
+                val.append(match)
         return val
 
+    def _parse_search_result(
+        self, result: Dict, generic_cover: str, locale: str
+    ) -> MetaRecord:
+        match = MetaRecord(
+            id=result.get("pub_url", result.get("eprint_url", "")),
+            title=result["bib"].get("title"),
+            authors=result["bib"].get("author", []),
+            url=result.get("pub_url", result.get("eprint_url", "")),
+            source=MetaSourceInfo(
+                id=self.__id__, description=self.__name__, link=scholar.META_URL
+            ),
+        )
-
+        match.cover = result.get("image", {}).get("original_url", generic_cover)
+        match.description = result["bib"].get("abstract", "")
+        match.publisher = result["bib"].get("venue", "")
+        match.publishedDate = result["bib"].get("pub_year") + "-01-01"
+        match.identifiers = {"scholar": match.id}
+        return match
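The itertools.islice() call replaces the old counter-and-break loop around the scholarly generator; both cap the output at ten publications, islice just says so in one line. A self-contained sketch with a stand-in generator (scholarly.search_pubs() hits the network, so it is faked here):

```python
import itertools


def fake_search_pubs():  # stand-in for scholarly.search_pubs(query)
    n = 0
    while True:
        yield {"bib": {"title": f"paper {n}"}}
        n += 1


results = list(itertools.islice(fake_search_pubs(), 10))
assert len(results) == 10  # the generator is never advanced past ten items
```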
diff --git a/cps/search_metadata.py b/cps/search_metadata.py
index a128f9ac..53cbf553 100644
--- a/cps/search_metadata.py
+++ b/cps/search_metadata.py
@@ -22,6 +22,7 @@ import inspect
 import json
 import os
 import sys
+from dataclasses import asdict
 
 from flask import Blueprint, Response, request, url_for
 from flask_login import current_user
@@ -99,11 +100,13 @@ def metadata_change_active_provider(prov_name):
         log.error("Invalid request received: {}".format(request))
         return "Invalid request", 400
     if "initial" in new_state and prov_name:
-        for c in cl:
-            if c.__id__ == prov_name:
-                data = c.search(new_state.get("query", ""))
-                break
-        return Response(json.dumps(data), mimetype="application/json")
+        data = []
+        provider = next((c for c in cl if c.__id__ == prov_name), None)
+        if provider is not None:
+            data = provider.search(new_state.get("query", ""))
+        return Response(
+            json.dumps([asdict(x) for x in data]), mimetype="application/json"
+        )
     return ""
 
 
@@ -123,5 +126,5 @@ def metadata_search():
             if active.get(c.__id__, True)
         }
         for future in concurrent.futures.as_completed(meta):
-            data.extend(future.result())
+            data.extend([asdict(x) for x in future.result()])
     return Response(json.dumps(data), mimetype="application/json")
diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py
index 09fc70ce..f4a5662c 100644
--- a/cps/services/Metadata.py
+++ b/cps/services/Metadata.py
@@ -16,32 +16,38 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 import abc
+import dataclasses
+import os
 import re
-from typing import Dict, Generator, List, Optional, TypedDict, Union
+from typing import Dict, Generator, List, Optional, Union
 
+from cps import constants
 
-class MetaSourceInfo(TypedDict):
+
+@dataclasses.dataclass
+class MetaSourceInfo:
     id: str
     description: str
     link: str
 
 
-class MetaRecord(TypedDict):
+@dataclasses.dataclass
+class MetaRecord:
     id: Union[str, int]
     title: str
     authors: List[str]
     url: str
-    cover: str
-    series: Optional[str]
-    series_index: Optional[Union[int, float]]
-    tags: Optional[List[str]]
-    publisher: Optional[str]
-    publishedDate: Optional[str]
-    rating: Optional[int]
-    description: Optional[str]
     source: MetaSourceInfo
-    languages: Optional[List[str]]
-    identifiers: Dict[str, Union[str, int]]
+    cover: str = os.path.join(constants.STATIC_DIR, 'generic_cover.jpg')
+    description: Optional[str] = ""
+    series: Optional[str] = None
+    series_index: Optional[Union[int, float]] = 0
+    identifiers: Dict[str, Union[str, int]] = dataclasses.field(default_factory=dict)
+    publisher: Optional[str] = None
+    publishedDate: Optional[str] = None
+    rating: Optional[int] = 0
+    languages: Optional[List[str]] = dataclasses.field(default_factory=list)
+    tags: Optional[List[str]] = dataclasses.field(default_factory=list)
 
 
 class Metadata:
diff --git a/optional-requirements.txt b/optional-requirements.txt
index 03f58bb5..17c4b878 100644
--- a/optional-requirements.txt
+++ b/optional-requirements.txt
@@ -32,6 +32,9 @@ SQLAlchemy-Utils>=0.33.5,<0.38.0
 # extracting metadata
 rarfile>=2.7
 scholarly>=1.2.0, <1.5
+markdown2==2.4.2
+html2text==2020.1.16
+python-dateutil==2.8.2
 
 # other
 natsort>=2.2.0,<8.1.0
diff --git a/requirements.txt b/requirements.txt
index d09c2157..1db961fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,3 @@ Wand>=0.4.4,<0.7.0
 unidecode>=0.04.19,<1.3.0
 lxml>=3.8.0,<4.7.0
 flask-wtf>=0.14.2,<1.1.0
-markdown2==2.4.2
-html2text==2020.1.16
-python-dateutil==2.8.2
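With MetaRecord turned into a dataclass carrying defaults, providers only pass the fields they actually have, and dataclasses.asdict() converts a record, including the nested MetaSourceInfo, back into the plain dict that json.dumps() needs, which is what both endpoints above now do. A minimal sketch with illustrative values:

```python
import json
from dataclasses import asdict

from cps.services.Metadata import MetaRecord, MetaSourceInfo

record = MetaRecord(
    id="1",
    title="Example",
    authors=["A. Author"],
    url="https://example.org/book/1",
    source=MetaSourceInfo(
        id="x", description="Example source", link="https://example.org"
    ),
)  # cover, rating, tags, ... fall back to their declared defaults

payload = json.dumps([asdict(record)])  # nested dataclasses become plain dicts
```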
From bea14d1784184f16040e99b0259664c9d78e98c3 Mon Sep 17 00:00:00 2001
From: collerek
Date: Wed, 15 Dec 2021 15:20:01 +0100
Subject: [PATCH 6/6] fix locale for lubimyczytac languages

---
 cps/metadata_provider/lubimyczytac.py | 31 ++++++++++++++-------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py
index 4f6aca1e..814a785e 100644
--- a/cps/metadata_provider/lubimyczytac.py
+++ b/cps/metadata_provider/lubimyczytac.py
@@ -18,7 +18,7 @@ import datetime
 import json
 import re
 from multiprocessing.pool import ThreadPool
-from typing import Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 from urllib.parse import quote
 
 import requests
@@ -27,6 +27,7 @@ from html2text import HTML2Text
 from lxml.html import HtmlElement, fromstring, tostring
 from markdown2 import Markdown
 
+from cps.isoLanguages import get_language_name
 from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 
 SYMBOLS_TO_TRANSLATE = (
@@ -119,7 +120,7 @@ class LubimyCzytac(Metadata):
             with ThreadPool(processes=10) as pool:
                 final_matches = pool.starmap(
                     lc_parser.parse_single_book,
-                    [(match, generic_cover) for match in matches],
+                    [(match, generic_cover, locale) for match in matches],
                 )
             return final_matches
         return matches
@@ -165,18 +166,18 @@ class LubimyCzytacParser:
             title = self._parse_xpath_node(
                 root=result,
                 xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                f"{LubimyCzytac.TITLE_TEXT_PATH}",
+                      f"{LubimyCzytac.TITLE_TEXT_PATH}",
             )
             book_url = self._parse_xpath_node(
                 root=result,
                 xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                f"{LubimyCzytac.URL_PATH}",
+                      f"{LubimyCzytac.URL_PATH}",
             )
             authors = self._parse_xpath_node(
                 root=result,
                 xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                f"{LubimyCzytac.AUTHORS_PATH}",
+                      f"{LubimyCzytac.AUTHORS_PATH}",
                 take_first=False,
             )
             if not all([title, book_url, authors]):
@@ -191,21 +192,21 @@ class LubimyCzytacParser:
                         id=self.metadata.__id__,
                         description=self.metadata.__name__,
                         link=LubimyCzytac.BASE_URL,
-                    )
+                    ),
                 )
             )
         return matches
 
-    def parse_single_book(self, match: MetaRecord, generic_cover: str) -> MetaRecord:
+    def parse_single_book(
+        self, match: MetaRecord, generic_cover: str, locale: str
+    ) -> MetaRecord:
         response = requests.get(match.url)
         self.root = fromstring(response.text)
         match.cover = self._parse_cover(generic_cover=generic_cover)
         match.description = self._parse_description()
-        match.languages = self._parse_languages()
+        match.languages = self._parse_languages(locale=locale)
         match.publisher = self._parse_publisher()
-        match.publishedDate = self._parse_from_summary(
-            attribute_name="datePublished"
-        )
+        match.publishedDate = self._parse_from_summary(attribute_name="datePublished")
         match.rating = self._parse_rating()
         match.series, match.series_index = self._parse_series()
         match.tags = self._parse_tags()
@@ -241,15 +242,15 @@ class LubimyCzytacParser:
     def _parse_publisher(self) -> Optional[str]:
         return self._parse_xpath_node(xpath=LubimyCzytac.PUBLISHER, take_first=True)
 
-    def _parse_languages(self) -> List[str]:
+    def _parse_languages(self, locale: str) -> List[str]:
         languages = list()
         lang = self._parse_xpath_node(xpath=LubimyCzytac.LANGUAGES, take_first=True)
         if lang:
             if "polski" in lang:
-                languages.append("Polish")
+                languages.append("pol")
             if "angielski" in lang:
-                languages.append("English")
-        return languages
+                languages.append("eng")
+        return [get_language_name(locale, language) for language in languages]
 
     def _parse_series(self) -> Tuple[Optional[str], Optional[Union[float, int]]]:
         series_index = 0