Fuzzy match book titles to eliminate duplicates

Titles with a Levenshtein ratio of 70% or greater are considered duplicates.
7 years ago · 58abc1d024
parent 53c687251e
commit 58abc1d024
2 changed files with 29 additions and 7 deletions
--- a/cps/web.py
+++ b/cps/web.py
@ -13,6 +13,12 @@ try:
 except ImportError:
    goodreads_support = False

+try:
+    import Levenshtein
+    levenshtein_support = True
+except ImportError:
+    levenshtein_support = False
+
 try:
    from functools import reduce
 except ImportError:
@ -1138,17 +1144,32 @@ def author(book_id, page):
    if goodreads_support and config.config_use_goodreads:
        gc = GoodreadsClient(config.config_goodreads_api_key, config.config_goodreads_api_secret)
        author_info = gc.find_author(author_name=name)
-
-        # Get all identifiers (ISBN, Goodreads, etc) and filter author's books by that list so we show fewer duplicates
-        # Note: Not all images will be shown, even though they're available on Goodreads.com.
-        #       See https://www.goodreads.com/topic/show/18213769-goodreads-book-images
-        identifiers = reduce(lambda acc, book: acc + map(lambda identifier: identifier.val, book.identifiers), entries.all(), [])
-        other_books = filter(lambda book: book.isbn not in identifiers and book.gid["#text"] not in identifiers, author_info.books)
+        other_books = get_unique_other_books(entries.all(), author_info.books)

    return render_title_template('author.html', entries=entries, pagination=pagination,
                                 title=name, author=author_info, other_books=other_books)


+def get_unique_other_books(library_books, author_books):
+    """Return the author's Goodreads books that are not already in the library.
+
+    A book from ``author_books`` is dropped when its ``isbn`` or
+    ``gid["#text"]`` equals the ``val`` of any identifier attached to a
+    library book. If the optional Levenshtein module was imported
+    (``levenshtein_support``), a book is also dropped when its title, with
+    any parenthesised text removed, has a Levenshtein ratio of 0.7 or
+    greater against any library book title.
+
+    :param library_books: books already in the library; each is read for
+        ``identifiers`` (items exposing ``val``) and ``title``. Iterated more
+        than once, so it must be a re-iterable sequence.
+    :param author_books: the author's books as returned by Goodreads; each is
+        read for ``isbn``, ``gid["#text"]`` and ``title``.
+    :return: a list with the remaining books, in their original order.
+    """
+    # Get all identifiers (ISBN, Goodreads, etc) and filter author's books by that list so we show fewer duplicates
+    # Note: Not all images will be shown, even though they're available on Goodreads.com.
+    #       See https://www.goodreads.com/topic/show/18213769-goodreads-book-images
+    # Comprehensions instead of reduce/map/filter: on Python 3 map() and
+    # filter() return lazy iterators, so "list + map(...)" raises TypeError
+    # and the result could only be consumed once. The identifiers stay in a
+    # list so the membership tests below rely on equality only.
+    identifiers = [identifier.val for book in library_books for identifier in book.identifiers]
+    other_books = [book for book in author_books
+                   if book.isbn not in identifiers and book.gid["#text"] not in identifiers]
+
+    # Fuzzy match book titles
+    if levenshtein_support:
+        library_titles = [book.title for book in library_books]
+        unique_books = []
+        for author_book in other_books:
+            # Remove items in parentheses before comparing (done once per author book)
+            bare_title = re.sub(r"\(.*\)", "", author_book.title)
+            # any() replaces "not filter(...)", which is always False on
+            # Python 3 because a filter object is truthy even when empty.
+            # A ratio of 0.7 or greater counts as a duplicate.
+            if not any(Levenshtein.ratio(bare_title, library_title) >= 0.7
+                       for library_title in library_titles):
+                unique_books.append(author_book)
+        other_books = unique_books
+
+    return other_books
+
+
+
@app.route("/series")
@login_required_if_no_ano
 def series_list():
--- a/optional-requirements.txt
+++ b/optional-requirements.txt
@ -11,4 +11,5 @@ PyYAML==3.12
 rsa==3.4.2
 six==1.10.0
 uritemplate==3.0.0
-goodreads==0.3.2
+goodreads>=0.3.2
+python-Levenshtein>=0.12.0