Add new iFixit document loader (#1333)

iFixit is a Wikipedia-like site that has a huge amount of open content on how to fix things, questions/answers for common troubleshooting, and "things"-related content that is more technical in nature. All content is licensed under CC-BY-SA-NC 3.0. Adding docs from iFixit as context for user questions like "I dropped my phone in water, what do I do?" or "My MacBook Pro is making a whining noise, what's wrong with it?" can yield significantly better responses than context-free responses from LLMs.
1 year ago · d22651d82a
parent c46478d70e
commit d22651d82a
6 changed files with 443 additions and 0 deletions
--- a/docs/modules/document_loaders/examples/ifixit.ipynb
+++ b/docs/modules/document_loaders/examples/ifixit.ipynb
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@ -59,6 +59,8 @@ There are a lot of different document loaders that LangChain supports. Below are

 `CoNLL-U <./examples/CoNLL-U.html>`_: A walkthrough of how to load data from a ConLL-U file.

+`iFixit <./examples/ifixit.html>`_: A walkthrough of how to search and load data like guides, technical Q&A's, and device wikis from iFixit.com.
+
 .. toctree::
   :maxdepth: 1
   :glob:
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -16,6 +16,7 @@ from langchain.document_loaders.googledrive import GoogleDriveLoader
 from langchain.document_loaders.gutenberg import GutenbergLoader
 from langchain.document_loaders.hn import HNLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
+from langchain.document_loaders.ifixit import IFixitLoader
 from langchain.document_loaders.image import UnstructuredImageLoader
 from langchain.document_loaders.imsdb import IMSDbLoader
 from langchain.document_loaders.notebook import NotebookLoader
@ -70,6 +71,7 @@ __all__ = [
    "IMSDbLoader",
    "AZLyricsLoader",
    "CollegeConfidentialLoader",
+    "IFixitLoader",
    "GutenbergLoader",
    "PagedPDFSplitter",
    "EverNoteLoader",
--- a/langchain/document_loaders/ifixit.py
+++ b/langchain/document_loaders/ifixit.py
@ -0,0 +1,202 @@
+"""Loader that loads iFixit data."""
+from typing import List, Optional
+
+import requests
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.web_base import WebBaseLoader
+
+# Base URL of the iFixit API (version 2.0, per the path); endpoint paths such as
+# "/guides/<id>" are appended to it when building request URLs.
+IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"
+
+
+class IFixitLoader(BaseLoader):
+    """Load iFixit repair guides, device wikis and answers.
+
+    iFixit is the largest, open repair community on the web. The site contains nearly
+    100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is
+    licensed under CC-BY.
+
+    This loader will allow you to download the text of a repair guide, text of Q&A's
+    and wikis from devices on iFixit using their open APIs and web scraping.
+    """
+
+    def __init__(self, web_path: str):
+        """Initialize with web path."""
+        if not web_path.startswith("https://www.ifixit.com"):
+            raise ValueError("web path must start with 'https://www.ifixit.com'")
+
+        path = web_path.replace("https://www.ifixit.com", "")
+
+        allowed_paths = ["/Device", "/Guide", "/Answers", "/Teardown"]
+
+        """ TODO: Add /Wiki """
+        if not any(path.startswith(allowed_path) for allowed_path in allowed_paths):
+            raise ValueError(
+                "web path must start with /Device, /Guide, /Teardown or /Answers"
+            )
+
+        pieces = [x for x in path.split("/") if x]
+
+        """Teardowns are just guides by a different name"""
+        self.page_type = pieces[0] if pieces[0] != "Teardown" else "Guide"
+
+        if self.page_type == "Guide" or self.page_type == "Answers":
+            self.id = pieces[2]
+        else:
+            self.id = pieces[1]
+
+        self.web_path = web_path
+
+    def load(self) -> List[Document]:
+        if self.page_type == "Device":
+            return self.load_device()
+        elif self.page_type == "Guide" or self.page_type == "Teardown":
+            return self.load_guide()
+        elif self.page_type == "Answers":
+            return self.load_questions_and_answers()
+        else:
+            raise ValueError("Unknown page type: " + self.page_type)
+
+    @staticmethod
+    def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
+        res = requests.get(
+            IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
+        )
+
+        if res.status_code != 200:
+            raise ValueError(
+                'Could not load suggestions for "' + query + '"\n' + res.json()
+            )
+
+        data = res.json()
+
+        results = data["results"]
+        output = []
+
+        for result in results:
+            try:
+                loader = IFixitLoader(result["url"])
+                if loader.page_type == "Device":
+                    output += loader.load_device(include_guides=False)
+                else:
+                    output += loader.load()
+            except ValueError:
+                continue
+
+        return output
+
+    def load_questions_and_answers(
+        self, url_override: Optional[str] = None
+    ) -> List[Document]:
+        loader = WebBaseLoader(self.web_path if url_override is None else url_override)
+        soup = loader.scrape()
+
+        output = []
+
+        title = soup.find("h1", "post-title").text
+
+        output.append("# " + title)
+        output.append(soup.select_one(".post-content .post-text").text.strip())
+
+        output.append("\n## " + soup.find("div", "post-answers-header").text.strip())
+        for answer in soup.select(".js-answers-list .post.post-answer"):
+            if answer.has_attr("itemprop") and "acceptedAnswer" in answer["itemprop"]:
+                output.append("\n### Accepted Answer")
+            elif "post-helpful" in answer["class"]:
+                output.append("\n### Most Helpful Answer")
+            else:
+                output.append("\n### Other Answer")
+
+            output += [
+                a.text.strip() for a in answer.select(".post-content .post-text")
+            ]
+            output.append("\n")
+
+        text = "\n".join(output).strip()
+
+        metadata = {"source": self.web_path, "title": title}
+
+        return [Document(page_content=text, metadata=metadata)]
+
+    def load_device(
+        self, url_override: Optional[str] = None, include_guides: bool = True
+    ) -> List[Document]:
+        documents = []
+        if url_override is None:
+            url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
+        else:
+            url = url_override
+
+        res = requests.get(url)
+        data = res.json()
+        text = "\n".join(
+            [
+                data[key]
+                for key in ["title", "description", "contents_raw"]
+                if key in data
+            ]
+        ).strip()
+
+        metadata = {"source": self.web_path, "title": data["title"]}
+        documents.append(Document(page_content=text, metadata=metadata))
+
+        if include_guides:
+            """Load and return documents for each guide linked to from the device"""
+            guide_urls = [guide["url"] for guide in data["guides"]]
+            for guide_url in guide_urls:
+                documents.append(IFixitLoader(guide_url).load()[0])
+
+        return documents
+
+    def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
+        if url_override is None:
+            url = IFIXIT_BASE_URL + "/guides/" + self.id
+        else:
+            url = url_override
+
+        res = requests.get(url)
+
+        if res.status_code != 200:
+            raise ValueError(
+                "Could not load guide: " + self.web_path + "\n" + res.json()
+            )
+
+        data = res.json()
+
+        doc_parts = ["# " + data["title"], data["introduction_raw"]]
+
+        doc_parts.append("\n\n###Tools Required:")
+        if len(data["tools"]) == 0:
+            doc_parts.append("\n - None")
+        else:
+            for tool in data["tools"]:
+                doc_parts.append("\n - " + tool["text"])
+
+        doc_parts.append("\n\n###Parts Required:")
+        if len(data["parts"]) == 0:
+            doc_parts.append("\n - None")
+        else:
+            for part in data["parts"]:
+                doc_parts.append("\n - " + part["text"])
+
+        for row in data["steps"]:
+            doc_parts.append(
+                "\n\n## "
+                + (
+                    row["title"]
+                    if row["title"] != ""
+                    else "Step {}".format(row["orderby"])
+                )
+            )
+
+            for line in row["lines"]:
+                doc_parts.append(line["text_raw"])
+
+        doc_parts.append(data["conclusion_raw"])
+
+        text = "\n".join(doc_parts)
+
+        metadata = {"source": self.web_path, "title": data["title"]}
+
+        return [Document(page_content=text, metadata=metadata)]
--- a/tests/integration_tests/document_loaders/__init__.py
+++ b/tests/integration_tests/document_loaders/__init__.py
@ -0,0 +1 @@
+"""Test document loader integrations."""
--- a/tests/integration_tests/document_loaders/test_ifixit.py
+++ b/tests/integration_tests/document_loaders/test_ifixit.py
@ -0,0 +1,37 @@
+from langchain.document_loaders.ifixit import IFixitLoader
+
+
+def test_ifixit_loader() -> None:
+    """Test iFixit loader."""
+    web_path = "https://www.ifixit.com/Guide/iPad+9+Battery+Replacement/151279"
+    loader = IFixitLoader(web_path)
+    assert loader.page_type == "Guide"
+    assert loader.id == "151279"
+    assert loader.web_path == web_path
+
+
+def test_ifixit_loader_teardown() -> None:
+    web_path = "https://www.ifixit.com/Teardown/Banana+Teardown/811"
+    loader = IFixitLoader(web_path)
+    """ Teardowns are just guides by a different name """
+    assert loader.page_type == "Guide"
+    assert loader.id == "811"
+
+
+def test_ifixit_loader_device() -> None:
+    web_path = "https://www.ifixit.com/Device/Standard_iPad"
+    loader = IFixitLoader(web_path)
+    """ Teardowns are just guides by a different name """
+    assert loader.page_type == "Device"
+    assert loader.id == "Standard_iPad"
+
+
+def test_ifixit_loader_answers() -> None:
+    web_path = (
+        "https://www.ifixit.com/Answers/View/318583/My+iPhone+6+is+typing+and+"
+        "opening+apps+by+itself"
+    )
+    loader = IFixitLoader(web_path)
+
+    assert loader.page_type == "Answers"
+    assert loader.id == "318583"