forked from Archives/langchain
Add new iFixit document loader (#1333)
iFixit is a Wikipedia-like site that has a huge amount of open content on how to fix things, questions and answers for common troubleshooting, and "things"-related content that is more technical in nature. All content is licensed under CC-BY-SA-NC 3.0. Adding docs from iFixit as context for user questions like "I dropped my phone in water, what do I do?" or "My MacBook Pro is making a whining noise, what's wrong with it?" can yield significantly better responses than context-free responses from LLMs.
This commit is contained in:
parent
1aa41b5741
commit
72ef69d1ba
199
docs/modules/document_loaders/examples/ifixit.ipynb
Normal file
199
docs/modules/document_loaders/examples/ifixit.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -59,6 +59,8 @@ There are a lot of different document loaders that LangChain supports. Below are
|
||||
|
||||
`CoNLL-U <./examples/CoNLL-U.html>`_: A walkthrough of how to load data from a CoNLL-U file.
|
||||
|
||||
`iFixit <./examples/ifixit.html>`_: A walkthrough of how to search and load data like guides, technical Q&A's, and device wikis from iFixit.com
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:glob:
|
||||
|
@ -16,6 +16,7 @@ from langchain.document_loaders.googledrive import GoogleDriveLoader
|
||||
from langchain.document_loaders.gutenberg import GutenbergLoader
|
||||
from langchain.document_loaders.hn import HNLoader
|
||||
from langchain.document_loaders.html import UnstructuredHTMLLoader
|
||||
from langchain.document_loaders.ifixit import IFixitLoader
|
||||
from langchain.document_loaders.image import UnstructuredImageLoader
|
||||
from langchain.document_loaders.imsdb import IMSDbLoader
|
||||
from langchain.document_loaders.notebook import NotebookLoader
|
||||
@ -70,6 +71,7 @@ __all__ = [
|
||||
"IMSDbLoader",
|
||||
"AZLyricsLoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"IFixitLoader",
|
||||
"GutenbergLoader",
|
||||
"PagedPDFSplitter",
|
||||
"EverNoteLoader",
|
||||
|
202
langchain/document_loaders/ifixit.py
Normal file
202
langchain/document_loaders/ifixit.py
Normal file
@ -0,0 +1,202 @@
|
||||
"""Loader that loads iFixit data."""
|
||||
from typing import List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
# Root URL of the iFixit REST API (version 2.0). Endpoint paths such as
# "/guides/<id>", "/wikis/CATEGORY/<name>" and "/suggest/<query>" are appended to it.
IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"
|
||||
|
||||
|
||||
class IFixitLoader(BaseLoader):
    """Load iFixit repair guides, device wikis and answers.

    iFixit is the largest, open repair community on the web. The site contains nearly
    100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is
    licensed under CC-BY-NC-SA 3.0.

    This loader will allow you to download the text of a repair guide, text of Q&A's
    and wikis from devices on iFixit using their open APIs and web scraping.
    """

    def __init__(self, web_path: str):
        """Initialize with web path.

        Args:
            web_path: Full URL of an iFixit page. It must start with
                ``https://www.ifixit.com`` and point at a ``/Device``, ``/Guide``,
                ``/Teardown`` or ``/Answers`` page.

        Raises:
            ValueError: If the URL is not an iFixit URL, is not one of the
                supported page types, or does not contain a page id.
        """
        base = "https://www.ifixit.com"
        if not web_path.startswith(base):
            raise ValueError("web path must start with 'https://www.ifixit.com'")

        # Strip only the leading host (str.replace would remove every occurrence)
        # and drop any query string / fragment so they cannot leak into the id.
        path = web_path[len(base) :].split("?", 1)[0].split("#", 1)[0]

        # TODO: Add /Wiki
        allowed_types = ("Device", "Guide", "Answers", "Teardown")

        pieces = [x for x in path.split("/") if x]

        # Compare the whole first segment so that e.g. "/Devices" is rejected here
        # instead of failing later in load().
        if not path.startswith("/") or not pieces or pieces[0] not in allowed_types:
            raise ValueError(
                "web path must start with /Device, /Guide, /Teardown or /Answers"
            )

        # Teardowns are just guides by a different name
        self.page_type = pieces[0] if pieces[0] != "Teardown" else "Guide"

        # Guide and Answers URLs carry one extra segment before the id
        # (e.g. /Guide/<title>/<id>, /Answers/View/<id>/<title>); Device URLs are
        # /Device/<name>.
        id_index = 2 if self.page_type in ("Guide", "Answers") else 1
        if len(pieces) <= id_index:
            # Raise ValueError (not IndexError) so callers that skip unsupported
            # URLs, such as load_suggestions, keep working.
            raise ValueError("web path does not contain a page id: " + web_path)
        self.id = pieces[id_index]

        self.web_path = web_path

    def load(self) -> List[Document]:
        """Load the page this loader was created for.

        Returns:
            The documents produced by the loader method matching ``page_type``.

        Raises:
            ValueError: If ``page_type`` is not a supported page type.
        """
        if self.page_type == "Device":
            return self.load_device()
        elif self.page_type in ("Guide", "Teardown"):
            return self.load_guide()
        elif self.page_type == "Answers":
            return self.load_questions_and_answers()
        else:
            raise ValueError("Unknown page type: " + self.page_type)

    @staticmethod
    def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
        """Search iFixit and load a document for every supported search result.

        Args:
            query: Free-text search query.
            doc_type: Value sent as the ``doctypes`` query parameter.

        Returns:
            Documents for each result whose URL the loader supports. Device
            results are loaded without their linked guides.

        Raises:
            ValueError: If the suggest endpoint does not answer with HTTP 200.
        """
        from urllib.parse import quote

        # Percent-encode the query so characters such as "/", "?" or "#" cannot
        # change the request path; requests encodes the params itself.
        res = requests.get(
            IFIXIT_BASE_URL + "/suggest/" + quote(query, safe=""),
            params={"doctypes": doc_type},
        )

        if res.status_code != 200:
            # res.json() is not a str (and may not even be valid JSON on errors),
            # so use the raw response text in the message.
            raise ValueError(
                'Could not load suggestions for "' + query + '"\n' + res.text
            )

        data = res.json()

        results = data["results"]
        output: List[Document] = []

        for result in results:
            try:
                loader = IFixitLoader(result["url"])
                if loader.page_type == "Device":
                    output += loader.load_device(include_guides=False)
                else:
                    output += loader.load()
            except ValueError:
                # Best effort: skip results the loader does not support or
                # could not fetch.
                continue

        return output

    def load_questions_and_answers(
        self, url_override: Optional[str] = None
    ) -> List[Document]:
        """Scrape an Answers page into a single markdown-like document.

        Args:
            url_override: URL to scrape instead of ``self.web_path``.

        Returns:
            A one-element list holding the question, followed by its answers
            grouped under "Accepted", "Most Helpful" or "Other" headings.
        """
        loader = WebBaseLoader(self.web_path if url_override is None else url_override)
        soup = loader.scrape()

        output: List[str] = []

        title = soup.find("h1", "post-title").text

        output.append("# " + title)
        output.append(soup.select_one(".post-content .post-text").text.strip())

        output.append("\n## " + soup.find("div", "post-answers-header").text.strip())
        for answer in soup.select(".js-answers-list .post.post-answer"):
            if answer.has_attr("itemprop") and "acceptedAnswer" in answer["itemprop"]:
                output.append("\n### Accepted Answer")
            elif "post-helpful" in answer["class"]:
                output.append("\n### Most Helpful Answer")
            else:
                output.append("\n### Other Answer")

            output += [
                a.text.strip() for a in answer.select(".post-content .post-text")
            ]
            output.append("\n")

        text = "\n".join(output).strip()

        metadata = {"source": self.web_path, "title": title}

        return [Document(page_content=text, metadata=metadata)]

    def load_device(
        self, url_override: Optional[str] = None, include_guides: bool = True
    ) -> List[Document]:
        """Load a device wiki and, optionally, every guide linked from it.

        Args:
            url_override: API URL to request instead of the one built from
                ``self.id``.
            include_guides: When true, also load each guide listed in the
                device data.

        Returns:
            The device document followed by one document per linked guide.

        Raises:
            ValueError: If the API does not answer with HTTP 200.
        """
        documents = []
        if url_override is None:
            url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
        else:
            url = url_override

        res = requests.get(url)

        # Mirror load_guide: fail with a clear ValueError instead of a KeyError
        # on the error payload further down.
        if res.status_code != 200:
            raise ValueError(
                "Could not load device: " + self.web_path + "\n" + res.text
            )

        data = res.json()
        text = "\n".join(
            [
                data[key]
                for key in ["title", "description", "contents_raw"]
                if key in data
            ]
        ).strip()

        metadata = {"source": self.web_path, "title": data["title"]}
        documents.append(Document(page_content=text, metadata=metadata))

        if include_guides:
            # Load and return documents for each guide linked to from the device
            guide_urls = [guide["url"] for guide in data.get("guides", [])]
            for guide_url in guide_urls:
                documents.extend(IFixitLoader(guide_url).load())

        return documents

    def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
        """Load a repair guide (or teardown) as a single markdown-like document.

        Args:
            url_override: API URL to request instead of the one built from
                ``self.id``.

        Returns:
            A one-element list with the guide's title, introduction, required
            tools and parts, steps and conclusion.

        Raises:
            ValueError: If the API does not answer with HTTP 200.
        """
        if url_override is None:
            url = IFIXIT_BASE_URL + "/guides/" + self.id
        else:
            url = url_override

        res = requests.get(url)

        if res.status_code != 200:
            # Use the raw body: concatenating the parsed JSON (a dict) to a str
            # would raise TypeError and hide the real error.
            raise ValueError(
                "Could not load guide: " + self.web_path + "\n" + res.text
            )

        data = res.json()

        doc_parts = ["# " + data["title"], data["introduction_raw"]]

        doc_parts.append("\n\n###Tools Required:")
        if len(data["tools"]) == 0:
            doc_parts.append("\n - None")
        else:
            for tool in data["tools"]:
                doc_parts.append("\n - " + tool["text"])

        doc_parts.append("\n\n###Parts Required:")
        if len(data["parts"]) == 0:
            doc_parts.append("\n - None")
        else:
            for part in data["parts"]:
                doc_parts.append("\n - " + part["text"])

        for row in data["steps"]:
            # Untitled steps fall back to their position in the guide.
            doc_parts.append(
                "\n\n## "
                + (
                    row["title"]
                    if row["title"] != ""
                    else "Step {}".format(row["orderby"])
                )
            )

            for line in row["lines"]:
                doc_parts.append(line["text_raw"])

        doc_parts.append(data["conclusion_raw"])

        text = "\n".join(doc_parts)

        metadata = {"source": self.web_path, "title": data["title"]}

        return [Document(page_content=text, metadata=metadata)]
|
1
tests/integration_tests/document_loaders/__init__.py
Normal file
1
tests/integration_tests/document_loaders/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Test document loader integrations."""
|
37
tests/integration_tests/document_loaders/test_ifixit.py
Normal file
37
tests/integration_tests/document_loaders/test_ifixit.py
Normal file
@ -0,0 +1,37 @@
|
||||
from langchain.document_loaders.ifixit import IFixitLoader
|
||||
|
||||
|
||||
def test_ifixit_loader() -> None:
    """Check that a guide URL is split into page type, id and web path."""
    url = "https://www.ifixit.com/Guide/iPad+9+Battery+Replacement/151279"
    guide_loader = IFixitLoader(url)

    assert guide_loader.page_type == "Guide"
    assert guide_loader.id == "151279"
    assert guide_loader.web_path == url
|
||||
|
||||
|
||||
def test_ifixit_loader_teardown() -> None:
    """Check that a teardown URL is treated as a guide."""
    teardown_loader = IFixitLoader(
        "https://www.ifixit.com/Teardown/Banana+Teardown/811"
    )

    # Teardowns are just guides by a different name
    assert teardown_loader.page_type == "Guide"
    assert teardown_loader.id == "811"
|
||||
|
||||
|
||||
def test_ifixit_loader_device() -> None:
    """Check that a device URL yields the Device page type and the device name as id."""
    web_path = "https://www.ifixit.com/Device/Standard_iPad"
    loader = IFixitLoader(web_path)

    # Device URLs have the form /Device/<name>, so the id is the second segment.
    assert loader.page_type == "Device"
    assert loader.id == "Standard_iPad"
|
||||
|
||||
|
||||
def test_ifixit_loader_answers() -> None:
    """Check that an Answers URL is split into page type and question id."""
    question_url = (
        "https://www.ifixit.com/Answers/View/318583/My+iPhone+6+is+typing+and+"
        "opening+apps+by+itself"
    )
    answers_loader = IFixitLoader(question_url)

    assert answers_loader.page_type == "Answers"
    assert answers_loader.id == "318583"
|
Loading…
Reference in New Issue
Block a user