langchain/langchain/document_loaders/ifixit.py
Tim Asp d22651d82a Add new iFixit document loader (#1333)
iFixit is a Wikipedia-like site with a huge amount of open content
on how to fix things, questions/answers for common troubleshooting, and
more technical "things"-related content. All content is licensed under
CC BY-NC-SA 3.0.

Adding docs from iFixit as context for user questions like "I dropped my
phone in water, what do I do?" or "My MacBook Pro is making a whining
noise, what's wrong with it?" can yield significantly better responses
than context-free responses from LLMs.
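
A minimal usage sketch (importing straight from the module added in this PR; the guide URL and search query below are placeholder examples, not real iFixit IDs):

```python
from langchain.document_loaders.ifixit import IFixitLoader

# Load one repair guide as a Document (placeholder guide URL/ID)
loader = IFixitLoader("https://www.ifixit.com/Guide/Example+Repair+Guide/12345")
docs = loader.load()

# Or load Documents for iFixit's suggestions for a free-text query
suggestions = IFixitLoader.load_suggestions("dropped phone in water")
```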


"""Loader that loads iFixit data."""
from typing import List, Optional
import requests
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.web_base import WebBaseLoader
IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"

class IFixitLoader(BaseLoader):
    """Load iFixit repair guides, device wikis and answers.

    iFixit is the largest, open repair community on the web. The site contains nearly
    100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is
    licensed under CC BY-NC-SA 3.0.

    This loader will allow you to download the text of a repair guide, text of Q&A's
    and wikis from devices on iFixit using their open APIs and web scraping.
    """

    def __init__(self, web_path: str):
        """Initialize with web path."""
        if not web_path.startswith("https://www.ifixit.com"):
            raise ValueError("web path must start with 'https://www.ifixit.com'")

        path = web_path.replace("https://www.ifixit.com", "")

        allowed_paths = ["/Device", "/Guide", "/Answers", "/Teardown"]

        # TODO: Add /Wiki
        if not any(path.startswith(allowed_path) for allowed_path in allowed_paths):
            raise ValueError(
                "web path must start with /Device, /Guide, /Teardown or /Answers"
            )

        pieces = [x for x in path.split("/") if x]

        # Teardowns are just guides by a different name.
        self.page_type = pieces[0] if pieces[0] != "Teardown" else "Guide"

        if self.page_type == "Guide" or self.page_type == "Answers":
            self.id = pieces[2]
        else:
            self.id = pieces[1]

        self.web_path = web_path

    def load(self) -> List[Document]:
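        """Load the iFixit page at web_path as a list of Documents.

        Dispatches to the device, guide or Q&A loader based on the page type
        detected from the URL.
        """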
        if self.page_type == "Device":
            return self.load_device()
        elif self.page_type == "Guide" or self.page_type == "Teardown":
            return self.load_guide()
        elif self.page_type == "Answers":
            return self.load_questions_and_answers()
        else:
            raise ValueError("Unknown page type: " + self.page_type)

    @staticmethod
    def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
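        """Load Documents for iFixit suggestions matching a search query.

        Queries the iFixit suggest API (optionally restricted by doc_type) and
        loads every suggested device, guide or answer page; suggestions that
        fail to load are skipped.
        """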
        res = requests.get(
            IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
        )

        if res.status_code != 200:
            raise ValueError(
                'Could not load suggestions for "' + query + '"\n' + str(res.json())
            )

        data = res.json()

        results = data["results"]
        output = []

        for result in results:
            try:
                loader = IFixitLoader(result["url"])
                if loader.page_type == "Device":
                    output += loader.load_device(include_guides=False)
                else:
                    output += loader.load()
            except ValueError:
                continue

        return output

    def load_questions_and_answers(
        self, url_override: Optional[str] = None
    ) -> List[Document]:
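        """Load an iFixit Answers page (question and its answers) as a Document.

        The page is scraped with WebBaseLoader and rendered as markdown-style
        text, with the accepted and most helpful answers in their own sections.
        """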
        loader = WebBaseLoader(self.web_path if url_override is None else url_override)
        soup = loader.scrape()

        output = []

        title = soup.find("h1", "post-title").text

        output.append("# " + title)
        output.append(soup.select_one(".post-content .post-text").text.strip())

        output.append("\n## " + soup.find("div", "post-answers-header").text.strip())

        for answer in soup.select(".js-answers-list .post.post-answer"):
            if answer.has_attr("itemprop") and "acceptedAnswer" in answer["itemprop"]:
                output.append("\n### Accepted Answer")
            elif "post-helpful" in answer["class"]:
                output.append("\n### Most Helpful Answer")
            else:
                output.append("\n### Other Answer")

            output += [
                a.text.strip() for a in answer.select(".post-content .post-text")
            ]
            output.append("\n")

        text = "\n".join(output).strip()

        metadata = {"source": self.web_path, "title": title}

        return [Document(page_content=text, metadata=metadata)]

    def load_device(
        self, url_override: Optional[str] = None, include_guides: bool = True
    ) -> List[Document]:
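        """Load an iFixit device (category) wiki as Documents.

        Fetches the device wiki from the API and, when include_guides is True,
        also loads a Document for every guide linked from the device page.
        """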
        documents = []
        if url_override is None:
            url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
        else:
            url = url_override

        res = requests.get(url)
        data = res.json()
        text = "\n".join(
            [
                data[key]
                for key in ["title", "description", "contents_raw"]
                if key in data
            ]
        ).strip()

        metadata = {"source": self.web_path, "title": data["title"]}
        documents.append(Document(page_content=text, metadata=metadata))

        if include_guides:
            # Load and return documents for each guide linked to from the device.
            guide_urls = [guide["url"] for guide in data["guides"]]
            for guide_url in guide_urls:
                documents.append(IFixitLoader(guide_url).load()[0])

        return documents

    def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
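        """Load a single iFixit guide or teardown as a Document.

        Fetches the guide from the API and renders its introduction, required
        tools and parts, steps and conclusion as markdown-style text.
        """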
        if url_override is None:
            url = IFIXIT_BASE_URL + "/guides/" + self.id
        else:
            url = url_override

        res = requests.get(url)
        if res.status_code != 200:
            raise ValueError(
                "Could not load guide: " + self.web_path + "\n" + str(res.json())
            )

        data = res.json()

        doc_parts = ["# " + data["title"], data["introduction_raw"]]

        doc_parts.append("\n\n### Tools Required:")
        if len(data["tools"]) == 0:
            doc_parts.append("\n - None")
        else:
            for tool in data["tools"]:
                doc_parts.append("\n - " + tool["text"])

        doc_parts.append("\n\n### Parts Required:")
        if len(data["parts"]) == 0:
            doc_parts.append("\n - None")
        else:
            for part in data["parts"]:
                doc_parts.append("\n - " + part["text"])

        for row in data["steps"]:
            doc_parts.append(
                "\n\n## "
                + (
                    row["title"]
                    if row["title"] != ""
                    else "Step {}".format(row["orderby"])
                )
            )

            for line in row["lines"]:
                doc_parts.append(line["text_raw"])

        doc_parts.append(data["conclusion_raw"])

        text = "\n".join(doc_parts)

        metadata = {"source": self.web_path, "title": data["title"]}

        return [Document(page_content=text, metadata=metadata)]