added Wikipedia document loader (#4141)

- Added the `Wikipedia` document loader. It is based on the existing
  `utilities/WikipediaAPIWrapper`
- Added the respective unit tests and an example notebook
- Sorted the list of classes in `__init__`
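
For quick reference, the loader is used like this (a minimal sketch mirroring the example notebook added below; the query and `load_max_docs` values are only illustrative):

```python
from langchain.document_loaders import WikipediaLoader

# Load up to two wiki pages matching the query as Documents
docs = WikipediaLoader(query="HUNTER X HUNTER", load_max_docs=2).load()

print(len(docs))
print(docs[0].metadata)            # {'title': ..., 'summary': ...}
print(docs[0].page_content[:400])  # start of the article text
```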
Leonid Ganeline 2023-05-06 09:32:45 -07:00 committed by GitHub
parent 423f497168
commit 9544b30821
8 changed files with 379 additions and 125 deletions

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,130 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "bda1f3f5",
"metadata": {},
"source": [
"# Wikipedia\n",
"\n",
">[Wikipedia](https://wikipedia.org/) is a multilingual free online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. `Wikipedia` is the largest and most-read reference work in history.\n",
"\n",
"This notebook shows how to load wiki pages from `wikipedia.org` into the Document format that we use downstream."
]
},
{
"cell_type": "markdown",
"id": "1b7a1eef-7bf7-4e7d-8bfc-c4e27c9488cb",
"metadata": {},
"source": [
"## Installation"
]
},
{
"cell_type": "markdown",
"id": "2abd5578-aa3d-46b9-99af-8b262f0b3df8",
"metadata": {},
"source": [
"First, you need to install `wikipedia` python package."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b674aaea-ed3a-4541-8414-260a8f67f623",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"#!pip install wikipedia"
]
},
{
"cell_type": "markdown",
"id": "95f05e1c-195e-4e2b-ae8e-8d6637f15be6",
"metadata": {},
"source": [
"## Examples"
]
},
{
"cell_type": "markdown",
"id": "e29b954c-1407-4797-ae21-6ba8937156be",
"metadata": {},
"source": [
"`WikipediaLoader` has these arguments:\n",
"- `query`: free text which used to find documents in Wikipedia\n",
"- optional `lang`: default=\"en\". Use it to search in a specific language part of Wikipedia\n",
"- optional `load_max_docs`: default=100. Use it to limit number of downloaded documents. It takes time to download all 100 documents, so use a small number for experiments. There is a hard limit of 300 for now.\n",
"- optional `load_all_available_meta`: default=False. By default only the most important fields downloaded: `Published` (date when document was published/last updated), `title`, `Summary`. If True, other fields also downloaded."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9bfd5e46",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import WikipediaLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "700e4ef2",
"metadata": {},
"outputs": [],
"source": [
"docs = WikipediaLoader(query='HUNTER X HUNTER', load_max_docs=2).load()\n",
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8977bac0-0042-4f23-9754-247dbd32439b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"docs[0].metadata # meta-information of the Document"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46969806-45a9-4c4d-a61b-cfb9658fc9de",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"docs[0].page_content[:400] # a content of the Document \n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -93,6 +93,7 @@ from langchain.document_loaders.url_playwright import PlaywrightURLLoader
 from langchain.document_loaders.url_selenium import SeleniumURLLoader
 from langchain.document_loaders.web_base import WebBaseLoader
 from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
+from langchain.document_loaders.wikipedia import WikipediaLoader
 from langchain.document_loaders.word_document import (
     Docx2txtLoader,
     UnstructuredWordDocumentLoader,
@@ -111,8 +112,6 @@ __all__ = [
     "AirbyteJSONLoader",
     "ApifyDatasetLoader",
     "ArxivLoader",
-    "StripeLoader",
-    "SpreedlyLoader",
     "AzureBlobStorageContainerLoader",
     "AzureBlobStorageFileLoader",
     "BSHTMLLoader",
@@ -129,6 +128,7 @@ __all__ = [
     "DiffbotLoader",
     "DirectoryLoader",
     "DiscordChatLoader",
+    "Docx2txtLoader",
     "DuckDBLoader",
     "EverNoteLoader",
     "FacebookChatLoader",
@@ -137,18 +137,19 @@ __all__ = [
     "GitLoader",
     "GitbookLoader",
     "GoogleApiClient",
-    "RedditPostsLoader",
     "GoogleApiYoutubeLoader",
     "GoogleDriveLoader",
     "GutenbergLoader",
     "HNLoader",
     "HuggingFaceDatasetLoader",
+    "HuggingFaceDatasetLoader",
     "IFixitLoader",
     "IMSDbLoader",
     "ImageCaptionLoader",
     "JSONLoader",
-    "ModernTreasuryLoader",
     "MWDumpLoader",
+    "MathpixPDFLoader",
+    "ModernTreasuryLoader",
     "NotebookLoader",
     "NotionDBLoader",
     "NotionDirectoryLoader",
@@ -161,10 +162,12 @@ __all__ = [
     "PagedPDFSplitter",
     "PlaywrightURLLoader",
     "PyMuPDFLoader",
+    "PyPDFDirectoryLoader",
     "PyPDFLoader",
     "PyPDFium2Loader",
     "PythonLoader",
     "ReadTheDocsLoader",
+    "RedditPostsLoader",
     "RoamLoader",
     "S3DirectoryLoader",
     "S3FileLoader",
@@ -172,15 +175,17 @@ __all__ = [
     "SeleniumURLLoader",
     "SitemapLoader",
     "SlackDirectoryLoader",
+    "SpreedlyLoader",
+    "StripeLoader",
     "TelegramChatLoader",
     "TextLoader",
     "TomlLoader",
     "TwitterTweetLoader",
+    "UnstructuredAPIFileIOLoader",
+    "UnstructuredAPIFileLoader",
     "UnstructuredEPubLoader",
     "UnstructuredEmailLoader",
-    "UnstructuredAPIFileIOLoader",
     "UnstructuredFileIOLoader",
-    "UnstructuredAPIFileLoader",
     "UnstructuredFileLoader",
     "UnstructuredHTMLLoader",
     "UnstructuredImageLoader",
@@ -192,10 +197,6 @@ __all__ = [
     "UnstructuredWordDocumentLoader",
     "WebBaseLoader",
     "WhatsAppChatLoader",
+    "WikipediaLoader",
     "YoutubeLoader",
-    "PyPDFDirectoryLoader",
-    "MathpixPDFLoader",
-    "ChatGPTLoader",
-    "HuggingFaceDatasetLoader",
-    "Docx2txtLoader",
 ]

View File

@@ -0,0 +1,34 @@
from typing import List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.utilities.wikipedia import WikipediaAPIWrapper


class WikipediaLoader(BaseLoader):
    """Loads a query result from www.wikipedia.org into a list of Documents.

    The hard limit on the number of downloaded Documents is 300 for now.
    Each wiki page represents one Document.
    """

    def __init__(
        self,
        query: str,
        lang: str = "en",
        load_max_docs: Optional[int] = 100,
        load_all_available_meta: Optional[bool] = False,
    ):
        self.query = query
        self.lang = lang
        self.load_max_docs = load_max_docs
        self.load_all_available_meta = load_all_available_meta

    def load(self) -> List[Document]:
        client = WikipediaAPIWrapper(
            lang=self.lang,
            top_k_results=self.load_max_docs,
            load_all_available_meta=self.load_all_available_meta,
        )
        docs = client.load(self.query)
        return docs
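
A minimal usage sketch of the loader defined above (the query and the German `lang` code are only illustrative; the arguments come from the `__init__` signature):

```python
from langchain.document_loaders import WikipediaLoader

# Fetch at most three pages from the German-language Wikipedia
loader = WikipediaLoader(query="Alan Turing", lang="de", load_max_docs=3)
docs = loader.load()

for doc in docs:
    print(doc.metadata["title"], len(doc.page_content))
```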

View File

@@ -17,7 +17,7 @@ class WikipediaQueryRun(BaseTool):
     description = (
         "A wrapper around Wikipedia. "
         "Useful for when you need to answer general questions about "
-        "people, places, companies, historical events, or other subjects. "
+        "people, places, companies, facts, historical events, or other subjects. "
         "Input should be a search query."
     )
     api_wrapper: WikipediaAPIWrapper
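
The only change here is the tool's description string. For context, a minimal sketch of wiring this tool to the wrapper (this usage is not part of the diff; the query is illustrative):

```python
from langchain.tools.wikipedia.tool import WikipediaQueryRun
from langchain.utilities import WikipediaAPIWrapper

# The tool delegates the query to WikipediaAPIWrapper.run()
tool = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
print(tool.run("Ada Lovelace"))
```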

View File

@@ -62,10 +62,10 @@ class ArxivAPIWrapper(BaseModel):
     def run(self, query: str) -> str:
         """
-        Run Arxiv search and get the document meta information.
+        Run Arxiv search and get the article meta information.
         See https://lukasschwab.me/arxiv.py/index.html#Search
         See https://lukasschwab.me/arxiv.py/index.html#Result
-        It uses only the most informative fields of document meta information.
+        It uses only the most informative fields of article meta information.
         """
         try:
             docs = [
@@ -82,10 +82,10 @@ class ArxivAPIWrapper(BaseModel):
     def load(self, query: str) -> List[Document]:
         """
-        Run Arxiv search and get the PDF documents plus the meta information.
+        Run Arxiv search and get the article texts plus the article meta information.
         See https://lukasschwab.me/arxiv.py/index.html#Search
-        Returns: a list of documents with the document.page_content in PDF format
+        Returns: a list of documents with the document.page_content in text format
         """
         try:
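
The run/load split documented here is the same contract the Wikipedia wrapper follows: `run()` returns one formatted string, `load()` returns Documents. A minimal sketch (requires the `arxiv` package; the query is illustrative):

```python
from langchain.utilities.arxiv import ArxivAPIWrapper

arxiv = ArxivAPIWrapper()

meta_text = arxiv.run("Attention Is All You Need")  # article meta information as a single string
docs = arxiv.load("Attention Is All You Need")      # article texts as Documents
print(docs[0].metadata)
```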

View File

@ -1,8 +1,13 @@
"""Util that calls Wikipedia.""" """Util that calls Wikipedia."""
from typing import Any, Dict, Optional import logging
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Extra, root_validator from pydantic import BaseModel, Extra, root_validator
from langchain.schema import Document
logger = logging.getLogger(__name__)
WIKIPEDIA_MAX_QUERY_LENGTH = 300 WIKIPEDIA_MAX_QUERY_LENGTH = 300
@ -18,6 +23,7 @@ class WikipediaAPIWrapper(BaseModel):
wiki_client: Any #: :meta private: wiki_client: Any #: :meta private:
top_k_results: int = 3 top_k_results: int = 3
lang: str = "en" lang: str = "en"
load_all_available_meta: bool = False
class Config: class Config:
"""Configuration for this pydantic object.""" """Configuration for this pydantic object."""
@ -41,23 +47,70 @@ class WikipediaAPIWrapper(BaseModel):
def run(self, query: str) -> str: def run(self, query: str) -> str:
"""Run Wikipedia search and get page summaries.""" """Run Wikipedia search and get page summaries."""
search_results = self.wiki_client.search(query[:WIKIPEDIA_MAX_QUERY_LENGTH]) page_titles = self.wiki_client.search(query[:WIKIPEDIA_MAX_QUERY_LENGTH])
summaries = [] summaries = []
len_search_results = len(search_results) for page_title in page_titles[: self.top_k_results]:
if len_search_results == 0: if wiki_page := self._fetch_page(page_title):
if summary := self._formatted_page_summary(page_title, wiki_page):
summaries.append(summary)
if not summaries:
return "No good Wikipedia Search Result was found" return "No good Wikipedia Search Result was found"
for i in range(min(self.top_k_results, len_search_results)):
summary = self.fetch_formatted_page_summary(search_results[i])
if summary is not None:
summaries.append(summary)
return "\n\n".join(summaries) return "\n\n".join(summaries)
def fetch_formatted_page_summary(self, page: str) -> Optional[str]: @staticmethod
def _formatted_page_summary(page_title: str, wiki_page: Any) -> Optional[str]:
return f"Page: {page_title}\nSummary: {wiki_page.summary}"
def _page_to_document(self, page_title: str, wiki_page: Any) -> Document:
main_meta = {
"title": page_title,
"summary": wiki_page.summary,
}
add_meta = (
{
"categories": wiki_page.categories,
# "coordinates": wiki_page.coordinates,
"page_url": wiki_page.url,
"image_urls": wiki_page.images,
"related_titles": wiki_page.links,
"parent_id": wiki_page.parent_id,
"references": wiki_page.references,
"revision_id": wiki_page.revision_id,
"sections": wiki_page.sections,
}
if self.load_all_available_meta
else {}
)
doc = Document(
page_content=wiki_page.content,
metadata={
**main_meta,
**add_meta,
},
)
return doc
def _fetch_page(self, page: str) -> Optional[str]:
try: try:
wiki_page = self.wiki_client.page(title=page, auto_suggest=False) return self.wiki_client.page(title=page, auto_suggest=False)
return f"Page: {page}\nSummary: {wiki_page.summary}"
except ( except (
self.wiki_client.exceptions.PageError, self.wiki_client.exceptions.PageError,
self.wiki_client.exceptions.DisambiguationError, self.wiki_client.exceptions.DisambiguationError,
): ):
return None return None
def load(self, query: str) -> List[Document]:
"""
Run Wikipedia search and get the article text plus the meta information.
See
Returns: a list of documents with the document.page_content in PDF format
"""
page_titles = self.wiki_client.search(query[:WIKIPEDIA_MAX_QUERY_LENGTH])
docs = []
for page_title in page_titles[: self.top_k_results]:
if wiki_page := self._fetch_page(page_title):
if doc := self._page_to_document(page_title, wiki_page):
docs.append(doc)
return docs
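
Taken together, a short sketch of the wrapper's two entry points (assumes the `wikipedia` package is installed; the query and the printed field are illustrative):

```python
from langchain.utilities import WikipediaAPIWrapper

wiki = WikipediaAPIWrapper(top_k_results=2, load_all_available_meta=True)

print(wiki.run("Alan Turing"))  # "Page: ...\nSummary: ..." blocks joined by blank lines

docs = wiki.load("Alan Turing")  # full article texts as Documents
print(docs[0].metadata["revision_id"])  # extra field present because load_all_available_meta=True
```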

View File

@ -1,19 +1,56 @@
"""Integration test for Wikipedia API Wrapper.""" """Integration test for Wikipedia API Wrapper."""
from typing import List
import pytest
from langchain.schema import Document
from langchain.utilities import WikipediaAPIWrapper from langchain.utilities import WikipediaAPIWrapper
def test_call() -> None: @pytest.fixture
"""Test that WikipediaAPIWrapper returns correct answer""" def api_client() -> WikipediaAPIWrapper:
return WikipediaAPIWrapper()
wikipedia = WikipediaAPIWrapper()
output = wikipedia.run("HUNTER X HUNTER") def test_run_success(api_client: WikipediaAPIWrapper) -> None:
output = api_client.run("HUNTER X HUNTER")
assert "Yoshihiro Togashi" in output assert "Yoshihiro Togashi" in output
def test_no_result_call() -> None: def test_run_no_result(api_client: WikipediaAPIWrapper) -> None:
"""Test that call gives no result.""" output = api_client.run(
wikipedia = WikipediaAPIWrapper()
output = wikipedia.run(
"NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL" "NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL"
) )
assert "No good Wikipedia Search Result was found" == output assert "No good Wikipedia Search Result was found" == output
def assert_docs(docs: List[Document], all_meta: bool = False) -> None:
for doc in docs:
assert doc.page_content
assert doc.metadata
main_meta = {"title", "summary"}
assert set(doc.metadata).issuperset(main_meta)
if all_meta:
assert len(set(doc.metadata)) > len(main_meta)
else:
assert len(set(doc.metadata)) == len(main_meta)
def test_load_success(api_client: WikipediaAPIWrapper) -> None:
docs = api_client.load("HUNTER X HUNTER")
assert len(docs) > 1
assert_docs(docs, all_meta=False)
def test_load_success_all_meta(api_client: WikipediaAPIWrapper) -> None:
api_client.load_all_available_meta = True
docs = api_client.load("HUNTER X HUNTER")
assert len(docs) > 1
assert_docs(docs, all_meta=True)
def test_load_no_result(api_client: WikipediaAPIWrapper) -> None:
docs = api_client.load(
"NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL"
)
assert not docs