From 9544b30821cc619842e837ddf8d61076053bd482 Mon Sep 17 00:00:00 2001 From: Leonid Ganeline Date: Sat, 6 May 2023 09:32:45 -0700 Subject: [PATCH] added `Wikipedia` document loader (#4141) - Added the `Wikipedia` document loader. It is based on the existing `utilities/WikipediaAPIWrapper` - Added the respective unit tests and an example notebook - Sorted the list of classes in __init__ --- .../agents/tools/examples/wikipedia.ipynb | 173 +++++++++--------- .../document_loaders/examples/wikipedia.ipynb | 130 +++++++++++++ langchain/document_loaders/__init__.py | 23 +-- langchain/document_loaders/wikipedia.py | 34 ++++ langchain/tools/wikipedia/tool.py | 2 +- langchain/utilities/arxiv.py | 8 +- langchain/utilities/wikipedia.py | 75 ++++++-- .../utilities/test_wikipedia_api.py | 53 +++++- 8 files changed, 376 insertions(+), 122 deletions(-) create mode 100644 docs/modules/indexes/document_loaders/examples/wikipedia.ipynb create mode 100644 langchain/document_loaders/wikipedia.py diff --git a/docs/modules/agents/tools/examples/wikipedia.ipynb b/docs/modules/agents/tools/examples/wikipedia.ipynb index bf502331..3592d833 100644 --- a/docs/modules/agents/tools/examples/wikipedia.ipynb +++ b/docs/modules/agents/tools/examples/wikipedia.ipynb @@ -1,93 +1,92 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "245a954a", - "metadata": {}, - "source": [ - "# Wikipedia API\n", - "\n", - "This notebook goes over how to use the wikipedia component.\n", - "\n", - "First, you need to install `wikipedia` python package." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "961b3689", - "metadata": { - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "pip install wikipedia" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8d32b39a", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.utilities import WikipediaAPIWrapper" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "2a50dd27", - "metadata": {}, - "outputs": [], - "source": [ - "wikipedia = WikipediaAPIWrapper()" - ] - }, + "cells": [ + { + "cell_type": "markdown", + "id": "245a954a", + "metadata": {}, + "source": [ + "# Wikipedia\n", + "\n", + ">[Wikipedia](https://wikipedia.org/) is a multilingual free online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. `Wikipedia` is the largest and most-read reference work in history.\n", + "\n", + "First, you need to install `wikipedia` python package." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "961b3689", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "!pip install wikipedia" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8d32b39a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.utilities import WikipediaAPIWrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2a50dd27", + "metadata": {}, + "outputs": [], + "source": [ + "wikipedia = WikipediaAPIWrapper()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "34bb5968", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "id": "34bb5968", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Page: Hunter × Hunter\\nSummary: Hunter × Hunter (stylized as HUNTER×HUNTER and pronounced \"hunter hunter\") is a Japanese manga series written and illustrated by Yoshihiro Togashi. It has been serialized in Shueisha\\'s shōnen manga magazine Weekly Shōnen Jump since March 1998, although the manga has frequently gone on extended hiatuses since 2006. Its chapters have been collected in 37 tankōbon volumes as of November 2022. The story focuses on a young boy named Gon Freecss who discovers that his father, who left him at a young age, is actually a world-renowned Hunter, a licensed professional who specializes in fantastical pursuits such as locating rare or unidentified animal species, treasure hunting, surveying unexplored enclaves, or hunting down lawless individuals. Gon departs on a journey to become a Hunter and eventually find his father. Along the way, Gon meets various other Hunters and encounters the paranormal.\\nHunter × Hunter was adapted into a 62-episode anime television series produced by Nippon Animation and directed by Kazuhiro Furuhashi, which ran on Fuji Television from October 1999 to March 2001. 
Three separate original video animations (OVAs) totaling 30 episodes were subsequently produced by Nippon Animation and released in Japan from 2002 to 2004. A second anime television series by Madhouse aired on Nippon Television from October 2011 to September 2014, totaling 148 episodes, with two animated theatrical films released in 2013. There are also numerous audio albums, video games, musicals, and other media based on Hunter × Hunter.\\nThe manga has been translated into English and released in North America by Viz Media since April 2005. Both television series have been also licensed by Viz Media, with the first series having aired on the Funimation Channel in 2009 and the second series broadcast on Adult Swim\\'s Toonami programming block from April 2016 to June 2019.\\nHunter × Hunter has been a huge critical and financial success and has become one of the best-selling manga series of all time, having over 84 million copies in circulation by July 2022.\\n\\nPage: Hunter × Hunter (2011 TV series)\\nSummary: Hunter × Hunter is an anime television series that aired from 2011 to 2014 based on Yoshihiro Togashi\\'s manga series Hunter × Hunter. The story begins with a young boy named Gon Freecss, who one day discovers that the father who he thought was dead, is in fact alive and well. He learns that his father, Ging, is a legendary \"Hunter\", an individual who has proven themselves an elite member of humanity. Despite the fact that Ging left his son with his relatives in order to pursue his own dreams, Gon becomes determined to follow in his father\\'s footsteps, pass the rigorous \"Hunter Examination\", and eventually find his father to become a Hunter in his own right.\\nThis new Hunter × Hunter anime was announced on July 24, 2011. It is a complete reboot of the anime adaptation starting from the beginning of the manga, with no connections to the first anime from 1999. 
Produced by Nippon TV, VAP, Shueisha and Madhouse, the series is directed by Hiroshi Kōjina, with Atsushi Maekawa and Tsutomu Kamishiro handling series composition, Takahiro Yoshimatsu designing the characters and Yoshihisa Hirano composing the music. Instead of having the old cast reprise their roles for the new adaptation, the series features an entirely new cast to voice the characters. The new series premiered airing weekly on Nippon TV and the nationwide Nippon News Network from October 2, 2011. The series started to be collected in both DVD and Blu-ray format on January 25, 2012. Viz Media has licensed the anime for a DVD/Blu-ray release in North America with an English dub. On television, the series began airing on Adult Swim\\'s Toonami programming block on April 17, 2016, and ended on June 23, 2019.The anime series\\' opening theme is alternated between the song \"Departure!\" and an alternate version titled \"Departure! -Second Version-\" both sung by Galneryus\\' vocalist Masatoshi Ono. Five pieces of music were used as the ending theme; \"Just Awake\" by the Japanese band Fear, and Loathing in Las Vegas in episodes 1 to 26, \"Hunting for Your Dream\" by Galneryus in episodes 27 to 58, \"Reason\" sung by Japanese duo Yuzu in episodes 59 to 75, \"Nagareboshi Kirari\" also sung by Yuzu from episode 76 to 98, which was originally from the anime film adaptation, Hunter × Hunter: Phantom Rouge, and \"Hyōri Ittai\" by Yuzu featuring Hyadain from episode 99 to 146, which was also used in the film Hunter × Hunter: The Last Mission. The background music and soundtrack for the series was composed by Yoshihisa Hirano.\\n\\n\\n\\nPage: List of Hunter × Hunter characters\\nSummary: The Hunter × Hunter manga series, created by Yoshihiro Togashi, features an extensive cast of characters. It takes place in a fictional universe where licensed specialists known as Hunters travel the world taking on special jobs ranging from treasure hunting to assassination. 
The story initially focuses on Gon Freecss and his quest to become a Hunter in order to find his father, Ging, who is himself a famous Hunter. On the way, Gon meets and becomes close friends with Killua Zoldyck, Kurapika and Leorio Paradinight.\\nAlthough most characters are human, most possess superhuman strength and/or supernatural abilities due to Nen, the ability to control one\\'s own life energy or aura. The world of the series also includes fantastical beasts such as the Chimera Ants or the Five great calamities.'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wikipedia.run('HUNTER X HUNTER')" + "data": { + "text/plain": [ + "'Page: Hunter × Hunter\\nSummary: Hunter × Hunter (stylized as HUNTER×HUNTER and pronounced \"hunter hunter\") is a Japanese manga series written and illustrated by Yoshihiro Togashi. It has been serialized in Shueisha\\'s shōnen manga magazine Weekly Shōnen Jump since March 1998, although the manga has frequently gone on extended hiatuses since 2006. Its chapters have been collected in 37 tankōbon volumes as of November 2022. The story focuses on a young boy named Gon Freecss who discovers that his father, who left him at a young age, is actually a world-renowned Hunter, a licensed professional who specializes in fantastical pursuits such as locating rare or unidentified animal species, treasure hunting, surveying unexplored enclaves, or hunting down lawless individuals. Gon departs on a journey to become a Hunter and eventually find his father. Along the way, Gon meets various other Hunters and encounters the paranormal.\\nHunter × Hunter was adapted into a 62-episode anime television series produced by Nippon Animation and directed by Kazuhiro Furuhashi, which ran on Fuji Television from October 1999 to March 2001. Three separate original video animations (OVAs) totaling 30 episodes were subsequently produced by Nippon Animation and released in Japan from 2002 to 2004. 
A second anime television series by Madhouse aired on Nippon Television from October 2011 to September 2014, totaling 148 episodes, with two animated theatrical films released in 2013. There are also numerous audio albums, video games, musicals, and other media based on Hunter × Hunter.\\nThe manga has been translated into English and released in North America by Viz Media since April 2005. Both television series have been also licensed by Viz Media, with the first series having aired on the Funimation Channel in 2009 and the second series broadcast on Adult Swim\\'s Toonami programming block from April 2016 to June 2019.\\nHunter × Hunter has been a huge critical and financial success and has become one of the best-selling manga series of all time, having over 84 million copies in circulation by July 2022.\\n\\nPage: Hunter × Hunter (2011 TV series)\\nSummary: Hunter × Hunter is an anime television series that aired from 2011 to 2014 based on Yoshihiro Togashi\\'s manga series Hunter × Hunter. The story begins with a young boy named Gon Freecss, who one day discovers that the father who he thought was dead, is in fact alive and well. He learns that his father, Ging, is a legendary \"Hunter\", an individual who has proven themselves an elite member of humanity. Despite the fact that Ging left his son with his relatives in order to pursue his own dreams, Gon becomes determined to follow in his father\\'s footsteps, pass the rigorous \"Hunter Examination\", and eventually find his father to become a Hunter in his own right.\\nThis new Hunter × Hunter anime was announced on July 24, 2011. It is a complete reboot of the anime adaptation starting from the beginning of the manga, with no connections to the first anime from 1999. 
Produced by Nippon TV, VAP, Shueisha and Madhouse, the series is directed by Hiroshi Kōjina, with Atsushi Maekawa and Tsutomu Kamishiro handling series composition, Takahiro Yoshimatsu designing the characters and Yoshihisa Hirano composing the music. Instead of having the old cast reprise their roles for the new adaptation, the series features an entirely new cast to voice the characters. The new series premiered airing weekly on Nippon TV and the nationwide Nippon News Network from October 2, 2011. The series started to be collected in both DVD and Blu-ray format on January 25, 2012. Viz Media has licensed the anime for a DVD/Blu-ray release in North America with an English dub. On television, the series began airing on Adult Swim\\'s Toonami programming block on April 17, 2016, and ended on June 23, 2019.The anime series\\' opening theme is alternated between the song \"Departure!\" and an alternate version titled \"Departure! -Second Version-\" both sung by Galneryus\\' vocalist Masatoshi Ono. Five pieces of music were used as the ending theme; \"Just Awake\" by the Japanese band Fear, and Loathing in Las Vegas in episodes 1 to 26, \"Hunting for Your Dream\" by Galneryus in episodes 27 to 58, \"Reason\" sung by Japanese duo Yuzu in episodes 59 to 75, \"Nagareboshi Kirari\" also sung by Yuzu from episode 76 to 98, which was originally from the anime film adaptation, Hunter × Hunter: Phantom Rouge, and \"Hyōri Ittai\" by Yuzu featuring Hyadain from episode 99 to 146, which was also used in the film Hunter × Hunter: The Last Mission. The background music and soundtrack for the series was composed by Yoshihisa Hirano.\\n\\n\\n\\nPage: List of Hunter × Hunter characters\\nSummary: The Hunter × Hunter manga series, created by Yoshihiro Togashi, features an extensive cast of characters. It takes place in a fictional universe where licensed specialists known as Hunters travel the world taking on special jobs ranging from treasure hunting to assassination. 
The story initially focuses on Gon Freecss and his quest to become a Hunter in order to find his father, Ging, who is himself a famous Hunter. On the way, Gon meets and becomes close friends with Killua Zoldyck, Kurapika and Leorio Paradinight.\\nAlthough most characters are human, most possess superhuman strength and/or supernatural abilities due to Nen, the ability to control one\\'s own life energy or aura. The world of the series also includes fantastical beasts such as the Chimera Ants or the Five great calamities.'" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } + ], + "source": [ + "wikipedia.run('HUNTER X HUNTER')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 5 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/docs/modules/indexes/document_loaders/examples/wikipedia.ipynb b/docs/modules/indexes/document_loaders/examples/wikipedia.ipynb new file mode 100644 index 00000000..84685f31 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/wikipedia.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bda1f3f5", + "metadata": {}, + "source": [ + "# Wikipedia\n", + "\n", + ">[Wikipedia](https://wikipedia.org/) is a multilingual free 
online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. `Wikipedia` is the largest and most-read reference work in history.\n", + "\n", + "This notebook shows how to load wiki pages from `wikipedia.org` into the Document format that we use downstream." + ] + }, + { + "cell_type": "markdown", + "id": "1b7a1eef-7bf7-4e7d-8bfc-c4e27c9488cb", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "markdown", + "id": "2abd5578-aa3d-46b9-99af-8b262f0b3df8", + "metadata": {}, + "source": [ + "First, you need to install the `wikipedia` Python package." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b674aaea-ed3a-4541-8414-260a8f67f623", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#!pip install wikipedia" + ] + }, + { + "cell_type": "markdown", + "id": "95f05e1c-195e-4e2b-ae8e-8d6637f15be6", + "metadata": {}, + "source": [ + "## Examples" + ] + }, + { + "cell_type": "markdown", + "id": "e29b954c-1407-4797-ae21-6ba8937156be", + "metadata": {}, + "source": [ + "`WikipediaLoader` has these arguments:\n", + "- `query`: free text which is used to find documents in Wikipedia\n", + "- optional `lang`: default=\"en\". Use it to search in a specific language part of Wikipedia\n", + "- optional `load_max_docs`: default=100. Use it to limit the number of downloaded documents. It takes time to download all 100 documents, so use a small number for experiments. There is a hard limit of 300 for now.\n", + "- optional `load_all_available_meta`: default=False. By default only the most important fields are downloaded: `title` and `summary`. If True, other fields are also downloaded." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9bfd5e46", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import WikipediaLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "700e4ef2", + "metadata": {}, + "outputs": [], + "source": [ + "docs = WikipediaLoader(query='HUNTER X HUNTER', load_max_docs=2).load()\n", + "len(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8977bac0-0042-4f23-9754-247dbd32439b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs[0].metadata # meta-information of the Document" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46969806-45a9-4c4d-a61b-cfb9658fc9de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs[0].page_content[:400] # a content of the Document \n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index e5c5eb1c..f2ff3d9c 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -93,6 +93,7 @@ from langchain.document_loaders.url_playwright import PlaywrightURLLoader from langchain.document_loaders.url_selenium import SeleniumURLLoader from langchain.document_loaders.web_base import WebBaseLoader from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader +from langchain.document_loaders.wikipedia import WikipediaLoader from langchain.document_loaders.word_document import ( Docx2txtLoader, 
UnstructuredWordDocumentLoader, @@ -111,8 +112,6 @@ __all__ = [ "AirbyteJSONLoader", "ApifyDatasetLoader", "ArxivLoader", - "StripeLoader", - "SpreedlyLoader", "AzureBlobStorageContainerLoader", "AzureBlobStorageFileLoader", "BSHTMLLoader", @@ -129,6 +128,7 @@ __all__ = [ "DiffbotLoader", "DirectoryLoader", "DiscordChatLoader", + "Docx2txtLoader", "DuckDBLoader", "EverNoteLoader", "FacebookChatLoader", @@ -137,18 +137,19 @@ __all__ = [ "GitLoader", "GitbookLoader", "GoogleApiClient", - "RedditPostsLoader", "GoogleApiYoutubeLoader", "GoogleDriveLoader", "GutenbergLoader", "HNLoader", "HuggingFaceDatasetLoader", + "HuggingFaceDatasetLoader", "IFixitLoader", "IMSDbLoader", "ImageCaptionLoader", "JSONLoader", - "ModernTreasuryLoader", "MWDumpLoader", + "MathpixPDFLoader", + "ModernTreasuryLoader", "NotebookLoader", "NotionDBLoader", "NotionDirectoryLoader", @@ -161,10 +162,12 @@ __all__ = [ "PagedPDFSplitter", "PlaywrightURLLoader", "PyMuPDFLoader", + "PyPDFDirectoryLoader", "PyPDFLoader", "PyPDFium2Loader", "PythonLoader", "ReadTheDocsLoader", + "RedditPostsLoader", "RoamLoader", "S3DirectoryLoader", "S3FileLoader", @@ -172,15 +175,17 @@ __all__ = [ "SeleniumURLLoader", "SitemapLoader", "SlackDirectoryLoader", + "SpreedlyLoader", + "StripeLoader", "TelegramChatLoader", "TextLoader", "TomlLoader", "TwitterTweetLoader", + "UnstructuredAPIFileIOLoader", + "UnstructuredAPIFileLoader", "UnstructuredEPubLoader", "UnstructuredEmailLoader", - "UnstructuredAPIFileIOLoader", "UnstructuredFileIOLoader", - "UnstructuredAPIFileLoader", "UnstructuredFileLoader", "UnstructuredHTMLLoader", "UnstructuredImageLoader", @@ -192,10 +197,6 @@ __all__ = [ "UnstructuredWordDocumentLoader", "WebBaseLoader", "WhatsAppChatLoader", + "WikipediaLoader", "YoutubeLoader", - "PyPDFDirectoryLoader", - "MathpixPDFLoader", - "ChatGPTLoader", - "HuggingFaceDatasetLoader", - "Docx2txtLoader", ] diff --git a/langchain/document_loaders/wikipedia.py b/langchain/document_loaders/wikipedia.py new file mode 
100644 index 00000000..c1b2b693 --- /dev/null +++ b/langchain/document_loaders/wikipedia.py @@ -0,0 +1,34 @@ +from typing import List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader +from langchain.utilities.wikipedia import WikipediaAPIWrapper + + +class WikipediaLoader(BaseLoader): + """Loads a query result from www.wikipedia.org into a list of Documents. + The hard limit on the number of downloaded Documents is 300 for now. + + Each wiki page represents one Document. + """ + + def __init__( + self, + query: str, + lang: str = "en", + load_max_docs: Optional[int] = 100, + load_all_available_meta: Optional[bool] = False, + ): + self.query = query + self.lang = lang + self.load_max_docs = load_max_docs + self.load_all_available_meta = load_all_available_meta + + def load(self) -> List[Document]: + client = WikipediaAPIWrapper( + lang=self.lang, + top_k_results=self.load_max_docs, + load_all_available_meta=self.load_all_available_meta, + ) + docs = client.load(self.query) + return docs diff --git a/langchain/tools/wikipedia/tool.py b/langchain/tools/wikipedia/tool.py index af398d7f..f8275c67 100644 --- a/langchain/tools/wikipedia/tool.py +++ b/langchain/tools/wikipedia/tool.py @@ -17,7 +17,7 @@ class WikipediaQueryRun(BaseTool): description = ( "A wrapper around Wikipedia. " "Useful for when you need to answer general questions about " - "people, places, companies, historical events, or other subjects. " + "people, places, companies, facts, historical events, or other subjects. " "Input should be a search query." ) api_wrapper: WikipediaAPIWrapper diff --git a/langchain/utilities/arxiv.py b/langchain/utilities/arxiv.py index 2ea37d6e..023ec750 100644 --- a/langchain/utilities/arxiv.py +++ b/langchain/utilities/arxiv.py @@ -62,10 +62,10 @@ class ArxivAPIWrapper(BaseModel): def run(self, query: str) -> str: """ - Run Arxiv search and get the document meta information. 
+ Run Arxiv search and get the article meta information. See https://lukasschwab.me/arxiv.py/index.html#Search See https://lukasschwab.me/arxiv.py/index.html#Result - It uses only the most informative fields of document meta information. + It uses only the most informative fields of article meta information. """ try: docs = [ @@ -82,10 +82,10 @@ class ArxivAPIWrapper(BaseModel): def load(self, query: str) -> List[Document]: """ - Run Arxiv search and get the PDF documents plus the meta information. + Run Arxiv search and get the article texts plus the article meta information. See https://lukasschwab.me/arxiv.py/index.html#Search - Returns: a list of documents with the document.page_content in PDF format + Returns: a list of documents with the document.page_content in text format """ try: diff --git a/langchain/utilities/wikipedia.py b/langchain/utilities/wikipedia.py index cc6cc6ae..e59e15bb 100644 --- a/langchain/utilities/wikipedia.py +++ b/langchain/utilities/wikipedia.py @@ -1,8 +1,13 @@ """Util that calls Wikipedia.""" -from typing import Any, Dict, Optional +import logging +from typing import Any, Dict, List, Optional from pydantic import BaseModel, Extra, root_validator +from langchain.schema import Document + +logger = logging.getLogger(__name__) + WIKIPEDIA_MAX_QUERY_LENGTH = 300 @@ -18,6 +23,7 @@ class WikipediaAPIWrapper(BaseModel): wiki_client: Any #: :meta private: top_k_results: int = 3 lang: str = "en" + load_all_available_meta: bool = False class Config: """Configuration for this pydantic object.""" @@ -41,23 +47,70 @@ class WikipediaAPIWrapper(BaseModel): def run(self, query: str) -> str: """Run Wikipedia search and get page summaries.""" - search_results = self.wiki_client.search(query[:WIKIPEDIA_MAX_QUERY_LENGTH]) + page_titles = self.wiki_client.search(query[:WIKIPEDIA_MAX_QUERY_LENGTH]) summaries = [] - len_search_results = len(search_results) - if len_search_results == 0: + for page_title in page_titles[: self.top_k_results]: + if wiki_page 
:= self._fetch_page(page_title): + if summary := self._formatted_page_summary(page_title, wiki_page): + summaries.append(summary) + if not summaries: return "No good Wikipedia Search Result was found" - for i in range(min(self.top_k_results, len_search_results)): - summary = self.fetch_formatted_page_summary(search_results[i]) - if summary is not None: - summaries.append(summary) return "\n\n".join(summaries) - def fetch_formatted_page_summary(self, page: str) -> Optional[str]: + @staticmethod + def _formatted_page_summary(page_title: str, wiki_page: Any) -> Optional[str]: + return f"Page: {page_title}\nSummary: {wiki_page.summary}" + + def _page_to_document(self, page_title: str, wiki_page: Any) -> Document: + main_meta = { + "title": page_title, + "summary": wiki_page.summary, + } + add_meta = ( + { + "categories": wiki_page.categories, + # "coordinates": wiki_page.coordinates, + "page_url": wiki_page.url, + "image_urls": wiki_page.images, + "related_titles": wiki_page.links, + "parent_id": wiki_page.parent_id, + "references": wiki_page.references, + "revision_id": wiki_page.revision_id, + "sections": wiki_page.sections, + } + if self.load_all_available_meta + else {} + ) + doc = Document( + page_content=wiki_page.content, + metadata={ + **main_meta, + **add_meta, + }, + ) + return doc + + def _fetch_page(self, page: str) -> Optional[str]: try: - wiki_page = self.wiki_client.page(title=page, auto_suggest=False) - return f"Page: {page}\nSummary: {wiki_page.summary}" + return self.wiki_client.page(title=page, auto_suggest=False) except ( self.wiki_client.exceptions.PageError, self.wiki_client.exceptions.DisambiguationError, ): return None + + def load(self, query: str) -> List[Document]: + """ + Run Wikipedia search and get the article text plus the meta information. 
+ See + + Returns: a list of documents with the document.page_content in text format + + """ + page_titles = self.wiki_client.search(query[:WIKIPEDIA_MAX_QUERY_LENGTH]) + docs = [] + for page_title in page_titles[: self.top_k_results]: + if wiki_page := self._fetch_page(page_title): + if doc := self._page_to_document(page_title, wiki_page): + docs.append(doc) + return docs diff --git a/tests/integration_tests/utilities/test_wikipedia_api.py b/tests/integration_tests/utilities/test_wikipedia_api.py index db82dde2..ff5b0842 100644 --- a/tests/integration_tests/utilities/test_wikipedia_api.py +++ b/tests/integration_tests/utilities/test_wikipedia_api.py @@ -1,19 +1,56 @@ """Integration test for Wikipedia API Wrapper.""" +from typing import List + +import pytest + +from langchain.schema import Document from langchain.utilities import WikipediaAPIWrapper -def test_call() -> None: - """Test that WikipediaAPIWrapper returns correct answer""" +@pytest.fixture +def api_client() -> WikipediaAPIWrapper: + return WikipediaAPIWrapper() - wikipedia = WikipediaAPIWrapper() - output = wikipedia.run("HUNTER X HUNTER") + +def test_run_success(api_client: WikipediaAPIWrapper) -> None: + output = api_client.run("HUNTER X HUNTER") assert "Yoshihiro Togashi" in output -def test_no_result_call() -> None: - """Test that call gives no result.""" - wikipedia = WikipediaAPIWrapper() - output = wikipedia.run( +def test_run_no_result(api_client: WikipediaAPIWrapper) -> None: + output = api_client.run( "NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL" ) assert "No good Wikipedia Search Result was found" == output + + +def assert_docs(docs: List[Document], all_meta: bool = False) -> None: + for doc in docs: + assert doc.page_content + assert doc.metadata + main_meta = {"title", "summary"} + assert set(doc.metadata).issuperset(main_meta) + if all_meta: + assert len(set(doc.metadata)) > len(main_meta) + else: + assert len(set(doc.metadata)) == len(main_meta) + + +def 
test_load_success(api_client: WikipediaAPIWrapper) -> None: + docs = api_client.load("HUNTER X HUNTER") + assert len(docs) > 1 + assert_docs(docs, all_meta=False) + + +def test_load_success_all_meta(api_client: WikipediaAPIWrapper) -> None: + api_client.load_all_available_meta = True + docs = api_client.load("HUNTER X HUNTER") + assert len(docs) > 1 + assert_docs(docs, all_meta=True) + + +def test_load_no_result(api_client: WikipediaAPIWrapper) -> None: + docs = api_client.load( + "NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL" + ) + assert not docs