From 36c59e0c25dc4bd8be17321e8f9d1d58ad10b052 Mon Sep 17 00:00:00 2001
From: leo-gan
Date: Wed, 26 Apr 2023 21:04:56 -0700
Subject: [PATCH] `Arxiv` document loader (#3627)

It makes sense to use `arxiv` as another source of documents for downloading.
- Added the `arxiv` document_loader, based on `utilities/arxiv.py:ArxivAPIWrapper`
- Added tests
- Added an example notebook
- Sorted `__all__` in `__init__.py` (otherwise it is hard to find a class in the very long list)
---
 .../document_loaders/examples/arxiv.ipynb      | 177 ++++++++++++++++++
 langchain/__init__.py                          |   2 +-
 langchain/document_loaders/__init__.py         | 126 +++++++------
 langchain/document_loaders/arxiv.py            |  31 +++
 langchain/utilities/arxiv.py                   |  80 +++++++-
 .../document_loaders/test_arxiv.py             |  55 ++++++
 .../integration_tests/utilities/test_arxiv.py  |  61 +++++-
 7 files changed, 462 insertions(+), 70 deletions(-)
 create mode 100644 docs/modules/indexes/document_loaders/examples/arxiv.ipynb
 create mode 100644 langchain/document_loaders/arxiv.py
 create mode 100644 tests/integration_tests/document_loaders/test_arxiv.py

diff --git a/docs/modules/indexes/document_loaders/examples/arxiv.ipynb b/docs/modules/indexes/document_loaders/examples/arxiv.ipynb
new file mode 100644
index 00000000..27644df7
--- /dev/null
+++ b/docs/modules/indexes/document_loaders/examples/arxiv.ipynb
@@ -0,0 +1,177 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bda1f3f5",
   "metadata": {},
   "source": [
    "# Arxiv\n",
    "\n",
    "[arXiv](https://arxiv.org/) is an open-access archive for 2 million scholarly articles in the fields of physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics.\n",
    "\n",
    "This notebook shows how to load scientific articles from `Arxiv.org` into a document format that we can use downstream."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b7a1eef-7bf7-4e7d-8bfc-c4e27c9488cb",
   "metadata": {},
   "source": [
    "## Installation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2abd5578-aa3d-46b9-99af-8b262f0b3df8",
   "metadata": {},
   "source": [
    "First, you need to install the `arxiv` python package."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b674aaea-ed3a-4541-8414-260a8f67f623",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip install arxiv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "094b5f13-7e54-4354-9d83-26d6926ecaa0",
   "metadata": {
    "tags": []
   },
   "source": [
    "Second, you need to install the `PyMuPDF` python package, which transforms the PDF files downloaded from `arxiv.org` into text format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cd91121-2e96-43ba-af50-319853695f86",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip install pymupdf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "95f05e1c-195e-4e2b-ae8e-8d6637f15be6",
   "metadata": {},
   "source": [
    "## Examples"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e29b954c-1407-4797-ae21-6ba8937156be",
   "metadata": {},
   "source": [
    "`ArxivLoader` has these arguments:\n",
    "- `query`: free text used to find documents on arXiv\n",
    "- optional `load_max_docs`: default=100. Use it to limit the number of downloaded documents. It takes time to download all 100 documents, so use a small number for experiments.\n",
    "- optional `load_all_available_meta`: default=False. By default, only the most important fields are downloaded: `Published` (the date when the document was published or last updated), `Title`, `Authors`, `Summary`. If True, the other fields are also downloaded."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bfd5e46",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.schema import Document\n",
    "from langchain.document_loaders import ArxivLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "700e4ef2",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = ArxivLoader(query=\"1605.08386\", load_max_docs=2).load()\n",
    "len(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8977bac0-0042-4f23-9754-247dbd32439b",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Published': '2016-05-26',\n",
       " 'Title': 'Heat-bath random walks with Markov bases',\n",
       " 'Authors': 'Caprice Stanley, Tobias Windisch',\n",
       " 'Summary': 'Graphs on lattice points are studied whose edges come from a finite set of\\nallowed moves of arbitrary length. We show that the diameter of these graphs on\\nfibers of a fixed integer matrix can be bounded from above by a constant. We\\nthen study the mixing behaviour of heat-bath random walks on these graphs. We\\nalso state explicit conditions on the set of moves so that the heat-bath random\\nwalk, a generalization of the Glauber dynamics, is an expander in fixed\\ndimension.'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs[0].metadata # meta-information of the Document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "46969806-45a9-4c4d-a61b-cfb9658fc9de",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'arXiv:1605.08386v1 [math.CO] 26 May 2016\\nHEAT-BATH RANDOM WALKS WITH MARKOV BASES\\nCAPRICE STANLEY AND TOBIAS WINDISCH\\nAbstract. Graphs on lattice points are studied whose edges come from a finite set of\\nallowed moves of arbitrary length. We show that the diameter of these graphs on fibers of a\\nfixed integer matrix can be bounded from above by a constant. We then study the mixing\\nbehaviour of heat-b'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs[0].page_content[:400] # all pages of the Document content; print the first 400 characters\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
diff --git a/langchain/__init__.py b/langchain/__init__.py
index de85fbfd..7982b222 100644
--- a/langchain/__init__.py
+++ b/langchain/__init__.py
@@ -47,7 +47,7 @@ from langchain.prompts import (
     PromptTemplate,
 )
 from langchain.sql_database import SQLDatabase
-from langchain.utilities import ArxivAPIWrapper
+from langchain.utilities.arxiv import ArxivAPIWrapper
 from langchain.utilities.google_search import GoogleSearchAPIWrapper
 from langchain.utilities.google_serper import GoogleSerperAPIWrapper
 from langchain.utilities.powerbi import PowerBIDataset
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index d73ae89a..3d978679 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -2,6 +2,7 @@
 from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
 from langchain.document_loaders.apify_dataset import ApifyDatasetLoader
+from langchain.document_loaders.arxiv import ArxivLoader
 from langchain.document_loaders.azlyrics import AZLyricsLoader
 from langchain.document_loaders.azure_blob_storage_container import (
     AzureBlobStorageContainerLoader,
@@ -90,78 +91,79 @@ from langchain.document_loaders.youtube import (
 PagedPDFSplitter = PyPDFLoader

 __all__ = [
-    "UnstructuredFileLoader",
-    "UnstructuredFileIOLoader",
-    "UnstructuredURLLoader",
-    "SeleniumURLLoader",
-    "PlaywrightURLLoader",
+    "AZLyricsLoader",
+    "AirbyteJSONLoader",
+    "ApifyDatasetLoader",
+    "ArxivLoader",
+    "AzureBlobStorageContainerLoader",
+    "AzureBlobStorageFileLoader",
+    "BSHTMLLoader",
+    "BigQueryLoader",
+    "BiliBiliLoader",
+    "BlackboardLoader",
+    "BlockchainDocumentLoader",
+    "CSVLoader",
+    "ChatGPTLoader",
+    "CoNLLULoader",
+    "CollegeConfidentialLoader",
+    "ConfluenceLoader",
+    "DataFrameLoader",
+    "DiffbotLoader",
     "DirectoryLoader",
-    "NotionDirectoryLoader",
-    "NotionDBLoader",
-    "ReadTheDocsLoader",
+    "DiscordChatLoader",
+    "DuckDBLoader",
+    "EverNoteLoader",
+    "FacebookChatLoader",
+    "GCSDirectoryLoader",
+    "GCSFileLoader",
+    "GitLoader",
+    "GitbookLoader",
+    "GoogleApiClient",
+    "GoogleApiYoutubeLoader",
     "GoogleDriveLoader",
-    "UnstructuredHTMLLoader",
-    "BSHTMLLoader",
-    "UnstructuredPowerPointLoader",
-    "UnstructuredWordDocumentLoader",
-    "UnstructuredPDFLoader",
-    "UnstructuredImageLoader",
-    "ObsidianLoader",
-    "UnstructuredEmailLoader",
-    "OutlookMessageLoader",
-    "UnstructuredEPubLoader",
-    "UnstructuredMarkdownLoader",
-    "UnstructuredRTFLoader",
-    "RoamLoader",
-    "YoutubeLoader",
-    "S3FileLoader",
-    "TextLoader",
+    "GutenbergLoader",
     "HNLoader",
-    "GitbookLoader",
-    "S3DirectoryLoader",
-    "GCSFileLoader",
-    "GCSDirectoryLoader",
-    "WebBaseLoader",
-    "IMSDbLoader",
-    "AZLyricsLoader",
-    "CollegeConfidentialLoader",
+    "HuggingFaceDatasetLoader",
     "IFixitLoader",
-    "GutenbergLoader",
-    "PagedPDFSplitter",
-    "PyPDFLoader",
-    "EverNoteLoader",
-    "AirbyteJSONLoader",
"IMSDbLoader", + "ImageCaptionLoader", + "NotebookLoader", + "NotionDBLoader", + "NotionDirectoryLoader", + "ObsidianLoader", "OnlinePDFLoader", + "OutlookMessageLoader", "PDFMinerLoader", "PDFMinerPDFasHTMLLoader", + "PagedPDFSplitter", + "PlaywrightURLLoader", "PyMuPDFLoader", - "TelegramChatLoader", + "PyPDFLoader", + "PythonLoader", + "ReadTheDocsLoader", + "RoamLoader", + "S3DirectoryLoader", + "S3FileLoader", "SRTLoader", - "FacebookChatLoader", - "NotebookLoader", - "CoNLLULoader", - "GoogleApiYoutubeLoader", - "GoogleApiClient", - "CSVLoader", - "BlackboardLoader", - "ApifyDatasetLoader", - "WhatsAppChatLoader", - "DataFrameLoader", - "AzureBlobStorageFileLoader", - "AzureBlobStorageContainerLoader", + "SeleniumURLLoader", "SitemapLoader", - "DuckDBLoader", - "BigQueryLoader", - "DiffbotLoader", - "BiliBiliLoader", "SlackDirectoryLoader", - "GitLoader", + "TelegramChatLoader", + "TextLoader", "TwitterTweetLoader", - "ImageCaptionLoader", - "DiscordChatLoader", - "ConfluenceLoader", - "PythonLoader", - "ChatGPTLoader", - "HuggingFaceDatasetLoader", - "BlockchainDocumentLoader", + "UnstructuredEPubLoader", + "UnstructuredEmailLoader", + "UnstructuredFileIOLoader", + "UnstructuredFileLoader", + "UnstructuredHTMLLoader", + "UnstructuredImageLoader", + "UnstructuredMarkdownLoader", + "UnstructuredPDFLoader", + "UnstructuredPowerPointLoader", + "UnstructuredRTFLoader", + "UnstructuredURLLoader", + "UnstructuredWordDocumentLoader", + "WebBaseLoader", + "WhatsAppChatLoader", + "YoutubeLoader", ] diff --git a/langchain/document_loaders/arxiv.py b/langchain/document_loaders/arxiv.py new file mode 100644 index 00000000..612788ad --- /dev/null +++ b/langchain/document_loaders/arxiv.py @@ -0,0 +1,31 @@ +from typing import List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader +from langchain.utilities.arxiv import ArxivAPIWrapper + + +class ArxivLoader(BaseLoader): + """Loads a query result from arxiv.org into a list of Documents. + + Each document represents one Document. + The loader converts the original PDF format into the text. + """ + + def __init__( + self, + query: str, + load_max_docs: Optional[int] = 100, + load_all_available_meta: Optional[bool] = False, + ): + self.query = query + self.load_max_docs = load_max_docs + self.load_all_available_meta = load_all_available_meta + + def load(self) -> List[Document]: + arxiv_client = ArxivAPIWrapper( + load_max_docs=self.load_max_docs, + load_all_available_meta=self.load_all_available_meta, + ) + docs = arxiv_client.load(self.query) + return docs diff --git a/langchain/utilities/arxiv.py b/langchain/utilities/arxiv.py index e31a1383..2ea37d6e 100644 --- a/langchain/utilities/arxiv.py +++ b/langchain/utilities/arxiv.py @@ -1,8 +1,13 @@ """Util that calls Arxiv.""" -from typing import Any, Dict +import logging +from typing import Any, Dict, List from pydantic import BaseModel, Extra, root_validator +from langchain.schema import Document + +logger = logging.getLogger(__name__) + class ArxivAPIWrapper(BaseModel): """Wrapper around ArxivAPI. @@ -12,12 +17,23 @@ class ArxivAPIWrapper(BaseModel): This wrapper will use the Arxiv API to conduct searches and fetch document summaries. By default, it will return the document summaries of the top-k results of an input search. + + Parameters: + top_k_results: number of the top-scored document used for the arxiv tool + ARXIV_MAX_QUERY_LENGTH: the cut limit on the query used for the arxiv tool. 
+        load_max_docs: a limit on the number of loaded documents
+        load_all_available_meta:
+            if True: the `metadata` of the loaded Documents gets all available meta info
+            (see https://lukasschwab.me/arxiv.py/index.html#Result),
+            if False: the `metadata` gets only the most informative fields.
     """

     arxiv_client: Any  #: :meta private:
     arxiv_exceptions: Any  # :meta private:
     top_k_results: int = 3
     ARXIV_MAX_QUERY_LENGTH = 300
+    load_max_docs: int = 100
+    load_all_available_meta: bool = False

     class Config:
         """Configuration for this pydantic object."""

@@ -36,6 +52,7 @@
                 arxiv.UnexpectedEmptyPageError,
                 arxiv.HTTPError,
             )
+            values["arxiv_result"] = arxiv.Result
         except ImportError:
             raise ValueError(
                 "Could not import arxiv python package. "
@@ -62,3 +79,64 @@
             return "\n\n".join(docs) if docs else "No good Arxiv Result was found"
         except self.arxiv_exceptions as ex:
             return f"Arxiv exception: {ex}"
+
+    def load(self, query: str) -> List[Document]:
+        """
+        Run the Arxiv search and get the PDF documents plus the meta information.
+        See https://lukasschwab.me/arxiv.py/index.html#Search
+
+        Returns: a list of documents with the document.page_content in text format
+
+        """
+        try:
+            import fitz
+        except ImportError:
+            raise ValueError(
+                "PyMuPDF package not found, please install it with "
+                "`pip install pymupdf`"
+            )
+
+        try:
+            docs: List[Document] = []
+            for result in self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
+            ).results():
+                try:
+                    doc_file_name: str = result.download_pdf()
+                    with fitz.open(doc_file_name) as doc_file:
+                        text: str = "".join(page.get_text() for page in doc_file)
+                    add_meta = (
+                        {
+                            "entry_id": result.entry_id,
+                            "published_first_time": str(result.published.date()),
+                            "comment": result.comment,
+                            "journal_ref": result.journal_ref,
+                            "doi": result.doi,
+                            "primary_category": result.primary_category,
+                            "categories": result.categories,
+                            "links": [link.href for link in result.links],
+                        }
+                        if self.load_all_available_meta
+                        else {}
+                    )
+                    doc = Document(
+                        page_content=text,
+                        metadata=(
+                            {
+                                "Published": str(result.updated.date()),
+                                "Title": result.title,
+                                "Authors": ", ".join(
+                                    a.name for a in result.authors
+                                ),
+                                "Summary": result.summary,
+                                **add_meta,
+                            }
+                        ),
+                    )
+                    docs.append(doc)
+                except FileNotFoundError as f_ex:
+                    logger.debug(f_ex)
+            return docs
+        except self.arxiv_exceptions as ex:
+            logger.debug("Error on arxiv: %s", ex)
+            return []
diff --git a/tests/integration_tests/document_loaders/test_arxiv.py b/tests/integration_tests/document_loaders/test_arxiv.py
new file mode 100644
index 00000000..60315e52
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_arxiv.py
@@ -0,0 +1,55 @@
+from typing import List
+
+from langchain.document_loaders.arxiv import ArxivLoader
+from langchain.schema import Document
+
+
+def assert_docs(docs: List[Document]) -> None:
+    for doc in docs:
+        assert doc.page_content
+        assert doc.metadata
+        assert set(doc.metadata) == {"Published", "Title", "Authors", "Summary"}
+
+
+def test_load_success() -> None:
+    """Test that returns one document"""
+    loader = ArxivLoader(query="1605.08386", load_max_docs=2)
+
+    docs = loader.load()
+    assert len(docs) == 1
+    print(docs[0].metadata)
+    print(docs[0].page_content)
+    assert_docs(docs)
+
+
+def test_load_returns_no_result() -> None:
+    """Test that returns no docs"""
+    loader = ArxivLoader(query="1605.08386WWW", load_max_docs=2)
+    docs = loader.load()
+
+    assert len(docs) == 0
+
+
+def test_load_returns_limited_docs() -> None:
+    """Test that returns several docs"""
+    expected_docs = 2
+    loader = ArxivLoader(query="ChatGPT", load_max_docs=expected_docs)
+    docs = loader.load()
+
+    assert len(docs) == expected_docs
+    assert_docs(docs)
+
+
+def test_load_returns_full_set_of_metadata() -> None:
+    """Test that returns the full set of metadata"""
+    loader = ArxivLoader(query="ChatGPT", load_max_docs=1, load_all_available_meta=True)
+    docs = loader.load()
+    assert len(docs) == 1
+    for doc in docs:
+        assert doc.page_content
+        assert doc.metadata
+        assert set(doc.metadata).issuperset(
+            {"Published", "Title", "Authors", "Summary"}
+        )
+        print(doc.metadata)
+        assert len(set(doc.metadata)) > 4
diff --git a/tests/integration_tests/utilities/test_arxiv.py b/tests/integration_tests/utilities/test_arxiv.py
index d7f2fdd1..f8dc8f14 100644
--- a/tests/integration_tests/utilities/test_arxiv.py
+++ b/tests/integration_tests/utilities/test_arxiv.py
@@ -1,6 +1,9 @@
 """Integration test for Arxiv API Wrapper."""
+from typing import List
+
 import pytest

+from langchain.schema import Document
 from langchain.utilities import ArxivAPIWrapper


@@ -9,22 +12,68 @@ def api_client() -> ArxivAPIWrapper:
     return ArxivAPIWrapper()


-def test_call(api_client: ArxivAPIWrapper) -> None:
-    """Test that ArxivAPIWrapper returns correct answer"""
+def test_run_success(api_client: ArxivAPIWrapper) -> None:
+    """Test that returns the correct answer"""

     output = api_client.run("1605.08386")
     assert "Heat-bath random walks with Markov bases" in output


-def test_several_docs(api_client: ArxivAPIWrapper) -> None:
-    """Test that ArxivAPIWrapper returns several docs"""
+def test_run_returns_several_docs(api_client: ArxivAPIWrapper) -> None:
+    """Test that returns several docs"""

     output = api_client.run("Caprice Stanley")
     assert "On Mixing Behavior of a Family of Random Walks" in output


-def test_no_result_call(api_client: ArxivAPIWrapper) -> None:
-    """Test that call gives no result."""
+def test_run_returns_no_result(api_client: ArxivAPIWrapper) -> None:
+    """Test that gives no result."""

     output = api_client.run("1605.08386WWW")
     assert "No good Arxiv Result was found" == output
+
+
+def assert_docs(docs: List[Document]) -> None:
+    for doc in docs:
+        assert doc.page_content
+        assert doc.metadata
+        assert set(doc.metadata) == {"Published", "Title", "Authors", "Summary"}
+
+
+def test_load_success(api_client: ArxivAPIWrapper) -> None:
+    """Test that returns one document"""
+
+    docs = api_client.load("1605.08386")
+    assert len(docs) == 1
+    assert_docs(docs)
+
+
+def test_load_returns_no_result(api_client: ArxivAPIWrapper) -> None:
+    """Test that returns no docs"""
+
+    docs = api_client.load("1605.08386WWW")
+    assert len(docs) == 0
+
+
+def test_load_returns_limited_docs() -> None:
+    """Test that returns several docs"""
+    expected_docs = 2
+    api_client = ArxivAPIWrapper(load_max_docs=expected_docs)
+    docs = api_client.load("ChatGPT")
+    assert len(docs) == expected_docs
+    assert_docs(docs)
+
+
+def test_load_returns_full_set_of_metadata() -> None:
+    """Test that returns the full set of metadata"""
+    api_client = ArxivAPIWrapper(load_max_docs=1, load_all_available_meta=True)
+    docs = api_client.load("ChatGPT")
+    assert len(docs) == 1
+    for doc in docs:
+        assert doc.page_content
+        assert doc.metadata
+        assert set(doc.metadata).issuperset(
+            {"Published", "Title", "Authors", "Summary"}
+        )
+        print(doc.metadata)
+        assert len(set(doc.metadata)) > 4
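
For anyone who wants to exercise the new loader end to end, here is a minimal sketch. It is not taken from the patch: the query string and chunk sizes are illustrative only, and it assumes the `arxiv` and `pymupdf` packages are installed and arxiv.org is reachable. `RecursiveCharacterTextSplitter` is the stock LangChain splitter.

    from langchain.document_loaders import ArxivLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Fetch up to two papers matching the query; each Document carries the
    # extracted full text in page_content and the default metadata fields
    # (Published, Title, Authors, Summary).
    docs = ArxivLoader(query="heat-bath random walks", load_max_docs=2).load()

    # Full papers are long, so split them into overlapping chunks before indexing.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)

    for chunk in chunks[:3]:
        print(chunk.metadata["Title"], len(chunk.page_content))

Every matching PDF is downloaded and parsed, so keep `load_max_docs` small while experimenting, as the notebook above also notes.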