mirror of
https://github.com/hwchase17/langchain
synced 2024-11-16 06:13:16 +00:00
affa3e755a
Added the function get_summaries_as_docs to ArxivLoader - **Description:** Added a method that returns the documents from get_summaries_as_docs. The function already exists in the parent file (the ArxivAPIWrapper utility) but was never exposed through ArxivLoader; it can now be called from ArxivLoader itself just like .load(), since both signatures are the same. - **Issue:** Reduces the time needed to load papers: no PDF is processed and only metadata is pulled from Arxiv, giving users faster load times on bulk loads. Users can then choose one or more papers and pass their IDs directly to .load() to fetch the PDFs, thereby loading the full contents of those papers.
31 lines
879 B
Python
31 lines
879 B
Python
from typing import Any, List, Optional
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
from langchain_community.utilities.arxiv import ArxivAPIWrapper
|
|
|
|
|
|
class ArxivLoader(BaseLoader):
    """Document loader that runs a query against `Arxiv`.

    Instances keep the query string and an `ArxivAPIWrapper` client; every
    public method hands the stored query to that client.

    The loader converts the original PDF format into the text.

    Args:
        query: Query string that is forwarded to the client on each call.
        doc_content_chars_max: Forwarded unchanged to `ArxivAPIWrapper`.
        **kwargs: Supports all arguments of `ArxivAPIWrapper`.
    """

    def __init__(
        self, query: str, doc_content_chars_max: Optional[int] = None, **kwargs: Any
    ):
        """Remember the query and build the `ArxivAPIWrapper` client."""
        self.query = query
        # Gather the wrapper configuration first; the explicit argument can
        # never collide with **kwargs because it is a named parameter.
        wrapper_options = {"doc_content_chars_max": doc_content_chars_max, **kwargs}
        self.client = ArxivAPIWrapper(**wrapper_options)

    def load(self) -> List[Document]:
        """Return the documents produced by the client's `load` for the query."""
        documents = self.client.load(self.query)
        return documents

    def get_summaries_as_docs(self) -> List[Document]:
        """Return the client's `get_summaries_as_docs` result for the query.

        Same call shape as `load`, but delegates to the summary-oriented
        wrapper method instead.
        """
        summaries = self.client.get_summaries_as_docs(self.query)
        return summaries
|