docstrings `document_loaders` 1 (#6847)

- Updated docstrings in `document_loaders`
- Several code fixes.
- added `docs/extras/ecosystem/integrations/airtable.md`

@rlancemartin, @eyurtsev
pull/6937/head
Leonid Ganeline 1 year ago committed by GitHub
parent e41b382e1c
commit 77ae8084a0

@ -0,0 +1,28 @@
# Airtable
>[Airtable](https://en.wikipedia.org/wiki/Airtable) is a cloud collaboration service.
>`Airtable` is a spreadsheet-database hybrid, with the features of a database but applied to a spreadsheet.
> The fields in an Airtable table are similar to cells in a spreadsheet, but have types such as 'checkbox',
> 'phone number', and 'drop-down list', and can reference file attachments like images.
>Users can create a database, set up column types, add records, link tables to one another, collaborate, sort records,
> and publish views to external websites.
## Installation and Setup
```bash
pip install pyairtable
```
* Get your [API key](https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens).
* Get the [ID of your base](https://airtable.com/developers/web/api/introduction).
* Get the [table ID from the table url](https://www.highviewapps.com/kb/where-can-i-find-the-airtable-base-id-and-table-id/#:~:text=Both%20the%20Airtable%20Base%20ID,URL%20that%20begins%20with%20tbl).
## Document Loader
```python
from langchain.document_loaders import AirtableLoader
```
See an [example](/docs/modules/data_connection/document_loaders/integrations/airtable.html).
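A minimal usage sketch (the API key, base ID, and table ID below are placeholders):
```python
from langchain.document_loaders import AirtableLoader

api_token = "YOUR_AIRTABLE_API_KEY"  # placeholder credential
base_id = "appXXXXXXXXXXXXXX"        # placeholder base ID
table_id = "tblXXXXXXXXXXXXXX"       # placeholder table ID

loader = AirtableLoader(api_token, table_id, base_id)
docs = loader.load()  # one Document per Airtable record
```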

@ -134,7 +134,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.10.6"
}
},
"nbformat": 4,

@ -145,10 +145,10 @@ from langchain.document_loaders.youtube import (
YoutubeLoader,
)
# Legacy: only for backwards compat. Use PyPDFLoader instead
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader
# For backwards compatability
# For backwards compatibility
TelegramChatLoader = TelegramChatFileLoader
__all__ = [

@ -8,15 +8,20 @@ from langchain.document_loaders.base import BaseLoader
class AcreomLoader(BaseLoader):
"""Loader that loads acreom vault from a directory."""
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
"""Regex to match front matter metadata in markdown files."""
def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
):
"""Initialize with path."""
self.file_path = path
"""Path to the directory containing the markdown files."""
self.encoding = encoding
"""Encoding to use when reading the files."""
self.collect_metadata = collect_metadata
"""Whether to collect metadata from the front matter."""
def _parse_front_matter(self, content: str) -> dict:
"""Parse front matter metadata from the content and return it as a dict."""

@ -11,11 +11,11 @@ class AirbyteJSONLoader(BaseLoader):
"""Loader that loads local airbyte json files."""
def __init__(self, file_path: str):
"""Initialize with file path. This should start with '/tmp/airbyte_local/'."""
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
self.file_path = file_path
"""Path to the directory containing the json files."""
def load(self) -> List[Document]:
"""Load file."""
text = ""
for line in open(self.file_path, "r"):
data = json.loads(line)["_airbyte_data"]
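For context, a minimal usage sketch (the file path is a hypothetical local Airbyte JSON destination output):
```python
from langchain.document_loaders import AirbyteJSONLoader

# Hypothetical path; local Airbyte JSON destinations write under /tmp/airbyte_local/.
loader = AirbyteJSONLoader("/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl")
docs = loader.load()
```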

@ -10,11 +10,14 @@ class AirtableLoader(BaseLoader):
def __init__(self, api_token: str, table_id: str, base_id: str):
"""Initialize with API token and the IDs for table and base"""
self.api_token = api_token
"""Airtable API token."""
self.table_id = table_id
"""Airtable table ID."""
self.base_id = base_id
"""Airtable base ID."""
def lazy_load(self) -> Iterator[Document]:
"""Lazy load records from table."""
"""Lazy load Documents from table."""
from pyairtable import Table
@ -32,5 +35,5 @@ class AirtableLoader(BaseLoader):
)
def load(self) -> List[Document]:
"""Load Table."""
"""Load Documents from table."""
return list(self.lazy_load())
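For orientation, roughly what the loader does with `pyairtable` under the hood (a sketch assuming `Table(api_key, base_id, table_name).all()` returns one dict per record; not the loader's exact code):
```python
from pyairtable import Table

from langchain.docstore.document import Document

# Placeholder credentials and IDs.
api_token = "YOUR_AIRTABLE_API_KEY"
base_id = "appXXXXXXXXXXXXXX"
table_id = "tblXXXXXXXXXXXXXX"

table = Table(api_token, base_id, table_id)
docs = [
    Document(
        page_content=str(record),
        metadata={"base_id": base_id, "table_id": table_id},
    )
    for record in table.all()
]
```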

@ -1,4 +1,3 @@
"""Logic for loading documents from Apify datasets."""
from typing import Any, Callable, Dict, List
from pydantic import BaseModel, root_validator
@ -8,9 +7,10 @@ from langchain.document_loaders.base import BaseLoader
class ApifyDatasetLoader(BaseLoader, BaseModel):
"""Logic for loading documents from Apify datasets."""
"""Loading Documents from Apify datasets."""
apify_client: Any
"""An instance of the ApifyClient class from the apify-client Python package."""
dataset_id: str
"""The ID of the dataset on the Apify platform."""
dataset_mapping_function: Callable[[Dict], Document]
@ -34,7 +34,11 @@ class ApifyDatasetLoader(BaseLoader, BaseModel):
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate environment."""
"""Validate environment.
Args:
values: The values to validate.
"""
try:
from apify_client import ApifyClient
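For reference, a minimal usage sketch (the dataset ID and item fields are placeholders; assumes `apify-client` is installed and an Apify API token is configured):
```python
from langchain.docstore.document import Document
from langchain.document_loaders import ApifyDatasetLoader

# Placeholder dataset ID; the mapping function turns one dataset item into a Document.
loader = ApifyDatasetLoader(
    dataset_id="your-dataset-id",
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"], metadata={"source": item["url"]}
    ),
)
docs = loader.load()
```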

@ -19,8 +19,11 @@ class ArxivLoader(BaseLoader):
load_all_available_meta: Optional[bool] = False,
):
self.query = query
"""The query to be passed to the arxiv.org API."""
self.load_max_docs = load_max_docs
"""The maximum number of documents to load."""
self.load_all_available_meta = load_all_available_meta
"""Whether to load all available metadata."""
def load(self) -> List[Document]:
arxiv_client = ArxivAPIWrapper(

@ -9,7 +9,7 @@ class AZLyricsLoader(WebBaseLoader):
"""Loader that loads AZLyrics webpages."""
def load(self) -> List[Document]:
"""Load webpage."""
"""Load webpages into Documents."""
soup = self.scrape()
title = soup.title.text
lyrics = soup.find_all("div", {"class": ""})[2].text

@ -9,20 +9,23 @@ from langchain.document_loaders.base import BaseLoader
class AzureBlobStorageContainerLoader(BaseLoader):
"""Loading logic for loading documents from Azure Blob Storage."""
"""Loading Documents from Azure Blob Storage."""
def __init__(self, conn_str: str, container: str, prefix: str = ""):
"""Initialize with connection string, container and blob prefix."""
self.conn_str = conn_str
"""Connection string for Azure Blob Storage."""
self.container = container
"""Container name."""
self.prefix = prefix
"""Prefix for blob names."""
def load(self) -> List[Document]:
"""Load documents."""
try:
from azure.storage.blob import ContainerClient
except ImportError as exc:
raise ValueError(
raise ImportError(
"Could not import azure storage blob python package. "
"Please install it with `pip install azure-storage-blob`."
) from exc
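For reference, a minimal usage sketch (the connection string, container, and prefix are placeholders):
```python
from langchain.document_loaders import AzureBlobStorageContainerLoader

loader = AzureBlobStorageContainerLoader(
    conn_str="<your_azure_storage_connection_string>",  # placeholder
    container="my-container",                           # placeholder
    prefix="reports/",                                  # optional blob-name prefix
)
docs = loader.load()
```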

@ -1,4 +1,3 @@
"""Loading logic for loading documents from an Azure Blob Storage file."""
import os
import tempfile
from typing import List
@ -9,20 +8,23 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class AzureBlobStorageFileLoader(BaseLoader):
"""Loading logic for loading documents from Azure Blob Storage."""
"""Loading Documents from Azure Blob Storage."""
def __init__(self, conn_str: str, container: str, blob_name: str):
"""Initialize with connection string, container and blob name."""
self.conn_str = conn_str
"""Connection string for Azure Blob Storage."""
self.container = container
"""Container name."""
self.blob = blob_name
"""Blob name."""
def load(self) -> List[Document]:
"""Load documents."""
try:
from azure.storage.blob import BlobClient
except ImportError as exc:
raise ValueError(
raise ImportError(
"Could not import azure storage blob python package. "
"Please install it with `pip install azure-storage-blob`."
) from exc

@ -8,10 +8,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
class BaseLoader(ABC):
"""Interface for loading documents.
"""Interface for loading Documents.
Implementations should implement the lazy-loading method using generators
to avoid loading all documents into memory at once.
to avoid loading all Documents into memory at once.
The `load` method will remain as is for backwards compatibility, but its
implementation should be just `list(self.lazy_load())`.
@ -22,12 +22,20 @@ class BaseLoader(ABC):
# This method returns a List which is materialized in memory.
@abstractmethod
def load(self) -> List[Document]:
"""Load data into document objects."""
"""Load data into Document objects."""
def load_and_split(
self, text_splitter: Optional[TextSplitter] = None
) -> List[Document]:
"""Load documents and split into chunks."""
"""Load Documents and split into chunks. Chunks are returned as Documents.
Args:
text_splitter: TextSplitter instance to use for splitting documents.
Defaults to RecursiveCharacterTextSplitter.
Returns:
List of Documents.
"""
if text_splitter is None:
_text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
else:
@ -40,7 +48,7 @@ class BaseLoader(ABC):
def lazy_load(
self,
) -> Iterator[Document]:
"""A lazy loader for document content."""
"""A lazy loader for Documents."""
raise NotImplementedError(
f"{self.__class__.__name__} does not implement lazy_load()"
)
@ -49,7 +57,7 @@ class BaseLoader(ABC):
class BaseBlobParser(ABC):
"""Abstract interface for blob parsers.
A blob parser is provides a way to parse raw data stored in a blob into one
A blob parser provides a way to parse raw data stored in a blob into one
or more documents.
The parser can be composed with blob loaders, making it easy to re-use
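To make the lazy-loading contract concrete, a minimal hypothetical loader that follows it (the class name and file format are invented for illustration):
```python
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class LineLoader(BaseLoader):
    """Hypothetical loader: one Document per non-empty line of a text file."""

    def __init__(self, file_path: str, encoding: str = "utf-8"):
        self.file_path = file_path
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        # Generator keeps memory usage flat for large files.
        with open(self.file_path, encoding=self.encoding) as f:
            for i, line in enumerate(f):
                if line.strip():
                    yield Document(
                        page_content=line.rstrip("\n"),
                        metadata={"source": self.file_path, "line": i},
                    )

    def load(self) -> List[Document]:
        # As the interface suggests, load() just materializes lazy_load().
        return list(self.lazy_load())
```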

@ -34,8 +34,12 @@ class BibtexLoader(BaseLoader):
Args:
file_path: Path to the bibtex file.
parser: The parser to use. If None, a default parser is used.
max_docs: Max number of associated documents to load. Use -1 to mean
no limit.
max_content_chars: Maximum number of characters to load from the PDF.
load_extra_metadata: Whether to load extra metadata from the PDF.
file_pattern: Regex pattern to match the file name in the bibtex.
"""
self.file_path = file_path
self.parser = parser or BibtexparserWrapper()
@ -70,9 +74,7 @@ class BibtexLoader(BaseLoader):
def lazy_load(self) -> Iterator[Document]:
"""Load bibtex file using bibtexparser and get the article texts plus the
article metadata.
See https://bibtexparser.readthedocs.io/en/master/
Returns:

@ -37,7 +37,7 @@ class BigQueryLoader(BaseLoader):
metadata_columns: Optional. The columns to write into the `metadata` of the
document.
credentials : google.auth.credentials.Credentials, optional
Credentials for accessing Google APIs. Use this parameter to override
Credentials for accessing Google APIs. Use this parameter to override
default credentials, such as to use Compute Engine
(`google.auth.compute_engine.Credentials`) or Service Account
(`google.oauth2.service_account.Credentials`) credentials directly.
@ -52,7 +52,7 @@ class BigQueryLoader(BaseLoader):
try:
from google.cloud import bigquery
except ImportError as ex:
raise ValueError(
raise ImportError(
"Could not import google-cloud-bigquery python package. "
"Please install it with `pip install google-cloud-bigquery`."
) from ex
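For reference, a minimal usage sketch (project, dataset, table, and column names are placeholders; requires `google-cloud-bigquery` and valid credentials):
```python
from langchain.document_loaders import BigQueryLoader

# Placeholder query; each returned row becomes one Document.
query = """
SELECT title, body, url
FROM `my_project.my_dataset.articles`
LIMIT 10
"""
loader = BigQueryLoader(query, metadata_columns=["url"])
docs = loader.load()
```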

@ -13,11 +13,15 @@ class BiliBiliLoader(BaseLoader):
"""Loader that loads bilibili transcripts."""
def __init__(self, video_urls: List[str]):
"""Initialize with bilibili url."""
"""Initialize with bilibili url.
Args:
video_urls: List of bilibili urls.
"""
self.video_urls = video_urls
def load(self) -> List[Document]:
"""Load from bilibili url."""
"""Load Documents from bilibili url."""
results = []
for url in self.video_urls:
transcript, video_info = self._get_bilibili_subs_and_info(url)
@ -30,7 +34,7 @@ class BiliBiliLoader(BaseLoader):
try:
from bilibili_api import sync, video
except ImportError:
raise ValueError(
raise ImportError(
"requests package not found, please install it with "
"`pip install bilibili-api-python`"
)
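For reference, a minimal usage sketch (the video URL is a placeholder; requires the `bilibili-api-python` package):
```python
from langchain.document_loaders import BiliBiliLoader

loader = BiliBiliLoader(["https://www.bilibili.com/video/BV1xt411o7Xu/"])
docs = loader.load()
```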

@ -12,7 +12,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
class BlackboardLoader(WebBaseLoader):
"""Loader that loads all documents from a Blackboard course.
"""Loads all documents from a Blackboard course.
This loader is not compatible with all Blackboard courses. It is only
compatible with courses that use the new Blackboard interface.
@ -34,8 +34,11 @@ class BlackboardLoader(WebBaseLoader):
"""
base_url: str
"""Base url of the blackboard course."""
folder_path: str
"""Path to the folder containing the documents."""
load_all_recursively: bool
"""If True, load all documents recursively."""
def __init__(
self,
@ -64,7 +67,7 @@ class BlackboardLoader(WebBaseLoader):
try:
self.base_url = blackboard_course_url.split("/webapps/blackboard")[0]
except IndexError:
raise ValueError(
raise IndexError(
"Invalid blackboard course url. "
"Please provide a url that starts with "
"https://<blackboard_url>/webapps/blackboard"
@ -94,10 +97,10 @@ class BlackboardLoader(WebBaseLoader):
)
def load(self) -> List[Document]:
"""Load data into document objects.
"""Load data into Document objects.
Returns:
List of documents.
List of Documents.
"""
if self.load_all_recursively:
soup_info = self.scrape()
@ -118,7 +121,7 @@ class BlackboardLoader(WebBaseLoader):
return self._get_documents(soup_info)
def _get_folder_path(self, soup: Any) -> str:
"""Get the folder path to save the documents in.
"""Get the folder path to save the Documents in.
Args:
soup: BeautifulSoup4 soup object.
@ -229,7 +232,7 @@ class BlackboardLoader(WebBaseLoader):
return relative_paths
def download(self, path: str) -> None:
"""Download a file from a url.
"""Download a file from an url.
Args:
path: Path to the file.
@ -243,7 +246,7 @@ class BlackboardLoader(WebBaseLoader):
f.write(response.content)
def parse_filename(self, url: str) -> str:
"""Parse the filename from a url.
"""Parse the filename from an url.
Args:
url: Url to parse the filename from.
@ -257,7 +260,7 @@ class BlackboardLoader(WebBaseLoader):
return self._parse_filename_from_url(url)
def _parse_filename_from_url(self, url: str) -> str:
"""Parse the filename from a url.
"""Parse the filename from an url.
Args:
url: Url to parse the filename from.

@ -55,6 +55,16 @@ class BlockchainDocumentLoader(BaseLoader):
get_all_tokens: bool = False,
max_execution_time: Optional[int] = None,
):
"""
Args:
contract_address: The address of the smart contract.
blockchainType: The blockchain type.
api_key: The Alchemy API key.
startToken: The start token for pagination.
get_all_tokens: Whether to get all tokens on the contract.
max_execution_time: The maximum execution time (sec).
"""
self.contract_address = contract_address
self.blockchainType = blockchainType.value
self.api_key = os.environ.get("ALCHEMY_API_KEY") or api_key
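For reference, a minimal usage sketch (the contract address is a placeholder; expects an Alchemy key in `ALCHEMY_API_KEY` or passed via `api_key`):
```python
from langchain.document_loaders.blockchain import (
    BlockchainDocumentLoader,
    BlockchainType,
)

# Placeholder NFT contract address on Ethereum mainnet.
loader = BlockchainDocumentLoader(
    contract_address="0x0000000000000000000000000000000000000000",
    blockchainType=BlockchainType.ETH_MAINNET,
)
docs = loader.load()
```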

@ -1,4 +1,3 @@
"""Load conversations from ChatGPT data export"""
import datetime
import json
from typing import List
@ -29,9 +28,15 @@ def concatenate_rows(message: dict, title: str) -> str:
class ChatGPTLoader(BaseLoader):
"""Loader that loads conversations from exported ChatGPT data."""
"""Load conversations from exported ChatGPT data."""
def __init__(self, log_file: str, num_logs: int = -1):
"""
Args:
log_file: Path to the log file.
num_logs: Number of logs to load. If 0, load all logs.
"""
self.log_file = log_file
self.num_logs = num_logs
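For reference, a minimal usage sketch (the export path is a placeholder):
```python
from langchain.document_loaders import ChatGPTLoader

# Placeholder path to the conversations.json file from a ChatGPT data export.
loader = ChatGPTLoader(log_file="./conversations.json", num_logs=1)
docs = loader.load()
```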

@ -284,9 +284,7 @@
" error=False, # Only runs that succeed\n",
")\n",
"for run in runs:\n",
" client.create_example(\n",
" inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n",
" )"
" client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)"
]
},
{
@ -333,7 +331,7 @@
"eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
"\n",
"# Measures accuracy against ground truth\n",
"qa_evaluator = get_qa_evaluator(eval_llm) \n",
"qa_evaluator = get_qa_evaluator(eval_llm)\n",
"\n",
"# Measures how effective and efficient the agent's actions are\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
@ -392,13 +390,13 @@
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"\n",
"\n",
"# Since chains can be stateful (e.g. they can have memory), we need provide\n",
"# a way to initialize a new chain for each row in the dataset. This is done\n",
"# by passing in a factory function that returns a new chain for each row.\n",
"def agent_factory():\n",
" return initialize_agent(\n",
" tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
")\n",
" return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
"\n",
"\n",
"# If your chain is NOT stateful, your factory can return the object directly\n",
"# to improve runtime performance. For example:\n",
@ -477,7 +475,7 @@
"source": [
"from langchain.client import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
")\n",
"\n",
"?arun_on_dataset"
@ -616,9 +614,7 @@
},
"outputs": [],
"source": [
"agent = initialize_agent(\n",
" tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
")"
"agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)"
]
},
{
