docstrings `document_loaders` 1 (#6847)

- Updated docstrings in `document_loaders`
- Several code fixes.
- added `docs/extras/ecosystem/integrations/airtable.md`

@rlancemartin, @eyurtsev
pull/6937/head
Leonid Ganeline 1 year ago committed by GitHub
parent e41b382e1c
commit 77ae8084a0

@ -0,0 +1,28 @@
# Airtable
>[Airtable](https://en.wikipedia.org/wiki/Airtable) is a cloud collaboration service.
>`Airtable` is a spreadsheet-database hybrid, with the features of a database but applied to a spreadsheet.
> The fields in an Airtable table are similar to cells in a spreadsheet, but have types such as 'checkbox',
> 'phone number', and 'drop-down list', and can reference file attachments like images.
>Users can create a database, set up column types, add records, link tables to one another, collaborate, sort records,
> and publish views to external websites.
## Installation and Setup
```bash
pip install pyairtable
```
* Get your [API key](https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens).
* Get the [ID of your base](https://airtable.com/developers/web/api/introduction).
* Get the [table ID from the table url](https://www.highviewapps.com/kb/where-can-i-find-the-airtable-base-id-and-table-id/#:~:text=Both%20the%20Airtable%20Base%20ID,URL%20that%20begins%20with%20tbl).
## Document Loader
```python
from langchain.document_loaders import AirtableLoader
```
See an [example](/docs/modules/data_connection/document_loaders/integrations/airtable.html).
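A minimal usage sketch (the API key, base ID, and table ID below are placeholders):
```python
from langchain.document_loaders import AirtableLoader

api_token = "YOUR_AIRTABLE_API_KEY"  # placeholder credential
base_id = "appXXXXXXXXXXXXXX"        # placeholder base ID
table_id = "tblXXXXXXXXXXXXXX"       # placeholder table ID

loader = AirtableLoader(api_token, table_id, base_id)
docs = loader.load()  # one Document per Airtable record
```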

@ -134,7 +134,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.10.6"
}
},
"nbformat": 4,

@ -145,10 +145,10 @@ from langchain.document_loaders.youtube import (
YoutubeLoader,
)
# Legacy: only for backwards compat. Use PyPDFLoader instead
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader
# For backwards compatability
# For backwards compatibility
TelegramChatLoader = TelegramChatFileLoader
__all__ = [

@ -8,15 +8,20 @@ from langchain.document_loaders.base import BaseLoader
class AcreomLoader(BaseLoader):
"""Loader that loads acreom vault from a directory."""
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
"""Regex to match front matter metadata in markdown files."""
def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
):
"""Initialize with path."""
self.file_path = path
"""Path to the directory containing the markdown files."""
self.encoding = encoding
"""Encoding to use when reading the files."""
self.collect_metadata = collect_metadata
"""Whether to collect metadata from the front matter."""
def _parse_front_matter(self, content: str) -> dict:
"""Parse front matter metadata from the content and return it as a dict."""

@ -11,11 +11,11 @@ class AirbyteJSONLoader(BaseLoader):
"""Loader that loads local airbyte json files."""
def __init__(self, file_path: str):
"""Initialize with file path. This should start with '/tmp/airbyte_local/'."""
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
self.file_path = file_path
"""Path to the directory containing the json files."""
def load(self) -> List[Document]:
"""Load file."""
text = ""
for line in open(self.file_path, "r"):
data = json.loads(line)["_airbyte_data"]
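For context, a minimal usage sketch (the file path is a hypothetical local Airbyte JSON destination output):
```python
from langchain.document_loaders import AirbyteJSONLoader

# Hypothetical path; local Airbyte JSON destinations write under /tmp/airbyte_local/.
loader = AirbyteJSONLoader("/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl")
docs = loader.load()
```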

@ -10,11 +10,14 @@ class AirtableLoader(BaseLoader):
def __init__(self, api_token: str, table_id: str, base_id: str):
"""Initialize with API token and the IDs for table and base"""
self.api_token = api_token
"""Airtable API token."""
self.table_id = table_id
"""Airtable table ID."""
self.base_id = base_id
"""Airtable base ID."""
def lazy_load(self) -> Iterator[Document]:
"""Lazy load records from table."""
"""Lazy load Documents from table."""
from pyairtable import Table
@ -32,5 +35,5 @@ class AirtableLoader(BaseLoader):
)
def load(self) -> List[Document]:
"""Load Table."""
"""Load Documents from table."""
return list(self.lazy_load())
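For orientation, roughly what the loader does with `pyairtable` under the hood (a sketch assuming `Table(api_key, base_id, table_name).all()` returns one dict per record; not the loader's exact code):
```python
from pyairtable import Table

from langchain.docstore.document import Document

# Placeholder credentials and IDs.
api_token = "YOUR_AIRTABLE_API_KEY"
base_id = "appXXXXXXXXXXXXXX"
table_id = "tblXXXXXXXXXXXXXX"

table = Table(api_token, base_id, table_id)
docs = [
    Document(
        page_content=str(record),
        metadata={"base_id": base_id, "table_id": table_id},
    )
    for record in table.all()
]
```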

@ -1,4 +1,3 @@
"""Logic for loading documents from Apify datasets."""
from typing import Any, Callable, Dict, List
from pydantic import BaseModel, root_validator
@ -8,9 +7,10 @@ from langchain.document_loaders.base import BaseLoader
class ApifyDatasetLoader(BaseLoader, BaseModel):
"""Logic for loading documents from Apify datasets."""
"""Loading Documents from Apify datasets."""
apify_client: Any
"""An instance of the ApifyClient class from the apify-client Python package."""
dataset_id: str
"""The ID of the dataset on the Apify platform."""
dataset_mapping_function: Callable[[Dict], Document]
@ -34,7 +34,11 @@ class ApifyDatasetLoader(BaseLoader, BaseModel):
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate environment."""
"""Validate environment.
Args:
values: The values to validate.
"""
try:
from apify_client import ApifyClient
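For reference, a minimal usage sketch (the dataset ID and item fields are placeholders; assumes `apify-client` is installed and an Apify API token is configured):
```python
from langchain.docstore.document import Document
from langchain.document_loaders import ApifyDatasetLoader

# Placeholder dataset ID; the mapping function turns one dataset item into a Document.
loader = ApifyDatasetLoader(
    dataset_id="your-dataset-id",
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"], metadata={"source": item["url"]}
    ),
)
docs = loader.load()
```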

@ -19,8 +19,11 @@ class ArxivLoader(BaseLoader):
load_all_available_meta: Optional[bool] = False,
):
self.query = query
"""The query to be passed to the arxiv.org API."""
self.load_max_docs = load_max_docs
"""The maximum number of documents to load."""
self.load_all_available_meta = load_all_available_meta
"""Whether to load all available metadata."""
def load(self) -> List[Document]:
arxiv_client = ArxivAPIWrapper(

@ -9,7 +9,7 @@ class AZLyricsLoader(WebBaseLoader):
"""Loader that loads AZLyrics webpages."""
def load(self) -> List[Document]:
"""Load webpage."""
"""Load webpages into Documents."""
soup = self.scrape()
title = soup.title.text
lyrics = soup.find_all("div", {"class": ""})[2].text

@ -9,20 +9,23 @@ from langchain.document_loaders.base import BaseLoader
class AzureBlobStorageContainerLoader(BaseLoader):
"""Loading logic for loading documents from Azure Blob Storage."""
"""Loading Documents from Azure Blob Storage."""
def __init__(self, conn_str: str, container: str, prefix: str = ""):
"""Initialize with connection string, container and blob prefix."""
self.conn_str = conn_str
"""Connection string for Azure Blob Storage."""
self.container = container
"""Container name."""
self.prefix = prefix
"""Prefix for blob names."""
def load(self) -> List[Document]:
"""Load documents."""
try:
from azure.storage.blob import ContainerClient
except ImportError as exc:
raise ValueError(
raise ImportError(
"Could not import azure storage blob python package. "
"Please install it with `pip install azure-storage-blob`."
) from exc
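For reference, a minimal usage sketch (the connection string, container, and prefix are placeholders):
```python
from langchain.document_loaders import AzureBlobStorageContainerLoader

loader = AzureBlobStorageContainerLoader(
    conn_str="<your_azure_storage_connection_string>",  # placeholder
    container="my-container",                           # placeholder
    prefix="reports/",                                  # optional blob-name prefix
)
docs = loader.load()
```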

@ -1,4 +1,3 @@
"""Loading logic for loading documents from an Azure Blob Storage file."""
import os
import tempfile
from typing import List
@ -9,20 +8,23 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class AzureBlobStorageFileLoader(BaseLoader):
"""Loading logic for loading documents from Azure Blob Storage."""
"""Loading Documents from Azure Blob Storage."""
def __init__(self, conn_str: str, container: str, blob_name: str):
"""Initialize with connection string, container and blob name."""
self.conn_str = conn_str
"""Connection string for Azure Blob Storage."""
self.container = container
"""Container name."""
self.blob = blob_name
"""Blob name."""
def load(self) -> List[Document]:
"""Load documents."""
try:
from azure.storage.blob import BlobClient
except ImportError as exc:
raise ValueError(
raise ImportError(
"Could not import azure storage blob python package. "
"Please install it with `pip install azure-storage-blob`."
) from exc

@ -8,10 +8,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
class BaseLoader(ABC):
"""Interface for loading documents.
"""Interface for loading Documents.
Implementations should implement the lazy-loading method using generators
to avoid loading all documents into memory at once.
to avoid loading all Documents into memory at once.
The `load` method will remain as is for backwards compatibility, but its
implementation should be just `list(self.lazy_load())`.
@ -22,12 +22,20 @@ class BaseLoader(ABC):
# This method returns a List which is materialized in memory.
@abstractmethod
def load(self) -> List[Document]:
"""Load data into document objects."""
"""Load data into Document objects."""
def load_and_split(
self, text_splitter: Optional[TextSplitter] = None
) -> List[Document]:
"""Load documents and split into chunks."""
"""Load Documents and split into chunks. Chunks are returned as Documents.
Args:
text_splitter: TextSplitter instance to use for splitting documents.
Defaults to RecursiveCharacterTextSplitter.
Returns:
List of Documents.
"""
if text_splitter is None:
_text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
else:
@ -40,7 +48,7 @@ class BaseLoader(ABC):
def lazy_load(
self,
) -> Iterator[Document]:
"""A lazy loader for document content."""
"""A lazy loader for Documents."""
raise NotImplementedError(
f"{self.__class__.__name__} does not implement lazy_load()"
)
@ -49,7 +57,7 @@ class BaseLoader(ABC):
class BaseBlobParser(ABC):
"""Abstract interface for blob parsers.
A blob parser is provides a way to parse raw data stored in a blob into one
A blob parser provides a way to parse raw data stored in a blob into one
or more documents.
The parser can be composed with blob loaders, making it easy to re-use
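To make the lazy-loading contract concrete, a minimal hypothetical loader that follows it (the class name and file format are invented for illustration):
```python
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class LineLoader(BaseLoader):
    """Hypothetical loader: one Document per non-empty line of a text file."""

    def __init__(self, file_path: str, encoding: str = "utf-8"):
        self.file_path = file_path
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        # Generator keeps memory usage flat for large files.
        with open(self.file_path, encoding=self.encoding) as f:
            for i, line in enumerate(f):
                if line.strip():
                    yield Document(
                        page_content=line.rstrip("\n"),
                        metadata={"source": self.file_path, "line": i},
                    )

    def load(self) -> List[Document]:
        # As the interface suggests, load() just materializes lazy_load().
        return list(self.lazy_load())
```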

@ -34,8 +34,12 @@ class BibtexLoader(BaseLoader):
Args:
file_path: Path to the bibtex file.
parser: The parser to use. If None, a default parser is used.
max_docs: Max number of associated documents to load. Use -1 to mean
no limit.
max_content_chars: Maximum number of characters to load from the PDF.
load_extra_metadata: Whether to load extra metadata from the PDF.
file_pattern: Regex pattern to match the file name in the bibtex.
"""
self.file_path = file_path
self.parser = parser or BibtexparserWrapper()
@ -70,9 +74,7 @@ class BibtexLoader(BaseLoader):
def lazy_load(self) -> Iterator[Document]:
"""Load bibtex file using bibtexparser and get the article texts plus the
article metadata.
See https://bibtexparser.readthedocs.io/en/master/
Returns:

@ -37,7 +37,7 @@ class BigQueryLoader(BaseLoader):
metadata_columns: Optional. The columns to write into the `metadata` of the
document.
credentials : google.auth.credentials.Credentials, optional
Credentials for accessing Google APIs. Use this parameter to override
Credentials for accessing Google APIs. Use this parameter to override
default credentials, such as to use Compute Engine
(`google.auth.compute_engine.Credentials`) or Service Account
(`google.oauth2.service_account.Credentials`) credentials directly.
@ -52,7 +52,7 @@ class BigQueryLoader(BaseLoader):
try:
from google.cloud import bigquery
except ImportError as ex:
raise ValueError(
raise ImportError(
"Could not import google-cloud-bigquery python package. "
"Please install it with `pip install google-cloud-bigquery`."
) from ex
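For reference, a minimal usage sketch (project, dataset, table, and column names are placeholders; requires `google-cloud-bigquery` and valid credentials):
```python
from langchain.document_loaders import BigQueryLoader

# Placeholder query; each returned row becomes one Document.
query = """
SELECT title, body, url
FROM `my_project.my_dataset.articles`
LIMIT 10
"""
loader = BigQueryLoader(query, metadata_columns=["url"])
docs = loader.load()
```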

@ -13,11 +13,15 @@ class BiliBiliLoader(BaseLoader):
"""Loader that loads bilibili transcripts."""
def __init__(self, video_urls: List[str]):
"""Initialize with bilibili url."""
"""Initialize with bilibili url.
Args:
video_urls: List of bilibili urls.
"""
self.video_urls = video_urls
def load(self) -> List[Document]:
"""Load from bilibili url."""
"""Load Documents from bilibili url."""
results = []
for url in self.video_urls:
transcript, video_info = self._get_bilibili_subs_and_info(url)
@ -30,7 +34,7 @@ class BiliBiliLoader(BaseLoader):
try:
from bilibili_api import sync, video
except ImportError:
raise ValueError(
raise ImportError(
"requests package not found, please install it with "
"`pip install bilibili-api-python`"
)
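For reference, a minimal usage sketch (the video URL is a placeholder; requires the `bilibili-api-python` package):
```python
from langchain.document_loaders import BiliBiliLoader

loader = BiliBiliLoader(["https://www.bilibili.com/video/BV1xt411o7Xu/"])
docs = loader.load()
```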

@ -12,7 +12,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
class BlackboardLoader(WebBaseLoader):
"""Loader that loads all documents from a Blackboard course.
"""Loads all documents from a Blackboard course.
This loader is not compatible with all Blackboard courses. It is only
compatible with courses that use the new Blackboard interface.
@ -34,8 +34,11 @@ class BlackboardLoader(WebBaseLoader):
"""
base_url: str
"""Base url of the blackboard course."""
folder_path: str
"""Path to the folder containing the documents."""
load_all_recursively: bool
"""If True, load all documents recursively."""
def __init__(
self,
@ -64,7 +67,7 @@ class BlackboardLoader(WebBaseLoader):
try:
self.base_url = blackboard_course_url.split("/webapps/blackboard")[0]
except IndexError:
raise ValueError(
raise IndexError(
"Invalid blackboard course url. "
"Please provide a url that starts with "
"https://<blackboard_url>/webapps/blackboard"
@ -94,10 +97,10 @@ class BlackboardLoader(WebBaseLoader):
)
def load(self) -> List[Document]:
"""Load data into document objects.
"""Load data into Document objects.
Returns:
List of documents.
List of Documents.
"""
if self.load_all_recursively:
soup_info = self.scrape()
@ -118,7 +121,7 @@ class BlackboardLoader(WebBaseLoader):
return self._get_documents(soup_info)
def _get_folder_path(self, soup: Any) -> str:
"""Get the folder path to save the documents in.
"""Get the folder path to save the Documents in.
Args:
soup: BeautifulSoup4 soup object.
@ -229,7 +232,7 @@ class BlackboardLoader(WebBaseLoader):
return relative_paths
def download(self, path: str) -> None:
"""Download a file from a url.
"""Download a file from an url.
Args:
path: Path to the file.
@ -243,7 +246,7 @@ class BlackboardLoader(WebBaseLoader):
f.write(response.content)
def parse_filename(self, url: str) -> str:
"""Parse the filename from a url.
"""Parse the filename from an url.
Args:
url: Url to parse the filename from.
@ -257,7 +260,7 @@ class BlackboardLoader(WebBaseLoader):
return self._parse_filename_from_url(url)
def _parse_filename_from_url(self, url: str) -> str:
"""Parse the filename from a url.
"""Parse the filename from an url.
Args:
url: Url to parse the filename from.

@ -55,6 +55,16 @@ class BlockchainDocumentLoader(BaseLoader):
get_all_tokens: bool = False,
max_execution_time: Optional[int] = None,
):
"""
Args:
contract_address: The address of the smart contract.
blockchainType: The blockchain type.
api_key: The Alchemy API key.
startToken: The start token for pagination.
get_all_tokens: Whether to get all tokens on the contract.
max_execution_time: The maximum execution time (sec).
"""
self.contract_address = contract_address
self.blockchainType = blockchainType.value
self.api_key = os.environ.get("ALCHEMY_API_KEY") or api_key
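For reference, a minimal usage sketch (the contract address is a placeholder; expects an Alchemy key in `ALCHEMY_API_KEY` or passed via `api_key`):
```python
from langchain.document_loaders.blockchain import (
    BlockchainDocumentLoader,
    BlockchainType,
)

# Placeholder NFT contract address on Ethereum mainnet.
loader = BlockchainDocumentLoader(
    contract_address="0x0000000000000000000000000000000000000000",
    blockchainType=BlockchainType.ETH_MAINNET,
)
docs = loader.load()
```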

@ -1,4 +1,3 @@
"""Load conversations from ChatGPT data export"""
import datetime
import json
from typing import List
@ -29,9 +28,15 @@ def concatenate_rows(message: dict, title: str) -> str:
class ChatGPTLoader(BaseLoader):
"""Loader that loads conversations from exported ChatGPT data."""
"""Load conversations from exported ChatGPT data."""
def __init__(self, log_file: str, num_logs: int = -1):
"""
Args:
log_file: Path to the log file.
num_logs: Number of logs to load. If 0, load all logs.
"""
self.log_file = log_file
self.num_logs = num_logs
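For reference, a minimal usage sketch (the export path is a placeholder):
```python
from langchain.document_loaders import ChatGPTLoader

# Placeholder path to the conversations.json file from a ChatGPT data export.
loader = ChatGPTLoader(log_file="./conversations.json", num_logs=1)
docs = loader.load()
```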

@ -284,9 +284,7 @@
" error=False, # Only runs that succeed\n",
")\n",
"for run in runs:\n",
" client.create_example(\n",
" inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n",
" )"
" client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)"
]
},
{
@ -333,7 +331,7 @@
"eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
"\n",
"# Measures accuracy against ground truth\n",
"qa_evaluator = get_qa_evaluator(eval_llm) \n",
"qa_evaluator = get_qa_evaluator(eval_llm)\n",
"\n",
"# Measures how effective and efficient the agent's actions are\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
@ -392,13 +390,13 @@
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"\n",
"\n",
"# Since chains can be stateful (e.g. they can have memory), we need provide\n",
"# a way to initialize a new chain for each row in the dataset. This is done\n",
"# by passing in a factory function that returns a new chain for each row.\n",
"def agent_factory():\n",
" return initialize_agent(\n",
" tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
")\n",
" return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
"\n",
"\n",
"# If your chain is NOT stateful, your factory can return the object directly\n",
"# to improve runtime performance. For example:\n",
@ -477,7 +475,7 @@
"source": [
"from langchain.client import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
")\n",
"\n",
"?arun_on_dataset"
@ -616,9 +614,7 @@
},
"outputs": [],
"source": [
"agent = initialize_agent(\n",
" tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
")"
"agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)"
]
},
{
