mirror of https://github.com/hwchase17/langchain
airbyte[patch]: init pkg (#18236)
parent
ac1d7d9de8
commit
be8d2ff5f7
@ -0,0 +1,292 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1f3a5ebf",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# AirbyteLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "35ac77b1-449b-44f7-b8f3-3494d55c286e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
">[Airbyte](https://github.com/airbytehq/airbyte) is a data integration platform for ELT pipelines from APIs, databases & files to warehouses & lakes. It has the largest catalog of ELT connectors to data warehouses and databases.\n",
|
||||||
|
"\n",
|
||||||
|
"This covers how to load any source from Airbyte into LangChain documents\n",
|
||||||
|
"\n",
|
||||||
|
"## Installation\n",
|
||||||
|
"\n",
|
||||||
|
"In order to use `AirbyteLoader` you need to install the `langchain-airbyte` integration package."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "180c8b74",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU langchain-airbyte"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "3dd92c62",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Loading Documents\n",
|
||||||
|
"\n",
|
||||||
|
"By default, the `AirbyteLoader` will load any structured data from a stream and output yaml-formatted documents."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "721d9316",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"```yaml\n",
|
||||||
|
"academic_degree: PhD\n",
|
||||||
|
"address:\n",
|
||||||
|
" city: Lauderdale Lakes\n",
|
||||||
|
" country_code: FI\n",
|
||||||
|
" postal_code: '75466'\n",
|
||||||
|
" province: New Jersey\n",
|
||||||
|
" state: Hawaii\n",
|
||||||
|
" street_name: Stoneyford\n",
|
||||||
|
" street_number: '1112'\n",
|
||||||
|
"age: 44\n",
|
||||||
|
"blood_type: \"O\\u2212\"\n",
|
||||||
|
"created_at: '2004-04-02T13:05:27+00:00'\n",
|
||||||
|
"email: bread2099+1@outlook.com\n",
|
||||||
|
"gender: Fluid\n",
|
||||||
|
"height: '1.62'\n",
|
||||||
|
"id: 1\n",
|
||||||
|
"language: Belarusian\n",
|
||||||
|
"name: Moses\n",
|
||||||
|
"nationality: Dutch\n",
|
||||||
|
"occupation: Track Worker\n",
|
||||||
|
"telephone: 1-467-194-2318\n",
|
||||||
|
"title: M.Sc.Tech.\n",
|
||||||
|
"updated_at: '2024-02-27T16:41:01+00:00'\n",
|
||||||
|
"weight: 6\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain_airbyte import AirbyteLoader\n",
|
||||||
|
"\n",
|
||||||
|
"loader = AirbyteLoader(\n",
|
||||||
|
" source=\"source-faker\",\n",
|
||||||
|
" stream=\"users\",\n",
|
||||||
|
" config={\"count\": 10},\n",
|
||||||
|
")\n",
|
||||||
|
"docs = loader.load()\n",
|
||||||
|
"print(docs[0].page_content[:500])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fca024cb",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"You can also specify a custom prompt template for formatting documents:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "9fa002a5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"My name is Verdie and I am 1.73 meters tall.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain_core.prompts import PromptTemplate\n",
|
||||||
|
"\n",
|
||||||
|
"loader_templated = AirbyteLoader(\n",
|
||||||
|
" source=\"source-faker\",\n",
|
||||||
|
" stream=\"users\",\n",
|
||||||
|
" config={\"count\": 10},\n",
|
||||||
|
" template=PromptTemplate.from_template(\n",
|
||||||
|
" \"My name is {name} and I am {height} meters tall.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
")\n",
|
||||||
|
"docs_templated = loader_templated.load()\n",
|
||||||
|
"print(docs_templated[0].page_content)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d3e6d887",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Lazy Loading Documents\n",
|
||||||
|
"\n",
|
||||||
|
"One of the powerful features of `AirbyteLoader` is its ability to load large documents from upstream sources. When working with large datasets, the default `.load()` behavior can be slow and memory-intensive. To avoid this, you can use the `.lazy_load()` method to load documents in a more memory-efficient manner."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "684b9187",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Just calling lazy load is quick! This took 0.0001 seconds\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import time\n",
|
||||||
|
"\n",
|
||||||
|
"loader = AirbyteLoader(\n",
|
||||||
|
" source=\"source-faker\",\n",
|
||||||
|
" stream=\"users\",\n",
|
||||||
|
" config={\"count\": 3},\n",
|
||||||
|
" template=PromptTemplate.from_template(\n",
|
||||||
|
" \"My name is {name} and I am {height} meters tall.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"start_time = time.time()\n",
|
||||||
|
"my_iterator = loader.lazy_load()\n",
|
||||||
|
"print(\n",
|
||||||
|
" f\"Just calling lazy load is quick! This took {time.time() - start_time:.4f} seconds\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "6b24a64b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"And you can iterate over documents as they're yielded:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "3e8355d0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"My name is Andera and I am 1.91 meters tall.\n",
|
||||||
|
"My name is Jody and I am 1.85 meters tall.\n",
|
||||||
|
"My name is Zonia and I am 1.53 meters tall.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for doc in my_iterator:\n",
|
||||||
|
" print(doc.page_content)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d1040d81",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"You can also lazy load documents in an async manner with `.alazy_load()`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "dc5d0911",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"My name is Carmelina and I am 1.74 meters tall.\n",
|
||||||
|
"My name is Ali and I am 1.90 meters tall.\n",
|
||||||
|
"My name is Rochell and I am 1.83 meters tall.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"loader = AirbyteLoader(\n",
|
||||||
|
" source=\"source-faker\",\n",
|
||||||
|
" stream=\"users\",\n",
|
||||||
|
" config={\"count\": 3},\n",
|
||||||
|
" template=PromptTemplate.from_template(\n",
|
||||||
|
" \"My name is {name} and I am {height} meters tall.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"my_async_iterator = loader.alazy_load()\n",
|
||||||
|
"\n",
|
||||||
|
"async for doc in my_async_iterator:\n",
|
||||||
|
" print(doc.page_content)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ba4ede33",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configuration\n",
|
||||||
|
"\n",
|
||||||
|
"`AirbyteLoader` can be configured with the following options:\n",
|
||||||
|
"\n",
|
||||||
|
"- `source` (str, required): The name of the Airbyte source to load from.\n",
|
||||||
|
"- `stream` (str, required): The name of the stream to load from (Airbyte sources can return multiple streams)\n",
|
||||||
|
"- `config` (dict, required): The configuration for the Airbyte source\n",
|
||||||
|
"- `template` (PromptTemplate, optional): A custom prompt template for formatting documents\n",
|
||||||
|
"- `include_metadata` (bool, optional, default True): Whether to include all fields as metadata in the output documents\n",
|
||||||
|
"\n",
|
||||||
|
"The majority of the configuration will be in `config`, and you can find the specific configuration options in the \"Config field reference\" for each source in the [Airbyte documentation](https://docs.airbyte.com/integrations/)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2e2ed269",
|
||||||
|
"metadata": {},
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1 @@
|
|||||||
|
__pycache__
|
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2024 LangChain, Inc.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
@ -0,0 +1,56 @@
|
|||||||
|
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests

# Default target executed when no arguments are given to make.
all: help

# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
# Integration-test targets override TEST_FILE (target-specific variable).
integration_test integration_tests: TEST_FILE = tests/integration_tests/

test tests integration_test integration_tests:
	poetry run pytest $(TEST_FILE)

######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
# lint_diff/format_diff only touch files changed relative to master.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/airbyte --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_airbyte
lint_tests: PYTHON_FILES=tests
# Tests use a separate mypy cache so package and test runs don't clash.
lint_tests: MYPY_CACHE=.mypy_cache_test

lint lint_diff lint_package lint_tests:
	poetry run ruff .
	poetry run ruff format $(PYTHON_FILES) --diff
	poetry run ruff --select I $(PYTHON_FILES)
	mkdir -p $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)

format format_diff:
	poetry run ruff format $(PYTHON_FILES)
	poetry run ruff --select I --fix $(PYTHON_FILES)

spell_check:
	poetry run codespell --toml pyproject.toml

spell_fix:
	poetry run codespell --toml pyproject.toml -w

# Re-runs whenever any package source file changes; $^ is the file list.
check_imports: $(shell find langchain_airbyte -name '*.py')
	poetry run python ./scripts/check_imports.py $^

######################
# HELP
######################

help:
	@echo '----'
	@echo 'check_imports                - check imports'
	@echo 'format                       - run code formatters'
	@echo 'lint                         - run linters'
	@echo 'test                         - run unit tests'
	@echo 'tests                        - run unit tests'
	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
|
@ -0,0 +1,27 @@
|
|||||||
|
# langchain-airbyte
|
||||||
|
|
||||||
|
This package contains the LangChain integration with Airbyte
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -U langchain-airbyte
|
||||||
|
```
|
||||||
|
|
||||||
|
The integration package doesn't have any global environment variables that need to be
|
||||||
|
set, but some integrations (e.g. `source-github`) may need credentials passed in.
|
||||||
|
|
||||||
|
## Document Loaders
|
||||||
|
|
||||||
|
The `AirbyteLoader` class exposes a single document loader for Airbyte sources.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain_airbyte import AirbyteLoader
|
||||||
|
|
||||||
|
loader = AirbyteLoader(
|
||||||
|
source="source-faker",
|
||||||
|
stream="users",
|
||||||
|
config={"count": 100},
|
||||||
|
)
|
||||||
|
docs = loader.load()
|
||||||
|
```
|
@ -0,0 +1,3 @@
|
|||||||
|
"""LangChain integration package for Airbyte.

Exposes ``AirbyteLoader`` for loading Airbyte source streams as documents.
"""

from langchain_airbyte.document_loaders import AirbyteLoader

__all__ = ["AirbyteLoader"]
|
@ -0,0 +1,121 @@
|
"""Airbyte document loaders."""

from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterator,
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    TypeVar,
)

import airbyte as ab
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import run_in_executor
from langchain_core.vectorstores import VectorStore

if TYPE_CHECKING:
    from langchain.text_splitter import TextSplitter
    from langchain_core.documents import Document

# NOTE(review): VST appears unused in this module — presumably kept for a
# planned vector-store integration; confirm before removing.
VST = TypeVar("VST", bound=VectorStore)
class AirbyteLoader:
    """Airbyte Document Loader.

    Loads records from a single stream of an Airbyte source connector and
    exposes them as LangChain ``Document`` objects, either eagerly
    (``load``) or lazily (``lazy_load`` / ``alazy_load``).

    Example:
        .. code-block:: python

            from langchain_airbyte import AirbyteLoader

            loader = AirbyteLoader(
                source="github",
                stream="pull_requests",
            )
            documents = loader.lazy_load()
    """

    def __init__(
        self,
        source: str,
        stream: str,
        *,
        config: Optional[Dict] = None,
        include_metadata: bool = True,
        template: Optional[PromptTemplate] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            source: Name of the Airbyte source connector, e.g. ``source-faker``.
            stream: Name of the stream to read (sources can expose several).
            config: Connector-specific configuration passed through to Airbyte.
            include_metadata: Whether to expose record fields as document
                metadata.
            template: Optional prompt template used to render each record into
                ``page_content``; if omitted, Airbyte's own document rendering
                is used.
        """
        self._airbyte_source = ab.get_source(source, config=config, streams=[stream])
        self._stream = stream
        self._template = template
        self._include_metadata = include_metadata

    def load(self) -> List[Document]:
        """Load source data into Document objects."""
        return list(self.lazy_load())

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
        """Load Documents and split into chunks. Chunks are returned as Documents.

        Args:
            text_splitter: TextSplitter instance to use for splitting documents.
              Defaults to RecursiveCharacterTextSplitter.

        Returns:
            List of Documents.
        """
        # Imported lazily so `langchain` is only required when splitting.
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        if text_splitter is None:
            _text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
        else:
            _text_splitter = text_splitter
        docs = self.lazy_load()
        return _text_splitter.split_documents(docs)

    def lazy_load(self) -> Iterator[Document]:
        """A lazy loader for Documents."""
        # If no prompt template is defined, use Airbyte's default documents.
        if not self._template:
            for document in self._airbyte_source.get_documents(self._stream):
                # Convert the Airbyte document to a LangChain document.
                metadata = (
                    {}
                    if not self._include_metadata
                    else {
                        **document.metadata,
                        "_last_modified": document.last_modified,
                        "_id": document.id,
                    }
                )
                yield Document(
                    page_content=document.content,
                    metadata=metadata,
                )
        else:
            records: Iterator[Mapping[str, Any]] = self._airbyte_source.get_records(
                self._stream
            )
            for record in records:
                metadata = {} if not self._include_metadata else dict(record)
                yield Document(
                    page_content=self._template.format(**record), metadata=metadata
                )

    async def alazy_load(self) -> AsyncIterator[Document]:
        """A lazy loader for Documents."""
        # Build the sync generator off the event loop, then pull items one at
        # a time in an executor so Airbyte I/O never blocks the loop.
        iterator = await run_in_executor(None, self.lazy_load)
        # Sentinel distinguishing "iterator exhausted" from a yielded value.
        done = object()
        while True:
            doc = await run_in_executor(None, next, iterator, done)  # type: ignore[call-arg, arg-type]
            if doc is done:
                break
            yield doc  # type: ignore[misc]
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,85 @@
|
|||||||
|
[tool.poetry]
|
||||||
|
name = "langchain-airbyte"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "An integration package connecting Airbyte and LangChain"
|
||||||
|
authors = []
|
||||||
|
readme = "README.md"
|
||||||
|
repository = "https://github.com/langchain-ai/langchain"
|
||||||
|
license = "MIT"
|
||||||
|
|
||||||
|
[tool.poetry.urls]
|
||||||
|
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/airbyte"
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.10"
|
||||||
|
langchain-core = "^0.1"
|
||||||
|
airbyte = "^0.7.0"
|
||||||
|
|
||||||
|
[tool.poetry.group.test]
|
||||||
|
optional = true
|
||||||
|
|
||||||
|
[tool.poetry.group.test.dependencies]
|
||||||
|
pytest = "^7.4.3"
|
||||||
|
pytest-asyncio = "^0.23.2"
|
||||||
|
langchain-core = { path = "../../core", develop = true }
|
||||||
|
|
||||||
|
[tool.poetry.group.codespell]
|
||||||
|
optional = true
|
||||||
|
|
||||||
|
[tool.poetry.group.codespell.dependencies]
|
||||||
|
codespell = "^2.2.6"
|
||||||
|
|
||||||
|
[tool.poetry.group.test_integration]
|
||||||
|
optional = true
|
||||||
|
|
||||||
|
[tool.poetry.group.test_integration.dependencies]
|
||||||
|
|
||||||
|
[tool.poetry.group.lint]
|
||||||
|
optional = true
|
||||||
|
|
||||||
|
[tool.poetry.group.lint.dependencies]
|
||||||
|
ruff = "^0.1.8"
|
||||||
|
|
||||||
|
[tool.poetry.group.typing.dependencies]
|
||||||
|
mypy = "^1.7.1"
|
||||||
|
langchain-core = { path = "../../core", develop = true }
|
||||||
|
langchain = "^0.1.9"
|
||||||
|
|
||||||
|
[tool.poetry.group.dev]
|
||||||
|
optional = true
|
||||||
|
|
||||||
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
langchain-core = { path = "../../core", develop = true }
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = [
|
||||||
|
"E", # pycodestyle
|
||||||
|
"F", # pyflakes
|
||||||
|
"I", # isort
|
||||||
|
"T201", # print
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
disallow_untyped_defs = "True"
|
||||||
|
|
||||||
|
[tool.coverage.run]
|
||||||
|
omit = ["tests/*"]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core>=1.0.0"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
# --strict-markers will raise errors on unknown marks.
|
||||||
|
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
|
||||||
|
#
|
||||||
|
# https://docs.pytest.org/en/7.1.x/reference/reference.html
|
||||||
|
# --strict-config any warnings encountered while parsing the `pytest`
|
||||||
|
# section of the configuration file raise errors.
|
||||||
|
addopts = "--strict-markers --strict-config --durations=5"
|
||||||
|
# Registering custom markers.
|
||||||
|
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
|
||||||
|
markers = [
|
||||||
|
"compile: mark placeholder test used to compile integration tests without running them",
|
||||||
|
]
|
||||||
|
asyncio_mode = "auto"
|
@ -0,0 +1,17 @@
|
|||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
from importlib.machinery import SourceFileLoader
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
files = sys.argv[1:]
|
||||||
|
has_failure = False
|
||||||
|
for file in files:
|
||||||
|
try:
|
||||||
|
SourceFileLoader("x", file).load_module()
|
||||||
|
except Exception:
|
||||||
|
has_faillure = True
|
||||||
|
print(file) # noqa: T201
|
||||||
|
traceback.print_exc()
|
||||||
|
print() # noqa: T201
|
||||||
|
|
||||||
|
sys.exit(1 if has_failure else 0)
|
@ -0,0 +1,27 @@
|
|||||||
|
#!/bin/bash
#
# This script searches for lines starting with "import pydantic" or "from pydantic"
# in tracked files within a Git repository, and fails if any are found so the
# repo stays on the langchain_core.pydantic_v1 compatibility shim.
#
# Usage: ./scripts/check_pydantic.sh /path/to/repository

# Check if a path argument is provided
if [ $# -ne 1 ]; then
  echo "Usage: $0 /path/to/repository"
  exit 1
fi

repository_path="$1"

# Search for lines matching the pattern within the specified repository.
# `git grep` only scans tracked files, so build artifacts are ignored.
result=$(git -C "$repository_path" grep -E '^import pydantic|^from pydantic')

# Check if any matching lines were found
if [ -n "$result" ]; then
  echo "ERROR: The following lines need to be updated:"
  echo "$result"
  echo "Please replace the code with an import from langchain_core.pydantic_v1."
  echo "For example, replace 'from pydantic import BaseModel'"
  echo "with 'from langchain_core.pydantic_v1 import BaseModel'"
  exit 1
fi
|
@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
# Fail if any file imports from the heavyweight langchain packages; this
# partner package must depend only on langchain_core.

set -eu

# Initialize a variable to keep track of errors
errors=0

# make sure not importing from langchain, langchain_experimental, or langchain_community
# NOTE: `grep ... && errors=$((errors+1))` increments only on a match; a
# non-match exit status does not abort the script despite `set -e`, because
# the grep is the left-hand side of a compound command.
git --no-pager grep '^from langchain\.' . && errors=$((errors+1))
git --no-pager grep '^from langchain_experimental\.' . && errors=$((errors+1))
git --no-pager grep '^from langchain_community\.' . && errors=$((errors+1))

# Decide on an exit status based on the errors
if [ "$errors" -gt 0 ]; then
    exit 1
else
    exit 0
fi
|
@ -0,0 +1,7 @@
|
|||||||
|
import pytest


@pytest.mark.compile
def test_placeholder() -> None:
    """Used for compiling integration tests without running any real tests."""
    # Intentionally empty: the redundant trailing `pass` after the docstring
    # was removed (a docstring alone is a valid function body).
|
@ -0,0 +1,28 @@
|
|||||||
|
"""Integration tests for the Airbyte document loader (GitHub source)."""

import os

from langchain_airbyte import AirbyteLoader

# Personal access token consumed by the source-github connector.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")


def test_load_github() -> None:
    """Test loading issues from a real GitHub repository via Airbyte."""
    airbyte_loader = AirbyteLoader(
        source="source-github",
        stream="issues",
        config={
            "repositories": ["airbytehq/quickstarts"],
            "credentials": {"personal_access_token": GITHUB_TOKEN},
        },
    )
    documents = airbyte_loader.load()
    assert len(documents) > 0
    # At least one issue should carry a non-empty body in its metadata.
    assert any(
        doc.metadata.get("body") for doc in documents
    ), "No documents with body found"
|
@ -0,0 +1,77 @@
|
|||||||
|
from langchain_core.prompts import PromptTemplate
|
||||||
|
|
||||||
|
from langchain_airbyte import AirbyteLoader
|
||||||
|
|
||||||
|
|
||||||
|
def test_initialization() -> None:
    """Smoke test: the loader can be constructed without raising."""
    _ = AirbyteLoader(source="source-faker", stream="users", config={"count": 3})
|
||||||
|
|
||||||
|
|
||||||
|
def test_load() -> None:
    """Eagerly load records from the faker source and count them."""
    loader = AirbyteLoader(
        source="source-faker", stream="users", config={"count": 5}
    )
    assert len(loader.load()) == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_lazy_load() -> None:
    """Lazily load records; materialize the iterator to count them."""
    loader = AirbyteLoader(
        source="source-faker", stream="users", config={"count": 3}
    )
    docs = list(loader.lazy_load())
    assert len(docs) == 3
|
||||||
|
|
||||||
|
|
||||||
|
async def test_alazy_load() -> None:
    """Asynchronously iterate the loader and count yielded documents."""
    loader = AirbyteLoader(
        source="source-faker",
        stream="users",
        config={"count": 3},
    )
    total = 0
    async for _ in loader.alazy_load():
        total += 1
    assert total == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_with_template() -> None:
    """Documents rendered through a template carry the formatted content."""
    loader = AirbyteLoader(
        source="source-faker",
        stream="users",
        config={"count": 3},
        template=PromptTemplate.from_template("My name is {name}"),
    )
    docs = loader.load()
    assert len(docs) == 3
    for document in docs:
        assert document.page_content.startswith("My name is ")
        # Metadata is still populated (include_metadata defaults to True).
        assert document.metadata["name"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_no_metadata() -> None:
    """With include_metadata=False every loaded document has empty metadata."""
    loader = AirbyteLoader(
        source="source-faker",
        stream="users",
        config={"count": 3},
        include_metadata=False,
    )
    docs = loader.load()
    assert len(docs) == 3
    assert all(document.metadata == {} for document in docs)
|
@ -0,0 +1,9 @@
|
|||||||
|
from langchain_airbyte import __all__

# The complete expected public API of the package.
EXPECTED_ALL = [
    "AirbyteLoader",
]


def test_all_imports() -> None:
    """The package's declared public API matches the expected symbol list."""
    assert sorted(__all__) == sorted(EXPECTED_ALL)
|
Loading…
Reference in New Issue