mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/duckdb (#2064)
Co-authored-by: Trent Hauck <trent@trenthauck.com>
This commit is contained in:
parent
76ecca4d53
commit
f74a1bebf5
175
docs/modules/indexes/document_loaders/examples/duckdb.ipynb
Normal file
175
docs/modules/indexes/document_loaders/examples/duckdb.ipynb
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# DuckDB Loader\n",
|
||||||
|
"\n",
|
||||||
|
"Load a DuckDB query with one document per row."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import DuckDBLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Writing example.csv\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%file example.csv\n",
|
||||||
|
"Team,Payroll\n",
|
||||||
|
"Nationals,81.34\n",
|
||||||
|
"Reds,82.20"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = DuckDBLoader(\"SELECT * FROM read_csv_auto('example.csv')\")\n",
|
||||||
|
"\n",
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[Document(page_content='Team: Nationals\\nPayroll: 81.34', metadata={}), Document(page_content='Team: Reds\\nPayroll: 82.2', metadata={})]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(data)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Specifying Which Columns are Content vs Metadata"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = DuckDBLoader(\n",
|
||||||
|
" \"SELECT * FROM read_csv_auto('example.csv')\",\n",
|
||||||
|
" page_content_columns=[\"Team\"],\n",
|
||||||
|
" metadata_columns=[\"Payroll\"]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[Document(page_content='Team: Nationals', metadata={'Payroll': 81.34}), Document(page_content='Team: Reds', metadata={'Payroll': 82.2})]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(data)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Adding Source to Metadata"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = DuckDBLoader(\n",
|
||||||
|
" \"SELECT Team, Payroll, Team As source FROM read_csv_auto('example.csv')\",\n",
|
||||||
|
" metadata_columns=[\"source\"]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[Document(page_content='Team: Nationals\\nPayroll: 81.34\\nsource: Nationals', metadata={'source': 'Nationals'}), Document(page_content='Team: Reds\\nPayroll: 82.2\\nsource: Reds', metadata={'source': 'Reds'})]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(data)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 1
|
||||||
|
}
|
@ -14,6 +14,7 @@ from langchain.document_loaders.conllu import CoNLLULoader
|
|||||||
from langchain.document_loaders.csv_loader import CSVLoader
|
from langchain.document_loaders.csv_loader import CSVLoader
|
||||||
from langchain.document_loaders.dataframe import DataFrameLoader
|
from langchain.document_loaders.dataframe import DataFrameLoader
|
||||||
from langchain.document_loaders.directory import DirectoryLoader
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
|
from langchain.document_loaders.duckdb_loader import DuckDBLoader
|
||||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||||
from langchain.document_loaders.evernote import EverNoteLoader
|
from langchain.document_loaders.evernote import EverNoteLoader
|
||||||
from langchain.document_loaders.facebook_chat import FacebookChatLoader
|
from langchain.document_loaders.facebook_chat import FacebookChatLoader
|
||||||
@ -61,7 +62,7 @@ from langchain.document_loaders.youtube import (
|
|||||||
YoutubeLoader,
|
YoutubeLoader,
|
||||||
)
|
)
|
||||||
|
|
||||||
"""Legacy: only for backwards compat. use PyPDFLoader instead"""
|
# Legacy: only for backwards compat. Use PyPDFLoader instead
|
||||||
PagedPDFSplitter = PyPDFLoader
|
PagedPDFSplitter = PyPDFLoader
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@ -116,4 +117,5 @@ __all__ = [
|
|||||||
"AzureBlobStorageFileLoader",
|
"AzureBlobStorageFileLoader",
|
||||||
"AzureBlobStorageContainerLoader",
|
"AzureBlobStorageContainerLoader",
|
||||||
"SitemapLoader",
|
"SitemapLoader",
|
||||||
|
"DuckDBLoader",
|
||||||
]
|
]
|
||||||
|
74
langchain/document_loaders/duckdb_loader.py
Normal file
74
langchain/document_loaders/duckdb_loader.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
from typing import Dict, List, Optional, cast
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class DuckDBLoader(BaseLoader):
    """Loads a query result from DuckDB into a list of documents.

    Each document represents one row of the result. The `page_content_columns`
    are written into the `page_content` of the document. The `metadata_columns`
    are written into the `metadata` of the document. By default, all columns
    are written into the `page_content` and none into the `metadata`.
    """

    def __init__(
        self,
        query: str,
        database: str = ":memory:",
        read_only: bool = False,
        config: Optional[Dict[str, str]] = None,
        page_content_columns: Optional[List[str]] = None,
        metadata_columns: Optional[List[str]] = None,
    ):
        """Initialize the loader.

        Args:
            query: SQL query to execute against the database.
            database: Path to the DuckDB database file, or ":memory:" (the
                default) for a transient in-memory database.
            read_only: Whether to open the database in read-only mode.
            config: Optional DuckDB configuration options, passed through to
                ``duckdb.connect``.
            page_content_columns: Result columns rendered into
                ``page_content``. Defaults to all columns.
            metadata_columns: Result columns copied into ``metadata``.
                Defaults to none.
        """
        self.query = query
        self.database = database
        self.read_only = read_only
        self.config = config or {}
        self.page_content_columns = page_content_columns
        self.metadata_columns = metadata_columns

    def load(self) -> List[Document]:
        """Execute the query and return one ``Document`` per result row.

        Returns:
            A list of documents, one per row, with ``page_content`` holding
            "column: value" lines and ``metadata`` holding the selected
            metadata columns.

        Raises:
            ValueError: If the ``duckdb`` package is not installed.
        """
        try:
            import duckdb
        except ImportError:
            # ValueError (not ImportError) is re-raised so existing callers
            # that catch ValueError keep working.
            raise ValueError(
                "Could not import duckdb python package. "
                "Please install it with `pip install duckdb`."
            )

        docs = []
        with duckdb.connect(
            database=self.database, read_only=self.read_only, config=self.config
        ) as con:
            query_result = con.execute(self.query)
            results = query_result.fetchall()
            # DB-API cursor.description is Optional; after a successful
            # SELECT it is populated, so narrow the type for mypy.
            description = cast(list, query_result.description)
            field_names = [c[0] for c in description]
            # Map column name -> position once, instead of calling
            # field_names.index() (an O(n) scan) for every cell of every row.
            field_index = {name: i for i, name in enumerate(field_names)}

            if self.page_content_columns is None:
                page_content_columns = field_names
            else:
                page_content_columns = self.page_content_columns

            if self.metadata_columns is None:
                metadata_columns = []
            else:
                metadata_columns = self.metadata_columns

            for result in results:
                page_content = "\n".join(
                    f"{column}: {result[field_index[column]]}"
                    for column in page_content_columns
                )

                metadata = {
                    column: result[field_index[column]]
                    for column in metadata_columns
                }

                doc = Document(page_content=page_content, metadata=metadata)
                docs.append(doc)

        return docs
|
56
tests/integration_tests/document_loaders/test_duckdb.py
Normal file
56
tests/integration_tests/document_loaders/test_duckdb.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
import unittest
|
||||||
|
|
||||||
|
from langchain.document_loaders.duckdb_loader import DuckDBLoader
|
||||||
|
|
||||||
|
try:
|
||||||
|
import duckdb # noqa: F401
|
||||||
|
|
||||||
|
duckdb_installed = True
|
||||||
|
except ImportError:
|
||||||
|
duckdb_installed = False
|
||||||
|
|
||||||
|
|
||||||
|
@unittest.skipIf(not duckdb_installed, "duckdb not installed")
def test_duckdb_loader_no_options() -> None:
    """With default options every column lands in page_content, none in metadata."""
    documents = DuckDBLoader("SELECT 1 AS a, 2 AS b").load()

    assert len(documents) == 1
    document = documents[0]
    assert document.page_content == "a: 1\nb: 2"
    assert document.metadata == {}
||||||
|
|
||||||
|
|
||||||
|
@unittest.skipIf(not duckdb_installed, "duckdb not installed")
def test_duckdb_loader_page_content_columns() -> None:
    """Restricting page_content_columns keeps other columns out of the content."""
    documents = DuckDBLoader(
        "SELECT 1 AS a, 2 AS b UNION SELECT 3 AS a, 4 AS b",
        page_content_columns=["a"],
    ).load()

    assert len(documents) == 2

    first, second = documents
    assert first.page_content == "a: 1"
    assert first.metadata == {}
    assert second.page_content == "a: 3"
    assert second.metadata == {}
|
||||||
|
|
||||||
|
|
||||||
|
@unittest.skipIf(not duckdb_installed, "duckdb not installed")
def test_duckdb_loader_metadata_columns() -> None:
    """Columns listed in metadata_columns are routed into document metadata."""
    loader = DuckDBLoader(
        "SELECT 1 AS a, 2 AS b",
        page_content_columns=["a"],
        metadata_columns=["b"],
    )
    documents = loader.load()

    assert len(documents) == 1
    document = documents[0]
    assert document.page_content == "a: 1"
    assert document.metadata == {"b": 2}
|
Loading…
Reference in New Issue
Block a user