Harrison/fauna loader (#5864)

Co-authored-by: Shadid12 <Shadid12@users.noreply.github.com>
Harrison Chase 2023-06-07 21:32:23 -07:00 committed by GitHub
parent 5518f24ec3
commit 658f8bdee7
4 changed files with 190 additions and 0 deletions


@@ -0,0 +1,84 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fauna\n",
"\n",
">[Fauna](https://fauna.com/) is a Document Database.\n",
"\n",
"Query `Fauna` documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!pip install fauna"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query data example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.fauna import FaunaLoader\n",
"\n",
"secret = \"<enter-valid-fauna-secret>\"\n",
"query = \"Item.all()\" # Fauna query. Assumes that the collection is called \"Item\"\n",
"field = \"text\" # The field that contains the page content. Assumes that the field is called \"text\"\n",
"\n",
"loader = FaunaLoader(query, field, secret)\n",
"docs = loader.lazy_load()\n",
"\n",
"for value in docs:\n",
" print(value)"
]
},
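{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`FaunaLoader.load()` is the eager counterpart of `lazy_load()`; it simply collects the iterator into a list of `Document` objects. A minimal sketch (assuming the secret above is valid and the `Item` collection holds at least one document with a `text` field):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Eagerly load every document returned by the query above\n",
"docs = loader.load()\n",
"\n",
"print(len(docs))\n",
"print(docs[0].page_content)  # value of the \"text\" field\n",
"print(docs[0].metadata)  # includes the Fauna document \"id\" and \"ts\""
]
},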
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Query with Pagination\n",
"You get a `after` value if there are more data. You can get values after the curcor by passing in the `after` string in query. \n",
"\n",
"To learn more following [this link](https://fqlx-beta--fauna-docs.netlify.app/fqlx/beta/reference/schema_entities/set/static-paginate)"
]
},
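{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"When more data exists, the loader also yields a final sentinel `Document` whose `page_content` is `\"Next Page Exists\"` and whose `metadata` holds the `after` cursor. Below is a minimal sketch (reusing the loader from the example above) of how you might read that cursor before passing it to `Item.paginate(...)` in the next cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Look for the pagination sentinel emitted by FaunaLoader\n",
"docs = loader.load()\n",
"\n",
"after = next((doc.metadata[\"after\"] for doc in docs if \"after\" in doc.metadata), None)\n",
"print(after)  # pass this cursor string to Item.paginate(...) as shown below"
]
},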
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"\n",
"Item.paginate(\"hs+DzoPOg ... aY1hOohozrV7A\")\n",
"Item.all()\n",
"\"\"\"\n",
"loader = FaunaLoader(query, field, secret)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -34,6 +34,7 @@ from langchain.document_loaders.epub import UnstructuredEPubLoader
from langchain.document_loaders.evernote import EverNoteLoader
from langchain.document_loaders.excel import UnstructuredExcelLoader
from langchain.document_loaders.facebook_chat import FacebookChatLoader
from langchain.document_loaders.fauna import FaunaLoader
from langchain.document_loaders.figma import FigmaFileLoader
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
from langchain.document_loaders.gcs_file import GCSFileLoader
@@ -155,6 +156,7 @@ __all__ = [
    "DocugamiLoader",
    "Docx2txtLoader",
    "DuckDBLoader",
    "FaunaLoader",
    "EverNoteLoader",
    "FacebookChatLoader",
    "FigmaFileLoader",


@@ -0,0 +1,63 @@
from typing import Iterator, List, Optional, Sequence

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class FaunaLoader(BaseLoader):
    """Load documents from FaunaDB.

    Attributes:
        query (str): The FQL query string to execute.
        page_content_field (str): The field that contains the content of each page.
        secret (str): The secret key for authenticating to FaunaDB.
        metadata_fields (Optional[Sequence[str]]):
            Optional list of field names to include in metadata.
    """

    def __init__(
        self,
        query: str,
        page_content_field: str,
        secret: str,
        metadata_fields: Optional[Sequence[str]] = None,
    ):
        self.query = query
        self.page_content_field = page_content_field
        self.secret = secret
        self.metadata_fields = metadata_fields

    def load(self) -> List[Document]:
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        try:
            from fauna import Page, fql
            from fauna.client import Client
            from fauna.encoding import QuerySuccess
        except ImportError:
            raise ImportError(
                "Could not import fauna python package. "
                "Please install it with `pip install fauna`."
            )
        # Create Fauna Client
        client = Client(secret=self.secret)
        # Run FQL Query
        response: QuerySuccess = client.query(fql(self.query))
        page: Page = response.data
        for result in page:
            if result is not None:
                document_dict = dict(result.items())
                # Use the configured field as the page content
                page_content = ""
                for key, value in document_dict.items():
                    if key == self.page_content_field:
                        page_content = value
                document: Document = Document(
                    page_content=page_content,
                    metadata={"id": result.id, "ts": result.ts},
                )
                yield document
        # If more results exist, yield a sentinel document carrying the cursor
        if page.after is not None:
            yield Document(
                page_content="Next Page Exists",
                metadata={"after": page.after},
            )


@@ -0,0 +1,41 @@
import unittest

from langchain.document_loaders.fauna import FaunaLoader

try:
    import fauna  # noqa: F401

    fauna_installed = True
except ImportError:
    fauna_installed = False


@unittest.skipIf(not fauna_installed, "fauna not installed")
class TestFaunaLoader(unittest.TestCase):
    def setUp(self) -> None:
        self.fauna_secret = "<enter-valid-fauna-secret>"
        self.valid_fql_query = "Item.all()"
        self.valid_page_content_field = "text"
        self.valid_metadata_fields = ["valid_metadata_fields"]

    def test_fauna_loader(self) -> None:
        """Test Fauna loader."""
        loader = FaunaLoader(
            query=self.valid_fql_query,
            page_content_field=self.valid_page_content_field,
            secret=self.fauna_secret,
            metadata_fields=self.valid_metadata_fields,
        )
        docs = loader.load()

        assert len(docs) > 0  # assuming the query returns at least one document
        for doc in docs:
            assert (
                doc.page_content != ""
            )  # assuming that every document has page_content
            assert (
                "id" in doc.metadata and doc.metadata["id"] != ""
            )  # assuming that every document has 'id'
            assert (
                "ts" in doc.metadata and doc.metadata["ts"] != ""
            )  # assuming that every document has 'ts'