You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/document_transformers/nuclia_text_transform.py

49 lines
1.4 KiB
Python

import asyncio
import json
import uuid
from typing import Any, Sequence
from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_community.tools.nuclia.tool import NucliaUnderstandingAPI
class NucliaTextTransformer(BaseDocumentTransformer):
    """Document transformer backed by the Nuclia Understanding API.

    The Nuclia Understanding API splits into paragraphs and sentences,
    identifies entities, provides a summary of the text and generates
    embeddings for all sentences.
    """

    def __init__(self, nua: NucliaUnderstandingAPI):
        """Store the Nuclia tool instance used to process document text."""
        self.nua = nua

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Synchronous entry point; not provided by this transformer.

        Raises:
            NotImplementedError: Always. Use ``atransform_documents``.
        """
        raise NotImplementedError

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Push every document's text to Nuclia and attach the results.

        One ``push`` request is issued per document through
        ``self.nua.arun`` and all requests are awaited together. Each
        result is parsed as JSON, and the first entries of its
        ``file_extracted_data`` and ``field_metadata`` lists are stored
        under ``doc.metadata["nuclia"]``.

        Args:
            documents: Documents whose ``page_content`` is sent to Nuclia.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The same ``documents`` sequence that was passed in; the
            documents are updated in place, not copied.

        Note:
            The response shape is not validated; a missing key or an
            empty list propagates as the corresponding exception.
        """
        pending = []
        for document in documents:
            request = {
                "action": "push",
                # A fresh random id per request.
                "id": str(uuid.uuid4()),
                "text": document.page_content,
                "path": None,
            }
            pending.append(self.nua.arun(request))

        # gather() returns results in the order the awaitables were
        # passed, so they can be zipped back onto the input documents.
        responses = await asyncio.gather(*pending)

        for document, response in zip(documents, responses):
            payload = json.loads(response)
            document.metadata["nuclia"] = {
                "file": payload["file_extracted_data"][0],
                "metadata": payload["field_metadata"][0],
            }
        return documents