You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
langchain/libs/community/langchain_community/document_transformers/doctran_text_qa.py

62 lines
2.1 KiB
Python

from typing import Any, Optional, Sequence
from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.utils import get_from_env
class DoctranQATransformer(BaseDocumentTransformer):
    """Extract QA from text documents using doctran.

    Arguments:
        openai_api_key: OpenAI API key. Can also be specified via environment
            variable ``OPENAI_API_KEY``.
        openai_api_model: OpenAI model name. Can also be specified via
            environment variable ``OPENAI_API_MODEL``.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import DoctranQATransformer

            # Pass in openai_api_key or set env var OPENAI_API_KEY
            qa_transformer = DoctranQATransformer()
            transformed_document = qa_transformer.transform_documents(documents)
    """

    def __init__(
        self,
        openai_api_key: Optional[str] = None,
        openai_api_model: Optional[str] = None,
    ) -> None:
        """Store the OpenAI credentials and model used by doctran.

        Each value is taken from the explicit argument when it is truthy;
        otherwise it is looked up through ``get_from_env`` using the
        corresponding environment variable name.
        """
        self.openai_api_key = openai_api_key or get_from_env(
            "openai_api_key", "OPENAI_API_KEY"
        )
        self.openai_api_model = openai_api_model or get_from_env(
            "openai_api_model", "OPENAI_API_MODEL"
        )

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Asynchronous variant; not implemented.

        Raises:
            NotImplementedError: Always. Use ``transform_documents`` instead.
        """
        raise NotImplementedError

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Extracts QA from text documents using doctran.

        Each document is updated in place: the extracted value is stored
        under ``metadata["questions_and_answers"]``. The same ``documents``
        sequence that was passed in is returned.

        Raises:
            ImportError: If the ``doctran`` package is not installed.
        """
        # Keep the try body limited to the optional import so that only a
        # missing package produces the "install doctran" hint; failures while
        # constructing the client surface unchanged.
        try:
            from doctran import Doctran
        except ImportError as e:
            raise ImportError(
                "Install doctran to use this parser. (pip install doctran)"
            ) from e

        doctran = Doctran(
            openai_api_key=self.openai_api_key, openai_model=self.openai_api_model
        )
        for d in documents:
            doctran_doc = doctran.parse(content=d.page_content).interrogate().execute()
            # NOTE(review): presumably ``extracted_properties`` is dict-like, so
            # a missing key yields ``None`` here -- confirm against doctran.
            questions_and_answers = doctran_doc.extracted_properties.get(
                "questions_and_answers"
            )
            d.metadata["questions_and_answers"] = questions_and_answers
        return documents