DocsGPT/application/parser/schema/base.py
Anton Larin 98a97f34f5 fix packaging and imports and introduce tests with pytest.
still issues with celery worker.
2023-08-14 18:20:25 +02:00

35 lines
1014 B
Python

"""Base schema for readers."""
from dataclasses import dataclass
from langchain.docstore.document import Document as LCDocument
from application.parser.schema.schema import BaseDocument
@dataclass
class Document(BaseDocument):
    """Generic document interface used by readers.

    Builds on ``BaseDocument`` by requiring the ``text`` field to be set and
    by providing conversions to and from LangChain's document type.
    """

    def __post_init__(self) -> None:
        """Validate the instance right after dataclass initialisation.

        Raises:
            ValueError: If ``text`` is ``None``.
        """
        if self.text is not None:
            return
        raise ValueError("text field not set.")

    @classmethod
    def get_type(cls) -> str:
        """Return the type name identifying this document class."""
        return "Document"

    def to_langchain_format(self) -> LCDocument:
        """Build a LangChain document from this document.

        Returns:
            An ``LCDocument`` whose ``page_content`` is this document's
            ``text`` and whose ``metadata`` is ``extra_info``.
        """
        info = self.extra_info
        if not info:
            # LangChain receives an empty dict when extra_info is missing
            # or empty.
            info = {}
        return LCDocument(page_content=self.text, metadata=info)

    @classmethod
    def from_langchain_format(cls, doc: LCDocument) -> "Document":
        """Build a document from a LangChain document.

        Args:
            doc: The LangChain document to convert.

        Returns:
            A new instance with ``text`` taken from ``doc.page_content`` and
            ``extra_info`` taken from ``doc.metadata``.
        """
        content, meta = doc.page_content, doc.metadata
        return cls(text=content, extra_info=meta)