2023-03-13 14:20:03 +00:00
|
|
|
"""Base schema for readers."""
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
|
|
from langchain.docstore.document import Document as LCDocument
|
2023-08-13 17:25:55 +00:00
|
|
|
from application.parser.schema.schema import BaseDocument
|
2023-03-13 14:20:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Document(BaseDocument):
    """Generic data document used by readers.

    Extends ``BaseDocument`` with validation of the ``text`` field and
    with helpers that convert to and from the LangChain document type.
    This document connects to data sources.
    """

    def __post_init__(self) -> None:
        """Validate the instance once the dataclass fields are populated.

        Raises:
            ValueError: If ``text`` is ``None``.
        """
        text_missing = self.text is None
        if text_missing:
            raise ValueError("text field not set.")

    @classmethod
    def get_type(cls) -> str:
        """Return the type name of this class, ``"Document"``."""
        type_name = "Document"
        return type_name

    def to_langchain_format(self) -> LCDocument:
        """Build a LangChain document from this document.

        ``text`` is passed as ``page_content`` and ``extra_info`` as
        ``metadata``. A falsy ``extra_info`` (e.g. ``None`` or an empty
        dict) is replaced by a new empty dict; otherwise the same object
        is handed over without copying.

        Returns:
            The resulting LangChain document.
        """
        extra = self.extra_info
        lc_metadata = extra if extra else {}
        return LCDocument(page_content=self.text, metadata=lc_metadata)

    @classmethod
    def from_langchain_format(cls, doc: LCDocument) -> "Document":
        """Create an instance from a LangChain document.

        Args:
            doc: LangChain document whose ``page_content`` becomes ``text``
                and whose ``metadata`` becomes ``extra_info`` (passed
                through as-is, not copied).

        Returns:
            A new instance of ``cls``.
        """
        page_text = doc.page_content
        info = doc.metadata
        return cls(text=page_text, extra_info=info)