From b8d78424abdd8d3df61310b9c0984b556253ff9e Mon Sep 17 00:00:00 2001 From: Kenzie Mihardja Date: Wed, 21 Jun 2023 19:24:04 -0700 Subject: [PATCH] Change Data Loader Namespace (#6568) Description: Update the artifact name of the xml file and the namespaces. Co-authored with @tjaffri Co-authored-by: Kenzie Mihardja --- langchain/document_loaders/docugami.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/langchain/document_loaders/docugami.py b/langchain/document_loaders/docugami.py index e30fa5f2..449b9687 100644 --- a/langchain/document_loaders/docugami.py +++ b/langchain/document_loaders/docugami.py @@ -243,7 +243,7 @@ class DocugamiLoader(BaseLoader, BaseModel): artifact_url = artifact.get("url") artifact_doc = artifact.get("document") - if artifact_name == f"{project_id}.xml" and artifact_url and artifact_doc: + if artifact_name == "report-values.xml" and artifact_url and artifact_doc: doc_id = artifact_doc["id"] metadata: Dict = {} @@ -266,11 +266,11 @@ class DocugamiLoader(BaseLoader, BaseModel): artifact_tree = etree.parse(io.BytesIO(response.content)) artifact_root = artifact_tree.getroot() ns = artifact_root.nsmap - entries = artifact_root.xpath("//wp:Entry", namespaces=ns) + entries = artifact_root.xpath("//pr:Entry", namespaces=ns) for entry in entries: - heading = entry.xpath("./wp:Heading", namespaces=ns)[0].text + heading = entry.xpath("./pr:Heading", namespaces=ns)[0].text value = " ".join( - entry.xpath("./wp:Value", namespaces=ns)[0].itertext() + entry.xpath("./pr:Value", namespaces=ns)[0].itertext() ).strip() metadata[heading] = value per_file_metadata[doc_id] = metadata