From 3c490b5ba337f0ca80e9020bec61e410d76e4dbc Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Mon, 15 May 2023 10:53:00 -0400 Subject: [PATCH] Docugami DataLoader (#4727) ### Adds a document loader for Docugami Specifically: 1. Adds a data loader that talks to the [Docugami](http://docugami.com) API to download processed documents as semantic XML 2. Parses the semantic XML into chunks, with additional metadata capturing chunk semantics 3. Adds a detailed notebook showing how you can use additional metadata returned by Docugami for techniques like the [self-querying retriever](https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/self_query_retriever.html) 4. Adds an integration test, and related documentation Here is an example of a result that is not possible without the capabilities added by Docugami (from the notebook): image --------- Co-authored-by: Taqi Jaffri Co-authored-by: Taqi Jaffri --- docs/ecosystem/docugami.md | 25 + .../document_loaders/examples/docugami.ipynb | 427 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/docugami.py | 343 ++++++++++++++ poetry.lock | 107 ++++- pyproject.toml | 9 +- .../document_loader/loaders/__init__.py | 0 .../loaders/vendors/__init__.py | 0 .../vendors/test_data/docugami-example.xml | 336 ++++++++++++++ .../loaders/vendors/test_docugami.py | 28 ++ 10 files changed, 1269 insertions(+), 8 deletions(-) create mode 100644 docs/ecosystem/docugami.md create mode 100644 docs/modules/indexes/document_loaders/examples/docugami.ipynb create mode 100644 langchain/document_loaders/docugami.py create mode 100644 tests/unit_tests/document_loader/loaders/__init__.py create mode 100644 tests/unit_tests/document_loader/loaders/vendors/__init__.py create mode 100644 tests/unit_tests/document_loader/loaders/vendors/test_data/docugami-example.xml create mode 100644 tests/unit_tests/document_loader/loaders/vendors/test_docugami.py diff --git a/docs/ecosystem/docugami.md 
b/docs/ecosystem/docugami.md new file mode 100644 index 00000000..58c305f4 --- /dev/null +++ b/docs/ecosystem/docugami.md @@ -0,0 +1,25 @@ +# Docugami + +This page covers how to use [Docugami](https://docugami.com) within LangChain. + +## What is Docugami? + +Docugami converts business documents into a Document XML Knowledge Graph, generating forests of XML semantic trees representing entire documents. This is a rich representation that includes the semantic and structural characteristics of various chunks in the document as an XML tree. + +## Quick start + +1. Create a Docugami workspace: http://www.docugami.com (free trials available) +2. Add your documents (PDF, DOCX or DOC) and allow Docugami to ingest and cluster them into sets of similar documents, e.g. NDAs, Lease Agreements, and Service Agreements. There is no fixed set of document types supported by the system; the clusters created depend on your particular documents, and you can [change the docset assignments](https://help.docugami.com/home/working-with-the-doc-sets-view) later. +3. Create an access token via the Developer Playground for your workspace. Detailed instructions: https://help.docugami.com/home/docugami-api +4. Explore the Docugami API at https://api-docs.docugami.com/ to get a list of your processed docset IDs, or just the document IDs for a particular docset. +5. Use the DocugamiLoader as detailed in [this notebook](../modules/indexes/document_loaders/examples/docugami.ipynb), to get rich semantic chunks for your documents. +6. Optionally, build and publish one or more [reports or abstracts](https://help.docugami.com/home/reports). This helps Docugami improve the semantic XML with better tags based on your preferences, which are then added to the DocugamiLoader output as metadata. Use techniques like [self-querying retriever](https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/self_query_retriever.html) to do high accuracy Document QA. 
+ +# Advantages vs Other Chunking Techniques + +Appropriate chunking of your documents is critical for retrieval from documents. Many chunking techniques exist, including simple ones that rely on whitespace and recursive chunk splitting based on character length. Docugami offers a different approach: + +1. **Intelligent Chunking:** Docugami breaks down every document into a hierarchical semantic XML tree of chunks of varying sizes, from single words or numerical values to entire sections. These chunks follow the semantic contours of the document, providing a more meaningful representation than arbitrary length or simple whitespace-based chunking. +2. **Structured Representation:** In addition, the XML tree indicates the structural contours of every document, using attributes denoting headings, paragraphs, lists, tables, and other common elements, and does that consistently across all supported document formats, such as scanned PDFs or DOCX files. It appropriately handles long-form document characteristics like page headers/footers or multi-column flows for clean text extraction. +3. **Semantic Annotations:** Chunks are annotated with semantic tags that are coherent across the document set, facilitating consistent hierarchical queries across multiple documents, even if they are written and formatted differently. For example, in a set of lease agreements, you can easily identify key provisions like the Landlord, Tenant, or Renewal Date, as well as more complex information such as the wording of any sub-lease provision or whether a specific jurisdiction has an exception section within a Termination Clause. +4. **Additional Metadata:** Chunks are also annotated with additional metadata, if a user has been using Docugami. This additional metadata can be used for high-accuracy Document QA without context window restrictions. See detailed code walk-through in [this notebook](../modules/indexes/document_loaders/examples/docugami.ipynb). 
diff --git a/docs/modules/indexes/document_loaders/examples/docugami.ipynb b/docs/modules/indexes/document_loaders/examples/docugami.ipynb new file mode 100644 index 00000000..ecb3dce1 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/docugami.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Docugami\n", + "This notebook covers how to load documents from `Docugami`. See [here](../../../../ecosystem/docugami.md) for more details, and the advantages of using this system over alternative data loaders.\n", + "\n", + "## Prerequisites\n", + "1. Follow the Quick Start section in [this document](../../../../ecosystem/docugami.md)\n", + "2. Grab an access token for your workspace, and make sure it is set as the DOCUGAMI_API_KEY environment variable\n", + "3. Grab some docset and document IDs for your processed documents, as described here: https://help.docugami.com/home/docugami-api" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# You need the lxml package to use the DocugamiLoader\n", + "!poetry run pip -q install lxml" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from langchain.document_loaders import DocugamiLoader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Documents\n", + "\n", + "If the DOCUGAMI_API_KEY environment variable is set, there is no need to pass it in to the loader explicitly; otherwise you can pass it in as the `access_token` parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='MUTUAL NON-DISCLOSURE AGREEMENT This Mutual Non-Disclosure Agreement (this “ Agreement ”) is entered into and made effective as of April 4 , 2018 between Docugami Inc. , a Delaware corporation , whose address is 150 Lake Street South , Suite 221 , Kirkland , Washington 98033 , and Caleb Divine , an individual, whose address is 1201 Rt 300 , Newburgh NY 12550 .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'ThisMutualNon-disclosureAgreement'}),\n", + " Document(page_content='The above named parties desire to engage in discussions regarding a potential agreement or other transaction between the parties (the “Purpose”). In connection with such discussions, it may be necessary for the parties to disclose to each other certain confidential information or materials to enable them to evaluate whether to enter into such agreement or transaction.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Discussions', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Discussions'}),\n", + " Document(page_content='In consideration of the foregoing, the parties agree as follows:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Consideration', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Consideration'}),\n", + " Document(page_content='1. Confidential Information . 
For purposes of this Agreement , “ Confidential Information ” means any information or materials disclosed by one party to the other party that: (i) if disclosed in writing or in the form of tangible materials, is marked “confidential” or “proprietary” at the time of such disclosure; (ii) if disclosed orally or by visual presentation, is identified as “confidential” or “proprietary” at the time of such disclosure, and is summarized in a writing sent by the disclosing party to the receiving party within thirty ( 30 ) days after any such disclosure; or (iii) due to its nature or the circumstances of its disclosure, a person exercising reasonable business judgment would understand to be confidential or proprietary.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Purposes/docset:ConfidentialInformation-section/docset:ConfidentialInformation[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ConfidentialInformation'}),\n", + " Document(page_content=\"2. Obligations and Restrictions . Each party agrees: (i) to maintain the other party's Confidential Information in strict confidence; (ii) not to disclose such Confidential Information to any third party; and (iii) not to use such Confidential Information for any purpose except for the Purpose. 
Each party may disclose the other party’s Confidential Information to its employees and consultants who have a bona fide need to know such Confidential Information for the Purpose, but solely to the extent necessary to pursue the Purpose and for no other purpose; provided, that each such employee and consultant first executes a written agreement (or is otherwise already bound by a written agreement) that contains use and nondisclosure restrictions at least as protective of the other party’s Confidential Information as those set forth in this Agreement .\", metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Obligations/docset:ObligationsAndRestrictions-section/docset:ObligationsAndRestrictions', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ObligationsAndRestrictions'}),\n", + " Document(page_content='3. Exceptions. 
The obligations and restrictions in Section 2 will not apply to any information or materials that:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Exceptions/docset:Exceptions-section/docset:Exceptions[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Exceptions'}),\n", + " Document(page_content='(i) were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheDate/docset:TheDate', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheDate'}),\n", + " Document(page_content='(ii) were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party;', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:SuchInformation/docset:TheReceivingParty', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}),\n", + " Document(page_content='(iii) are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party;', metadata={'xpath': 
'/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheReceivingParty/docset:TheReceivingParty', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}),\n", + " Document(page_content='4. Compelled Disclosure . Nothing in this Agreement will be deemed to restrict a party from disclosing the other party’s Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Disclosure/docset:CompelledDisclosure-section/docset:CompelledDisclosure', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'CompelledDisclosure'}),\n", + " Document(page_content='5. Return of Confidential Information . 
Upon the completion or abandonment of the Purpose, and in any event upon the disclosing party’s request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the disclosing party’s Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the disclosing party’s Confidential Information .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheCompletion/docset:ReturnofConfidentialInformation-section/docset:ReturnofConfidentialInformation', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ReturnofConfidentialInformation'}),\n", + " Document(page_content='6. No Obligations . Each party retains the right to determine whether to disclose any Confidential Information to the other party.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoObligations/docset:NoObligations-section/docset:NoObligations[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoObligations'}),\n", + " Document(page_content='7. No Warranty. 
ALL CONFIDENTIAL INFORMATION IS PROVIDED BY THE DISCLOSING PARTY “AS IS ”.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoWarranty/docset:NoWarranty-section/docset:NoWarranty[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoWarranty'}),\n", + " Document(page_content='8. Term. This Agreement will remain in effect for a period of seven ( 7 ) years from the date of last disclosure of Confidential Information by either party, at which time it will terminate.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:ThisAgreement/docset:Term-section/docset:Term', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Term'}),\n", + " Document(page_content='9. Equitable Relief . Each party acknowledges that the unauthorized use or disclosure of the disclosing party’s Confidential Information may cause the disclosing party to incur irreparable harm and significant damages, the degree of which may be difficult to ascertain. 
Accordingly, each party agrees that the disclosing party will have the right to seek immediate equitable relief to enjoin any unauthorized use or disclosure of its Confidential Information , in addition to any other rights and remedies that it may have at law or otherwise.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:EquitableRelief/docset:EquitableRelief-section/docset:EquitableRelief[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'EquitableRelief'}),\n", + " Document(page_content='10. Non-compete. To the maximum extent permitted by applicable law, during the Term of this Agreement and for a period of one ( 1 ) year thereafter, Caleb Divine may not market software products or do business that directly or indirectly competes with Docugami software products .', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheMaximumExtent/docset:Non-compete-section/docset:Non-compete', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Non-compete'}),\n", + " Document(page_content='11. Miscellaneous. This Agreement will be governed and construed in accordance with the laws of the State of Washington , excluding its body of law controlling conflict of laws. This Agreement is the complete and exclusive understanding and agreement between the parties regarding the subject matter of this Agreement and supersedes all prior agreements, understandings and communications, oral or written, between the parties regarding the subject matter of this Agreement . 
If any provision of this Agreement is held invalid or unenforceable by a court of competent jurisdiction, that provision of this Agreement will be enforced to the maximum extent permissible and the other provisions of this Agreement will remain in full force and effect. Neither party may assign this Agreement , in whole or in part, by operation of law or otherwise, without the other party’s prior written consent, and any attempted assignment without such consent will be void. This Agreement may be executed in counterparts, each of which will be deemed an original, but all of which together will constitute one and the same instrument.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Accordance/docset:Miscellaneous-section/docset:Miscellaneous', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Miscellaneous'}),\n", + " Document(page_content='[SIGNATURE PAGE FOLLOWS] IN WITNESS WHEREOF, the parties hereto have executed this Mutual Non-Disclosure Agreement by their duly authorized officers or representatives as of the date first set forth above.', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:TheParties', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheParties'}),\n", + " Document(page_content='DOCUGAMI INC . 
: \\n\\n Caleb Divine : \\n\\n Signature: Signature: Name: \\n\\n Jean Paoli Name: Title: \\n\\n CEO Title:', metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:DocugamiInc/docset:DocugamiInc/xhtml:table', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': '', 'tag': 'table'})]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DOCUGAMI_API_KEY=os.environ.get('DOCUGAMI_API_KEY')\n", + "\n", + "# To load all docs in the given docset ID, just don't provide document_ids\n", + "loader = DocugamiLoader(docset_id=\"ecxqpipcoe2p\", document_ids=[\"43rj0ds7s0ur\"])\n", + "docs = loader.load()\n", + "docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `metadata` for each `Document` (really, a chunk of an actual PDF, DOC or DOCX) contains some useful additional information:\n", + "\n", + "1. **id and name:** ID and Name of the file (PDF, DOC or DOCX) the chunk is sourced from within Docugami.\n", + "2. **xpath:** XPath inside the XML representation of the document, for the chunk. Useful for source citations directly to the actual chunk inside the document XML.\n", + "3. **structure:** Structural attributes of the chunk, e.g. h1, h2, div, table, td, etc. Useful to filter out certain kinds of chunks if needed by the caller.\n", + "4. **tag:** Semantic tag for the chunk, using various generative and extractive techniques. More details here: https://github.com/docugami/DFM-benchmarks" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Use: Docugami Loader for Document QA\n", + "\n", + "You can use the Docugami Loader like a standard loader for Document QA over multiple docs, albeit with much better chunks that follow the natural contours of the document. There are many great tutorials on how to do this, e.g. [this one](https://www.youtube.com/watch?v=3yPBVii7Ct0). 
We can just use the same code, but use the `DocugamiLoader` for better chunking, instead of loading text or PDF files directly with basic splitting techniques." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!poetry run pip -q install openai tiktoken chromadb " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "from langchain.vectorstores import Chroma\n", + "from langchain.embeddings import OpenAIEmbeddings\n", + "from langchain.llms import OpenAI\n", + "from langchain.chains import RetrievalQA\n", + "\n", + "# For this example, we already have a processed docset for a set of lease documents\n", + "loader = DocugamiLoader(docset_id=\"wh2kned25uqm\")\n", + "documents = loader.load()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The documents returned by the loader are already split, so we don't need to use a text splitter. Optionally, we can use the metadata on each document, for example the structure or tag attributes, to do any post-processing we want.\n", + "\n", + "We will just use the output of the `DocugamiLoader` as-is to set up a retrieval QA chain the usual way." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using embedded DuckDB without persistence: data will be transient\n" + ] + } + ], + "source": [ + "embedding = OpenAIEmbeddings()\n", + "vectordb = Chroma.from_documents(documents=documents, embedding=embedding)\n", + "retriever = vectordb.as_retriever()\n", + "qa_chain = RetrievalQA.from_chain_type(\n", + " llm=OpenAI(), chain_type=\"stuff\", retriever=retriever, return_source_documents=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'query': 'What can tenants do with signage on their properties?',\n", + " 'result': ' Tenants may place signs (digital or otherwise) or other form of identification on the premises after receiving written permission from the landlord which shall not be unreasonably withheld. The tenant is responsible for any damage caused to the premises and must conform to any applicable laws, ordinances, etc. governing the same. The tenant must also remove and clean any window or glass identification promptly upon vacating the premises.',\n", + " 'source_documents': [Document(page_content='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. 
Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', metadata={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage', 'id': 'v1bvgaozfkak', 'name': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage', 'Landlord': 'BUBBA CENTER PARTNERSHIP', 'Tenant': 'Truetone Lane LLC'}),\n", + " Document(page_content='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. 
\\n\\n ARTICLE VII UTILITIES 7.01', metadata={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk', 'id': 'g2fvhekmltza', 'name': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk', 'Landlord': 'GLORY ROAD LLC', 'Tenant': 'Truetone Lane LLC'}),\n", + " Document(page_content='Landlord , its agents, servants, employees, licensees, invitees, and contractors during the last year of the term of this Lease at any and all times during regular business hours, after 24 hour notice to tenant, to pass and repass on and through the Premises, or such portion thereof as may be necessary, in order that they or any of them may gain access to the Premises for the purpose of showing the Premises to potential new tenants or real estate brokers. In addition, Landlord shall be entitled to place a \"FOR RENT \" or \"FOR LEASE\" sign (not exceeding 8.5 ” x 11 ”) in the front window of the Premises during the last six months of the term of this Lease .', metadata={'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42FLandlordSAccess-section/docset:_42FLandlordSAccess/docset:LandlordsRights/docset:Landlord', 'id': 'omvs4mysdk6b', 'name': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'Landlord', 'Landlord': 'BIRCH STREET , LLC', 'Tenant': 'Trutone Lane LLC'}),\n", + " Document(page_content=\"24. SIGNS . No signage shall be placed by Tenant on any portion of the Project . 
However, Tenant shall be permitted to place a sign bearing its name in a location approved by Landlord near the entrance to the Premises (at Tenant's cost ) and will be furnished a single listing of its name in the Building's directory (at Landlord 's cost ), all in accordance with the criteria adopted from time to time by Landlord for the Project . Any changes or additional listings in the directory shall be furnished (subject to availability of space) for the then Building Standard charge .\", metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:TheTerms/docset:Indemnification/docset:INDEMNIFICATION-section/docset:INDEMNIFICATION/docset:Waiver/docset:Waiver/docset:Signs/docset:SIGNS-section/docset:SIGNS', 'id': 'qkn9cyqsiuch', 'name': 'Shorebucks LLC_AZ.pdf', 'structure': 'div', 'tag': 'SIGNS', 'Landlord': 'Menlo Group', 'Tenant': 'Shorebucks LLC'})]}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Try out the retriever with an example query\n", + "qa_chain(\"What can tenants do with signage on their properties?\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Docugami to Add Metadata to Chunks for High Accuracy Document QA\n", + "\n", + "One issue with large documents is that the correct answer to your question may depend on chunks that are far apart in the document. Typical chunking techniques, even with overlap, will struggle with providing the LLM sufficient context to answer such questions. 
With upcoming very large context LLMs, it may be possible to stuff a lot of tokens, perhaps even entire documents, inside the context but this will still hit limits at some point with very long documents, or a lot of documents.\n", + "\n", + "For example, if we ask a more complex question that requires the LLM to draw on chunks from different parts of the document, even OpenAI's powerful LLM is unable to answer correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' 9,753 square feet'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain_response = qa_chain(\"What is rentable area for the property owned by DHA Group?\")\n", + "chain_response[\"result\"] # the correct answer should be 13,500" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At first glance the answer may seem reasonable, but if you review the source chunks carefully for this answer, you will see that the chunking of the document did not end up putting the Landlord name and the rentable area in the same context, since they are far apart in the document. The retriever therefore ends up finding unrelated chunks from other documents not even related to the **DHA Group** landlord. That landlord happens to be mentioned on the first page of the **Shorebucks LLC_NJ.pdf** file, and while one of the source chunks used by the chain is indeed from that doc, which contains the correct answer (**13,500**), other source chunks from different docs are included, and the answer is therefore incorrect." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='1.1 Landlord . 
DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n", + " Document(page_content='WITNESSES: LANDLORD: DHA Group , a Delaware limited liability company', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Guaranty-section/docset:Guaranty[2]/docset:SIGNATURESONNEXTPAGE-section/docset:INWITNESSWHEREOF-section/docset:INWITNESSWHEREOF/docset:Behalf/docset:Witnesses/xhtml:table/xhtml:tbody/xhtml:tr[3]/xhtml:td[2]/docset:DhaGroup', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'DhaGroup', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n", + " Document(page_content=\"1.16 Landlord 's Notice Address . 
DHA Group , Suite 1010 , 111 Bauer Dr , Oakland , New Jersey , 07436 , with a copy to the Building Management Office at the Project , Attention: On - Site Property Manager .\", metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:NoticeAddress[2]/docset:LandlordsNoticeAddress-section/docset:LandlordsNoticeAddress[2]', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'LandlordsNoticeAddress', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n", + " Document(page_content='1.6 Rentable Area of the Premises. 9,753 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:PerryBlair/docset:PerryBlair/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'id': 'dsyfhh4vpeyf', 'name': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain_response[\"source_documents\"]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Docugami can help here. 
Chunks are annotated with additional metadata created using different techniques if a user has been [using Docugami](https://help.docugami.com/home/reports). More technical approaches will be added later.\n", + "\n", + "Specifically, let's look at the additional metadata that is returned on the documents returned by docugami, in the form of some simple key/value pairs on all the text chunks:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOfficeLeaseAgreement',\n", + " 'id': 'v1bvgaozfkak',\n", + " 'name': 'TruTone Lane 2.docx',\n", + " 'structure': 'p',\n", + " 'tag': 'ThisOfficeLeaseAgreement',\n", + " 'Landlord': 'BUBBA CENTER PARTNERSHIP',\n", + " 'Tenant': 'Truetone Lane LLC'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader = DocugamiLoader(docset_id=\"wh2kned25uqm\")\n", + "documents = loader.load()\n", + "documents[0].metadata" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use a [self-querying retriever](../../retrievers/examples/self_query_retriever.ipynb) to improve our query accuracy, using this additional metadata:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using embedded DuckDB without persistence: data will be transient\n" + ] + } + ], + "source": [ + "from langchain.chains.query_constructor.schema import AttributeInfo\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "\n", + "EXCLUDE_KEYS = [\"id\", \"xpath\", \"structure\"]\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=key,\n", + " description=f\"The {key} for this chunk\",\n", + " type=\"string\",\n", + " )\n", + " for key in 
documents[0].metadata\n", + " if key.lower() not in EXCLUDE_KEYS\n", + "]\n", + "\n", + "\n", + "document_content_description = \"Contents of this chunk\"\n", + "llm = OpenAI(temperature=0)\n", + "vectordb = Chroma.from_documents(documents=documents, embedding=embedding)\n", + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, vectordb, document_content_description, metadata_field_info, verbose=True\n", + ")\n", + "qa_chain = RetrievalQA.from_chain_type(\n", + " llm=OpenAI(), chain_type=\"stuff\", retriever=retriever, return_source_documents=True\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's run the same question again. It returns the correct result since all the chunks have metadata key/value pairs on them carrying key information about the document even if this information is physically very far away from the source chunk used to generate the answer." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='rentable area' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='Landlord', value='DHA Group')\n" + ] + }, + { + "data": { + "text/plain": [ + "{'query': 'What is rentable area for the property owned by DHA Group?',\n", + " 'result': ' 13,500 square feet.',\n", + " 'source_documents': [Document(page_content='1.1 Landlord . 
DHA Group , a Delaware limited liability company authorized to transact business in New Jersey .', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:DhaGroup/docset:Landlord-section/docset:DhaGroup', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'DhaGroup', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n", + " Document(page_content='WITNESSES: LANDLORD: DHA Group , a Delaware limited liability company', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Guaranty-section/docset:Guaranty[2]/docset:SIGNATURESONNEXTPAGE-section/docset:INWITNESSWHEREOF-section/docset:INWITNESSWHEREOF/docset:Behalf/docset:Witnesses/xhtml:table/xhtml:tbody/xhtml:tr[3]/xhtml:td[2]/docset:DhaGroup', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'p', 'tag': 'DhaGroup', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n", + " Document(page_content=\"1.16 Landlord 's Notice Address . 
DHA Group , Suite 1010 , 111 Bauer Dr , Oakland , New Jersey , 07436 , with a copy to the Building Management Office at the Project , Attention: On - Site Property Manager .\", metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:PercentageRent/docset:NoticeAddress[2]/docset:LandlordsNoticeAddress-section/docset:LandlordsNoticeAddress[2]', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'LandlordsNoticeAddress', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'}),\n", + " Document(page_content='1.6 Rentable Area of the Premises. 13,500 square feet . This square footage figure includes an add-on factor for Common Areas in the Building and has been agreed upon by the parties as final and correct and is not subject to challenge or dispute by either party.', metadata={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:TheTerms/dg:chunk/docset:BasicLeaseInformation/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS-section/docset:BASICLEASEINFORMATIONANDDEFINEDTERMS/docset:DhaGroup/docset:DhaGroup/docset:Premises[2]/docset:RentableAreaofthePremises-section/docset:RentableAreaofthePremises', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'RentableAreaofthePremises', 'Landlord': 'DHA Group', 'Tenant': 'Shorebucks LLC'})]}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qa_chain(\"What is rentable area for the property owned by DHA Group?\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time the answer is correct, since the self-querying retriever created a filter on the landlord 
attribute of the metadata, correctly filtering to the document that is specifically about the DHA Group landlord. The resulting source chunks are all relevant to this landlord, and this improves answer accuracy even though the landlord is not directly mentioned in the specific chunk that contains the correct answer." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index be3500cb..271afda2 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -23,6 +23,7 @@ from langchain.document_loaders.dataframe import DataFrameLoader from langchain.document_loaders.diffbot import DiffbotLoader from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.discord import DiscordChatLoader +from langchain.document_loaders.docugami import DocugamiLoader from langchain.document_loaders.duckdb_loader import DuckDBLoader from langchain.document_loaders.email import ( OutlookMessageLoader, @@ -136,6 +137,7 @@ __all__ = [ "DiffbotLoader", "DirectoryLoader", "DiscordChatLoader", + "DocugamiLoader", "Docx2txtLoader", "DuckDBLoader", "EverNoteLoader", diff --git a/langchain/document_loaders/docugami.py b/langchain/document_loaders/docugami.py new file mode 100644 index 00000000..41997760 --- /dev/null +++ b/langchain/document_loaders/docugami.py @@ -0,0 +1,343 @@ +"""Loader that loads processed documents from Docugami.""" + +import io +import logging +import os +import re +from pathlib import Path +from typing import Any, Dict, 
List, Mapping, Optional, Sequence + +import requests +from pydantic import BaseModel, root_validator + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + +TD_NAME = "{http://www.w3.org/1999/xhtml}td" +TABLE_NAME = "{http://www.w3.org/1999/xhtml}table" + +XPATH_KEY = "xpath" +DOCUMENT_ID_KEY = "id" +DOCUMENT_NAME_KEY = "name" +STRUCTURE_KEY = "structure" +TAG_KEY = "tag" +PROJECTS_KEY = "projects" + +DEFAULT_API_ENDPOINT = "https://api.docugami.com/v1preview1" + +logger = logging.getLogger(__name__) + + +class DocugamiLoader(BaseLoader, BaseModel): + """Loader that loads processed docs from Docugami. + + To use, you should have the ``lxml`` python package installed. + """ + + api: str = DEFAULT_API_ENDPOINT + + access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY") + docset_id: Optional[str] + document_ids: Optional[Sequence[str]] + file_paths: Optional[Sequence[Path]] + min_chunk_size: int = 32 # appended to the next chunk to avoid over-chunking + + @root_validator + def validate_local_or_remote(cls, values: Dict[str, Any]) -> Dict[str, Any]: + """Validate that either local file paths are given, or remote API docset ID.""" + if values.get("file_paths") and values.get("docset_id"): + raise ValueError("Cannot specify both file_paths and remote API docset_id") + + if not values.get("file_paths") and not values.get("docset_id"): + raise ValueError("Must specify either file_paths or remote API docset_id") + + if values.get("docset_id") and not values.get("access_token"): + raise ValueError("Must specify access token if using remote API docset_id") + + return values + + def _parse_dgml( + self, document: Mapping, content: bytes, doc_metadata: Optional[Mapping] = None + ) -> List[Document]: + """Parse a single DGML document into a list of Documents.""" + try: + from lxml import etree + except ImportError: + raise ValueError( + "Could not import lxml python package. 
" + "Please install it with `pip install lxml`." + ) + + # helpers + def _xpath_qname_for_chunk(chunk: Any) -> str: + """Get the xpath qname for a chunk.""" + qname = f"{chunk.prefix}:{chunk.tag.split('}')[-1]}" + + parent = chunk.getparent() + if parent is not None: + doppelgangers = [x for x in parent if x.tag == chunk.tag] + if len(doppelgangers) > 1: + idx_of_self = doppelgangers.index(chunk) + qname = f"{qname}[{idx_of_self + 1}]" + + return qname + + def _xpath_for_chunk(chunk: Any) -> str: + """Get the xpath for a chunk.""" + ancestor_chain = chunk.xpath("ancestor-or-self::*") + return "/" + "/".join(_xpath_qname_for_chunk(x) for x in ancestor_chain) + + def _structure_value(node: Any) -> str: + """Get the structure value for a node.""" + structure = ( + "table" + if node.tag == TABLE_NAME + else node.attrib["structure"] + if "structure" in node.attrib + else None + ) + return structure + + def _is_structural(node: Any) -> bool: + """Check if a node is structural.""" + return _structure_value(node) is not None + + def _is_heading(node: Any) -> bool: + """Check if a node is a heading.""" + structure = _structure_value(node) + return structure is not None and structure.lower().startswith("h") + + def _get_text(node: Any) -> str: + """Get the text of a node.""" + return " ".join(node.itertext()).strip() + + def _has_structural_descendant(node: Any) -> bool: + """Check if a node has a structural descendant.""" + for child in node: + if _is_structural(child) or _has_structural_descendant(child): + return True + return False + + def _leaf_structural_nodes(node: Any) -> List: + """Get the leaf structural nodes of a node.""" + if _is_structural(node) and not _has_structural_descendant(node): + return [node] + else: + leaf_nodes = [] + for child in node: + leaf_nodes.extend(_leaf_structural_nodes(child)) + return leaf_nodes + + def _create_doc(node: Any, text: str) -> Document: + """Create a Document from a node and text.""" + metadata = { + XPATH_KEY: 
_xpath_for_chunk(node), + DOCUMENT_ID_KEY: document["id"], + DOCUMENT_NAME_KEY: document["name"], + STRUCTURE_KEY: node.attrib.get("structure", ""), + TAG_KEY: re.sub(r"\{.*\}", "", node.tag), + } + + if doc_metadata: + metadata.update(doc_metadata) + + return Document( + page_content=text, + metadata=metadata, + ) + + # parse the tree and return chunks + tree = etree.parse(io.BytesIO(content)) + root = tree.getroot() + + chunks: List[Document] = [] + prev_small_chunk_text = None + for node in _leaf_structural_nodes(root): + text = _get_text(node) + if prev_small_chunk_text: + text = prev_small_chunk_text + " " + text + prev_small_chunk_text = None + + if _is_heading(node) or len(text) < self.min_chunk_size: + # Save headings or other small chunks to be appended to the next chunk + prev_small_chunk_text = text + else: + chunks.append(_create_doc(node, text)) + + if prev_small_chunk_text and len(chunks) > 0: + # small chunk at the end left over, just append to last chunk + chunks[-1].page_content += " " + prev_small_chunk_text + + return chunks + + def _document_details_for_docset_id(self, docset_id: str) -> List[Dict]: + """Gets all document details for the given docset ID""" + url = f"{self.api}/docsets/{docset_id}/documents" + all_documents = [] + + while url: + response = requests.get( + url, + headers={"Authorization": f"Bearer {self.access_token}"}, + ) + if response.ok: + data = response.json() + all_documents.extend(data["documents"]) + url = data.get("next", None) + else: + raise Exception( + f"Failed to download {url} (status: {response.status_code})" + ) + + return all_documents + + def _project_details_for_docset_id(self, docset_id: str) -> List[Dict]: + """Gets all project details for the given docset ID""" + url = f"{self.api}/projects?docset.id={docset_id}" + all_projects = [] + + while url: + response = requests.request( + "GET", + url, + headers={"Authorization": f"Bearer {self.access_token}"}, + data={}, + ) + if response.ok: + data = 
response.json() + all_projects.extend(data["projects"]) + url = data.get("next", None) + else: + raise Exception( + f"Failed to download {url} (status: {response.status_code})" + ) + + return all_projects + + def _metadata_for_project(self, project: Dict) -> Dict: + """Gets project metadata for all files""" + project_id = project.get("id") + + url = f"{self.api}/projects/{project_id}/artifacts/latest" + all_artifacts = [] + + while url: + response = requests.request( + "GET", + url, + headers={"Authorization": f"Bearer {self.access_token}"}, + data={}, + ) + if response.ok: + data = response.json() + all_artifacts.extend(data["artifacts"]) + url = data.get("next", None) + else: + raise Exception( + f"Failed to download {url} (status: {response.status_code})" + ) + + per_file_metadata = {} + for artifact in all_artifacts: + artifact_name = artifact.get("name") + artifact_url = artifact.get("url") + artifact_doc = artifact.get("document") + + if artifact_name == f"{project_id}.xml" and artifact_url and artifact_doc: + doc_id = artifact_doc["id"] + metadata: Dict = {} + + # the evaluated XML for each document is named after the project + response = requests.request( + "GET", + f"{artifact_url}/content", + headers={"Authorization": f"Bearer {self.access_token}"}, + data={}, + ) + + if response.ok: + try: + from lxml import etree + except ImportError: + raise ValueError( + "Could not import lxml python package. " + "Please install it with `pip install lxml`." 
+ ) + artifact_tree = etree.parse(io.BytesIO(response.content)) + artifact_root = artifact_tree.getroot() + ns = artifact_root.nsmap + entries = artifact_root.xpath("//wp:Entry", namespaces=ns) + for entry in entries: + heading = entry.xpath("./wp:Heading", namespaces=ns)[0].text + value = " ".join( + entry.xpath("./wp:Value", namespaces=ns)[0].itertext() + ).strip() + metadata[heading] = value + per_file_metadata[doc_id] = metadata + else: + raise Exception( + f"Failed to download {artifact_url}/content " + + f"(status: {response.status_code})" + ) + + return per_file_metadata + + def _load_chunks_for_document( + self, docset_id: str, document: Dict, doc_metadata: Optional[Dict] = None + ) -> List[Document]: + """Load chunks for a document.""" + document_id = document["id"] + url = f"{self.api}/docsets/{docset_id}/documents/{document_id}/dgml" + + response = requests.request( + "GET", + url, + headers={"Authorization": f"Bearer {self.access_token}"}, + data={}, + ) + + if response.ok: + return self._parse_dgml(document, response.content, doc_metadata) + else: + raise Exception( + f"Failed to download {url} (status: {response.status_code})" + ) + + def load(self) -> List[Document]: + """Load documents.""" + chunks: List[Document] = [] + + if self.access_token and self.docset_id: + # remote mode + _document_details = self._document_details_for_docset_id(self.docset_id) + if self.document_ids: + _document_details = [ + d for d in _document_details if d["id"] in self.document_ids + ] + + _project_details = self._project_details_for_docset_id(self.docset_id) + combined_project_metadata = {} + if _project_details: + # if there are any projects for this docset, load project metadata + for project in _project_details: + metadata = self._metadata_for_project(project) + combined_project_metadata.update(metadata) + + for doc in _document_details: + doc_metadata = combined_project_metadata.get(doc["id"]) + chunks += self._load_chunks_for_document( + self.docset_id, doc, 
doc_metadata + ) + elif self.file_paths: + # local mode (for integration testing, or pre-downloaded XML) + for path in self.file_paths: + with open(path, "rb") as file: + chunks += self._parse_dgml( + { + DOCUMENT_ID_KEY: path.name, + DOCUMENT_NAME_KEY: path.name, + }, + file.read(), + ) + + return chunks diff --git a/poetry.lock b/poetry.lock index 688b0364..bf5dc214 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "absl-py" @@ -3735,6 +3735,99 @@ win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] dev = ["Sphinx (==5.3.0)", "colorama (==0.4.5)", "colorama (==0.4.6)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v0.990)", "pre-commit (==3.2.1)", "pytest (==6.1.2)", "pytest (==7.2.1)", "pytest-cov (==2.12.1)", "pytest-cov (==4.0.0)", "pytest-mypy-plugins (==1.10.1)", "pytest-mypy-plugins (==1.9.3)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.2.0)", "tox (==3.27.1)", "tox (==4.4.6)"] +[[package]] +name = "lxml" +version = "4.9.2" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+category = "main" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" +files = [ + {file = "lxml-4.9.2-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2"}, + {file = "lxml-4.9.2-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b1f42b6921d0e81b1bcb5e395bc091a70f41c4d4e55ba99c6da2b31626c44892"}, + {file = "lxml-4.9.2-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:9f102706d0ca011de571de32c3247c6476b55bb6bc65a20f682f000b07a4852a"}, + {file = "lxml-4.9.2-cp27-cp27m-win32.whl", hash = "sha256:8d0b4612b66ff5d62d03bcaa043bb018f74dfea51184e53f067e6fdcba4bd8de"}, + {file = "lxml-4.9.2-cp27-cp27m-win_amd64.whl", hash = "sha256:4c8f293f14abc8fd3e8e01c5bd86e6ed0b6ef71936ded5bf10fe7a5efefbaca3"}, + {file = "lxml-4.9.2-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2899456259589aa38bfb018c364d6ae7b53c5c22d8e27d0ec7609c2a1ff78b50"}, + {file = "lxml-4.9.2-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6749649eecd6a9871cae297bffa4ee76f90b4504a2a2ab528d9ebe912b101975"}, + {file = "lxml-4.9.2-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a08cff61517ee26cb56f1e949cca38caabe9ea9fbb4b1e10a805dc39844b7d5c"}, + {file = "lxml-4.9.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:85cabf64adec449132e55616e7ca3e1000ab449d1d0f9d7f83146ed5bdcb6d8a"}, + {file = "lxml-4.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8340225bd5e7a701c0fa98284c849c9b9fc9238abf53a0ebd90900f25d39a4e4"}, + {file = "lxml-4.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:1ab8f1f932e8f82355e75dda5413a57612c6ea448069d4fb2e217e9a4bed13d4"}, + {file = "lxml-4.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:699a9af7dffaf67deeae27b2112aa06b41c370d5e7633e0ee0aea2e0b6c211f7"}, + {file = "lxml-4.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b9cc34af337a97d470040f99ba4282f6e6bac88407d021688a5d585e44a23184"}, + {file = "lxml-4.9.2-cp310-cp310-win32.whl", hash = "sha256:d02a5399126a53492415d4906ab0ad0375a5456cc05c3fc0fc4ca11771745cda"}, + {file = "lxml-4.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:a38486985ca49cfa574a507e7a2215c0c780fd1778bb6290c21193b7211702ab"}, + {file = "lxml-4.9.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c83203addf554215463b59f6399835201999b5e48019dc17f182ed5ad87205c9"}, + {file = "lxml-4.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:2a87fa548561d2f4643c99cd13131acb607ddabb70682dcf1dff5f71f781a4bf"}, + {file = "lxml-4.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:d6b430a9938a5a5d85fc107d852262ddcd48602c120e3dbb02137c83d212b380"}, + {file = "lxml-4.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3efea981d956a6f7173b4659849f55081867cf897e719f57383698af6f618a92"}, + {file = "lxml-4.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:df0623dcf9668ad0445e0558a21211d4e9a149ea8f5666917c8eeec515f0a6d1"}, + {file = "lxml-4.9.2-cp311-cp311-win32.whl", hash = "sha256:da248f93f0418a9e9d94b0080d7ebc407a9a5e6d0b57bb30db9b5cc28de1ad33"}, + {file = "lxml-4.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:3818b8e2c4b5148567e1b09ce739006acfaa44ce3156f8cbbc11062994b8e8dd"}, + {file = "lxml-4.9.2-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ca989b91cf3a3ba28930a9fc1e9aeafc2a395448641df1f387a2d394638943b0"}, + {file = "lxml-4.9.2-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:822068f85e12a6e292803e112ab876bc03ed1f03dddb80154c395f891ca6b31e"}, + {file = "lxml-4.9.2-cp35-cp35m-win32.whl", hash = 
"sha256:be7292c55101e22f2a3d4d8913944cbea71eea90792bf914add27454a13905df"}, + {file = "lxml-4.9.2-cp35-cp35m-win_amd64.whl", hash = "sha256:998c7c41910666d2976928c38ea96a70d1aa43be6fe502f21a651e17483a43c5"}, + {file = "lxml-4.9.2-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:b26a29f0b7fc6f0897f043ca366142d2b609dc60756ee6e4e90b5f762c6adc53"}, + {file = "lxml-4.9.2-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:ab323679b8b3030000f2be63e22cdeea5b47ee0abd2d6a1dc0c8103ddaa56cd7"}, + {file = "lxml-4.9.2-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:689bb688a1db722485e4610a503e3e9210dcc20c520b45ac8f7533c837be76fe"}, + {file = "lxml-4.9.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:f49e52d174375a7def9915c9f06ec4e569d235ad428f70751765f48d5926678c"}, + {file = "lxml-4.9.2-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:36c3c175d34652a35475a73762b545f4527aec044910a651d2bf50de9c3352b1"}, + {file = "lxml-4.9.2-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a35f8b7fa99f90dd2f5dc5a9fa12332642f087a7641289ca6c40d6e1a2637d8e"}, + {file = "lxml-4.9.2-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:58bfa3aa19ca4c0f28c5dde0ff56c520fbac6f0daf4fac66ed4c8d2fb7f22e74"}, + {file = "lxml-4.9.2-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc718cd47b765e790eecb74d044cc8d37d58562f6c314ee9484df26276d36a38"}, + {file = "lxml-4.9.2-cp36-cp36m-win32.whl", hash = "sha256:d5bf6545cd27aaa8a13033ce56354ed9e25ab0e4ac3b5392b763d8d04b08e0c5"}, + {file = "lxml-4.9.2-cp36-cp36m-win_amd64.whl", hash = "sha256:3ab9fa9d6dc2a7f29d7affdf3edebf6ece6fb28a6d80b14c3b2fb9d39b9322c3"}, + {file = "lxml-4.9.2-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:05ca3f6abf5cf78fe053da9b1166e062ade3fa5d4f92b4ed688127ea7d7b1d03"}, + {file = "lxml-4.9.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = 
"sha256:a5da296eb617d18e497bcf0a5c528f5d3b18dadb3619fbdadf4ed2356ef8d941"}, + {file = "lxml-4.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726"}, + {file = "lxml-4.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:c9ec3eaf616d67db0764b3bb983962b4f385a1f08304fd30c7283954e6a7869b"}, + {file = "lxml-4.9.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2a29ba94d065945944016b6b74e538bdb1751a1db6ffb80c9d3c2e40d6fa9894"}, + {file = "lxml-4.9.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a82d05da00a58b8e4c0008edbc8a4b6ec5a4bc1e2ee0fb6ed157cf634ed7fa45"}, + {file = "lxml-4.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:223f4232855ade399bd409331e6ca70fb5578efef22cf4069a6090acc0f53c0e"}, + {file = "lxml-4.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d17bc7c2ccf49c478c5bdd447594e82692c74222698cfc9b5daae7ae7e90743b"}, + {file = "lxml-4.9.2-cp37-cp37m-win32.whl", hash = "sha256:b64d891da92e232c36976c80ed7ebb383e3f148489796d8d31a5b6a677825efe"}, + {file = "lxml-4.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:a0a336d6d3e8b234a3aae3c674873d8f0e720b76bc1d9416866c41cd9500ffb9"}, + {file = "lxml-4.9.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:da4dd7c9c50c059aba52b3524f84d7de956f7fef88f0bafcf4ad7dde94a064e8"}, + {file = "lxml-4.9.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:821b7f59b99551c69c85a6039c65b75f5683bdc63270fec660f75da67469ca24"}, + {file = "lxml-4.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:e5168986b90a8d1f2f9dc1b841467c74221bd752537b99761a93d2d981e04889"}, + {file = "lxml-4.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = 
"sha256:8e20cb5a47247e383cf4ff523205060991021233ebd6f924bca927fcf25cf86f"}, + {file = "lxml-4.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:13598ecfbd2e86ea7ae45ec28a2a54fb87ee9b9fdb0f6d343297d8e548392c03"}, + {file = "lxml-4.9.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:880bbbcbe2fca64e2f4d8e04db47bcdf504936fa2b33933efd945e1b429bea8c"}, + {file = "lxml-4.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7d2278d59425777cfcb19735018d897ca8303abe67cc735f9f97177ceff8027f"}, + {file = "lxml-4.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5344a43228767f53a9df6e5b253f8cdca7dfc7b7aeae52551958192f56d98457"}, + {file = "lxml-4.9.2-cp38-cp38-win32.whl", hash = "sha256:925073b2fe14ab9b87e73f9a5fde6ce6392da430f3004d8b72cc86f746f5163b"}, + {file = "lxml-4.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:9b22c5c66f67ae00c0199f6055705bc3eb3fcb08d03d2ec4059a2b1b25ed48d7"}, + {file = "lxml-4.9.2-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:5f50a1c177e2fa3ee0667a5ab79fdc6b23086bc8b589d90b93b4bd17eb0e64d1"}, + {file = "lxml-4.9.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:090c6543d3696cbe15b4ac6e175e576bcc3f1ccfbba970061b7300b0c15a2140"}, + {file = "lxml-4.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:63da2ccc0857c311d764e7d3d90f429c252e83b52d1f8f1d1fe55be26827d1f4"}, + {file = "lxml-4.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:5b4545b8a40478183ac06c073e81a5ce4cf01bf1734962577cf2bb569a5b3bbf"}, + {file = "lxml-4.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2e430cd2824f05f2d4f687701144556646bae8f249fd60aa1e4c768ba7018947"}, + {file = "lxml-4.9.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6804daeb7ef69e7b36f76caddb85cccd63d0c56dedb47555d2fc969e2af6a1a5"}, + {file = 
"lxml-4.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a6e441a86553c310258aca15d1c05903aaf4965b23f3bc2d55f200804e005ee5"}, + {file = "lxml-4.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ca34efc80a29351897e18888c71c6aca4a359247c87e0b1c7ada14f0ab0c0fb2"}, + {file = "lxml-4.9.2-cp39-cp39-win32.whl", hash = "sha256:6b418afe5df18233fc6b6093deb82a32895b6bb0b1155c2cdb05203f583053f1"}, + {file = "lxml-4.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:f1496ea22ca2c830cbcbd473de8f114a320da308438ae65abad6bab7867fe38f"}, + {file = "lxml-4.9.2-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:b264171e3143d842ded311b7dccd46ff9ef34247129ff5bf5066123c55c2431c"}, + {file = "lxml-4.9.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0dc313ef231edf866912e9d8f5a042ddab56c752619e92dfd3a2c277e6a7299a"}, + {file = "lxml-4.9.2-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:16efd54337136e8cd72fb9485c368d91d77a47ee2d42b057564aae201257d419"}, + {file = "lxml-4.9.2-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0f2b1e0d79180f344ff9f321327b005ca043a50ece8713de61d1cb383fb8ac05"}, + {file = "lxml-4.9.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:7b770ed79542ed52c519119473898198761d78beb24b107acf3ad65deae61f1f"}, + {file = "lxml-4.9.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efa29c2fe6b4fdd32e8ef81c1528506895eca86e1d8c4657fda04c9b3786ddf9"}, + {file = "lxml-4.9.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7e91ee82f4199af8c43d8158024cbdff3d931df350252288f0d4ce656df7f3b5"}, + {file = "lxml-4.9.2-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:b23e19989c355ca854276178a0463951a653309fb8e57ce674497f2d9f208746"}, + {file = 
"lxml-4.9.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7"}, + {file = "lxml-4.9.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7b515674acfdcadb0eb5d00d8a709868173acece5cb0be3dd165950cbfdf5409"}, + {file = "lxml-4.9.2.tar.gz", hash = "sha256:2455cfaeb7ac70338b3257f41e21f0724f4b5b0c0e7702da67ee6c3640835b67"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=0.29.7)"] + [[package]] name = "lz4" version = "4.3.2" @@ -9994,18 +10087,18 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"] -azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] +all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "hnswlib", "html2text", "huggingface_hub", "jina", 
"jinja2", "jq", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "protobuf", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] +azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] -extended-testing = ["pypdf", "pdfminer-six", "tqdm", "jq"] -hnswlib = ["docarray", "protobuf", "hnswlib"] +extended-testing = ["jq", "lxml", "pdfminer-six", "pypdf", "tqdm"] +hnswlib = ["docarray", "hnswlib", "protobuf"] in-memory-store = ["docarray"] -llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] +llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "42b518704c39bc25c6da05f81a9488a9a6fecfd7784b3c9915d30127ce384a63" +content-hash = "c84dcaf4bf2fb334d81cacfdfc5ca7f22924f07c2adc479f92d05c73c3fbeee1" diff --git a/pyproject.toml b/pyproject.toml index 3c141f54..b1c81881 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,7 @@ pdfminer-six = {version = "^20221105", optional = true} docarray = {version="^0.31.0", optional=true} protobuf = {version="3.19", optional=true} hnswlib = {version="^0.7.0", optional=true} +lxml = {version = "^4.9.2", optional = true} [tool.poetry.group.docs.dependencies] @@ -170,8 +171,14 @@ embeddings = ["sentence-transformers"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", 
"opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"] # An extra used to be able to add extended testing. +# Please use new-line on formatting to make it easier to add new packages without +# merge-conflicts extended_testing = [ - "pypdf", "pdfminer.six", "tqdm", "jq" + "jq", + "pdfminer.six", + "pypdf", + "tqdm", + "lxml", ] [tool.ruff] diff --git a/tests/unit_tests/document_loader/loaders/__init__.py b/tests/unit_tests/document_loader/loaders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit_tests/document_loader/loaders/vendors/__init__.py b/tests/unit_tests/document_loader/loaders/vendors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit_tests/document_loader/loaders/vendors/test_data/docugami-example.xml b/tests/unit_tests/document_loader/loaders/vendors/test_data/docugami-example.xml new file mode 100644 index 00000000..ec0a27c0 --- /dev/null +++ b/tests/unit_tests/document_loader/loaders/vendors/test_data/docugami-example.xml @@ -0,0 +1,336 @@ + + + + MUTUAL NON-DISCLOSURE AGREEMENT + + + + + This + Mutual Non-Disclosure Agreement (this “ + Agreement”) is entered into and made effective as of + 2/4/2018 between + Docugami Inc., a + Delaware corporation, whose address is 150 Lake Street South, Suite 221, Kirkland, + Washington + Delaware corporation, whose 
address is + + + + 150 + Lake Street South + , + + Suite + 221 + + , + Kirkland, + Washington + 98033 + , and + Leonarda Hosler, an individual, whose address is + + + 374 + William S Canning Blvd + , + + Fall River + MA + + + 2721 + 374 William S Canning Blvd, Fall River MA 2721. + + + + The above named parties desire to engage in discussions regarding a potential agreement or other transaction between the parties (the “Purpose”). In connection with such discussions, it may be necessary for the parties to disclose to each other certain confidential information or materials to enable them to evaluate whether to enter into such agreement or transaction. + + + + In consideration of the foregoing, the parties agree as follows: + + + 1. + + + Confidential Information. + + For purposes of this + Agreement, “ + Confidential Information” means any information or materials disclosed by + + one party + to the other party that: (i) if disclosed in writing or in the form of tangible materials, is marked “confidential” or “proprietary” at the time of such disclosure; (ii) if disclosed orally or by visual presentation, is identified as “confidential” or “proprietary” at the time of such disclosure, and is summarized in a writing sent by the disclosing party to the receiving party within + + thirty ( + 30) days + after any such disclosure; or (iii) due to its nature or the circumstances of its disclosure, a person exercising reasonable business judgment would understand to be confidential or proprietary. + + + + + 2. + + Obligations and + Restrictions. + + Each party agrees: (i) to maintain the + other party's Confidential Information in strict confidence; (ii) not to disclose + such Confidential Information to any third party; and (iii) not to use + such Confidential Information for any purpose except for the Purpose. 
Each party may disclose the + other party’s Confidential Information to its employees and consultants who have a bona fide need to know + such Confidential Information for the Purpose, but solely to the extent necessary to pursue the + Purpose and for no other purpose; provided, that each such employee and consultant first executes a written agreement (or is otherwise already bound by a written agreement) that contains use and nondisclosure restrictions at least as protective of the + other party’s Confidential Information as those set forth in this + Agreement. + + + + + 3. + + Exceptions. + The obligations and restrictions in Section + 2 will not apply to any information or materials that: + + + + + (i) + were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party; + + + (ii) + were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party; + + + (iii) + are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party; or + + + (iv) + are independently developed by the receiving party without access to any + Confidential Information of the disclosing party. + + + + 4. + + + Compelled Disclosure. + + Nothing in this + Agreement will be deemed to restrict a party from disclosing the + other party’s Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure. + + + + + 5. + + Return of + Confidential Information. 
+ + Upon the completion or abandonment of the Purpose, and in any event upon the disclosing party’s request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the + disclosing party’s Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the + disclosing party’s Confidential Information. + + + + + 6. + + No + Obligations. + + Each party retains the right, in its sole discretion, to determine whether to disclose any + Confidential Information to the other party. Neither party will be required to negotiate nor enter into any other agreements or arrangements with the other party, whether or not related to the Purpose. + + + + + 7. + + No + License. + + All + Confidential Information remains the sole and exclusive property of the disclosing party. Each party acknowledges and agrees that nothing in this + Agreement will be construed as granting any rights to the receiving party, by license or otherwise, in or to any + Confidential Information of the disclosing party, or any patent, copyright or other intellectual property or proprietary rights of the disclosing party, except as specified in this + Agreement. + + + + + 8. + No Warranty. 
ALL CONFIDENTIAL + INFORMATION + CONFIDENTIAL INFORMATION IS PROVIDED + + + The obligations and restrictions in Section 2 will not apply to any information or materials that: + + (i) were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party; + + (ii) were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party; + + (iii) are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party; or + + (iv) are independently developed by the receiving party without access to any Confidential Information of the disclosing party. + + 4. Compelled Disclosure. Nothing in this Agreement will be deemed to restrict a party from disclosing the other party’s Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure. + + 5. Return of Confidential Information. Upon the completion or abandonment of the Purpose, and in any event upon the disclosing party’s request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the disclosing party’s Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the disclosing party’s Confidential Information. + + 6. No Obligations. 
Each party retains the right, in its sole discretion, to determine whether to disclose any Confidential Information to the other party. Neither party will be required to negotiate nor enter into any other agreements or arrangements with the other party, whether or not related to the Purpose. + + 7. No License. All Confidential Information remains the sole and exclusive property of the disclosing party. Each party acknowledges and agrees that nothing in this Agreement will be construed as granting any rights to the receiving party, by license or otherwise, in or to any Confidential Information of the disclosing party, or any patent, copyright or other intellectual property or proprietary rights of the disclosing party, except as specified in this Agreement. + + 8. No Warranty. ALL CONFIDENTIAL INFORMATION IS PROVIDED + + + + + + + BY THE + DISCLOSING PARTY “AS IS”. + + + + + 9. + + Term. + This + Agreement will remain in effect for a period of + + five ( + 5) years + from the date of last disclosure of + Confidential Information by either party, at which time it will terminate. + + + + + 10. + + + Equitable Relief. + + Each party acknowledges that the unauthorized use or disclosure of the + disclosing party’s Confidential Information may cause the disclosing party to incur irreparable harm and significant damages, the degree of which may be difficult to ascertain. Accordingly, each party agrees that the disclosing party will have the right to seek immediate equitable relief to enjoin any unauthorized use or disclosure of + its Confidential Information, in addition to any other rights and remedies that it may have at law or otherwise. + + + + + 11. + + Miscellaneous. + This + Agreement will be governed and construed in accordance with the laws of the + State of + Washington, excluding its body of law controlling conflict of laws. 
This + Agreement is the complete and exclusive understanding and agreement between the parties regarding the subject matter of this + Agreement and supersedes all prior agreements, understandings and communications, oral or written, between the parties regarding the subject matter of this + Agreement. If any provision of this + Agreement is held invalid or unenforceable by a court of competent jurisdiction, that provision of this + Agreement will be enforced to the maximum extent permissible and the other provisions of this + Agreement will remain in full force and effect. Neither party may assign this + Agreement, in whole or in part, by operation of law or otherwise, without the other party’s prior written consent, and any attempted assignment without such consent will be void. This + Agreement may be executed in counterparts, each of which will be deemed an original, but all of which together will constitute one and the same instrument. + + + + + + + + [SIGNATURE PAGE FOLLOWS] + + + + IN + WITNESS WHEREOF, + + + the parties hereto have executed this + Mutual Non-Disclosure Agreement by their duly authorized officers or representatives as of the date first set forth above. + + + + + + + + + + DOCUGAMI INC. 
+ : + + + + + Leonarda Hosler: + + + + + + + Signatu re: + + + + + Signatu re: + + + + + + + + + + + + + Name: + + + Jean Paoli + + + Name: + + + + + Title: + + + CEO + + + + Title: + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/unit_tests/document_loader/loaders/vendors/test_docugami.py b/tests/unit_tests/document_loader/loaders/vendors/test_docugami.py new file mode 100644 index 00000000..81a4f697 --- /dev/null +++ b/tests/unit_tests/document_loader/loaders/vendors/test_docugami.py @@ -0,0 +1,28 @@ +"""Test DocugamiLoader.""" +from pathlib import Path + +import pytest + +from langchain.document_loaders import DocugamiLoader + +DOCUGAMI_XML_PATH = Path(__file__).parent / "test_data" / "docugami-example.xml" + + +@pytest.mark.requires("lxml") +def test_docugami_loader_local() -> None: + """Test DocugamiLoader.""" + loader = DocugamiLoader(file_paths=[DOCUGAMI_XML_PATH]) + docs = loader.load() + + assert len(docs) == 19 + + xpath = docs[0].metadata.get("xpath") + assert str(xpath).endswith("/docset:Preamble") + assert docs[0].metadata["structure"] == "p" + assert docs[0].metadata["tag"] == "Preamble" + assert docs[0].page_content.startswith("MUTUAL NON-DISCLOSURE AGREEMENT") + + +def test_docugami_initialization() -> None: + """Test correct initialization in remote mode.""" + DocugamiLoader(access_token="test", docset_id="123")