langchain/libs/community/langchain_community/document_loaders/parsers/vsdx.py

import json
import re
import zipfile
from abc import ABC
from pathlib import Path
from typing import Iterator, List, Set, Tuple

from langchain_community.docstore.document import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob


class VsdxParser(BaseBlobParser, ABC):
    """Parser for vsdx files."""

    def parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[override]
        """Parse a vsdx blob by delegating to ``lazy_parse``.

        Args:
            blob: The blob handed over unchanged to ``lazy_parse``.

        Returns:
            The iterator created by ``lazy_parse`` for this blob.
        """
        documents = self.lazy_parse(blob)
        return documents

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Retrieve the contents of pages from a .vsdx file
        and insert them into documents, one document per page.

        Args:
            blob: Blob exposing the raw .vsdx (zip) archive through
                ``as_bytes_io()`` and its origin through ``source``.

        Yields:
            One ``Document`` per page. Its metadata holds the blob source,
            the page number and the page name. Nothing is yielded when
            ``get_pages_content`` reports no pages.
        """

        # All page contents are extracted while the archive is still open;
        # the documents themselves are built afterwards, on demand.
        with blob.as_bytes_io() as vsdx_file_obj:
            with zipfile.ZipFile(vsdx_file_obj, "r") as zfile:
                pages = self.get_pages_content(zfile, blob.source)  # type: ignore[arg-type]

        # ``pages`` can be None/empty when the archive lacks a required part;
        # in that case there is simply nothing to yield.
        for page_number, page_name, page_content in pages or []:
            yield Document(
                page_content=page_content,
                metadata={
                    "source": blob.source,
                    "page": page_number,
                    "page_name": page_name,
                },
            )

    def get_pages_content(
        self, zfile: zipfile.ZipFile, source: str
    ) -> List[Tuple[int, str, str]]:
        """Get the content of the pages of a vsdx file.

        Attributes:
            zfile (zipfile.ZipFile): The vsdx file under zip format.
            source (str): The path of the vsdx file.

        Returns:
            list[tuple[int, str, str]]: A list of tuples containing the page number,
            the name of the page and the content of the page
            for each page of the vsdx file.
        """

        try:
            import xmltodict
        except ImportError:
            raise ImportError(
                "The xmltodict library is required to parse vsdx files. "
                "Please install it with `pip install xmltodict`."
            )

        if "visio/pages/pages.xml" not in zfile.namelist():
            print("WARNING - No pages.xml file found in {}".format(source))  # noqa: T201
            return  # type: ignore[return-value]
        if "visio/pages/_rels/pages.xml.rels" not in zfile.namelist():
            print("WARNING - No pages.xml.rels file found in {}".format(source))  # noqa: T201
            return  # type: ignore[return-value]
        if "docProps/app.xml" not in zfile.namelist():
            print("WARNING - No app.xml file found in {}".format(source))  # noqa: T201
            return  # type: ignore[return-value]

        pagesxml_content: dict = xmltodict.parse(zfile.read("visio/pages/pages.xml"))
        appxml_content: dict = xmltodict.parse(zfile.read("docProps/app.xml"))
        pagesxmlrels_content: dict = xmltodict.parse(
            zfile.read("visio/pages/_rels/pages.xml.rels")
        )

        if isinstance(pagesxml_content["Pages"]["Page"], list):
            disordered_names: List[str] = [
                rel["@Name"].strip() for rel in pagesxml_content["Pages"]["Page"]
            ]
        else:
            disordered_names: List[str] = [  # type: ignore[no-redef]
                pagesxml_content["Pages"]["Page"]["@Name"].strip()
            ]
        if isinstance(pagesxmlrels_content["Relationships"]["Relationship"], list):
            disordered_paths: List[str] = [
                "visio/pages/" + rel["@Target"]
                for rel in pagesxmlrels_content["Relationships"]["Relationship"]
            ]
        else:
            disordered_paths: List[str] = [  # type: ignore[no-redef]
                "visio/pages/"
                + pagesxmlrels_content["Relationships"]["Relationship"]["@Target"]
            ]
        ordered_names: List[str] = appxml_content["Properties"]["TitlesOfParts"][
            "vt:vector"
        ]["vt:lpstr"][: len(disordered_names)]
        ordered_names = [name.strip() for name in ordered_names]
        ordered_paths = [
            disordered_paths[disordered_names.index(name.strip())]
            for name in ordered_names
        ]

        # Pages out of order and without content of their relationships
        disordered_pages = []
        for path in ordered_paths:
            content = zfile.read(path)
            string_content = json.dumps(xmltodict.parse(content))

            samples = re.findall(
                r'"#text"\s*:\s*"([^\\"]*(?:\\.[^\\"]*)*)"', string_content
            )
            if len(samples) > 0:
                page_content = "\n".join(samples)
                map_symboles = {
                    "\\n": "\n",
                    "\\t": "\t",
                    "\\u2013": "-",
                    "\\u2019": "'",
                    "\\u00e9r": "é",
                    "\\u00f4me": "ô",
                }
                for key, value in map_symboles.items():
                    page_content = page_content.replace(key, value)

                disordered_pages.append({"page": path, "page_content": page_content})

        # Direct relationships of each page in a dict format
        pagexml_rels = [
            {
                "path": page_path,
                "content": xmltodict.parse(
                    zfile.read(f"visio/pages/_rels/{Path(page_path).stem}.xml.rels")
                ),
            }
            for page_path in ordered_paths
            if f"visio/pages/_rels/{Path(page_path).stem}.xml.rels" in zfile.namelist()
        ]

        # Pages in order and with content of their relationships (direct and indirect)
        ordered_pages: List[Tuple[int, str, str]] = []
        for page_number, (path, page_name) in enumerate(
            zip(ordered_paths, ordered_names)
        ):
            relationships = self.get_relationships(
                path, zfile, ordered_paths, pagexml_rels
            )
            page_content = "\n".join(
                [
                    page_["page_content"]
                    for page_ in disordered_pages
                    if page_["page"] in relationships
                ]
                + [
                    page_["page_content"]
                    for page_ in disordered_pages
                    if page_["page"] == path
                ]
            )
            ordered_pages.append((page_number, page_name, page_content))

        return ordered_pages

    def get_relationships(
        self,
        page: str,
        zfile: zipfile.ZipFile,
        filelist: List[str],
        pagexml_rels: List[dict],
    ) -> Set[str]:
        """Get the relationships of a page and the relationships of its relationships,
        etc... transitively.
        Pages are based on other pages (ex: background page),
        so we need to get all the relationships to get all the content of a single page.

        Args:
            page: Archive path of the page whose relationships are wanted.
            zfile: The opened vsdx archive, used to check which ``.rels`` parts exist.
            filelist: Archive paths of the known pages; targets outside this
                list are ignored.
            pagexml_rels: Parsed ``.rels`` documents, one dict per page with the
                keys ``"path"`` and ``"content"``.

        Returns:
            The set of page paths reachable from ``page`` through relationships.
            ``page`` itself is never part of the result, even when the
            relationships form a cycle.
        """

        # Zip member names always use "/" whatever the platform, hence the
        # as_posix() conversions below.
        archive_names = set(zfile.namelist())
        known_pages = set(filelist)

        # First entry wins when a path is listed more than once.
        rels_by_page: dict = {}
        for entry in pagexml_rels:
            rels_by_page.setdefault(entry["path"], entry["content"])

        relationships: Set[str] = set()
        visited = {page}  # guards against cyclic relationships
        pending = [page]
        while pending:
            current = pending.pop()
            parent_path = Path(current).parent
            rels_path = parent_path / "_rels" / f"{Path(current).name}.rels"
            if rels_path.as_posix() not in archive_names:
                continue

            rels_content = rels_by_page.get(current)
            if rels_content is None:
                # No parsed relationships were supplied for this page.
                continue

            declared = rels_content["Relationships"]["Relationship"]
            # A single relationship is a bare dict rather than a list.
            for rel in declared if isinstance(declared, list) else [declared]:
                target = (parent_path / rel["@Target"]).as_posix()
                if target in known_pages and target not in visited:
                    visited.add(target)
                    relationships.add(target)
                    pending.append(target)

        return relationships
community[minor]: New documents loader for visio files (with extension .vsdx) (#16171) Description : New documents loader for visio files (with extension .vsdx) A [visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. Dependencies : xmltodict package 2024-01-23 06:07:03 +00:00			`import json`
			`import re`
			`import zipfile`
			`from abc import ABC`
			`from pathlib import Path`
			`from typing import Iterator, List, Set, Tuple`

			`from langchain_community.docstore.document import Document`
			`from langchain_community.document_loaders.base import BaseBlobParser`
			`from langchain_community.document_loaders.blob_loaders import Blob`


			`class VsdxParser(BaseBlobParser, ABC):`
community[patch]: docstrings (#16810) - added missed docstrings - formated docstrings to the consistent form 2024-02-09 20:48:57 +00:00			`"""Parser for vsdx files."""`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[override]`
community[minor]: New documents loader for visio files (with extension .vsdx) (#16171) Description : New documents loader for visio files (with extension .vsdx) A [visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. Dependencies : xmltodict package 2024-01-23 06:07:03 +00:00			`"""Parse a vsdx file."""`
			`return self.lazy_parse(blob)`

			`def lazy_parse(self, blob: Blob) -> Iterator[Document]:`
			`"""Retrieve the contents of pages from a .vsdx file`
			`and insert them into documents, one document per page."""`

			`with blob.as_bytes_io() as pdf_file_obj:`
			`with zipfile.ZipFile(pdf_file_obj, "r") as zfile:`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`pages = self.get_pages_content(zfile, blob.source) # type: ignore[arg-type]`
community[minor]: New documents loader for visio files (with extension .vsdx) (#16171) Description : New documents loader for visio files (with extension .vsdx) A [visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. Dependencies : xmltodict package 2024-01-23 06:07:03 +00:00
			`yield from [`
			`Document(`
			`page_content=page_content,`
			`metadata={`
			`"source": blob.source,`
			`"page": page_number,`
			`"page_name": page_name,`
			`},`
			`)`
			`for page_number, page_name, page_content in pages`
			`]`

			`def get_pages_content(`
			`self, zfile: zipfile.ZipFile, source: str`
			`) -> List[Tuple[int, str, str]]:`
			`"""Get the content of the pages of a vsdx file.`

			`Attributes:`
			`zfile (zipfile.ZipFile): The vsdx file under zip format.`
			`source (str): The path of the vsdx file.`

			`Returns:`
			`list[tuple[int, str, str]]: A list of tuples containing the page number,`
			`the name of the page and the content of the page`
			`for each page of the vsdx file.`
			`"""`

			`try:`
			`import xmltodict`
			`except ImportError:`
			`raise ImportError(`
			`"The xmltodict library is required to parse vsdx files. "`
			"Please install it with `pip install xmltodict`."
			`)`

			`if "visio/pages/pages.xml" not in zfile.namelist():`
infra: add print rule to ruff (#16221) Added noqa for existing prints. Can slowly remove / will prevent more being intro'd 2024-02-10 00:13:30 +00:00			`print("WARNING - No pages.xml file found in {}".format(source)) # noqa: T201`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`return # type: ignore[return-value]`
community[minor]: New documents loader for visio files (with extension .vsdx) (#16171) Description : New documents loader for visio files (with extension .vsdx) A [visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. Dependencies : xmltodict package 2024-01-23 06:07:03 +00:00			`if "visio/pages/_rels/pages.xml.rels" not in zfile.namelist():`
infra: add print rule to ruff (#16221) Added noqa for existing prints. Can slowly remove / will prevent more being intro'd 2024-02-10 00:13:30 +00:00			`print("WARNING - No pages.xml.rels file found in {}".format(source)) # noqa: T201`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`return # type: ignore[return-value]`
community[minor]: New documents loader for visio files (with extension .vsdx) (#16171) Description : New documents loader for visio files (with extension .vsdx) A [visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. Dependencies : xmltodict package 2024-01-23 06:07:03 +00:00			`if "docProps/app.xml" not in zfile.namelist():`
infra: add print rule to ruff (#16221) Added noqa for existing prints. Can slowly remove / will prevent more being intro'd 2024-02-10 00:13:30 +00:00			`print("WARNING - No app.xml file found in {}".format(source)) # noqa: T201`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`return # type: ignore[return-value]`
community[minor]: New documents loader for visio files (with extension .vsdx) (#16171) Description : New documents loader for visio files (with extension .vsdx) A [visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. Dependencies : xmltodict package 2024-01-23 06:07:03 +00:00
			`pagesxml_content: dict = xmltodict.parse(zfile.read("visio/pages/pages.xml"))`
			`appxml_content: dict = xmltodict.parse(zfile.read("docProps/app.xml"))`
			`pagesxmlrels_content: dict = xmltodict.parse(`
			`zfile.read("visio/pages/_rels/pages.xml.rels")`
			`)`

			`if isinstance(pagesxml_content["Pages"]["Page"], list):`
			`disordered_names: List[str] = [`
			`rel["@Name"].strip() for rel in pagesxml_content["Pages"]["Page"]`
			`]`
			`else:`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`disordered_names: List[str] = [ # type: ignore[no-redef]`
community[minor]: New documents loader for visio files (with extension .vsdx) (#16171) Description : New documents loader for visio files (with extension .vsdx) A [visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. Dependencies : xmltodict package 2024-01-23 06:07:03 +00:00			`pagesxml_content["Pages"]["Page"]["@Name"].strip()`
			`]`
			`if isinstance(pagesxmlrels_content["Relationships"]["Relationship"], list):`
			`disordered_paths: List[str] = [`
			`"visio/pages/" + rel["@Target"]`
			`for rel in pagesxmlrels_content["Relationships"]["Relationship"]`
			`]`
			`else:`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`disordered_paths: List[str] = [ # type: ignore[no-redef]`
community[minor]: New documents loader for visio files (with extension .vsdx) (#16171) Description : New documents loader for visio files (with extension .vsdx) A [visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. Dependencies : xmltodict package 2024-01-23 06:07:03 +00:00			`"visio/pages/"`
			`+ pagesxmlrels_content["Relationships"]["Relationship"]["@Target"]`
			`]`
			`ordered_names: List[str] = appxml_content["Properties"]["TitlesOfParts"][`
			`"vt:vector"`
			`]["vt:lpstr"][: len(disordered_names)]`
			`ordered_names = [name.strip() for name in ordered_names]`
			`ordered_paths = [`
			`disordered_paths[disordered_names.index(name.strip())]`
			`for name in ordered_names`
			`]`

			`# Pages out of order and without content of their relationships`
			`disordered_pages = []`
			`for path in ordered_paths:`
			`content = zfile.read(path)`
			`string_content = json.dumps(xmltodict.parse(content))`

			`samples = re.findall(`
			`r'"#text"\s:\s"([^\\"](?:\\.[^\\"])*)"', string_content`
			`)`
			`if len(samples) > 0:`
			`page_content = "\n".join(samples)`
			`map_symboles = {`
			`"\\n": "\n",`
			`"\\t": "\t",`
			`"\\u2013": "-",`
			`"\\u2019": "'",`
			`"\\u00e9r": "é",`
			`"\\u00f4me": "ô",`
			`}`
			`for key, value in map_symboles.items():`
			`page_content = page_content.replace(key, value)`

			`disordered_pages.append({"page": path, "page_content": page_content})`

			`# Direct relationships of each page in a dict format`
			`pagexml_rels = [`
			`{`
			`"path": page_path,`
			`"content": xmltodict.parse(`
			`zfile.read(f"visio/pages/_rels/{Path(page_path).stem}.xml.rels")`
			`),`
			`}`
			`for page_path in ordered_paths`
			`if f"visio/pages/_rels/{Path(page_path).stem}.xml.rels" in zfile.namelist()`
			`]`

			`# Pages in order and with content of their relationships (direct and indirect)`
			`ordered_pages: List[Tuple[int, str, str]] = []`
			`for page_number, (path, page_name) in enumerate(`
			`zip(ordered_paths, ordered_names)`
			`):`
			`relationships = self.get_relationships(`
			`path, zfile, ordered_paths, pagexml_rels`
			`)`
			`page_content = "\n".join(`
			`[`
			`page_["page_content"]`
			`for page_ in disordered_pages`
			`if page_["page"] in relationships`
			`]`
			`+ [`
			`page_["page_content"]`
			`for page_ in disordered_pages`
			`if page_["page"] == path`
			`]`
			`)`
			`ordered_pages.append((page_number, page_name, page_content))`

			`return ordered_pages`

			`def get_relationships(`
			`self,`
			`page: str,`
			`zfile: zipfile.ZipFile,`
			`filelist: List[str],`
			`pagexml_rels: List[dict],`
			`) -> Set[str]:`
			`"""Get the relationships of a page and the relationships of its relationships,`
			`etc... recursively.`
			`Pages are based on other pages (ex: background page),`
			`so we need to get all the relationships to get all the content of a single page.`
			`"""`

			`name_path = Path(page).name`
			`parent_path = Path(page).parent`
			`rels_path = parent_path / f"_rels/{name_path}.rels"`

			`if str(rels_path) not in zfile.namelist():`
			`return set()`

			`pagexml_rels_content = next(`
			`page_["content"] for page_ in pagexml_rels if page_["path"] == page`
			`)`

			`if isinstance(pagexml_rels_content["Relationships"]["Relationship"], list):`
			`targets = [`
			`rel["@Target"]`
			`for rel in pagexml_rels_content["Relationships"]["Relationship"]`
			`]`
			`else:`
			`targets = [pagexml_rels_content["Relationships"]["Relationship"]["@Target"]]`

			`relationships = set(`
			`[str(parent_path / target) for target in targets]`
			`).intersection(filelist)`

			`for rel in relationships:`
			`relationships = relationships \| self.get_relationships(`
			`rel, zfile, filelist, pagexml_rels`
			`)`

			`return relationships`