mirror of
https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
4b7969efc5
**Description**: New document loader for Visio files (with extension .vsdx). A [Visio file](https://fr.wikipedia.org/wiki/Microsoft_Visio) (with extension .vsdx) is associated with Microsoft Visio, a diagram creation software. It stores information about the structure, layout, and graphical elements of a diagram. This format facilitates the creation and sharing of visualizations in areas such as business, engineering, and computer science. A Visio file can contain multiple pages. Some of them may serve as the background for others, and this can occur across multiple layers. This loader extracts the textual content from each page and its associated pages, enabling the extraction of all visible text from each page, similar to what an OCR algorithm would do. **Dependencies**: xmltodict package
54 lines
1.8 KiB
Python
54 lines
1.8 KiB
Python
import os
|
|
import tempfile
|
|
from abc import ABC
|
|
from typing import List
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
|
|
from langchain_community.docstore.document import Document
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
from langchain_community.document_loaders.blob_loaders import Blob
|
|
from langchain_community.document_loaders.parsers import VsdxParser
|
|
|
|
|
|
class VsdxLoader(BaseLoader, ABC):
    """Load a Visio (.vsdx) file from a local path or a URL.

    The file is wrapped in a ``Blob`` and handed to ``VsdxParser``; the
    documents the parser produces are returned by ``load``.
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of a local file (``~`` is expanded to the user's
                home directory) or a URL. A URL is downloaded into a
                temporary file, which then becomes ``self.file_path``; the
                original URL is kept in ``self.web_path``.

        Raises:
            ValueError: If the URL does not answer with status code 200, or
                if ``file_path`` is neither an existing file nor a URL.
        """
        self.file_path = file_path
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path, download it to a temporary file, and use that
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            r = requests.get(self.file_path)

            if r.status_code != 200:
                raise ValueError(
                    "Check the url of your file; returned status code %s"
                    % r.status_code
                )

            self.web_path = self.file_path
            # The handle is kept on the instance because the temporary file
            # is removed as soon as it is closed (see __del__).
            self.temp_file = tempfile.NamedTemporaryFile()
            self.temp_file.write(r.content)
            # load() re-opens the file by name, so the buffered bytes must be
            # pushed to disk first; otherwise the reader may see an empty or
            # truncated file.
            self.temp_file.flush()
            self.file_path = self.temp_file.name
        elif not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

        self.parser = VsdxParser()

    def __del__(self) -> None:
        """Close the temporary download file, if one was created."""
        # __init__ may raise before temp_file is assigned, hence the guard.
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid, i.e. has both a scheme and a netloc."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def load(self) -> List[Document]:
        """Parse the file at ``self.file_path`` and return its documents."""
        blob = Blob.from_path(self.file_path)
        return list(self.parser.parse(blob))
|