langchain/libs/community/langchain_community/document_loaders/excel.py
Bagatur a0c2281540
infra: update mypy 1.10, ruff 0.5 (#23721)
```python
"""python scripts/update_mypy_ruff.py"""
import glob
import tomllib
from pathlib import Path

import toml
import subprocess
import re

ROOT_DIR = Path(__file__).parents[1]


def main():
    for path in glob.glob(str(ROOT_DIR / "libs/**/pyproject.toml"), recursive=True):
        print(path)
        with open(path, "rb") as f:
            pyproject = tomllib.load(f)
        try:
            pyproject["tool"]["poetry"]["group"]["typing"]["dependencies"]["mypy"] = (
                "^1.10"
            )
            pyproject["tool"]["poetry"]["group"]["lint"]["dependencies"]["ruff"] = (
                "^0.5"
            )
        except KeyError:
            continue
        with open(path, "w") as f:
            toml.dump(pyproject, f)
        cwd = "/".join(path.split("/")[:-1])
        completed = subprocess.run(
            "poetry lock --no-update; poetry install --with typing; poetry run mypy . --no-color",
            cwd=cwd,
            shell=True,
            capture_output=True,
            text=True,
        )
        logs = completed.stdout.split("\n")

        to_ignore = {}
        for l in logs:
            if re.match("^(.*)\:(\d+)\: error:.*\[(.*)\]", l):
                path, line_no, error_type = re.match(
                    "^(.*)\:(\d+)\: error:.*\[(.*)\]", l
                ).groups()
                if (path, line_no) in to_ignore:
                    to_ignore[(path, line_no)].append(error_type)
                else:
                    to_ignore[(path, line_no)] = [error_type]
        print(len(to_ignore))
        for (error_path, line_no), error_types in to_ignore.items():
            all_errors = ", ".join(error_types)
            full_path = f"{cwd}/{error_path}"
            try:
                with open(full_path, "r") as f:
                    file_lines = f.readlines()
            except FileNotFoundError:
                continue
            file_lines[int(line_no) - 1] = (
                file_lines[int(line_no) - 1][:-1] + f"  # type: ignore[{all_errors}]\n"
            )
            with open(full_path, "w") as f:
                f.write("".join(file_lines))

        subprocess.run(
            "poetry run ruff format .; poetry run ruff --select I --fix .",
            cwd=cwd,
            shell=True,
            capture_output=True,
            text=True,
        )


if __name__ == "__main__":
    main()

```
2024-07-03 10:33:27 -07:00

52 lines
1.7 KiB
Python

"""Loads Microsoft Excel files."""
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)
class UnstructuredExcelLoader(UnstructuredFileLoader):
"""Load Microsoft Excel files using `Unstructured`.
Like other
Unstructured loaders, UnstructuredExcelLoader can be used in both
"single" and "elements" mode. If you use the loader in "elements"
mode, each sheet in the Excel file will be an Unstructured Table
element. If you use the loader in "single" mode, an
HTML representation of the table will be available in the
"text_as_html" key in the document metadata.
Examples
--------
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
loader = UnstructuredExcelLoader("stanley-cups.xlsx", mode="elements")
docs = loader.load()
"""
def __init__(
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""
Args:
file_path: The path to the Microsoft Excel file.
mode: The mode to use when partitioning the file. See unstructured docs
for more info. Optional. Defaults to "single".
**unstructured_kwargs: Keyword arguments to pass to unstructured.
"""
validate_unstructured_version(min_unstructured_version="0.6.7")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.partition.xlsx import partition_xlsx
return partition_xlsx(filename=self.file_path, **self.unstructured_kwargs)