From 7d29bb2c0247a908cadb25bbccacdf277ae112fe Mon Sep 17 00:00:00 2001 From: Yifei Song <79988483+yifeis7@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:24:47 +0800 Subject: [PATCH] Add Xorbits Dataframe as a Document Loader (#7319) - [Xorbits](https://doc.xorbits.io/en/latest/) is an open-source computing framework that makes it easy to scale data science and machine learning workloads in parallel. Xorbits can leverage multiple cores or GPUs to accelerate computation on a single machine, or scale out to thousands of machines to support processing terabytes of data. - This PR adds support for the Xorbits document loader, which allows langchain to leverage Xorbits to parallelize and distribute the loading of data. - Dependencies: This change requires the Xorbits library to be installed in order to be used. `pip install xorbits` - Request for review: @rlancemartin, @eyurtsev - Twitter handle: https://twitter.com/Xorbitsio Co-authored-by: Bagatur --- docs/api_reference/api_reference.rst | 1 + .../integrations/xorbits.ipynb | 304 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/xorbits.py | 46 +++ pyproject.toml | 1 + .../document_loaders/test_xorbits.py | 64 ++++ 6 files changed, 418 insertions(+) create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/xorbits.ipynb create mode 100644 langchain/document_loaders/xorbits.py create mode 100644 tests/integration_tests/document_loaders/test_xorbits.py diff --git a/docs/api_reference/api_reference.rst b/docs/api_reference/api_reference.rst index 31d4ae1909..95043e3a1e 100644 --- a/docs/api_reference/api_reference.rst +++ b/docs/api_reference/api_reference.rst @@ -660,6 +660,7 @@ Classes document_loaders.word_document.Docx2txtLoader document_loaders.word_document.UnstructuredWordDocumentLoader document_loaders.xml.UnstructuredXMLLoader + document_loaders.xorbits.XorbitsLoader document_loaders.youtube.GoogleApiYoutubeLoader 
document_loaders.youtube.YoutubeLoader diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/xorbits.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/xorbits.ipynb new file mode 100644 index 0000000000..cf5f60f028 --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/xorbits.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Xorbits Pandas DataFrame\n", + "\n", + "This notebook goes over how to load data from a [xorbits.pandas](https://doc.xorbits.io/en/latest/reference/pandas/frame.html) DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install xorbits" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import xorbits.pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"example_data/mlb_teams_2012.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b0d1d84e23c04f1296f63b3ea3dd1e5b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0.00/100 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Team\"Payroll (millions)\"\"Wins\"
0Nationals81.3498
1Reds82.2097
2Yankees197.9695
3Giants117.6294
4Braves83.3194
\n", + "" + ], + "text/plain": [ + " Team \"Payroll (millions)\" \"Wins\"\n", + "0 Nationals 81.34 98\n", + "1 Reds 82.20 97\n", + "2 Yankees 197.96 95\n", + "3 Giants 117.62 94\n", + "4 Braves 83.31 94" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import XorbitsLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "loader = XorbitsLoader(df, page_content_column=\"Team\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c8c8b67f1aae4a3c9de7734bb6cf738e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0.00/100 [00:00 Iterator[Document]: + """Lazy load records from dataframe.""" + for _, row in self.data_frame.iterrows(): + text = row[self.page_content_column] + metadata = row.to_dict() + metadata.pop(self.page_content_column) + yield Document(page_content=text, metadata=metadata) + + def load(self) -> List[Document]: + """Load full dataframe.""" + return list(self.lazy_load()) diff --git a/pyproject.toml b/pyproject.toml index 94559a75f5..f3c043d74c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -349,6 +349,7 @@ extended_testing = [ "pyspark", "openai", "rapidfuzz" + ] [[tool.poetry.source]] diff --git a/tests/integration_tests/document_loaders/test_xorbits.py b/tests/integration_tests/document_loaders/test_xorbits.py new file mode 100644 index 0000000000..a83df60827 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_xorbits.py @@ -0,0 +1,64 @@ +import pytest + +from langchain.document_loaders import XorbitsLoader +from langchain.schema import Document + +try: + import xorbits # noqa: F401 + + xorbits_installed = 
True +except ImportError: + xorbits_installed = False + + +@pytest.mark.skipif(not xorbits_installed, reason="xorbits not installed") +def test_load_returns_list_of_documents() -> None: + import xorbits.pandas as pd + + data = { + "text": ["Hello", "World"], + "author": ["Alice", "Bob"], + "date": ["2022-01-01", "2022-01-02"], + } + loader = XorbitsLoader(pd.DataFrame(data)) + docs = loader.load() + assert isinstance(docs, list) + assert all(isinstance(doc, Document) for doc in docs) + assert len(docs) == 2 + + +@pytest.mark.skipif(not xorbits_installed, reason="xorbits not installed") +def test_load_converts_dataframe_columns_to_document_metadata() -> None: + import xorbits.pandas as pd + + data = { + "text": ["Hello", "World"], + "author": ["Alice", "Bob"], + "date": ["2022-01-01", "2022-01-02"], + } + loader = XorbitsLoader(pd.DataFrame(data)) + docs = loader.load() + expected = { + "author": ["Alice", "Bob"], + "date": ["2022-01-01", "2022-01-02"], + } + for i, doc in enumerate(docs): + assert doc.metadata["author"] == expected["author"][i] + assert doc.metadata["date"] == expected["date"][i] + + +@pytest.mark.skipif(not xorbits_installed, reason="xorbits not installed") +def test_load_uses_page_content_column_to_create_document_text() -> None: + import xorbits.pandas as pd + + data = { + "text": ["Hello", "World"], + "author": ["Alice", "Bob"], + "date": ["2022-01-01", "2022-01-02"], + } + sample_data_frame = pd.DataFrame(data) + sample_data_frame = sample_data_frame.rename(columns={"text": "dummy_test_column"}) + loader = XorbitsLoader(sample_data_frame, page_content_column="dummy_test_column") + docs = loader.load() + assert docs[0].page_content == "Hello" + assert docs[1].page_content == "World"