From eaa505fb0921fc356d72e845fb20700ebe7447eb Mon Sep 17 00:00:00 2001 From: Joshua Sundance Bailey <84336755+joshuasundance-swca@users.noreply.github.com> Date: Fri, 11 Aug 2023 17:33:40 -0400 Subject: [PATCH] Create ArcGISLoader & example notebook (#8873) - Description: Adds the ArcGISLoader class to `langchain.document_loaders` - Allows users to load data from ArcGIS Online, Portal, and similar - Users can authenticate with `arcgis.gis.GIS` or retrieve public data anonymously - Uses the `arcgis.features.FeatureLayer` class to retrieve the data - Defines the most relevant keyword arguments and accepts `**kwargs` - Dependencies: Using this class requires `arcgis` and, optionally, `bs4.BeautifulSoup`. Tagging maintainers: - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev --------- Co-authored-by: Bagatur --- .../document_loaders/arcgis.ipynb | 325 ++++++++++++++++++ .../langchain/document_loaders/__init__.py | 2 + .../document_loaders/arcgis_loader.py | 129 +++++++ .../document_loaders/test_arcgis_loader.py | 47 +++ 4 files changed, 503 insertions(+) create mode 100644 docs/extras/integrations/document_loaders/arcgis.ipynb create mode 100644 libs/langchain/langchain/document_loaders/arcgis_loader.py create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_arcgis_loader.py diff --git a/docs/extras/integrations/document_loaders/arcgis.ipynb b/docs/extras/integrations/document_loaders/arcgis.ipynb new file mode 100644 index 0000000000..bdb3eb1f01 --- /dev/null +++ b/docs/extras/integrations/document_loaders/arcgis.ipynb @@ -0,0 +1,325 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "62359e08-cf80-4210-a30c-f450000e65b9", + "metadata": {}, + "source": [ + "# ArcGISLoader\n", + "\n", + "This notebook demonstrates the use of the `langchain.document_loaders.ArcGISLoader` class.\n", + "\n", + "You will need to install the ArcGIS API for Python `arcgis` and, optionally, `bs4.BeautifulSoup`.\n", + "\n", + "You can use an 
`arcgis.gis.GIS` object for authenticated data loading, or leave it blank to access public data." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b782cab5-0584-4e2a-9073-009fb8dc93a3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import ArcGISLoader\n", + "\n", + "\n", + "url = \"https://maps1.vcgov.org/arcgis/rest/services/Beaches/MapServer/7\"\n", + "\n", + "loader = ArcGISLoader(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "aa3053cf-4127-43ea-bf56-e378b348091f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4.04 ms, sys: 1.63 ms, total: 5.67 ms\n", + "Wall time: 644 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a2444519-9117-4feb-8bb9-8931ce286fa5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['url', 'layer_description', 'item_description', 'layer_properties'])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0].metadata.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6b6e9107-6a80-4ef7-8149-3013faa2de76", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "KeysView({\n", + " \"currentVersion\": 10.81,\n", + " \"id\": 7,\n", + " \"name\": \"Beach Ramps\",\n", + " \"type\": \"Feature Layer\",\n", + " \"description\": \"\",\n", + " \"geometryType\": \"esriGeometryPoint\",\n", + " \"sourceSpatialReference\": {\n", + " \"wkid\": 2881,\n", + " \"latestWkid\": 2881\n", + " },\n", + " \"copyrightText\": \"\",\n", + " \"parentLayer\": null,\n", + " \"subLayers\": [],\n", + " \"minScale\": 750000,\n", + " \"maxScale\": 0,\n", + " \"drawingInfo\": {\n", + " \"renderer\": {\n", + " \"type\": \"simple\",\n", + " \"symbol\": {\n", + " \"type\": \"esriPMS\",\n", + " 
\"url\": \"9bb2e5ca499bb68aa3ee0d4e1ecc3849\",\n", + " \"imageData\": \"iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IB2cksfwAAAAlwSFlzAAAOxAAADsQBlSsOGwAAAJJJREFUOI3NkDEKg0AQRZ9kkSnSGBshR7DJqdJYeg7BMpcS0uQWQsqoCLExkcUJzGqT38zw2fcY1rEzbp7vjXz0EXC7gBxs1ABcG/8CYkCcDqwyLqsV+RlV0I/w7PzuJBArr1VB20H58Ls6h+xoFITkTwWpQJX7XSIBAnFwVj7MLAjJV/AC6G3QoAmK+74Lom04THTBEp/HCSc6AAAAAElFTkSuQmCC\",\n", + " \"contentType\": \"image/png\",\n", + " \"width\": 12,\n", + " \"height\": 12,\n", + " \"angle\": 0,\n", + " \"xoffset\": 0,\n", + " \"yoffset\": 0\n", + " },\n", + " \"label\": \"\",\n", + " \"description\": \"\"\n", + " },\n", + " \"transparency\": 0,\n", + " \"labelingInfo\": null\n", + " },\n", + " \"defaultVisibility\": true,\n", + " \"extent\": {\n", + " \"xmin\": -81.09480168806815,\n", + " \"ymin\": 28.858349245353473,\n", + " \"xmax\": -80.77512908572814,\n", + " \"ymax\": 29.41078388840041,\n", + " \"spatialReference\": {\n", + " \"wkid\": 4326,\n", + " \"latestWkid\": 4326\n", + " }\n", + " },\n", + " \"hasAttachments\": false,\n", + " \"htmlPopupType\": \"esriServerHTMLPopupTypeNone\",\n", + " \"displayField\": \"AccessName\",\n", + " \"typeIdField\": null,\n", + " \"subtypeFieldName\": null,\n", + " \"subtypeField\": null,\n", + " \"defaultSubtypeCode\": null,\n", + " \"fields\": [\n", + " {\n", + " \"name\": \"OBJECTID\",\n", + " \"type\": \"esriFieldTypeOID\",\n", + " \"alias\": \"OBJECTID\",\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"Shape\",\n", + " \"type\": \"esriFieldTypeGeometry\",\n", + " \"alias\": \"Shape\",\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"AccessName\",\n", + " \"type\": \"esriFieldTypeString\",\n", + " \"alias\": \"AccessName\",\n", + " \"length\": 40,\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"AccessID\",\n", + " \"type\": \"esriFieldTypeString\",\n", + " \"alias\": \"AccessID\",\n", + " \"length\": 50,\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": 
\"AccessType\",\n", + " \"type\": \"esriFieldTypeString\",\n", + " \"alias\": \"AccessType\",\n", + " \"length\": 25,\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"GeneralLoc\",\n", + " \"type\": \"esriFieldTypeString\",\n", + " \"alias\": \"GeneralLoc\",\n", + " \"length\": 100,\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"MilePost\",\n", + " \"type\": \"esriFieldTypeDouble\",\n", + " \"alias\": \"MilePost\",\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"City\",\n", + " \"type\": \"esriFieldTypeString\",\n", + " \"alias\": \"City\",\n", + " \"length\": 50,\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"AccessStatus\",\n", + " \"type\": \"esriFieldTypeString\",\n", + " \"alias\": \"AccessStatus\",\n", + " \"length\": 50,\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"Entry_Date_Time\",\n", + " \"type\": \"esriFieldTypeDate\",\n", + " \"alias\": \"Entry_Date_Time\",\n", + " \"length\": 8,\n", + " \"domain\": null\n", + " },\n", + " {\n", + " \"name\": \"DrivingZone\",\n", + " \"type\": \"esriFieldTypeString\",\n", + " \"alias\": \"DrivingZone\",\n", + " \"length\": 50,\n", + " \"domain\": null\n", + " }\n", + " ],\n", + " \"geometryField\": {\n", + " \"name\": \"Shape\",\n", + " \"type\": \"esriFieldTypeGeometry\",\n", + " \"alias\": \"Shape\"\n", + " },\n", + " \"indexes\": null,\n", + " \"subtypes\": [],\n", + " \"relationships\": [],\n", + " \"canModifyLayer\": true,\n", + " \"canScaleSymbols\": false,\n", + " \"hasLabels\": false,\n", + " \"capabilities\": \"Map,Query,Data\",\n", + " \"maxRecordCount\": 1000,\n", + " \"supportsStatistics\": true,\n", + " \"supportsAdvancedQueries\": true,\n", + " \"supportedQueryFormats\": \"JSON, geoJSON\",\n", + " \"isDataVersioned\": false,\n", + " \"ownershipBasedAccessControlForFeatures\": {\n", + " \"allowOthersToQuery\": true\n", + " },\n", + " \"useStandardizedQueries\": true,\n", + " \"advancedQueryCapabilities\": {\n", + 
" \"useStandardizedQueries\": true,\n", + " \"supportsStatistics\": true,\n", + " \"supportsHavingClause\": true,\n", + " \"supportsCountDistinct\": true,\n", + " \"supportsOrderBy\": true,\n", + " \"supportsDistinct\": true,\n", + " \"supportsPagination\": true,\n", + " \"supportsTrueCurve\": true,\n", + " \"supportsReturningQueryExtent\": true,\n", + " \"supportsQueryWithDistance\": true,\n", + " \"supportsSqlExpression\": true\n", + " },\n", + " \"supportsDatumTransformation\": true,\n", + " \"dateFieldsTimeReference\": null,\n", + " \"supportsCoordinatesQuantization\": true\n", + "})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0].metadata['layer_properties'].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1d132b7d-5a13-4d66-98e8-785ffdf87af0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"OBJECTID\": 2, \"AccessName\": \"27TH AV\", \"AccessID\": \"NS-141\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3600 BLK S ATLANTIC AV\", \"MilePost\": 4.83, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"BOTH\"}\n", + "{\"OBJECTID\": 7, \"AccessName\": \"BEACHWAY AV\", \"AccessID\": \"NS-106\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1400 N ATLANTIC AV\", \"MilePost\": 1.57, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 10, \"AccessName\": \"SEABREEZE BLVD\", \"AccessID\": \"DB-051\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"500 BLK N ATLANTIC AV\", \"MilePost\": 14.24, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691394892000, \"DrivingZone\": \"BOTH\"}\n", + "{\"OBJECTID\": 13, \"AccessName\": \"GRANADA BLVD\", \"AccessID\": \"OB-030\", \"AccessType\": \"OPEN VEHICLE RAMP\", 
\"GeneralLoc\": \"20 BLK OCEAN SHORE BLVD\", \"MilePost\": 10.02, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"4X4 ONLY\", \"Entry_Date_Time\": 1691394952000, \"DrivingZone\": \"BOTH\"}\n", + "{\"OBJECTID\": 16, \"AccessName\": \"INTERNATIONAL SPEEDWAY BLVD\", \"AccessID\": \"DB-059\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"300 BLK S ATLANTIC AV\", \"MilePost\": 15.27, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691395174000, \"DrivingZone\": \"BOTH\"}\n", + "{\"OBJECTID\": 26, \"AccessName\": \"UNIVERSITY BLVD\", \"AccessID\": \"DB-048\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"900 BLK N ATLANTIC AV\", \"MilePost\": 13.74, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691394892000, \"DrivingZone\": \"BOTH\"}\n", + "{\"OBJECTID\": 36, \"AccessName\": \"BEACH ST\", \"AccessID\": \"PI-097\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"4890 BLK S ATLANTIC AV\", \"MilePost\": 25.85, \"City\": \"PONCE INLET\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"BOTH\"}\n", + "{\"OBJECTID\": 40, \"AccessName\": \"BOTEFUHR AV\", \"AccessID\": \"DBS-067\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1900 BLK S ATLANTIC AV\", \"MilePost\": 16.68, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"4X4 ONLY\", \"Entry_Date_Time\": 1691395124000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 41, \"AccessName\": \"SILVER BEACH AV\", \"AccessID\": \"DB-064\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1000 BLK S ATLANTIC AV\", \"MilePost\": 15.98, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691395174000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 50, \"AccessName\": \"3RD AV\", \"AccessID\": \"NS-118\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1200 BLK HILL ST\", \"MilePost\": 3.25, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"OPEN\", 
\"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 58, \"AccessName\": \"DUNLAWTON BLVD\", \"AccessID\": \"DBS-078\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3400 BLK S ATLANTIC AV\", \"MilePost\": 20.61, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 63, \"AccessName\": \"MILSAP RD\", \"AccessID\": \"OB-037\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"700 BLK S ATLANTIC AV\", \"MilePost\": 11.52, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"4X4 ONLY\", \"Entry_Date_Time\": 1691394952000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 68, \"AccessName\": \"EMILIA AV\", \"AccessID\": \"DBS-082\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3790 BLK S ATLANTIC AV\", \"MilePost\": 21.38, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"BOTH\"}\n", + "{\"OBJECTID\": 92, \"AccessName\": \"FLAGLER AV\", \"AccessID\": \"NS-110\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"500 BLK FLAGLER AV\", \"MilePost\": 2.57, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 94, \"AccessName\": \"CRAWFORD RD\", \"AccessID\": \"NS-108\", \"AccessType\": \"OPEN VEHICLE RAMP - PASS\", \"GeneralLoc\": \"800 BLK N ATLANTIC AV\", \"MilePost\": 2.19, \"City\": \"NEW SMYRNA BEACH\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 122, \"AccessName\": \"HARTFORD AV\", \"AccessID\": \"DB-043\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"1890 BLK N ATLANTIC AV\", \"MilePost\": 12.76, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"CLOSED - SEASONAL\", \"Entry_Date_Time\": 1691394832000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 125, \"AccessName\": 
\"WILLIAMS AV\", \"AccessID\": \"DB-042\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"2200 BLK N ATLANTIC AV\", \"MilePost\": 12.5, \"City\": \"DAYTONA BEACH\", \"AccessStatus\": \"4X4 ONLY\", \"Entry_Date_Time\": 1691394952000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 134, \"AccessName\": \"CARDINAL DR\", \"AccessID\": \"OB-036\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"600 BLK S ATLANTIC AV\", \"MilePost\": 11.27, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"4X4 ONLY\", \"Entry_Date_Time\": 1691394952000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 229, \"AccessName\": \"EL PORTAL ST\", \"AccessID\": \"DBS-076\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3200 BLK S ATLANTIC AV\", \"MilePost\": 20.04, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 230, \"AccessName\": \"HARVARD DR\", \"AccessID\": \"OB-038\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"900 BLK S ATLANTIC AV\", \"MilePost\": 11.72, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"4X4 ONLY\", \"Entry_Date_Time\": 1691394952000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 232, \"AccessName\": \"VAN AV\", \"AccessID\": \"DBS-075\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"3100 BLK S ATLANTIC AV\", \"MilePost\": 19.6, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"OPEN\", \"Entry_Date_Time\": 1691397348000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 233, \"AccessName\": \"ROCKEFELLER DR\", \"AccessID\": \"OB-034\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"400 BLK S ATLANTIC AV\", \"MilePost\": 10.9, \"City\": \"ORMOND BEACH\", \"AccessStatus\": \"CLOSED - SEASONAL\", \"Entry_Date_Time\": 1691394832000, \"DrivingZone\": \"YES\"}\n", + "{\"OBJECTID\": 235, \"AccessName\": \"MINERVA RD\", \"AccessID\": \"DBS-069\", \"AccessType\": \"OPEN VEHICLE RAMP\", \"GeneralLoc\": \"2300 BLK S 
ATLANTIC AV\", \"MilePost\": 17.52, \"City\": \"DAYTONA BEACH SHORES\", \"AccessStatus\": \"4X4 ONLY\", \"Entry_Date_Time\": 1691395124000, \"DrivingZone\": \"YES\"}\n" + ] + } + ], + "source": [ + "for doc in docs:\n", + " print(doc.page_content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py index 586db500ac..237c5b539a 100644 --- a/libs/langchain/langchain/document_loaders/__init__.py +++ b/libs/langchain/langchain/document_loaders/__init__.py @@ -29,6 +29,7 @@ from langchain.document_loaders.airbyte import ( from langchain.document_loaders.airbyte_json import AirbyteJSONLoader from langchain.document_loaders.airtable import AirtableLoader from langchain.document_loaders.apify_dataset import ApifyDatasetLoader +from langchain.document_loaders.arcgis_loader import ArcGISLoader from langchain.document_loaders.arxiv import ArxivLoader from langchain.document_loaders.async_html import AsyncHtmlLoader from langchain.document_loaders.azlyrics import AZLyricsLoader @@ -214,6 +215,7 @@ __all__ = [ "AirtableLoader", "AmazonTextractPDFLoader", "ApifyDatasetLoader", + "ArcGISLoader", "ArxivLoader", "AsyncHtmlLoader", "AzureBlobStorageContainerLoader", diff --git a/libs/langchain/langchain/document_loaders/arcgis_loader.py b/libs/langchain/langchain/document_loaders/arcgis_loader.py new file mode 100644 index 0000000000..e1a4291b8a --- /dev/null +++ b/libs/langchain/langchain/document_loaders/arcgis_loader.py @@ -0,0 +1,129 @@ +"""Document Loader for ArcGIS 
FeatureLayers."""
+from __future__ import annotations
+
+import json
+import re
+import warnings
+from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+if TYPE_CHECKING:
+    import arcgis
+
+_NOT_PROVIDED = "(Not Provided)"
+
+
+class ArcGISLoader(BaseLoader):
+    """Load records from an ArcGIS FeatureLayer.
+
+    Every feature returned by the layer query becomes one ``Document`` whose
+    ``page_content`` is the JSON-encoded attribute dictionary of the feature.
+    The layer URL, the layer and item descriptions and the raw layer
+    properties are attached to each document as metadata.
+    """
+
+    def __init__(
+        self,
+        layer: Union[str, arcgis.features.FeatureLayer],
+        gis: Optional[arcgis.gis.GIS] = None,
+        where: str = "1=1",
+        out_fields: Optional[Union[List[str], str]] = None,
+        return_geometry: bool = False,
+        **kwargs: Any,
+    ):
+        """Create the loader and read the layer's descriptive properties.
+
+        Args:
+            layer: URL of a FeatureLayer, or an already constructed
+                FeatureLayer object (its ``url`` attribute is recorded).
+            gis: GIS connection to use. When omitted, ``arcgis.gis.GIS()`` is
+                created without arguments.
+            where: ``where`` clause passed to the layer query.
+            out_fields: Fields to request, either a list of names or a
+                comma-separated string. ``None`` requests all fields ("*").
+            return_geometry: Passed to the layer query. Note that only the
+                feature attributes are copied into the produced documents.
+            **kwargs: Extra keyword arguments forwarded to the layer query.
+
+        Raises:
+            ImportError: If the ``arcgis`` package is not installed.
+        """
+        try:
+            import arcgis
+        except ImportError as e:
+            raise ImportError(
+                "arcgis is required to use the ArcGIS Loader. "
+                "Install it with pip or conda."
+            ) from e
+
+        try:
+            from bs4 import BeautifulSoup  # type: ignore
+
+            self.BEAUTIFULSOUP = BeautifulSoup
+        except ImportError:
+            warnings.warn("BeautifulSoup not found. HTML will not be parsed.")
+            self.BEAUTIFULSOUP = None
+
+        self.gis = gis or arcgis.gis.GIS()
+
+        if isinstance(layer, str):
+            self.url = layer
+            # Pass the resolved connection (not the possibly-None argument) so
+            # the layer and the item lookup below share the same GIS.
+            self.layer = arcgis.features.FeatureLayer(layer, gis=self.gis)
+        else:
+            self.url = layer.url
+            self.layer = layer
+
+        self.layer_properties = self._get_layer_properties()
+
+        self.where = where
+
+        if isinstance(out_fields, str):
+            self.out_fields = out_fields
+        elif out_fields is None:
+            self.out_fields = "*"
+        else:
+            self.out_fields = ",".join(out_fields)
+
+        self.return_geometry = return_geometry
+        self.kwargs = kwargs
+
+    def _plain_text(self, raw: Optional[str]) -> str:
+        """Return a description as plain text, or the placeholder if empty.
+
+        HTML is stripped only when BeautifulSoup is available; otherwise the
+        raw value is returned unchanged.
+        """
+        # A description key can be present but null/empty; parsing None with
+        # BeautifulSoup would raise, so short-circuit first.
+        if not raw:
+            return _NOT_PROVIDED
+        if self.BEAUTIFULSOUP:
+            # Name the parser explicitly so the result does not depend on
+            # which optional parsers happen to be installed.
+            raw = self.BEAUTIFULSOUP(raw, "html.parser").text
+        return raw or _NOT_PROVIDED
+
+    def _get_layer_properties(self) -> dict:
+        """Get the layer properties from the FeatureLayer.
+
+        Returns:
+            A dict with the keys ``layer_description``, ``item_description``
+            and ``layer_properties``. A description that is missing or empty
+            is replaced by ``"(Not Provided)"``.
+        """
+        props = self.layer.properties
+
+        try:
+            lyr_desc = self._plain_text(props["description"])
+        except KeyError:
+            lyr_desc = _NOT_PROVIDED
+
+        try:
+            item_id = props["serviceItemId"]
+            item = self.gis.content.get(item_id)
+            if not item:
+                # ``arcgis`` is only bound at module level for type checking,
+                # so import it here; __init__ already verified it is installed.
+                import arcgis
+
+                # Fall back to the parent service: the layer URL with the
+                # trailing "/<layer number>" removed.
+                item = arcgis.features.FeatureLayer(
+                    re.sub(r"/\d+$", "", self.url),
+                    gis=self.gis,
+                )
+            try:
+                raw_desc = item.description
+            except AttributeError:
+                # The fallback object exposes its description (if any) through
+                # ``properties``; treat an absent one as not provided.
+                raw_desc = getattr(item.properties, "description", None)
+            item_desc = self._plain_text(raw_desc)
+        except KeyError:
+            item_desc = _NOT_PROVIDED
+
+        return {
+            "layer_description": lyr_desc,
+            "item_description": item_desc,
+            "layer_properties": props,
+        }
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy load records from FeatureLayer.
+
+        Yields:
+            One ``Document`` per feature in the query response, with the
+            feature attributes serialized as JSON in ``page_content``.
+        """
+        # ``return_all_records`` defaults to True but may be overridden via the
+        # constructor's **kwargs without causing a duplicate-keyword error.
+        query_kwargs = {"return_all_records": True, **self.kwargs}
+        query_response = self.layer.query(
+            where=self.where,
+            out_fields=self.out_fields,
+            return_geometry=self.return_geometry,
+            **query_kwargs,
+        )
+        for feature in query_response:
+            yield Document(
+                page_content=json.dumps(feature.as_dict["attributes"]),
+                metadata={
+                    "url": self.url,
+                    "layer_description": self.layer_properties["layer_description"],
+                    "item_description": self.layer_properties["item_description"],
+                    "layer_properties": self.layer_properties["layer_properties"],
+                },
+            )
+
+    def load(self) -> List[Document]:
+        """Load all records from FeatureLayer."""
+        return list(self.lazy_load())
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_arcgis_loader.py b/libs/langchain/tests/unit_tests/document_loaders/test_arcgis_loader.py
new file mode 100644
index 0000000000..645cb99e8a
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_arcgis_loader.py
@@ -0,0 +1,47 @@
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from langchain.document_loaders import ArcGISLoader
+
+
+@pytest.fixture
+def arcgis_mocks(mock_feature_layer, mock_gis):  # type: ignore
+    sys_modules = {
+        "arcgis": MagicMock(),
+        "arcgis.features.FeatureLayer": mock_feature_layer,
+        "arcgis.gis.GIS": mock_gis,
+    }
+    with patch.dict("sys.modules", sys_modules):
+        yield
+
+
+@pytest.fixture
+def mock_feature_layer():  # type: ignore
+    feature_layer = MagicMock()
+    feature_layer.query.return_value = [
+        MagicMock(as_dict={"attributes": {"field": "value"}})
+    ]
+    feature_layer.url = "https://example.com/layer_url"
+    feature_layer.properties = {
+        "description": "Some HTML content"
+    }
+    return feature_layer
+
+
+@pytest.fixture
+def mock_gis():  # type: ignore
+    gis = MagicMock()
+    gis.content.get.return_value = MagicMock(description="Item description")
+    return gis
+
+
+def test_lazy_load(arcgis_mocks, mock_feature_layer, mock_gis):  # type: ignore
+    loader = ArcGISLoader(layer=mock_feature_layer, gis=mock_gis)
+    loader.BEAUTIFULSOUP = None
+
+    documents = list(loader.lazy_load())
+
+    assert len(documents) == 1
+    assert documents[0].metadata["url"] == "https://example.com/layer_url"
+    # Add more assertions based on your expected behavior