You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/document_loaders/arcgis_loader.py

155 lines
5.1 KiB
Python

"""Document Loader for ArcGIS FeatureLayers."""
from __future__ import annotations
import json
import re
import warnings
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
if TYPE_CHECKING:
    # Needed only for the annotations on ArcGISLoader.__init__; at runtime
    # arcgis is imported inside the loader behind an ImportError guard.
    import arcgis

# Placeholder stored when a layer or item description is missing or empty.
_NOT_PROVIDED = "(Not Provided)"
class ArcGISLoader(BaseLoader):
    """Load records from an ArcGIS FeatureLayer.

    Every feature returned by the layer query becomes one ``Document`` whose
    ``page_content`` is the JSON-encoded attribute mapping of that feature and
    whose metadata carries the layer name, URL, descriptions and properties.
    """

    def __init__(
        self,
        layer: Union[str, arcgis.features.FeatureLayer],
        gis: Optional[arcgis.gis.GIS] = None,
        where: str = "1=1",
        out_fields: Optional[Union[List[str], str]] = None,
        return_geometry: bool = False,
        result_record_count: Optional[int] = None,
        lyr_desc: Optional[str] = None,
        **kwargs: Any,
    ):
        """Configure the loader and read the layer's descriptive properties.

        Args:
            layer: URL of a FeatureLayer, or an already constructed
                FeatureLayer object.
            gis: GIS connection to use. When omitted, ``arcgis.gis.GIS()`` is
                created with no arguments.
            where: Filter expression forwarded to the layer query.
            out_fields: Fields to request, either a list of names or a
                ready-made comma-separated string. ``None`` requests ``"*"``.
            return_geometry: Whether to request geometry and copy it into
                each document's metadata.
            result_record_count: Forwarded to the query as
                ``result_record_count``. When it is not an ``int`` the query
                is issued with ``return_all_records=True``.
            lyr_desc: Layer description to use instead of the one stored in
                the layer's properties.
            **kwargs: Extra query parameters; they override the parameters
                assembled from the arguments above.

        Raises:
            ImportError: If the ``arcgis`` package is not installed.
        """
        try:
            import arcgis
        except ImportError as e:
            raise ImportError(
                "arcgis is required to use the ArcGIS Loader. "
                "Install it with pip or conda."
            ) from e

        # BeautifulSoup is optional: without it descriptions are kept as-is.
        try:
            from bs4 import BeautifulSoup  # type: ignore

            self.BEAUTIFULSOUP = BeautifulSoup
        except ImportError:
            warnings.warn("BeautifulSoup not found. HTML will not be parsed.")
            self.BEAUTIFULSOUP = None

        self.gis = gis or arcgis.gis.GIS()

        if isinstance(layer, str):
            self.url = layer
            self.layer = arcgis.features.FeatureLayer(layer, gis=gis)
        else:
            self.url = layer.url
            self.layer = layer

        # Relies on self.gis, self.layer, self.url and self.BEAUTIFULSOUP,
        # so it must run after the assignments above.
        self.layer_properties = self._get_layer_properties(lyr_desc)

        self.where = where

        if isinstance(out_fields, str):
            self.out_fields = out_fields
        elif out_fields is None:
            self.out_fields = "*"
        else:
            self.out_fields = ",".join(out_fields)

        self.return_geometry = return_geometry
        self.result_record_count = result_record_count
        self.return_all_records = not isinstance(result_record_count, int)

        query_params = dict(
            where=self.where,
            out_fields=self.out_fields,
            return_geometry=self.return_geometry,
            return_all_records=self.return_all_records,
            result_record_count=self.result_record_count,
        )
        # Caller-supplied keyword arguments take precedence.
        query_params.update(kwargs)
        self.query_params = query_params

    def _plain_text(self, raw: Optional[str]) -> str:
        """Return ``raw`` with HTML stripped, or the placeholder when empty.

        A missing (``None``) or empty description is never handed to
        BeautifulSoup; it maps straight to the placeholder.
        """
        if raw and self.BEAUTIFULSOUP:
            raw = self.BEAUTIFULSOUP(raw).text
        return raw or _NOT_PROVIDED

    def _get_layer_properties(self, lyr_desc: Optional[str] = None) -> dict:
        """Get the layer properties from the FeatureLayer.

        Args:
            lyr_desc: Caller-supplied layer description. When ``None`` the
                description is taken from the layer's own properties.

        Returns:
            A dict with the keys ``layer_description``, ``item_description``
            and ``layer_properties``.
        """
        import arcgis

        # Matches the trailing "/<number>" segment of a layer URL.
        layer_number_pattern = re.compile(r"/\d+$")
        props = self.layer.properties

        if lyr_desc is None:
            # retrieve description from the FeatureLayer if not provided
            try:
                lyr_desc = self._plain_text(props["description"])
            except KeyError:
                lyr_desc = _NOT_PROVIDED

        try:
            item_id = props["serviceItemId"]
            # Fall back to a FeatureLayer built from the URL without its
            # trailing layer number when the content lookup returns nothing.
            item = self.gis.content.get(item_id) or arcgis.features.FeatureLayer(
                re.sub(layer_number_pattern, "", self.url),
            )
            try:
                raw_desc = item.description
            except AttributeError:
                raw_desc = item.properties.description
            item_desc = self._plain_text(raw_desc)
        except KeyError:
            item_desc = _NOT_PROVIDED

        return {
            "layer_description": lyr_desc,
            "item_description": item_desc,
            "layer_properties": props,
        }

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load records from FeatureLayer.

        Yields:
            One ``Document`` per feature in the query response. The page
            content is the JSON dump of the feature's attributes; metadata
            holds the access time (UTC, ``Z`` suffix), layer name, URL,
            descriptions, layer properties and, when requested and present,
            the geometry.
        """
        query_response = self.layer.query(**self.query_params)
        features = (feature.as_dict for feature in query_response)
        layer_info = self.layer_properties

        for feature in features:
            attributes = feature["attributes"]
            page_content = json.dumps(attributes)

            # isoformat() of an aware UTC datetime already ends in "+00:00";
            # swap that offset for "Z" rather than appending a second
            # timezone designator.
            accessed = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

            metadata = {
                "accessed": accessed,
                "name": layer_info["layer_properties"]["name"],
                "url": self.url,
                "layer_description": layer_info["layer_description"],
                "item_description": layer_info["item_description"],
                "layer_properties": layer_info["layer_properties"],
            }

            if self.return_geometry:
                try:
                    metadata["geometry"] = feature["geometry"]
                except KeyError:
                    warnings.warn(
                        "Geometry could not be retrieved from the feature layer."
                    )

            yield Document(page_content=page_content, metadata=metadata)

    def load(self) -> List[Document]:
        """Load all records from FeatureLayer."""
        return list(self.lazy_load())