mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
200 lines
112 KiB
Plaintext
200 lines
112 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "ca4c8c2a",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Geopandas\n",
|
||
|
"\n",
|
||
|
"[GeoPandas](https://geopandas.org/en/stable/index.html) is an open-source project to make working with geospatial data in Python easier. \n",
|
||
|
"\n",
|
||
|
"GeoPandas extends the datatypes used by pandas to allow spatial operations on geometric types. \n",
|
||
|
"\n",
|
||
|
"Geometric operations are performed by shapely. Geopandas further depends on fiona for file access and matplotlib for plotting.\n",
|
||
|
"\n",
|
||
|
"LLM applications (chat, QA) that utilize geospatial data are an interesting area for exploration."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "00b3bf80",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"! pip install sodapy\n",
|
||
|
"! pip install pandas\n",
|
||
|
"! pip install geopandas"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"id": "cecc9320",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import ast\n",
|
||
|
"import pandas as pd\n",
|
||
|
"import geopandas as gpd\n",
|
||
|
"from langchain.document_loaders import OpenCityDataLoader"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "04981332",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Create a GeoPandas dataframe from [`Open City Data`](https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/open_city_data) as an example input."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "5e7de46b",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Load Open City Data\n",
|
||
|
"dataset = \"tmnf-yvry\" # San Francisco crime data\n",
|
||
|
"loader = OpenCityDataLoader(city_id=\"data.sfgov.org\", dataset_id=dataset, limit=5000)\n",
|
||
|
"docs = loader.load()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 30,
|
||
|
"id": "7cda2e38",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Convert list of dictionaries to DataFrame\n",
|
||
|
"df = pd.DataFrame([ast.literal_eval(d.page_content) for d in docs])\n",
|
||
|
"\n",
|
||
|
"# Extract latitude and longitude\n",
|
||
|
"df[\"Latitude\"] = df[\"location\"].apply(lambda loc: loc[\"coordinates\"][1])\n",
|
||
|
"df[\"Longitude\"] = df[\"location\"].apply(lambda loc: loc[\"coordinates\"][0])\n",
|
||
|
"\n",
|
||
|
"# Create geopandas DF\n",
|
||
|
"gdf = gpd.GeoDataFrame(\n",
|
||
|
" df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude), crs=\"EPSG:4326\"\n",
|
||
|
")\n",
|
||
|
"\n",
|
||
|
"# Only keep valid longitudes and latitudes for San Francisco\n",
|
||
|
"gdf = gdf[\n",
|
||
|
" (gdf[\"Longitude\"] >= -123.173825)\n",
|
||
|
" & (gdf[\"Longitude\"] <= -122.281780)\n",
|
||
|
" & (gdf[\"Latitude\"] >= 37.623983)\n",
|
||
|
" & (gdf[\"Latitude\"] <= 37.929824)\n",
|
||
|
"]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "030a535c",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Visualization of the sample of SF crime data. "
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 31,
|
||
|
"id": "8148a63e",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0wAAALUCAYAAAA437ItAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9eXxU1f3//55kgGCBJLjUhSVErStitVpBtgBJlbb4rfbTz+/TQvlW2360te2ntqUaSUJA0aK1tYutrUtRtK3a2tIaFWQLCK4VRD9qUXaXSgUCaAKZ5Pz+eH+P994z5+73ztyZeT0fj3nM3Jm7nO2eOe/73lJCCEEAAAAAAAAAALIoy3cBAAAAAAAAACCpQGACAAAAAAAAABsgMAEAAAAAAACADRCYAAAAAAAAAMAGCEwAAAAAAAAAYAMEJgAAAAAAAACwAQITAAAAAAAAANgAgQkAAAAAAAAAbIDABAAAAAAAAAA2QGACAAAAAAAAABt8CUy/+tWv6IwzzqBBgwbRoEGDaPTo0fToo48SEdHWrVsplUppXw8++KDtOQ8cOEBXXnklDRkyhPr370+nnnoq/frXv/7w9927d9O3vvUtOumkk6h///40bNgw+va3v00dHR0BqwwAAAAAAAAA3kj72XnIkCF044030oknnkhCCFq4cCFddNFF9MILL9DJJ59Mb7/9tmX/3/zmN3TTTTfRhRdeaHvOq666ipYvX06LFi2impoaWrJkCX3jG9+gY489lqZNm0ZvvfUWvfXWW3TzzTfTqaeeStu2baPLL7+c3nrrLXrooYeC1RoAAAAAAAAAPJASQogwJxg8eDDddNNNdNlll2X99vGPf5zOOussuvPOO22PP/300+k///M/qamp6cPvzj77bLrwwgvpuuuu0x7z4IMP0vTp0+n999+ndNqXzAcAAAAAAAAAngksbfT09NCDDz5I77//Po0ePTrr9+eff57Wr19Pv/zlLx3PM2bMGFq8eDFdeumldOyxx9LKlSvpn//8J/3kJz+xPaajo4MGDRrkKCwdPHiQDh48+OF2b28v7d69mw4//HBKpVIeaggAAAAAAAAoRoQQtH//fjr22GOprMzFS0n45MUXXxQf+chHRHl5uaisrBSPPPKIdr8rrrhCnHLKKa7n6+rqEl/+8pcFEYl0Oi369u0rFi5caLv/rl27xLBhw0RjY6PjeVtaWgQR4YUXXnjhhRdeeOGFF154aV87duxwlVd8m+QdOnSItm/fTh0dHfTQQw/RHXfcQatWraJTTz31w306OzvpmGOOoaamJvre977neL6bb76Zfvvb39LNN99Mw4cPp/b2drrmmmvo4YcfpilTplj23bdvH9XX19PgwYNp8eLF1KdPH9vzqhqmjo4OGjZsGO3YsYMGDRrkp8oAAAAAAACAImLfvn00dOhQ2rt3L1VWVjruG9qHacqUKXT88cfT7bff/uF39957L1122WX05ptv0pFHHml7bGdnJ1VWVtLDDz9Mn/70pz/8/qtf/Srt3LmTHnvssQ+/279/P33qU5+iww47jP7+979TRUWFr3Lu27ePKisrPzTnAwAAAAAAAJQmfmSD0HmYent7LZocIqI777yTpk2b5igsERF1d3dTd3d3lt1geXk59fb2fri9b98+amhooL59+9LixYt9C0sAAAAAAAAAEARfQR+uueYauvDCC2nYsGG0f/9+uv/++2nlypX0+OOPf7jP66+/Tu3t7dTW1qY9x8knn0w33HADfe5zn6NBgwbRhAkT6Ac/+AH179+fhg8fTqtWraJ77rmHbrnlFiIyhKUPPviAFi1aRPv27aN9+/YREdGRRx5J5eXlQesOAAAAAAAAAI74Epjeffdd+vKXv0xvv/02VVZW0hlnnEGPP/441dfXf7jPXXfdRUOGDKGGhgbtOV577TVL0tk//OEPdM0119CXvvQl2r17Nw0fPpyuv/56uvzyy4mI6B//+Ac9/f
TTRER0wgknWM61ZcsWqqmp8VMFAAAAAAAAAPBMaB+mQgE+TAAAAAAAAACiHPswAQAAAAAAAECxAoEJAAAAAAAAAGyAwAQAAAAAAAAANkBgAgAAAAAAAAAbIDABAAAAAAAAgA0QmAAAAAAAAADABghMAAAAAAAAAGADBCYAAAAAAAAAsAECEwAAAAAAAADYAIEJAAAAAAAAAGyAwAQAAAAAAAAANkBgAgAAAAAAAAAbIDABAAAAAAAAgA0QmAAAAAAAAADABghMAAAAAAAAAGADBCYAAAAAAAAAsAECEwAAAAAAAADYAIEJAAAAAAAAAGyAwAQAAAAAAAAANkBgAgAAAAAAAAAbIDABAAAAAAAAgA0QmAAAAAAAAADABghMAAAAAAAA5ItMhmjuXKKGBn7PZPJdIqCQzncBAAAAAAAAKFnmzyeaM4dICKInnuDvmpvzWiRgBRomAAAAAAAA8sWaNSwsEfH7mjX5LQ/IAgITAAAAAAAA+WLsWKJUij+nUrwNEgVM8gAAAAAAAMgXjY38vmYNC0tyGyQGCEwAAAAAAADki3QaPksJByZ5AAAAAAAAAGADBCYAAAAAAAAAsAECEwAAAAAAAADYAIEJAAAAAAAAAGyAwAQAAAAAAAAANkBgAgAAAAAAAAAbIDABAAAAAAAAgA0QmAAAAAAAAADABghMAAAAAAAAAGADBCYAAAAAAAAAsAECEwAAAAAAAADYAIEJAAAAAAAAAGyAwAQAAAAAAAAANkBgAgAAAAAAAAAbIDABAAAAAAAAgA0QmAAAAAAAAADABghMAAAAAAAAAGADBCYAAAAAAAAAsAECEwAAAAAAAADYAIEJAAAAAACAUiCTIZo7l6ihgd8zmXyXqCBI57sAAAAAAAAAgBwwfz7RnDlEQhA98QR/19yc1yIVAtAwAQAAAAAAUAqsWcPCEhG/r1mT3/IUCNAwAQAAAAAAUKy8+irRKadkf59KEY0dm/vyFCAQmAAAAAAAAChWdMJSfT0LS42NuS9PAQKBCQAAAAAAgFJiyZJ8l6CggA8TAAAAAAAAANgAgQkAAAAAAIBi5ZVXnLeBKzDJAwAAAAAAoFg5+WQjMh4IBDRMAAAAAIiWri6iSZOIDj+c37u6cnMsAADEAAQmAAAAAETL1KlEK1YQ7d7N71OnWn/PZIjmziVqaCC69lqiESOI+vTh94YG52NffZXDIcvXq6/mrl4AgJIkJURp6Oj27dtHlZWV1NHRQYMGDcp3cQAAAIDi5fDDWeCRVFcTnXkm0YYNRKNGcTjj667TmwmVlRH19hrbgwcTvfeesZ1KZR9TGksZAECE+JEN4MMEAAAAgPBkMkTz5xOtWUM0cKBVYOrtZW0REb+vX28v5JiFJSIWsIKWQ+aZSWO5AwAIDmYQAAAAAIRn/nyiOXP0gtDBg9btri7WFOn2HT6cqLbW0Ea1tQUvxxNP8HfNzf7OAQAAJiAwAQAAAMAdN83NmjX2WqP+/a3BG849lwM6rFlDdM45RPffT7RzJ9GQIUQbNxINGGBfjldeITrlFOu2GXM5hCBqb2d/KWicAAABwYwBAAAAlCp+zNfcNDdjx/L3qtCUShFdeSVfw6w1qqgw9mltNcpxyy3O5XALkWwuRyrFJn7QOAEAQgCBCQAAAChV/JivqZqbNWusvzc2GvuNGcP7rFvnTasTpRmduRxjxxKtXu1cbgAAcAECEwAAAFCquAlBZg1UJmP4HaVSLIyYSaeDCzlu5fCDWo65c4mWLbMvNwAAuIA8TAAAAECpMnasEaZbJ0xIzc/SpRzdbsIEDsgwYgRRTw/7Jcl8SnPnslAVRzn8Ys7z1NtL1NREVF/PdZEaKAkS5QIAXICGCQAAAChVVPM1VZhQAzls3060ZQt/N28em7utXBnelM6tHH5RTfzmzCFaskS/r0yyS2Qkyl2+PNz1AQBFBQQmAAAAoFRxM6NTAygQWU3nNmyIxpQujDmfDj8mfhs2OG8DAEoemOQBAP
xhNnV57jk2ywEAFCeNjaydkeZs06dbTedGjYrWlC4q/Jj4qYlxR46MxswQAFA0QMMEAPCHztQFIXoBKE5UzU8mQ1R
|
||
|
"text/plain": [
|
||
|
"<Figure size 1000x1000 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"\n",
|
||
|
"# Load San Francisco map data\n",
|
||
|
"sf = gpd.read_file(\"https://data.sfgov.org/resource/3psu-pn9h.geojson\")\n",
|
||
|
"\n",
|
||
|
"# Plot the San Francisco map and the points\n",
|
||
|
"fig, ax = plt.subplots(figsize=(10, 10))\n",
|
||
|
"sf.plot(ax=ax, color=\"white\", edgecolor=\"black\")\n",
|
||
|
"gdf.plot(ax=ax, color=\"red\", markersize=5)\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "a081a9d1",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Load GeoPandas dataframe as a `Document` for downstream processing (embedding, chat, etc). \n",
|
||
|
"\n",
|
||
|
"The `geometry` column will be the default `page_content`, and all other columns are placed in `metadata`.\n",
|
||
|
"\n",
|
||
|
"However, we can choose which column to use by specifying `page_content_column`."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 32,
|
||
|
"id": "381a5f7b",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from langchain.document_loaders import GeoDataFrameLoader\n",
|
||
|
"\n",
|
||
|
"loader = GeoDataFrameLoader(data_frame=gdf, page_content_column=\"geometry\")\n",
|
||
|
"docs = loader.load()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 33,
|
||
|
"id": "74baf6ee",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"Document(page_content='POINT (-122.420084075249 37.7083109744362)', metadata={'pdid': '4133422003074', 'incidntnum': '041334220', 'incident_code': '03074', 'category': 'ROBBERY', 'descript': 'ROBBERY, BODILY FORCE', 'dayofweek': 'Monday', 'date': '2004-11-22T00:00:00.000', 'time': '17:50', 'pddistrict': 'INGLESIDE', 'resolution': 'NONE', 'address': 'GENEVA AV / SANTOS ST', 'x': '-122.420084075249', 'y': '37.7083109744362', 'location': {'type': 'Point', 'coordinates': [-122.420084075249, 37.7083109744362]}, ':@computed_region_26cr_cadq': '9', ':@computed_region_rxqg_mtj9': '8', ':@computed_region_bh8s_q3mv': '309', ':@computed_region_6qbp_sg9q': nan, ':@computed_region_qgnn_b9vv': nan, ':@computed_region_ajp5_b2md': nan, ':@computed_region_yftq_j783': nan, ':@computed_region_p5aj_wyqh': nan, ':@computed_region_fyvs_ahh9': nan, ':@computed_region_6pnf_4xz7': nan, ':@computed_region_jwn9_ihcz': nan, ':@computed_region_9dfj_4gjx': nan, ':@computed_region_4isq_27mq': nan, ':@computed_region_pigm_ib2e': nan, ':@computed_region_9jxd_iqea': nan, ':@computed_region_6ezc_tdp2': nan, ':@computed_region_h4ep_8xdi': nan, ':@computed_region_n4xg_c4py': nan, ':@computed_region_fcz8_est8': nan, ':@computed_region_nqbw_i6c3': nan, ':@computed_region_2dwj_jsy4': nan, 'Latitude': 37.7083109744362, 'Longitude': -122.420084075249})"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 33,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"docs[0]"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.9.16"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|