docs: Updated docs for sitemap loader to use correct URL (#17395)

- **Description:** Updated the URL for the sitemap loader from
  "https://langchain.readthedocs.io/sitemap.xml" to
  "https://api.python.langchain.com/sitemap.xml".
- **Issue:** Fixes #17236
pull/17348/head^2
Pennlaine 8 months ago committed by GitHub
parent bd0ad6637a
commit e1bc623f8f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -13,27 +13,16 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nest_asyncio in /Users/tasp/Code/projects/langchain/.venv/lib/python3.10/site-packages (1.5.6)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"outputs": [],
"source": [
"%pip install --upgrade --quiet nest_asyncio"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@ -54,11 +43,11 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sitemap_loader = SitemapLoader(web_path=\"https://langchain.readthedocs.io/sitemap.xml\")\n",
"sitemap_loader = SitemapLoader(web_path=\"https://api.python.langchain.com/sitemap.xml\")\n",
"\n",
"docs = sitemap_loader.load()"
]
@ -90,7 +79,7 @@
{
"data": {
"text/plain": [
"Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2023-10-13T18:13:26.966937+00:00', 'changefreq': 'weekly', 'priority': '1'})"
"Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2024-02-09T01:10:49.422114+00:00', 'changefreq': 'weekly', 'priority': '1'})"
]
},
"execution_count": 6,
@ -113,20 +102,12 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching pages: 100%|##########| 1/1 [00:00<00:00, 16.39it/s]\n"
]
}
],
"outputs": [],
"source": [
"loader = SitemapLoader(\n",
" web_path=\"https://langchain.readthedocs.io/sitemap.xml\",\n",
" web_path=\"https://api.python.langchain.com/sitemap.xml\",\n",
" filter_urls=[\"https://api.python.langchain.com/en/latest\"],\n",
")\n",
"documents = loader.load()"
@ -134,7 +115,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 8,
"metadata": {
"scrolled": true
},
@ -142,10 +123,10 @@
{
"data": {
"text/plain": [
"Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/latest/', 'loc': 'https://api.python.langchain.com/en/latest/', 'lastmod': '2023-10-13T18:09:58.478681+00:00', 'changefreq': 'daily', 'priority': '0.9'})"
"Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/latest/', 'loc': 'https://api.python.langchain.com/en/latest/', 'lastmod': '2024-02-12T05:26:10.971077+00:00', 'changefreq': 'daily', 'priority': '0.9'})"
]
},
"execution_count": 28,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -183,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@ -211,12 +192,12 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"loader = SitemapLoader(\n",
" \"https://langchain.readthedocs.io/sitemap.xml\",\n",
" \"https://api.python.langchain.com/sitemap.xml\",\n",
" filter_urls=[\"https://api.python.langchain.com/en/latest/\"],\n",
" parsing_function=remove_nav_and_header_elements,\n",
")"
@ -233,17 +214,9 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching pages: 100%|##########| 3/3 [00:00<00:00, 12.46it/s]\n"
]
}
],
"outputs": [],
"source": [
"sitemap_loader = SitemapLoader(web_path=\"example_data/sitemap.xml\", is_local=True)\n",
"\n",

Loading…
Cancel
Save