From 72ad448daa60937ef8424d2e5d90d4d50d7af7f5 Mon Sep 17 00:00:00 2001 From: Lee Date: Fri, 10 Nov 2023 17:21:55 -0500 Subject: [PATCH] feat: Docusaurus Loader (#9138) Added a Docusaurus Loader Issue: #6353 I had to implement this for working with the Ionic documentation, and wanted to open this up as a draft to get some guidance on building this out further. I wasn't sure if having it be a light extension of the SitemapLoader was in the spirit of a proper feature for the library -- but I'm grateful for the opportunities Langchain has given me and I'd love to build this out properly for the sake of the community. Any feedback welcome! --- .../document_loaders/docusaurus.ipynb | 243 ++++++++++++++++++ .../langchain/document_loaders/__init__.py | 2 + .../langchain/document_loaders/docusaurus.py | 49 ++++ .../document_loaders/test_docusaurus.py | 43 ++++ .../examples/docusaurus-sitemap.xml | 42 +++ 5 files changed, 379 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/docusaurus.ipynb create mode 100644 libs/langchain/langchain/document_loaders/docusaurus.py create mode 100644 libs/langchain/tests/integration_tests/document_loaders/test_docusaurus.py create mode 100644 libs/langchain/tests/integration_tests/examples/docusaurus-sitemap.xml diff --git a/docs/docs/integrations/document_loaders/docusaurus.ipynb b/docs/docs/integrations/document_loaders/docusaurus.ipynb new file mode 100644 index 0000000000..ca953cb668 --- /dev/null +++ b/docs/docs/integrations/document_loaders/docusaurus.ipynb @@ -0,0 +1,243 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Docusaurus\n", + "> [Docusaurus](https://docusaurus.io/) is a static-site generator that provides out-of-the-box documentation features.\n", + "\n", + "By utilizing the existing `SitemapLoader`, this loader scans and loads all pages from a given Docusaurus application and returns the main documentation content of each page as a Document." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import DocusaurusLoader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the necessary dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U beautifulsoup4 lxml" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Fixes a bug with asyncio and Jupyter\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching pages: 100%|##########| 939/939 [01:19<00:00, 11.85it/s]\n" + ] + } + ], + "source": [ + "loader = DocusaurusLoader(\"https://python.langchain.com\")\n", + "\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> `SitemapLoader` also lets you tune concurrency, which can help reduce the time it takes to load the source documentation. Refer to the [sitemap docs](/docs/integrations/document_loaders/sitemap) for more info.",
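+ "\n", + "For example, something like the following should work (a sketch; `requests_per_second` is inherited from the underlying `WebBaseLoader`, which defaults to 2 requests per second):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader = DocusaurusLoader(\"https://python.langchain.com\")\n", + "# Allow more concurrent requests than the default of 2 per second.\n", + "# Scraping too aggressively may cause the server to block you.\n", + "loader.requests_per_second = 5\n", + "docs = loader.load()"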
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content=\"\\n\\n\\n\\n\\nCookbook | 🦜️🔗 Langchain\\n\\n\\n\\n\\n\\n\\nSkip to main content🦜️🔗 LangChainDocsUse casesIntegrationsAPICommunityChat our docsLangSmithJS/TS DocsSearchCTRLKCookbookThe page you're looking for has been moved to the cookbook section of the repo as a notebook.CommunityDiscordTwitterGitHubPythonJS/TSMoreHomepageBlogCopyright © 2023 LangChain, Inc.\\n\\n\\n\\n\", metadata={'source': 'https://python.langchain.com/cookbook', 'loc': 'https://python.langchain.com/cookbook', 'changefreq': 'weekly', 'priority': '0.5'})" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering sitemap URLs\n", + "\n", + "Sitemaps can contain thousands of URLs, and often you don't need every single one of them. You can filter the URLs by passing a list of strings or regex patterns to the `filter_urls` parameter. Only URLs that match one of the patterns will be loaded." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching pages: 100%|##########| 1/1 [00:00<00:00, 5.21it/s]\n" + ] + } + ], + "source": [ + "loader = DocusaurusLoader(\n", + " \"https://python.langchain.com\",\n", + " filter_urls=[\"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"],\n", + ")\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='\\n\\n\\n\\n\\nSitemap | 🦜️🔗 Langchain\\n\\n\\n\\n\\n\\n\\nSkip to main content🦜️🔗 LangChainDocsUse casesIntegrationsAPICommunityChat our docsLangSmithJS/TS DocsSearchCTRLKProvidersAnthropicAWSGoogleMicrosoftOpenAIMoreComponentsLLMsChat modelsDocument loadersacreomAirbyte CDKAirbyte GongAirbyte HubspotAirbyte JSONAirbyte SalesforceAirbyte ShopifyAirbyte StripeAirbyte TypeformAirbyte Zendesk SupportAirtableAlibaba Cloud MaxComputeApify DatasetArcGISArxivAssemblyAI Audio TranscriptsAsync ChromiumAsyncHtmlAWS S3 DirectoryAWS S3 FileAZLyricsAzure Blob Storage ContainerAzure Blob Storage FileAzure Document IntelligenceBibTeXBiliBiliBlackboardBlockchainBrave SearchBrowserlessChatGPT DataCollege ConfidentialConcurrent LoaderConfluenceCoNLL-UCopy PasteCSVCube Semantic LayerDatadog LogsDiffbotDiscordDocugamiDropboxDuckDBEmailEmbaasEPubEtherscanEverNoteexample_dataMicrosoft ExcelFacebook ChatFaunaFigmaGeopandasGitGitBookGitHubGoogle BigQueryGoogle Cloud Storage DirectoryGoogle Cloud Storage FileGoogle DriveGrobidGutenbergHacker NewsHuawei OBS DirectoryHuawei OBS FileHuggingFace datasetiFixitImagesImage captionsIMSDbIuguJoplinJupyter NotebookLarkSuite (FeiShu)MastodonMediaWiki DumpMerge Documents LoadermhtmlMicrosoft OneDriveMicrosoft PowerPointMicrosoft SharePointMicrosoft WordModern TreasuryMongoDBNews URLNotion DB 1/2Notion DB 2/2NucliaObsidianOpen Document Format (ODT)Open City DataOrg-modePandas DataFrameAmazon TextractPolars DataFramePsychicPubMedPySparkReadTheDocs DocumentationRecursive URLRedditRoamRocksetrspaceRSS FeedsRSTSitemapSlackSnowflakeSource CodeSpreedlyStripeSubtitleTelegramTencent COS DirectoryTencent COS FileTensorFlow Datasets2MarkdownTOMLTrelloTSVTwitterUnstructured FileURLWeatherWebBaseLoaderWhatsApp 
ChatWikipediaXMLXorbits Pandas DataFrameYouTube audioYouTube transcriptsDocument transformersText embedding modelsVector storesRetrieversToolsAgents and toolkitsMemoryCallbacksChat loadersComponentsDocument loadersSitemapOn this pageSitemapExtends from the WebBaseLoader, SitemapLoader loads a sitemap from a given URL, and then scrape and load all pages in the sitemap, returning each page as a Document.The scraping is done concurrently. There are reasonable limits to concurrent requests, defaulting to 2 per second. If you aren\\'t concerned about being a good citizen, or you control the scrapped server, or don\\'t care about load. Note, while this will speed up the scraping process, but it may cause the server to block you. Be careful!pip install nest_asyncio Requirement already satisfied: nest_asyncio in /Users/tasp/Code/projects/langchain/.venv/lib/python3.10/site-packages (1.5.6) [notice] A new release of pip available: 22.3.1 -> 23.0.1 [notice] To update, run: pip install --upgrade pip# fixes a bug with asyncio and jupyterimport nest_asyncionest_asyncio.apply()from langchain.document_loaders.sitemap import SitemapLoadersitemap_loader = SitemapLoader(web_path=\"https://langchain.readthedocs.io/sitemap.xml\")docs = sitemap_loader.load()You can change the requests_per_second parameter to increase the max concurrent requests. and use requests_kwargs to pass kwargs when send requests.sitemap_loader.requests_per_second = 2# Optional: avoid `[SSL: CERTIFICATE_VERIFY_FAILED]` issuesitemap_loader.requests_kwargs = {\"verify\": False}docs[0] Document(page_content=\\'\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nWelcome to LangChain — 🦜🔗 LangChain 0.0.123\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nSkip to main content\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nCtrl+K\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n🦜🔗 LangChain 0.0.123\\\\n\\\\n\\\\n\\\\nGetting Started\\\\n\\\\nQuickstart Guide\\\\n\\\\nModules\\\\n\\\\nPrompt Templates\\\\nGetting Started\\\\nKey Concepts\\\\nHow-To Guides\\\\nCreate a custom prompt template\\\\nCreate a custom example selector\\\\nProvide few shot examples to a prompt\\\\nPrompt Serialization\\\\nExample Selectors\\\\nOutput Parsers\\\\n\\\\n\\\\nReference\\\\nPromptTemplates\\\\nExample Selector\\\\n\\\\n\\\\n\\\\n\\\\nLLMs\\\\nGetting Started\\\\nKey Concepts\\\\nHow-To Guides\\\\nGeneric Functionality\\\\nCustom LLM\\\\nFake LLM\\\\nLLM Caching\\\\nLLM Serialization\\\\nToken Usage Tracking\\\\n\\\\n\\\\nIntegrations\\\\nAI21\\\\nAleph Alpha\\\\nAnthropic\\\\nAzure OpenAI LLM Example\\\\nBanana\\\\nCerebriumAI LLM Example\\\\nCohere\\\\nDeepInfra LLM Example\\\\nForefrontAI LLM Example\\\\nGooseAI LLM Example\\\\nHugging Face Hub\\\\nManifest\\\\nModal\\\\nOpenAI\\\\nPetals LLM Example\\\\nPromptLayer OpenAI\\\\nSageMakerEndpoint\\\\nSelf-Hosted Models via Runhouse\\\\nStochasticAI\\\\nWriter\\\\n\\\\n\\\\nAsync API for LLM\\\\nStreaming with LLMs\\\\n\\\\n\\\\nReference\\\\n\\\\n\\\\nDocument Loaders\\\\nKey Concepts\\\\nHow To Guides\\\\nCoNLL-U\\\\nAirbyte JSON\\\\nAZLyrics\\\\nBlackboard\\\\nCollege Confidential\\\\nCopy Paste\\\\nCSV Loader\\\\nDirectory Loader\\\\nEmail\\\\nEverNote\\\\nFacebook Chat\\\\nFigma\\\\nGCS Directory\\\\nGCS File Storage\\\\nGitBook\\\\nGoogle Drive\\\\nGutenberg\\\\nHacker 
News\\\\nHTML\\\\niFixit\\\\nImages\\\\nIMSDb\\\\nMarkdown\\\\nNotebook\\\\nNotion\\\\nObsidian\\\\nPDF\\\\nPowerPoint\\\\nReadTheDocs Documentation\\\\nRoam\\\\ns3 Directory\\\\ns3 File\\\\nSubtitle Files\\\\nTelegram\\\\nUnstructured File Loader\\\\nURL\\\\nWeb Base\\\\nWord Documents\\\\nYouTube\\\\n\\\\n\\\\n\\\\n\\\\nUtils\\\\nKey Concepts\\\\nGeneric Utilities\\\\nBash\\\\nBing Search\\\\nGoogle Search\\\\nGoogle Serper API\\\\nIFTTT WebHooks\\\\nPython REPL\\\\nRequests\\\\nSearxNG Search API\\\\nSerpAPI\\\\nWolfram Alpha\\\\nZapier Natural Language Actions API\\\\n\\\\n\\\\nReference\\\\nPython REPL\\\\nSerpAPI\\\\nSearxNG Search\\\\nDocstore\\\\nText Splitter\\\\nEmbeddings\\\\nVectorStores\\\\n\\\\n\\\\n\\\\n\\\\nIndexes\\\\nGetting Started\\\\nKey Concepts\\\\nHow To Guides\\\\nEmbeddings\\\\nHypothetical Document Embeddings\\\\nText Splitter\\\\nVectorStores\\\\nAtlasDB\\\\nChroma\\\\nDeep Lake\\\\nElasticSearch\\\\nFAISS\\\\nMilvus\\\\nOpenSearch\\\\nPGVector\\\\nPinecone\\\\nQdrant\\\\nRedis\\\\nWeaviate\\\\nChatGPT Plugin Retriever\\\\nVectorStore Retriever\\\\nAnalyze Document\\\\nChat Index\\\\nGraph QA\\\\nQuestion Answering with Sources\\\\nQuestion Answering\\\\nSummarization\\\\nRetrieval Question/Answering\\\\nRetrieval Question Answering with Sources\\\\nVector DB Text Generation\\\\n\\\\n\\\\n\\\\n\\\\nChains\\\\nGetting Started\\\\nHow-To Guides\\\\nGeneric Chains\\\\nLoading from LangChainHub\\\\nLLM Chain\\\\nSequential Chains\\\\nSerialization\\\\nTransformation Chain\\\\n\\\\n\\\\nUtility Chains\\\\nAPI Chains\\\\nSelf-Critique Chain with Constitutional AI\\\\nBashChain\\\\nLLMCheckerChain\\\\nLLM Math\\\\nLLMRequestsChain\\\\nLLMSummarizationCheckerChain\\\\nModeration\\\\nPAL\\\\nSQLite example\\\\n\\\\n\\\\nAsync API for Chain\\\\n\\\\n\\\\nKey Concepts\\\\nReference\\\\n\\\\n\\\\nAgents\\\\nGetting Started\\\\nKey Concepts\\\\nHow-To Guides\\\\nAgents and Vectorstores\\\\nAsync API for Agent\\\\nConversation Agent (for Chat Models)\\\\nChatGPT Plugins\\\\nCustom Agent\\\\nDefining Custom Tools\\\\nHuman as a tool\\\\nIntermediate Steps\\\\nLoading from LangChainHub\\\\nMax Iterations\\\\nMulti Input Tools\\\\nSearch Tools\\\\nSerialization\\\\nAdding SharedMemory to an Agent and its Tools\\\\nCSV Agent\\\\nJSON Agent\\\\nOpenAPI Agent\\\\nPandas Dataframe Agent\\\\nPython Agent\\\\nSQL Database Agent\\\\nVectorstore Agent\\\\nMRKL\\\\nMRKL Chat\\\\nReAct\\\\nSelf Ask With Search\\\\n\\\\n\\\\nReference\\\\n\\\\n\\\\nMemory\\\\nGetting Started\\\\nKey Concepts\\\\nHow-To Guides\\\\nConversationBufferMemory\\\\nConversationBufferWindowMemory\\\\nEntity Memory\\\\nConversation Knowledge Graph Memory\\\\nConversationSummaryMemory\\\\nConversationSummaryBufferMemory\\\\nConversationTokenBufferMemory\\\\nAdding Memory To an LLMChain\\\\nAdding Memory to a Multi-Input Chain\\\\nAdding Memory to an Agent\\\\nChatGPT Clone\\\\nConversation Agent\\\\nConversational Memory Customization\\\\nCustom Memory\\\\nMultiple Memory\\\\n\\\\n\\\\n\\\\n\\\\nChat\\\\nGetting Started\\\\nKey Concepts\\\\nHow-To Guides\\\\nAgent\\\\nChat Vector DB\\\\nFew Shot Examples\\\\nMemory\\\\nPromptLayer ChatOpenAI\\\\nStreaming\\\\nRetrieval Question/Answering\\\\nRetrieval Question Answering with Sources\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nUse Cases\\\\n\\\\nAgents\\\\nChatbots\\\\nGenerate Examples\\\\nData Augmented Generation\\\\nQuestion Answering\\\\nSummarization\\\\nQuerying Tabular Data\\\\nExtraction\\\\nEvaluation\\\\nAgent Benchmarking: Search + Calculator\\\\nAgent VectorDB Question 
Answering Benchmarking\\\\nBenchmarking Template\\\\nData Augmented Question Answering\\\\nUsing Hugging Face Datasets\\\\nLLM Math\\\\nQuestion Answering Benchmarking: Paul Graham Essay\\\\nQuestion Answering Benchmarking: State of the Union Address\\\\nQA Generation\\\\nQuestion Answering\\\\nSQL Question Answering Benchmarking: Chinook\\\\n\\\\n\\\\nModel Comparison\\\\n\\\\nReference\\\\n\\\\nInstallation\\\\nIntegrations\\\\nAPI References\\\\nPrompts\\\\nPromptTemplates\\\\nExample Selector\\\\n\\\\n\\\\nUtilities\\\\nPython REPL\\\\nSerpAPI\\\\nSearxNG Search\\\\nDocstore\\\\nText Splitter\\\\nEmbeddings\\\\nVectorStores\\\\n\\\\n\\\\nChains\\\\nAgents\\\\n\\\\n\\\\n\\\\nEcosystem\\\\n\\\\nLangChain Ecosystem\\\\nAI21 Labs\\\\nAtlasDB\\\\nBanana\\\\nCerebriumAI\\\\nChroma\\\\nCohere\\\\nDeepInfra\\\\nDeep Lake\\\\nForefrontAI\\\\nGoogle Search Wrapper\\\\nGoogle Serper Wrapper\\\\nGooseAI\\\\nGraphsignal\\\\nHazy Research\\\\nHelicone\\\\nHugging Face\\\\nMilvus\\\\nModal\\\\nNLPCloud\\\\nOpenAI\\\\nOpenSearch\\\\nPetals\\\\nPGVector\\\\nPinecone\\\\nPromptLayer\\\\nQdrant\\\\nRunhouse\\\\nSearxNG Search API\\\\nSerpAPI\\\\nStochasticAI\\\\nUnstructured\\\\nWeights & Biases\\\\nWeaviate\\\\nWolfram Alpha Wrapper\\\\nWriter\\\\n\\\\n\\\\n\\\\nAdditional Resources\\\\n\\\\nLangChainHub\\\\nGlossary\\\\nLangChain Gallery\\\\nDeployments\\\\nTracing\\\\nDiscord\\\\nProduction Support\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n.rst\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n.pdf\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nWelcome to LangChain\\\\n\\\\n\\\\n\\\\n\\\\n Contents \\\\n\\\\n\\\\n\\\\nGetting Started\\\\nModules\\\\nUse Cases\\\\nReference Docs\\\\nLangChain Ecosystem\\\\nAdditional Resources\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nWelcome to LangChain#\\\\nLarge language models (LLMs) are emerging as a transformative technology, enabling\\\\ndevelopers to build applications that they previously could not.\\\\nBut using these LLMs in isolation is often not enough to\\\\ncreate a truly powerful app - the real power comes when you are able to\\\\ncombine them with other sources of computation or knowledge.\\\\nThis library is aimed at assisting in the development of those types of applications. 
Common examples of these types of applications include:\\\\n❓ Question Answering over specific documents\\\\n\\\\nDocumentation\\\\nEnd-to-end Example: Question Answering over Notion Database\\\\n\\\\n💬 Chatbots\\\\n\\\\nDocumentation\\\\nEnd-to-end Example: Chat-LangChain\\\\n\\\\n🤖 Agents\\\\n\\\\nDocumentation\\\\nEnd-to-end Example: GPT+WolframAlpha\\\\n\\\\n\\\\nGetting Started#\\\\nCheckout the below guide for a walkthrough of how to get started using LangChain to create an Language Model application.\\\\n\\\\nGetting Started Documentation\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nModules#\\\\nThere are several main modules that LangChain provides support for.\\\\nFor each module we provide some examples to get started, how-to guides, reference docs, and conceptual guides.\\\\nThese modules are, in increasing order of complexity:\\\\n\\\\nPrompts: This includes prompt management, prompt optimization, and prompt serialization.\\\\nLLMs: This includes a generic interface for all LLMs, and common utilities for working with LLMs.\\\\nDocument Loaders: This includes a standard interface for loading documents, as well as specific integrations to all types of text data sources.\\\\nUtils: Language models are often more powerful when interacting with other sources of knowledge or computation. This can include Python REPLs, embeddings, search engines, and more. LangChain provides a large collection of common utils to use in your application.\\\\nChains: Chains go beyond just a single LLM call, and are sequences of calls (whether to an LLM or a different utility). LangChain provides a standard interface for chains, lots of integrations with other tools, and end-to-end chains for common applications.\\\\nIndexes: Language models are often more powerful when combined with your own text data - this module covers best practices for doing exactly that.\\\\nAgents: Agents involve an LLM making decisions about which Actions to take, taking that Action, seeing an Observation, and repeating that until done. LangChain provides a standard interface for agents, a selection of agents to choose from, and examples of end to end agents.\\\\nMemory: Memory is the concept of persisting state between calls of a chain/agent. LangChain provides a standard interface for memory, a collection of memory implementations, and examples of chains/agents that use memory.\\\\nChat: Chat models are a variation on Language Models that expose a different API - rather than working with raw text, they work with messages. LangChain provides a standard interface for working with them and doing all the same things as above.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nUse Cases#\\\\nThe above modules can be used in a variety of ways. LangChain also provides guidance and assistance in this. Below are some of the common use cases LangChain supports.\\\\n\\\\nAgents: Agents are systems that use a language model to interact with other tools. These can be used to do more grounded question/answering, interact with APIs, or even take actions.\\\\nChatbots: Since language models are good at producing text, that makes them ideal for creating chatbots.\\\\nData Augmented Generation: Data Augmented Generation involves specific types of chains that first interact with an external datasource to fetch data to use in the generation step. 
Examples of this include summarization of long pieces of text and question/answering over specific data sources.\\\\nQuestion Answering: Answering questions over specific documents, only utilizing the information in those documents to construct an answer. A type of Data Augmented Generation.\\\\nSummarization: Summarizing longer documents into shorter, more condensed chunks of information. A type of Data Augmented Generation.\\\\nQuerying Tabular Data: If you want to understand how to use LLMs to query data that is stored in a tabular format (csvs, SQL, dataframes, etc) you should read this page.\\\\nEvaluation: Generative models are notoriously hard to evaluate with traditional metrics. One new way of evaluating them is using language models themselves to do the evaluation. LangChain provides some prompts/chains for assisting in this.\\\\nGenerate similar examples: Generating similar examples to a given input. This is a common use case for many applications, and LangChain provides some prompts/chains for assisting in this.\\\\nCompare models: Experimenting with different prompts, models, and chains is a big part of developing the best possible application. The ModelLaboratory makes it easy to do so.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nReference Docs#\\\\nAll of LangChain’s reference documentation, in one place. Full documentation on all methods, classes, installation methods, and integration setups for LangChain.\\\\n\\\\nReference Documentation\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nLangChain Ecosystem#\\\\nGuides for how other companies/products can be used with LangChain\\\\n\\\\nLangChain Ecosystem\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nAdditional Resources#\\\\nAdditional collection of resources we think may be useful as you develop your application!\\\\n\\\\nLangChainHub: The LangChainHub is a place to share and explore other prompts, chains, and agents.\\\\nGlossary: A glossary of all related terms, papers, methods, etc. Whether implemented in LangChain or not!\\\\nGallery: A collection of our favorite projects that use LangChain. Useful for finding inspiration or seeing how things were done in other applications.\\\\nDeployments: A collection of instructions, code snippets, and template repositories for deploying LangChain apps.\\\\nDiscord: Join us on our Discord to discuss all things LangChain!\\\\nTracing: A guide on using tracing in LangChain to visualize the execution of chains and agents.\\\\nProduction Support: As you move your LangChains into production, we’d love to offer more comprehensive support. Please fill out this form and we’ll set up a dedicated support Slack channel.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nnext\\\\nQuickstart Guide\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n Contents\\\\n \\\\n\\\\n\\\\nGetting Started\\\\nModules\\\\nUse Cases\\\\nReference Docs\\\\nLangChain Ecosystem\\\\nAdditional Resources\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nBy Harrison Chase\\\\n\\\\n\\\\n\\\\n\\\\n \\\\n © Copyright 2023, Harrison Chase.\\\\n \\\\n\\\\n\\\\n\\\\n\\\\n Last updated on Mar 24, 2023.\\\\n \\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\', lookup_str=\\'\\', metadata={\\'source\\': \\'https://python.langchain.com/en/stable/\\', \\'loc\\': \\'https://python.langchain.com/en/stable/\\', \\'lastmod\\': \\'2023-03-24T19:30:54.647430+00:00\\', \\'changefreq\\': \\'weekly\\', \\'priority\\': \\'1\\'}, lookup_index=0)Filtering sitemap URLs\\u200bSitemaps can be massive files, with thousands of URLs. 
Often you don\\'t need every single one of them. You can filter the URLs by passing a list of strings or regex patterns to the url_filter parameter. Only URLs that match one of the patterns will be loaded.loader = SitemapLoader( \"https://langchain.readthedocs.io/sitemap.xml\", filter_urls=[\"https://python.langchain.com/en/latest/\"],)documents = loader.load()documents[0] Document(page_content=\\'\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nWelcome to LangChain — 🦜🔗 LangChain 0.0.123\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nSkip to main content\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nCtrl+K\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n🦜🔗 LangChain 0.0.123\\\\n\\\\n\\\\n\\\\nGetting Started\\\\n\\\\nQuickstart Guide\\\\n\\\\nModules\\\\n\\\\nModels\\\\nLLMs\\\\nGetting Started\\\\nGeneric Functionality\\\\nHow to use the async API for LLMs\\\\nHow to write a custom LLM wrapper\\\\nHow (and why) to use the fake LLM\\\\nHow to cache LLM calls\\\\nHow to serialize LLM classes\\\\nHow to stream LLM responses\\\\nHow to track token usage\\\\n\\\\n\\\\nIntegrations\\\\nAI21\\\\nAleph Alpha\\\\nAnthropic\\\\nAzure OpenAI LLM Example\\\\nBanana\\\\nCerebriumAI LLM Example\\\\nCohere\\\\nDeepInfra LLM Example\\\\nForefrontAI LLM Example\\\\nGooseAI LLM Example\\\\nHugging Face Hub\\\\nManifest\\\\nModal\\\\nOpenAI\\\\nPetals LLM Example\\\\nPromptLayer OpenAI\\\\nSageMakerEndpoint\\\\nSelf-Hosted Models via Runhouse\\\\nStochasticAI\\\\nWriter\\\\n\\\\n\\\\nReference\\\\n\\\\n\\\\nChat Models\\\\nGetting Started\\\\nHow-To Guides\\\\nHow to use few shot examples\\\\nHow to stream responses\\\\n\\\\n\\\\nIntegrations\\\\nAzure\\\\nOpenAI\\\\nPromptLayer ChatOpenAI\\\\n\\\\n\\\\n\\\\n\\\\nText Embedding Models\\\\nAzureOpenAI\\\\nCohere\\\\nFake Embeddings\\\\nHugging Face Hub\\\\nInstructEmbeddings\\\\nOpenAI\\\\nSageMaker Endpoint Embeddings\\\\nSelf Hosted Embeddings\\\\nTensorflowHub\\\\n\\\\n\\\\n\\\\n\\\\nPrompts\\\\nPrompt Templates\\\\nGetting Started\\\\nHow-To Guides\\\\nHow to create a custom prompt template\\\\nHow to create a prompt template that uses few shot examples\\\\nHow to work with partial Prompt Templates\\\\nHow to serialize prompts\\\\n\\\\n\\\\nReference\\\\nPromptTemplates\\\\nExample Selector\\\\n\\\\n\\\\n\\\\n\\\\nChat Prompt Template\\\\nExample Selectors\\\\nHow to create a custom example selector\\\\nLengthBased ExampleSelector\\\\nMaximal Marginal Relevance ExampleSelector\\\\nNGram Overlap ExampleSelector\\\\nSimilarity ExampleSelector\\\\n\\\\n\\\\nOutput Parsers\\\\nOutput Parsers\\\\nCommaSeparatedListOutputParser\\\\nOutputFixingParser\\\\nPydanticOutputParser\\\\nRetryOutputParser\\\\nStructured Output Parser\\\\n\\\\n\\\\n\\\\n\\\\nIndexes\\\\nGetting Started\\\\nDocument Loaders\\\\nCoNLL-U\\\\nAirbyte JSON\\\\nAZLyrics\\\\nBlackboard\\\\nCollege Confidential\\\\nCopy Paste\\\\nCSV Loader\\\\nDirectory Loader\\\\nEmail\\\\nEverNote\\\\nFacebook Chat\\\\nFigma\\\\nGCS Directory\\\\nGCS File Storage\\\\nGitBook\\\\nGoogle Drive\\\\nGutenberg\\\\nHacker News\\\\nHTML\\\\niFixit\\\\nImages\\\\nIMSDb\\\\nMarkdown\\\\nNotebook\\\\nNotion\\\\nObsidian\\\\nPDF\\\\nPowerPoint\\\\nReadTheDocs Documentation\\\\nRoam\\\\ns3 Directory\\\\ns3 File\\\\nSubtitle Files\\\\nTelegram\\\\nUnstructured File Loader\\\\nURL\\\\nWeb Base\\\\nWord 
Documents\\\\nYouTube\\\\n\\\\n\\\\nText Splitters\\\\nGetting Started\\\\nCharacter Text Splitter\\\\nHuggingFace Length Function\\\\nLatex Text Splitter\\\\nMarkdown Text Splitter\\\\nNLTK Text Splitter\\\\nPython Code Text Splitter\\\\nRecursiveCharacterTextSplitter\\\\nSpacy Text Splitter\\\\ntiktoken (OpenAI) Length Function\\\\nTiktokenText Splitter\\\\n\\\\n\\\\nVectorstores\\\\nGetting Started\\\\nAtlasDB\\\\nChroma\\\\nDeep Lake\\\\nElasticSearch\\\\nFAISS\\\\nMilvus\\\\nOpenSearch\\\\nPGVector\\\\nPinecone\\\\nQdrant\\\\nRedis\\\\nWeaviate\\\\n\\\\n\\\\nRetrievers\\\\nChatGPT Plugin Retriever\\\\nVectorStore Retriever\\\\n\\\\n\\\\n\\\\n\\\\nMemory\\\\nGetting Started\\\\nHow-To Guides\\\\nConversationBufferMemory\\\\nConversationBufferWindowMemory\\\\nEntity Memory\\\\nConversation Knowledge Graph Memory\\\\nConversationSummaryMemory\\\\nConversationSummaryBufferMemory\\\\nConversationTokenBufferMemory\\\\nHow to add Memory to an LLMChain\\\\nHow to add memory to a Multi-Input Chain\\\\nHow to add Memory to an Agent\\\\nHow to customize conversational memory\\\\nHow to create a custom Memory class\\\\nHow to use multiple memroy classes in the same chain\\\\n\\\\n\\\\n\\\\n\\\\nChains\\\\nGetting Started\\\\nHow-To Guides\\\\nAsync API for Chain\\\\nLoading from LangChainHub\\\\nLLM Chain\\\\nSequential Chains\\\\nSerialization\\\\nTransformation Chain\\\\nAnalyze Document\\\\nChat Index\\\\nGraph QA\\\\nHypothetical Document Embeddings\\\\nQuestion Answering with Sources\\\\nQuestion Answering\\\\nSummarization\\\\nRetrieval Question/Answering\\\\nRetrieval Question Answering with Sources\\\\nVector DB Text Generation\\\\nAPI Chains\\\\nSelf-Critique Chain with Constitutional AI\\\\nBashChain\\\\nLLMCheckerChain\\\\nLLM Math\\\\nLLMRequestsChain\\\\nLLMSummarizationCheckerChain\\\\nModeration\\\\nPAL\\\\nSQLite example\\\\n\\\\n\\\\nReference\\\\n\\\\n\\\\nAgents\\\\nGetting Started\\\\nTools\\\\nGetting Started\\\\nDefining Custom Tools\\\\nMulti Input Tools\\\\nBash\\\\nBing Search\\\\nChatGPT Plugins\\\\nGoogle Search\\\\nGoogle Serper API\\\\nHuman as a tool\\\\nIFTTT WebHooks\\\\nPython REPL\\\\nRequests\\\\nSearch Tools\\\\nSearxNG Search API\\\\nSerpAPI\\\\nWolfram Alpha\\\\nZapier Natural Language Actions API\\\\n\\\\n\\\\nAgents\\\\nAgent Types\\\\nCustom Agent\\\\nConversation Agent (for Chat Models)\\\\nConversation Agent\\\\nMRKL\\\\nMRKL Chat\\\\nReAct\\\\nSelf Ask With Search\\\\n\\\\n\\\\nToolkits\\\\nCSV Agent\\\\nJSON Agent\\\\nOpenAPI Agent\\\\nPandas Dataframe Agent\\\\nPython Agent\\\\nSQL Database Agent\\\\nVectorstore Agent\\\\n\\\\n\\\\nAgent Executors\\\\nHow to combine agents and vectorstores\\\\nHow to use the async API for Agents\\\\nHow to create ChatGPT Clone\\\\nHow to access intermediate steps\\\\nHow to cap the max number of iterations\\\\nHow to add SharedMemory to an Agent and its Tools\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nUse Cases\\\\n\\\\nPersonal Assistants\\\\nQuestion Answering over Docs\\\\nChatbots\\\\nQuerying Tabular Data\\\\nInteracting with APIs\\\\nSummarization\\\\nExtraction\\\\nEvaluation\\\\nAgent Benchmarking: Search + Calculator\\\\nAgent VectorDB Question Answering Benchmarking\\\\nBenchmarking Template\\\\nData Augmented Question Answering\\\\nUsing Hugging Face Datasets\\\\nLLM Math\\\\nQuestion Answering Benchmarking: Paul Graham Essay\\\\nQuestion Answering Benchmarking: State of the Union Address\\\\nQA Generation\\\\nQuestion Answering\\\\nSQL Question Answering Benchmarking: 
Chinook\\\\n\\\\n\\\\n\\\\nReference\\\\n\\\\nInstallation\\\\nIntegrations\\\\nAPI References\\\\nPrompts\\\\nPromptTemplates\\\\nExample Selector\\\\n\\\\n\\\\nUtilities\\\\nPython REPL\\\\nSerpAPI\\\\nSearxNG Search\\\\nDocstore\\\\nText Splitter\\\\nEmbeddings\\\\nVectorStores\\\\n\\\\n\\\\nChains\\\\nAgents\\\\n\\\\n\\\\n\\\\nEcosystem\\\\n\\\\nLangChain Ecosystem\\\\nAI21 Labs\\\\nAtlasDB\\\\nBanana\\\\nCerebriumAI\\\\nChroma\\\\nCohere\\\\nDeepInfra\\\\nDeep Lake\\\\nForefrontAI\\\\nGoogle Search Wrapper\\\\nGoogle Serper Wrapper\\\\nGooseAI\\\\nGraphsignal\\\\nHazy Research\\\\nHelicone\\\\nHugging Face\\\\nMilvus\\\\nModal\\\\nNLPCloud\\\\nOpenAI\\\\nOpenSearch\\\\nPetals\\\\nPGVector\\\\nPinecone\\\\nPromptLayer\\\\nQdrant\\\\nRunhouse\\\\nSearxNG Search API\\\\nSerpAPI\\\\nStochasticAI\\\\nUnstructured\\\\nWeights & Biases\\\\nWeaviate\\\\nWolfram Alpha Wrapper\\\\nWriter\\\\n\\\\n\\\\n\\\\nAdditional Resources\\\\n\\\\nLangChainHub\\\\nGlossary\\\\nLangChain Gallery\\\\nDeployments\\\\nTracing\\\\nDiscord\\\\nProduction Support\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n.rst\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n.pdf\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nWelcome to LangChain\\\\n\\\\n\\\\n\\\\n\\\\n Contents \\\\n\\\\n\\\\n\\\\nGetting Started\\\\nModules\\\\nUse Cases\\\\nReference Docs\\\\nLangChain Ecosystem\\\\nAdditional Resources\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nWelcome to LangChain#\\\\nLangChain is a framework for developing applications powered by language models. We believe that the most powerful and differentiated applications will not only call out to a language model via an API, but will also:\\\\n\\\\nBe data-aware: connect a language model to other sources of data\\\\nBe agentic: allow a language model to interact with its environment\\\\n\\\\nThe LangChain framework is designed with the above principles in mind.\\\\nThis is the Python specific portion of the documentation. For a purely conceptual guide to LangChain, see here. For the JavaScript documentation, see here.\\\\n\\\\nGetting Started#\\\\nCheckout the below guide for a walkthrough of how to get started using LangChain to create an Language Model application.\\\\n\\\\nGetting Started Documentation\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nModules#\\\\nThere are several main modules that LangChain provides support for.\\\\nFor each module we provide some examples to get started, how-to guides, reference docs, and conceptual guides.\\\\nThese modules are, in increasing order of complexity:\\\\n\\\\nModels: The various model types and model integrations LangChain supports.\\\\nPrompts: This includes prompt management, prompt optimization, and prompt serialization.\\\\nMemory: Memory is the concept of persisting state between calls of a chain/agent. LangChain provides a standard interface for memory, a collection of memory implementations, and examples of chains/agents that use memory.\\\\nIndexes: Language models are often more powerful when combined with your own text data - this module covers best practices for doing exactly that.\\\\nChains: Chains go beyond just a single LLM call, and are sequences of calls (whether to an LLM or a different utility). 
LangChain provides a standard interface for chains, lots of integrations with other tools, and end-to-end chains for common applications.\\\\nAgents: Agents involve an LLM making decisions about which Actions to take, taking that Action, seeing an Observation, and repeating that until done. LangChain provides a standard interface for agents, a selection of agents to choose from, and examples of end to end agents.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nUse Cases#\\\\nThe above modules can be used in a variety of ways. LangChain also provides guidance and assistance in this. Below are some of the common use cases LangChain supports.\\\\n\\\\nPersonal Assistants: The main LangChain use case. Personal assistants need to take actions, remember interactions, and have knowledge about your data.\\\\nQuestion Answering: The second big LangChain use case. Answering questions over specific documents, only utilizing the information in those documents to construct an answer.\\\\nChatbots: Since language models are good at producing text, that makes them ideal for creating chatbots.\\\\nQuerying Tabular Data: If you want to understand how to use LLMs to query data that is stored in a tabular format (csvs, SQL, dataframes, etc) you should read this page.\\\\nInteracting with APIs: Enabling LLMs to interact with APIs is extremely powerful in order to give them more up-to-date information and allow them to take actions.\\\\nExtraction: Extract structured information from text.\\\\nSummarization: Summarizing longer documents into shorter, more condensed chunks of information. A type of Data Augmented Generation.\\\\nEvaluation: Generative models are notoriously hard to evaluate with traditional metrics. One new way of evaluating them is using language models themselves to do the evaluation. LangChain provides some prompts/chains for assisting in this.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nReference Docs#\\\\nAll of LangChain’s reference documentation, in one place. Full documentation on all methods, classes, installation methods, and integration setups for LangChain.\\\\n\\\\nReference Documentation\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nLangChain Ecosystem#\\\\nGuides for how other companies/products can be used with LangChain\\\\n\\\\nLangChain Ecosystem\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nAdditional Resources#\\\\nAdditional collection of resources we think may be useful as you develop your application!\\\\n\\\\nLangChainHub: The LangChainHub is a place to share and explore other prompts, chains, and agents.\\\\nGlossary: A glossary of all related terms, papers, methods, etc. Whether implemented in LangChain or not!\\\\nGallery: A collection of our favorite projects that use LangChain. Useful for finding inspiration or seeing how things were done in other applications.\\\\nDeployments: A collection of instructions, code snippets, and template repositories for deploying LangChain apps.\\\\nTracing: A guide on using tracing in LangChain to visualize the execution of chains and agents.\\\\nModel Laboratory: Experimenting with different prompts, models, and chains is a big part of developing the best possible application. The ModelLaboratory makes it easy to do so.\\\\nDiscord: Join us on our Discord to discuss all things LangChain!\\\\nProduction Support: As you move your LangChains into production, we’d love to offer more comprehensive support. 
Please fill out this form and we’ll set up a dedicated support Slack channel.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nnext\\\\nQuickstart Guide\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n Contents\\\\n \\\\n\\\\n\\\\nGetting Started\\\\nModules\\\\nUse Cases\\\\nReference Docs\\\\nLangChain Ecosystem\\\\nAdditional Resources\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nBy Harrison Chase\\\\n\\\\n\\\\n\\\\n\\\\n \\\\n © Copyright 2023, Harrison Chase.\\\\n \\\\n\\\\n\\\\n\\\\n\\\\n Last updated on Mar 27, 2023.\\\\n \\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\', lookup_str=\\'\\', metadata={\\'source\\': \\'https://python.langchain.com/en/latest/\\', \\'loc\\': \\'https://python.langchain.com/en/latest/\\', \\'lastmod\\': \\'2023-03-27T22:50:49.790324+00:00\\', \\'changefreq\\': \\'daily\\', \\'priority\\': \\'0.9\\'}, lookup_index=0)Add custom scraping rules\\u200bThe SitemapLoader uses beautifulsoup4 for the scraping process, and it scrapes every element on the page by default. The SitemapLoader constructor accepts a custom scraping function. This feature can be helpful to tailor the scraping process to your specific needs; for example, you might want to avoid scraping headers or navigation elements. The following example shows how to develop and use a custom function to avoid navigation and header elements.Import the beautifulsoup4 library and define the custom function.pip install beautifulsoup4from bs4 import BeautifulSoupdef remove_nav_and_header_elements(content: BeautifulSoup) -> str: # Find all \\'nav\\' and \\'header\\' elements in the BeautifulSoup object nav_elements = content.find_all(\"nav\") header_elements = content.find_all(\"header\") # Remove each \\'nav\\' and \\'header\\' element from the BeautifulSoup object for element in nav_elements + header_elements: element.decompose() return str(content.get_text())Add your custom function to the SitemapLoader object.loader = SitemapLoader( \"https://langchain.readthedocs.io/sitemap.xml\", filter_urls=[\"https://python.langchain.com/en/latest/\"], parsing_function=remove_nav_and_header_elements,)Local Sitemap\\u200bThe sitemap loader can also be used to load local files.sitemap_loader = SitemapLoader(web_path=\"example_data/sitemap.xml\", is_local=True)docs = sitemap_loader.load() Fetching pages: 100%|####################################################################################################################################| 3/3 [00:00<00:00, 3.91it/s]PreviousRSTNextSlackFiltering sitemap URLsAdd custom scraping rulesLocal SitemapCommunityDiscordTwitterGitHubPythonJS/TSMoreHomepageBlogCopyright © 2023 LangChain, Inc.\\n\\n\\n\\n', metadata={'source': 'https://python.langchain.com/docs/integrations/document_loaders/sitemap', 'loc': 'https://python.langchain.com/docs/integrations/document_loaders/sitemap', 'changefreq': 'weekly', 'priority': '0.5'})" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add custom scraping rules\n", + "\n", + "By default, the parser **removes** all but the main content of the docusaurus page, which is normally the `
<article>` tag. You also have the option to define an **inclusive** list of HTML tags by providing them as a list using the `custom_html_tags` parameter. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader = DocusaurusLoader(\n", + " \"https://python.langchain.com\",\n", + " filter_urls=[\"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"],\n", + " # Only content inside elements matching these selectors is kept; everything else is removed\n", + " custom_html_tags=[\"#content\", \".main\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also define an entirely custom parsing function if you need finer-grained control over the returned content for each page.\n", + "\n", + "The following example shows how to develop and use a custom function to avoid navigation and header elements." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "\n", + "\n", + "def remove_nav_and_header_elements(content: BeautifulSoup) -> str:\n", + " # Find all 'nav' and 'header' elements in the BeautifulSoup object\n", + " nav_elements = content.find_all(\"nav\")\n", + " header_elements = content.find_all(\"header\")\n", + "\n", + " # Remove each 'nav' and 'header' element from the BeautifulSoup object\n", + " for element in nav_elements + header_elements:\n", + " element.decompose()\n", + "\n", + " return str(content.get_text())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pass your custom function to the `DocusaurusLoader` via the `parsing_function` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "loader = DocusaurusLoader(\n", + " \"https://python.langchain.com\",\n", + " filter_urls=[\"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"],\n", + " parsing_function=remove_nav_and_header_elements,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py index 730e2400b6..96cfd9e1b5 100644 --- a/libs/langchain/langchain/document_loaders/__init__.py +++ b/libs/langchain/langchain/document_loaders/__init__.py @@ -67,6 +67,7 @@ from langchain.document_loaders.diffbot import DiffbotLoader from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.discord import DiscordChatLoader from langchain.document_loaders.docugami import DocugamiLoader +from langchain.document_loaders.docusaurus import DocusaurusLoader from langchain.document_loaders.dropbox import DropboxLoader from langchain.document_loaders.duckdb_loader import DuckDBLoader from langchain.document_loaders.email import ( @@ -250,6 +251,7 @@ __all__ = [ "DirectoryLoader", "DiscordChatLoader", "DocugamiLoader", + "DocusaurusLoader", "Docx2txtLoader", "DropboxLoader", "DuckDBLoader", diff --git a/libs/langchain/langchain/document_loaders/docusaurus.py 
b/libs/langchain/langchain/document_loaders/docusaurus.py new file mode 100644 index 0000000000..3d434156d8 --- /dev/null +++ b/libs/langchain/langchain/document_loaders/docusaurus.py @@ -0,0 +1,49 @@ +"""Load documents from Docusaurus documentation.""" +from typing import Any, List, Optional + +from langchain.document_loaders.sitemap import SitemapLoader + + +class DocusaurusLoader(SitemapLoader): + """ + Loader that leverages the SitemapLoader to loop through the generated pages of a + Docusaurus documentation website and extracts the content by looking for specific + HTML tags. By default, the parser searches for the main content of the Docusaurus + page, which is normally the <article>
. You also have the option to define your own + custom HTML tags by providing them as a list, for example: ["div", ".main", "a"]. + """ + + def __init__( + self, + url: str, + custom_html_tags: Optional[List[str]] = None, + **kwargs: Any, + ): + """ + Initialize DocusaurusLoader. + + Args: + url: The base URL of the Docusaurus website. + custom_html_tags: Optional custom HTML tags or CSS selectors used to + extract the page content. + kwargs: Additional args to extend the underlying SitemapLoader, for example: + filter_urls, blocksize, meta_function, is_local, continue_on_failure + """ + if not kwargs.get("is_local"): + url = f"{url}/sitemap.xml" + + self.custom_html_tags = custom_html_tags or ["main article"] + + super().__init__( + url, + # Pop parsing_function so it is not passed a second time via **kwargs + parsing_function=kwargs.pop("parsing_function", None) or self._parsing_function, + **kwargs, + ) + + def _parsing_function(self, content: Any) -> str: + """Parses the relevant elements from a Docusaurus page.""" + # Keep only the elements matching the configured selectors + # (the page's main <article> by default); everything else is dropped. + relevant_elements = content.select(",".join(self.custom_html_tags)) + + return "\n".join(element.get_text() for element in relevant_elements) diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_docusaurus.py b/libs/langchain/tests/integration_tests/document_loaders/test_docusaurus.py new file mode 100644 index 0000000000..53323ae9e4 --- /dev/null +++ b/libs/langchain/tests/integration_tests/document_loaders/test_docusaurus.py @@ -0,0 +1,43 @@ +from pathlib import Path + +from langchain.document_loaders import DocusaurusLoader + +DOCS_URL = str(Path(__file__).parent.parent / "examples/docusaurus-sitemap.xml") + + +def test_docusaurus() -> None: + """Test the Docusaurus loader against the local example sitemap.""" + loader = DocusaurusLoader(DOCS_URL, is_local=True) + documents = loader.load() + assert len(documents) > 1 + assert any("SitemapLoader" in doc.page_content for doc in documents) + + +def test_filter_docusaurus_sitemap() -> None: + """Test that filter_urls loads only the matching page.""" + loader = DocusaurusLoader( + DOCS_URL, + is_local=True, + filter_urls=[ + "https://python.langchain.com/docs/integrations/document_loaders/sitemap" + ], + ) + documents = loader.load() + assert len(documents) == 1 + assert "SitemapLoader" in documents[0].page_content + + +def test_docusaurus_metadata() -> None: + """Test that a custom meta_function is applied to document metadata.""" + + def sitemap_metadata_one(meta: dict, _content: None) -> dict: + return {**meta, "mykey": "Super Important Metadata"} + + loader = DocusaurusLoader( + DOCS_URL, + is_local=True, + meta_function=sitemap_metadata_one, + ) + documents = loader.load() + assert len(documents) > 1 + assert "mykey" in documents[0].metadata + assert "Super Important Metadata" in documents[0].metadata["mykey"] diff --git a/libs/langchain/tests/integration_tests/examples/docusaurus-sitemap.xml b/libs/langchain/tests/integration_tests/examples/docusaurus-sitemap.xml new file mode 100644 index 0000000000..eebae785b8 --- /dev/null +++ b/libs/langchain/tests/integration_tests/examples/docusaurus-sitemap.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url> + <loc>https://python.langchain.com/docs/integrations/document_loaders/sitemap</loc> + <changefreq>weekly</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://python.langchain.com/cookbook</loc> + <changefreq>weekly</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://python.langchain.com/docs/additional_resources</loc> + <changefreq>weekly</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://python.langchain.com/docs/modules/chains/how_to/</loc> + <changefreq>weekly</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://python.langchain.com/docs/use_cases/question_answering/local_retrieval_qa</loc> + <changefreq>weekly</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://python.langchain.com/docs/use_cases/summarization</loc> + <changefreq>weekly</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://python.langchain.com/</loc> + <changefreq>weekly</changefreq> + <priority>0.5</priority> + </url> +</urlset> \ No newline at end of file