forked from Archives/langchain
2markdown loader (#4796)
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
aa73a888fa
commit
720ac49f42
@ -90,7 +90,6 @@
|
|||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
"id": "4be99e6c",
|
"id": "4be99e6c",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"outputs_hidden": false
|
"outputs_hidden": false
|
||||||
},
|
},
|
||||||
@ -131,7 +130,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.6"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
228
docs/modules/indexes/document_loaders/examples/tomarkdown.ipynb
Normal file
228
docs/modules/indexes/document_loaders/examples/tomarkdown.ipynb
Normal file
@ -0,0 +1,228 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "77b854df",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# 2Markdown\n",
|
||||||
|
"\n",
|
||||||
|
"Uses [2markdown](https://2markdown.com/) to convert any webpage into a standard markdown file"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "497736aa",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# You will need to get your own API key\n",
|
||||||
|
"\n",
|
||||||
|
"api_key = \"\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "009e0036",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import ToMarkdownLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "910fb6ee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = ToMarkdownLoader.from_api_key(url=\"https://python.langchain.com/en/latest/\", api_key=api_key)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "ac8db139",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "706304e9",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"## Contents\n",
|
||||||
|
"\n",
|
||||||
|
"- [Getting Started](#getting-started)\n",
|
||||||
|
"- [Modules](#modules)\n",
|
||||||
|
"- [Use Cases](#use-cases)\n",
|
||||||
|
"- [Reference Docs](#reference-docs)\n",
|
||||||
|
"- [LangChain Ecosystem](#langchain-ecosystem)\n",
|
||||||
|
"- [Additional Resources](#additional-resources)\n",
|
||||||
|
"\n",
|
||||||
|
"## Welcome to LangChain [\\#](\\#welcome-to-langchain \"Permalink to this headline\")\n",
|
||||||
|
"\n",
|
||||||
|
"**LangChain** is a framework for developing applications powered by language models. We believe that the most powerful and differentiated applications will not only call out to a language model, but will also be:\n",
|
||||||
|
"\n",
|
||||||
|
"1. _Data-aware_: connect a language model to other sources of data\n",
|
||||||
|
"\n",
|
||||||
|
"2. _Agentic_: allow a language model to interact with its environment\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"The LangChain framework is designed around these principles.\n",
|
||||||
|
"\n",
|
||||||
|
"This is the Python specific portion of the documentation. For a purely conceptual guide to LangChain, see [here](https://docs.langchain.com/docs/). For the JavaScript documentation, see [here](https://js.langchain.com/docs/).\n",
|
||||||
|
"\n",
|
||||||
|
"## Getting Started [\\#](\\#getting-started \"Permalink to this headline\")\n",
|
||||||
|
"\n",
|
||||||
|
"How to get started using LangChain to create an Language Model application.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Quickstart Guide](https://python.langchain.com/en/latest/getting_started/getting_started.html)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"Concepts and terminology.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Concepts and terminology](https://python.langchain.com/en/latest/getting_started/concepts.html)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"Tutorials created by community experts and presented on YouTube.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Tutorials](https://python.langchain.com/en/latest/getting_started/tutorials.html)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Modules [\\#](\\#modules \"Permalink to this headline\")\n",
|
||||||
|
"\n",
|
||||||
|
"These modules are the core abstractions which we view as the building blocks of any LLM-powered application.\n",
|
||||||
|
"\n",
|
||||||
|
"For each module LangChain provides standard, extendable interfaces. LanghChain also provides external integrations and even end-to-end implementations for off-the-shelf use.\n",
|
||||||
|
"\n",
|
||||||
|
"The docs for each module contain quickstart examples, how-to guides, reference docs, and conceptual guides.\n",
|
||||||
|
"\n",
|
||||||
|
"The modules are (from least to most complex):\n",
|
||||||
|
"\n",
|
||||||
|
"- [Models](https://python.langchain.com/en/latest/modules/models.html): Supported model types and integrations.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Prompts](https://python.langchain.com/en/latest/modules/prompts.html): Prompt management, optimization, and serialization.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Memory](https://python.langchain.com/en/latest/modules/memory.html): Memory refers to state that is persisted between calls of a chain/agent.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Indexes](https://python.langchain.com/en/latest/modules/indexes.html): Language models become much more powerful when combined with application-specific data - this module contains interfaces and integrations for loading, querying and updating external data.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Chains](https://python.langchain.com/en/latest/modules/chains.html): Chains are structured sequences of calls (to an LLM or to a different utility).\n",
|
||||||
|
"\n",
|
||||||
|
"- [Agents](https://python.langchain.com/en/latest/modules/agents.html): An agent is a Chain in which an LLM, given a high-level directive and a set of tools, repeatedly decides an action, executes the action and observes the outcome until the high-level directive is complete.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Callbacks](https://python.langchain.com/en/latest/modules/callbacks/getting_started.html): Callbacks let you log and stream the intermediate steps of any chain, making it easy to observe, debug, and evaluate the internals of an application.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Use Cases [\\#](\\#use-cases \"Permalink to this headline\")\n",
|
||||||
|
"\n",
|
||||||
|
"Best practices and built-in implementations for common LangChain use cases:\n",
|
||||||
|
"\n",
|
||||||
|
"- [Autonomous Agents](https://python.langchain.com/en/latest/use_cases/autonomous_agents.html): Autonomous agents are long-running agents that take many steps in an attempt to accomplish an objective. Examples include AutoGPT and BabyAGI.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Agent Simulations](https://python.langchain.com/en/latest/use_cases/agent_simulations.html): Putting agents in a sandbox and observing how they interact with each other and react to events can be an effective way to evaluate their long-range reasoning and planning abilities.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Personal Assistants](https://python.langchain.com/en/latest/use_cases/personal_assistants.html): One of the primary LangChain use cases. Personal assistants need to take actions, remember interactions, and have knowledge about your data.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Question Answering](https://python.langchain.com/en/latest/use_cases/question_answering.html): Another common LangChain use case. Answering questions over specific documents, only utilizing the information in those documents to construct an answer.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Chatbots](https://python.langchain.com/en/latest/use_cases/chatbots.html): Language models love to chat, making this a very natural use of them.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Querying Tabular Data](https://python.langchain.com/en/latest/use_cases/tabular.html): Recommended reading if you want to use language models to query structured data (CSVs, SQL, dataframes, etc).\n",
|
||||||
|
"\n",
|
||||||
|
"- [Code Understanding](https://python.langchain.com/en/latest/use_cases/code.html): Recommended reading if you want to use language models to analyze code.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Interacting with APIs](https://python.langchain.com/en/latest/use_cases/apis.html): Enabling language models to interact with APIs is extremely powerful. It gives them access to up-to-date information and allows them to take actions.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Extraction](https://python.langchain.com/en/latest/use_cases/extraction.html): Extract structured information from text.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Summarization](https://python.langchain.com/en/latest/use_cases/summarization.html): Compressing longer documents. A type of Data-Augmented Generation.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Evaluation](https://python.langchain.com/en/latest/use_cases/evaluation.html): Generative models are hard to evaluate with traditional metrics. One promising approach is to use language models themselves to do the evaluation.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Reference Docs [\\#](\\#reference-docs \"Permalink to this headline\")\n",
|
||||||
|
"\n",
|
||||||
|
"Full documentation on all methods, classes, installation methods, and integration setups for LangChain.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Reference Documentation](https://python.langchain.com/en/latest/reference.html)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## LangChain Ecosystem [\\#](\\#langchain-ecosystem \"Permalink to this headline\")\n",
|
||||||
|
"\n",
|
||||||
|
"Guides for how other companies/products can be used with LangChain.\n",
|
||||||
|
"\n",
|
||||||
|
"- [LangChain Ecosystem](https://python.langchain.com/en/latest/ecosystem.html)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Additional Resources [\\#](\\#additional-resources \"Permalink to this headline\")\n",
|
||||||
|
"\n",
|
||||||
|
"Additional resources we think may be useful as you develop your application!\n",
|
||||||
|
"\n",
|
||||||
|
"- [LangChainHub](https://github.com/hwchase17/langchain-hub): The LangChainHub is a place to share and explore other prompts, chains, and agents.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Gallery](https://python.langchain.com/en/latest/additional_resources/gallery.html): A collection of our favorite projects that use LangChain. Useful for finding inspiration or seeing how things were done in other applications.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Deployments](https://python.langchain.com/en/latest/additional_resources/deployments.html): A collection of instructions, code snippets, and template repositories for deploying LangChain apps.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Tracing](https://python.langchain.com/en/latest/additional_resources/tracing.html): A guide on using tracing in LangChain to visualize the execution of chains and agents.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Model Laboratory](https://python.langchain.com/en/latest/additional_resources/model_laboratory.html): Experimenting with different prompts, models, and chains is a big part of developing the best possible application. The ModelLaboratory makes it easy to do so.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Discord](https://discord.gg/6adMQxSpJS): Join us on our Discord to discuss all things LangChain!\n",
|
||||||
|
"\n",
|
||||||
|
"- [YouTube](https://python.langchain.com/en/latest/additional_resources/youtube.html): A collection of the LangChain tutorials and videos.\n",
|
||||||
|
"\n",
|
||||||
|
"- [Production Support](https://forms.gle/57d8AmXBYp8PP8tZA): As you move your LangChains into production, we’d love to offer more comprehensive support. Please fill out this form and we’ll set up a dedicated support Slack channel.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(docs[0].page_content)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5dde17e7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -86,6 +86,7 @@ from langchain.document_loaders.telegram import (
|
|||||||
TelegramChatFileLoader,
|
TelegramChatFileLoader,
|
||||||
)
|
)
|
||||||
from langchain.document_loaders.text import TextLoader
|
from langchain.document_loaders.text import TextLoader
|
||||||
|
from langchain.document_loaders.tomarkdown import ToMarkdownLoader
|
||||||
from langchain.document_loaders.toml import TomlLoader
|
from langchain.document_loaders.toml import TomlLoader
|
||||||
from langchain.document_loaders.twitter import TwitterTweetLoader
|
from langchain.document_loaders.twitter import TwitterTweetLoader
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
@ -213,4 +214,5 @@ __all__ = [
|
|||||||
"WikipediaLoader",
|
"WikipediaLoader",
|
||||||
"YoutubeLoader",
|
"YoutubeLoader",
|
||||||
"TelegramChatLoader",
|
"TelegramChatLoader",
|
||||||
|
"ToMarkdownLoader",
|
||||||
]
|
]
|
||||||
|
35
langchain/document_loaders/tomarkdown.py
Normal file
35
langchain/document_loaders/tomarkdown.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
"""Loader that loads HTML to markdown using 2markdown."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Iterator, List
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class ToMarkdownLoader(BaseLoader):
    """Loader that loads HTML to markdown using 2markdown.

    The page at ``url`` is converted by POSTing to the 2markdown HTTP API,
    and the converted text is returned as a single ``Document``.
    """

    def __init__(self, url: str, api_key: str):
        """Initialize with url and api key.

        Args:
            url: Address of the web page to convert.
            api_key: 2markdown API key, sent in the ``X-Api-Key`` header.
        """
        self.url = url
        self.api_key = api_key

    @classmethod
    def from_api_key(cls, url: str, api_key: str) -> "ToMarkdownLoader":
        """Create a loader from a url and an api key.

        Alternate constructor equivalent to
        ``ToMarkdownLoader(url=url, api_key=api_key)``.

        Args:
            url: Address of the web page to convert.
            api_key: 2markdown API key.

        Returns:
            A new loader instance.
        """
        return cls(url=url, api_key=api_key)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load the page as markdown.

        Yields:
            One ``Document`` whose ``page_content`` is the ``"article"`` field
            of the API's JSON response and whose metadata records the
            requested url under ``"source"``.

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
        """
        response = requests.post(
            "https://2markdown.com/api/2md",
            headers={"X-Api-Key": self.api_key},
            json={"url": self.url},
        )
        # Fail with the real HTTP error instead of an opaque KeyError or
        # JSON decode error when the request is rejected.
        response.raise_for_status()
        text = response.json()["article"]
        metadata = {"source": self.url}
        yield Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Load the page and return the resulting documents as a list."""
        return list(self.lazy_load())
|
Loading…
Reference in New Issue
Block a user