From 720ac49f4237e8c177ac65a27903da6215fe91c8 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Tue, 16 May 2023 23:42:53 -0700 Subject: [PATCH] 2markdown loader (#4796) Co-authored-by: Eugene Yurtsev --- .../document_loaders/examples/html.ipynb | 3 +- .../examples/tomarkdown.ipynb | 228 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/tomarkdown.py | 35 +++ 4 files changed, 266 insertions(+), 2 deletions(-) create mode 100644 docs/modules/indexes/document_loaders/examples/tomarkdown.ipynb create mode 100644 langchain/document_loaders/tomarkdown.py diff --git a/docs/modules/indexes/document_loaders/examples/html.ipynb b/docs/modules/indexes/document_loaders/examples/html.ipynb index 445ec597..9646dec6 100644 --- a/docs/modules/indexes/document_loaders/examples/html.ipynb +++ b/docs/modules/indexes/document_loaders/examples/html.ipynb @@ -90,7 +90,6 @@ "execution_count": 2, "id": "4be99e6c", "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -131,7 +130,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/modules/indexes/document_loaders/examples/tomarkdown.ipynb b/docs/modules/indexes/document_loaders/examples/tomarkdown.ipynb new file mode 100644 index 00000000..da9d262e --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/tomarkdown.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "77b854df", + "metadata": {}, + "source": [ + "# 2Markdown\n", + "\n", + "Uses [2markdown](https://2markdown.com/) to convert any webpage into a standard markdown file" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "497736aa", + "metadata": {}, + "outputs": [], + "source": [ + "# You will need to get your own API key\n", + "\n", + "api_key = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "009e0036", + "metadata": 
{}, + "outputs": [], + "source": [ + "from langchain.document_loaders import ToMarkdownLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "910fb6ee", + "metadata": {}, + "outputs": [], + "source": [ + "loader = ToMarkdownLoader.from_api_key(url=\"https://python.langchain.com/en/latest/\", api_key=api_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ac8db139", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "706304e9", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Contents\n", + "\n", + "- [Getting Started](#getting-started)\n", + "- [Modules](#modules)\n", + "- [Use Cases](#use-cases)\n", + "- [Reference Docs](#reference-docs)\n", + "- [LangChain Ecosystem](#langchain-ecosystem)\n", + "- [Additional Resources](#additional-resources)\n", + "\n", + "## Welcome to LangChain [\\#](\\#welcome-to-langchain \"Permalink to this headline\")\n", + "\n", + "**LangChain** is a framework for developing applications powered by language models. We believe that the most powerful and differentiated applications will not only call out to a language model, but will also be:\n", + "\n", + "1. _Data-aware_: connect a language model to other sources of data\n", + "\n", + "2. _Agentic_: allow a language model to interact with its environment\n", + "\n", + "\n", + "The LangChain framework is designed around these principles.\n", + "\n", + "This is the Python specific portion of the documentation. For a purely conceptual guide to LangChain, see [here](https://docs.langchain.com/docs/). 
For the JavaScript documentation, see [here](https://js.langchain.com/docs/).\n", + "\n", + "## Getting Started [\\#](\\#getting-started \"Permalink to this headline\")\n", + "\n", + "How to get started using LangChain to create an Language Model application.\n", + "\n", + "- [Quickstart Guide](https://python.langchain.com/en/latest/getting_started/getting_started.html)\n", + "\n", + "\n", + "Concepts and terminology.\n", + "\n", + "- [Concepts and terminology](https://python.langchain.com/en/latest/getting_started/concepts.html)\n", + "\n", + "\n", + "Tutorials created by community experts and presented on YouTube.\n", + "\n", + "- [Tutorials](https://python.langchain.com/en/latest/getting_started/tutorials.html)\n", + "\n", + "\n", + "## Modules [\\#](\\#modules \"Permalink to this headline\")\n", + "\n", + "These modules are the core abstractions which we view as the building blocks of any LLM-powered application.\n", + "\n", + "For each module LangChain provides standard, extendable interfaces. 
LanghChain also provides external integrations and even end-to-end implementations for off-the-shelf use.\n", + "\n", + "The docs for each module contain quickstart examples, how-to guides, reference docs, and conceptual guides.\n", + "\n", + "The modules are (from least to most complex):\n", + "\n", + "- [Models](https://python.langchain.com/en/latest/modules/models.html): Supported model types and integrations.\n", + "\n", + "- [Prompts](https://python.langchain.com/en/latest/modules/prompts.html): Prompt management, optimization, and serialization.\n", + "\n", + "- [Memory](https://python.langchain.com/en/latest/modules/memory.html): Memory refers to state that is persisted between calls of a chain/agent.\n", + "\n", + "- [Indexes](https://python.langchain.com/en/latest/modules/indexes.html): Language models become much more powerful when combined with application-specific data - this module contains interfaces and integrations for loading, querying and updating external data.\n", + "\n", + "- [Chains](https://python.langchain.com/en/latest/modules/chains.html): Chains are structured sequences of calls (to an LLM or to a different utility).\n", + "\n", + "- [Agents](https://python.langchain.com/en/latest/modules/agents.html): An agent is a Chain in which an LLM, given a high-level directive and a set of tools, repeatedly decides an action, executes the action and observes the outcome until the high-level directive is complete.\n", + "\n", + "- [Callbacks](https://python.langchain.com/en/latest/modules/callbacks/getting_started.html): Callbacks let you log and stream the intermediate steps of any chain, making it easy to observe, debug, and evaluate the internals of an application.\n", + "\n", + "\n", + "## Use Cases [\\#](\\#use-cases \"Permalink to this headline\")\n", + "\n", + "Best practices and built-in implementations for common LangChain use cases:\n", + "\n", + "- [Autonomous 
Agents](https://python.langchain.com/en/latest/use_cases/autonomous_agents.html): Autonomous agents are long-running agents that take many steps in an attempt to accomplish an objective. Examples include AutoGPT and BabyAGI.\n", + "\n", + "- [Agent Simulations](https://python.langchain.com/en/latest/use_cases/agent_simulations.html): Putting agents in a sandbox and observing how they interact with each other and react to events can be an effective way to evaluate their long-range reasoning and planning abilities.\n", + "\n", + "- [Personal Assistants](https://python.langchain.com/en/latest/use_cases/personal_assistants.html): One of the primary LangChain use cases. Personal assistants need to take actions, remember interactions, and have knowledge about your data.\n", + "\n", + "- [Question Answering](https://python.langchain.com/en/latest/use_cases/question_answering.html): Another common LangChain use case. Answering questions over specific documents, only utilizing the information in those documents to construct an answer.\n", + "\n", + "- [Chatbots](https://python.langchain.com/en/latest/use_cases/chatbots.html): Language models love to chat, making this a very natural use of them.\n", + "\n", + "- [Querying Tabular Data](https://python.langchain.com/en/latest/use_cases/tabular.html): Recommended reading if you want to use language models to query structured data (CSVs, SQL, dataframes, etc).\n", + "\n", + "- [Code Understanding](https://python.langchain.com/en/latest/use_cases/code.html): Recommended reading if you want to use language models to analyze code.\n", + "\n", + "- [Interacting with APIs](https://python.langchain.com/en/latest/use_cases/apis.html): Enabling language models to interact with APIs is extremely powerful. 
It gives them access to up-to-date information and allows them to take actions.\n", + "\n", + "- [Extraction](https://python.langchain.com/en/latest/use_cases/extraction.html): Extract structured information from text.\n", + "\n", + "- [Summarization](https://python.langchain.com/en/latest/use_cases/summarization.html): Compressing longer documents. A type of Data-Augmented Generation.\n", + "\n", + "- [Evaluation](https://python.langchain.com/en/latest/use_cases/evaluation.html): Generative models are hard to evaluate with traditional metrics. One promising approach is to use language models themselves to do the evaluation.\n", + "\n", + "\n", + "## Reference Docs [\\#](\\#reference-docs \"Permalink to this headline\")\n", + "\n", + "Full documentation on all methods, classes, installation methods, and integration setups for LangChain.\n", + "\n", + "- [Reference Documentation](https://python.langchain.com/en/latest/reference.html)\n", + "\n", + "\n", + "## LangChain Ecosystem [\\#](\\#langchain-ecosystem \"Permalink to this headline\")\n", + "\n", + "Guides for how other companies/products can be used with LangChain.\n", + "\n", + "- [LangChain Ecosystem](https://python.langchain.com/en/latest/ecosystem.html)\n", + "\n", + "\n", + "## Additional Resources [\\#](\\#additional-resources \"Permalink to this headline\")\n", + "\n", + "Additional resources we think may be useful as you develop your application!\n", + "\n", + "- [LangChainHub](https://github.com/hwchase17/langchain-hub): The LangChainHub is a place to share and explore other prompts, chains, and agents.\n", + "\n", + "- [Gallery](https://python.langchain.com/en/latest/additional_resources/gallery.html): A collection of our favorite projects that use LangChain. 
Useful for finding inspiration or seeing how things were done in other applications.\n", + "\n", + "- [Deployments](https://python.langchain.com/en/latest/additional_resources/deployments.html): A collection of instructions, code snippets, and template repositories for deploying LangChain apps.\n", + "\n", + "- [Tracing](https://python.langchain.com/en/latest/additional_resources/tracing.html): A guide on using tracing in LangChain to visualize the execution of chains and agents.\n", + "\n", + "- [Model Laboratory](https://python.langchain.com/en/latest/additional_resources/model_laboratory.html): Experimenting with different prompts, models, and chains is a big part of developing the best possible application. The ModelLaboratory makes it easy to do so.\n", + "\n", + "- [Discord](https://discord.gg/6adMQxSpJS): Join us on our Discord to discuss all things LangChain!\n", + "\n", + "- [YouTube](https://python.langchain.com/en/latest/additional_resources/youtube.html): A collection of the LangChain tutorials and videos.\n", + "\n", + "- [Production Support](https://forms.gle/57d8AmXBYp8PP8tZA): As you move your LangChains into production, we’d love to offer more comprehensive support. 
class ToMarkdownLoader(BaseLoader):
    """Loader that loads HTML to markdown using 2markdown.

    The page at ``url`` is submitted to the hosted 2markdown service
    (https://2markdown.com/) and the markdown it returns becomes the
    ``page_content`` of a single ``Document``.
    """

    def __init__(self, url: str, api_key: str):
        """Initialize with url and api key.

        Args:
            url: Address of the web page to convert to markdown.
            api_key: 2markdown API key; sent in the ``X-Api-Key`` header.
        """
        self.url = url
        self.api_key = api_key

    @classmethod
    def from_api_key(cls, url: str, api_key: str) -> "ToMarkdownLoader":
        """Create a loader from a url and an API key.

        Alternate constructor equivalent to ``ToMarkdownLoader(url, api_key)``.
        The example notebook for this loader builds it through this name, so
        it must exist on the class for the documented usage to work.

        Args:
            url: Address of the web page to convert to markdown.
            api_key: 2markdown API key.

        Returns:
            A new loader instance.
        """
        return cls(url=url, api_key=api_key)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load the file.

        Posts ``self.url`` to the 2markdown endpoint and yields the result.

        Yields:
            Exactly one ``Document`` whose ``page_content`` is the ``"article"``
            field of the JSON response and whose metadata records the
            requested url under ``"source"``.

        Raises:
            requests.HTTPError: If the service answers with a 4xx/5xx status.
        """
        response = requests.post(
            "https://2markdown.com/api/2md",
            headers={"X-Api-Key": self.api_key},
            json={"url": self.url},
        )
        # Surface HTTP failures (bad key, quota, server error) as such; an
        # error response may not be JSON or may lack the "article" key, which
        # would otherwise show up as an opaque decode error or KeyError.
        response.raise_for_status()
        text = response.json()["article"]
        metadata = {"source": self.url}
        yield Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            The documents produced by ``lazy_load``, collected into a list.
        """
        return list(self.lazy_load())