From 88d3ce12b8de8f461e8b95dc8a7f50f4feb4c0f1 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Sun, 16 Apr 2023 09:11:24 -0700
Subject: [PATCH] Harrison/diffbot (#2984)

Co-authored-by: Manuel Saelices
---
 .../document_loaders/examples/diffbot.ipynb | 99 +++++++++++++++++++
 langchain/document_loaders/__init__.py      |  2 +
 langchain/document_loaders/diffbot.py       | 55 +++++++++++
 3 files changed, 156 insertions(+)
 create mode 100644 docs/modules/indexes/document_loaders/examples/diffbot.ipynb
 create mode 100644 langchain/document_loaders/diffbot.py

diff --git a/docs/modules/indexes/document_loaders/examples/diffbot.ipynb b/docs/modules/indexes/document_loaders/examples/diffbot.ipynb
new file mode 100644
index 00000000..14f481f4
--- /dev/null
+++ b/docs/modules/indexes/document_loaders/examples/diffbot.ipynb
@@ -0,0 +1,99 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "2dfc4698",
+   "metadata": {},
+   "source": [
+    "# Diffbot\n",
+    "\n",
+    "This covers how to extract HTML documents from a list of URLs using the [Diffbot extract API](https://www.diffbot.com/products/extract/) into a document format that we can use downstream."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "836fbac1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "urls = [\n",
+    "    \"https://python.langchain.com/en/latest/index.html\",\n",
+    "]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "6fffec88",
+   "metadata": {},
+   "source": [
+    "The Diffbot Extract API requires an API token. Once you have it, you can extract the data from the URLs above.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "00f46fda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from langchain.document_loaders import DiffbotLoader\n",
+    "loader = DiffbotLoader(urls=urls, api_token=os.environ.get(\"DIFFBOT_API_TOKEN\"))"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "e0ce8c05",
+   "metadata": {},
+   "source": [
+    "With the `.load()` method, you can see the loaded documents."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b68a26b3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='LangChain is a framework for developing applications powered by language models. We believe that the most powerful and differentiated applications will not only call out to a language model via an API, but will also:\\nBe data-aware: connect a language model to other sources of data\\nBe agentic: allow a language model to interact with its environment\\nThe LangChain framework is designed with the above principles in mind.\\nThis is the Python specific portion of the documentation. For a purely conceptual guide to LangChain, see here. For the JavaScript documentation, see here.\\nGetting Started\\nCheckout the below guide for a walkthrough of how to get started using LangChain to create an Language Model application.\\nGetting Started Documentation\\nModules\\nThere are several main modules that LangChain provides support for. For each module we provide some examples to get started, how-to guides, reference docs, and conceptual guides. These modules are, in increasing order of complexity:\\nModels: The various model types and model integrations LangChain supports.\\nPrompts: This includes prompt management, prompt optimization, and prompt serialization.\\nMemory: Memory is the concept of persisting state between calls of a chain/agent. 
LangChain provides a standard interface for memory, a collection of memory implementations, and examples of chains/agents that use memory.\\nIndexes: Language models are often more powerful when combined with your own text data - this module covers best practices for doing exactly that.\\nChains: Chains go beyond just a single LLM call, and are sequences of calls (whether to an LLM or a different utility). LangChain provides a standard interface for chains, lots of integrations with other tools, and end-to-end chains for common applications.\\nAgents: Agents involve an LLM making decisions about which Actions to take, taking that Action, seeing an Observation, and repeating that until done. LangChain provides a standard interface for agents, a selection of agents to choose from, and examples of end to end agents.\\nUse Cases\\nThe above modules can be used in a variety of ways. LangChain also provides guidance and assistance in this. Below are some of the common use cases LangChain supports.\\nPersonal Assistants: The main LangChain use case. Personal assistants need to take actions, remember interactions, and have knowledge about your data.\\nQuestion Answering: The second big LangChain use case. Answering questions over specific documents, only utilizing the information in those documents to construct an answer.\\nChatbots: Since language models are good at producing text, that makes them ideal for creating chatbots.\\nQuerying Tabular Data: If you want to understand how to use LLMs to query data that is stored in a tabular format (csvs, SQL, dataframes, etc) you should read this page.\\nInteracting with APIs: Enabling LLMs to interact with APIs is extremely powerful in order to give them more up-to-date information and allow them to take actions.\\nExtraction: Extract structured information from text.\\nSummarization: Summarizing longer documents into shorter, more condensed chunks of information. A type of Data Augmented Generation.\\nEvaluation: Generative models are notoriously hard to evaluate with traditional metrics. One new way of evaluating them is using language models themselves to do the evaluation. LangChain provides some prompts/chains for assisting in this.\\nReference Docs\\nAll of LangChain’s reference documentation, in one place. Full documentation on all methods, classes, installation methods, and integration setups for LangChain.\\nReference Documentation\\nLangChain Ecosystem\\nGuides for how other companies/products can be used with LangChain\\nLangChain Ecosystem\\nAdditional Resources\\nAdditional collection of resources we think may be useful as you develop your application!\\nLangChainHub: The LangChainHub is a place to share and explore other prompts, chains, and agents.\\nGlossary: A glossary of all related terms, papers, methods, etc. Whether implemented in LangChain or not!\\nGallery: A collection of our favorite projects that use LangChain. Useful for finding inspiration or seeing how things were done in other applications.\\nDeployments: A collection of instructions, code snippets, and template repositories for deploying LangChain apps.\\nTracing: A guide on using tracing in LangChain to visualize the execution of chains and agents.\\nModel Laboratory: Experimenting with different prompts, models, and chains is a big part of developing the best possible application. 
The ModelLaboratory makes it easy to do so.\\nDiscord: Join us on our Discord to discuss all things LangChain!\\nProduction Support: As you move your LangChains into production, we’d love to offer more comprehensive support. Please fill out this form and we’ll set up a dedicated support Slack channel.', metadata={'source': 'https://python.langchain.com/en/latest/index.html'})]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index b5924cef..491b3956 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -16,6 +16,7 @@ from langchain.document_loaders.college_confidential import CollegeConfidentialL
 from langchain.document_loaders.conllu import CoNLLULoader
 from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.document_loaders.dataframe import DataFrameLoader
+from langchain.document_loaders.diffbot import DiffbotLoader
 from langchain.document_loaders.directory import DirectoryLoader
 from langchain.document_loaders.duckdb_loader import DuckDBLoader
 from langchain.document_loaders.email import (
@@ -141,6 +142,7 @@ __all__ = [
     "SitemapLoader",
     "DuckDBLoader",
     "BigQueryLoader",
+    "DiffbotLoader",
     "BiliBiliLoader",
     "SlackDirectoryLoader",
     "GitLoader",
diff --git a/langchain/document_loaders/diffbot.py b/langchain/document_loaders/diffbot.py
new file mode 100644
index 00000000..149c0c5a
--- /dev/null
+++ b/langchain/document_loaders/diffbot.py
@@ -0,0 +1,55 @@
+"""Loader that uses Diffbot to load webpages in text format."""
+import logging
+from typing import Any, List
+
+import requests
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+logger = logging.getLogger(__name__)
+
+
+class DiffbotLoader(BaseLoader):
+    """Loader that loads webpage text using the Diffbot Article API."""
+
+    def __init__(
+        self, api_token: str, urls: List[str], continue_on_failure: bool = True
+    ):
+        """Initialize with the Diffbot API token and the URLs to load."""
+        self.api_token = api_token
+        self.urls = urls
+        self.continue_on_failure = continue_on_failure
+
+    def _diffbot_api_url(self, diffbot_api: str) -> str:
+        return f"https://api.diffbot.com/v3/{diffbot_api}"
+
+    def _get_diffbot_data(self, url: str) -> Any:
+        """Get the extracted page data from the Diffbot REST API."""
+        # TODO: Add support for other Diffbot APIs
+        diffbot_url = self._diffbot_api_url("article")
+        params = {
+            "token": self.api_token,
+            "url": url,
+        }
+        response = requests.get(diffbot_url, params=params, timeout=10)
+
+        # TODO: handle non-ok errors
+        return response.json() if response.ok else {}
+
+    def load(self) -> List[Document]:
+        """Extract text via Diffbot from all the URLs and return Document instances."""
+        docs: List[Document] = []
+
+        for url in self.urls:
+            try:
+                data = self._get_diffbot_data(url)
+                text = data["objects"][0]["text"] if data.get("objects") else ""
+                metadata = {"source": url}
+                docs.append(Document(page_content=text, metadata=metadata))
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.error(f"Error fetching or processing {url}, exception: {e}")
+                else:
+                    raise
+        return docs
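
For reviewers trying the new loader outside the notebook, here is a minimal end-to-end sketch. It assumes a valid Diffbot token in the DIFFBOT_API_TOKEN environment variable; the URL is just an example, not part of the patch.

    import os

    from langchain.document_loaders import DiffbotLoader

    # Assumes DIFFBOT_API_TOKEN is set; any publicly reachable article URL works.
    loader = DiffbotLoader(
        api_token=os.environ["DIFFBOT_API_TOKEN"],
        urls=["https://python.langchain.com/en/latest/index.html"],
    )

    # load() returns one Document per URL, with the source URL in metadata.
    for doc in loader.load():
        print(doc.metadata["source"], len(doc.page_content))

Because continue_on_failure defaults to True, a bad URL in the batch is logged and skipped rather than aborting the whole load; pass continue_on_failure=False to fail fast instead.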