langchain/docs/extras/integrations/document_loaders/async_html.ipynb

108 lines
4.1 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
"id": "e229e34c",
"metadata": {},
"source": [
"# AsyncHtmlLoader\n",
"\n",
"AsyncHtmlLoader loads raw HTML from a list of urls concurrently."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4c8e4dab",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import AsyncHtmlLoader"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e76b5ddc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching pages: 100%|############| 2/2 [00:00<00:00, 9.96it/s]\n"
]
}
],
"source": [
"urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
"loader = AsyncHtmlLoader(urls)\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5dca1c0c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"' news. Stream exclusive games on ESPN+ and play fantasy sports.\" />\\n<meta property=\"og:image\" content=\"https://a1.espncdn.com/combiner/i?img=%2Fi%2Fespn%2Fespn_logos%2Fespn_red.png\"/>\\n<meta property=\"og:image:width\" content=\"1200\" />\\n<meta property=\"og:image:height\" content=\"630\" />\\n<meta property=\"og:type\" content=\"website\" />\\n<meta name=\"twitter:site\" content=\"espn\" />\\n<meta name=\"twitter:url\" content=\"https://www.espn.com\" />\\n<meta name=\"twitter:title\" content=\"ESPN - Serving Sports Fans. Anytime. Anywhere.\"/>\\n<meta name=\"twitter:description\" content=\"Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.\" />\\n<meta name=\"twitter:card\" content=\"summary\">\\n<meta name=\"twitter:app:name:iphone\" content=\"ESPN\"/>\\n<meta name=\"twitter:app:id:iphone\" content=\"317469184\"/>\\n<meta name=\"twitter:app:name:googleplay\" content=\"ESPN\"/>\\n<meta name=\"twitter:app:id:googleplay\" content=\"com.espn.score_center\"/>\\n<meta name=\"title\" content=\"ESPN - '"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].page_content[1000:2000]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4d024f0f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'al\" href=\"https://lilianweng.github.io/posts/2023-06-23-agent/\" />\\n<link crossorigin=\"anonymous\" href=\"/assets/css/stylesheet.min.67a6fb6e33089cb29e856bcc95d7aa39f70049a42b123105531265a0d9f1258b.css\" integrity=\"sha256-Z6b7bjMInLKehWvMldeqOfcASaQrEjEFUxJloNnxJYs=\" rel=\"preload stylesheet\" as=\"style\">\\n<script defer crossorigin=\"anonymous\" src=\"/assets/js/highlight.min.7680afc38aa6b15ddf158a4f3780b7b1f7dde7e91d26f073e6229bb7a0793c92.js\" integrity=\"sha256-doCvw4qmsV3fFYpPN4C3sffd5&#43;kdJvBz5iKbt6B5PJI=\"\\n onload=\"hljs.initHighlightingOnLoad();\"></script>\\n<link rel=\"icon\" href=\"https://lilianweng.github.io/favicon_peach.ico\">\\n<link rel=\"icon\" type=\"image/png\" sizes=\"16x16\" href=\"https://lilianweng.github.io/favicon-16x16.png\">\\n<link rel=\"icon\" type=\"image/png\" sizes=\"32x32\" href=\"https://lilianweng.github.io/favicon-32x32.png\">\\n<link rel=\"apple-touch-icon\" href=\"https://lilianweng.github.io/apple-touch-icon.png\">\\n<link rel=\"mask-icon\" href=\"https://lilianweng.github.io/safari-pinned-tab.'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[1].page_content[1000:2000]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}