langchain/docs/modules/document_loaders/examples/online_pdf.ipynb

131 lines
44 KiB
Plaintext
Raw Normal View History

2023-02-10 23:42:30 +00:00
{
"cells": [
{
"cell_type": "markdown",
"id": "8d9e1096",
"metadata": {},
"source": [
"# Online PDF\n",
"\n",
"This covers how to load online pdfs into a document format that we can use downstream. This can be used for various online pdf sites such as https://open.umn.edu/opentextbooks/textbooks/ and https://arxiv.org/archive/"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3dde8f63",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import OnlinePDFLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "04e27651",
"metadata": {},
"outputs": [],
"source": [
"loader = OnlinePDFLoader(\"https://arxiv.org/pdf/2302.03803.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "85690c73",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/var/folders/v9/s3kf_mtn2k9f4chcfvsftj380000gq/T/tmp0nnippcn/online_file.pdf\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/langchain_db/langchain/document_loaders/online_pdf.py:30\u001b[0m, in \u001b[0;36mOnlinePDFLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 28\u001b[0m file\u001b[38;5;241m.\u001b[39mclose()\n\u001b[1;32m 29\u001b[0m loader \u001b[38;5;241m=\u001b[39m UnstructuredPDFLoader(file_path)\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/langchain_db/langchain/document_loaders/pdf.py:26\u001b[0m, in \u001b[0;36mUnstructuredPDFLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01munstructured\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpartition\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpdf\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m partition_pdf\n\u001b[1;32m 25\u001b[0m elements \u001b[38;5;241m=\u001b[39m partition_pdf(filename\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path)\n\u001b[0;32m---> 26\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([\u001b[38;5;28mstr\u001b[39m(el) \u001b[38;5;28;01mfor\u001b[39;00m el \u001b[38;5;129;01min\u001b[39;00m elements])\n\u001b[1;32m 27\u001b[0m metadata \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msource\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path}\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [Document(page_content\u001b[38;5;241m=\u001b[39mtext, metadata\u001b[38;5;241m=\u001b[39mmetadata)]\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/unstructured/partition/pdf.py:34\u001b[0m, in \u001b[0;36mpartition_pdf\u001b[0;34m(filename, file, url, template, token)\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m template \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 33\u001b[0m template \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayout/pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 34\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpartition_pdf_or_image\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtemplate\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/unstructured/partition/pdf.py:73\u001b[0m, in \u001b[0;36mpartition_pdf_or_image\u001b[0;34m(filename, file, url, template, token, is_image)\u001b[0m\n\u001b[1;32m 71\u001b[0m url \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;241m.\u001b[39mrstrip(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtemplate\u001b[38;5;241m.\u001b[39mlstrip(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;66;03m# NOTE(alan): Remove \"data=data\" after different models are handled by routing\u001b[39;00m\n\u001b[0;32m---> 73\u001b[0m layout_elements \u001b[38;5;241m=\u001b[39m \u001b[43m_partition_via_api\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 74\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 77\u001b[0m elements: List[Element] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m()\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m layout_element \u001b[38;5;129;01min\u001b[39;00m layout_elements:\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/unstructured/partition/__init__.py:34\u001b[0m, in \u001b[0;36m_partition_via_api\u001b[0;34m(filename, file, url, token, data)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint api healthcheck has failed!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 28\u001b[0m file_: Mapping[\u001b[38;5;28mstr\u001b[39m, Tuple[\u001b[38;5;28mstr\u001b[39m, Union[BinaryIO, \u001b[38;5;28mbytes\u001b[39m]]] \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 29\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfile\u001b[39m\u001b[38;5;124m\"\u001b[39m: (\n\u001b[1;32m 30\u001b[0m filename,\n\u001b[1;32m 31\u001b[0m file \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(filename, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 32\u001b[0m )\n\u001b[1;32m 33\u001b[0m }\n\u001b[0;32m---> 34\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mrequests\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpost\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mAuthorization\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mBearer \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mtoken\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[43m \u001b[49m\u001b[43mfiles\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 38\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# NOTE(alan): Remove after unstructured API is using routing\u001b[39;49;00m\n\u001b[1;32m 39\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[1;32m 42\u001b[0m pages \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpages\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/requests/api.py:115\u001b[0m, in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpost\u001b[39m(url, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, json\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 104\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Sends a POST request.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[38;5;124;03m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124;03m :rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpost\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/requests/api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;66;03m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m sessions\u001b[38;5;241m.\u001b[39mSession() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[0;32m---> 59\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/requests/sessions.py:587\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 582\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 583\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[1;32m 584\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 585\u001b[0m }\n\u001b[1;32m 586\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 587\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/requests/sessions.py:701\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 698\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[1;32m 700\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 701\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 703\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 704\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/requests/adapters.py:489\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m chunked:\n\u001b[0;32m--> 489\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 491\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 492\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 502\u001b[0m \u001b[38;5;66;03m# Send the request.\u001b[39;00m\n\u001b[1;32m 503\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(conn, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproxy_pool\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/urllib3/connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 700\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_proxy(conn)\n\u001b[1;32m 702\u001b[0m \u001b[38;5;66;03m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 704\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 706\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 707\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 708\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 709\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 710\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 711\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;66;03m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[1;32m 714\u001b[0m \u001b[38;5;66;03m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[1;32m 715\u001b[0m \u001b[38;5;66;03m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[1;32m 716\u001b[0m \u001b[38;5;66;03m# mess.\u001b[39;00m\n\u001b[1;32m 717\u001b[0m response_conn \u001b[38;5;241m=\u001b[39m conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/urllib3/connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 444\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39mgetresponse()\n\u001b[1;32m 445\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 446\u001b[0m \u001b[38;5;66;03m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[1;32m 447\u001b[0m \u001b[38;5;66;03m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[1;32m 448\u001b[0m \u001b[38;5;66;03m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m--> 449\u001b[0m \u001b[43msix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_from\u001b[49m\u001b[43m(\u001b[49m\u001b[43me\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_timeout(err\u001b[38;5;241m=\u001b[39me, url\u001b[38;5;241m=\u001b[39murl, timeout_value\u001b[38;5;241m=\u001b[39mread_timeout)\n",
"File \u001b[0;32m<string>:3\u001b[0m, in \u001b[0;36mraise_from\u001b[0;34m(value, from_value)\u001b[0m\n",
"File \u001b[0;32m~/langchain_db/.venv/lib/python3.8/site-packages/urllib3/connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 442\u001b[0m \u001b[38;5;66;03m# Python 3\u001b[39;00m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 444\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 445\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 446\u001b[0m \u001b[38;5;66;03m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[1;32m 447\u001b[0m \u001b[38;5;66;03m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[1;32m 448\u001b[0m \u001b[38;5;66;03m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m 449\u001b[0m six\u001b[38;5;241m.\u001b[39mraise_from(e, \u001b[38;5;28;01mNone\u001b[39;00m)\n",
"File \u001b[0;32m~/.pyenv/versions/3.8.1/lib/python3.8/http/client.py:1322\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1322\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1323\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[1;32m 1324\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
"File \u001b[0;32m~/.pyenv/versions/3.8.1/lib/python3.8/http/client.py:303\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 301\u001b[0m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 303\u001b[0m version, status, reason \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m!=\u001b[39m CONTINUE:\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
"File \u001b[0;32m~/.pyenv/versions/3.8.1/lib/python3.8/http/client.py:264\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 264\u001b[0m line \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miso-8859-1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) \u001b[38;5;241m>\u001b[39m _MAXLINE:\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus line\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/.pyenv/versions/3.8.1/lib/python3.8/socket.py:669\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 668\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 669\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 670\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 671\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
"File \u001b[0;32m~/.pyenv/versions/3.8.1/lib/python3.8/ssl.py:1241\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1237\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m flags \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1238\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1239\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m\n\u001b[1;32m 1240\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m)\n\u001b[0;32m-> 1241\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1242\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1243\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mrecv_into(buffer, nbytes, flags)\n",
"File \u001b[0;32m~/.pyenv/versions/3.8.1/lib/python3.8/ssl.py:1099\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1098\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1099\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sslobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1100\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sslobj\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;28mlen\u001b[39m)\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2d48610e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(page_content='A WEAK ( k, k ) -LEFSCHETZ THEOREM FOR PROJECTIVE TORIC ORBIFOLDS\\n\\nWilliam D. Montoya\\n\\nInstituto de Matem´atica, Estat´ıstica e Computa¸c˜ao Cient´ıfica,\\n\\nFirstly we show a generalization of the ( 1 , 1 ) -Lefschetz theorem for projective toric orbifolds and secondly we prove that on 2 k -dimensional quasi-smooth hyper- surfaces coming from quasi-smooth intersection surfaces, under the Cayley trick, every rational ( k, k ) -cohomology class is algebraic, i.e., the Hodge conjecture holds\\n\\nIn [3] we proved that, under suitable conditions, on a very general codimension s quasi- smooth intersection subvariety X in a projective toric orbifold P d Σ with d + s = 2 ( k + 1 ) the Hodge conjecture holds, that is, every ( p, p ) -cohomology class, under the Poincar´e duality is a rational linear combination of fundamental classes of algebraic subvarieties of X . The proof of the above-mentioned result relies, for p ≠ d + 1 s , on a Lefschetz\\n\\nKeywords: (1,1)- Lefschetz theorem, Hodge conjecture, toric varieties, complete intersection Email: wmontoya@ime.unicamp.br\\n\\ntheorem ([7]) and the Hard Lefschetz theorem for projective orbifolds ([11]). When p = d + 1 s the proof relies on the Cayley trick, a trick which associates to X a quasi-smooth hypersurface Y in a projective vector bundle, and the Cayley Proposition (4.3) which gives an isomorphism of some primitive cohomologies (4.2) of X and Y . The Cayley trick, following the philosophy of Mavlyutov in [7], reduces results known for quasi-smooth hypersurfaces to quasi-smooth intersection subvarieties. The idea in this paper goes the other way around, we translate some results for quasi-smooth intersection subvarieties to quasi-smooth hypersurfaces, mainly the ( 1 , 1 ) -Lefschetz theorem.\\n\\nAcknowledgement. I thank Prof. Ugo Bruzzo and Tiago Fonseca for useful discus- sions. I also acknowledge support from FAPESP postdoctoral grant No. 2019/23499-7.\\n\\nPreliminaries and Notation\\n\\nLet M be a free abelian group of rank d , let N = Hom ( M, Z ) , and N R = N ⊗ Z R\\n\\nif there exist k linearly independent primitive elements e\\n\\n, . . . , e k ∈ N such that σ = { µ\\n\\ne\\n\\n+ ⋯ + µ k e k } . • The generators e i are integral if for every i and any nonnegative rational number µ the product µe i is in N only if µ is an integer. • Given two rational simplicial cones σ , σ one says that σ is a face of σ ( σ < σ ) if the set of integral generators of σ is a subset of the set of integral generators of σ . • A finite set Σ = { σ\\n\\n, . . . , σ t } of rational simplicial cones is called a rational simplicial complete d -dimensional fan if:\\n\\nall faces of cones in Σ are in Σ ;\\n\\nif σ, σ ∈ Σ then σσ < σ and σσ < σ ;\\n\\nN R = σ\\n\\n ⋅ ⋅ ⋅ σ t .\\n\\nA rational simplicial complete d -dimensional fan Σ defines a d -dimensional toric variety P d Σ having only orbifold singularities which we assume to be projective. Moreover, T = N ⊗ Z C ≃ ( C ) d is the torus action on P d Σ . We denote by Σ ( i ) the i -dimensional cones\\n\\nFor a cone σ ∈ Σ, ˆ σ is the set of 1-dimensional cone in Σ that are not contained in σ\\n\\nand x ˆ σ = ∏ ρˆ σ x ρ is the associated monomial in S .\\n\\nDefinition 2.2. The irrelevant ideal of P d Σ is the monomial ideal B Σ =< x ˆ σ σ ∈ Σ > and the zero locus Z ( Σ ) = V ( B Σ ) in the affine space A d = Spec ( S ) is the irrelevant locus.\\n\\nProposition 2.3 (Theorem 5.1.11 [5]) . The toric variety P d Σ is a categorical quotient A d Z ( Σ ) by the group Hom ( Cl ( Σ ) , C ) and the group action is induced by the Cl ( Σ ) - grading of S .\\n\\nNow we give a brief introduction to complex orbifolds and we mention the needed theorems for the next section. Namely: de Rham theorem and Dolbeault theorem for complex orbifolds.\\n\\nDefinition 2.4. A complex orbifold of complex dimension d is a sin
]
}
],
"source": [
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3258869",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}