From 44ecec38961cbb58d292a850e129c654f46278dd Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Wed, 8 Feb 2023 00:35:33 -0800 Subject: [PATCH] Harrison/add roam loader (#939) --- .../examples/gcs_directory.ipynb | 156 ++++++++++++++++++ .../document_loaders/examples/gcs_file.ipynb | 104 ++++++++++++ .../document_loaders/examples/roam.ipynb | 78 +++++++++ .../examples/s3_directory.ipynb | 134 +++++++++++++++ .../document_loaders/examples/s3_file.ipynb | 94 +++++++++++ .../document_loaders/examples/youtube.ipynb | 88 ++++++++++ .../document_loaders/how_to_guides.rst | 13 ++ langchain/document_loaders/__init__.py | 12 ++ langchain/document_loaders/gcs_directory.py | 32 ++++ langchain/document_loaders/gcs_file.py | 40 +++++ langchain/document_loaders/obsidian.py | 2 +- langchain/document_loaders/roam.py | 25 +++ langchain/document_loaders/s3_directory.py | 32 ++++ langchain/document_loaders/s3_file.py | 32 ++++ langchain/document_loaders/youtube.py | 35 ++++ 15 files changed, 876 insertions(+), 1 deletion(-) create mode 100644 docs/modules/document_loaders/examples/gcs_directory.ipynb create mode 100644 docs/modules/document_loaders/examples/gcs_file.ipynb create mode 100644 docs/modules/document_loaders/examples/roam.ipynb create mode 100644 docs/modules/document_loaders/examples/s3_directory.ipynb create mode 100644 docs/modules/document_loaders/examples/s3_file.ipynb create mode 100644 docs/modules/document_loaders/examples/youtube.ipynb create mode 100644 langchain/document_loaders/gcs_directory.py create mode 100644 langchain/document_loaders/gcs_file.py create mode 100644 langchain/document_loaders/roam.py create mode 100644 langchain/document_loaders/s3_directory.py create mode 100644 langchain/document_loaders/s3_file.py create mode 100644 langchain/document_loaders/youtube.py diff --git a/docs/modules/document_loaders/examples/gcs_directory.ipynb b/docs/modules/document_loaders/examples/gcs_directory.ipynb new file mode 100644 index 0000000000..963d20f0be 
This covers how to load document objects from a Google Cloud Storage (GCS) directory.
You can also specify a prefix for more fine-grained control over what files to load.
For more information about service accounts, see https://cloud.google.com/docs/authentication/\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/Users/harrisonchase/workplace/langchain/.venv/lib/python3.10/site-packages/google/auth/_default.py:83: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. We recommend you rerun `gcloud auth application-default login` and make sure a quota project is added. Or you can use service accounts instead. For more information about service accounts, see https://cloud.google.com/docs/authentication/\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpylg6291i/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9c0734f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/gcs_file.ipynb b/docs/modules/document_loaders/examples/gcs_file.ipynb new file mode 100644 index 0000000000..2399c9dfe8 --- /dev/null +++ b/docs/modules/document_loaders/examples/gcs_file.ipynb @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": 
This covers how to load document objects from a Google Cloud Storage (GCS) file object.
For more information about service accounts, see https://cloud.google.com/docs/authentication/\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmp3srlf8n8/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eba3002d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/roam.ipynb b/docs/modules/document_loaders/examples/roam.ipynb new file mode 100644 index 0000000000..2174b74400 --- /dev/null +++ b/docs/modules/document_loaders/examples/roam.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1dc7df1d", + "metadata": {}, + "source": [ + "# Roam\n", + "This notebook covers how to load documents from a Roam database. This takes a lot of inspiration from the example repo [here](https://github.com/JimmyLv/roam-qa).\n", + "\n", + "## 🧑 Instructions for ingesting your own dataset\n", + "\n", + "Export your dataset from Roam Research. You can do this by clicking on the three dots in the upper right hand corner and then clicking `Export`.\n", + "\n", + "When exporting, make sure to select the `Markdown & CSV` format option.\n", + "\n", + "This will produce a `.zip` file in your Downloads folder. 
"loader = RoamLoader(\"Roam_DB\")"
You can also specify a prefix for more fine-grained control over what files to load.
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72d44781", + "metadata": {}, + "outputs": [], + "source": [ + "loader = S3DirectoryLoader(\"testing-hwc\", prefix=\"fake\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2d3c32db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885dc280", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/s3_file.ipynb b/docs/modules/document_loaders/examples/s3_file.ipynb new file mode 100644 index 0000000000..2225c4d61c --- /dev/null +++ b/docs/modules/document_loaders/examples/s3_file.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66a7777e", + "metadata": {}, + "source": [ + "# s3 File\n", + "\n", + "This covers how to load document objects from an s3 file object." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9ec8a3b3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import S3FileLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "43128d8d", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install boto3" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "35d6809a", + "metadata": {}, + "outputs": [], + "source": [ + "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "efd6be84", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93689594", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/youtube.ipynb b/docs/modules/document_loaders/examples/youtube.ipynb new file mode 100644 index 0000000000..ffbbf5e335 --- /dev/null +++ b/docs/modules/document_loaders/examples/youtube.ipynb @@ -0,0 +1,88 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "df770c72", + "metadata": {}, + "source": [ + "# YouTube\n", + "\n", + "How to load documents from YouTube transcripts." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "da4a867f", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import YoutubeLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "34a25b57", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# !pip install youtube-transcript-api" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bc8b308a", + "metadata": {}, + "outputs": [], + "source": [ + "loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d073dd36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED \"THE LAST OF US\" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE \"GAME OF THRONES\" AND \"STAR WARS,\" BUT I\\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES \"THE MANDALORIAN\" AND THE NEXT THING I KNOW, I\\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS PORNY. PEOPLE WALKING BY ON THE STREET SEE ME WHISPERING TO A 6-YEAR-OLD KID. I CAN BRING YOU IN WARM, OR I CAN BRING YOU IN COLD. EVEN THOUGH I CAME TO THE U.S. WHEN I WAS LITTLE, I WAS BORN IN CHILE, AND I HAVE 34 FIRST COUSINS WHO ARE STILL THERE. 
THEY\\'RE VERY PROUD OF ME. I KNOW THEY\\'RE PROUD BECAUSE THEY GIVE MY PHONE NUMBER TO EVERY PERSON THEY MEET, WHICH MEANS EVERY DAY, SOMEONE IN SANTIAGO WILL TEXT ME STUFF LIKE, CAN YOU COME TO MY WEDDING, OR CAN YOU SING MY PRIEST HAPPY BIRTHDAY, OR IS BABY YODA MEAN IN REAL LIFE. SO I HAVE TO BE LIKE NO, NO, AND HIS NAME IS GROGU. BUT MY COUSINS WEREN\\'T ALWAYS SO PROUD. EARLY IN MY CAREER, I PLAYED SMALL PARTS IN EVERY CRIME SHOW. I EVEN PLAYED TWO DIFFERENT CHARACTERS ON \"LAW AND ORDER.\" TITO CABASSA WHO LOOKED LIKE THIS. AND ONE YEAR LATER, I PLAYED REGGIE LUCKMAN WHO LOOKS LIKE THIS. AND THAT, MY FRIENDS, IS CALLED RANGE. BUT IT IS AMAZING TO BE HERE, LIKE I SAID. I WAS BORN IN CHILE, AND NINE MONTHS LATER, MY PARENTS FLED AND BROUGHT ME AND MY SISTER TO THE U.S. THEY WERE SO BRAVE, AND WITHOUT THEM, I WOULDN\\'T BE HERE IN THIS WONDERFUL COUNTRY, AND I CERTAINLY WOULDN\\'T BE STANDING HERE WITH YOU ALL TONIGHT. SO TO ALL MY FAMILY WATCHING IN CHILE, I WANT TO SAY [ SPEAKING NON-ENGLISH ] WHICH MEANS, I LOVE YOU, I MISS YOU, AND STOP GIVING OUT MY PHONE NUMBER. WE\\'VE GOT AN AMAZING SHOW FOR YOU TONIGHT. 
COLDPLAY IS HERE, SO STICK', lookup_str='', metadata={'source': 'QsYGlZkevEg'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst index 7d64326ddd..ee807d4db5 100644 --- a/docs/modules/document_loaders/how_to_guides.rst +++ b/docs/modules/document_loaders/how_to_guides.rst @@ -25,6 +25,19 @@ There are a lot of different document loaders that LangChain supports. Below are `Obsidian <./examples/obsidian.html>`_: A walkthrough of how to load data from an Obsidian file dump. +`Roam <./examples/roam.html>`_: A walkthrough of how to load data from a Roam file export. + +`YouTube <./examples/youtube.html>`_: A walkthrough of how to load the transcript from a YouTube video. + +`s3 File <./examples/s3_file.html>`_: A walkthrough of how to load a file from s3. + +`s3 Directory <./examples/s3_directory.html>`_: A walkthrough of how to load all files in a directory from s3. + +`GCS File <./examples/gcs_file.html>`_: A walkthrough of how to load a file from Google Cloud Storage (GCS). + +`GCS Directory <./examples/gcs_directory.html>`_: A walkthrough of how to load all files in a directory from Google Cloud Storage (GCS). + + .. 
toctree:: :maxdepth: 1 :glob: diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 0b5b3b6c78..c95f2d072d 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -3,6 +3,8 @@ from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.docx import UnstructuredDocxLoader from langchain.document_loaders.email import UnstructuredEmailLoader +from langchain.document_loaders.gcs_directory import GCSDirectoryLoader +from langchain.document_loaders.gcs_file import GCSFileLoader from langchain.document_loaders.googledrive import GoogleDriveLoader from langchain.document_loaders.html import UnstructuredHTMLLoader from langchain.document_loaders.notion import NotionDirectoryLoader @@ -10,7 +12,11 @@ from langchain.document_loaders.obsidian import ObsidianLoader from langchain.document_loaders.pdf import UnstructuredPDFLoader from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader from langchain.document_loaders.readthedocs import ReadTheDocsLoader +from langchain.document_loaders.roam import RoamLoader +from langchain.document_loaders.s3_directory import S3DirectoryLoader +from langchain.document_loaders.s3_file import S3FileLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader +from langchain.document_loaders.youtube import YoutubeLoader __all__ = [ "UnstructuredFileLoader", @@ -24,4 +30,10 @@ __all__ = [ "ObsidianLoader", "UnstructuredDocxLoader", "UnstructuredEmailLoader", + "RoamLoader", + "YoutubeLoader", + "S3FileLoader", + "S3DirectoryLoader", + "GCSFileLoader", + "GCSDirectoryLoader", ] diff --git a/langchain/document_loaders/gcs_directory.py b/langchain/document_loaders/gcs_directory.py new file mode 100644 index 0000000000..52939eb3b0 --- /dev/null +++ b/langchain/document_loaders/gcs_directory.py @@ -0,0 +1,32 @@ +"""Loading logic for loading documents from an GCS directory.""" +from typing 
"Please install it with `pip install google-cloud-storage`."
"Please install it with `pip install google-cloud-storage`."
"Please install it with `pip install boto3`." + ) + s3 = boto3.resource("s3")
"Please install it with `pip install youtube-transcript-api`."