mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/youtube loader (#1545)
Co-authored-by: Julian Wustl <57504258+Julianwustl@users.noreply.github.com>
This commit is contained in:
parent
b053f831cd
commit
cc423f40f1
@ -1,28 +1,86 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "df770c72",
|
"id": "df770c72",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# YouTube\n",
|
"# YouTube\n",
|
||||||
"\n",
|
"\n",
|
||||||
"How to load documents from YouTube transcripts."
|
"How to load documents from YouTube transcripts.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Prerequisites\n",
|
||||||
|
"\n",
|
||||||
|
"1. Create a Google Cloud project or use an existing project\n",
|
||||||
|
"1. Enable the [YouTube Data API](https://console.cloud.google.com/apis/enableflow?apiid=youtube.googleapis.com)\n",
|
||||||
|
"1. [Authorize credentials for desktop app](https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application)\n",
|
||||||
|
"1. `pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib youtube-transcript-api`\n",
|
||||||
|
"\n",
|
||||||
|
"## 🧑 Instructions for ingesting your YouTube data\n",
|
||||||
|
"By default, the `GoogleApiClient` expects the `credentials.json` file to be `~/.credentials/credentials.json`, but this is configurable using the `credentials_path` keyword argument. Same thing with `token.json`, which is configurable using `token_path`. Note that `token.json` will be created automatically the first time you use the loader.\n",
|
||||||
|
"\n",
|
||||||
|
"`GoogleApiYoutubeLoader` can load from a channel name or from a list of YouTube video ids. You can obtain a video id from the video URL (the value after `watch?v=`).\n",
|
||||||
|
"Note depending on your set up, the `service_account_path` needs to be set up. See [here](https://developers.google.com/drive/api/v3/quickstart/python) for more details."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"id": "da4a867f",
|
"id": "da4a867f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain.document_loaders import YoutubeLoader"
|
"from langchain.document_loaders import GoogleApiClient, GoogleApiYoutubeLoader"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": null,
|
||||||
|
"id": "3994986e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Init the GoogleApiClient \n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"google_api_client = GoogleApiClient(credentials_path=Path(\"your_path_creds.json\"))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Use a Channel\n",
|
||||||
|
"youtube_loader_channel = GoogleApiYoutubeLoader(google_api_client=google_api_client, channel_name=\"Reducible\",captions_language=\"en\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Use Youtube Ids\n",
|
||||||
|
"\n",
|
||||||
|
"youtube_loader_ids = GoogleApiYoutubeLoader(google_api_client=google_api_client, video_ids=[\"TrdevFK_am4\"], add_video_info=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# returns a list of Documents\n",
|
||||||
|
"youtube_loader_channel.load()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "507506db",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "427d5745",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import YoutubeLoader\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"id": "34a25b57",
|
"id": "34a25b57",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
@ -34,7 +92,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": null,
|
||||||
"id": "bc8b308a",
|
"id": "bc8b308a",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -44,21 +102,10 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": null,
|
||||||
"id": "d073dd36",
|
"id": "d073dd36",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[Document(page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED \"THE LAST OF US\" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE \"GAME OF THRONES\" AND \"STAR WARS,\" BUT I\\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES \"THE MANDALORIAN\" AND THE NEXT THING I KNOW, I\\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS PORNY. PEOPLE WALKING BY ON THE STREET SEE ME WHISPERING TO A 6-YEAR-OLD KID. I CAN BRING YOU IN WARM, OR I CAN BRING YOU IN COLD. EVEN THOUGH I CAME TO THE U.S. WHEN I WAS LITTLE, I WAS BORN IN CHILE, AND I HAVE 34 FIRST COUSINS WHO ARE STILL THERE. THEY\\'RE VERY PROUD OF ME. I KNOW THEY\\'RE PROUD BECAUSE THEY GIVE MY PHONE NUMBER TO EVERY PERSON THEY MEET, WHICH MEANS EVERY DAY, SOMEONE IN SANTIAGO WILL TEXT ME STUFF LIKE, CAN YOU COME TO MY WEDDING, OR CAN YOU SING MY PRIEST HAPPY BIRTHDAY, OR IS BABY YODA MEAN IN REAL LIFE. SO I HAVE TO BE LIKE NO, NO, AND HIS NAME IS GROGU. BUT MY COUSINS WEREN\\'T ALWAYS SO PROUD. EARLY IN MY CAREER, I PLAYED SMALL PARTS IN EVERY CRIME SHOW. I EVEN PLAYED TWO DIFFERENT CHARACTERS ON \"LAW AND ORDER.\" TITO CABASSA WHO LOOKED LIKE THIS. AND ONE YEAR LATER, I PLAYED REGGIE LUCKMAN WHO LOOKS LIKE THIS. AND THAT, MY FRIENDS, IS CALLED RANGE. BUT IT IS AMAZING TO BE HERE, LIKE I SAID. 
I WAS BORN IN CHILE, AND NINE MONTHS LATER, MY PARENTS FLED AND BROUGHT ME AND MY SISTER TO THE U.S. THEY WERE SO BRAVE, AND WITHOUT THEM, I WOULDN\\'T BE HERE IN THIS WONDERFUL COUNTRY, AND I CERTAINLY WOULDN\\'T BE STANDING HERE WITH YOU ALL TONIGHT. SO TO ALL MY FAMILY WATCHING IN CHILE, I WANT TO SAY [ SPEAKING NON-ENGLISH ] WHICH MEANS, I LOVE YOU, I MISS YOU, AND STOP GIVING OUT MY PHONE NUMBER. WE\\'VE GOT AN AMAZING SHOW FOR YOU TONIGHT. COLDPLAY IS HERE, SO STICK', lookup_str='', metadata={'source': 'QsYGlZkevEg', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'First-time host Pedro Pascal talks about filming The Last of Us and being recognized by fans.\\n\\nSaturday Night Live. Stream now on Peacock: https://pck.tv/3uQxh4q\\n\\nSubscribe to SNL: https://goo.gl/tUsXwM\\nStream Current Full Episodes: http://www.nbc.com/saturday-night-live\\n\\nWATCH PAST SNL SEASONS\\nGoogle Play - http://bit.ly/SNLGooglePlay\\niTunes - http://bit.ly/SNLiTunes\\n\\nSNL ON SOCIAL\\nSNL Instagram: http://instagram.com/nbcsnl\\nSNL Facebook: https://www.facebook.com/snl\\nSNL Twitter: https://twitter.com/nbcsnl\\nSNL TikTok: https://www.tiktok.com/@nbcsnl\\n\\nGET MORE NBC\\nLike NBC: http://Facebook.com/NBC\\nFollow NBC: http://Twitter.com/NBC\\nNBC Tumblr: http://NBCtv.tumblr.com/\\nYouTube: http://www.youtube.com/nbc\\nNBC Instagram: http://instagram.com/nbc\\n\\n#SNL #PedroPascal #SNL48 #Coldplay', 'view_count': 1175057, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/sddefault.jpg', 'publish_date': datetime.datetime(2023, 2, 4, 0, 0), 'length': 224, 'author': 'Saturday Night Live'}, lookup_index=0)]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"loader.load()"
|
"loader.load()"
|
||||||
]
|
]
|
||||||
@ -73,7 +120,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": null,
|
||||||
"id": "ba28af69",
|
"id": "ba28af69",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -83,7 +130,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": null,
|
||||||
"id": "9b8ea390",
|
"id": "9b8ea390",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -93,21 +140,10 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": null,
|
||||||
"id": "97b98e92",
|
"id": "97b98e92",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[Document(page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED \"THE LAST OF US\" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE \"GAME OF THRONES\" AND \"STAR WARS,\" BUT I\\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES \"THE MANDALORIAN\" AND THE NEXT THING I KNOW, I\\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS PORNY. PEOPLE WALKING BY ON THE STREET SEE ME WHISPERING TO A 6-YEAR-OLD KID. I CAN BRING YOU IN WARM, OR I CAN BRING YOU IN COLD. EVEN THOUGH I CAME TO THE U.S. WHEN I WAS LITTLE, I WAS BORN IN CHILE, AND I HAVE 34 FIRST COUSINS WHO ARE STILL THERE. THEY\\'RE VERY PROUD OF ME. I KNOW THEY\\'RE PROUD BECAUSE THEY GIVE MY PHONE NUMBER TO EVERY PERSON THEY MEET, WHICH MEANS EVERY DAY, SOMEONE IN SANTIAGO WILL TEXT ME STUFF LIKE, CAN YOU COME TO MY WEDDING, OR CAN YOU SING MY PRIEST HAPPY BIRTHDAY, OR IS BABY YODA MEAN IN REAL LIFE. SO I HAVE TO BE LIKE NO, NO, AND HIS NAME IS GROGU. BUT MY COUSINS WEREN\\'T ALWAYS SO PROUD. EARLY IN MY CAREER, I PLAYED SMALL PARTS IN EVERY CRIME SHOW. I EVEN PLAYED TWO DIFFERENT CHARACTERS ON \"LAW AND ORDER.\" TITO CABASSA WHO LOOKED LIKE THIS. AND ONE YEAR LATER, I PLAYED REGGIE LUCKMAN WHO LOOKS LIKE THIS. AND THAT, MY FRIENDS, IS CALLED RANGE. BUT IT IS AMAZING TO BE HERE, LIKE I SAID. 
I WAS BORN IN CHILE, AND NINE MONTHS LATER, MY PARENTS FLED AND BROUGHT ME AND MY SISTER TO THE U.S. THEY WERE SO BRAVE, AND WITHOUT THEM, I WOULDN\\'T BE HERE IN THIS WONDERFUL COUNTRY, AND I CERTAINLY WOULDN\\'T BE STANDING HERE WITH YOU ALL TONIGHT. SO TO ALL MY FAMILY WATCHING IN CHILE, I WANT TO SAY [ SPEAKING NON-ENGLISH ] WHICH MEANS, I LOVE YOU, I MISS YOU, AND STOP GIVING OUT MY PHONE NUMBER. WE\\'VE GOT AN AMAZING SHOW FOR YOU TONIGHT. COLDPLAY IS HERE, SO STICK', lookup_str='', metadata={'source': 'QsYGlZkevEg', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'First-time host Pedro Pascal talks about filming The Last of Us and being recognized by fans.\\n\\nSaturday Night Live. Stream now on Peacock: https://pck.tv/3uQxh4q\\n\\nSubscribe to SNL: https://goo.gl/tUsXwM\\nStream Current Full Episodes: http://www.nbc.com/saturday-night-live\\n\\nWATCH PAST SNL SEASONS\\nGoogle Play - http://bit.ly/SNLGooglePlay\\niTunes - http://bit.ly/SNLiTunes\\n\\nSNL ON SOCIAL\\nSNL Instagram: http://instagram.com/nbcsnl\\nSNL Facebook: https://www.facebook.com/snl\\nSNL Twitter: https://twitter.com/nbcsnl\\nSNL TikTok: https://www.tiktok.com/@nbcsnl\\n\\nGET MORE NBC\\nLike NBC: http://Facebook.com/NBC\\nFollow NBC: http://Twitter.com/NBC\\nNBC Tumblr: http://NBCtv.tumblr.com/\\nYouTube: http://www.youtube.com/nbc\\nNBC Instagram: http://instagram.com/nbc\\n\\n#SNL #PedroPascal #SNL48 #Coldplay', 'view_count': 1175057, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/sddefault.jpg', 'publish_date': datetime.datetime(2023, 2, 4, 0, 0), 'length': 224, 'author': 'Saturday Night Live'}, lookup_index=0)]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"loader.load()"
|
"loader.load()"
|
||||||
]
|
]
|
||||||
@ -115,7 +151,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": ".venv",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -129,7 +165,12 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.1"
|
"version": "3.10.6"
|
||||||
|
},
|
||||||
|
"vscode": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "604c1013f65d31a2eb1fca07aae054bedd5a5a0d272dbb31e502c81f0b254b99"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -44,7 +44,11 @@ from langchain.document_loaders.unstructured import (
|
|||||||
from langchain.document_loaders.url import UnstructuredURLLoader
|
from langchain.document_loaders.url import UnstructuredURLLoader
|
||||||
from langchain.document_loaders.web_base import WebBaseLoader
|
from langchain.document_loaders.web_base import WebBaseLoader
|
||||||
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
|
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
|
||||||
from langchain.document_loaders.youtube import YoutubeLoader
|
from langchain.document_loaders.youtube import (
|
||||||
|
GoogleApiClient,
|
||||||
|
GoogleApiYoutubeLoader,
|
||||||
|
YoutubeLoader,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"UnstructuredFileLoader",
|
"UnstructuredFileLoader",
|
||||||
@ -88,4 +92,6 @@ __all__ = [
|
|||||||
"FacebookChatLoader",
|
"FacebookChatLoader",
|
||||||
"NotebookLoader",
|
"NotebookLoader",
|
||||||
"CoNLLULoader",
|
"CoNLLULoader",
|
||||||
|
"GoogleApiYoutubeLoader",
|
||||||
|
"GoogleApiClient",
|
||||||
]
|
]
|
||||||
|
@ -1,11 +1,98 @@
|
|||||||
"""Loader that loads YouTube transcript."""
|
"""Loader that loads YouTube transcript."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Any, List
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from pydantic import root_validator
|
||||||
|
from pydantic.dataclasses import dataclass
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
# OAuth scope requested for user credentials. This module builds a YouTube Data
# API client, so the token needs the read-only YouTube scope; a Google Drive
# scope would be rejected by the YouTube endpoints.
SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GoogleApiClient:
    """A generic Google API client that loads credentials on construction.

    To use, you should have the ``google_auth_oauthlib``,
    ``youtube_transcript_api`` and ``google`` python packages installed.
    As the Google API expects credentials, you need to set up a Google account
    and register your service:
    "https://developers.google.com/docs/api/quickstart/python"

    Credentials are resolved by ``_load_credentials`` in this order: a service
    account file at ``service_account_path``, a stored user token at
    ``token_path``, and finally an interactive OAuth flow driven by the client
    secrets file at ``credentials_path``.

    Example:
        .. code-block:: python

            from pathlib import Path

            from langchain.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )

    """

    # Client-secrets file used for the interactive OAuth flow.
    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
    # NOTE(review): shares its default with ``credentials_path``, so any file at
    # the default location is tried as a service-account key first (see
    # ``_load_credentials``) -- confirm this is intended.
    service_account_path: Path = Path.home() / ".credentials" / "credentials.json"
    # Where the user token is read from and written to.
    token_path: Path = Path.home() / ".credentials" / "token.json"

    def __post_init__(self) -> None:
        """Resolve credentials once and keep them on ``self.creds``."""
        self.creds = self._load_credentials()

    @root_validator
    def validate_channel_or_videoIds_is_set(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that credentials_path or service_account_path is set.

        The method name is kept as-is for compatibility; it checks the
        credential paths, not channel names or video ids.
        """
        if not values.get("credentials_path") and not values.get(
            "service_account_path"
        ):
            raise ValueError(
                "Must specify either credentials_path or service_account_path"
            )
        return values

    def _load_credentials(self) -> Any:
        """Load credentials.

        Returns service-account credentials when ``service_account_path``
        exists. Otherwise reuses the user token stored at ``token_path``,
        refreshing it when it is expired, or runs the installed-app OAuth flow
        with ``credentials_path``; a refreshed or newly obtained token is
        written back to ``token_path``.

        Raises:
            ImportError: If one of the required Google / transcript packages
                is not installed.
        """
        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
        try:
            from google.auth.transport.requests import Request
            from google.oauth2 import service_account
            from google.oauth2.credentials import Credentials
            from google_auth_oauthlib.flow import InstalledAppFlow
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError as err:
            # Adjacent literals are concatenated verbatim, so every fragment
            # carries its own trailing space to keep the command copy-pasteable.
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib "
                "youtube-transcript-api` "
                "to use the Google API client"
            ) from err

        creds = None
        if self.service_account_path.exists():
            return service_account.Credentials.from_service_account_file(
                str(self.service_account_path)
            )
        if self.token_path.exists():
            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    str(self.credentials_path), SCOPES
                )
                creds = flow.run_local_server(port=0)
            # A custom token_path may point into a directory that does not
            # exist yet; create it so the token write cannot fail on that.
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.token_path, "w") as token:
                token.write(creds.to_json())

        return creds
|
||||||
|
|
||||||
|
|
||||||
class YoutubeLoader(BaseLoader):
|
class YoutubeLoader(BaseLoader):
|
||||||
"""Loader that loads Youtube transcripts."""
|
"""Loader that loads Youtube transcripts."""
|
||||||
@ -19,8 +106,8 @@ class YoutubeLoader(BaseLoader):
|
|||||||
self.language = language
|
self.language = language
|
||||||
|
|
||||||
@classmethod
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
    """Parse out video id from YouTube url and build a loader for it.

    Args:
        youtube_url: A ``youtube.com/watch?v=<id>`` URL (a bare video id is
            passed through unchanged).
        **kwargs: Forwarded to the class constructor.

    Returns:
        An instance of ``cls`` created for the extracted video id.
    """
    video_id = youtube_url.split("youtube.com/watch?v=")[-1]
    # Drop any further query parameters (e.g. "&t=42s", "&list=...") so that
    # only the id itself is handed to the constructor.
    video_id = video_id.split("&")[0]
    return cls(video_id, **kwargs)

@classmethod
def from_youtube_channel(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
    """Backward-compatible alias of ``from_youtube_url``.

    Despite its name this does not load a channel: it parses a single video
    id out of ``youtube_url``, exactly like ``from_youtube_url``.
    """
    return cls.from_youtube_url(youtube_url, **kwargs)
|
||||||
|
|
||||||
@ -43,7 +130,7 @@ class YoutubeLoader(BaseLoader):
|
|||||||
metadata.update(video_info)
|
metadata.update(video_info)
|
||||||
|
|
||||||
transcript_pieces = YouTubeTranscriptApi.get_transcript(
|
transcript_pieces = YouTubeTranscriptApi.get_transcript(
|
||||||
self.video_id, languages=(self.language,)
|
self.video_id, languages=[self.language]
|
||||||
)
|
)
|
||||||
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
||||||
|
|
||||||
@ -79,3 +166,147 @@ class YoutubeLoader(BaseLoader):
|
|||||||
"author": yt.author,
|
"author": yt.author,
|
||||||
}
|
}
|
||||||
return video_info
|
return video_info
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GoogleApiYoutubeLoader(BaseLoader):
    """Loader that loads all videos from a channel or from a list of video ids.

    To use, you should have the ``googleapiclient`` and
    ``youtube_transcript_api`` python packages installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additionally you have to either provide a channel name or a list of
    video ids.
    "https://developers.google.com/docs/api/quickstart/python"

    Example:
        .. code-block:: python

            from pathlib import Path

            from langchain.document_loaders import GoogleApiClient
            from langchain.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name="CodeAesthetic"
            )
            loader.load()

    """

    google_api_client: GoogleApiClient
    channel_name: Optional[str] = None
    video_ids: Optional[List[str]] = None
    # NOTE(review): only consulted when loading by channel; the video-id path
    # always attaches the API response item as metadata -- confirm intent.
    add_video_info: bool = True
    # Language code requested from the transcript API.
    captions_language: str = "en"

    def __post_init__(self) -> None:
        """Build the YouTube API client from the Google client's credentials."""
        self.youtube_client = self._build_youtube_client(self.google_api_client.creds)

    def _build_youtube_client(self, creds: Any) -> Any:
        """Return a YouTube Data API v3 client authorized with ``creds``.

        Raises:
            ImportError: If the Google API client or the transcript package is
                not installed.
        """
        try:
            from googleapiclient.discovery import build
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError as err:
            # Adjacent literals are concatenated verbatim, so every fragment
            # carries its own trailing space to keep the command copy-pasteable.
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib "
                "youtube-transcript-api` "
                "to use the Google API YouTube loader"
            ) from err

        return build("youtube", "v3", credentials=creds)

    @root_validator
    def validate_channel_or_videoIds_is_set(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that at least one of channel_name or video_ids is set."""
        if not values.get("channel_name") and not values.get("video_ids"):
            raise ValueError("Must specify either channel_name or video_ids")
        return values

    def _get_transcripe_for_video_id(self, video_id: str) -> str:
        """Return the transcript of ``video_id`` joined into a single string.

        The transcript is requested in ``self.captions_language``.
        """
        from youtube_transcript_api import YouTubeTranscriptApi

        transcript_pieces = YouTubeTranscriptApi.get_transcript(
            video_id, languages=[self.captions_language]
        )
        return " ".join([t["text"].strip(" ") for t in transcript_pieces])

    def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document:
        """Build a Document for one video: transcript text plus API metadata.

        Raises:
            ValueError: If the YouTube API returns no item for ``video_id``.
        """
        captions = self._get_transcripe_for_video_id(video_id)
        video_response = (
            self.youtube_client.videos()
            .list(
                part="id,snippet",
                id=video_id,
            )
            .execute()
        )
        items = video_response.get("items") or []
        if not items:
            # An empty result would otherwise surface as a bare IndexError.
            raise ValueError(f"No video found for video id {video_id!r}")
        return Document(
            page_content=captions,
            metadata=items[0],
        )

    def _get_channel_id(self, channel_name: str) -> str:
        """Resolve ``channel_name`` to a channel id via a channel search.

        Raises:
            ValueError: If the search returns no channel.
        """
        request = self.youtube_client.search().list(
            part="id",
            q=channel_name,
            type="channel",
            maxResults=1,  # only the top search hit is used
        )
        response = request.execute()
        items = response.get("items") or []
        if not items:
            raise ValueError(f"No channel found for channel name {channel_name!r}")
        return items[0]["id"]["channelId"]

    def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
        """Return one Document per video found for the channel.

        Pages through the search results for the resolved channel id and
        fetches a transcript for every result that carries a ``videoId``.
        """
        channel_id = self._get_channel_id(channel)
        request = self.youtube_client.search().list(
            part="id,snippet",
            channelId=channel_id,
            maxResults=50,  # adjust this value to retrieve more or fewer videos
        )
        documents = []
        while request is not None:
            response = request.execute()

            for item in response["items"]:
                video_id = item["id"].get("videoId")
                if not video_id:
                    # Search results without a videoId are skipped.
                    continue
                meta_data = {"videoId": video_id}
                if self.add_video_info:
                    # Thumbnails are dropped from the metadata; tolerate
                    # snippets that do not carry the key at all.
                    item["snippet"].pop("thumbnails", None)
                    meta_data.update(item["snippet"])
                documents.append(
                    Document(
                        page_content=self._get_transcripe_for_video_id(video_id),
                        metadata=meta_data,
                    )
                )
            # list_next yields None once there are no further result pages.
            request = self.youtube_client.search().list_next(request, response)

        return documents

    def load(self) -> List[Document]:
        """Load documents.

        ``channel_name`` takes precedence over ``video_ids`` when both are set.

        Raises:
            ValueError: If neither ``channel_name`` nor ``video_ids`` is set.
        """
        document_list = []
        if self.channel_name:
            document_list.extend(self._get_document_for_channel(self.channel_name))
        elif self.video_ids:
            document_list.extend(
                [
                    self._get_document_for_video_id(video_id)
                    for video_id in self.video_ids
                ]
            )
        else:
            raise ValueError("Must specify either channel_name or video_ids")
        return document_list
|
||||||
|
0
tests/unit_tests/document_loader/__init__.py
Normal file
0
tests/unit_tests/document_loader/__init__.py
Normal file
0
tests/unit_tests/document_loader/test_youtube.py
Normal file
0
tests/unit_tests/document_loader/test_youtube.py
Normal file
Loading…
Reference in New Issue
Block a user