From 33ea606f455f195d74f09ac654e03da8850ecb9b Mon Sep 17 00:00:00 2001 From: Ricardo Reis Date: Sat, 3 Jun 2023 20:56:17 -0300 Subject: [PATCH] Update youtube.py - Fix metadata validation error in YoutubeLoader (#5479) This commit addresses a ValueError that occurs when the YoutubeLoader class adds datetime metadata taken from a YouTube video's publish date. The error happened because the ChromaDB metadata validation only accepts str, int, or float values. In the `_get_video_info` method of the `YoutubeLoader` class, the publish date retrieved from the YouTube video was a datetime object. This commit fixes the issue by formatting the datetime as a string (`%Y-%m-%d %H:%M:%S`) before adding it to the metadata dictionary. Additionally, this commit adds fallback defaults in the `_get_video_info` method so that every metadata field has a valid value. If a metadata field is None or otherwise empty (falsy), a default is assigned: "Unknown" for text fields and 0 for numeric fields. This prevents potential errors during metadata validation when metadata fields are None. The file modified in this commit is youtube.py. # Your PR Title (What it does) Fixes # (issue) ## Before submitting ## Who can review? Community members can review the PR once tests pass. 
Tag maintainers/contributors who might be interested: --------- Co-authored-by: Harrison Chase --- langchain/document_loaders/youtube.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/langchain/document_loaders/youtube.py b/langchain/document_loaders/youtube.py index c506454b..9960411c 100644 --- a/langchain/document_loaders/youtube.py +++ b/langchain/document_loaders/youtube.py @@ -231,13 +231,15 @@ class YoutubeLoader(BaseLoader): ) yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}") video_info = { - "title": yt.title, - "description": yt.description, - "view_count": yt.views, - "thumbnail_url": yt.thumbnail_url, - "publish_date": yt.publish_date, - "length": yt.length, - "author": yt.author, + "title": yt.title or "Unknown", + "description": yt.description or "Unknown", + "view_count": yt.views or 0, + "thumbnail_url": yt.thumbnail_url or "Unknown", + "publish_date": yt.publish_date.strftime("%Y-%m-%d %H:%M:%S") + if yt.publish_date + else "Unknown", + "length": yt.length or 0, + "author": yt.author or "Unknown", } return video_info