From d00a247da7cc49af5f646be09796401d5026884c Mon Sep 17 00:00:00 2001 From: liguoqinjim Date: Sat, 5 Aug 2023 05:30:41 +0800 Subject: [PATCH] fix:get bilibili subtitles (#8165) - Description: fix the Loader 'BiliBiliLoader' - Issue: the API response was changed ![image](https://github.com/langchain-ai/langchain/assets/2113954/91216793-82f8-4c82-a018-d49f36f5f6aa) The previously used API no longer returns the "subtitle_url" property. ![image](https://github.com/langchain-ai/langchain/assets/2113954/a8ec2a7a-f40d-4c2a-b7d0-0ccdf2b327cc) We should use another API to get the `subtitle_url` property. The `subtitle_url` returned by this API does not include the URL scheme, so `https:` needs to be prepended. - Dependencies: Nope - Tag maintainer: @rlancemartin --- libs/langchain/langchain/document_loaders/bilibili.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/bilibili.py b/libs/langchain/langchain/document_loaders/bilibili.py index d3269de0c6..0c1c815180 100644 --- a/libs/langchain/langchain/document_loaders/bilibili.py +++ b/libs/langchain/langchain/document_loaders/bilibili.py @@ -54,12 +54,14 @@ class BiliBiliLoader(BaseLoader): video_info = sync(v.get_info()) video_info.update({"url": url}) + sub = sync(v.get_subtitle(video_info["cid"])) # Get subtitle url - subtitle = video_info.pop("subtitle") - sub_list = subtitle["list"] + sub_list = sub["subtitles"] if sub_list: sub_url = sub_list[0]["subtitle_url"] + if not sub_url.startswith("http"): + sub_url = "https:" + sub_url result = requests.get(sub_url) raw_sub_titles = json.loads(result.content)["body"] raw_transcript = " ".join([c["content"] for c in raw_sub_titles])