mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
fix: get bilibili subtitles (#8165)
- Description: Fix the `BiliBiliLoader` loader. - Issue: The API response has changed. ![image](https://github.com/langchain-ai/langchain/assets/2113954/91216793-82f8-4c82-a018-d49f36f5f6aa) The previously used API no longer returns the `subtitle_url` property. ![image](https://github.com/langchain-ai/langchain/assets/2113954/a8ec2a7a-f40d-4c2a-b7d0-0ccdf2b327cc) We should use another API to get the `subtitle_url` property. The `subtitle_url` returned by this API does not include the URL scheme (`https:`), so it needs to be prepended. - Dependencies: None - Tag maintainer: @rlancemartin
This commit is contained in:
parent
21771a6f1c
commit
d00a247da7
@ -54,12 +54,14 @@ class BiliBiliLoader(BaseLoader):
|
|||||||
|
|
||||||
video_info = sync(v.get_info())
|
video_info = sync(v.get_info())
|
||||||
video_info.update({"url": url})
|
video_info.update({"url": url})
|
||||||
|
sub = sync(v.get_subtitle(video_info["cid"]))
|
||||||
|
|
||||||
# Get subtitle url
|
# Get subtitle url
|
||||||
subtitle = video_info.pop("subtitle")
|
sub_list = sub["subtitles"]
|
||||||
sub_list = subtitle["list"]
|
|
||||||
if sub_list:
|
if sub_list:
|
||||||
sub_url = sub_list[0]["subtitle_url"]
|
sub_url = sub_list[0]["subtitle_url"]
|
||||||
|
if not sub_url.startswith("http"):
|
||||||
|
sub_url = "https:" + sub_url
|
||||||
result = requests.get(sub_url)
|
result = requests.get(sub_url)
|
||||||
raw_sub_titles = json.loads(result.content)["body"]
|
raw_sub_titles = json.loads(result.content)["body"]
|
||||||
raw_transcript = " ".join([c["content"] for c in raw_sub_titles])
|
raw_transcript = " ".join([c["content"] for c in raw_sub_titles])
|
||||||
|
Loading…
Reference in New Issue
Block a user