langchain/libs/community/langchain_community/document_loaders/bilibili.py

84 lines
2.7 KiB
Python
Raw Normal View History

community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
2023-12-11 21:53:30 +00:00
import json
import re
import warnings
from typing import List, Tuple
import requests
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
class BiliBiliLoader(BaseLoader):
"""Load `BiliBili` video transcripts."""
def __init__(self, video_urls: List[str]):
"""Initialize with bilibili url.
Args:
video_urls: List of bilibili urls.
"""
self.video_urls = video_urls
def load(self) -> List[Document]:
"""Load Documents from bilibili url."""
results = []
for url in self.video_urls:
transcript, video_info = self._get_bilibili_subs_and_info(url)
doc = Document(page_content=transcript, metadata=video_info)
results.append(doc)
return results
def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
try:
from bilibili_api import sync, video
except ImportError:
raise ImportError(
"requests package not found, please install it with "
"`pip install bilibili-api-python`"
)
bvid = re.search(r"BV\w+", url)
if bvid is not None:
v = video.Video(bvid=bvid.group())
else:
aid = re.search(r"av[0-9]+", url)
if aid is not None:
try:
v = video.Video(aid=int(aid.group()[2:]))
except AttributeError:
raise ValueError(f"{url} is not bilibili url.")
else:
raise ValueError(f"{url} is not bilibili url.")
video_info = sync(v.get_info())
video_info.update({"url": url})
sub = sync(v.get_subtitle(video_info["cid"]))
# Get subtitle url
sub_list = sub["subtitles"]
if sub_list:
sub_url = sub_list[0]["subtitle_url"]
if not sub_url.startswith("http"):
sub_url = "https:" + sub_url
result = requests.get(sub_url)
raw_sub_titles = json.loads(result.content)["body"]
raw_transcript = " ".join([c["content"] for c in raw_sub_titles])
raw_transcript_with_meta_info = (
f"Video Title: {video_info['title']},"
f"description: {video_info['desc']}\n\n"
f"Transcript: {raw_transcript}"
)
return raw_transcript_with_meta_info, video_info
else:
raw_transcript = ""
warnings.warn(
f"""
No subtitles found for video: {url}.
Return Empty transcript.
"""
)
return raw_transcript, video_info