diff --git a/docs/modules/indexes/document_loaders/examples/bilibili.ipynb b/docs/modules/indexes/document_loaders/examples/bilibili.ipynb new file mode 100644 index 00000000..294b9441 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/bilibili.ipynb @@ -0,0 +1,87 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66a7777e", + "metadata": {}, + "source": [ + "# Bilibili\n", + "\n", + "This loader utilizes the `bilibili-api` to fetch the text transcript from Bilibili, one of the most beloved long-form video sites in China.\n", + "\n", + "With this BiliBiliLoader, users can easily obtain the transcript of their desired video content on the platform." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9ec8a3b3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.bilibili import BiliBiliLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "43128d8d", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install bilibili-api" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "35d6809a", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "loader = BiliBiliLoader(\n", + " [\"https://www.bilibili.com/video/BV1xt411o7Xu/\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "loader.load()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git 
# --- langchain/document_loaders/bilibili.py ---
import json
import re
import warnings
from typing import List, Tuple

import requests

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class BiliBiliLoader(BaseLoader):
    """Loader that loads bilibili transcripts.

    Every URL passed to the constructor produces one ``Document``. Its
    ``page_content`` is the video's first subtitle track joined into one string
    and prefixed with the video title and description, or ``""`` when the video
    has no subtitles. Its ``metadata`` is the video info dictionary without the
    ``"subtitle"`` entry and with the source URL stored under ``"url"``.
    """

    # Upper bound (seconds) on the subtitle download so that an unresponsive
    # server cannot block ``load()`` forever.
    _SUBTITLE_REQUEST_TIMEOUT = 30

    def __init__(self, video_urls: List[str]):
        """Initialize with bilibili url.

        Args:
            video_urls: URLs of the videos to load; each must contain either a
                ``BV...`` id or an ``av<digits>`` id.
        """
        self.video_urls = video_urls

    def load(self) -> List[Document]:
        """Load from bilibili url.

        Returns:
            One ``Document`` per URL in ``self.video_urls``, in the same order.
        """
        results = []
        for url in self.video_urls:
            transcript, video_info = self._get_bilibili_subs_and_info(url)
            results.append(Document(page_content=transcript, metadata=video_info))
        return results

    def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
        """Fetch the transcript text and the video info for a single URL.

        Args:
            url: Video URL containing a ``BV...`` id or an ``av<digits>`` id.

        Returns:
            A ``(transcript, video_info)`` tuple. ``transcript`` is ``""`` when
            the video lists no subtitles (a warning is emitted in that case).

        Raises:
            ValueError: If ``bilibili_api`` cannot be imported, or if ``url``
                contains neither kind of video id.
            requests.HTTPError: If the subtitle download returns an error status.
        """
        # Imported lazily so the dependency is only required when the loader
        # is actually used.
        try:
            from bilibili_api import sync, video
        except ImportError as err:
            raise ValueError(
                "bilibili_api package not found, please install it with "
                "`pip install bilibili-api`"
            ) from err

        # A "BV" id takes precedence; the numeric "av" id is the fallback.
        bvid = re.search(r"BV\w+", url)
        if bvid is not None:
            v = video.Video(bvid=bvid.group())
        else:
            aid = re.search(r"av([0-9]+)", url)
            if aid is None:
                raise ValueError(f"{url} is not bilibili url.")
            v = video.Video(aid=int(aid.group(1)))

        # `sync` turns the `get_info()` call into a plain value, which is used
        # as a dict from here on.
        video_info = sync(v.get_info())
        video_info["url"] = url

        # The subtitle listing is removed from the metadata; a missing or empty
        # listing is treated the same as "no subtitles".
        subtitle = video_info.pop("subtitle", None) or {}
        sub_list = subtitle.get("list") or []
        if not sub_list:
            warnings.warn(
                f"""
                No subtitles found for video: {url}.
                Return Empty transcript.
                """
            )
            return "", video_info

        # Only the first listed subtitle track is used.
        sub_url = sub_list[0]["subtitle_url"]
        # NOTE(review): the API is believed to return protocol-relative URLs
        # ("//host/path") in some cases; requests needs an explicit scheme.
        if sub_url.startswith("//"):
            sub_url = "https:" + sub_url

        result = requests.get(sub_url, timeout=self._SUBTITLE_REQUEST_TIMEOUT)
        result.raise_for_status()
        raw_sub_titles = json.loads(result.content)["body"]
        raw_transcript = " ".join(c["content"] for c in raw_sub_titles)

        raw_transcript_with_meta_info = f"""
            Video Title: {video_info['title']},
            description: {video_info['desc']}\n
            Transcript: {raw_transcript}
            """
        return raw_transcript_with_meta_info, video_info


# --- tests/integration_tests/document_loaders/test_bilibili.py ---
from langchain.document_loaders.bilibili import BiliBiliLoader


def test_bilibili_loader() -> None:
    """Test Bilibili Loader.

    Integration test: loads one ``BV`` URL and one ``av`` URL and checks that
    two documents come back, the first with a non-empty transcript and the
    second with an empty one.
    """
    loader = BiliBiliLoader(
        [
            "https://www.bilibili.com/video/BV1xt411o7Xu/",
            "https://www.bilibili.com/video/av330407025/",
        ]
    )
    docs = loader.load()

    assert len(docs) == 2

    assert len(docs[0].page_content) > 0
    # NOTE(review): this checks docs[1], exactly like the assertion at the end;
    # it was probably meant to check docs[0] - confirm the expected owner id.
    assert docs[1].metadata["owner"]["mid"] == 398095160

    assert docs[1].page_content == ""
    assert docs[1].metadata["owner"]["mid"] == 398095160