@ -8,20 +8,55 @@ from langchain_core.documents import Document
from langchain_community . document_loaders . base import BaseLoader
# Pre-compiled patterns for pulling a video ID out of a BiliBili URL.
# A BV ID is "BV" followed by word characters (e.g. "BV1xt411o7Xu");
# a legacy AV ID is "av" followed by digits (e.g. "av170001").
BV_PATTERN = re.compile(r"BV\w+")
AV_PATTERN = re.compile(r"av[0-9]+")
class BiliBiliLoader ( BaseLoader ) :
""" Load `BiliBili` video transcripts. """
"""
Loader for fetching transcripts from BiliBili videos .
"""
def __init__(
    self,
    video_urls: List[str],
    sessdata: str = "",
    bili_jct: str = "",
    buvid3: str = "",
):
    """Initialize the loader with BiliBili video URLs and optional cookies.

    If no authentication cookies are provided, the loader can't get
    transcripts and will only fetch video info. A credential is built only
    when all three cookie values are non-empty; otherwise
    ``self.credential`` stays ``None``.

    Args:
        video_urls: List of BiliBili video URLs.
        sessdata: SESSDATA cookie value for authentication.
        bili_jct: BILI_JCT cookie value for authentication.
        buvid3: BUVID3 cookie value for authentication.

    Raises:
        ImportError: If the ``bilibili-api-python`` package is not installed.
    """
    self.video_urls = video_urls
    self.credential = None

    # Imported lazily so the dependency is only required when the loader
    # is actually instantiated.
    try:
        from bilibili_api import video
    except ImportError as e:
        raise ImportError(
            "bilibili-api-python package not found, please install it with "
            "`pip install bilibili-api-python`"
        ) from e

    # All three cookies are needed; a partial set leaves credential as None.
    if sessdata and bili_jct and buvid3:
        self.credential = video.Credential(
            sessdata=sessdata, bili_jct=bili_jct, buvid3=buvid3
        )
def load ( self ) - > List [ Document ] :
""" Load Documents from bilibili url. """
"""
Load and return a list of documents containing video transcripts .
Returns :
List [ Document ] : List of Document objects transcripts and metadata .
"""
results = [ ]
for url in self . video_urls :
transcript , video_info = self . _get_bilibili_subs_and_info ( url )
@ -31,6 +66,10 @@ class BiliBiliLoader(BaseLoader):
return results
def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
    """Retrieve video information and transcript for a given BiliBili URL.

    Args:
        url: A BiliBili video URL containing either a BV ID or an ``av`` ID.

    Returns:
        A ``(transcript, video_info)`` tuple. ``video_info`` is the result
        of ``Video.get_info()`` with the source ``url`` added under the
        ``"url"`` key. ``transcript`` is an empty string when no credential
        is configured, when the video has no usable subtitle URL, or when
        the subtitle download does not return HTTP 200.

    Raises:
        ImportError: If the ``bilibili-api-python`` package is not installed.
        ValueError: If neither a BV nor an AV video ID is found in ``url``.
    """
    try:
        from bilibili_api import sync, video
    except ImportError as e:
        raise ImportError(
            "bilibili-api-python package not found, please install it with "
            "`pip install bilibili-api-python`"
        ) from e

    # Prefer the BV ID; fall back to the legacy numeric AV ID.
    bvid = BV_PATTERN.search(url)
    if bvid:
        v = video.Video(bvid=bvid.group(), credential=self.credential)
    else:
        aid = AV_PATTERN.search(url)
        if not aid:
            raise ValueError(f"Unable to find a valid video ID in URL: {url}")
        # Strip the leading "av" to get the numeric ID.
        v = video.Video(aid=int(aid.group()[2:]), credential=self.credential)

    video_info = sync(v.get_info())
    video_info.update({"url": url})

    # Without a credential, return the video info with an empty transcript.
    if not self.credential:
        return "", video_info

    # Fetching and processing subtitles
    sub = sync(v.get_subtitle(video_info["cid"]))
    sub_list = sub.get("subtitles", [])
    sub_url = sub_list[0].get("subtitle_url", "") if sub_list else ""
    if not sub_url:
        # Covers both an empty subtitle list and an entry without a URL,
        # which would otherwise turn into a request for the bare "https:".
        warnings.warn(
            f"No subtitles found for video: {url}. Returning empty transcript."
        )
        return "", video_info

    # Subtitle URLs may be protocol-relative ("//host/path").
    if not sub_url.startswith("http"):
        sub_url = "https:" + sub_url

    # A timeout keeps a stalled subtitle server from blocking the loader
    # indefinitely.
    response = requests.get(sub_url, timeout=30)
    if response.status_code != 200:
        warnings.warn(
            f"Failed to fetch subtitles for {url}. "
            f"HTTP Status Code: {response.status_code}"
        )
        return "", video_info

    raw_sub_titles = json.loads(response.content).get("body", [])
    raw_transcript = " ".join(c["content"] for c in raw_sub_titles)
    raw_transcript_with_meta_info = (
        f"Video Title: {video_info['title']},"
        f"description: {video_info['desc']}\n\n"
        f"Transcript: {raw_transcript}"
    )
    return raw_transcript_with_meta_info, video_info