Mirror of https://github.com/hwchase17/langchain, synced 2024-11-08 07:10:35 +00:00
Commit b7d180a70d
Description: Video imagery to text (Closed Captioning)

This pull request introduces the VideoCaptioningChain, a tool for automated video captioning. It processes audio and video to generate subtitles and closed captions, merging them into a single SRT output.

Issue: https://github.com/langchain-ai/langchain/issues/11770
Dependencies: opencv-python, ffmpeg-python, assemblyai, transformers, pillow, torch, openai
Tag maintainer: @baskaryan @hwchase17

Hello! We are a group of students from the University of Toronto (@LunarECL, @TomSadan, @nicoledroi1, @A2113S) who want to make a contribution to the LangChain community! We have run make format, make lint, and make test locally before submitting the PR. To our knowledge, our changes do not introduce any new errors. Thank you for taking the time to review our PR!

Co-authored-by: Bagatur <baskaryan@gmail.com>
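A minimal usage sketch of the chain described above. The import paths, the choice of ChatOpenAI as the LLM, and the placeholder API key and file paths are illustrative assumptions, not taken from this file:

from langchain_openai import ChatOpenAI  # assumed LLM provider, any BaseLanguageModel should work
from langchain_experimental.video_captioning import VideoCaptioningChain  # assumed import path

chain = VideoCaptioningChain(
    llm=ChatOpenAI(),
    assemblyai_key="<ASSEMBLYAI_API_KEY>",  # placeholder AssemblyAI key for speech-to-text
)

# The chain takes a single input key, "video_file_path", and returns a single
# output key, "srt", holding the merged subtitle/closed-caption file contents.
result = chain.invoke({"video_file_path": "path/to/video.mp4"})
with open("output.srt", "w") as f:
    f.write(result["srt"])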
149 lines
4.9 KiB
Python
from typing import Any, Dict, List, Optional

from langchain.chains.base import Chain
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import Extra

from langchain_experimental.video_captioning.services.audio_service import (
    AudioProcessor,
)
from langchain_experimental.video_captioning.services.caption_service import (
    CaptionProcessor,
)
from langchain_experimental.video_captioning.services.combine_service import (
    CombineProcessor,
)
from langchain_experimental.video_captioning.services.image_service import (
    ImageProcessor,
)
from langchain_experimental.video_captioning.services.srt_service import SRTProcessor

class VideoCaptioningChain(Chain):
    """Video Captioning Chain.

    Processes a video's audio and image frames to generate subtitles and
    closed captions, then merges them into a single SRT output.
    """

    llm: BaseLanguageModel
    assemblyai_key: str
    prompt: Optional[PromptTemplate] = None
    verbose: bool = True
    use_logging: Optional[bool] = True
    frame_skip: int = -1
    image_delta_threshold: int = 3000000
    closed_caption_char_limit: int = 20
    closed_caption_similarity_threshold: int = 80
    use_unclustered_video_models: bool = False

    class Config:
        extra = Extra.allow
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        return ["video_file_path"]

    @property
    def output_keys(self) -> List[str]:
        return ["srt"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        if "video_file_path" not in inputs:
            raise ValueError(
                "Missing 'video_file_path' in inputs for video captioning."
            )
        video_file_path = inputs["video_file_path"]
        nl = "\n"

        if self.use_logging and run_manager:
            run_manager.on_text("Loading processors..." + nl)

        audio_processor = AudioProcessor(api_key=self.assemblyai_key)
        image_processor = ImageProcessor(
            frame_skip=self.frame_skip, threshold=self.image_delta_threshold
        )
        caption_processor = CaptionProcessor(
            llm=self.llm,
            verbose=self.verbose,
            similarity_threshold=self.closed_caption_similarity_threshold,
            use_unclustered_models=self.use_unclustered_video_models,
        )
        combine_processor = CombineProcessor(
            llm=self.llm,
            verbose=self.verbose,
            char_limit=self.closed_caption_char_limit,
        )
        srt_processor = SRTProcessor()

        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished loading processors."
                + nl
                + "Generating subtitles from audio..."
                + nl
            )

        # Get models for speech to text subtitles
        audio_models = audio_processor.process(video_file_path, run_manager)
        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished generating subtitles:"
                + nl
                + f"{nl.join(str(obj) for obj in audio_models)}"
                + nl
                + "Generating closed captions from video..."
                + nl
            )

        # Get models for image frame description
        image_models = image_processor.process(video_file_path, run_manager)
        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished generating closed captions:"
                + nl
                + f"{nl.join(str(obj) for obj in image_models)}"
                + nl
                + "Refining closed captions..."
                + nl
            )

        # Get models for video event closed-captions
        video_models = caption_processor.process(image_models, run_manager)
        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished refining closed captions:"
                + nl
                + f"{nl.join(str(obj) for obj in video_models)}"
                + nl
                + "Combining subtitles with closed captions..."
                + nl
            )

        # Combine the subtitle models with the closed-caption models
        caption_models = combine_processor.process(
            video_models, audio_models, run_manager
        )
        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished combining subtitles with closed captions:"
                + nl
                + f"{nl.join(str(obj) for obj in caption_models)}"
                + nl
                + "Generating SRT file..."
                + nl
            )

        # Convert the combined models to SRT format
        srt_content = srt_processor.process(caption_models)
        if self.use_logging and run_manager:
            run_manager.on_text("Finished generating srt file." + nl)

        return {"srt": srt_content}

    @property
    def _chain_type(self) -> str:
        return "video_captioning_chain"