mirror of https://github.com/hwchase17/langchain, synced 2024-11-10 01:10:59 +00:00
b7d180a70d
Description: Video imagery to text (Closed Captioning). This pull request introduces the VideoCaptioningChain, a tool for automated video captioning. It processes audio and video to generate subtitles and closed captions, merging them into a single SRT output.

Issue: https://github.com/langchain-ai/langchain/issues/11770

Dependencies: opencv-python, ffmpeg-python, assemblyai, transformers, pillow, torch, openai

Tag maintainer: @baskaryan @hwchase17

Hello! We are a group of students from the University of Toronto (@LunarECL, @TomSadan, @nicoledroi1, @A2113S) who want to make a contribution to the LangChain community! We have run make format, make lint, and make test locally before submitting the PR. To our knowledge, our changes do not introduce any new errors. Thank you for taking the time to review our PR!

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
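For anyone who wants to try the chain end to end, a minimal usage sketch follows. The import path, the constructor, and the run parameters (video_file_path, assemblyai_key) are assumptions based on this PR's module layout rather than a confirmed API, video.mp4 and both API keys are placeholders, and the langchain_openai package is assumed for the chat model:

from langchain_experimental.video_captioning import VideoCaptioningChain
from langchain_openai import ChatOpenAI

# Placeholder keys; AssemblyAI transcribes the audio track, while the LLM
# formats and merges the speech subtitles with the visual closed captions.
chain = VideoCaptioningChain(llm=ChatOpenAI(model="gpt-4", max_tokens=4000))
srt_content = chain.run(
    video_file_path="video.mp4",  # local path or URL to the video
    assemblyai_key="YOUR_ASSEMBLYAI_API_KEY",
)
print(srt_content)  # a single SRT document combining both caption tracks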
112 lines
3.7 KiB
Python
from typing import List, Optional

import numpy as np

from langchain_community.document_loaders import ImageCaptionLoader
from langchain_core.callbacks import CallbackManagerForChainRun

from langchain_experimental.video_captioning.models import VideoModel


class ImageProcessor:
    _SAMPLES_PER_SECOND: int = 4

    def __init__(self, frame_skip: int = -1, threshold: int = 3000000) -> None:
        self.threshold = threshold
        self.frame_skip = frame_skip

    def process(
        self,
        video_file_path: str,
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> list:
        return self._extract_frames(video_file_path)

    def _extract_frames(self, video_file_path: str) -> list:
        try:
            import cv2
            from cv2.typing import MatLike
        except ImportError as e:
            raise ImportError(
                "Unable to import cv2, please install it with "
                "`pip install -U opencv-python`"
            ) from e

        video_models: List[VideoModel] = []

        def _add_model(start_time: int, end_time: int) -> None:
            # Seek to the midpoint of the segment and read that frame
            middle_frame_time = (start_time + end_time) / 2
            cap.set(cv2.CAP_PROP_POS_MSEC, middle_frame_time)
            _, middle_frame = cap.read()

            # Convert the frame to bytes
            _, encoded_frame = cv2.imencode(".jpg", middle_frame)
            notable_frame_bytes = encoded_frame.tobytes()

            # Restore the capture position to the end of the segment
            cap.set(cv2.CAP_PROP_POS_MSEC, end_time)

            # Create an instance of the ImageCaptionLoader
            loader = ImageCaptionLoader(images=notable_frame_bytes)

            # Load captions for the images
            list_docs = loader.load()

            video_model = VideoModel(
                start_time,
                end_time,
                list_docs[-1].page_content.replace("[SEP]", "").strip(),
            )
            video_models.append(video_model)

        def _is_notable_frame(frame1: MatLike, frame2: MatLike, threshold: int) -> bool:
            # Convert frames to grayscale
            gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
            gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

            # Compute absolute difference between frames
            frame_diff = cv2.absdiff(gray1, gray2)

            # Apply threshold to identify notable differences
            _, thresholded_diff = cv2.threshold(frame_diff, 30, 255, cv2.THRESH_BINARY)

            # Sum the thresholded mask; each differing pixel contributes 255,
            # so the default threshold of 3,000,000 corresponds to roughly
            # 11,765 changed pixels
            num_diff_pixels = np.sum(thresholded_diff)

            return num_diff_pixels > threshold

        # Open the video file
        cap = cv2.VideoCapture(video_file_path)

        if self.frame_skip == -1:
            self.frame_skip = (
                int(cap.get(cv2.CAP_PROP_FPS)) // self._SAMPLES_PER_SECOND
            )

        # Read the first frame
        ret, prev_frame = cap.read()

        # Loop through the video frames
        start_time = 0
        end_time = 0

        while True:
            # Read the next frame
            ret, frame = cap.read()
            if not ret:
                break  # Break the loop if there are no more frames

            # Check if the current frame is notable
            if _is_notable_frame(prev_frame, frame, self.threshold):
                end_time = int(cap.get(cv2.CAP_PROP_POS_MSEC))
                _add_model(start_time, end_time)
                start_time = end_time

            # Update the previous frame
            prev_frame = frame.copy()

            # Increment the frame position by the skip value
            cap.set(
                cv2.CAP_PROP_POS_FRAMES,
                cap.get(cv2.CAP_PROP_POS_FRAMES) + self.frame_skip,
            )

        # Release the video capture object
        cap.release()

        return video_models
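This service can also be exercised on its own. A rough sketch, where sample.mp4 is a placeholder path and the VideoModel accessor names are assumptions about the accompanying models module:

processor = ImageProcessor()

# One VideoModel per detected scene change: segment start/end times in
# milliseconds plus a generated caption for a frame of that segment.
for model in processor.process("sample.mp4"):
    print(model.start_time, model.end_time, model.image_description)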