langchain/libs/experimental/langchain_experimental/video_captioning/services/image_service.py


from typing import List, Optional

import numpy as np
from langchain_community.document_loaders import ImageCaptionLoader
from langchain_core.callbacks import CallbackManagerForChainRun

from langchain_experimental.video_captioning.models import VideoModel


class ImageProcessor:
    _SAMPLES_PER_SECOND: int = 4

    def __init__(self, frame_skip: int = -1, threshold: int = 3000000) -> None:
        """
        frame_skip: frames to advance between samples; -1 derives the skip
            from the video's FPS so that roughly _SAMPLES_PER_SECOND frames
            are examined per second.
        threshold: minimum summed binary frame difference for a frame to be
            treated as notable (see _is_notable_frame).
        """
        self.threshold = threshold
        self.frame_skip = frame_skip

    def process(
        self,
        video_file_path: str,
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> list:
        # run_manager is accepted for callback compatibility but currently unused
        return self._extract_frames(video_file_path)

    def _extract_frames(self, video_file_path: str) -> list:
        try:
            import cv2
            from cv2.typing import MatLike
        except ImportError as e:
            raise ImportError(
                "Unable to import cv2, please install it with "
                "`pip install -U opencv-python`"
            ) from e
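
        # NOTE: `cv2.typing` ships only with newer opencv-python releases;
        # on older installs the MatLike import above raises ImportError and
        # is caught by the same handler.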

        video_models: List[VideoModel] = []

        def _add_model(start_time: int, end_time: int) -> None:
            # Seek to the midpoint of the segment and grab that frame
            middle_frame_time = (start_time + end_time) / 2
            cap.set(cv2.CAP_PROP_POS_MSEC, middle_frame_time)
            ret, middle_frame = cap.read()
            if not ret:
                # Fall back to the current loop frame if the seek-read fails
                middle_frame = frame

            # Convert the frame to bytes
            _, encoded_frame = cv2.imencode(".jpg", middle_frame)
            notable_frame_bytes = encoded_frame.tobytes()

            # Restore the capture position to the end of the segment
            cap.set(cv2.CAP_PROP_POS_MSEC, end_time)

            # Create an instance of the ImageCaptionLoader
            loader = ImageCaptionLoader(images=notable_frame_bytes)

            # Load captions for the images
            list_docs = loader.load()

            # Keep the caption text, stripping the model's [SEP] marker
            video_model = VideoModel(
                start_time,
                end_time,
                list_docs[-1].page_content.replace("[SEP]", "").strip(),
            )
            video_models.append(video_model)

        def _is_notable_frame(
            frame1: MatLike, frame2: MatLike, threshold: int
        ) -> bool:
            # Convert frames to grayscale
            gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
            gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

            # Compute absolute difference between frames
            frame_diff = cv2.absdiff(gray1, gray2)

            # Apply threshold to identify notable differences
            _, thresholded_diff = cv2.threshold(frame_diff, 30, 255, cv2.THRESH_BINARY)

            # Sum the thresholded pixel values (each differing pixel contributes 255)
            num_diff_pixels = np.sum(thresholded_diff)

            return num_diff_pixels > threshold
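
        # With 255 per differing pixel, the default threshold of 3,000,000
        # corresponds to roughly 11,700 changed pixels between two samples.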

        # Open the video file
        cap = cv2.VideoCapture(video_file_path)

        if self.frame_skip == -1:
            self.frame_skip = (
                int(cap.get(cv2.CAP_PROP_FPS)) // self._SAMPLES_PER_SECOND
            )

        # Read the first frame
        ret, prev_frame = cap.read()

        # Loop through the video frames
        start_time = 0
        end_time = 0

        while True:
            # Read the next frame
            ret, frame = cap.read()

            if not ret:
                break  # Break the loop if there are no more frames

            # Check if the current frame is notable
            if _is_notable_frame(prev_frame, frame, self.threshold):
                end_time = int(cap.get(cv2.CAP_PROP_POS_MSEC))
                _add_model(start_time, end_time)
                start_time = end_time

            # Update the previous frame
            prev_frame = frame.copy()

            # Increment the frame position by the skip value
            cap.set(
                cv2.CAP_PROP_POS_FRAMES,
                cap.get(cv2.CAP_PROP_POS_FRAMES) + self.frame_skip,
            )

        # Release the video capture object
        cap.release()

        return video_models
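

# A minimal usage sketch, not part of the original module. "example.mp4" is
# a hypothetical path, and ImageCaptionLoader needs the `transformers`
# package installed (its default caption model downloads on first use):
#
#     processor = ImageProcessor()
#     for video_model in processor.process("example.mp4"):
#         print(video_model)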