experimental[minor]: Create Closed Captioning Chain for .mp4 videos (#14059)
Description: Video imagery to text (Closed Captioning)
This pull request introduces the VideoCaptioningChain, a tool for
automated video captioning. It processes audio and video to generate
subtitles and closed captions, then merges them into a single SRT output
(a rough usage sketch follows this message).
Issue: https://github.com/langchain-ai/langchain/issues/11770
Dependencies: opencv-python, ffmpeg-python, assemblyai, transformers,
pillow, torch, openai
Tag maintainer:
@baskaryan
@hwchase17
Hello!
We are a group of students from the University of Toronto
(@LunarECL, @TomSadan, @nicoledroi1, @A2113S) who want to make a
contribution to the LangChain community! We have run make format, make
lint, and make test locally before submitting the PR. To our knowledge,
our changes do not introduce any new errors.
Thank you for taking the time to review our PR!
---------
Co-authored-by: Bagatur <baskaryan@gmail.com>
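For reviewers, a rough usage sketch of the new chain. The import path mirrors the langchain_experimental.video_captioning package used in the code below; the constructor and run arguments (llm, assemblyai_key, video_file_path) are assumptions about the interface, not confirmed API.

from langchain_experimental.video_captioning import VideoCaptioningChain

# llm is any LangChain chat model instance; argument names are assumed.
chain = VideoCaptioningChain(llm=llm, assemblyai_key="<assemblyai-api-key>")
srt_output = chain.run(video_file_path="example.mp4")  # expected: combined SRT text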
import subprocess
from pathlib import Path
from typing import List, Optional

from langchain.schema import Document
from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader
from langchain_community.document_loaders.assemblyai import TranscriptFormat
from langchain_core.callbacks.manager import CallbackManagerForChainRun
from langchain_experimental.video_captioning.models import AudioModel, BaseModel
class AudioProcessor:
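    """Extract the audio track from a video with ffmpeg and transcribe it with
    AssemblyAI, returning the transcript as a list of SRT-derived AudioModel
    entries."""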
    def __init__(
        self,
        api_key: str,
        output_audio_path: str = "output_audio.mp3",
    ):
        self.output_audio_path = Path(output_audio_path)
        self.api_key = api_key

    def process(
        self,
        video_file_path: str,
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> list:
        try:
            self._extract_audio(video_file_path)
            return self._transcribe_audio()
        finally:
            # Cleanup: Delete the MP3 file after processing
            try:
                self.output_audio_path.unlink()
            except FileNotFoundError:
                pass  # File not found, nothing to delete

    def _extract_audio(self, video_file_path: str) -> None:
        # Ensure the directory exists where the output file will be saved
        self.output_audio_path.parent.mkdir(parents=True, exist_ok=True)

        command = [
            "ffmpeg",
            "-i",
            video_file_path,
            "-vn",
            "-acodec",
            "mp3",
            self.output_audio_path.as_posix(),
            "-y",  # The '-y' flag overwrites the output file if it exists
        ]
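        # The command above is equivalent to the shell invocation:
        #   ffmpeg -i <video_file_path> -vn -acodec mp3 <output_audio_path> -y
        # where -vn drops the video stream and -acodec mp3 encodes the audio as MP3.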

        subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
        )

    def _transcribe_audio(self) -> List[BaseModel]:
        if not self.api_key:
            raise ValueError("API key for AssemblyAI is not configured")
        audio_file_path_str = str(self.output_audio_path)
        loader = AssemblyAIAudioTranscriptLoader(
            file_path=audio_file_path_str,
            api_key=self.api_key,
            transcript_format=TranscriptFormat.SUBTITLES_SRT,
        )
        docs = loader.load()
        return self._create_transcript_models(docs)

    @staticmethod
    def _create_transcript_models(docs: List[Document]) -> List[BaseModel]:
        # Assuming docs is a list of Documents with .page_content as the transcript data
        models = []
        for doc in docs:
            models.extend(AudioProcessor._parse_transcript(doc.page_content))
        return models

    @staticmethod
    def _parse_transcript(srt_content: str) -> List[BaseModel]:
        models = []
        entries = srt_content.strip().split("\n\n")  # Split based on double newline
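        # Each entry is expected to be a standard SRT block, for example:
        #   1
        #   00:00:01,000 --> 00:00:04,000
        #   Example subtitle text
        # (illustrative values; the index line is parsed but not used below)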

        for entry in entries:
            index, timespan, *subtitle_lines = entry.split("\n")

            # If not a valid entry format, skip
            if len(subtitle_lines) == 0:
                continue

            start_time, end_time = timespan.split(" --> ")
            subtitle_text = " ".join(subtitle_lines).strip()
            models.append(AudioModel.from_srt(start_time, end_time, subtitle_text))

        return models
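A minimal usage sketch of the class above, assuming the ffmpeg binary is on PATH; the API key placeholder and file name are illustrative:

processor = AudioProcessor(api_key="<assemblyai-api-key>")
# Extracts the audio, transcribes it, and returns one AudioModel per SRT entry
# (start time, end time, subtitle text), deleting the temporary MP3 afterwards.
audio_models = processor.process("example_video.mp4")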