From b7d180a70d3aae281e1a0400ef58c363628aa044 Mon Sep 17 00:00:00 2001 From: LunarECL <38317983+LunarECL@users.noreply.github.com> Date: Sat, 30 Mar 2024 10:57:53 +0900 Subject: [PATCH] experimental[minor]: Create Closed Captioning Chain for .mp4 videos (#14059) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Description: Video imagery to text (Closed Captioning) This pull request introduces the VideoCaptioningChain, a tool for automated video captioning. It processes audio and video to generate subtitles and closed captions, merging them into a single SRT output. Issue: https://github.com/langchain-ai/langchain/issues/11770 Dependencies: opencv-python, ffmpeg-python, assemblyai, transformers, pillow, torch, openai Tag maintainer: @baskaryan @hwchase17 Hello!
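For reviewers, here is a minimal usage sketch of the new chain (the API keys and video URL below are placeholders; the cookbook notebook in this PR has the full walkthrough):

```python
from langchain_experimental.video_captioning import VideoCaptioningChain
from langchain_openai import ChatOpenAI

# Placeholder credentials and URL -- substitute real values before running
chain = VideoCaptioningChain(
    llm=ChatOpenAI(model="gpt-4", max_tokens=4000, openai_api_key="sk-..."),
    assemblyai_key="<ASSEMBLYAI_API_KEY>",
)

# Produces a single SRT string combining subtitles and closed captions
srt_content = chain.run(video_file_path="https://example.com/video.mp4")

with open("output.srt", "w") as f:
    f.write(srt_content)
```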

We are a group of students from the University of Toronto (@LunarECL, @TomSadan, @nicoledroi1, @A2113S) who want to make a contribution to the LangChain community! We have run make format, make lint, and make test locally before submitting the PR. To our knowledge, our changes do not introduce any new errors. Thank you for taking the time to review our PR! --------- Co-authored-by: Bagatur --- .../video_captioning/video_captioning.ipynb | 174 +++++++++++ .../video_captioning/__init__.py | 3 + .../video_captioning/base.py | 148 ++++++++++ .../video_captioning/models.py | 150 ++++++++++ .../video_captioning/prompts.py | 90 ++++++ .../services/audio_service.py | 92 ++++++ .../services/caption_service.py | 279 ++++++++++++++++++ .../services/combine_service.py | 141 +++++++++ .../services/image_service.py | 111 +++++++ .../video_captioning/services/srt_service.py | 14 + libs/experimental/poetry.lock | 121 +++++++- libs/experimental/pyproject.toml | 1 + .../test_video_captioning.py | 28 ++ 13 files changed, 1343 insertions(+), 9 deletions(-) create mode 100644 cookbook/video_captioning/video_captioning.ipynb create mode 100644 libs/experimental/langchain_experimental/video_captioning/__init__.py create mode 100644 libs/experimental/langchain_experimental/video_captioning/base.py create mode 100644 libs/experimental/langchain_experimental/video_captioning/models.py create mode 100644 libs/experimental/langchain_experimental/video_captioning/prompts.py create mode 100644 libs/experimental/langchain_experimental/video_captioning/services/audio_service.py create mode 100644 libs/experimental/langchain_experimental/video_captioning/services/caption_service.py create mode 100644 libs/experimental/langchain_experimental/video_captioning/services/combine_service.py create mode 100644 libs/experimental/langchain_experimental/video_captioning/services/image_service.py create mode 100644 libs/experimental/langchain_experimental/video_captioning/services/srt_service.py create mode 100644 libs/experimental/tests/integration_tests/test_video_captioning.py diff --git a/cookbook/video_captioning/video_captioning.ipynb b/cookbook/video_captioning/video_captioning.ipynb new file mode 100644 index 0000000000..f232410c97 --- /dev/null +++ b/cookbook/video_captioning/video_captioning.ipynb @@ -0,0 +1,174 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Video Captioning\n", + "This notebook shows how to use VideoCaptioningChain, which is implemented using LangChain's ImageCaptionLoader and AssemblyAI to produce .srt files.\n", + "\n", + "This system autogenerates both subtitles and closed captions from a video URL."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installing Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install ffmpeg-python\n", + "# !pip install assemblyai\n", + "# !pip install opencv-python\n", + "# !pip install torch\n", + "# !pip install pillow\n", + "# !pip install transformers\n", + "# !pip install langchain langchain-experimental" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2023-11-30T03:39:14.078232Z", + "start_time": "2023-11-30T03:39:12.534410Z" + } + }, + "outputs": [], + "source": [ + "import getpass\n", + "\n", + "from langchain_experimental.video_captioning import VideoCaptioningChain\n", + "from langchain.chat_models.openai import ChatOpenAI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting up API Keys" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2023-11-30T03:39:17.423806Z", + "start_time": "2023-11-30T03:39:17.417945Z" + } + }, + "outputs": [], + "source": [ + "OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n", + "\n", + "ASSEMBLYAI_API_KEY = getpass.getpass(\"AssemblyAI API Key:\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Required parameters:**\n", + "\n", + "* llm: The language model this chain will use to get suggestions on how to refine the closed captions\n", + "* assemblyai_key: The API key for AssemblyAI, used to generate the subtitles\n", + "\n", + "**Optional Parameters:**\n", + "\n", + "* verbose (Default: True): Sets verbose mode for downstream chain calls\n", + "* use_logging (Default: True): Logs the chain's progress via the run manager\n", + "* frame_skip (Default: None): Chooses how many video frames to skip during processing. Increasing it results in faster execution but less accurate results. If None, the frame skip is calculated automatically from the framerate. Set this to 0 to sample all frames\n", + "* image_delta_threshold (Default: 3000000): Sets the sensitivity for what the image processor considers a change of scenery in the video, used to delimit closed captions. Higher = less sensitive\n", + "* closed_caption_char_limit (Default: 20): Sets the character limit on closed captions\n", + "* closed_caption_similarity_threshold (Default: 80): Sets the percentage threshold for how similar two closed-caption models must be in order to be clustered into one longer closed caption\n", + "* use_unclustered_video_models (Default: False): If true, closed captions that could not be clustered will be included. May result in erratic behaviour from closed captions, such as very short-lived or fast-changing captions.
Enabling this is experimental and not recommended" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# https://ia804703.us.archive.org/27/items/uh-oh-here-we-go-again/Uh-Oh%2C%20Here%20we%20go%20again.mp4\n", + "# https://ia601200.us.archive.org/9/items/f58703d4-61e6-4f8f-8c08-b42c7e16f7cb/f58703d4-61e6-4f8f-8c08-b42c7e16f7cb.mp4\n", + "\n", + "chain = VideoCaptioningChain(\n", + " llm=ChatOpenAI(model=\"gpt-4\", max_tokens=4000, openai_api_key=OPENAI_API_KEY),\n", + " assemblyai_key=ASSEMBLYAI_API_KEY,\n", + ")\n", + "\n", + "srt_content = chain.run(\n", + " video_file_path=\"https://ia601200.us.archive.org/9/items/f58703d4-61e6-4f8f-8c08-b42c7e16f7cb/f58703d4-61e6-4f8f-8c08-b42c7e16f7cb.mp4\"\n", + ")\n", + "\n", + "print(srt_content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Writing output to .srt file" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"output.srt\", \"w\") as file:\n", + " file.write(srt_content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "myenv", + "language": "python", + "name": "myenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + }, + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/experimental/langchain_experimental/video_captioning/__init__.py b/libs/experimental/langchain_experimental/video_captioning/__init__.py new file mode 100644 index 0000000000..4cf101f1eb --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/__init__.py @@ -0,0 +1,3 @@ +from langchain_experimental.video_captioning.base import VideoCaptioningChain + +__all__ = ["VideoCaptioningChain"] diff --git a/libs/experimental/langchain_experimental/video_captioning/base.py b/libs/experimental/langchain_experimental/video_captioning/base.py new file mode 100644 index 0000000000..5ae2c02dfa --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/base.py @@ -0,0 +1,148 @@ +from typing import Any, Dict, List, Optional + +from langchain.chains.base import Chain +from langchain_core.callbacks import CallbackManagerForChainRun +from langchain_core.language_models import BaseLanguageModel +from langchain_core.prompts import PromptTemplate +from langchain_core.pydantic_v1 import Extra + +from langchain_experimental.video_captioning.services.audio_service import ( + AudioProcessor, +) +from langchain_experimental.video_captioning.services.caption_service import ( + CaptionProcessor, +) +from langchain_experimental.video_captioning.services.combine_service import ( + CombineProcessor, +) +from langchain_experimental.video_captioning.services.image_service import ( + ImageProcessor, +) +from langchain_experimental.video_captioning.services.srt_service import SRTProcessor + + +class VideoCaptioningChain(Chain): + """ + Video Captioning Chain. 
+ """ + + llm: BaseLanguageModel + assemblyai_key: str + prompt: Optional[PromptTemplate] = None + verbose: bool = True + use_logging: Optional[bool] = True + frame_skip: int = -1 + image_delta_threshold: int = 3000000 + closed_caption_char_limit: int = 20 + closed_caption_similarity_threshold: int = 80 + use_unclustered_video_models: bool = False + + class Config: + extra = Extra.allow + arbitrary_types_allowed = True + + @property + def input_keys(self) -> List[str]: + return ["video_file_path"] + + @property + def output_keys(self) -> List[str]: + return ["srt"] + + def _call( + self, + inputs: Dict[str, Any], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> Dict[str, str]: + if "video_file_path" not in inputs: + raise ValueError( + "Missing 'video_file_path' in inputs for video captioning." + ) + video_file_path = inputs["video_file_path"] + nl = "\n" + + run_manager.on_text( + "Loading processors..." + nl + ) if self.use_logging and run_manager else None + + audio_processor = AudioProcessor(api_key=self.assemblyai_key) + image_processor = ImageProcessor( + frame_skip=self.frame_skip, threshold=self.image_delta_threshold + ) + caption_processor = CaptionProcessor( + llm=self.llm, + verbose=self.verbose, + similarity_threshold=self.closed_caption_similarity_threshold, + use_unclustered_models=self.use_unclustered_video_models, + ) + combine_processor = CombineProcessor( + llm=self.llm, + verbose=self.verbose, + char_limit=self.closed_caption_char_limit, + ) + srt_processor = SRTProcessor() + + run_manager.on_text( + "Finished loading processors." + + nl + + "Generating subtitles from audio..." + + nl + ) if self.use_logging and run_manager else None + + # Get models for speech to text subtitles + audio_models = audio_processor.process(video_file_path, run_manager) + run_manager.on_text( + "Finished generating subtitles:" + + nl + + f"{nl.join(str(obj) for obj in audio_models)}" + + nl + + "Generating closed captions from video..." + + nl + ) if self.use_logging and run_manager else None + + # Get models for image frame description + image_models = image_processor.process(video_file_path, run_manager) + run_manager.on_text( + "Finished generating closed captions:" + + nl + + f"{nl.join(str(obj) for obj in image_models)}" + + nl + + "Refining closed captions..." + + nl + ) if self.use_logging and run_manager else None + + # Get models for video event closed-captions + video_models = caption_processor.process(image_models, run_manager) + run_manager.on_text( + "Finished refining closed captions:" + + nl + + f"{nl.join(str(obj) for obj in video_models)}" + + nl + + "Combining subtitles with closed captions..." + + nl + ) if self.use_logging and run_manager else None + + # Combine the subtitle models with the closed-caption models + caption_models = combine_processor.process( + video_models, audio_models, run_manager + ) + run_manager.on_text( + "Finished combining subtitles with closed captions:" + + nl + + f"{nl.join(str(obj) for obj in caption_models)}" + + nl + + "Generating SRT file..." + + nl + ) if self.use_logging and run_manager else None + + # Convert the combined model to SRT format + srt_content = srt_processor.process(caption_models) + run_manager.on_text( + "Finished generating srt file." 
+ nl + ) if self.use_logging and run_manager else None + + return {"srt": srt_content} + + @property + def _chain_type(self) -> str: + return "video_captioning_chain" diff --git a/libs/experimental/langchain_experimental/video_captioning/models.py b/libs/experimental/langchain_experimental/video_captioning/models.py new file mode 100644 index 0000000000..b464b435d7 --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/models.py @@ -0,0 +1,150 @@ +from datetime import datetime +from typing import Any + + +class BaseModel: + def __init__(self, start_time: int, end_time: int) -> None: + # Start and end times representing milliseconds + self._start_time = start_time + self._end_time = end_time + + @property + def start_time(self) -> int: + return self._start_time + + @start_time.setter + def start_time(self, value: int) -> None: + self._start_time = value + + @property + def end_time(self) -> int: + return self._end_time + + @end_time.setter + def end_time(self, value: int) -> None: + self._end_time = value + + def __str__(self) -> str: + return f"start_time: {self.start_time}, end_time: {self.end_time}" + + @classmethod + def from_srt(cls, start_time: str, end_time: str, *args: Any) -> "BaseModel": + return cls( + cls._srt_time_to_ms(start_time), cls._srt_time_to_ms(end_time), *args + ) + + @staticmethod + def _srt_time_to_ms(srt_time_string: str) -> int: + # Parse SRT time string into a datetime object + time_format = "%H:%M:%S,%f" + dt = datetime.strptime(srt_time_string, time_format) + ms = dt.microsecond // 1000 + return (dt.hour * 3600 + dt.minute * 60 + dt.second) * 1000 + ms + + +class VideoModel(BaseModel): + def __init__(self, start_time: int, end_time: int, image_description: str) -> None: + super().__init__(start_time, end_time) + self._image_description = image_description + + @property + def image_description(self) -> str: + return self._image_description + + @image_description.setter + def image_description(self, value: str) -> None: + self._image_description = value + + def __str__(self) -> str: + return f"{super().__str__()}, image_description: {self.image_description}" + + def similarity_score(self, other: "VideoModel") -> float: + # Tokenize the image descriptions by extracting individual words, stripping + # trailing 's' (plural = singular) and converting the words to lowercase in + # order to be case-insensitive + self_tokenized = set( + word.lower().rstrip("s") for word in self.image_description.split() + ) + other_tokenized = set( + word.lower().rstrip("s") for word in other.image_description.split() + ) + + # Find common words + common_words = self_tokenized.intersection(other_tokenized) + + # Calculate similarity score + similarity_score = ( + len(common_words) / max(len(self_tokenized), len(other_tokenized)) * 100 + ) + + return similarity_score + + +class AudioModel(BaseModel): + def __init__(self, start_time: int, end_time: int, subtitle_text: str) -> None: + super().__init__(start_time, end_time) + self._subtitle_text = subtitle_text + + @property + def subtitle_text(self) -> str: + return self._subtitle_text + + @subtitle_text.setter + def subtitle_text(self, value: str) -> None: + self._subtitle_text = value + + def __str__(self) -> str: + return f"{super().__str__()}, subtitle_text: {self.subtitle_text}" + + +class CaptionModel(BaseModel): + def __init__(self, start_time: int, end_time: int, closed_caption: str) -> None: + super().__init__(start_time, end_time) + self._closed_caption = closed_caption + + @property + def closed_caption(self) -> str: + return
self._closed_caption + + @closed_caption.setter + def closed_caption(self, value: str) -> None: + self._closed_caption = value + + def add_subtitle_text(self, subtitle_text: str) -> "CaptionModel": + self._closed_caption = self._closed_caption + " " + subtitle_text + return self + + def __str__(self) -> str: + return f"{super().__str__()}, closed_caption: {self.closed_caption}" + + def to_srt_entry(self, index: int) -> str: + def _ms_to_srt_time(ms: int) -> str: + """Converts milliseconds to SRT time format 'HH:MM:SS,mmm'.""" + hours = int(ms // 3600000) + minutes = int((ms % 3600000) // 60000) + seconds = int((ms % 60000) // 1000) + milliseconds = int(ms % 1000) + + return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}" + + return "\n".join( + [ + f"""{index} + {_ms_to_srt_time(self._start_time)} --> {_ms_to_srt_time(self._end_time)} + {self._closed_caption}""", + ] + ) + + @classmethod + def from_audio_model(cls, audio_model: AudioModel) -> "CaptionModel": + return cls( + audio_model.start_time, audio_model.end_time, audio_model.subtitle_text + ) + + @classmethod + def from_video_model(cls, video_model: VideoModel) -> "CaptionModel": + return cls( + video_model.start_time, + video_model.end_time, + f"[{video_model.image_description}]", + ) diff --git a/libs/experimental/langchain_experimental/video_captioning/prompts.py b/libs/experimental/langchain_experimental/video_captioning/prompts.py new file mode 100644 index 0000000000..547f174201 --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/prompts.py @@ -0,0 +1,90 @@ +# flake8: noqa +from langchain_core.prompts import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, +) +from langchain_core.messages import SystemMessage + +JOIN_SIMILAR_VIDEO_MODELS_TEMPLATE = """ +I will provide you with several descriptions depicting events in one scene. +Your task is to combine these descriptions into one description that contains only the important details from all descriptions. +Especially if the two descriptions are very similar, make sure your response doesn't repeat itself. +IMPORTANT: Do not make up a description. Do not make up events or anything that happened outside of the descriptions I am to provide you. +I will now provide an example for you to learn from: +Example: Description 1: The cat is at the beach, Description 2: The cat is eating lunch, Description 3: The cat is enjoying his time with friends +Result: The cat is at the beach, eating lunch with his friends +Now that I gave you the example, I will explain to you what exactly you need to return: +Just give back one description, the description which is a combination of the descriptions you are provided with. +Do not include anything else in your response other than the combined description. +IMPORTANT: the output in your response should be 'Result:text', where text is the description you generated. +Here is the data for you to work with in order to formulate your response: +""" + +JOIN_SIMILAR_VIDEO_MODELS_PROMPT = ChatPromptTemplate( + messages=[ + SystemMessage(content=JOIN_SIMILAR_VIDEO_MODELS_TEMPLATE), + HumanMessagePromptTemplate.from_template("{descriptions}"), + ] +) + +REMOVE_VIDEO_MODEL_DESCRIPTION_TEMPLATE = """ +Given a closed-caption description of an image or scene, remove any common prefixes like "an image of," "a scene of," or "footage of." +For instance, if the description is "an image of a beautiful landscape," the modified version should be "a beautiful landscape." 
+ +IMPORTANT: the output in your response should be 'Result:text', where text is the description you generated. + +Here are some examples: + +Input: an image of a beautiful landscape +Result: a beautiful landscape + +Input: a scene of people enjoying a picnic +Result: people enjoying a picnic + +Below is the input for you to generate the result from: +""" + +REMOVE_VIDEO_MODEL_DESCRIPTION_PROMPT = ChatPromptTemplate( + messages=[ + SystemMessage(content=REMOVE_VIDEO_MODEL_DESCRIPTION_TEMPLATE), + HumanMessagePromptTemplate.from_template("Input: {description}"), + ] +) + +VALIDATE_AND_ADJUST_DESCRIPTION_TEMPLATE = """ +You are tasked with enhancing closed-caption descriptions based on corresponding subtitles from the audio of a real movie clip. +Assignment details, from highest to lowest priority: + +1) If the subtitle exceeds Limit characters, creatively rewrite the description to not exceed the character limit, preserving as many details as you can. + If you feel that you cannot complete the response under the character limit, you must omit details in order to remain below the character limit. + +2) If the details in the subtitle provide meaningful additional information to its closed-caption description, incorporate those details into the description. + +Enhance the closed-caption description by integrating details from the subtitle if they contribute meaningful information. + +Example: +Subtitle: car screeching, tires squealing +Closed-Caption Description: A car speeds down the street. + +Output: Result: A car speeds down the street, its tires screeching and squealing. + +**IMPORTANT**: Remember your assignment details when formulating your response! YOU MUST NOT EXCEED LIMIT CHARACTERS at human message. + +***IMPORTANT***: You must only return the following text in your response. You may not return a response that does not follow the exact format in the next line: +Result: Text + +**** YOU MUST PROVIDE ME WITH THE BEST ANSWER YOU CAN COME UP WITH, +**** EVEN IF YOU DEEM THAT IT IS A BAD ONE. 
YOU MUST ONLY RESPOND IN THE FORMAT IN THE NEXT LINE: +Result: Text + +Below is the data provided, generate a response using this data: +""" + +VALIDATE_AND_ADJUST_DESCRIPTION_PROMPT = ChatPromptTemplate( + messages=[ + SystemMessage(content=VALIDATE_AND_ADJUST_DESCRIPTION_TEMPLATE), + HumanMessagePromptTemplate.from_template( + "Limit: {limit}\nSubtitle: {subtitle}\nClosed-Caption Description: {description}" + ), + ] +) diff --git a/libs/experimental/langchain_experimental/video_captioning/services/audio_service.py b/libs/experimental/langchain_experimental/video_captioning/services/audio_service.py new file mode 100644 index 0000000000..b7844df3f7 --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/services/audio_service.py @@ -0,0 +1,92 @@ +import subprocess +from pathlib import Path +from typing import List, Optional + +from langchain.callbacks.manager import CallbackManagerForChainRun +from langchain.schema import Document +from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader +from langchain_community.document_loaders.assemblyai import TranscriptFormat + +from langchain_experimental.video_captioning.models import AudioModel, BaseModel + + +class AudioProcessor: + def __init__( + self, + api_key: str, + output_audio_path: str = "output_audio.mp3", + ): + self.output_audio_path = Path(output_audio_path) + self.api_key = api_key + + def process( + self, + video_file_path: str, + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> list: + try: + self._extract_audio(video_file_path) + return self._transcribe_audio() + finally: + # Cleanup: Delete the MP3 file after processing + try: + self.output_audio_path.unlink() + except FileNotFoundError: + pass # File not found, nothing to delete + + def _extract_audio(self, video_file_path: str) -> None: + # Ensure the directory exists where the output file will be saved + self.output_audio_path.parent.mkdir(parents=True, exist_ok=True) + + command = [ + "ffmpeg", + "-i", + video_file_path, + "-vn", + "-acodec", + "mp3", + self.output_audio_path.as_posix(), + "-y", # The '-y' flag overwrites the output file if it exists + ] + + subprocess.run( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True + ) + + def _transcribe_audio(self) -> List[BaseModel]: + if not self.api_key: + raise ValueError("API key for AssemblyAI is not configured") + audio_file_path_str = str(self.output_audio_path) + loader = AssemblyAIAudioTranscriptLoader( + file_path=audio_file_path_str, + api_key=self.api_key, + transcript_format=TranscriptFormat.SUBTITLES_SRT, + ) + docs = loader.load() + return self._create_transcript_models(docs) + + @staticmethod + def _create_transcript_models(docs: List[Document]) -> List[BaseModel]: + # Assuming docs is a list of Documents with .page_content as the transcript data + models = [] + for doc in docs: + models.extend(AudioProcessor._parse_transcript(doc.page_content)) + return models + + @staticmethod + def _parse_transcript(srt_content: str) -> List[BaseModel]: + models = [] + entries = srt_content.strip().split("\n\n") # Split based on double newline + + for entry in entries: + index, timespan, *subtitle_lines = entry.split("\n") + + # If not a valid entry format, skip + if len(subtitle_lines) == 0: + continue + + start_time, end_time = timespan.split(" --> ") + subtitle_text = " ".join(subtitle_lines).strip() + models.append(AudioModel.from_srt(start_time, end_time, subtitle_text)) + + return models diff --git 
a/libs/experimental/langchain_experimental/video_captioning/services/caption_service.py b/libs/experimental/langchain_experimental/video_captioning/services/caption_service.py new file mode 100644 index 0000000000..5d844a5c1e --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/services/caption_service.py @@ -0,0 +1,279 @@ +from typing import Dict, List, Optional, Tuple + +from langchain.callbacks.manager import CallbackManagerForChainRun +from langchain.chains.llm import LLMChain +from langchain_core.language_models import BaseLanguageModel + +from langchain_experimental.video_captioning.models import VideoModel +from langchain_experimental.video_captioning.prompts import ( + JOIN_SIMILAR_VIDEO_MODELS_PROMPT, + REMOVE_VIDEO_MODEL_DESCRIPTION_PROMPT, +) + + +class CaptionProcessor: + def __init__( + self, + llm: BaseLanguageModel, + verbose: bool = True, + similarity_threshold: int = 80, + use_unclustered_models: bool = False, + ) -> None: + self.llm = llm + self.verbose = verbose + + # Set the percentage value for how similar two video model image + # descriptions should be in order for us to cluster them into a group + self._SIMILARITY_THRESHOLD = similarity_threshold + # Set to True if you want to include video models which were not clustered. + # Will likely result in closed-caption artifacts + self._USE_NON_CLUSTERED_VIDEO_MODELS = use_unclustered_models + + def process( + self, + video_models: List[VideoModel], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> List[VideoModel]: + # Remove any consecutive duplicates + video_models = self._remove_consecutive_duplicates(video_models) + + # Holds the video models after clustering has been applied + video_models_post_clustering = [] + # In this case, index represents a divider between clusters + index = 0 + for start, end in self._get_model_clusters(video_models): + start_vm, end_vm = video_models[start], video_models[end] + + if self._USE_NON_CLUSTERED_VIDEO_MODELS: + # Append all the non-clustered models in between model clusters + # staged for OpenAI combination + video_models_post_clustering += video_models[index:start] + index = end + 1 + + # Send to llm for description combination + models_to_combine = video_models[start:index] + combined_description = self._join_similar_video_models( + models_to_combine, run_manager + ) + + # Strip any prefixes that are redundant in the context of closed-captions + stripped_description = self._remove_video_model_description_prefix( + combined_description, run_manager + ) + + # Create a new video model which is the combination of all the models in + # the cluster + combined_and_stripped_model = VideoModel( + start_vm.start_time, end_vm.end_time, stripped_description + ) + + video_models_post_clustering.append(combined_and_stripped_model) + + if self._USE_NON_CLUSTERED_VIDEO_MODELS: + # Append any non-clustered models present after every clustered model + video_models_post_clustering += video_models[index:] + + return video_models_post_clustering + + def _remove_consecutive_duplicates( + self, + video_models: List[VideoModel], + ) -> List[VideoModel]: + buffer: List[VideoModel] = [] + + for video_model in video_models: + # Join this model and the previous model if they have the same image + # description + if ( + len(buffer) > 0 + and buffer[-1].image_description == video_model.image_description + ): + buffer[-1].end_time = video_model.end_time + + else: + buffer.append(video_model) + + return buffer + + def _remove_video_model_description_prefix( + self, 
description: str, run_manager: Optional[CallbackManagerForChainRun] = None + ) -> str: + conversation = LLMChain( + llm=self.llm, + prompt=REMOVE_VIDEO_MODEL_DESCRIPTION_PROMPT, + verbose=True, + callbacks=run_manager.get_child() if run_manager else None, + ) + # Get response from OpenAI using LLMChain + response = conversation({"description": description}) + + # Take out the Result: part of the response + return response["text"].replace("Result:", "").strip() + + def _join_similar_video_models( + self, + video_models: List[VideoModel], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> str: + descriptions = "" + count = 1 + for video_model in video_models: + descriptions += ( + f"Description {count}: " + video_model.image_description + ", " + ) + count += 1 + + # Strip trailing ", " + descriptions = descriptions[:-2] + + conversation = LLMChain( + llm=self.llm, + prompt=JOIN_SIMILAR_VIDEO_MODELS_PROMPT, + verbose=True, + callbacks=run_manager.get_child() if run_manager else None, + ) + # Get response from OpenAI using LLMChain + response = conversation({"descriptions": descriptions}) + + # Take out the Result: part of the response + return response["text"].replace("Result:", "").strip() + + def _get_model_clusters( + self, video_models: List[VideoModel] + ) -> List[Tuple[int, int]]: + # Word bank which maps lowercase words (case-insensitive) with trailing s's + # removed (singular/plural-insensitive) to video model indexes in video_models + word_bank: Dict[str, List[int]] = {} + + # Function which formats words to be inserted into word bank, as specified + # above + def format_word(w: str) -> str: + return w.lower().rstrip("s") + + # Keeps track of the current video model index + index = 0 + for vm in video_models: + for word in vm.image_description.split(): + formatted_word = format_word(word) + word_bank[formatted_word] = ( + word_bank[formatted_word] if formatted_word in word_bank else [] + ) + [index] + index += 1 + + # Keeps track of the current video model index + index = 0 + # Maps video model index to list of other video model indexes that have a + # similarity score above the threshold + sims: Dict[int, List[int]] = {} + for vm in video_models: + # Maps other video model index to number of words it shares in common + # with this video model + matches: Dict[int, int] = {} + for word in vm.image_description.split(): + formatted_word = format_word(word) + for match in word_bank[formatted_word]: + if match != index: + matches[match] = matches[match] + 1 if match in matches else 1 + if matches: + # Get the highest number of words another video model shares with + # this video model + max_words_in_common = max(matches.values()) + + # Get all video model indexes that share the maximum number of words + # with this video model + vms_with_max_words = [ + key + for key, value in matches.items() + if value == max_words_in_common + ] + + # Maps other video model index to its similarity score with this + # video model + sim_scores: Dict[int, float] = {} + + # Compute similarity score for all video models that share the + # highest number of word occurrences with this video model + for vm_index in vms_with_max_words: + sim_scores[vm_index] = video_models[vm_index].similarity_score(vm) + + # Get the highest similarity score another video model shares with + # this video model + max_score = max(sim_scores.values()) + + # Get a list of all video models that have the maximum similarity + # score to this video model + vms_with_max_score = [ + key for key, value in sim_scores.items() 
if value == max_score + ] + + # Finally, transfer all video models with a high enough similarity + # to this video model into the sims dictionary + if max_score >= self._SIMILARITY_THRESHOLD: + sims[index] = [] + for vm_index in vms_with_max_score: + sims[index].append(vm_index) + + index += 1 + + # Maps video model index to boolean, indicates if we have already checked + # this video model's similarity array so that we don't have infinite recursion + already_accessed: Dict[int, bool] = {} + + # Recursively search video_model[vm_index]'s similarity matches to find the + # earliest and latest video model in the cluster (start and end) + def _find_start_and_end(vm_index: int) -> Tuple[int, int]: + close_matches = sims[vm_index] + first_vm, last_vm = min(close_matches), max(close_matches) + first_vm, last_vm = min(vm_index, first_vm), max(vm_index, last_vm) + + if not already_accessed.get(vm_index, None): + already_accessed[vm_index] = True + for close_match in close_matches: + if close_match in sims: + if vm_index in sims[close_match]: + s, e = _find_start_and_end(close_match) + first_vm = min(s, first_vm) + last_vm = max(e, last_vm) + + return first_vm, last_vm + + # Add the video model cluster results into a set + clusters = set() + for vm_index in sims: + clusters.add(_find_start_and_end(vm_index)) + + # Filter the set to include only non-subset intervals + filtered_clusters = set() + for interval in clusters: + start, end = interval[0], interval[1] + is_subset = any( + start >= other_start and end <= other_end + for other_start, other_end in clusters + if interval != (other_start, other_end) + ) + if not is_subset: + filtered_clusters.add(interval) + + # Sort these clusters into a list, sorted using the first element of the + # tuple (index of video model in the cluster with earliest start time) + sorted_clusters = sorted(filtered_clusters, key=lambda x: x[0]) + + # Merge any overlapping clusters into one big cluster + def _merge_overlapping_clusters( + array: List[Tuple[int, int]], + ) -> List[Tuple[int, int]]: + if len(array) <= 1: + return array + + def _merge( + curr: Tuple[int, int], rest: List[Tuple[int, int]] + ) -> List[Tuple[int, int]]: + if curr[1] >= rest[0][0]: + return [(curr[0], rest[0][1])] + rest[1:] + return [curr] + rest + + return _merge(array[0], _merge_overlapping_clusters(array[1:])) + + merged_clusters = _merge_overlapping_clusters(sorted_clusters) + + return merged_clusters diff --git a/libs/experimental/langchain_experimental/video_captioning/services/combine_service.py b/libs/experimental/langchain_experimental/video_captioning/services/combine_service.py new file mode 100644 index 0000000000..09d14c949f --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/services/combine_service.py @@ -0,0 +1,141 @@ +from typing import Dict, List, Optional, Tuple + +from langchain.callbacks.manager import CallbackManagerForChainRun +from langchain.chains.llm import LLMChain +from langchain.schema.language_model import BaseLanguageModel + +from langchain_experimental.video_captioning.models import ( + AudioModel, + CaptionModel, + VideoModel, +) +from langchain_experimental.video_captioning.prompts import ( + VALIDATE_AND_ADJUST_DESCRIPTION_PROMPT, +) + + +class CombineProcessor: + def __init__( + self, llm: BaseLanguageModel, verbose: bool = True, char_limit: int = 20 + ): + self.llm = llm + self.verbose = verbose + + # Adjust as needed. 
Be careful adjusting it too low because OpenAI may + # produce unwanted output + self._CHAR_LIMIT = char_limit + + def process( + self, + video_models: List[VideoModel], + audio_models: List[AudioModel], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> List[CaptionModel]: + caption_models = [] + audio_index = 0 + + for video_model in video_models: + while audio_index < len(audio_models): + audio_model = audio_models[audio_index] + overlap_start, overlap_end = self._check_overlap( + video_model, audio_model + ) + + if overlap_start == -1: + if audio_model.start_time <= video_model.start_time: + caption_models.append( + CaptionModel.from_audio_model(audio_model) + ) + audio_index += 1 + else: + break + else: + self._handle_overlap( + caption_models, + video_model, + audio_model, + overlap_start, + overlap_end, + ) + + # Update audio model or pop if it's fully used + if audio_model.end_time <= overlap_end: + audio_index += 1 + else: + audio_model.start_time = overlap_end + + caption_models.append(CaptionModel.from_video_model(video_model)) + + # Add remaining audio models + for i in range(audio_index, len(audio_models)): + caption_models.append(CaptionModel.from_audio_model(audio_models[i])) + + return caption_models + + @staticmethod + def _check_overlap( + video_model: VideoModel, audio_model: AudioModel + ) -> Tuple[int, int]: + overlap_start = max(audio_model.start_time, video_model.start_time) + overlap_end = min(audio_model.end_time, video_model.end_time) + if overlap_start < overlap_end: + return overlap_start, overlap_end + return -1, -1 + + def _handle_overlap( + self, + caption_models: List[CaptionModel], + video_model: VideoModel, + audio_model: AudioModel, + overlap_start: int, + overlap_end: int, + ) -> None: + # Handle non-overlapping part + if video_model.start_time < overlap_start: + caption_models.append( + CaptionModel.from_video_model( + VideoModel( + video_model.start_time, + overlap_start, + video_model.image_description, + ) + ) + ) + video_model.start_time = overlap_start + + # Handle the combined caption during overlap + caption_text = self._validate_and_adjust_description(audio_model, video_model) + subtitle_text = audio_model.subtitle_text + caption_models.append( + CaptionModel.from_video_model( + VideoModel(overlap_start, overlap_end, caption_text) + ).add_subtitle_text(subtitle_text) + ) + + # Update video model start time for remaining part + if video_model.end_time > overlap_end: + video_model.start_time = overlap_end + + def _validate_and_adjust_description( + self, + audio_model: AudioModel, + video_model: VideoModel, + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> str: + conversation = LLMChain( + llm=self.llm, + prompt=VALIDATE_AND_ADJUST_DESCRIPTION_PROMPT, + verbose=True, + callbacks=run_manager.get_child() if run_manager else None, + ) + # Get response from OpenAI using LLMChain + response: Dict[str, str] = conversation( + { + "limit": self._CHAR_LIMIT, + "subtitle": audio_model.subtitle_text, + "description": video_model.image_description, + } + ) + + # Take out the Result: part of the response + return response["text"].replace("Result:", "").strip() diff --git a/libs/experimental/langchain_experimental/video_captioning/services/image_service.py b/libs/experimental/langchain_experimental/video_captioning/services/image_service.py new file mode 100644 index 0000000000..551499222c --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/services/image_service.py @@ -0,0 +1,111 @@ +from typing import 
List, Optional + +import numpy as np +from langchain_community.document_loaders import ImageCaptionLoader +from langchain_core.callbacks import CallbackManagerForChainRun + +from langchain_experimental.video_captioning.models import VideoModel + + +class ImageProcessor: + _SAMPLES_PER_SECOND: int = 4 + + def __init__(self, frame_skip: int = -1, threshold: int = 3000000) -> None: + self.threshold = threshold + self.frame_skip = frame_skip + + def process( + self, + video_file_path: str, + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> list: + return self._extract_frames(video_file_path) + + def _extract_frames(self, video_file_path: str) -> list: + try: + import cv2 + from cv2.typing import MatLike + except ImportError as e: + raise ImportError( + "Unable to import cv2, please install it with " + "`pip install -U opencv-python`" + ) from e + video_models: List[VideoModel] = [] + + def _add_model(start_time: int, end_time: int) -> None: + # Seek to the middle of the interval and caption that frame + middle_frame_time = (start_time + end_time) / 2 + cap.set(cv2.CAP_PROP_POS_MSEC, middle_frame_time) + ret_mid, middle_frame = cap.read() + + # Encode the frame to bytes (fall back to the current frame if needed) + _, encoded_frame = cv2.imencode(".jpg", middle_frame if ret_mid else frame) + notable_frame_bytes = encoded_frame.tobytes() + + cap.set(cv2.CAP_PROP_POS_MSEC, end_time) + + # Create an instance of the ImageCaptionLoader + loader = ImageCaptionLoader(images=notable_frame_bytes) + + # Load captions for the images + list_docs = loader.load() + + video_model = VideoModel( + start_time, + end_time, + list_docs[len(list_docs) - 1].page_content.replace("[SEP]", "").strip(), + ) + video_models.append(video_model) + + def _is_notable_frame(frame1: MatLike, frame2: MatLike, threshold: int) -> bool: + # Convert frames to grayscale + gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY) + gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY) + + # Compute absolute difference between frames + frame_diff = cv2.absdiff(gray1, gray2) + + # Apply threshold to identify notable differences + _, thresholded_diff = cv2.threshold(frame_diff, 30, 255, cv2.THRESH_BINARY) + + # Sum the white pixel values (indicating differences) + num_diff_pixels = np.sum(thresholded_diff) + + return num_diff_pixels > threshold + + # Open the video file + cap = cv2.VideoCapture(video_file_path) + + if self.frame_skip == -1: + self.frame_skip = int(cap.get(cv2.CAP_PROP_FPS)) // self._SAMPLES_PER_SECOND + + # Read the first frame + ret, prev_frame = cap.read() + + # Loop through the video frames + start_time = 0 + end_time = 0 + + while True: + # Read the next frame + ret, frame = cap.read() + if not ret: + break  # Break the loop if there are no more frames + + # Check if the current frame is notable + if _is_notable_frame(prev_frame, frame, self.threshold): + end_time = int(cap.get(cv2.CAP_PROP_POS_MSEC)) + _add_model(start_time, end_time) + start_time = end_time + + # Update the previous frame + prev_frame = frame.copy() + + # Increment the frame position by the skip value + cap.set( + cv2.CAP_PROP_POS_FRAMES, + cap.get(cv2.CAP_PROP_POS_FRAMES) + self.frame_skip, + ) + + # Release the video capture object + cap.release() + + return video_models diff --git a/libs/experimental/langchain_experimental/video_captioning/services/srt_service.py b/libs/experimental/langchain_experimental/video_captioning/services/srt_service.py new file mode 100644 index 0000000000..4b09490400 --- /dev/null +++ b/libs/experimental/langchain_experimental/video_captioning/services/srt_service.py @@ -0,0 +1,14 @@ +from typing import List + +from langchain_experimental.video_captioning.models import CaptionModel +
+ +class SRTProcessor: + @staticmethod + def process(caption_models: List[CaptionModel]) -> str: + """Generates the full SRT content from a list of caption models.""" + srt_entries = [] + for index, model in enumerate(caption_models, start=1): + srt_entries.append(model.to_srt_entry(index)) + + return "\n".join(srt_entries) diff --git a/libs/experimental/poetry.lock b/libs/experimental/poetry.lock index b9e27240ba..dec78ee130 100644 --- a/libs/experimental/poetry.lock +++ b/libs/experimental/poetry.lock @@ -783,6 +783,17 @@ files = [ {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, ] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "exceptiongroup" version = "1.2.0" @@ -1740,12 +1751,12 @@ all = [] azure = ["azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-textanalytics (>=5.3.0,<6.0.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-core (>=1.26.4,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "azure-search-documents (==11.4.0b8)", "openai (<2)"] clarifai = ["clarifai (>=9.1.0)"] cli = ["typer (>=0.9.0,<0.10.0)"] -cohere = ["cohere (>=4,<5)"] +cohere = ["cohere (>=4,<6)"] docarray = ["docarray[hnswlib] (>=0.32.0,<0.33.0)"] embeddings = ["sentence-transformers (>=2,<3)"] -extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cohere (>=4,<5)", "couchbase (>=4.1.9,<5.0.0)", "dashvector (>=1.0.1,<2.0.0)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "langchain-openai (>=0.0.2,<0.1)", "lxml (>=4.9.2,<5.0.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "openai (<2)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy 
(>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)"] +extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cohere (>=4,<6)", "couchbase (>=4.1.9,<5.0.0)", "dashvector (>=1.0.1,<2.0.0)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "langchain-openai (>=0.0.2,<0.1)", "lxml (>=4.9.3,<6.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "openai (<2)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)"] javascript = ["esprima (>=4.0.1,<5.0.0)"] -llms = ["clarifai (>=9.1.0)", "cohere (>=4,<5)", "huggingface_hub (>=0,<1)", "manifest-ml (>=0.0.1,<0.0.2)", "nlpcloud (>=1,<2)", "openai (<2)", "openlm (>=0.0.5,<0.0.6)", "torch (>=1,<3)", "transformers (>=4,<5)"] +llms = ["clarifai (>=9.1.0)", "cohere (>=4,<6)", "huggingface_hub (>=0,<1)", "manifest-ml (>=0.0.1,<0.0.2)", "nlpcloud (>=1,<2)", "openai (<2)", "openlm (>=0.0.5,<0.0.6)", "torch (>=1,<3)", "transformers (>=4,<5)"] openai = ["openai (<2)", "tiktoken (>=0.3.2,<0.6.0)"] qdrant = ["qdrant-client (>=1.3.1,<2.0.0)"] text-helpers = ["chardet (>=5.1.0,<6.0.0)"] @@ -1776,7 +1787,7 @@ tenacity = "^8.1.0" [package.extras] cli = ["typer (>=0.9.0,<0.10.0)"] -extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-ai-documentintelligence (>=1.0.0b1,<2.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cloudpickle (>=2.0.0)", "cloudpickle (>=2.0.0)", "cohere (>=4,<5)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "elasticsearch 
(>=8.12.0,<9.0.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "friendli-client (>=1.2.4,<2.0.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "gradientai (>=1.4.0,<2.0.0)", "hdbcli (>=2.19.21,<3.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "httpx (>=0.24.1,<0.25.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "lxml (>=4.9.2,<5.0.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "nvidia-riva-client (>=2.14.0,<3.0.0)", "oci (>=2.119.1,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "oracle-ads (>=2.9.1,<3.0.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "tidb-vector (>=0.0.3,<1.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "tree-sitter (>=0.20.2,<0.21.0)", "tree-sitter-languages (>=1.8.0,<2.0.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)", "zhipuai (>=1.0.7,<2.0.0)"] +extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-ai-documentintelligence (>=1.0.0b1,<2.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cloudpickle (>=2.0.0)", "cloudpickle (>=2.0.0)", "cohere (>=4,<5)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "elasticsearch (>=8.12.0,<9.0.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "friendli-client (>=1.2.4,<2.0.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "gradientai (>=1.4.0,<2.0.0)", "hdbcli (>=2.19.21,<3.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "httpx (>=0.24.1,<0.25.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "lxml (>=4.9.3,<6.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "nvidia-riva-client (>=2.14.0,<3.0.0)", "oci (>=2.119.1,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "oracle-ads (>=2.9.1,<3.0.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "premai (>=0.3.25,<0.4.0)", "psychicapi 
(>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "tidb-vector (>=0.0.3,<1.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "tree-sitter (>=0.20.2,<0.21.0)", "tree-sitter-languages (>=1.8.0,<2.0.0)", "upstash-redis (>=0.15.0,<0.16.0)", "vdms (>=0.0.20,<0.0.21)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)", "zhipuai (>=1.0.7,<2.0.0)"] [package.source] type = "directory" @@ -1784,7 +1795,7 @@ url = "../community" [[package]] name = "langchain-core" -version = "0.1.33" +version = "0.1.36" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" @@ -1792,7 +1803,6 @@ files = [] develop = true [package.dependencies] -anyio = ">=3,<5" jsonpatch = "^1.33" langsmith = "^0.1.0" packaging = "^23.2" @@ -1808,6 +1818,24 @@ extended-testing = ["jinja2 (>=3,<4)"] type = "directory" url = "../core" +[[package]] +name = "langchain-openai" +version = "0.1.1" +description = "An integration package connecting OpenAI and LangChain" +optional = false +python-versions = ">=3.8.1,<4.0" +files = [] +develop = true + +[package.dependencies] +langchain-core = "^0.1.33" +openai = "^1.10.0" +tiktoken = ">=0.5.2,<1" + +[package.source] +type = "directory" +url = "../partners/openai" + [[package]] name = "langchain-text-splitters" version = "0.0.1" @@ -2516,6 +2544,29 @@ files = [ {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, ] +[[package]] +name = "openai" +version = "1.14.3" +description = "The official Python library for the openai API" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-1.14.3-py3-none-any.whl", hash = "sha256:7a465994a7ccf677a110c6cc2ef9d86229bad42c060b585b67049aa749f3b774"}, + {file = "openai-1.14.3.tar.gz", hash = "sha256:37b514e9c0ff45383ec9b242abd0f7859b1080d4b54b61393ed341ecad1b8eb9"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.7,<5" + +[package.extras] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] + [[package]] name = "orjson" version = "3.9.15" @@ -3531,7 +3582,7 @@ rpds-py = ">=0.7.0" name = "regex" version = "2023.12.25" description = "Alternative regular expression module, to replace re." 
-optional = true +optional = false python-versions = ">=3.7" files = [ {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5"}, @@ -4571,6 +4622,58 @@ files = [ {file = "threadpoolctl-3.3.0.tar.gz", hash = "sha256:5dac632b4fa2d43f42130267929af3ba01399ef4bd1882918e92dbc30365d30c"}, ] +[[package]] +name = "tiktoken" +version = "0.6.0" +description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tiktoken-0.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:277de84ccd8fa12730a6b4067456e5cf72fef6300bea61d506c09e45658d41ac"}, + {file = "tiktoken-0.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9c44433f658064463650d61387623735641dcc4b6c999ca30bc0f8ba3fccaf5c"}, + {file = "tiktoken-0.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afb9a2a866ae6eef1995ab656744287a5ac95acc7e0491c33fad54d053288ad3"}, + {file = "tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c62c05b3109fefca26fedb2820452a050074ad8e5ad9803f4652977778177d9f"}, + {file = "tiktoken-0.6.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0ef917fad0bccda07bfbad835525bbed5f3ab97a8a3e66526e48cdc3e7beacf7"}, + {file = "tiktoken-0.6.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e095131ab6092d0769a2fda85aa260c7c383072daec599ba9d8b149d2a3f4d8b"}, + {file = "tiktoken-0.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:05b344c61779f815038292a19a0c6eb7098b63c8f865ff205abb9ea1b656030e"}, + {file = "tiktoken-0.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cefb9870fb55dca9e450e54dbf61f904aab9180ff6fe568b61f4db9564e78871"}, + {file = "tiktoken-0.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:702950d33d8cabc039845674107d2e6dcabbbb0990ef350f640661368df481bb"}, + {file = "tiktoken-0.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8d49d076058f23254f2aff9af603863c5c5f9ab095bc896bceed04f8f0b013a"}, + {file = "tiktoken-0.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:430bc4e650a2d23a789dc2cdca3b9e5e7eb3cd3935168d97d43518cbb1f9a911"}, + {file = "tiktoken-0.6.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:293cb8669757301a3019a12d6770bd55bec38a4d3ee9978ddbe599d68976aca7"}, + {file = "tiktoken-0.6.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7bd1a288b7903aadc054b0e16ea78e3171f70b670e7372432298c686ebf9dd47"}, + {file = "tiktoken-0.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac76e000183e3b749634968a45c7169b351e99936ef46f0d2353cd0d46c3118d"}, + {file = "tiktoken-0.6.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:17cc8a4a3245ab7d935c83a2db6bb71619099d7284b884f4b2aea4c74f2f83e3"}, + {file = "tiktoken-0.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:284aebcccffe1bba0d6571651317df6a5b376ff6cfed5aeb800c55df44c78177"}, + {file = "tiktoken-0.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c1a3a5d33846f8cd9dd3b7897c1d45722f48625a587f8e6f3d3e85080559be8"}, + {file = "tiktoken-0.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6318b2bb2337f38ee954fd5efa82632c6e5ced1d52a671370fa4b2eff1355e91"}, + {file = "tiktoken-0.6.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1f5f0f2ed67ba16373f9a6013b68da298096b27cd4e1cf276d2d3868b5c7efd1"}, + {file = "tiktoken-0.6.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:75af4c0b16609c2ad02581f3cdcd1fb698c7565091370bf6c0cf8624ffaba6dc"}, + {file = "tiktoken-0.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:45577faf9a9d383b8fd683e313cf6df88b6076c034f0a16da243bb1c139340c3"}, + {file = "tiktoken-0.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7c1492ab90c21ca4d11cef3a236ee31a3e279bb21b3fc5b0e2210588c4209e68"}, + {file = "tiktoken-0.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e2b380c5b7751272015400b26144a2bab4066ebb8daae9c3cd2a92c3b508fe5a"}, + {file = "tiktoken-0.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9f497598b9f58c99cbc0eb764b4a92272c14d5203fc713dd650b896a03a50ad"}, + {file = "tiktoken-0.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e65e8bd6f3f279d80f1e1fbd5f588f036b9a5fa27690b7f0cc07021f1dfa0839"}, + {file = "tiktoken-0.6.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5f1495450a54e564d236769d25bfefbf77727e232d7a8a378f97acddee08c1ae"}, + {file = "tiktoken-0.6.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6c4e4857d99f6fb4670e928250835b21b68c59250520a1941618b5b4194e20c3"}, + {file = "tiktoken-0.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:168d718f07a39b013032741867e789971346df8e89983fe3c0ef3fbd5a0b1cb9"}, + {file = "tiktoken-0.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:47fdcfe11bd55376785a6aea8ad1db967db7f66ea81aed5c43fad497521819a4"}, + {file = "tiktoken-0.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fb7d2ccbf1a7784810aff6b80b4012fb42c6fc37eaa68cb3b553801a5cc2d1fc"}, + {file = "tiktoken-0.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ccb7a111ee76af5d876a729a347f8747d5ad548e1487eeea90eaf58894b3138"}, + {file = "tiktoken-0.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2048e1086b48e3c8c6e2ceeac866561374cd57a84622fa49a6b245ffecb7744"}, + {file = "tiktoken-0.6.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:07f229a5eb250b6403a61200199cecf0aac4aa23c3ecc1c11c1ca002cbb8f159"}, + {file = "tiktoken-0.6.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:432aa3be8436177b0db5a2b3e7cc28fd6c693f783b2f8722539ba16a867d0c6a"}, + {file = "tiktoken-0.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:8bfe8a19c8b5c40d121ee7938cd9c6a278e5b97dc035fd61714b4f0399d2f7a1"}, + {file = "tiktoken-0.6.0.tar.gz", hash = "sha256:ace62a4ede83c75b0374a2ddfa4b76903cf483e9cb06247f566be3bf14e6beed"}, +] + +[package.dependencies] +regex = ">=2022.1.18" +requests = ">=2.26.0" + +[package.extras] +blobfile = ["blobfile (>=2)"] + [[package]] name = "tinycss2" version = "1.2.1" @@ -4829,7 +4932,7 @@ files = [ name = "tqdm" version = "4.66.2" description = "Fast, Extensible Progress Meter" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "tqdm-4.66.2-py3-none-any.whl", hash = "sha256:1ee4f8a893eb9bef51c6e35730cebf234d5d0b6bd112b0271e10ed7c24a02bd9"}, @@ -5327,4 +5430,4 @@ extended-testing = ["faker", "jinja2", "pandas", "presidio-analyzer", "presidio- [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "ac0774f64c323c9c8536eef0d4928ed3e9f7f5b5b99a8b6a7bab37c8bed80b57" +content-hash = "c0251b06d3f8c3df0d7aaa7e142c5f1ce450bf88ff37690bd36cb6074eb803be" diff --git a/libs/experimental/pyproject.toml b/libs/experimental/pyproject.toml index b150b31765..e3cf81dfc5 100644 --- a/libs/experimental/pyproject.toml +++ b/libs/experimental/pyproject.toml @@ -69,6 +69,7 @@ optional = true langchain = {path = "../langchain", develop = true} langchain-core = 
{path = "../core", develop = true} langchain-community = {path = "../community", develop = true} +langchain-openai = {path = "../partners/openai", develop = true} # An extra used to be able to add extended testing. # Please use new-line on formatting to make it easier to add new packages without diff --git a/libs/experimental/tests/integration_tests/test_video_captioning.py b/libs/experimental/tests/integration_tests/test_video_captioning.py new file mode 100644 index 0000000000..dbfcfaef2f --- /dev/null +++ b/libs/experimental/tests/integration_tests/test_video_captioning.py @@ -0,0 +1,28 @@ +"""Integration test for video captioning.""" +from langchain_openai import ChatOpenAI + +from langchain_experimental.video_captioning.base import VideoCaptioningChain + + +def test_video_captioning_hard() -> None: + """Test input that is considered hard for this chain to process.""" + URL = """ + https://ia904700.us.archive.org/22/items/any-chibes/X2Download.com + -FXX%20USA%20%C2%ABPromo%20Noon%20-%204A%20Every%20Day%EF%BF%BD%EF + %BF%BD%C2%BB%20November%202021%EF%BF%BD%EF%BF%BD-%281080p60%29.mp4 + """ + chain = VideoCaptioningChain( + llm=ChatOpenAI( + model="gpt-4", + max_tokens=4000, + ) + ) + srt_content = chain.run(video_file_path=URL) + + assert ( + "mustache" in srt_content + and "Any chives?" in srt_content + and "How easy? A little tighter." in srt_content + and "it's a little tight in" in srt_content + and "every day" in srt_content + )