You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/experimental/langchain_experimental/video_captioning/services/caption_service.py

280 lines
11 KiB
Python

from typing import Dict, List, Optional, Tuple
from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain.chains.llm import LLMChain
from langchain_core.language_models import BaseLanguageModel
from langchain_experimental.video_captioning.models import VideoModel
from langchain_experimental.video_captioning.prompts import (
JOIN_SIMILAR_VIDEO_MODELS_PROMPT,
REMOVE_VIDEO_MODEL_DESCRIPTION_PROMPT,
)
class CaptionProcessor:
    """Merge similar video-frame descriptions into closed-caption models.

    The processor removes consecutive duplicate descriptions, groups the
    remaining video models into clusters of similar descriptions, asks the
    LLM to combine each cluster into a single description, and strips
    prefixes that are redundant in a closed-caption context.
    """

    def __init__(
        self,
        llm: BaseLanguageModel,
        verbose: bool = True,
        similarity_threshold: int = 80,
        use_unclustered_models: bool = False,
    ) -> None:
        """Store the LLM and the clustering configuration.

        Args:
            llm: Language model used by the combine/strip chains.
            verbose: Forwarded to every ``LLMChain`` built by this processor.
            similarity_threshold: Minimum similarity score two video model
                image descriptions need in order to be clustered together
                (presumably a percentage; the scale is defined by
                ``VideoModel.similarity_score`` -- verify there).
            use_unclustered_models: If True, video models that did not end
                up in any cluster are kept in the output. Will likely result
                in closed-caption artifacts.
        """
        self.llm = llm
        self.verbose = verbose
        self._SIMILARITY_THRESHOLD = similarity_threshold
        self._USE_NON_CLUSTERED_VIDEO_MODELS = use_unclustered_models

    def process(
        self,
        video_models: List[VideoModel],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> List[VideoModel]:
        """Cluster ``video_models`` and return one combined model per cluster.

        Args:
            video_models: Video models to process, in playback order.
            run_manager: Optional callback manager; a child manager is handed
                to each LLM chain that is run.

        Returns:
            The combined video models, one per cluster. Non-clustered models
            are interleaved in their original positions only when the
            processor was created with ``use_unclustered_models=True``;
            otherwise they are dropped.
        """
        video_models = self._remove_consecutive_duplicates(video_models)

        video_models_post_clustering: List[VideoModel] = []
        # Index of the first video model that follows the previous cluster
        index = 0
        for start, end in self._get_model_clusters(video_models):
            start_vm, end_vm = video_models[start], video_models[end]

            if self._USE_NON_CLUSTERED_VIDEO_MODELS:
                # Keep the non-clustered models sitting between two clusters
                video_models_post_clustering += video_models[index:start]
            index = end + 1

            # Send the whole cluster to the LLM for description combination
            models_to_combine = video_models[start:index]
            combined_description = self._join_similar_video_models(
                models_to_combine, run_manager
            )

            # Strip any prefixes that are redundant in the context of
            # closed-captions
            stripped_description = self._remove_video_model_description_prefix(
                combined_description, run_manager
            )

            # The combined model spans from the start of the first model in
            # the cluster to the end of the last one
            video_models_post_clustering.append(
                VideoModel(
                    start_vm.start_time, end_vm.end_time, stripped_description
                )
            )

        if self._USE_NON_CLUSTERED_VIDEO_MODELS:
            # Keep any non-clustered models located after the last cluster
            video_models_post_clustering += video_models[index:]

        return video_models_post_clustering

    def _remove_consecutive_duplicates(
        self,
        video_models: List[VideoModel],
    ) -> List[VideoModel]:
        """Collapse runs of models that share the same image description.

        NOTE: the first model of each run is reused and its ``end_time`` is
        overwritten in place, so the caller's objects are mutated.

        Args:
            video_models: Video models in playback order.

        Returns:
            A new list with one model per run of identical descriptions.
        """
        buffer: List[VideoModel] = []
        for video_model in video_models:
            if (
                buffer
                and buffer[-1].image_description == video_model.image_description
            ):
                # Same description as the previous model: extend it instead
                # of keeping a duplicate entry
                buffer[-1].end_time = video_model.end_time
            else:
                buffer.append(video_model)
        return buffer

    def _remove_video_model_description_prefix(
        self, description: str, run_manager: Optional[CallbackManagerForChainRun] = None
    ) -> str:
        """Ask the LLM to strip caption-redundant prefixes from ``description``.

        Args:
            description: Combined description of a cluster.
            run_manager: Optional callback manager for the chain run.

        Returns:
            The chain's ``"text"`` output with every ``"Result:"`` marker
            removed and surrounding whitespace stripped.
        """
        conversation = LLMChain(
            llm=self.llm,
            prompt=REMOVE_VIDEO_MODEL_DESCRIPTION_PROMPT,
            # Honour the verbosity requested at construction time
            verbose=self.verbose,
            callbacks=run_manager.get_child() if run_manager else None,
        )
        response = conversation({"description": description})
        # Take out the "Result:" part of the response
        return response["text"].replace("Result:", "").strip()

    def _join_similar_video_models(
        self,
        video_models: List[VideoModel],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> str:
        """Ask the LLM to merge the descriptions of one cluster into one.

        Args:
            video_models: The video models belonging to a single cluster.
            run_manager: Optional callback manager for the chain run.

        Returns:
            The chain's ``"text"`` output with every ``"Result:"`` marker
            removed and surrounding whitespace stripped.
        """
        # Produces "Description 1: <text>, Description 2: <text>, ..."
        descriptions = ", ".join(
            f"Description {count}: {video_model.image_description}"
            for count, video_model in enumerate(video_models, start=1)
        )
        conversation = LLMChain(
            llm=self.llm,
            prompt=JOIN_SIMILAR_VIDEO_MODELS_PROMPT,
            # Honour the verbosity requested at construction time
            verbose=self.verbose,
            callbacks=run_manager.get_child() if run_manager else None,
        )
        response = conversation({"descriptions": descriptions})
        # Take out the "Result:" part of the response
        return response["text"].replace("Result:", "").strip()

    def _get_model_clusters(
        self, video_models: List[VideoModel]
    ) -> List[Tuple[int, int]]:
        """Find index ranges of ``video_models`` with similar descriptions.

        Args:
            video_models: Video models in playback order.

        Returns:
            A list of inclusive ``(start, end)`` index pairs, sorted by
            ``start``, in which no two pairs share an index.
        """

        def format_word(w: str) -> str:
            # Lowercase (case-insensitive) and drop trailing s's
            # (singular/plural-insensitive)
            return w.lower().rstrip("s")

        # Maps each formatted word to the indexes of the video models whose
        # description contains it (one entry per occurrence)
        word_bank: Dict[str, List[int]] = {}
        for index, vm in enumerate(video_models):
            for word in vm.image_description.split():
                word_bank.setdefault(format_word(word), []).append(index)

        # Maps video model index to the other video model indexes whose
        # similarity score reaches the threshold
        sims: Dict[int, List[int]] = {}
        for index, vm in enumerate(video_models):
            # Maps other video model index to the number of word occurrences
            # it shares with this video model
            matches: Dict[int, int] = {}
            for word in vm.image_description.split():
                for match in word_bank[format_word(word)]:
                    if match != index:
                        matches[match] = matches.get(match, 0) + 1
            if not matches:
                continue

            # Only the models sharing the highest number of words are scored
            max_words_in_common = max(matches.values())
            sim_scores: Dict[int, float] = {
                vm_index: video_models[vm_index].similarity_score(vm)
                for vm_index, shared in matches.items()
                if shared == max_words_in_common
            }

            # Keep every model tied for the best score, provided that score
            # is high enough
            max_score = max(sim_scores.values())
            if max_score >= self._SIMILARITY_THRESHOLD:
                sims[index] = [
                    vm_index
                    for vm_index, score in sim_scores.items()
                    if score == max_score
                ]

        # Video model indexes whose similarity list has already been
        # expanded, so that the recursion below terminates
        already_accessed = set()

        def _find_start_and_end(vm_index: int) -> Tuple[int, int]:
            # Recursively follow mutual similarity matches to find the
            # earliest and latest video model of the cluster. A model that
            # was already expanded only contributes its direct matches.
            close_matches = sims[vm_index]
            first_vm = min(vm_index, *close_matches)
            last_vm = max(vm_index, *close_matches)
            if vm_index not in already_accessed:
                already_accessed.add(vm_index)
                for close_match in close_matches:
                    if close_match in sims and vm_index in sims[close_match]:
                        s, e = _find_start_and_end(close_match)
                        first_vm = min(s, first_vm)
                        last_vm = max(e, last_vm)
            return first_vm, last_vm

        clusters = {_find_start_and_end(vm_index) for vm_index in sims}

        # Drop every interval that is fully contained in another interval
        filtered_clusters = {
            interval
            for interval in clusters
            if not any(
                interval[0] >= other_start and interval[1] <= other_end
                for other_start, other_end in clusters
                if interval != (other_start, other_end)
            )
        }

        # Merge overlapping clusters in a single pass over the intervals
        # sorted by start index. Iterating (rather than recursing once per
        # interval) keeps this linear and immune to the recursion limit.
        merged_clusters: List[Tuple[int, int]] = []
        for start, end in sorted(filtered_clusters):
            if merged_clusters and start <= merged_clusters[-1][1]:
                prev_start, prev_end = merged_clusters[-1]
                merged_clusters[-1] = (prev_start, max(prev_end, end))
            else:
                merged_clusters.append((start, end))
        return merged_clusters