community[minor]: add the ability to load existing transcripts from AssemblyAI by their id. (#16051)

- **Description:** the existing AssemblyAI API allows passing a path or
a URL to transcribe an audio file and turn it into LangChain Documents;
this PR allows getting existing transcripts by their transcript id and
turning them into Documents.
  - **Issue:** not related to an existing issue
  - **Dependencies:** requests

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
pull/16808/head
Raphael 5 months ago committed by GitHub
parent daf820c77b
commit bf9068516e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -3,6 +3,7 @@ from __future__ import annotations
from enum import Enum
from typing import TYPE_CHECKING, List, Optional
import requests
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
@ -110,3 +111,111 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
return [Document(page_content=transcript.export_subtitles_vtt())]
else:
raise ValueError("Unknown transcript format.")
class AssemblyAIAudioLoaderById(BaseLoader):
    """Loader for existing AssemblyAI audio transcripts.

    It uses the AssemblyAI HTTP API to fetch a transcription that already
    exists (identified by its transcript id) and loads the transcribed text
    into one or more Documents, depending on the specified format.
    """

    # Common prefix of every transcript endpoint queried by this loader.
    _API_BASE_URL = "https://api.assemblyai.com/v2/transcript"

    def __init__(
        self,
        transcript_id: str,
        api_key: str,
        transcript_format: TranscriptFormat,
    ):
        """Initialize the AssemblyAIAudioLoaderById.

        Args:
            transcript_id: Id of an existing transcription.
            api_key: AssemblyAI API key, sent as the ``authorization`` header.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
        """
        self.api_key = api_key
        self.transcript_id = transcript_id
        self.transcript_format = transcript_format

    def _get(self, suffix: str = "") -> requests.Response:
        """Issue an authenticated GET for this transcript and return the response.

        Args:
            suffix: Path appended after the transcript id (for example
                ``"/srt"``); empty for the transcript resource itself.

        Returns:
            The response, after ``raise_for_status()`` has succeeded.

        Raises:
            requests.exceptions.RequestException: If the request fails or the
                server answers with an error status. The error is logged and
                re-raised unchanged.
        """
        import logging

        url = f"{self._API_BASE_URL}/{self.transcript_id}{suffix}"
        try:
            # NOTE(review): no timeout is passed, so a stalled connection can
            # block indefinitely -- consider exposing one.
            response = requests.get(url, headers={"authorization": self.api_key})
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Log instead of printing: library code should not write to stdout.
            logging.getLogger(__name__).error("An error occurred: %s", e)
            raise
        return response

    def load(self) -> List[Document]:
        """Load data into Document objects.

        Returns:
            * ``TEXT``: a single Document whose content is the ``"text"`` field
              of the transcript JSON and whose metadata is the whole JSON body.
            * ``PARAGRAPHS`` / ``SENTENCES``: one Document per returned item,
              with the item's ``"text"`` as content and the item as metadata.
            * ``SUBTITLES_SRT`` / ``SUBTITLES_VTT``: a single Document holding
              the raw response body, without metadata.

        Raises:
            ValueError: If ``transcript_format`` is not a known format.
            requests.exceptions.RequestException: If the HTTP request fails.
        """
        transcript_format = self.transcript_format

        if transcript_format == TranscriptFormat.TEXT:
            # Parse the body once and reuse it for both content and metadata.
            transcript = self._get().json()
            return [Document(page_content=transcript["text"], metadata=transcript)]

        if transcript_format == TranscriptFormat.PARAGRAPHS:
            paragraphs = self._get("/paragraphs").json()["paragraphs"]
            return [Document(page_content=p["text"], metadata=p) for p in paragraphs]

        if transcript_format == TranscriptFormat.SENTENCES:
            sentences = self._get("/sentences").json()["sentences"]
            return [Document(page_content=s["text"], metadata=s) for s in sentences]

        if transcript_format == TranscriptFormat.SUBTITLES_SRT:
            return [Document(page_content=self._get("/srt").text)]

        if transcript_format == TranscriptFormat.SUBTITLES_VTT:
            return [Document(page_content=self._get("/vtt").text)]

        raise ValueError("Unknown transcript format.")

Loading…
Cancel
Save