# Added by langchain PR #16051 (commit bf9068516ea302025ad585e22625fabd87841503):
# load existing AssemblyAI transcripts by their id and turn them into Documents.
class AssemblyAIAudioLoaderById(BaseLoader):
    """Loader for existing AssemblyAI audio transcripts.

    It uses the AssemblyAI REST API to fetch an already existing
    transcription by its id and loads the transcribed text into one or
    more Documents, depending on the specified format.
    """

    # Every endpoint used by this loader lives below this URL, followed by
    # the transcript id and an optional format-specific suffix.
    _API_BASE_URL = "https://api.assemblyai.com/v2/transcript"

    def __init__(
        self,
        transcript_id: str,
        api_key: str,
        transcript_format: TranscriptFormat,
        *,
        timeout: Optional[float] = 30.0,
    ) -> None:
        """Initialize the AssemblyAIAudioLoaderById.

        Args:
            transcript_id: Id of an existing transcription.
            api_key: AssemblyAI API key.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
            timeout: Seconds to wait for the AssemblyAI API before giving
                up. Pass ``None`` to wait indefinitely.
        """
        self.api_key = api_key
        self.transcript_id = transcript_id
        self.transcript_format = transcript_format
        self.timeout = timeout

    def _fetch(self, endpoint: str = "") -> requests.Response:
        """Send an authenticated GET request for this transcript.

        Args:
            endpoint: Path suffix appended after the transcript id, e.g.
                ``"/paragraphs"``. Empty for the transcript itself.

        Returns:
            The successful HTTP response.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            requests.RequestException: On connection problems or timeout.
        """
        response = requests.get(
            f"{self._API_BASE_URL}/{self.transcript_id}{endpoint}",
            headers={"authorization": self.api_key},
            # Without a timeout a stalled connection would block forever.
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response

    def load(self) -> List[Document]:
        """Load data into Document objects.

        Returns:
            A single Document for the ``TEXT``, ``SUBTITLES_SRT`` and
            ``SUBTITLES_VTT`` formats; one Document per paragraph or
            sentence for ``PARAGRAPHS`` and ``SENTENCES``.

        Raises:
            ValueError: If ``transcript_format`` is not a known format.
                No request is sent in that case.
            requests.RequestException: If the HTTP request fails or the
                API answers with an error status.
        """
        transcript_format = self.transcript_format

        if transcript_format == TranscriptFormat.TEXT:
            # Parse the body once; it supplies both the text and the metadata.
            transcript = self._fetch().json()
            return [Document(page_content=transcript["text"], metadata=transcript)]

        if transcript_format == TranscriptFormat.PARAGRAPHS:
            paragraphs = self._fetch("/paragraphs").json()["paragraphs"]
            return [Document(page_content=p["text"], metadata=p) for p in paragraphs]

        if transcript_format == TranscriptFormat.SENTENCES:
            sentences = self._fetch("/sentences").json()["sentences"]
            return [Document(page_content=s["text"], metadata=s) for s in sentences]

        if transcript_format == TranscriptFormat.SUBTITLES_SRT:
            # Subtitle endpoints answer with plain text, not JSON.
            return [Document(page_content=self._fetch("/srt").text)]

        if transcript_format == TranscriptFormat.SUBTITLES_VTT:
            return [Document(page_content=self._fetch("/vtt").text)]

        raise ValueError("Unknown transcript format.")