community[minor]: add the ability to load existing transcripts from AssemblyAI by their id. (#16051)

- **Description:** The existing AssemblyAI API allows passing a path or a URL to transcribe an audio file and turn it into LangChain Documents; this PR allows getting an existing transcript by its transcript id and turning it into Documents. - **Issue:** not related to an existing issue - **Dependencies:** requests --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
5 months ago · bf9068516e
parent daf820c77b
commit bf9068516e
1 changed files with 109 additions and 0 deletions
--- a/libs/community/langchain_community/document_loaders/assemblyai.py
+++ b/libs/community/langchain_community/document_loaders/assemblyai.py
@ -3,6 +3,7 @@ from __future__ import annotations
 from enum import Enum
 from typing import TYPE_CHECKING, List, Optional

+import requests
 from langchain_core.documents import Document

 from langchain_community.document_loaders.base import BaseLoader
@ -110,3 +111,111 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
            return [Document(page_content=transcript.export_subtitles_vtt())]
        else:
            raise ValueError("Unknown transcript format.")
+
+
+class AssemblyAIAudioLoaderById(BaseLoader):
+    """
+    Loader for existing AssemblyAI audio transcripts.
+
+    It uses the AssemblyAI REST API to fetch a transcription that already
+    exists, identified by its transcript id, and loads the transcribed text
+    into one or more Documents, depending on the specified format.
+    """
+
+    # Base endpoint; the transcript id and an optional format suffix are appended.
+    _TRANSCRIPT_URL = "https://api.assemblyai.com/v2/transcript"
+
+    def __init__(
+        self,
+        transcript_id: str,
+        api_key: str,
+        transcript_format: TranscriptFormat,
+        *,
+        timeout: Optional[float] = None,
+    ) -> None:
+        """
+        Initializes the AssemblyAIAudioLoaderById.
+
+        Args:
+            transcript_id: Id of an existing transcription.
+            api_key: AssemblyAI API key.
+            transcript_format: Transcript format to use.
+                See class ``TranscriptFormat`` for more info.
+            timeout: Optional timeout in seconds forwarded to ``requests.get``.
+                The default ``None`` applies no timeout, which matches the
+                behaviour before this parameter existed.
+        """
+        self.api_key = api_key
+        self.transcript_id = transcript_id
+        self.transcript_format = transcript_format
+        self.timeout = timeout
+
+    def _fetch(self, suffix: str = "") -> requests.Response:
+        """Send an authorized GET request for this transcript.
+
+        Args:
+            suffix: Path appended after the transcript id to select a
+                representation, e.g. ``"/srt"``. Empty for the transcript itself.
+
+        Returns:
+            The successful HTTP response.
+
+        Raises:
+            requests.HTTPError: If the API answers with an error status code.
+        """
+        response = requests.get(
+            f"{self._TRANSCRIPT_URL}/{self.transcript_id}{suffix}",
+            headers={"authorization": self.api_key},
+            timeout=self.timeout,
+        )
+        # Fail loudly on 4xx/5xx instead of trying to parse an error payload.
+        response.raise_for_status()
+        return response
+
+    def load(self) -> List[Document]:
+        """Load data into Document objects.
+
+        Returns:
+            A single Document for the ``TEXT``, ``SUBTITLES_SRT`` and
+            ``SUBTITLES_VTT`` formats, or one Document per paragraph/sentence
+            for the ``PARAGRAPHS`` and ``SENTENCES`` formats. For ``TEXT``,
+            ``PARAGRAPHS`` and ``SENTENCES`` the JSON object the text came
+            from is attached as metadata.
+
+        Raises:
+            ValueError: If ``transcript_format`` is not a known format. No
+                request is sent in that case.
+            requests.HTTPError: If the API answers with an error status code.
+        """
+        fmt = self.transcript_format
+
+        if fmt == TranscriptFormat.TEXT:
+            # Parse the body once and reuse it for both content and metadata.
+            payload = self._fetch().json()
+            return [Document(page_content=payload["text"], metadata=payload)]
+
+        if fmt == TranscriptFormat.PARAGRAPHS:
+            paragraphs = self._fetch("/paragraphs").json()["paragraphs"]
+            return [Document(page_content=p["text"], metadata=p) for p in paragraphs]
+
+        if fmt == TranscriptFormat.SENTENCES:
+            sentences = self._fetch("/sentences").json()["sentences"]
+            return [Document(page_content=s["text"], metadata=s) for s in sentences]
+
+        if fmt == TranscriptFormat.SUBTITLES_SRT:
+            # Subtitle endpoints return plain text rather than JSON.
+            return [Document(page_content=self._fetch("/srt").text)]
+
+        if fmt == TranscriptFormat.SUBTITLES_VTT:
+            return [Document(page_content=self._fetch("/vtt").text)]
+
+        raise ValueError("Unknown transcript format.")