From bf9068516ea302025ad585e22625fabd87841503 Mon Sep 17 00:00:00 2001
From: Raphael <raph.favero@gmail.com>
Date: Tue, 30 Jan 2024 22:47:45 +0100
Subject: [PATCH] community[minor]: add the ability to load existing
 transcripts from AssemblyAI by their id. (#16051)

- **Description:** the existing AssemblyAI API allows passing a path or
a URL to transcribe an audio file and turn it into LangChain Documents;
this PR adds the ability to fetch existing transcripts by their
transcript id and turn them into Documents.
  - **Issue:** not related to an existing issue
  - **Dependencies:** requests

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
---
 .../document_loaders/assemblyai.py            | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/libs/community/langchain_community/document_loaders/assemblyai.py b/libs/community/langchain_community/document_loaders/assemblyai.py
index 0dd64256ab..d3947d9f71 100644
--- a/libs/community/langchain_community/document_loaders/assemblyai.py
+++ b/libs/community/langchain_community/document_loaders/assemblyai.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from enum import Enum
 from typing import TYPE_CHECKING, List, Optional
 
+import requests
 from langchain_core.documents import Document
 
 from langchain_community.document_loaders.base import BaseLoader
@@ -110,3 +111,111 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
             return [Document(page_content=transcript.export_subtitles_vtt())]
         else:
             raise ValueError("Unknown transcript format.")
+
+
+class AssemblyAIAudioLoaderById(BaseLoader):
+    """
+    Loader for existing AssemblyAI audio transcripts.
+
+    It uses the AssemblyAI REST API to fetch an existing transcription
+    by its id and loads the transcribed text into one or more Documents,
+    depending on the specified format.
+
+    It talks to the API directly through ``requests`` and only issues
+    GET requests; it never submits audio for transcription.
+    """
+
+    # Root endpoint shared by every transcript sub-resource.
+    _BASE_URL = "https://api.assemblyai.com/v2/transcript"
+
+    def __init__(
+        self,
+        transcript_id: str,
+        api_key: str,
+        transcript_format: TranscriptFormat,
+        *,
+        timeout: Optional[float] = None,
+    ):
+        """
+        Initializes the AssemblyAIAudioLoaderById.
+
+        Args:
+            transcript_id: Id of an existing transcription.
+            api_key: AssemblyAI API key.
+            transcript_format: Transcript format to use.
+                See class ``TranscriptFormat`` for more info.
+            timeout: Optional timeout in seconds passed to ``requests.get``.
+                ``None`` (the default) applies no timeout.
+        """
+        self.api_key = api_key
+        self.transcript_id = transcript_id
+        self.transcript_format = transcript_format
+        self.timeout = timeout
+
+    def _get(self, suffix: str = "") -> requests.Response:
+        """
+        Fetch ``{_BASE_URL}/{transcript_id}{suffix}`` from AssemblyAI.
+
+        Args:
+            suffix: Sub-resource path such as ``"/srt"``; empty for the
+                transcript itself.
+
+        Returns:
+            The successful HTTP response.
+
+        Raises:
+            Exception: Whatever ``requests`` raises for a failed request
+                or an HTTP error status, after printing it.
+        """
+        try:
+            response = requests.get(
+                f"{self._BASE_URL}/{self.transcript_id}{suffix}",
+                headers={"authorization": self.api_key},
+                timeout=self.timeout,
+            )
+            response.raise_for_status()
+        except Exception as e:
+            # Kept for backwards compatibility: report the failure on stdout,
+            # then let the original exception propagate to the caller.
+            print(f"An error occurred: {e}")
+            raise
+        return response
+
+    def load(self) -> List[Document]:
+        """
+        Load the transcript into Document objects.
+
+        Returns:
+            A single Document for the ``TEXT``, ``SUBTITLES_SRT`` and
+            ``SUBTITLES_VTT`` formats; one Document per paragraph or
+            sentence for ``PARAGRAPHS`` and ``SENTENCES``.
+
+        Raises:
+            ValueError: If ``transcript_format`` is not a known format.
+            Exception: Any error raised by the underlying HTTP request.
+        """
+        fmt = self.transcript_format
+
+        if fmt == TranscriptFormat.TEXT:
+            # Parse the body once; it is both the text source and the metadata.
+            transcript = self._get().json()
+            return [Document(page_content=transcript["text"], metadata=transcript)]
+
+        # Each paragraph/sentence payload doubles as its Document's metadata.
+        if fmt == TranscriptFormat.PARAGRAPHS:
+            paragraphs = self._get("/paragraphs").json()["paragraphs"]
+            return [Document(page_content=p["text"], metadata=p) for p in paragraphs]
+
+        if fmt == TranscriptFormat.SENTENCES:
+            sentences = self._get("/sentences").json()["sentences"]
+            return [Document(page_content=s["text"], metadata=s) for s in sentences]
+
+        # Subtitle formats are read from the raw response body, not JSON.
+        if fmt == TranscriptFormat.SUBTITLES_SRT:
+            return [Document(page_content=self._get("/srt").text)]
+
+        if fmt == TranscriptFormat.SUBTITLES_VTT:
+            return [Document(page_content=self._get("/vtt").text)]
+
+        # No request is made for a format this loader does not know.
+        raise ValueError("Unknown transcript format.")