community[minor]: add the ability to load existing transcripts from AssemblyAI by their id. (#16051)

- **Description:** The existing AssemblyAI API allows passing a path or a URL to transcribe an audio file and turn it into LangChain Documents; this PR allows getting existing transcripts by their transcript ID and turning them into Documents. - **Issue:** not related to an existing issue - **Dependencies:** requests --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
8 months ago · bf9068516e
parent daf820c77b
commit bf9068516e
1 changed files with 109 additions and 0 deletions
--- a/libs/community/langchain_community/document_loaders/assemblyai.py
+++ b/libs/community/langchain_community/document_loaders/assemblyai.py
@ -3,6 +3,7 @@ from __future__ import annotations
 from enum import Enum
 from typing import TYPE_CHECKING, List, Optional
 import requests
 from langchain_core.documents import Document
 from langchain_community.document_loaders.base import BaseLoader
@ -110,3 +111,111 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
            return [Document(page_content=transcript.export_subtitles_vtt())]
        else:
            raise ValueError("Unknown transcript format.")
 class AssemblyAIAudioLoaderById(BaseLoader):
    """
    Loader for existing AssemblyAI audio transcripts.

    It uses the AssemblyAI REST API to fetch a transcription that already
    exists (identified by its transcript id) and loads the transcribed text
    into one or more Documents, depending on the specified format.
    """

    def __init__(
        self,
        transcript_id: str,
        api_key: str,
        transcript_format: TranscriptFormat,
        timeout: Optional[float] = 30.0,
    ) -> None:
        """
        Initializes the AssemblyAIAudioLoaderById.

        Args:
            transcript_id: Id of an existing transcription.
            api_key: AssemblyAI API key, sent as the ``authorization`` header.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
            timeout: Seconds to wait for the API before giving up. ``None``
                disables the timeout (the request may then block forever).
        """
        self.api_key = api_key
        self.transcript_id = transcript_id
        self.transcript_format = transcript_format
        self.timeout = timeout

    def _get(self, suffix: str = "") -> requests.Response:
        """Fetch a resource of this transcript, raising on HTTP error statuses.

        Args:
            suffix: Path appended after the transcript id, e.g. ``"/srt"``.
                Empty for the transcript object itself.

        Returns:
            The successful HTTP response.

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
            requests.RequestException: On connection failures or timeouts.
        """
        response = requests.get(
            f"https://api.assemblyai.com/v2/transcript/{self.transcript_id}{suffix}",
            headers={"authorization": self.api_key},
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response

    def load(self) -> List[Document]:
        """Load data into Document objects.

        Returns:
            - ``TEXT``: a single Document holding the full text, with the
              whole transcript JSON object as metadata.
            - ``PARAGRAPHS`` / ``SENTENCES``: one Document per returned item,
              with that item's JSON object as metadata.
            - ``SUBTITLES_SRT`` / ``SUBTITLES_VTT``: a single Document holding
              the raw subtitle text, without metadata.

        Raises:
            ValueError: If ``transcript_format`` is not a known format. No
                request is made in that case.
            requests.HTTPError: If the API answers with a 4xx/5xx status.
            requests.RequestException: On connection failures or timeouts.
        """
        fmt = self.transcript_format

        if fmt == TranscriptFormat.TEXT:
            # Parse the body once and reuse it for both content and metadata.
            payload = self._get().json()
            return [Document(page_content=payload["text"], metadata=payload)]

        if fmt == TranscriptFormat.PARAGRAPHS:
            paragraphs = self._get("/paragraphs").json()["paragraphs"]
            return [Document(page_content=p["text"], metadata=p) for p in paragraphs]

        if fmt == TranscriptFormat.SENTENCES:
            sentences = self._get("/sentences").json()["sentences"]
            return [Document(page_content=s["text"], metadata=s) for s in sentences]

        if fmt == TranscriptFormat.SUBTITLES_SRT:
            return [Document(page_content=self._get("/srt").text)]

        if fmt == TranscriptFormat.SUBTITLES_VTT:
            return [Document(page_content=self._get("/vtt").text)]

        raise ValueError("Unknown transcript format.")