"""A common module for NVIDIA Riva Runnables.""" import asyncio import logging import pathlib import queue import tempfile import threading import wave from enum import Enum from typing import ( TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Dict, Generator, Iterator, List, Optional, Tuple, Union, cast, ) from langchain_core.messages import AnyMessage, BaseMessage from langchain_core.prompt_values import PromptValue from langchain_core.pydantic_v1 import ( AnyHttpUrl, BaseModel, Field, parse_obj_as, root_validator, validator, ) from langchain_core.runnables import RunnableConfig, RunnableSerializable if TYPE_CHECKING: import riva.client import riva.client.proto.riva_asr_pb2 as rasr _LOGGER = logging.getLogger(__name__) _QUEUE_GET_TIMEOUT = 0.5 _MAX_TEXT_LENGTH = 400 _SENTENCE_TERMINATORS = ("\n", ".", "!", "?", "¡", "¿") # COMMON utilities used by all Riva Runnables def _import_riva_client() -> "riva.client": """Import the riva client and raise an error on failure.""" try: # pylint: disable-next=import-outside-toplevel # this client library is optional import riva.client except ImportError as err: raise ImportError( "Could not import the NVIDIA Riva client library. " "Please install it with `pip install nvidia-riva-client`." ) from err return riva.client class SentinelT: # pylint: disable=too-few-public-methods """An empty Sentinel type.""" HANGUP = SentinelT() _TRANSFORM_END = SentinelT() class RivaAudioEncoding(str, Enum): """An enum of the possible choices for Riva audio encoding. The list of types exposed by the Riva GRPC Protobuf files can be found with the following commands: ```python import riva.client print(riva.client.AudioEncoding.keys()) # noqa: T201 ``` """ ALAW = "ALAW" ENCODING_UNSPECIFIED = "ENCODING_UNSPECIFIED" FLAC = "FLAC" LINEAR_PCM = "LINEAR_PCM" MULAW = "MULAW" OGGOPUS = "OGGOPUS" @classmethod def from_wave_format_code(cls, format_code: int) -> "RivaAudioEncoding": """Return the audio encoding specified by the format code in the wave file. ref: https://mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html """ try: return {1: cls.LINEAR_PCM, 6: cls.ALAW, 7: cls.MULAW}[format_code] except KeyError as err: raise NotImplementedError( "The following wave file format code is " f"not supported by Riva: {format_code}" ) from err @property def riva_pb2(self) -> "riva.client.AudioEncoding": """Returns the Riva API object for the encoding.""" riva_client = _import_riva_client() return getattr(riva_client.AudioEncoding, self) class RivaAuthMixin(BaseModel): """Configuration for the authentication to a Riva service connection.""" url: Union[AnyHttpUrl, str] = Field( AnyHttpUrl("http://localhost:50051", scheme="http"), description="The full URL where the Riva service can be found.", examples=["http://localhost:50051", "https://user@pass:riva.example.com"], ) ssl_cert: Optional[str] = Field( None, description="A full path to the file where Riva's public ssl key can be read.", ) @property def auth(self) -> "riva.client.Auth": """Return a riva client auth object.""" riva_client = _import_riva_client() url = cast(AnyHttpUrl, self.url) use_ssl = url.scheme == "https" # pylint: disable=no-member # false positive url_no_scheme = str(self.url).split("/")[2] return riva_client.Auth( ssl_cert=self.ssl_cert, use_ssl=use_ssl, uri=url_no_scheme ) @validator("url", pre=True, allow_reuse=True) @classmethod def _validate_url(cls, val: Any) -> AnyHttpUrl: """Do some initial conversations for the URL before checking.""" if isinstance(val, str): return cast(AnyHttpUrl, parse_obj_as(AnyHttpUrl, val)) return cast(AnyHttpUrl, val) class RivaCommonConfigMixin(BaseModel): """A collection of common Riva settings.""" encoding: RivaAudioEncoding = Field( default=RivaAudioEncoding.LINEAR_PCM, description="The encoding on the audio stream.", ) sample_rate_hertz: int = Field( default=8000, description="The sample rate frequency of audio stream." ) language_code: str = Field( default="en-US", description=( "The [BCP-47 language code]" "(https://www.rfc-editor.org/rfc/bcp/bcp47.txt) for " "the target language." ), ) class _Event: """A combined event that is threadsafe and async safe.""" _event: threading.Event _aevent: asyncio.Event def __init__(self) -> None: """Initialize the event.""" self._event = threading.Event() self._aevent = asyncio.Event() def set(self) -> None: """Set the event.""" self._event.set() self._aevent.set() def clear(self) -> None: """Set the event.""" self._event.clear() self._aevent.clear() def is_set(self) -> bool: """Indicate if the event is set.""" return self._event.is_set() def wait(self) -> None: """Wait for the event to be set.""" self._event.wait() async def async_wait(self) -> None: """Async wait for the event to be set.""" await self._aevent.wait() def _mk_wave_file( output_directory: Optional[str], sample_rate: float ) -> Tuple[Optional[str], Optional[wave.Wave_write]]: """Create a new wave file and return the wave write object and filename.""" if output_directory: with tempfile.NamedTemporaryFile( mode="bx", suffix=".wav", delete=False, dir=output_directory ) as f: wav_file_name = f.name wav_file = wave.open(wav_file_name, "wb") wav_file.setnchannels(1) wav_file.setsampwidth(2) wav_file.setframerate(sample_rate) return (wav_file_name, wav_file) return (None, None) def _coerce_string(val: "TTSInputType") -> str: """Attempt to coerce the input value to a string. This is particularly useful for converting LangChain message to strings. """ if isinstance(val, PromptValue): return val.to_string() if isinstance(val, BaseMessage): return str(val.content) return str(val) def _process_chunks(inputs: Iterator["TTSInputType"]) -> Generator[str, None, None]: """Filter the input chunks are return strings ready for TTS.""" buffer = "" for chunk in inputs: chunk = _coerce_string(chunk) # return the buffer if an end of sentence character is detected for terminator in _SENTENCE_TERMINATORS: while terminator in chunk: last_sentence, chunk = chunk.split(terminator, 1) yield buffer + last_sentence + terminator buffer = "" buffer += chunk # return the buffer if is too long if len(buffer) > _MAX_TEXT_LENGTH: for idx in range(0, len(buffer), _MAX_TEXT_LENGTH): yield buffer[idx : idx + 5] buffer = "" # return remaining buffer if buffer: yield buffer # Riva AudioStream Type StreamInputType = Union[bytes, SentinelT] StreamOutputType = str class AudioStream: """A message containing streaming audio.""" _put_lock: threading.Lock _queue: queue.Queue output: queue.Queue hangup: _Event user_talking: _Event user_quiet: _Event _worker: Optional[threading.Thread] def __init__(self, maxsize: int = 0) -> None: """Initialize the queue.""" self._put_lock = threading.Lock() self._queue = queue.Queue(maxsize=maxsize) self.output = queue.Queue() self.hangup = _Event() self.user_quiet = _Event() self.user_talking = _Event() self._worker = None def __iter__(self) -> Generator[bytes, None, None]: """Return an error.""" while True: # get next item try: next_val = self._queue.get(True, _QUEUE_GET_TIMEOUT) except queue.Empty: continue # hangup when requested if next_val == HANGUP: break # yield next item yield next_val self._queue.task_done() async def __aiter__(self) -> AsyncIterator[StreamInputType]: """Iterate through all items in the queue until HANGUP.""" while True: # get next item try: next_val = await asyncio.get_event_loop().run_in_executor( None, self._queue.get, True, _QUEUE_GET_TIMEOUT ) except queue.Empty: continue # hangup when requested if next_val == HANGUP: break # yield next item yield next_val self._queue.task_done() @property def hungup(self) -> bool: """Indicate if the audio stream has hungup.""" return self.hangup.is_set() @property def empty(self) -> bool: """Indicate in the input stream buffer is empty.""" return self._queue.empty() @property def complete(self) -> bool: """Indicate if the audio stream has hungup and been processed.""" input_done = self.hungup and self.empty output_done = ( self._worker is not None and not self._worker.is_alive() and self.output.empty() ) return input_done and output_done @property def running(self) -> bool: """Indicate if the ASR stream is running.""" if self._worker: return self._worker.is_alive() return False def put(self, item: StreamInputType, timeout: Optional[int] = None) -> None: """Put a new item into the queue.""" with self._put_lock: if self.hungup: raise RuntimeError( "The audio stream has already been hungup. Cannot put more data." ) if item is HANGUP: self.hangup.set() self._queue.put(item, timeout=timeout) async def aput(self, item: StreamInputType, timeout: Optional[int] = None) -> None: """Async put a new item into the queue.""" loop = asyncio.get_event_loop() await asyncio.wait_for(loop.run_in_executor(None, self.put, item), timeout) def close(self, timeout: Optional[int] = None) -> None: """Send the hangup signal.""" self.put(HANGUP, timeout) async def aclose(self, timeout: Optional[int] = None) -> None: """Async send the hangup signal.""" await self.aput(HANGUP, timeout) def register(self, responses: Iterator["rasr.StreamingRecognizeResponse"]) -> None: """Drain the responses from the provided iterator and put them into a queue.""" if self.running: raise RuntimeError("An ASR instance has already been registered.") has_started = threading.Barrier(2, timeout=5) def worker() -> None: """Consume the ASR Generator.""" has_started.wait() for response in responses: if not response.results: continue for result in response.results: if not result.alternatives: continue if result.is_final: self.user_talking.clear() self.user_quiet.set() transcript = cast(str, result.alternatives[0].transcript) self.output.put(transcript) elif not self.user_talking.is_set(): self.user_talking.set() self.user_quiet.clear() self._worker = threading.Thread(target=worker) self._worker.daemon = True self._worker.start() has_started.wait() # RivaASR Runnable ASRInputType = AudioStream ASROutputType = str class RivaASR( RivaAuthMixin, RivaCommonConfigMixin, RunnableSerializable[ASRInputType, ASROutputType], ): """A runnable that performs Automatic Speech Recognition (ASR) using NVIDIA Riva.""" name: str = "nvidia_riva_asr" description: str = ( "A Runnable for converting audio bytes to a string." "This is useful for feeding an audio stream into a chain and" "preprocessing that audio to create an LLM prompt." ) # riva options audio_channel_count: int = Field( 1, description="The number of audio channels in the input audio stream." ) profanity_filter: bool = Field( True, description=( "Controls whether or not Riva should attempt to filter " "profanity out of the transcribed text." ), ) enable_automatic_punctuation: bool = Field( True, description=( "Controls whether Riva should attempt to correct " "senetence puncuation in the transcribed text." ), ) @root_validator(pre=True) @classmethod def _validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]: """Validate the Python environment and input arguments.""" _ = _import_riva_client() return values @property def config(self) -> "riva.client.StreamingRecognitionConfig": """Create and return the riva config object.""" riva_client = _import_riva_client() return riva_client.StreamingRecognitionConfig( interim_results=True, config=riva_client.RecognitionConfig( encoding=self.encoding, sample_rate_hertz=self.sample_rate_hertz, audio_channel_count=self.audio_channel_count, max_alternatives=1, profanity_filter=self.profanity_filter, enable_automatic_punctuation=self.enable_automatic_punctuation, language_code=self.language_code, ), ) def _get_service(self) -> "riva.client.ASRService": """Connect to the riva service and return the a client object.""" riva_client = _import_riva_client() try: return riva_client.ASRService(self.auth) except Exception as err: raise ValueError( "Error raised while connecting to the Riva ASR server." ) from err def invoke( self, input: ASRInputType, _: Optional[RunnableConfig] = None, ) -> ASROutputType: """Transcribe the audio bytes into a string with Riva.""" # create an output text generator with Riva if not input.running: service = self._get_service() responses = service.streaming_response_generator( audio_chunks=input, streaming_config=self.config, ) input.register(responses) # return the first valid result full_response: List[str] = [] while not input.complete: with input.output.not_empty: ready = input.output.not_empty.wait(0.1) if ready: while not input.output.empty(): try: full_response += [input.output.get_nowait()] except queue.Empty: continue input.output.task_done() _LOGGER.debug("Riva ASR returning: %s", repr(full_response)) return " ".join(full_response).strip() return "" # RivaTTS Runnable # pylint: disable-next=invalid-name TTSInputType = Union[str, AnyMessage, PromptValue] TTSOutputType = bytes class RivaTTS( RivaAuthMixin, RivaCommonConfigMixin, RunnableSerializable[TTSInputType, TTSOutputType], ): """A runnable that performs Text-to-Speech (TTS) with NVIDIA Riva.""" name: str = "nvidia_riva_tts" description: str = ( "A tool for converting text to speech." "This is useful for converting LLM output into audio bytes." ) # riva options voice_name: str = Field( "English-US.Female-1", description=( "The voice model in Riva to use for speech. " "Pre-trained models are documented in " "[the Riva documentation]" "(https://docs.nvidia.com/deeplearning/riva/user-guide/docs/tts/tts-overview.html)." ), ) output_directory: Optional[str] = Field( None, description=( "The directory where all audio files should be saved. " "A null value indicates that wave files should not be saved. " "This is useful for debugging purposes." ), ) @root_validator(pre=True) @classmethod def _validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]: """Validate the Python environment and input arguments.""" _ = _import_riva_client() return values @validator("output_directory") @classmethod def _output_directory_validator(cls, v: str) -> str: if v: dirpath = pathlib.Path(v) dirpath.mkdir(parents=True, exist_ok=True) return str(dirpath.absolute()) return v def _get_service(self) -> "riva.client.SpeechSynthesisService": """Connect to the riva service and return the a client object.""" riva_client = _import_riva_client() try: return riva_client.SpeechSynthesisService(self.auth) except Exception as err: raise ValueError( "Error raised while connecting to the Riva TTS server." ) from err def invoke( self, input: TTSInputType, _: Union[RunnableConfig, None] = None ) -> TTSOutputType: """Perform TTS by taking a string and outputting the entire audio file.""" return b"".join(self.transform(iter([input]))) def transform( self, input: Iterator[TTSInputType], config: Optional[RunnableConfig] = None, **kwargs: Optional[Any], ) -> Iterator[TTSOutputType]: """Perform TTS by taking a stream of characters and streaming output bytes.""" service = self._get_service() # create an output wave file wav_file_name, wav_file = _mk_wave_file( self.output_directory, self.sample_rate_hertz ) # split the input text and perform tts for chunk in _process_chunks(input): _LOGGER.debug("Riva TTS chunk: %s", chunk) # start riva tts streaming responses = service.synthesize_online( text=chunk, voice_name=self.voice_name, language_code=self.language_code, encoding=self.encoding.riva_pb2, sample_rate_hz=self.sample_rate_hertz, ) # stream audio bytes out for resp in responses: audio = cast(bytes, resp.audio) if wav_file: wav_file.writeframesraw(audio) yield audio # close the wave file when we are done if wav_file: wav_file.close() _LOGGER.debug("Riva TTS wrote file: %s", wav_file_name) async def atransform( self, input: AsyncIterator[TTSInputType], config: Optional[RunnableConfig] = None, **kwargs: Optional[Any], ) -> AsyncGenerator[TTSOutputType, None]: """Intercept async transforms and route them to the synchronous transform.""" loop = asyncio.get_running_loop() input_queue: queue.Queue = queue.Queue() out_queue: asyncio.Queue = asyncio.Queue() async def _producer() -> None: """Produce input into the input queue.""" async for val in input: input_queue.put_nowait(val) input_queue.put_nowait(_TRANSFORM_END) def _input_iterator() -> Iterator[TTSInputType]: """Iterate over the input_queue.""" while True: try: val = input_queue.get(timeout=0.5) except queue.Empty: continue if val == _TRANSFORM_END: break yield val def _consumer() -> None: """Consume the input with transform.""" for val in self.transform(_input_iterator()): out_queue.put_nowait(val) out_queue.put_nowait(_TRANSFORM_END) async def _consumer_coro() -> None: """Coroutine that wraps the consumer.""" await loop.run_in_executor(None, _consumer) producer = loop.create_task(_producer()) consumer = loop.create_task(_consumer_coro()) while True: try: val = await asyncio.wait_for(out_queue.get(), 0.5) except asyncio.exceptions.TimeoutError: continue out_queue.task_done() if val is _TRANSFORM_END: break yield val await producer await consumer # Backwards compatibility: NVIDIARivaASR = RivaASR NVIDIARivaTTS = RivaTTS NVIDIARivaStream = AudioStream