Use RNNoise to denoise voice agent input audio

ajhai · ajhai · commit 2cb4a8890344 · 2024-11-26T16:31:44.000-08:00
diff --git a/docker/api/Dockerfile b/docker/api/Dockerfile
@@ -1,6 +1,17 @@
 # Stage 1: Compile and build code
 FROM python:3.11 AS builder
 
+# Install the toolchain (cmake, build-essential) needed to compile pyrnnoise's native library
+RUN apt-get update && apt-get install -y \
+    cmake \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone pyrnnoise, build/install its native library with CMake, pip-install it, and copy librnnoise.so into site-packages
+RUN git clone https://github.com/pengzhendong/pyrnnoise.git /src/pyrnnoise
+RUN cd /src/pyrnnoise && git submodule update --init && cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build --target install && pip install --no-cache-dir .
+RUN cp /src/pyrnnoise/build/librnnoise.so /usr/local/lib/python3.11/site-packages/
+
 # Add poetry to the image
 RUN pip install --no-cache-dir poetry
 
diff --git a/llmstack/apps/runner/agent_controller.py b/llmstack/apps/runner/agent_controller.py
@@ -7,9 +7,11 @@
 import threading
 from typing import Any, Dict, List, Optional, Union
 
+import numpy as np
 import websockets
 from asgiref.sync import sync_to_async
 from pydantic import BaseModel, ConfigDict
+from pyrnnoise import RNNoise
 
 from llmstack.apps.types.agent import AgentConfigSchema
 from llmstack.apps.types.voice_agent import VoiceAgentConfigSchema
@@ -143,6 +145,7 @@ def __init__(self, output_queue: asyncio.Queue, config: AgentControllerConfig):
         self._input_metadata = {}
         self._output_audio_stream = None
         self._output_transcript_stream = None
+        self._rnnoise = RNNoise(sample_rate=24000)
 
         self._input_messages_queue = queue.Queue()
         self._loop = asyncio.new_event_loop()
@@ -270,10 +273,33 @@ async def _process_input_audio_stream(self):
                     await self._send_websocket_message({"type": "response.create"})
                     break
 
+                # Convert bytes to numpy array and normalize to float32
+                try:
+                    audio_data = np.frombuffer(chunk, dtype=np.int16)
+                    # Convert int16 to float32 and normalize to [-1, 1]
+                    audio_data = audio_data.astype(np.float32) / 32768.0
+                    frame_iterator = self._rnnoise.process_chunk(audio_data)
+                except Exception as e:
+                    logger.exception(f"Error processing chunk with rnnoise: {e}")
+                    frame_iterator = []
+
+                # Rebuild the chunk from the denoised frames
+                denoised_chunk = b""
+                try:
+                    for _, denoised_frame in frame_iterator:
+                        # Convert float32 [-1, 1] back to int16 range and then to bytes
+                        int16_data = (denoised_frame * 32768.0).astype(np.int16)
+                        denoised_chunk += int16_data.tobytes()
+                except Exception as e:
+                    logger.exception(f"Error joining denoised frames: {e}")
+
+                logger.debug(f"Denoised chunk size to original chunk size: {len(denoised_chunk)} vs {len(chunk)}")
+
                 # Base64 encode and send
-                await self._send_websocket_message(
-                    {"type": "input_audio_buffer.append", "audio": base64.b64encode(chunk).decode("utf-8")}
-                )
+                if len(denoised_chunk) > 0:
+                    await self._send_websocket_message(
+                        {"type": "input_audio_buffer.append", "audio": base64.b64encode(denoised_chunk).decode("utf-8")}
+                    )
 
     async def _process_input_text_stream(self):
         if self._input_text_stream: