Skip to content

Commit 2cb4a88

Browse files
committed
Use rnnoise for denoising voice agent
1 parent 40ac067 commit 2cb4a88

File tree

2 files changed

+40
-3
lines changed

2 files changed

+40
-3
lines changed

docker/api/Dockerfile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,17 @@
11
# Stage 1: Compile and build code
22
FROM python:3.11 AS builder
33

4+
# Install build dependencies for pyrnnoise
5+
RUN apt-get update && apt-get install -y \
6+
cmake \
7+
build-essential \
8+
&& rm -rf /var/lib/apt/lists/*
9+
10+
# Clone and build pyrnnoise
11+
RUN git clone https://github.com/pengzhendong/pyrnnoise.git /src/pyrnnoise
12+
RUN cd /src/pyrnnoise && git submodule update --init && cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build --target install && pip install . && cd ../../
13+
RUN cp /src/pyrnnoise/build/librnnoise.so /usr/local/lib/python3.11/site-packages/
14+
415
# Add poetry to the image
516
RUN pip install --no-cache-dir poetry
617

llmstack/apps/runner/agent_controller.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77
import threading
88
from typing import Any, Dict, List, Optional, Union
99

10+
import numpy as np
1011
import websockets
1112
from asgiref.sync import sync_to_async
1213
from pydantic import BaseModel, ConfigDict
14+
from pyrnnoise import RNNoise
1315

1416
from llmstack.apps.types.agent import AgentConfigSchema
1517
from llmstack.apps.types.voice_agent import VoiceAgentConfigSchema
@@ -143,6 +145,7 @@ def __init__(self, output_queue: asyncio.Queue, config: AgentControllerConfig):
143145
self._input_metadata = {}
144146
self._output_audio_stream = None
145147
self._output_transcript_stream = None
148+
self._rnnoise = RNNoise(sample_rate=24000)
146149

147150
self._input_messages_queue = queue.Queue()
148151
self._loop = asyncio.new_event_loop()
@@ -270,10 +273,33 @@ async def _process_input_audio_stream(self):
270273
await self._send_websocket_message({"type": "response.create"})
271274
break
272275

276+
# Convert bytes to numpy array and normalize to float32
277+
try:
278+
audio_data = np.frombuffer(chunk, dtype=np.int16)
279+
# Convert int16 to float32 and normalize to [-1, 1]
280+
audio_data = audio_data.astype(np.float32) / 32768.0
281+
frame_iterator = self._rnnoise.process_chunk(audio_data)
282+
except Exception as e:
283+
logger.exception(f"Error processing chunk with rnnoise: {e}")
284+
frame_iterator = []
285+
286+
# Rebuild the chunk from the denoised frames
287+
denoised_chunk = b""
288+
try:
289+
for _, denoised_frame in frame_iterator:
290+
# Convert float32 [-1, 1] back to int16 range and then to bytes
291+
int16_data = (denoised_frame * 32768.0).astype(np.int16)
292+
denoised_chunk += int16_data.tobytes()
293+
except Exception as e:
294+
logger.exception(f"Error joining denoised frames: {e}")
295+
296+
logger.debug(f"Denoised chunk size to original chunk size: {len(denoised_chunk)} vs {len(chunk)}")
297+
273298
# Base64 encode and send
274-
await self._send_websocket_message(
275-
{"type": "input_audio_buffer.append", "audio": base64.b64encode(chunk).decode("utf-8")}
276-
)
299+
if len(denoised_chunk) > 0:
300+
await self._send_websocket_message(
301+
{"type": "input_audio_buffer.append", "audio": base64.b64encode(denoised_chunk).decode("utf-8")}
302+
)
277303

278304
async def _process_input_text_stream(self):
279305
if self._input_text_stream:

0 commit comments

Comments
 (0)