initial commit (clean, no models)

2026-05-30 21:30:49 +07:00
commit e90b5c971c
22 changed files with 7872 additions and 0 deletions
@@ -0,0 +1,768 @@
+import os
+import sys
+import types
+import json
+
+# --- FIX FOR PATH COLLISION BETWEEN modules.py AND modules/ DIRECTORY ---
+# Kita memaksa Python untuk mendaftarkan 'lib.infer_pack.modules' sebagai package directory 
+# alih-alih file-module, sehingga sub-impor (seperti F0Predictor) dapat dimuat dengan sukses
+base_dir = os.path.dirname(os.path.abspath(__file__))
+modules_path = os.path.join(base_dir, "lib", "infer_pack", "modules")
+if os.path.isdir(modules_path):
+    modules_pkg = types.ModuleType("lib.infer_pack.modules")
+    modules_pkg.__path__ = [modules_path]
+    modules_pkg.__file__ = os.path.join(modules_path, "__init__.py")
+    sys.modules["lib.infer_pack.modules"] = modules_pkg
+# ------------------------------------------------------------------------
+import time
+import asyncio
+import logging
+import traceback
+import argparse
+import threading
+import webbrowser
+from http.server import SimpleHTTPRequestHandler
+import socketserver
+import numpy as np
+import torch
+import onnxruntime as ort
+
+# Add parent directories to sys.path so we can import lib
+sys.path.append(os.getcwd())
+
+from lib.infer_pack.onnx_inference import OnnxRVC, get_f0_predictor
+
+# Set logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger("RVC-Realtime-Server")
+
+# Thread pool for audio processing (1 worker = sequential, but non-blocking to event loop)
+import concurrent.futures
+_audio_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1, thread_name_prefix="rvc-audio")
+
+# Global instances cache
+current_rvc_onnx = None
+current_model_key = None
+model_root = "weights"
+pretrained_root = "pretrained"
+
+# Patch torch.load to default to weights_only=False for compatibility
+original_load = torch.load
+def patched_load(*args, **kwargs):
+    if "weights_only" not in kwargs:
+        kwargs["weights_only"] = False
+    return original_load(*args, **kwargs)
+torch.load = patched_load
+
+def get_onnx_models():
+    models = []
+    if os.path.exists(model_root):
+        for d in os.listdir(model_root):
+            d_path = os.path.join(model_root, d)
+            if os.path.isdir(d_path):
+                onnx_files = [f for f in os.listdir(d_path) if f.endswith(".onnx")]
+                if onnx_files:
+                    models.append(d)
+    models.sort()
+    return models
+
+def get_model_metadata(model_name):
+    model_dir = os.path.join(model_root, model_name)
+    onnx_files = [f for f in os.listdir(model_dir) if f.endswith(".onnx")]
+    pth_files = [f for f in os.listdir(model_dir) if f.endswith(".pth")]
+    
+    onnx_path = os.path.join(model_dir, onnx_files[0])
+    sr = 40000  # default
+    if pth_files:
+        try:
+            pth_path = os.path.join(model_dir, pth_files[0])
+            cpt = torch.load(pth_path, map_location="cpu")
+            sr = cpt["config"][-1]
+            logger.info(f"Detected sample rate from .pth: {sr} Hz")
+        except Exception as e:
+            logger.warning(f"Failed to load sample rate from .pth, using default 40000: {e}")
+            
+    version = "v2"
+    vec_path = "vec-768-layer-12"
+    try:
+        sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+        feat_dim = sess.get_inputs()[0].shape[2]
+        if feat_dim == 256:
+            version = "v1"
+            vec_path = "vec-256-layer-12"
+            logger.info("Detected RVC Model Version: v1 (feat_dim = 256)")
+        else:
+            version = "v2"
+            vec_path = "vec-768-layer-12"
+            logger.info("Detected RVC Model Version: v2 (feat_dim = 768)")
+    except Exception as e:
+        logger.error(f"Error auto-detecting model version from ONNX: {e}")
+        
+    return onnx_path, sr, vec_path, version
+
+class RealtimeVoiceChanger:
+    def __init__(self):
+        self.processor = None
+        self.model_name = ""
+        self.f0_up_key = 0
+        self.f0_method = "pm"
+        self.device = "cuda"
+        self.input_sr = 44100
+        self.noise_gate_db = -40.0
+        self.input_gain = 1.0
+        self.output_gain = 1.0
+        
+        # Audio sliding buffers
+        self.input_buffer = np.zeros(0, dtype=np.float32)
+        self.history_duration = 0.30  # 300ms history = enough context even at 8192 samples @ 48kHz
+        self.target_sr = 40000
+        self.vec_path = "vec-768-layer-12"
+        self.version = "v2"
+        self.f0_predictors = {}  # Cache to reuse pitch predictors instead of recreating them on every chunk
+        
+        # Server Hardware Routing properties
+        self.local_stream = None
+        self.routing_mode = "browser"
+        self.input_device = None
+        self.output_device = None
+        self.chunk_size = 8192
+        self.loop = None
+        self.ws_client = None
+        self.visualizer_queue = None
+        
+        # High-pass filter coefficients cache
+        self.hpf_b = None
+        self.hpf_a = None
+        
+    def load_model(self, model_name, device):
+        global current_rvc_onnx, current_model_key
+        
+        # Resolve the device provider
+        if device == "cuda" and not torch.cuda.is_available():
+            logger.warning("CUDA is not available, falling back to CPU")
+            device = "cpu"
+            
+        model_key = f"{model_name}_{device}"
+        if current_rvc_onnx is None or current_model_key != model_key:
+            logger.info(f"Loading RVC model '{model_name}' on {device}...")
+            onnx_path, sr, vec_path, version = get_model_metadata(model_name)
+            
+            # Ensure HuBERT model exists
+            full_vec_path = os.path.join(pretrained_root, f"{vec_path}.onnx")
+            if not os.path.exists(full_vec_path):
+                raise FileNotFoundError(f"ContentVec ONNX not found at: {full_vec_path}")
+                
+            current_rvc_onnx = OnnxRVC(
+                model_path=onnx_path,
+                sr=sr,
+                hop_size=512,
+                vec_path=vec_path,
+                device=device
+            )
+            current_model_key = model_key
+            logger.info("Model loaded successfully")
+            
+        self.processor = current_rvc_onnx
+        self.target_sr = self.processor.sampling_rate
+        self.model_name = model_name
+        self.device = device
+        
+    def set_config(self, config):
+        logger.info(f"Updating config: {config}")
+        
+        # Update config fields
+        self.f0_up_key = int(config.get("f0_up_key", self.f0_up_key))
+        self.f0_method = config.get("f0_method", self.f0_method)
+        self.input_sr = int(config.get("input_sr", self.input_sr))
+        self.noise_gate_db = float(config.get("noise_gate", self.noise_gate_db))
+        self.input_gain = float(config.get("input_gain", self.input_gain))
+        self.output_gain = float(config.get("output_gain", self.output_gain))
+        
+        model_name = config.get("model_name", self.model_name)
+        device = config.get("device", self.device)
+        
+        if not self.model_name or model_name != self.model_name or device != self.device:
+            self.load_model(model_name, device)
+            
+        # Reset input buffer if input samplerate changed
+        history_samples = int(self.history_duration * self.input_sr)
+        if len(self.input_buffer) != history_samples:
+            self.input_buffer = np.zeros(history_samples, dtype=np.float32)
+            
+        # Design a 1st order Butterworth high-pass filter at 80Hz to eliminate low-frequency static rumbling/hums
+        try:
+            from scipy import signal
+            nyq = 0.5 * self.input_sr
+            normal_cutoff = 80.0 / nyq
+            self.hpf_b, self.hpf_a = signal.butter(1, normal_cutoff, btype='high', analog=False)
+        except Exception as e:
+            logger.error(f"Failed to design high-pass filter: {e}")
+            self.hpf_b, self.hpf_a = None, None
+
+    def apply_noise_gate(self, audio):
+        # Calculate RMS energy of the audio chunk
+        rms = np.sqrt(np.mean(audio**2)) + 1e-9
+        rms_db = 20 * np.log10(rms)
+        
+        if rms_db < self.noise_gate_db:
+            return np.zeros_like(audio)
+        return audio
+
+    def resample(self, audio, orig_sr, target_sr):
+        if orig_sr == target_sr:
+            return audio
+            
+        # Fast linear interpolation resampling
+        duration = len(audio) / orig_sr
+        num_target_samples = int(duration * target_sr)
+        x_orig = np.linspace(0, duration, len(audio))
+        x_target = np.linspace(0, duration, num_target_samples)
+        return np.interp(x_target, x_orig, audio).astype(np.float32)
+
+    def process_audio_chunk(self, raw_chunk):
+        """
+        Process a raw input Float32 PCM audio chunk in memory with sliding window.
+        """
+        if self.processor is None:
+            return raw_chunk
+            
+        t_start = time.time()
+        
+        # 1. Apply High-pass filter (80Hz Low-cut) to eliminate low-frequency background rumbles and AC hum
+        chunk = raw_chunk
+        if self.hpf_b is not None and self.hpf_a is not None:
+            try:
+                from scipy import signal
+                chunk = signal.lfilter(self.hpf_b, self.hpf_a, chunk).astype(np.float32)
+            except Exception as e:
+                pass
+        
+        # 2. Apply Input Gain & Noise Gate
+        chunk = chunk * self.input_gain
+        chunk = self.apply_noise_gate(chunk)
+        
+        # If chunk is pure silence from the gate, bypass inference immediately to save CPU!
+        if np.max(np.abs(chunk)) < 1e-6:
+            output_len = int(len(raw_chunk) * (self.target_sr / self.input_sr))
+            return np.zeros(output_len, dtype=np.float32)
+
+        t_gate = time.time()
+
+        # 3. Manage Sliding Window Buffer
+        self.input_buffer = np.append(self.input_buffer[len(chunk):], chunk)
+        
+        # Append 120ms of silence at the end to push RVC convolution edge fading into the padded future (no edge distortion!)
+        future_samples = int(0.12 * self.input_sr)
+        full_input_audio = np.append(self.input_buffer, np.zeros(future_samples, dtype=np.float32))
+        
+        # 4. Resample full segment to 16kHz for HuBERT and RMVPE
+        wav16k = self.resample(full_input_audio, self.input_sr, 16000)
+        
+        t_resample_in = time.time()
+
+        # 4. Generate RVC ONNX inputs in-memory
+        hubert = self.processor.vec_model(wav16k)
+        hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
+        hubert_length = hubert.shape[1]
+        
+        t_hubert = time.time()
+
+        # Initialize and cache pitch predictor (extract at 16kHz for 3x performance boost!)
+        predictor_key = f"{self.f0_method}_{self.device}_16000"
+        if predictor_key not in self.f0_predictors:
+            logger.info(f"Initializing and caching 16kHz F0 Predictor '{self.f0_method}' on {self.device}...")
+            hop_16k = int(self.processor.hop_size * 16000 / self.target_sr)
+            self.f0_predictors[predictor_key] = get_f0_predictor(
+                self.f0_method,
+                hop_length=hop_16k,
+                sampling_rate=16000,
+                threshold=0.02,
+                device=self.device,
+                is_half=True if self.device == "cuda" else False
+            )
+        f0_predictor = self.f0_predictors[predictor_key]
+        
+        # Calculate pitch on 16kHz audio for all methods (massive CPU speed up!)
+        pitchf = f0_predictor.compute_f0(wav16k, hubert_length)
+        
+        t_f0 = time.time()
+            
+        # Pitch transpose
+        pitchf = pitchf * (2 ** (self.f0_up_key / 12))
+        
+        # Pitch binning for RVC
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+        f0_mel = 1127 * np.log(1 + pitchf / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        pitch = np.rint(f0_mel).astype(np.int64)
+        
+        pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
+        pitch = pitch.reshape(1, len(pitch))
+        ds = np.array([0]).astype(np.int64)  # sid = 0
+        rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
+        hubert_length_tensor = np.array([hubert_length]).astype(np.int64)
+        
+        # 5. Run synthesis
+        out_wav = self.processor.forward(hubert, hubert_length_tensor, pitch, pitchf, ds, rnd).squeeze()
+        out_wav = out_wav.astype(np.float32) / 32767.0  # Normalize back to [-1.0, 1.0] float32
+        
+        t_synth = time.time()
+
+        # 6. Extract only the newly converted chunk, discarding history and future padding.
+        # The new chunk starts at (buffer_size - chunk_size) in the updated input_buffer.
+        # Use exact ratio target_sr/input_sr for clean integer math, not out_wav/full_input_audio
+        # which can drift due to future silence padding.
+        sr_ratio = self.target_sr / self.input_sr
+        history_in_buffer = len(self.input_buffer) - len(chunk)  # samples of old audio before new chunk
+        start_idx = int(history_in_buffer * sr_ratio)
+        end_idx = int((history_in_buffer + len(chunk)) * sr_ratio)
+        
+        # Safety clamp
+        start_idx = max(0, min(start_idx, len(out_wav) - 1))
+        end_idx   = max(start_idx + 1, min(end_idx, len(out_wav)))
+        
+        # Extract the converted chunk safely from the middle (flawless, clear, continuous voice!)
+        output_chunk = out_wav[start_idx:end_idx]
+        
+        # Resample back to target output samples to match browser playback rate perfectly
+        target_chunk_len = int(len(chunk) * (self.target_sr / self.input_sr))
+        output_chunk = self.resample(output_chunk, len(output_chunk), target_chunk_len)
+        
+        # Apply output gain
+        output_chunk = output_chunk * self.output_gain
+        
+        # Ensure we don't clip
+        output_chunk = np.clip(output_chunk, -1.0, 1.0)
+        
+        t_end = time.time()
+        
+        d_gate = (t_gate - t_start) * 1000
+        d_res_in = (t_resample_in - t_gate) * 1000
+        d_hubert = (t_hubert - t_resample_in) * 1000
+        d_f0 = (t_f0 - t_hubert) * 1000
+        d_synth = (t_synth - t_f0) * 1000
+        d_res_out = (t_end - t_synth) * 1000
+        t_elapsed = (t_end - t_start) * 1000
+        
+        logger.info(
+            f"Chunk Profile: total={t_elapsed:.1f}ms | gate={d_gate:.1f}ms | res_in={d_res_in:.1f}ms | "
+            f"hubert={d_hubert:.1f}ms | f0={d_f0:.1f}ms | synth={d_synth:.1f}ms | res_out={d_res_out:.1f}ms"
+        )
+        
+        return output_chunk
+
+    def start_local_stream(self, loop, ws_client):
+        import sounddevice as sd
+        self.loop = loop
+        self.ws_client = ws_client
+        self.visualizer_queue = asyncio.Queue()
+        
+        if self.local_stream is not None:
+            self.stop_local_stream()
+            
+        if self.input_device is None:
+            self.input_device = sd.default.device[0]
+        if self.output_device is None:
+            self.output_device = sd.default.device[1]
+            
+        input_info = sd.query_devices(self.input_device)
+        output_info = sd.query_devices(self.output_device)
+        
+        input_sr = int(input_info["default_samplerate"])
+        logger.info(f"Starting Server Hardware Stream: Input='{input_info['name']}' ({input_sr}Hz) | Output='{output_info['name']}' ({self.target_sr}Hz)")
+        
+        self.set_config({
+            "input_sr": input_sr
+        })
+        
+        def audio_callback(indata, outdata, frames, time_info, status):
+            if status:
+                logger.warning(f"Hardware Audio Callback Status: {status}")
+                
+            raw_chunk = indata[:, 0].copy()
+            output_chunk = self.process_audio_chunk(raw_chunk)
+            
+            if len(output_chunk) < frames:
+                outdata[:, 0] = np.pad(output_chunk, (0, frames - len(output_chunk)), "constant")
+            else:
+                outdata[:, 0] = output_chunk[:frames]
+                
+            # Send waveform chunks to WebSocket safely using loop.call_soon_threadsafe
+            if self.ws_client is not None:
+                loop.call_soon_threadsafe(
+                    self.visualizer_queue.put_nowait,
+                    (raw_chunk.copy(), output_chunk.copy())
+                )
+                
+        try:
+            self.local_stream = sd.Stream(
+                device=(self.input_device, self.output_device),
+                samplerate=self.target_sr,
+                blocksize=self.chunk_size,
+                channels=1,
+                dtype="float32",
+                callback=audio_callback
+            )
+            self.local_stream.start()
+            logger.info("Server Hardware Stream active and processing locally!")
+        except Exception as e:
+            logger.error(f"Failed to start hardware stream: {e}")
+            raise e
+
+    def stop_local_stream(self):
+        if self.local_stream is not None:
+            try:
+                self.local_stream.stop()
+                self.local_stream.close()
+                logger.info("Server Hardware Stream stopped successfully.")
+            except Exception as e:
+                logger.error(f"Error stopping hardware stream: {e}")
+            self.local_stream = None
+        self.visualizer_queue = None
+
+# --- WEBSOCKET SERVER IMPLEMENTATION ---
+async def websocket_handler(websocket):
+    logger.info("New WebSocket client connected")
+    rvc = RealtimeVoiceChanger()
+    loop = asyncio.get_running_loop()
+
+    # --- Pipeline queues ---
+    # input_queue : raw bytes from browser mic
+    # output_queue: processed bytes ready to send back
+    # maxsize=3 = ~3 chunk durations of buffer; provides backpressure without unbounded memory
+    input_queue  = asyncio.Queue(maxsize=3)
+    output_queue = asyncio.Queue(maxsize=3)
+
+    # --- STAGE 1: Receiver ---
+    # Reads ALL WebSocket messages immediately (never blocks on processing).
+    # Handles JSON config inline; binary audio chunks go to input_queue.
+    async def receiver_task():
+        try:
+            async for message in websocket:
+                if isinstance(message, str):
+                    try:
+                        data = json.loads(message)
+                        if data.get("type") == "config":
+                            new_routing_mode = data.get("routing_mode", rvc.routing_mode)
+                            new_input_device  = data.get("input_device",  rvc.input_device)
+                            new_output_device = data.get("output_device", rvc.output_device)
+                            new_chunk_size    = int(data.get("chunk_size", rvc.chunk_size))
+
+                            if new_input_device  is not None: rvc.input_device  = int(new_input_device)
+                            if new_output_device is not None: rvc.output_device = int(new_output_device)
+                            rvc.chunk_size = new_chunk_size
+                            rvc.set_config(data)
+
+                            if new_routing_mode == "hardware":
+                                if rvc.routing_mode != "hardware" or rvc.local_stream is None:
+                                    rvc.routing_mode = "hardware"
+                                    rvc.start_local_stream(loop, websocket)
+                            else:
+                                if rvc.routing_mode == "hardware":
+                                    rvc.stop_local_stream()
+                                    rvc.routing_mode = "browser"
+
+                            response = {
+                                "type": "config_success",
+                                "model_name": rvc.model_name,
+                                "target_sr":  rvc.target_sr,
+                                "f0_method":  rvc.f0_method,
+                                "f0_up_key":  rvc.f0_up_key,
+                                "device":     rvc.device,
+                                "routing_mode": rvc.routing_mode
+                            }
+                            await websocket.send(json.dumps(response))
+                    except Exception as e:
+                        logger.error(f"Config parse error: {e}")
+                        await websocket.send(json.dumps({"type": "error", "message": str(e)}))
+
+                elif isinstance(message, bytes):
+                    if rvc.routing_mode == "hardware":
+                        continue
+                    if rvc.processor is None:
+                        # Prepend 0.0 processing time to echoed message
+                        input_chunk = np.frombuffer(message, dtype=np.float32)
+                        payload = np.empty(len(input_chunk) + 1, dtype=np.float32)
+                        payload[0] = 0.0
+                        payload[1:] = input_chunk
+                        await websocket.send(payload.tobytes())
+                        continue
+                    # Put chunk in queue — await here means we yield if queue is full (backpressure)
+                    await input_queue.put(message)
+        except Exception as e:
+            logger.error(f"Receiver error: {e}")
+        finally:
+            # Signal downstream stages to stop
+            await input_queue.put(None)
+
+    # --- STAGE 2: Processor ---
+    # Pulls from input_queue, processes in thread (non-blocking to event loop), pushes to output_queue.
+    # Sequential (1 executor worker) = output order matches input order.
+    async def processor_task():
+        try:
+            while True:
+                item = await input_queue.get()
+                if item is None:
+                    break  # shutdown signal
+                input_chunk  = np.frombuffer(item, dtype=np.float32).copy()
+                t_start = time.time()
+                output_chunk = await loop.run_in_executor(
+                    _audio_executor, rvc.process_audio_chunk, input_chunk
+                )
+                t_elapsed = (time.time() - t_start) * 1000
+                
+                # Prepend the elapsed processing time to the audio chunk bytes
+                payload = np.empty(len(output_chunk) + 1, dtype=np.float32)
+                payload[0] = t_elapsed
+                payload[1:] = output_chunk
+                await output_queue.put(payload.tobytes())
+        except asyncio.CancelledError:
+            pass
+        except Exception as e:
+            logger.error(f"Processor error: {e}")
+        finally:
+            await output_queue.put(None)
+
+    # --- STAGE 3: Sender ---
+    # Pulls processed audio from output_queue and sends to client.
+    async def sender_task():
+        try:
+            while True:
+                item = await output_queue.get()
+                if item is None:
+                    break  # shutdown signal
+                await websocket.send(item)
+        except asyncio.CancelledError:
+            pass
+        except Exception as e:
+            logger.error(f"Sender error: {e}")
+
+    # --- Visualizer (hardware mode only) ---
+    async def visualizer_sender_loop():
+        try:
+            while True:
+                if rvc.routing_mode == "hardware" and rvc.visualizer_queue is not None:
+                    try:
+                        raw_chunk, output_chunk = await asyncio.wait_for(
+                            rvc.visualizer_queue.get(), timeout=0.1
+                        )
+                        payload = {
+                            "type": "visualizer",
+                            "input":  raw_chunk.tolist(),
+                            "output": output_chunk.tolist()
+                        }
+                        await websocket.send(json.dumps(payload))
+                    except asyncio.TimeoutError:
+                        pass
+                else:
+                    await asyncio.sleep(0.05)
+        except asyncio.CancelledError:
+            pass
+        except Exception as e:
+            logger.error(f"Visualizer sender error: {e}")
+
+    # --- Send device list on connect ---
+    import sounddevice as sd
+    devices_list = []
+    try:
+        for idx, d in enumerate(sd.query_devices()):
+            devices_list.append({
+                "id": idx,
+                "name": d["name"],
+                "max_input_channels":  d["max_input_channels"],
+                "max_output_channels": d["max_output_channels"],
+                "default_samplerate":  d["default_samplerate"]
+            })
+        default_input  = sd.default.device[0]
+        default_output = sd.default.device[1]
+    except Exception as e:
+        logger.error(f"Failed to query server audio devices: {e}")
+        devices_list  = []
+        default_input  = -1
+        default_output = -1
+
+    await websocket.send(json.dumps({
+        "type": "init_devices",
+        "devices": devices_list,
+        "default_input":  default_input,
+        "default_output": default_output
+    }))
+
+    # --- Run all pipeline stages concurrently ---
+    vis_task  = asyncio.create_task(visualizer_sender_loop())
+    proc_task = asyncio.create_task(processor_task())
+    send_task = asyncio.create_task(sender_task())
+
+    try:
+        await receiver_task()          # runs until websocket closes
+    except Exception as e:
+        logger.error(f"WebSocket handler error: {e}")
+    finally:
+        vis_task.cancel()
+        proc_task.cancel()
+        send_task.cancel()
+        rvc.stop_local_stream()
+        logger.info("WebSocket client disconnected, pipeline cleaned up.")
+
+
+async def start_websocket_server(host, port):
+    import websockets
+    logger.info(f"Starting WebSocket server on ws://{host}:{port}...")
+    async with websockets.serve(websocket_handler, host, port):
+        await asyncio.Future()
+
+# --- HTTP STATIC FILE SERVER FOR FRONTEND ---
+def start_http_server(port, directory="frontend"):
+    class MyHandler(SimpleHTTPRequestHandler):
+        def __init__(self, *args, **kwargs):
+            # Force serve from directory relative to the project root
+            base_dir = os.path.dirname(os.path.abspath(__file__))
+            full_dir = os.path.join(base_dir, directory)
+            super().__init__(*args, directory=full_dir, **kwargs)
+            
+        def log_message(self, format, *args):
+            # Suppress standard logging to prevent console pollution
+            pass
+
+    try:
+        # Create a TCPServer that allows address reuse
+        socketserver.TCPServer.allow_reuse_address = True
+        with socketserver.TCPServer(("", port), MyHandler) as httpd:
+            logger.info(f"Serving HTTP frontend on http://localhost:{port}")
+            httpd.serve_forever()
+    except Exception as e:
+        logger.error(f"Failed to start HTTP server: {e}")
+
+# --- LOCAL AUDIO DEVICE STREAM MODE ---
+def run_local_device_mode(model_name, f0_up_key, f0_method, device, input_device, output_device, chunk_size):
+    import sounddevice as sd
+    
+    logger.info("Starting Local Audio Hardware Stream Mode...")
+    
+    rvc = RealtimeVoiceChanger()
+    rvc.load_model(model_name, device)
+    
+    if input_device is None:
+        input_device = sd.default.device[0]
+    if output_device is None:
+        output_device = sd.default.device[1]
+        
+    input_info = sd.query_devices(input_device)
+    output_info = sd.query_devices(output_device)
+    
+    input_sr = int(input_info["default_samplerate"])
+    target_sr = rvc.target_sr
+    
+    logger.info(f"Input Device: {input_info['name']} (Sample Rate: {input_sr} Hz)")
+    logger.info(f"Output Device: {output_info['name']} (Sample Rate: {target_sr} Hz)")
+    
+    rvc.set_config({
+        "f0_up_key": f0_up_key,
+        "f0_method": f0_method,
+        "input_sr": input_sr,
+        "device": device,
+        "model_name": model_name,
+        "noise_gate": -40.0,
+        "input_gain": 1.0,
+        "output_gain": 1.0
+    })
+    
+    def audio_callback(indata, outdata, frames, time_info, status):
+        if status:
+            logger.warning(f"Audio Callback Status: {status}")
+            
+        raw_chunk = indata[:, 0].copy()
+        output_chunk = rvc.process_audio_chunk(raw_chunk)
+        
+        if len(output_chunk) < frames:
+            outdata[:, 0] = np.pad(output_chunk, (0, frames - len(output_chunk)), "constant")
+        else:
+            outdata[:, 0] = output_chunk[:frames]
+
+    try:
+        stream = sd.Stream(
+            device=(input_device, output_device),
+            samplerate=target_sr,
+            blocksize=chunk_size,
+            channels=1,
+            dtype="float32",
+            callback=audio_callback
+        )
+        with stream:
+            logger.info("Real-Time Sounddevice Stream active! Press Ctrl+C to stop.")
+            while True:
+                time.sleep(1)
+    except KeyboardInterrupt:
+        logger.info("Local stream stopped by user")
+    except Exception as e:
+        logger.error(f"Local stream error: {e}")
+        traceback.print_exc()
+
+# --- MAIN ---
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="High-Performance Real-Time RVC ONNX Server")
+    parser.add_argument("--mode", type=str, default="websocket", choices=["websocket", "device"], help="Server running mode")
+    parser.add_argument("--host", type=str, default="127.0.0.1", help="WebSocket host")
+    parser.add_argument("--port", type=int, default=8765, help="WebSocket port")
+    parser.add_argument("--http_port", type=int, default=8000, help="HTTP static server port for Web UI")
+    parser.add_argument("--model", type=str, default="", help="RVC Model folder name inside weights/")
+    parser.add_argument("--transpose", type=int, default=0, help="Pitch shift in semitones (transpose)")
+    parser.add_argument("--f0_method", type=str, default="pm", choices=["pm", "harvest", "dio", "rmvpe"], help="Pitch extraction method")
+    parser.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda", "dml"], help="Execution provider")
+    parser.add_argument("--input_device", type=int, default=None, help="Input device ID (for device mode)")
+    parser.add_argument("--output_device", type=int, default=None, help="Output device ID (for device mode)")
+    parser.add_argument("--chunk_size", type=int, default=2048, help="Audio block size in samples")
+    
+    args = parser.parse_args()
+    
+    model_name = args.model
+    if not model_name:
+        models = get_onnx_models()
+        if models:
+            model_name = models[0]
+            logger.info(f"Auto-selected model: {model_name}")
+        else:
+            logger.error("No models found in weights/ directory. Please export a model first.")
+            sys.exit(1)
+            
+    if args.mode == "websocket":
+        # 1. Start HTTP Server in a background thread to serve the frontend!
+        http_thread = threading.Thread(
+            target=start_http_server, 
+            args=(args.http_port, "frontend"), 
+            daemon=True
+        )
+        http_thread.start()
+        
+        # 2. Automatically open the Web UI in the default browser!
+        web_ui_url = f"http://127.0.0.1:{args.http_port}"
+        logger.info(f"Automatically launching Web UI at {web_ui_url} in browser...")
+        
+        # We give it a tiny delay to ensure the HTTP server socket is open
+        def open_browser():
+            time.sleep(0.5)
+            webbrowser.open(web_ui_url)
+            
+        browser_thread = threading.Thread(target=open_browser, daemon=True)
+        browser_thread.start()
+        
+        # 3. Start the WebSocket server on the main event loop
+        try:
+            asyncio.run(start_websocket_server(args.host, args.port))
+        except KeyboardInterrupt:
+            logger.info("Server shut down")
+    elif args.mode == "device":
+        run_local_device_mode(
+            model_name=model_name,
+            f0_up_key=args.transpose,
+            f0_method=args.f0_method,
+            device=args.device,
+            input_device=args.input_device,
+            output_device=args.output_device,
+            chunk_size=args.chunk_size
+        )