commit e90b5c971ca836120a05186b7285043f4f74fc61 Author: akukanara Date: Sat May 30 21:30:49 2026 +0700 initial commit (clean, no models) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a98f74a --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +# Python cache +__pycache__/ +*.py[cod] +*$py.class + +# Environments +.venv/ +venv/ +ENV/ +env/ + +# Virtual environment files +Pipfile.lock +poetry.lock + +# OS files +.DS_Store +Thumbs.db + +# Agent/IDE files +.antigravitycli/ +.vscode/ +.idea/ + +# Voice changer models and weights +*.onnx +*.pth +*.pt +*.index +weights/ +pretrained/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..b06482f --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +# Standalone ONNX Voice Changer Service + +Layanan pengubah suara real-time berbasis AI berlatensi rendah menggunakan akselerasi ONNX Runtime dan model RVC (Retrieval-based Voice Conversion). + +## Struktur Proyek +- `server.py`: WebSocket server utama yang memproses streaming audio dan menyajikan static HTTP frontend. +- `frontend/`: File UI web client (HTML, CSS, JS). +- `lib/`: Modul inferensi ONNX RVC. +- `weights/`: Tempat penyimpanan model suara (folder per model berisi file `.onnx` dan opsional file `.pth`). +- `pretrained/`: Model pra-latih dasar (seperti `vec-768-layer-12.onnx`). +- `rmvpe.pt` & `rmvpe.py`: Untuk ekstraksi pitch suara fidelitas tinggi. + +## Cara Menjalankan + +### Persyaratan Sistem +Pastikan Python 3.10+ sudah terinstal di sistem Anda beserta library yang dibutuhkan di `requirements.txt`. + +### Menjalankan Server +Jalankan server menggunakan Python dari environment Anda: +```bash +python server.py --host 127.0.0.1 --port 8765 --http_port 8000 +``` + +Parameter opsional: +- `--host`: Alamat host WebSocket server (default: `127.0.0.1`). +- `--port`: Port WebSocket server (default: `8765`). +- `--http_port`: Port HTTP server untuk UI web client (default: `8000`). 
+- `--device`: Execution Provider (`cpu`, `cuda`, atau `dml` - default: `cuda`). +- `--model`: Nama folder model suara di dalam `weights/` yang ingin dimuat langsung saat start. + +Setelah server berjalan, Web UI akan otomatis terbuka di browser Anda pada alamat `http://localhost:8000`. diff --git a/frontend/app.js b/frontend/app.js new file mode 100644 index 0000000..ade723a --- /dev/null +++ b/frontend/app.js @@ -0,0 +1,744 @@ +/** + * Omni Real-Time Voice Changer - Client App + * High-performance browser-based mic streaming and RVC playback. + */ + +// UI Elements +const wsUrlInput = document.getElementById('ws_url'); +const connectionStatus = document.getElementById('connection_status'); +const connectBtn = document.getElementById('connect_btn'); +const streamBtn = document.getElementById('stream_btn'); +const playToggleBtn = document.getElementById('play_toggle_btn'); + +const modelSelect = document.getElementById('model_select'); +const deviceSelect = document.getElementById('device_select'); +const transposeSlider = document.getElementById('transpose_slider'); +const transposeVal = document.getElementById('transpose_val'); +const gateSlider = document.getElementById('gate_slider'); +const gateVal = document.getElementById('gate_val'); +const inputGainSlider = document.getElementById('input_gain_slider'); +const inputGainVal = document.getElementById('input_gain_val'); +const outputGainSlider = document.getElementById('output_gain_slider'); +const outputGainVal = document.getElementById('output_gain_val'); +const chunkSelect = document.getElementById('chunk_select'); +const noiseCancelCheckbox = document.getElementById('noise_cancel_checkbox'); +const routingModeSelect = document.getElementById('routing_mode_select'); +const hardwareDevicesPanel = document.getElementById('hardware_devices_panel'); +const serverInputSelect = document.getElementById('server_input_select'); +const serverOutputSelect = document.getElementById('server_output_select'); +const 
browserNoiseCancelGroup = document.getElementById('browser_noise_cancel_group'); + +const presetLatencyBtn = document.getElementById('preset_latency_btn'); +const presetQualityBtn = document.getElementById('preset_quality_btn'); + +const inputCanvas = document.getElementById('input_canvas'); +const outputCanvas = document.getElementById('output_canvas'); + +const hudLatency = document.getElementById('hud_latency'); +const hudTime = document.getElementById('hud_time'); +const hudGateStatus = document.getElementById('hud_gate_status'); +const hudSr = document.getElementById('hud_sr'); + +// Audio Visualizer Contexts +const inputCtx = inputCanvas.getContext('2d'); +const outputCtx = outputCanvas.getContext('2d'); + +// Web Audio State +let audioContext = null; +let micStream = null; +let micSourceNode = null; +let scriptProcessorNode = null; +let micAccumulator = new Float32Array(0); // Accumulates audio for large/custom chunk sizes + +// WebSocket State +let socket = null; +let isStreaming = false; +let playOutput = true; +let targetSampleRate = 40000; // RVC Model default, updated dynamically + +// Playback Sync State +let nextPlaybackTime = 0; +const safetyDelay = 0.10; // 100ms buffer to absorb network/websocket jitter (increased for perfect smoothness!) + +// Latency Tracking Queues +let sentTimestamps = []; +const maxSentLogs = 50; + +// --- SMOOTH VISUALIZER (Rolling Display Buffers + RAF loop) --- +// Fixed display buffer size: ~85ms window looks great at all chunk sizes. 
/**
 * Scrolls a fixed-size rolling display buffer left and appends the newest samples at its end.
 *
 * The window size is taken from `displayBuf.length`, so the function works for any buffer
 * size rather than being tied to one global constant.
 *
 * @param {Float32Array} displayBuf - Rolling buffer, mutated in place; its length never changes.
 * @param {Float32Array|number[]} newSamples - Samples to append, oldest first.
 */
function pushToDisplayBuf(displayBuf, newSamples) {
    const capacity = displayBuf.length;
    const incoming = newSamples.length;

    if (incoming >= capacity) {
        // More new data than the window holds: keep only the most recent `capacity` samples.
        // subarray() is a view (no copy); plain arrays fall back to slice().
        const start = incoming - capacity;
        const tail = ArrayBuffer.isView(newSamples)
            ? newSamples.subarray(start)
            : newSamples.slice(start);
        displayBuf.set(tail);
        return;
    }

    // Shift the existing contents left by `incoming`, then write the new samples at the end.
    displayBuf.copyWithin(0, incoming);
    displayBuf.set(newSamples, capacity - incoming);
}
/**
 * Opens a WebSocket to the URL in the server-address field and installs its event handlers.
 *
 * Text frames are JSON control messages (`config_success`, `init_devices`, `visualizer`,
 * `error`); binary frames are processed audio and go to `handleServerAudioChunk`.
 *
 * Every handler first checks that the socket it belongs to is still the active one.
 * `disconnectServer()` clears the module-level `socket` before the browser delivers the
 * asynchronous `close` event, so without this check a late `close`/`error` from an old
 * socket would tear down a connection the user has since re-opened.
 */
function connectServer() {
    const url = wsUrlInput.value.trim();
    updateConnectionStatus('connecting');

    let ws;
    try {
        // The constructor throws synchronously on an empty or malformed URL.
        ws = new WebSocket(url);
        ws.binaryType = 'arraybuffer';
    } catch (e) {
        console.error('Connection failed:', e);
        disconnectServer();
        return;
    }
    socket = ws;

    // True only while `ws` is the connection the rest of the app is using.
    const isCurrent = () => socket === ws;

    ws.onopen = () => {
        if (!isCurrent()) return;
        console.log('Connected to RVC Server');
        updateConnectionStatus('connected');
        sendConfigToServer(); // Send initial configurations
        streamBtn.disabled = false;
        playToggleBtn.disabled = false;
    };

    ws.onclose = () => {
        console.log('WebSocket Connection Closed');
        if (isCurrent()) disconnectServer();
    };

    ws.onerror = (err) => {
        console.error('WebSocket Error:', err);
        if (isCurrent()) disconnectServer();
    };

    ws.onmessage = (event) => {
        if (!isCurrent()) return;

        if (typeof event.data === 'string') {
            // Config or control response
            try {
                const response = JSON.parse(event.data);
                if (response.type === 'config_success') {
                    // Only accept a usable sample rate: it is later passed to
                    // AudioContext.createBuffer, which throws on invalid values.
                    if (Number.isFinite(response.target_sr) && response.target_sr > 0) {
                        targetSampleRate = response.target_sr;
                    } else {
                        console.warn('Ignoring invalid target_sr from server:', response.target_sr);
                    }
                    console.log('Server configuration synced successfully:', response);
                } else if (response.type === 'init_devices') {
                    populateServerDevices(response.devices, response.default_input, response.default_output);
                } else if (response.type === 'visualizer') {
                    // Feed the rolling display buffers; the RAF loop does the drawing.
                    pushToDisplayBuf(inputDisplayBuf, new Float32Array(response.input));
                    pushToDisplayBuf(outputDisplayBuf, new Float32Array(response.output));
                    if (!rafHandle) startVisualizerLoop();
                } else if (response.type === 'error') {
                    alert('Server Error: ' + response.message);
                }
            } catch (e) {
                console.error('Error parsing text message:', e);
            }
        } else if (event.data instanceof ArrayBuffer) {
            // Binary processed PCM audio chunk returned from server (Browser Mode only)
            handleServerAudioChunk(event.data);
        }
    };
}
/**
 * Serializes the current UI settings into a `config` message and sends it to the server.
 *
 * Does nothing unless the WebSocket is open. If no `f0_method` radio is checked the key is
 * omitted from the message instead of throwing. Device ids that are empty or non-numeric
 * (e.g. the "no device detected" placeholder option) are sent as `null`.
 */
function sendConfigToServer() {
    if (!socket || socket.readyState !== WebSocket.OPEN) return;

    // querySelector returns null when no radio is checked; guard instead of dereferencing.
    const checkedF0 = document.querySelector('input[name="f0_method"]:checked');

    // Converts a <select> value to an integer device id, or null when it is not one.
    const toDeviceId = (rawValue) => {
        const id = Number.parseInt(rawValue, 10);
        return Number.isInteger(id) ? id : null;
    };

    const config = {
        type: 'config',
        model_name: modelSelect.value,
        device: deviceSelect.value,
        // undefined is dropped by JSON.stringify, so the key is simply absent.
        f0_method: checkedF0 ? checkedF0.value : undefined,
        f0_up_key: Number.parseInt(transposeSlider.value, 10),
        noise_gate: Number.parseFloat(gateSlider.value),
        input_gain: Number.parseFloat(inputGainSlider.value),
        output_gain: Number.parseFloat(outputGainSlider.value),
        // Before the AudioContext exists, 44100 is reported as a placeholder input rate.
        input_sr: audioContext ? audioContext.sampleRate : 44100,
        routing_mode: routingModeSelect.value,
        input_device: toDeviceId(serverInputSelect.value),
        output_device: toDeviceId(serverOutputSelect.value),
        chunk_size: Number.parseInt(chunkSelect.value, 10)
    };

    socket.send(jsonEncode(config));
    console.log('Sent configuration change:', config);
}
= true; + serverOutputSelect.appendChild(opt); + } + }); + + console.log('Successfully populated server hardware devices in UI.'); +} + +// UI Event Listeners to trigger instant sync +modelSelect.addEventListener('change', sendConfigToServer); +deviceSelect.addEventListener('change', sendConfigToServer); +document.querySelectorAll('input[name="f0_method"]').forEach(radio => { + radio.addEventListener('change', sendConfigToServer); +}); + +transposeSlider.addEventListener('input', () => { + transposeVal.textContent = (transposeSlider.value >= 0 ? '+' : '') + transposeSlider.value + ' semitone'; +}); +transposeSlider.addEventListener('change', sendConfigToServer); + +gateSlider.addEventListener('input', () => { + gateVal.textContent = gateSlider.value + ' dB'; +}); +gateSlider.addEventListener('change', sendConfigToServer); + +inputGainSlider.addEventListener('input', () => { + inputGainVal.textContent = parseFloat(inputGainSlider.value).toFixed(1) + 'x'; +}); +inputGainSlider.addEventListener('change', sendConfigToServer); + +outputGainSlider.addEventListener('input', () => { + outputGainVal.textContent = parseFloat(outputGainSlider.value).toFixed(1) + 'x'; +}); +outputGainSlider.addEventListener('change', sendConfigToServer); + +chunkSelect.addEventListener('change', () => { + // Reinitialize stream if buffer size is changed during active streaming + if (isStreaming) { + stopStreaming(); + startStreaming(); + } +}); + +noiseCancelCheckbox.addEventListener('change', () => { + // Reinitialize microphone with new noise cancellation constraints if streaming + if (isStreaming) { + stopStreaming(); + startStreaming(); + } +}); + +// Helper to dynamically adjust UI layout based on Routing Mode +function applyAudioRoutingUI() { + if (routingModeSelect.value === 'hardware') { + hardwareDevicesPanel.style.display = 'block'; + playToggleBtn.style.display = 'none'; // Hide browser-only "Mendengarkan" button + browserNoiseCancelGroup.style.display = 'none'; // Hide browser-only 
// Incremented on every startStreaming() call so an older, still-awaiting call can detect
// that it has been superseded.
let streamingSessionId = 0;

/**
 * Stops the microphone tracks and detaches the capture nodes of a browser-mode session.
 * Safe to call when nothing is active.
 */
function releaseBrowserCapture() {
    if (micStream) {
        micStream.getTracks().forEach((track) => track.stop());
        micStream = null;
    }
    if (micSourceNode) {
        micSourceNode.disconnect();
        micSourceNode = null;
    }
    if (scriptProcessorNode) {
        scriptProcessorNode.onaudioprocess = null;
        scriptProcessorNode.disconnect();
        scriptProcessorNode = null;
    }
    micAccumulator = new Float32Array(0);
}

/**
 * Starts a voice-changing session in the mode selected by the routing dropdown.
 *
 * Hardware mode: only the visualizer is started and the config is sent; this function does
 * no audio capture itself.
 *
 * Browser mode: creates/resumes the AudioContext, opens the microphone, and streams
 * fixed-size Float32 PCM chunks to the server through a ScriptProcessorNode.
 *
 * The function awaits twice (context resume, microphone permission). After each await it
 * checks that the session was neither stopped nor superseded by a newer call; if it was,
 * any microphone stream it obtained is released and it returns without touching shared state.
 *
 * @returns {Promise<void>} Resolves once the session is set up or abandoned.
 */
async function startStreaming() {
    const sessionId = ++streamingSessionId;
    const isStale = () => !isStreaming || sessionId !== streamingSessionId;

    isStreaming = true;
    streamBtn.textContent = 'Hentikan Pengubah Suara';
    streamBtn.className = 'btn btn-primary';

    if (routingModeSelect.value === 'hardware') {
        // --- SERVER HARDWARE ROUTING MODE ---
        // A browser-mode capture can still be alive here: on a browser -> hardware switch,
        // stopStreaming() already sees the new mode and skips its browser cleanup. Release
        // it, otherwise the old processor keeps sending microphone audio.
        releaseBrowserCapture();
        inputDisplayBuf = new Float32Array(VIS_DISPLAY_SIZE);
        outputDisplayBuf = new Float32Array(VIS_DISPLAY_SIZE);
        startVisualizerLoop();
        sendConfigToServer(); // routing_mode: 'hardware' triggers the stream start on the server
        console.log('Server Hardware Mode initialized.');
        return;
    }

    // --- CLIENT BROWSER MODE ---
    try {
        // 1. Create the AudioContext if needed. Kept inside the try so a failure restores the UI.
        if (!audioContext) {
            audioContext = new (window.AudioContext || window.webkitAudioContext)({
                latencyHint: 'interactive'
            });
        }

        if (audioContext.state === 'suspended') {
            await audioContext.resume();
            if (isStale()) return;
        }

        hudSr.textContent = audioContext.sampleRate + ' Hz';
        sendConfigToServer(); // sync actual input sample rate

        // 2. Request the microphone. One checkbox toggles all three browser DSP features.
        const useNoiseCancel = noiseCancelCheckbox.checked;
        const stream = await navigator.mediaDevices.getUserMedia({
            audio: {
                echoCancellation: useNoiseCancel,
                noiseSuppression: useNoiseCancel,
                autoGainControl: useNoiseCancel
            }
        });

        if (isStale()) {
            // Stopped or restarted while the permission prompt was open: do not keep the mic.
            stream.getTracks().forEach((track) => track.stop());
            return;
        }

        releaseBrowserCapture(); // drop any leftover capture before installing the new one
        micStream = stream;
        micSourceNode = audioContext.createMediaStreamSource(micStream);

        // 3. createScriptProcessor only accepts power-of-two buffer sizes from 256 to 16384.
        // Record with a fixed 4096 buffer and accumulate, so any chunk size can be sent.
        const recordBufferSize = 4096;
        scriptProcessorNode = audioContext.createScriptProcessor(recordBufferSize, 1, 1);

        scriptProcessorNode.onaudioprocess = (event) => {
            if (!isStreaming) return;

            const inputData = event.inputBuffer.getChannelData(0);

            // Push the latest mic samples into the rolling display buffer every callback.
            pushToDisplayBuf(inputDisplayBuf, inputData);

            const targetChunkSize = Number.parseInt(chunkSelect.value, 10);
            if (!(targetChunkSize > 0)) {
                // NaN or 0 would grow the accumulator forever or never leave the loop below.
                micAccumulator = new Float32Array(0);
                return;
            }

            // Append the incoming samples to the accumulator.
            const merged = new Float32Array(micAccumulator.length + inputData.length);
            merged.set(micAccumulator);
            merged.set(inputData, micAccumulator.length);
            micAccumulator = merged;

            // Send as many full chunks as are available; keep the remainder for next time.
            while (micAccumulator.length >= targetChunkSize) {
                const chunkToSend = micAccumulator.slice(0, targetChunkSize);
                micAccumulator = micAccumulator.slice(targetChunkSize);

                // Peak detection for the gate status badge.
                let maxVal = 0;
                for (let i = 0; i < chunkToSend.length; i++) {
                    maxVal = Math.max(maxVal, Math.abs(chunkToSend[i]));
                }
                if (maxVal > 0.005) {
                    hudGateStatus.textContent = 'Bicara';
                    hudGateStatus.className = 'hud-value active-badge';
                } else {
                    hudGateStatus.textContent = 'Berdiam';
                    hudGateStatus.className = 'hud-value text-muted';
                }

                // Send the chunk as raw Float32 PCM, remembering when it left for the RTT HUD.
                if (socket && socket.readyState === WebSocket.OPEN) {
                    const packetTime = performance.now();
                    sentTimestamps.push({ id: packetTime, sent: packetTime });
                    if (sentTimestamps.length > maxSentLogs) {
                        sentTimestamps.shift();
                    }

                    socket.send(chunkToSend.buffer);
                }
            }
        };

        micSourceNode.connect(scriptProcessorNode);
        scriptProcessorNode.connect(audioContext.destination); // Required to trigger onaudioprocess

        // Reset per-session state. Timestamps from an earlier session would otherwise be
        // paired with this session's replies and skew the RTT reading.
        nextPlaybackTime = 0;
        micAccumulator = new Float32Array(0);
        sentTimestamps = [];
        inputDisplayBuf = new Float32Array(VIS_DISPLAY_SIZE);
        outputDisplayBuf = new Float32Array(VIS_DISPLAY_SIZE);
        startVisualizerLoop();

        console.log('Browser Streaming active. Recording buffer size: 4096 | Target chunk size:', chunkSelect.value);
    } catch (e) {
        console.error('Failed to access microphone:', e);
        // A superseded call must not alert or stop the session that replaced it.
        if (sessionId !== streamingSessionId) return;
        alert('Gagal mengakses mikrofon Anda: ' + e.message);
        stopStreaming();
    }
}
/**
 * Handles one binary frame from the server: updates the latency HUD, schedules the audio on
 * the AudioContext timeline, and queues it for the time-synced output visualizer.
 *
 * Frame layout as read here: a Float32 array whose first element is the server processing
 * time in milliseconds, followed by mono PCM samples at `targetSampleRate`.
 *
 * Frames are ignored when no session is active, when no AudioContext exists, or when the
 * frame is not a whole number of float32 values or carries no samples. Those cases would
 * otherwise throw from the typed-array constructor or from `createBuffer`.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw frame received from the WebSocket.
 */
function handleServerAudioChunk(arrayBuffer) {
    if (!isStreaming || !audioContext) return;

    // 1. Round-trip time: pair this reply with the oldest outstanding send timestamp.
    const now = performance.now();
    if (sentTimestamps.length > 0) {
        const oldestSent = sentTimestamps.shift();
        hudLatency.textContent = Math.round(now - oldestSent.sent) + ' ms';
    }

    // 2. Validate the frame: one header float plus at least one sample.
    const bytesPerSample = Float32Array.BYTES_PER_ELEMENT;
    if (arrayBuffer.byteLength % bytesPerSample !== 0 || arrayBuffer.byteLength < 2 * bytesPerSample) {
        console.warn('Dropping malformed audio frame of', arrayBuffer.byteLength, 'bytes');
        return;
    }

    const payload = new Float32Array(arrayBuffer);
    const processingTime = payload[0]; // server processing time in ms
    const pcmData = payload.subarray(1); // the rest is the audio

    // 3. Build the audio buffer and a one-shot source for it.
    const audioBuf = audioContext.createBuffer(1, pcmData.length, targetSampleRate);
    audioBuf.getChannelData(0).set(pcmData);

    const source = audioContext.createBufferSource();
    source.buffer = audioBuf;

    // When muted the source is still scheduled, just not connected, so the playback clock
    // keeps advancing exactly as it does when audible.
    if (playOutput) {
        source.connect(audioContext.destination);
    }

    // 4. Decide where on the timeline this chunk starts.
    const currentTime = audioContext.currentTime;
    const chunkDuration = audioBuf.duration;
    // Headroom of 2.5 chunk durations, capped at 500 ms.
    const adaptiveBuf = Math.min(chunkDuration * 2.5, 0.50);

    if (nextPlaybackTime < currentTime) {
        // Clock is behind (first chunk or dropout): restart with full headroom.
        nextPlaybackTime = currentTime + adaptiveBuf;
    } else if (nextPlaybackTime > currentTime + chunkDuration * 5.0) {
        // More than five chunk durations queued ahead: snap back to bound the latency.
        const snapTarget = currentTime + adaptiveBuf;
        console.log(`Latency Buster: ${Math.round((nextPlaybackTime-currentTime)*1000)}ms → ${Math.round(adaptiveBuf*1000)}ms`);
        nextPlaybackTime = snapTarget;
    }

    // Remember the start time BEFORE advancing the clock (used by the visualizer).
    const scheduleStartTime = nextPlaybackTime;

    source.start(nextPlaybackTime);

    hudTime.textContent = Math.max(0, Math.round(processingTime)) + ' ms';

    // Advance the playback clock past this chunk.
    nextPlaybackTime += audioBuf.duration;

    // 5. Queue for the visualizer, keyed by when the audio actually plays.
    outputChunkQueue.push({ data: pcmData, startTime: scheduleStartTime });
    // Drop chunks that finished playing more than 2 seconds ago.
    while (outputChunkQueue.length > 0) {
        const oldest = outputChunkQueue[0];
        if (oldest.startTime + oldest.data.length / targetSampleRate < audioContext.currentTime - 2.0) {
            outputChunkQueue.shift();
        } else break;
    }
}
/**
 * Paints the whole canvas with the opaque dashboard background colour, wiping any waveform.
 *
 * @param {HTMLCanvasElement} canvas - Canvas to blank; its 2D context is used.
 */
function clearCanvas(canvas) {
    const painter = canvas.getContext('2d');
    const { width, height } = canvas;

    painter.fillStyle = '#0b0c13';
    painter.fillRect(0, 0, width, height);
}
+ +
+ +
+
+ +

🎙️ OMNI VOICE CHANGER

+
+

Pengubah Suara Real-Time AI Berlatensi Ultra Rendah menggunakan Akselerasi ONNX Runtime

+
+ + +
+
+
+ + +
+
+ Terputus +
+
+ + + +
+
+
+ + +
+ + +
+

⚙️ Konfigurasi Model & Perangkat

+ + +
+ +
+ + +
+
+ +
+ + +
+ +
+ + +
+ + +
+ + +
+ + + +
+ +
+ + + + +
+
+ +
+
+ + 0 semitone +
+ +
+ -24 (Pria Berat) + 0 (Asli) + +24 (Wanita/Anime) +
+
+
+ + +
+

🎛️ Pemrosesan Audio (DSP)

+ +
+
+ + -40 dB +
+ +
+ -60 dB (Sensitif) + -40 dB (Default) + -10 dB (Ketat) +
+
+ +
+
+ + 1.0x +
+ +
+ +
+
+ + 1.0x +
+ +
+ +
+ +
+ +
+ + +
+
+ + +
+

📊 Live Audio Waveform & Visualizer

+ +
+
+
+ + Sinyal Mikrofon (Input) +
+ +
+
+
+ + Hasil AI Voice (Output) +
+ +
+
+
+ +
+ + +
+
+ Latensi Pulang-Pergi (RTT) + -- ms +
+
+
+ Waktu Pemrosesan + -- ms +
+
+
+ Sinyal Suara + Berdiam +
+
+
+ Frekuensi Audio + 44100 Hz +
+
+
+ + + + + diff --git a/frontend/styles.css b/frontend/styles.css new file mode 100644 index 0000000..12a717b --- /dev/null +++ b/frontend/styles.css @@ -0,0 +1,595 @@ +/* ========================================================================== + CSS GLOBAL TOKENS & RESET + ========================================================================== */ +:root { + --bg-dark: #07080e; + --bg-card: rgba(13, 17, 30, 0.7); + --border-color: rgba(99, 102, 241, 0.18); + + --primary: #6366f1; + --primary-glow: rgba(99, 102, 241, 0.4); + --accent: #a855f7; + --accent-glow: rgba(168, 85, 247, 0.45); + --emerald: #10b981; + --rose: #ef4444; + + --text-main: #e2e8f0; + --text-muted: #94a3b8; + --font-header: 'Outfit', 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; + --font-body: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; + + --transition-smooth: all 0.3s cubic-bezier(0.25, 0.8, 0.25, 1); +} + +* { + box-sizing: border-box; + margin: 0; + padding: 0; +} + +body { + background-color: var(--bg-dark); + color: var(--text-main); + font-family: var(--font-body); + min-height: 100vh; + overflow-x: hidden; + position: relative; + padding: 2rem 1.5rem; +} + +/* ========================================================================== + DYNAMIC GLOWING BACKGROUND + ========================================================================== */ +.glow-backdrop { + position: fixed; + top: 0; + left: 0; + right: 0; + bottom: 0; + z-index: -1; + background: + radial-gradient(circle at 10% 20%, rgba(99, 102, 241, 0.08) 0%, transparent 40%), + radial-gradient(circle at 90% 80%, rgba(168, 85, 247, 0.09) 0%, transparent 45%); + pointer-events: none; +} + +/* ========================================================================== + LAYOUT CONTAINER & CARDS + ========================================================================== */ +.dashboard-container { + max-width: 1200px; + margin: 0 auto; + display: flex; + flex-direction: column; + gap: 1.5rem; +} + 
+.glassmorphism { + background: var(--bg-card); + backdrop-filter: blur(16px); + -webkit-backdrop-filter: blur(16px); + border: 1px solid var(--border-color); + border-radius: 16px; + box-shadow: 0 8px 32px 0 rgba(0, 0, 0, 0.37); + transition: var(--transition-smooth); +} + +.glassmorphism:hover { + border-color: rgba(99, 102, 241, 0.3); + box-shadow: 0 10px 40px 0 rgba(99, 102, 241, 0.1); +} + +.card { + padding: 1.75rem; +} + +.card-title { + font-family: var(--font-header); + font-size: 1.25rem; + font-weight: 600; + margin-bottom: 1.25rem; + background: linear-gradient(135deg, #fff 0%, var(--text-muted) 100%); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + border-bottom: 1px solid rgba(255, 255, 255, 0.05); + padding-bottom: 0.75rem; +} + +/* ========================================================================== + APP HEADER + ========================================================================== */ +.app-header { + text-align: center; + margin-bottom: 1rem; +} + +.logo-area { + display: inline-flex; + align-items: center; + gap: 0.75rem; + margin-bottom: 0.5rem; +} + +.logo-area h1 { + font-family: var(--font-header); + font-size: 2.5rem; + font-weight: 800; + letter-spacing: -0.5px; + background: linear-gradient(135deg, var(--primary) 0%, var(--accent) 100%); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + text-shadow: 0 0 40px rgba(99, 102, 241, 0.2); +} + +.pulse-indicator { + width: 10px; + height: 10px; + border-radius: 50%; + background-color: var(--rose); + box-shadow: 0 0 10px var(--rose); +} + +.pulse-indicator.active { + background-color: var(--emerald); + box-shadow: 0 0 10px var(--emerald); + animation: pulse 1.8s infinite; +} + +.tagline { + color: var(--text-muted); + font-size: 0.95rem; + font-weight: 400; + max-width: 600px; + margin: 0 auto; +} + +/* ========================================================================== + DASHBOARD GRID LAYOUT + 
========================================================================== */ +.dashboard-grid { + display: grid; + grid-template-columns: repeat(2, 1fr); + gap: 1.5rem; +} + +@media (max-width: 768px) { + .dashboard-grid { + grid-template-columns: 1fr; + } + .col-span-2 { + grid-column: span 1 !important; + } +} + +.col-span-2 { + grid-column: span 2; +} + +/* ========================================================================== + INPUTS & CONTROLS + ========================================================================== */ +.control-group { + margin-bottom: 1.25rem; +} + +.control-group:last-child { + margin-bottom: 0; +} + +label { + display: block; + font-size: 0.85rem; + font-weight: 500; + color: var(--text-muted); + margin-bottom: 0.5rem; + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.custom-select { + width: 100%; + padding: 0.8rem 1rem; + background-color: rgba(20, 24, 45, 0.8); + border: 1px solid var(--border-color); + border-radius: 8px; + color: var(--text-main); + font-size: 0.9rem; + font-family: var(--font-body); + outline: none; + transition: var(--transition-smooth); + cursor: pointer; + appearance: none; + background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='24' height='24' viewBox='0 0 24 24' fill='none' stroke='%2394a3b8' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'%3E%3C/polyline%3E%3C/svg%3E"); + background-repeat: no-repeat; + background-position: right 1rem center; + background-size: 1.2rem; +} + +.custom-select:focus { + border-color: var(--primary); + box-shadow: 0 0 8px var(--primary-glow); +} + +.input-group input { + background-color: rgba(20, 24, 45, 0.8); + border: 1px solid var(--border-color); + border-radius: 8px; + color: var(--text-main); + padding: 0.8rem 1rem; + width: 100%; + font-family: var(--font-body); + font-size: 0.9rem; + outline: none; + transition: var(--transition-smooth); +} + +.input-group 
input:focus { + border-color: var(--primary); + box-shadow: 0 0 8px var(--primary-glow); +} + +/* ========================================================================== + SLIDERS STYLING + ========================================================================== */ +.slider-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 0.25rem; +} + +.slider-value { + font-family: var(--font-header); + font-weight: 600; + color: var(--accent); + text-shadow: 0 0 8px var(--accent-glow); + font-size: 0.95rem; +} + +.custom-slider { + -webkit-appearance: none; + width: 100%; + height: 6px; + border-radius: 3px; + background: rgba(99, 102, 241, 0.15); + outline: none; + margin: 0.75rem 0; +} + +.custom-slider::-webkit-slider-thumb { + -webkit-appearance: none; + appearance: none; + width: 18px; + height: 18px; + border-radius: 50%; + background: linear-gradient(135deg, var(--primary) 0%, var(--accent) 100%); + cursor: pointer; + box-shadow: 0 0 10px var(--primary-glow); + transition: transform 0.1s ease; +} + +.custom-slider::-webkit-slider-thumb:hover { + transform: scale(1.2); +} + +.slider-ticks { + display: flex; + justify-content: space-between; + font-size: 0.75rem; + color: var(--text-muted); +} + +/* ========================================================================== + BUTTONS + ========================================================================== */ +.btn { + padding: 0.8rem 1.5rem; + border-radius: 8px; + font-family: var(--font-header); + font-weight: 600; + font-size: 0.9rem; + cursor: pointer; + border: none; + outline: none; + transition: var(--transition-smooth); + display: inline-flex; + align-items: center; + justify-content: center; + gap: 0.5rem; +} + +.btn-primary { + background: linear-gradient(135deg, var(--primary) 0%, #4f46e5 100%); + color: white; + box-shadow: 0 4px 14px 0 var(--primary-glow); +} + +.btn-primary:hover:not(:disabled) { + transform: translateY(-2px); + box-shadow: 0 6px 20px 
0 rgba(99, 102, 241, 0.6); +} + +.btn-accent { + background: linear-gradient(135deg, var(--accent) 0%, #7c3aed 100%); + color: white; + box-shadow: 0 4px 14px 0 var(--accent-glow); +} + +.btn-accent:hover:not(:disabled) { + transform: translateY(-2px); + box-shadow: 0 6px 20px 0 rgba(168, 85, 247, 0.65); +} + +.btn:active:not(:disabled) { + transform: translateY(0); +} + +.btn:disabled { + opacity: 0.5; + cursor: not-allowed; + box-shadow: none; +} + +/* ========================================================================== + CONNECTION BAR + ========================================================================== */ +.connection-bar { + padding: 1rem 1.5rem !important; +} + +.form-row { + display: flex; + align-items: flex-end; + gap: 1.5rem; + flex-wrap: wrap; +} + +.form-row .input-group { + flex: 1; + min-width: 250px; +} + +.connection-status-container { + display: flex; + align-items: center; + height: 48px; +} + +.status-badge { + padding: 0.4rem 0.8rem; + border-radius: 20px; + font-size: 0.8rem; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.5px; + display: inline-flex; + align-items: center; + gap: 0.35rem; +} + +.status-badge::before { + content: ''; + display: inline-block; + width: 6px; + height: 6px; + border-radius: 50%; +} + +.status-badge.connected { + background-color: rgba(16, 185, 129, 0.15); + color: var(--emerald); + border: 1px solid rgba(16, 185, 129, 0.3); +} + +.status-badge.connected::before { + background-color: var(--emerald); + box-shadow: 0 0 6px var(--emerald); +} + +.status-badge.disconnected { + background-color: rgba(239, 68, 68, 0.15); + color: var(--rose); + border: 1px solid rgba(239, 68, 68, 0.3); +} + +.status-badge.disconnected::before { + background-color: var(--rose); + box-shadow: 0 0 6px var(--rose); +} + +.status-badge.connecting { + background-color: rgba(168, 85, 247, 0.15); + color: var(--accent); + border: 1px solid rgba(168, 85, 247, 0.3); +} + +.status-badge.connecting::before { + 
background-color: var(--accent); + box-shadow: 0 0 6px var(--accent); + animation: blink 1s infinite; +} + +.btn-group-row { + display: flex; + gap: 0.75rem; + height: 48px; +} + +/* ========================================================================== + MODERN RADIO TILES + ========================================================================== */ +.radio-group-modern { + display: grid; + grid-template-columns: repeat(2, 1fr); + gap: 0.5rem; +} + +.radio-tile { + position: relative; + cursor: pointer; + margin: 0; +} + +.radio-tile input { + position: absolute; + opacity: 0; +} + +.tile-label { + display: block; + padding: 0.6rem; + background-color: rgba(20, 24, 45, 0.5); + border: 1px solid var(--border-color); + border-radius: 8px; + text-align: center; + font-size: 0.8rem; + font-weight: 500; + color: var(--text-muted); + transition: var(--transition-smooth); +} + +.radio-tile input:checked + .tile-label { + background-color: rgba(99, 102, 241, 0.12); + border-color: var(--primary); + color: var(--text-main); + box-shadow: 0 0 10px rgba(99, 102, 241, 0.2); +} + +.radio-tile:hover .tile-label { + border-color: rgba(99, 102, 241, 0.4); +} + +/* ========================================================================== + OSCILLOSCOPE WAVEFORM CANVASES + ========================================================================== */ +.visualizer-row { + display: flex; + gap: 1.5rem; + flex-wrap: wrap; +} + +.visualizer-container { + flex: 1; + min-width: 280px; + display: flex; + flex-direction: column; + gap: 0.5rem; +} + +.vis-label { + display: flex; + align-items: center; + gap: 0.5rem; + font-size: 0.8rem; + font-weight: 500; + color: var(--text-muted); +} + +.dot { + width: 6px; + height: 6px; + border-radius: 50%; +} + +.input-dot { + background-color: var(--primary); + box-shadow: 0 0 6px var(--primary); +} + +.output-dot { + background-color: var(--accent); + box-shadow: 0 0 6px var(--accent); +} + +.waveform-canvas { + width: 100%; + height: 150px; 
+ background-color: #0b0c13; + border-radius: 8px; + border: 1px solid rgba(255, 255, 255, 0.03); +} + +/* ========================================================================== + PERFORMANCE HUD + ========================================================================== */ +.performance-hud { + display: flex; + justify-content: space-between; + align-items: center; + padding: 0.85rem 1.75rem !important; +} + +.hud-item { + display: flex; + flex-direction: column; + gap: 0.15rem; +} + +.hud-label { + font-size: 0.7rem; + text-transform: uppercase; + letter-spacing: 1px; + color: var(--text-muted); + font-weight: 500; +} + +.hud-value { + font-family: var(--font-header); + font-size: 1.1rem; + font-weight: 700; + color: white; +} + +.hud-separator { + width: 1px; + height: 30px; + background-color: rgba(255, 255, 255, 0.08); +} + +.hud-value.text-accent { + color: var(--accent); + text-shadow: 0 0 8px var(--accent-glow); +} + +.active-badge { + color: var(--emerald); + text-shadow: 0 0 6px rgba(16, 185, 129, 0.4); +} + +@media (max-width: 600px) { + .performance-hud { + flex-direction: column; + align-items: flex-start; + gap: 0.75rem; + } + .hud-separator { + display: none; + } +} + +/* ========================================================================== + KEYFRAME ANIMATIONS + ========================================================================== */ +@keyframes pulse { + 0% { + transform: scale(0.9); + box-shadow: 0 0 0 0 rgba(16, 185, 129, 0.7); + } + 70% { + transform: scale(1.1); + box-shadow: 0 0 0 10px rgba(16, 185, 129, 0); + } + 100% { + transform: scale(0.9); + box-shadow: 0 0 0 0 rgba(16, 185, 129, 0); + } +} + +@keyframes blink { + 0%, 100% { + opacity: 1; + } + 50% { + opacity: 0.4; + } +} diff --git a/lib/infer_pack/attentions.py b/lib/infer_pack/attentions.py new file mode 100644 index 0000000..639cccf --- /dev/null +++ b/lib/infer_pack/attentions.py @@ -0,0 +1,436 @@ +import copy +import math +import numpy as np +import torch +from 
torch import nn +from torch.nn import functional as F + +from lib.infer_pack import commons +from lib.infer_pack import modules +from lib.infer_pack.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + 
self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + 
self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + import torch.onnx.operators as onnx_ops + shape_k = onnx_ops.shape_as_tensor(key) + shape_q = onnx_ops.shape_as_tensor(query) + b = shape_k[0] + d = shape_k[1] + t_s = shape_k[2] + t_t = shape_q[2] + + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), 
key.transpose(-2, -1)) + if self.window_size is not None: + # In ONNX, dynamic assertions sometimes trigger constant errors, so the t_s == t_t check is bypassed at the graph level on the basis that it always holds in self-attention + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def 
_get_relative_embeddings(self, relative_embeddings, length): + if not isinstance(length, torch.Tensor): + length = torch.tensor(length, dtype=torch.int64, device=relative_embeddings.device) + else: + length = length.to(dtype=torch.int64, device=relative_embeddings.device) + + max_relative_position = 2 * self.window_size + 1 + pad_length = torch.clamp(length - (self.window_size + 1), min=0) + slice_start_position = torch.clamp((self.window_size + 1) - length, min=0) + slice_end_position = slice_start_position + 2 * length - 1 + + h = relative_embeddings.shape[0] + d = relative_embeddings.shape[2] + + # dynamic pad using torch.cat and torch.zeros + zeros = torch.zeros(h, pad_length, d, dtype=relative_embeddings.dtype, device=relative_embeddings.device) + padded_relative_embeddings = torch.cat([zeros, relative_embeddings, zeros], dim=1) + + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + import torch.onnx.operators as onnx_ops + shape = onnx_ops.shape_as_tensor(x) + batch = shape[0] + heads = shape[1] + length = shape[2] + + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, [0, 1]) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view(batch, heads, -1) + zeros = torch.zeros(batch, heads, length - 1, dtype=x.dtype, device=x.device) + x_flat = torch.cat([x_flat, zeros], dim=-1) + + # Reshape and slice out the padded elements. 
+ x_final = x_flat.view(batch, heads, length + 1, 2 * length - 1)[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + import torch.onnx.operators as onnx_ops + shape = onnx_ops.shape_as_tensor(x) + batch = shape[0] + heads = shape[1] + length = shape[2] + + # padd along column + zeros = torch.zeros(batch, heads, length, length - 1, dtype=x.dtype, device=x.device) + x = torch.cat([x, zeros], dim=-1) + x_flat = x.view(batch, heads, -1) + # add 0's in the beginning that will skew the elements after reshape + zeros_beg = torch.zeros(batch, heads, length, dtype=x.dtype, device=x.device) + x_flat = torch.cat([zeros_beg, x_flat], dim=-1) + x_final = x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = 
self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/lib/infer_pack/commons.py b/lib/infer_pack/commons.py new file mode 100644 index 0000000..5447098 --- /dev/null +++ b/lib/infer_pack/commons.py @@ -0,0 +1,166 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in 
range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def slice_segments2(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + 
+@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/lib/infer_pack/models.py b/lib/infer_pack/models.py new file mode 100644 index 0000000..3665d03 --- /dev/null +++ b/lib/infer_pack/models.py @@ -0,0 +1,1142 
@@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from lib.infer_pack import modules +from lib.infer_pack import attentions +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from lib.infer_pack.commons import init_weights +import numpy as np +from lib.infer_pack import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + 
n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ 
= flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, 
upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + 
noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = 
F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + 
upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = 
self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, 
+ upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + nsff0 = nsff0[:, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid(nn.Module): + def __init__( + self, + spec_channels, + 
segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + 
+ def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + nsff0 = nsff0[:, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = 
resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) 
* 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + 
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + 
fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = 
period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/lib/infer_pack/models_dml.py b/lib/infer_pack/models_dml.py new file mode 100644 index 0000000..958d7b2 --- /dev/null +++ b/lib/infer_pack/models_dml.py @@ -0,0 +1,1124 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from lib.infer_pack import modules +from lib.infer_pack import attentions +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from lib.infer_pack.commons import init_weights +import numpy as np +from lib.infer_pack 
import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + 
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + 
self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + 
self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv.float() + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, 
length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above 
F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else 
modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class 
SynthesizerTrnMs256NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + 
self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = 
p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # 
print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + 
upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + 
self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, 
sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) 
+ fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t 
= x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/lib/infer_pack/models_onnx.py b/lib/infer_pack/models_onnx.py new file mode 100644 index 0000000..963e67b --- /dev/null +++ b/lib/infer_pack/models_onnx.py @@ -0,0 +1,819 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from lib.infer_pack import modules +from lib.infer_pack import attentions +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from lib.infer_pack.commons import init_weights +import numpy as np +from lib.infer_pack import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = 
self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + 
self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def 
__init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold 
= 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + 
scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source 
harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + 
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMsNSFsidM(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + version, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = 
upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + if version == "v1": + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + else: + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + self.speaker_map = None + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def construct_spkmixmap(self, n_speaker): + self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) + for i in range(n_speaker): + self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) + self.speaker_map = self.speaker_map.unsqueeze(0) + + def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): + if self.speaker_map is not None: # [N, S] * [S, B, 1, H] + g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] + g = g * self.speaker_map # [N, S, B, 1, H] + g = torch.sum(g, dim=1) # [N, 1, B, 1, H] + g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, 
N] + else: + g = g.unsqueeze(0) + g = self.emb_g(g).transpose(1, 2) + + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return 
y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 
0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/lib/infer_pack/modules.py b/lib/infer_pack/modules.py new file mode 100644 index 0000000..c83289d --- /dev/null +++ b/lib/infer_pack/modules.py @@ -0,0 +1,522 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from lib.infer_pack.transforms import piecewise_rational_quadratic_transform + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = 
self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = 
self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + 
if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class 
ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class ConvFlow(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = 
DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] + + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] + + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x diff --git a/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py new file mode 100644 index 0000000..ee3171b --- /dev/null +++ b/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -0,0 +1,90 @@ +from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +import pyworld +import numpy as np + + +class DioF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 
0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def resize_f0(self, x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * target_len, len(source)) / target_len, + np.arange(0, len(source)), + source, + ) + res = np.nan_to_num(target) + return res + + def compute_f0(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.dio( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] + + def compute_f0_uv(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.dio( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/lib/infer_pack/modules/F0Predictor/F0Predictor.py b/lib/infer_pack/modules/F0Predictor/F0Predictor.py new file mode 100644 index 0000000..f56e49e --- /dev/null +++ 
b/lib/infer_pack/modules/F0Predictor/F0Predictor.py @@ -0,0 +1,16 @@ +class F0Predictor(object): + def compute_f0(self, wav, p_len): + """ + input: wav:[signal_length] + p_len:int + output: f0:[signal_length//hop_length] + """ + pass + + def compute_f0_uv(self, wav, p_len): + """ + input: wav:[signal_length] + p_len:int + output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] + """ + pass diff --git a/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py new file mode 100644 index 0000000..96f08a1 --- /dev/null +++ b/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -0,0 +1,87 @@ +from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +import pyworld +import numpy as np + + +class HarvestF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + self.fs = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def resize_f0(self, x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp( + 
np.arange(0, len(source) * target_len, len(source)) / target_len, + np.arange(0, len(source)), + source, + ) + res = np.nan_to_num(target) + return res + + def compute_f0(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.harvest( + wav.astype(np.double), + fs=self.sampling_rate, + f0_ceil=self.f0_max, + f0_floor=self.f0_min, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] + + def compute_f0_uv(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.harvest( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py b/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py new file mode 100644 index 0000000..2c4918f --- /dev/null +++ b/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py @@ -0,0 +1,102 @@ +from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +import parselmouth +import numpy as np + + +class PMF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): 
+ if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def resize_f0(self, x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * target_len, len(source)) / target_len, + np.arange(0, len(source)), + source, + ) + res = np.nan_to_num(target) + return res + + def compute_f0(self, wav, p_len=None): + x = wav + if p_len is None: + p_len = x.shape[0] // self.hop_length + + time_step = self.hop_length / self.sampling_rate * 1000 + f0 = ( + parselmouth.Sound(x, self.sampling_rate) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max, + ) + .selected_array["frequency"] + ) + + f0_resized = self.resize_f0(f0, p_len) + f0_interpolated, uv = self.interpolate_f0(f0_resized) + return f0_interpolated + + def compute_f0_uv(self, wav, p_len=None): + x = wav + if p_len is None: + p_len = x.shape[0] // self.hop_length + + time_step = self.hop_length / self.sampling_rate * 1000 + f0 = ( + parselmouth.Sound(x, self.sampling_rate) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max, + ) + .selected_array["frequency"] + ) + + f0_resized = self.resize_f0(f0, p_len) + f0_interpolated, uv = self.interpolate_f0(f0_resized) + return f0_interpolated, uv diff --git a/lib/infer_pack/modules/F0Predictor/__init__.py b/lib/infer_pack/modules/F0Predictor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/infer_pack/onnx_inference.py b/lib/infer_pack/onnx_inference.py new file mode 
100644 index 0000000..bf752f7 --- /dev/null +++ b/lib/infer_pack/onnx_inference.py @@ -0,0 +1,277 @@ +import onnxruntime +import librosa +import numpy as np +import soundfile + + +def load_audio_fast(path, target_sr): + # 1. Coba torchaudio (sangat cepat, ~11ms) + try: + import torchaudio + wav, sr = torchaudio.load(path) + if sr != target_sr: + import torchaudio.transforms as T + resampler = T.Resample(sr, target_sr) + wav = resampler(wav) + if wav.shape[0] > 1: + wav = wav.mean(dim=0) + wav = wav.squeeze().numpy() + return wav, target_sr + except Exception: + pass + + # 2. Coba pydub (cepat, ~80ms) + try: + from pydub import AudioSegment + audio_seg = AudioSegment.from_file(path) + audio_seg = audio_seg.set_frame_rate(target_sr).set_channels(1) + wav = np.array(audio_seg.get_array_of_samples(), dtype=np.float32) / 32768.0 + return wav, target_sr + except Exception: + pass + + # 3. Fallback ke librosa asli + return librosa.load(path, sr=target_sr, mono=True) + +class ContentVec: + def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None): + print("load model(s) from {}".format(vec_path)) + import onnxruntime as ort + opts = ort.SessionOptions() + opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + opts.intra_op_num_threads = 2 + + if device == "cpu" or device is None: + providers = ["CPUExecutionProvider"] + elif device == "cuda": + providers = [ + ("CUDAExecutionProvider", { + "device_id": 0, + "arena_extend_strategy": "kNextPowerOfTwo", + "cudnn_conv_algo_search": "EXHAUSTIVE", + "do_copy_in_default_stream": True, + }), + "CPUExecutionProvider" + ] + elif device == "dml": + providers = ["DmlExecutionProvider"] + else: + raise RuntimeError("Unsportted Device") + self.model = ort.InferenceSession(vec_path, sess_options=opts, providers=providers) + + def __call__(self, wav): + return self.forward(wav) + + def forward(self, wav): + feats = wav + if feats.ndim == 2: # double channels + feats = feats.mean(-1) + assert 
feats.ndim == 1, feats.ndim + feats = np.expand_dims(np.expand_dims(feats, 0), 0) + onnx_input = {self.model.get_inputs()[0].name: feats} + logits = self.model.run(None, onnx_input)[0] + return logits.transpose(0, 2, 1) + + +class RMVPEF0Predictor: + def __init__(self, model_path="rmvpe.pt", is_half=False, device="cpu", sampling_rate=40000): + import torch + from rmvpe import RMVPE + self.model = RMVPE(model_path, is_half=is_half, device=device) + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + data = np.reshape(f0, (f0.size, 1)) + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def compute_f0(self, wav16k, p_len): + # Input 'wav16k' sudah pasti berada di 16000 Hz karena di-bypass pada tingkat atas + f0 = self.model.infer_from_audio(wav16k, thred=0.03) + + # Resize f0 to match p_len perfectly using np.interp (sama dengan resize_f0 di Dio) + source = np.array(f0) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * p_len, len(source)) / p_len, + np.arange(0, len(source)), + source, + ) + res = np.nan_to_num(target) + + # Lakukan interpolasi agar kontinu (menghindari suara robotik & glitch pitch) + return self.interpolate_f0(res)[0] + + +def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): + device = kargs.get("device", "cpu") + if f0_predictor 
== "pm": + from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor + + f0_predictor_object = PMF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + elif f0_predictor == "harvest": + from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( + HarvestF0Predictor, + ) + + f0_predictor_object = HarvestF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + elif f0_predictor == "dio": + from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor + + f0_predictor_object = DioF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + elif f0_predictor == "rmvpe": + is_half = kargs.get("is_half", False) + f0_predictor_object = RMVPEF0Predictor( + model_path="rmvpe.pt", + is_half=is_half, + device=device, + sampling_rate=sampling_rate + ) + else: + raise Exception("Unknown f0 predictor") + return f0_predictor_object + + +class OnnxRVC: + def __init__( + self, + model_path, + sr=40000, + hop_size=512, + vec_path="vec-768-layer-12", + device="cpu", + ): + vec_path = f"pretrained/{vec_path}.onnx" + self.vec_model = ContentVec(vec_path, device) + import onnxruntime as ort + opts = ort.SessionOptions() + opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + opts.intra_op_num_threads = 2 + + if device == "cpu" or device is None: + providers = ["CPUExecutionProvider"] + elif device == "cuda": + providers = [ + ("CUDAExecutionProvider", { + "device_id": 0, + "arena_extend_strategy": "kNextPowerOfTwo", + "cudnn_conv_algo_search": "EXHAUSTIVE", + "do_copy_in_default_stream": True, + }), + "CPUExecutionProvider" + ] + elif device == "dml": + providers = ["DmlExecutionProvider"] + else: + raise RuntimeError("Unsportted Device") + self.model = ort.InferenceSession(model_path, sess_options=opts, providers=providers) + self.sampling_rate = sr + self.hop_size = hop_size + self.device = device + + def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd): + onnx_input = 
{ + self.model.get_inputs()[0].name: hubert, + self.model.get_inputs()[1].name: hubert_length, + self.model.get_inputs()[2].name: pitch, + self.model.get_inputs()[3].name: pitchf, + self.model.get_inputs()[4].name: ds, + self.model.get_inputs()[5].name: rnd, + } + return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16) + + def inference( + self, + raw_path, + sid, + f0_method="dio", + f0_up_key=0, + pad_time=0.5, + cr_threshold=0.02, + rmvpe_fp16=False, + ): + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0_predictor = get_f0_predictor( + f0_method, + hop_length=self.hop_size, + sampling_rate=self.sampling_rate, + threshold=cr_threshold, + device=self.device, + is_half=rmvpe_fp16, + ) + if f0_method == "rmvpe": + wav16k, sr = load_audio_fast(raw_path, 16000) + org_length = int(len(wav16k) * (self.sampling_rate / 16000)) + if len(wav16k) / 16000 > 50.0: + raise RuntimeError("Reached Max Length") + else: + wav, sr = load_audio_fast(raw_path, self.sampling_rate) + org_length = len(wav) + if org_length / sr > 50.0: + raise RuntimeError("Reached Max Length") + wav16k, _ = load_audio_fast(raw_path, 16000) + + hubert = self.vec_model(wav16k) + hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) + hubert_length = hubert.shape[1] + + if f0_method == "rmvpe": + pitchf = f0_predictor.compute_f0(wav16k, hubert_length) + else: + pitchf = f0_predictor.compute_f0(wav, hubert_length) + pitchf = pitchf * 2 ** (f0_up_key / 12) + pitch = pitchf.copy() + f0_mel = 1127 * np.log(1 + pitch / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + pitch = np.rint(f0_mel).astype(np.int64) + + pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) + pitch = pitch.reshape(1, len(pitch)) + ds = np.array([sid]).astype(np.int64) + + rnd = np.random.randn(1, 192, 
hubert_length).astype(np.float32) + hubert_length = np.array([hubert_length]).astype(np.int64) + + out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze() + out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") + return out_wav[0:org_length] diff --git a/lib/infer_pack/transforms.py b/lib/infer_pack/transforms.py new file mode 100644 index 0000000..a11f799 --- /dev/null +++ b/lib/infer_pack/transforms.py @@ -0,0 +1,209 @@ +import torch +from torch.nn import functional as F + +import numpy as np + + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + + +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + inside_interval_mask = 
(inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == "linear": + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError("{} tails are not implemented.".format(tails)) + + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + + return outputs, logabsdet + + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if torch.min(inputs) < left or torch.max(inputs) > right: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError("Minimal bin width too large for the number of bins") + if min_bin_height * num_bins > 1.0: + raise ValueError("Minimal bin height too large for the number of bins") + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - 
min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = 
input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2a0c934 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +numpy==1.23.5 +onnxruntime-gpu +sounddevice +websockets +librosa==0.9.1 +soundfile==0.13.1 +pydub==0.25.1 +torch +torchaudio +scipy diff --git a/rmvpe.py b/rmvpe.py new file mode 100644 index 0000000..06c9a93 --- /dev/null +++ b/rmvpe.py @@ -0,0 +1,445 @@ +import sys, torch, numpy as np, traceback, pdb +import torch.nn as nn +from time import time as ttime +import torch.nn.functional as F + + +class BiGRU(nn.Module): + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] + + +class ConvBlockRes(nn.Module): + def 
__init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + self.is_shortcut = True + else: + self.is_shortcut = False + + def forward(self, x): + if self.is_shortcut: + return self.conv(x) + self.shortcut(x) + else: + return self.conv(x) + x + + +class Encoder(nn.Module): + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x): + concat_tensors = [] + x = self.bn(x) + for i in range(self.n_encoders): + _, x = self.layers[i](x) + concat_tensors.append(_) + return x, concat_tensors + + +class ResEncoderBlock(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + 
for i in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i in range(self.n_blocks): + x = self.conv[i](x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +class Intermediate(nn.Module): # + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for i in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i in range(self.n_inters): + x = self.layers[i](x) + return x + + +class ResDecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i in range(self.n_blocks): + x = self.conv2[i](x) + return x + + +class Decoder(nn.Module): + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = 
nn.ModuleList() + self.n_decoders = n_decoders + for i in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x, concat_tensors): + for i in range(self.n_decoders): + x = self.layers[i](x, concat_tensors[-1 - i]) + return x + + +class DeepUnet(nn.Module): + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, 360), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + return x + + +from librosa.filters import mel + + +class MelSpectrogram(torch.nn.Module): + def __init__( + self, + is_half, + n_mel_channels, + sampling_rate, + win_length, + 
hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sampling_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sampling_rate = sampling_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half == True: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +class RMVPE: + def __init__(self, model_path, is_half, device=None): + self.resample_kernel = {} + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + 
if is_half == True: + model = model.half() + self.model = model + self.resample_kernel = {} + self.is_half = is_half + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + self.mel_extractor = MelSpectrogram( + is_half, 128, 16000, 1024, 160, None, 30, 8000 + ).to(device) + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(360) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 + + def mel2hidden(self, mel): + with torch.no_grad(): + n_frames = mel.shape[-1] + # PyTorch reflection_pad1d does not support Float16 (Half). + # Cast to float32 before padding, and cast back to float16 afterwards. + is_half = mel.dtype == torch.float16 + if is_half: + mel = mel.float() + + pad_size = 32 * ((n_frames - 1) // 32 + 1) - n_frames + if pad_size >= n_frames: + # Fallback to constant padding with the minimum value to prevent PyTorch reflection crashes on tiny audio chunks + min_val = mel.min().item() + mel = F.pad(mel, (0, pad_size), mode="constant", value=min_val) + else: + mel = F.pad(mel, (0, pad_size), mode="reflect") + + if is_half: + mel = mel.half() + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) + return f0 + + def infer_from_audio(self, audio, thred=0.03): + audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) + # torch.cuda.synchronize() + # t0=ttime() + mel = self.mel_extractor(audio, center=True) + # torch.cuda.synchronize() + # t1=ttime() + hidden = self.mel2hidden(mel) + # torch.cuda.synchronize() + # t2=ttime() + hidden = hidden.squeeze(0).cpu().numpy() + if self.is_half == True: + hidden = hidden.astype("float32") + f0 = self.decode(hidden, thred=thred) + # torch.cuda.synchronize() + # 
t3=ttime() + # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + # t0 = ttime() + center = np.argmax(salience, axis=1) # 帧长#index + salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 + # t1 = ttime() + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + # t2 = ttime() + todo_salience = np.array(todo_salience) # 帧长,9 + todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) # 帧长 + devided = product_sum / weight_sum # 帧长 + # t3 = ttime() + maxx = np.max(salience, axis=1) # 帧长 + devided[maxx <= thred] = 0 + # t4 = ttime() + # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + return devided + + +# if __name__ == '__main__': +# audio, sampling_rate = sf.read("卢本伟语录~1.wav") +# if len(audio.shape) > 1: +# audio = librosa.to_mono(audio.transpose(1, 0)) +# audio_bak = audio.copy() +# if sampling_rate != 16000: +# audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) +# model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt" +# thred = 0.03 # 0.01 +# device = 'cuda' if torch.cuda.is_available() else 'cpu' +# rmvpe = RMVPE(model_path,is_half=False, device=device) +# t0=ttime() +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# t1=ttime() +# print(f0.shape,t1-t0) diff --git a/server.py b/server.py new file mode 100644 index 0000000..03113a6 --- /dev/null +++ b/server.py @@ -0,0 
+1,768 @@ +import os +import sys +import types +import json + +# --- FIX FOR PATH COLLISION BETWEEN modules.py AND modules/ DIRECTORY --- +# Kita memaksa Python untuk mendaftarkan 'lib.infer_pack.modules' sebagai package directory +# alih-alih file-module, sehingga sub-impor (seperti F0Predictor) dapat dimuat dengan sukses +base_dir = os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.join(base_dir, "lib", "infer_pack", "modules") +if os.path.isdir(modules_path): + modules_pkg = types.ModuleType("lib.infer_pack.modules") + modules_pkg.__path__ = [modules_path] + modules_pkg.__file__ = os.path.join(modules_path, "__init__.py") + sys.modules["lib.infer_pack.modules"] = modules_pkg +# ------------------------------------------------------------------------ +import time +import asyncio +import logging +import traceback +import argparse +import threading +import webbrowser +from http.server import SimpleHTTPRequestHandler +import socketserver +import numpy as np +import torch +import onnxruntime as ort + +# Add parent directories to sys.path so we can import lib +sys.path.append(os.getcwd()) + +from lib.infer_pack.onnx_inference import OnnxRVC, get_f0_predictor + +# Set logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") +logger = logging.getLogger("RVC-Realtime-Server") + +# Thread pool for audio processing (1 worker = sequential, but non-blocking to event loop) +import concurrent.futures +_audio_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1, thread_name_prefix="rvc-audio") + +# Global instances cache +current_rvc_onnx = None +current_model_key = None +model_root = "weights" +pretrained_root = "pretrained" + +# Patch torch.load to default to weights_only=False for compatibility +original_load = torch.load +def patched_load(*args, **kwargs): + if "weights_only" not in kwargs: + kwargs["weights_only"] = False + return original_load(*args, **kwargs) +torch.load = patched_load + +def 
get_onnx_models(): + models = [] + if os.path.exists(model_root): + for d in os.listdir(model_root): + d_path = os.path.join(model_root, d) + if os.path.isdir(d_path): + onnx_files = [f for f in os.listdir(d_path) if f.endswith(".onnx")] + if onnx_files: + models.append(d) + models.sort() + return models + +def get_model_metadata(model_name): + model_dir = os.path.join(model_root, model_name) + onnx_files = [f for f in os.listdir(model_dir) if f.endswith(".onnx")] + pth_files = [f for f in os.listdir(model_dir) if f.endswith(".pth")] + + onnx_path = os.path.join(model_dir, onnx_files[0]) + sr = 40000 # default + if pth_files: + try: + pth_path = os.path.join(model_dir, pth_files[0]) + cpt = torch.load(pth_path, map_location="cpu") + sr = cpt["config"][-1] + logger.info(f"Detected sample rate from .pth: {sr} Hz") + except Exception as e: + logger.warning(f"Failed to load sample rate from .pth, using default 40000: {e}") + + version = "v2" + vec_path = "vec-768-layer-12" + try: + sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"]) + feat_dim = sess.get_inputs()[0].shape[2] + if feat_dim == 256: + version = "v1" + vec_path = "vec-256-layer-12" + logger.info("Detected RVC Model Version: v1 (feat_dim = 256)") + else: + version = "v2" + vec_path = "vec-768-layer-12" + logger.info("Detected RVC Model Version: v2 (feat_dim = 768)") + except Exception as e: + logger.error(f"Error auto-detecting model version from ONNX: {e}") + + return onnx_path, sr, vec_path, version + +class RealtimeVoiceChanger: + def __init__(self): + self.processor = None + self.model_name = "" + self.f0_up_key = 0 + self.f0_method = "pm" + self.device = "cuda" + self.input_sr = 44100 + self.noise_gate_db = -40.0 + self.input_gain = 1.0 + self.output_gain = 1.0 + + # Audio sliding buffers + self.input_buffer = np.zeros(0, dtype=np.float32) + self.history_duration = 0.30 # 300ms history = enough context even at 8192 samples @ 48kHz + self.target_sr = 40000 + self.vec_path = 
"vec-768-layer-12" + self.version = "v2" + self.f0_predictors = {} # Cache to reuse pitch predictors instead of recreating them on every chunk + + # Server Hardware Routing properties + self.local_stream = None + self.routing_mode = "browser" + self.input_device = None + self.output_device = None + self.chunk_size = 8192 + self.loop = None + self.ws_client = None + self.visualizer_queue = None + + # High-pass filter coefficients cache + self.hpf_b = None + self.hpf_a = None + + def load_model(self, model_name, device): + global current_rvc_onnx, current_model_key + + # Resolve the device provider + if device == "cuda" and not torch.cuda.is_available(): + logger.warning("CUDA is not available, falling back to CPU") + device = "cpu" + + model_key = f"{model_name}_{device}" + if current_rvc_onnx is None or current_model_key != model_key: + logger.info(f"Loading RVC model '{model_name}' on {device}...") + onnx_path, sr, vec_path, version = get_model_metadata(model_name) + + # Ensure HuBERT model exists + full_vec_path = os.path.join(pretrained_root, f"{vec_path}.onnx") + if not os.path.exists(full_vec_path): + raise FileNotFoundError(f"ContentVec ONNX not found at: {full_vec_path}") + + current_rvc_onnx = OnnxRVC( + model_path=onnx_path, + sr=sr, + hop_size=512, + vec_path=vec_path, + device=device + ) + current_model_key = model_key + logger.info("Model loaded successfully") + + self.processor = current_rvc_onnx + self.target_sr = self.processor.sampling_rate + self.model_name = model_name + self.device = device + + def set_config(self, config): + logger.info(f"Updating config: {config}") + + # Update config fields + self.f0_up_key = int(config.get("f0_up_key", self.f0_up_key)) + self.f0_method = config.get("f0_method", self.f0_method) + self.input_sr = int(config.get("input_sr", self.input_sr)) + self.noise_gate_db = float(config.get("noise_gate", self.noise_gate_db)) + self.input_gain = float(config.get("input_gain", self.input_gain)) + self.output_gain = 
float(config.get("output_gain", self.output_gain)) + + model_name = config.get("model_name", self.model_name) + device = config.get("device", self.device) + + if not self.model_name or model_name != self.model_name or device != self.device: + self.load_model(model_name, device) + + # Reset input buffer if input samplerate changed + history_samples = int(self.history_duration * self.input_sr) + if len(self.input_buffer) != history_samples: + self.input_buffer = np.zeros(history_samples, dtype=np.float32) + + # Design a 1st order Butterworth high-pass filter at 80Hz to eliminate low-frequency static rumbling/hums + try: + from scipy import signal + nyq = 0.5 * self.input_sr + normal_cutoff = 80.0 / nyq + self.hpf_b, self.hpf_a = signal.butter(1, normal_cutoff, btype='high', analog=False) + except Exception as e: + logger.error(f"Failed to design high-pass filter: {e}") + self.hpf_b, self.hpf_a = None, None + + def apply_noise_gate(self, audio): + # Calculate RMS energy of the audio chunk + rms = np.sqrt(np.mean(audio**2)) + 1e-9 + rms_db = 20 * np.log10(rms) + + if rms_db < self.noise_gate_db: + return np.zeros_like(audio) + return audio + + def resample(self, audio, orig_sr, target_sr): + if orig_sr == target_sr: + return audio + + # Fast linear interpolation resampling + duration = len(audio) / orig_sr + num_target_samples = int(duration * target_sr) + x_orig = np.linspace(0, duration, len(audio)) + x_target = np.linspace(0, duration, num_target_samples) + return np.interp(x_target, x_orig, audio).astype(np.float32) + + def process_audio_chunk(self, raw_chunk): + """ + Process a raw input Float32 PCM audio chunk in memory with sliding window. + """ + if self.processor is None: + return raw_chunk + + t_start = time.time() + + # 1. 
Apply High-pass filter (80Hz Low-cut) to eliminate low-frequency background rumbles and AC hum + chunk = raw_chunk + if self.hpf_b is not None and self.hpf_a is not None: + try: + from scipy import signal + chunk = signal.lfilter(self.hpf_b, self.hpf_a, chunk).astype(np.float32) + except Exception as e: + pass + + # 2. Apply Input Gain & Noise Gate + chunk = chunk * self.input_gain + chunk = self.apply_noise_gate(chunk) + + # If chunk is pure silence from the gate, bypass inference immediately to save CPU! + if np.max(np.abs(chunk)) < 1e-6: + output_len = int(len(raw_chunk) * (self.target_sr / self.input_sr)) + return np.zeros(output_len, dtype=np.float32) + + t_gate = time.time() + + # 3. Manage Sliding Window Buffer + self.input_buffer = np.append(self.input_buffer[len(chunk):], chunk) + + # Append 120ms of silence at the end to push RVC convolution edge fading into the padded future (no edge distortion!) + future_samples = int(0.12 * self.input_sr) + full_input_audio = np.append(self.input_buffer, np.zeros(future_samples, dtype=np.float32)) + + # 4. Resample full segment to 16kHz for HuBERT and RMVPE + wav16k = self.resample(full_input_audio, self.input_sr, 16000) + + t_resample_in = time.time() + + # 4. Generate RVC ONNX inputs in-memory + hubert = self.processor.vec_model(wav16k) + hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) + hubert_length = hubert.shape[1] + + t_hubert = time.time() + + # Initialize and cache pitch predictor (extract at 16kHz for 3x performance boost!) 
+ predictor_key = f"{self.f0_method}_{self.device}_16000" + if predictor_key not in self.f0_predictors: + logger.info(f"Initializing and caching 16kHz F0 Predictor '{self.f0_method}' on {self.device}...") + hop_16k = int(self.processor.hop_size * 16000 / self.target_sr) + self.f0_predictors[predictor_key] = get_f0_predictor( + self.f0_method, + hop_length=hop_16k, + sampling_rate=16000, + threshold=0.02, + device=self.device, + is_half=True if self.device == "cuda" else False + ) + f0_predictor = self.f0_predictors[predictor_key] + + # Calculate pitch on 16kHz audio for all methods (massive CPU speed up!) + pitchf = f0_predictor.compute_f0(wav16k, hubert_length) + + t_f0 = time.time() + + # Pitch transpose + pitchf = pitchf * (2 ** (self.f0_up_key / 12)) + + # Pitch binning for RVC + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0_mel = 1127 * np.log(1 + pitchf / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + pitch = np.rint(f0_mel).astype(np.int64) + + pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) + pitch = pitch.reshape(1, len(pitch)) + ds = np.array([0]).astype(np.int64) # sid = 0 + rnd = np.random.randn(1, 192, hubert_length).astype(np.float32) + hubert_length_tensor = np.array([hubert_length]).astype(np.int64) + + # 5. Run synthesis + out_wav = self.processor.forward(hubert, hubert_length_tensor, pitch, pitchf, ds, rnd).squeeze() + out_wav = out_wav.astype(np.float32) / 32767.0 # Normalize back to [-1.0, 1.0] float32 + + t_synth = time.time() + + # 6. Extract only the newly converted chunk, discarding history and future padding. + # The new chunk starts at (buffer_size - chunk_size) in the updated input_buffer. + # Use exact ratio target_sr/input_sr for clean integer math, not out_wav/full_input_audio + # which can drift due to future silence padding. 
+ sr_ratio = self.target_sr / self.input_sr + history_in_buffer = len(self.input_buffer) - len(chunk) # samples of old audio before new chunk + start_idx = int(history_in_buffer * sr_ratio) + end_idx = int((history_in_buffer + len(chunk)) * sr_ratio) + + # Safety clamp + start_idx = max(0, min(start_idx, len(out_wav) - 1)) + end_idx = max(start_idx + 1, min(end_idx, len(out_wav))) + + # Extract the converted chunk safely from the middle (flawless, clear, continuous voice!) + output_chunk = out_wav[start_idx:end_idx] + + # Resample back to target output samples to match browser playback rate perfectly + target_chunk_len = int(len(chunk) * (self.target_sr / self.input_sr)) + output_chunk = self.resample(output_chunk, len(output_chunk), target_chunk_len) + + # Apply output gain + output_chunk = output_chunk * self.output_gain + + # Ensure we don't clip + output_chunk = np.clip(output_chunk, -1.0, 1.0) + + t_end = time.time() + + d_gate = (t_gate - t_start) * 1000 + d_res_in = (t_resample_in - t_gate) * 1000 + d_hubert = (t_hubert - t_resample_in) * 1000 + d_f0 = (t_f0 - t_hubert) * 1000 + d_synth = (t_synth - t_f0) * 1000 + d_res_out = (t_end - t_synth) * 1000 + t_elapsed = (t_end - t_start) * 1000 + + logger.info( + f"Chunk Profile: total={t_elapsed:.1f}ms | gate={d_gate:.1f}ms | res_in={d_res_in:.1f}ms | " + f"hubert={d_hubert:.1f}ms | f0={d_f0:.1f}ms | synth={d_synth:.1f}ms | res_out={d_res_out:.1f}ms" + ) + + return output_chunk + + def start_local_stream(self, loop, ws_client): + import sounddevice as sd + self.loop = loop + self.ws_client = ws_client + self.visualizer_queue = asyncio.Queue() + + if self.local_stream is not None: + self.stop_local_stream() + + if self.input_device is None: + self.input_device = sd.default.device[0] + if self.output_device is None: + self.output_device = sd.default.device[1] + + input_info = sd.query_devices(self.input_device) + output_info = sd.query_devices(self.output_device) + + input_sr = int(input_info["default_samplerate"]) 
+ logger.info(f"Starting Server Hardware Stream: Input='{input_info['name']}' ({input_sr}Hz) | Output='{output_info['name']}' ({self.target_sr}Hz)") + + self.set_config({ + "input_sr": input_sr + }) + + def audio_callback(indata, outdata, frames, time_info, status): + if status: + logger.warning(f"Hardware Audio Callback Status: {status}") + + raw_chunk = indata[:, 0].copy() + output_chunk = self.process_audio_chunk(raw_chunk) + + if len(output_chunk) < frames: + outdata[:, 0] = np.pad(output_chunk, (0, frames - len(output_chunk)), "constant") + else: + outdata[:, 0] = output_chunk[:frames] + + # Send waveform chunks to WebSocket safely using loop.call_soon_threadsafe + if self.ws_client is not None: + loop.call_soon_threadsafe( + self.visualizer_queue.put_nowait, + (raw_chunk.copy(), output_chunk.copy()) + ) + + try: + self.local_stream = sd.Stream( + device=(self.input_device, self.output_device), + samplerate=self.target_sr, + blocksize=self.chunk_size, + channels=1, + dtype="float32", + callback=audio_callback + ) + self.local_stream.start() + logger.info("Server Hardware Stream active and processing locally!") + except Exception as e: + logger.error(f"Failed to start hardware stream: {e}") + raise e + + def stop_local_stream(self): + if self.local_stream is not None: + try: + self.local_stream.stop() + self.local_stream.close() + logger.info("Server Hardware Stream stopped successfully.") + except Exception as e: + logger.error(f"Error stopping hardware stream: {e}") + self.local_stream = None + self.visualizer_queue = None + +# --- WEBSOCKET SERVER IMPLEMENTATION --- +async def websocket_handler(websocket): + logger.info("New WebSocket client connected") + rvc = RealtimeVoiceChanger() + loop = asyncio.get_running_loop() + + # --- Pipeline queues --- + # input_queue : raw bytes from browser mic + # output_queue: processed bytes ready to send back + # maxsize=3 = ~3 chunk durations of buffer; provides backpressure without unbounded memory + input_queue = 
asyncio.Queue(maxsize=3) + output_queue = asyncio.Queue(maxsize=3) + + # --- STAGE 1: Receiver --- + # Reads ALL WebSocket messages immediately (never blocks on processing). + # Handles JSON config inline; binary audio chunks go to input_queue. + async def receiver_task(): + try: + async for message in websocket: + if isinstance(message, str): + try: + data = json.loads(message) + if data.get("type") == "config": + new_routing_mode = data.get("routing_mode", rvc.routing_mode) + new_input_device = data.get("input_device", rvc.input_device) + new_output_device = data.get("output_device", rvc.output_device) + new_chunk_size = int(data.get("chunk_size", rvc.chunk_size)) + + if new_input_device is not None: rvc.input_device = int(new_input_device) + if new_output_device is not None: rvc.output_device = int(new_output_device) + rvc.chunk_size = new_chunk_size + rvc.set_config(data) + + if new_routing_mode == "hardware": + if rvc.routing_mode != "hardware" or rvc.local_stream is None: + rvc.routing_mode = "hardware" + rvc.start_local_stream(loop, websocket) + else: + if rvc.routing_mode == "hardware": + rvc.stop_local_stream() + rvc.routing_mode = "browser" + + response = { + "type": "config_success", + "model_name": rvc.model_name, + "target_sr": rvc.target_sr, + "f0_method": rvc.f0_method, + "f0_up_key": rvc.f0_up_key, + "device": rvc.device, + "routing_mode": rvc.routing_mode + } + await websocket.send(json.dumps(response)) + except Exception as e: + logger.error(f"Config parse error: {e}") + await websocket.send(json.dumps({"type": "error", "message": str(e)})) + + elif isinstance(message, bytes): + if rvc.routing_mode == "hardware": + continue + if rvc.processor is None: + # Prepend 0.0 processing time to echoed message + input_chunk = np.frombuffer(message, dtype=np.float32) + payload = np.empty(len(input_chunk) + 1, dtype=np.float32) + payload[0] = 0.0 + payload[1:] = input_chunk + await websocket.send(payload.tobytes()) + continue + # Put chunk in queue — await 
here means we yield if queue is full (backpressure) + await input_queue.put(message) + except Exception as e: + logger.error(f"Receiver error: {e}") + finally: + # Signal downstream stages to stop + await input_queue.put(None) + + # --- STAGE 2: Processor --- + # Pulls from input_queue, processes in thread (non-blocking to event loop), pushes to output_queue. + # Sequential (1 executor worker) = output order matches input order. + async def processor_task(): + try: + while True: + item = await input_queue.get() + if item is None: + break # shutdown signal + input_chunk = np.frombuffer(item, dtype=np.float32).copy() + t_start = time.time() + output_chunk = await loop.run_in_executor( + _audio_executor, rvc.process_audio_chunk, input_chunk + ) + t_elapsed = (time.time() - t_start) * 1000 + + # Prepend the elapsed processing time to the audio chunk bytes + payload = np.empty(len(output_chunk) + 1, dtype=np.float32) + payload[0] = t_elapsed + payload[1:] = output_chunk + await output_queue.put(payload.tobytes()) + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Processor error: {e}") + finally: + await output_queue.put(None) + + # --- STAGE 3: Sender --- + # Pulls processed audio from output_queue and sends to client. 
+ async def sender_task(): + try: + while True: + item = await output_queue.get() + if item is None: + break # shutdown signal + await websocket.send(item) + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Sender error: {e}") + + # --- Visualizer (hardware mode only) --- + async def visualizer_sender_loop(): + try: + while True: + if rvc.routing_mode == "hardware" and rvc.visualizer_queue is not None: + try: + raw_chunk, output_chunk = await asyncio.wait_for( + rvc.visualizer_queue.get(), timeout=0.1 + ) + payload = { + "type": "visualizer", + "input": raw_chunk.tolist(), + "output": output_chunk.tolist() + } + await websocket.send(json.dumps(payload)) + except asyncio.TimeoutError: + pass + else: + await asyncio.sleep(0.05) + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Visualizer sender error: {e}") + + # --- Send device list on connect --- + import sounddevice as sd + devices_list = [] + try: + for idx, d in enumerate(sd.query_devices()): + devices_list.append({ + "id": idx, + "name": d["name"], + "max_input_channels": d["max_input_channels"], + "max_output_channels": d["max_output_channels"], + "default_samplerate": d["default_samplerate"] + }) + default_input = sd.default.device[0] + default_output = sd.default.device[1] + except Exception as e: + logger.error(f"Failed to query server audio devices: {e}") + devices_list = [] + default_input = -1 + default_output = -1 + + await websocket.send(json.dumps({ + "type": "init_devices", + "devices": devices_list, + "default_input": default_input, + "default_output": default_output + })) + + # --- Run all pipeline stages concurrently --- + vis_task = asyncio.create_task(visualizer_sender_loop()) + proc_task = asyncio.create_task(processor_task()) + send_task = asyncio.create_task(sender_task()) + + try: + await receiver_task() # runs until websocket closes + except Exception as e: + logger.error(f"WebSocket handler error: {e}") + finally: + 
vis_task.cancel() + proc_task.cancel() + send_task.cancel() + rvc.stop_local_stream() + logger.info("WebSocket client disconnected, pipeline cleaned up.") + + +async def start_websocket_server(host, port): + import websockets + logger.info(f"Starting WebSocket server on ws://{host}:{port}...") + async with websockets.serve(websocket_handler, host, port): + await asyncio.Future() + +# --- HTTP STATIC FILE SERVER FOR FRONTEND --- +def start_http_server(port, directory="frontend"): + class MyHandler(SimpleHTTPRequestHandler): + def __init__(self, *args, **kwargs): + # Force serve from directory relative to the project root + base_dir = os.path.dirname(os.path.abspath(__file__)) + full_dir = os.path.join(base_dir, directory) + super().__init__(*args, directory=full_dir, **kwargs) + + def log_message(self, format, *args): + # Suppress standard logging to prevent console pollution + pass + + try: + # Create a TCPServer that allows address reuse + socketserver.TCPServer.allow_reuse_address = True + with socketserver.TCPServer(("", port), MyHandler) as httpd: + logger.info(f"Serving HTTP frontend on http://localhost:{port}") + httpd.serve_forever() + except Exception as e: + logger.error(f"Failed to start HTTP server: {e}") + +# --- LOCAL AUDIO DEVICE STREAM MODE --- +def run_local_device_mode(model_name, f0_up_key, f0_method, device, input_device, output_device, chunk_size): + import sounddevice as sd + + logger.info("Starting Local Audio Hardware Stream Mode...") + + rvc = RealtimeVoiceChanger() + rvc.load_model(model_name, device) + + if input_device is None: + input_device = sd.default.device[0] + if output_device is None: + output_device = sd.default.device[1] + + input_info = sd.query_devices(input_device) + output_info = sd.query_devices(output_device) + + input_sr = int(input_info["default_samplerate"]) + target_sr = rvc.target_sr + + logger.info(f"Input Device: {input_info['name']} (Sample Rate: {input_sr} Hz)") + logger.info(f"Output Device: {output_info['name']} 
(Sample Rate: {target_sr} Hz)") + + rvc.set_config({ + "f0_up_key": f0_up_key, + "f0_method": f0_method, + "input_sr": input_sr, + "device": device, + "model_name": model_name, + "noise_gate": -40.0, + "input_gain": 1.0, + "output_gain": 1.0 + }) + + def audio_callback(indata, outdata, frames, time_info, status): + if status: + logger.warning(f"Audio Callback Status: {status}") + + raw_chunk = indata[:, 0].copy() + output_chunk = rvc.process_audio_chunk(raw_chunk) + + if len(output_chunk) < frames: + outdata[:, 0] = np.pad(output_chunk, (0, frames - len(output_chunk)), "constant") + else: + outdata[:, 0] = output_chunk[:frames] + + try: + stream = sd.Stream( + device=(input_device, output_device), + samplerate=target_sr, + blocksize=chunk_size, + channels=1, + dtype="float32", + callback=audio_callback + ) + with stream: + logger.info("Real-Time Sounddevice Stream active! Press Ctrl+C to stop.") + while True: + time.sleep(1) + except KeyboardInterrupt: + logger.info("Local stream stopped by user") + except Exception as e: + logger.error(f"Local stream error: {e}") + traceback.print_exc() + +# --- MAIN --- +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="High-Performance Real-Time RVC ONNX Server") + parser.add_argument("--mode", type=str, default="websocket", choices=["websocket", "device"], help="Server running mode") + parser.add_argument("--host", type=str, default="127.0.0.1", help="WebSocket host") + parser.add_argument("--port", type=int, default=8765, help="WebSocket port") + parser.add_argument("--http_port", type=int, default=8000, help="HTTP static server port for Web UI") + parser.add_argument("--model", type=str, default="", help="RVC Model folder name inside weights/") + parser.add_argument("--transpose", type=int, default=0, help="Pitch shift in semitones (transpose)") + parser.add_argument("--f0_method", type=str, default="pm", choices=["pm", "harvest", "dio", "rmvpe"], help="Pitch extraction method") + 
parser.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda", "dml"], help="Execution provider") + parser.add_argument("--input_device", type=int, default=None, help="Input device ID (for device mode)") + parser.add_argument("--output_device", type=int, default=None, help="Output device ID (for device mode)") + parser.add_argument("--chunk_size", type=int, default=2048, help="Audio block size in samples") + + args = parser.parse_args() + + model_name = args.model + if not model_name: + models = get_onnx_models() + if models: + model_name = models[0] + logger.info(f"Auto-selected model: {model_name}") + else: + logger.error("No models found in weights/ directory. Please export a model first.") + sys.exit(1) + + if args.mode == "websocket": + # 1. Start HTTP Server in a background thread to serve the frontend! + http_thread = threading.Thread( + target=start_http_server, + args=(args.http_port, "frontend"), + daemon=True + ) + http_thread.start() + + # 2. Automatically open the Web UI in the default browser! + web_ui_url = f"http://127.0.0.1:{args.http_port}" + logger.info(f"Automatically launching Web UI at {web_ui_url} in browser...") + + # We give it a tiny delay to ensure the HTTP server socket is open + def open_browser(): + time.sleep(0.5) + webbrowser.open(web_ui_url) + + browser_thread = threading.Thread(target=open_browser, daemon=True) + browser_thread.start() + + # 3. 
Start the WebSocket server on the main event loop + try: + asyncio.run(start_websocket_server(args.host, args.port)) + except KeyboardInterrupt: + logger.info("Server shut down") + elif args.mode == "device": + run_local_device_mode( + model_name=model_name, + f0_up_key=args.transpose, + f0_method=args.f0_method, + device=args.device, + input_device=args.input_device, + output_device=args.output_device, + chunk_size=args.chunk_size + ) diff --git a/start.bat b/start.bat new file mode 100644 index 0000000..9d990f5 --- /dev/null +++ b/start.bat @@ -0,0 +1,15 @@ +@echo off +title ONNX Real-Time Voice Changer +cd /d "%~dp0" + +set VENV_PYTHON=..\rvc-tts-webui\venv\Scripts\python.exe + +if exist "%VENV_PYTHON%" ( + echo Menjalankan menggunakan virtual environment dari rvc-tts-webui... + "%VENV_PYTHON%" -u server.py --host 127.0.0.1 --port 8765 --http_port 8000 +) else ( + echo Virtual environment tidak ditemukan, mencoba menggunakan python sistem... + python -u server.py --host 127.0.0.1 --port 8765 --http_port 8000 +) + +pause