diff --git a/api/transcribe.php b/api/transcribe.php new file mode 100644 index 0000000..42cf177 --- /dev/null +++ b/api/transcribe.php @@ -0,0 +1,190 @@ + 'File exceeds server upload limit.', + UPLOAD_ERR_FORM_SIZE => 'File exceeds form size limit.', + UPLOAD_ERR_PARTIAL => 'File was only partially uploaded.', + UPLOAD_ERR_NO_FILE => 'No audio file received.', + ]; + dbnToolsError($map[$code] ?? "Upload error (code {$code}).", 400, 'upload_error'); +} + +$file = $_FILES['audio']; +$maxBytes = 200 * 1024 * 1024; + +if ($file['size'] > $maxBytes) { + dbnToolsError('File too large. Maximum 200 MB.', 413, 'file_too_large'); +} + +$allowedExts = ['mp3', 'wav', 'ogg', 'oga', 'm4a', 'mp4', 'flac', 'webm', 'aac']; +$ext = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION)); +if (!in_array($ext, $allowedExts, true)) { + dbnToolsError("Unsupported format: .{$ext}. Use MP3, WAV, OGG, M4A, FLAC, or WebM.", 415, 'unsupported_format'); +} + +// ── Build Whisper request ───────────────────────────────────────────────────── + +$whisperBase = 'http://194.93.49.14:20019'; +$endpoint = $diarize ? $whisperBase . '/transcribe/diarize' : $whisperBase . '/transcribe'; + +$boundary = '----DBN' . bin2hex(random_bytes(8)); +$body = "--{$boundary}\r\n"; +$body .= 'Content-Disposition: form-data; name="file"; filename="' . addslashes(basename($file['name'])) . '"' . "\r\n"; +$body .= "Content-Type: application/octet-stream\r\n\r\n"; + +$fileContents = file_get_contents($file['tmp_name']); +if ($fileContents === false) { + dbnToolsError('Could not read uploaded file.', 500, 'file_read_error'); +} +$body .= $fileContents . "\r\n"; + +if ($language !== 'auto') { + $body .= "--{$boundary}\r\n"; + $body .= "Content-Disposition: form-data; name=\"language\"\r\n\r\n"; + $body .= $language . "\r\n"; +} + +if ($diarize && $numSpeakers > 1) { + $body .= "--{$boundary}\r\n"; + $body .= "Content-Disposition: form-data; name=\"num_speakers\"\r\n\r\n"; + $body .= $numSpeakers . 
"\r\n"; +} + +$body .= "--{$boundary}--\r\n"; + +// ── Call Whisper ────────────────────────────────────────────────────────────── + +$t0 = microtime(true); + +if (function_exists('curl_init')) { + $ch = curl_init($endpoint); + curl_setopt_array($ch, [ + CURLOPT_RETURNTRANSFER => true, + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => $body, + CURLOPT_HTTPHEADER => [ + "Content-Type: multipart/form-data; boundary={$boundary}", + 'Accept: application/json', + ], + CURLOPT_TIMEOUT => 600, + ]); + $whisperBody = curl_exec($ch); + $httpCode = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE); + $curlErr = curl_error($ch); + curl_close($ch); + + if ($whisperBody === false || $httpCode !== 200) { + dbnToolsError('Whisper service error (HTTP ' . $httpCode . '): ' . $curlErr, 502, 'whisper_error'); + } +} else { + $ctx = stream_context_create([ + 'http' => [ + 'method' => 'POST', + 'timeout' => 600, + 'header' => "Content-Type: multipart/form-data; boundary={$boundary}\r\nAccept: application/json\r\n", + 'content' => $body, + 'ignore_errors' => true, + ], + ]); + $whisperBody = @file_get_contents($endpoint, false, $ctx); + + if ($whisperBody === false) { + dbnToolsError('Whisper service unreachable. The GPU may be offline.', 502, 'whisper_unreachable'); + } +} + +$latencyMs = (int)round((microtime(true) - $t0) * 1000); + +$whisper = json_decode($whisperBody, true); +if (!is_array($whisper) || empty($whisper['text'])) { + dbnToolsError('Empty or invalid response from Whisper.', 502, 'whisper_empty'); +} + +// ── Speaker role labelling ──────────────────────────────────────────────────── + +$segments = is_array($whisper['segments'] ?? null) ? $whisper['segments'] : []; +$numDetected = (int)($whisper['num_speakers'] ?? 
1); + +if ($numDetected < 2 && $segments) { + $uniqueSpeakers = array_filter(array_unique(array_column($segments, 'speaker'))); + if (count($uniqueSpeakers) > 1) $numDetected = count($uniqueSpeakers); +} + +$speakerRoles = null; +if ($diarize && $numDetected > 1 && $segments) { + $speakerRoles = dbnLabelSpeakerRoles($segments); +} + +// ── Respond ─────────────────────────────────────────────────────────────────── + +dbnToolsLogMetadata([ + 'tool' => 'transcribe', + 'language' => $language, + 'ok' => true, + 'latency_ms' => $latencyMs, +]); + +dbnToolsRespond([ + 'ok' => true, + 'tool' => 'transcribe', + 'transcript' => (string)$whisper['text'], + 'segments' => $segments, + 'speaker_roles' => $speakerRoles, + 'num_speakers' => $numDetected, + 'language' => (string)($whisper['language'] ?? $language), + 'duration_sec' => round((float)($whisper['duration_seconds'] ?? 0), 2), + 'model' => (string)($whisper['model'] ?? 'whisper'), + 'latency_ms' => $latencyMs, +]);
+
+// ── Speaker role labelling helper ─────────────────────────────────────────────
+
+/**
+ * Ask the chat model (via DbnAzureOpenAiGateway::chatText()) to guess a role
+ * for each diarized speaker.
+ *
+ * One "[speaker] text" line is built per segment from the first 20 segments
+ * that carry a speaker label. A speaker who is first heard later still
+ * contributes their first line, so every speaker can be labelled; the prompt
+ * is capped at 40 lines in total.
+ *
+ * Labelling is best-effort: if the gateway cannot be constructed, the call
+ * throws, or the reply cannot be decoded, an empty array is returned so the
+ * caller can still deliver the transcript.
+ *
+ * @param array $segments Segments from the Whisper response. Entries that are
+ *                        not arrays or have no scalar `speaker` are ignored.
+ * @return array<string, string> Speaker id => role name, restricted to speaker
+ *                               ids that were sent to the model and to
+ *                               non-empty string roles. Empty on failure.
+ */
+function dbnLabelSpeakerRoles(array $segments): array
+{
+    $leadingLimit = 20;               // segments always sampled from the start
+    $hardLimit = $leadingLimit * 2;   // absolute cap on prompt lines
+    $lines = [];
+    $known = [];                      // speaker id => true for every speaker sent
+
+    foreach ($segments as $segment) {
+        if (count($lines) >= $hardLimit) {
+            break;
+        }
+        // Segments come from an external JSON response: skip malformed entries
+        // instead of triggering "Array to string conversion" warnings.
+        if (!is_array($segment) || !isset($segment['speaker']) || !is_scalar($segment['speaker'])) {
+            continue;
+        }
+        $speaker = (string)$segment['speaker'];
+        // Past the leading window, only admit speakers not yet represented.
+        if (count($lines) >= $leadingLimit && isset($known[$speaker])) {
+            continue;
+        }
+        $spoken = $segment['text'] ?? '';
+        $lines[] = "[{$speaker}] " . trim(is_scalar($spoken) ? (string)$spoken : '');
+        $known[$speaker] = true;
+    }
+    if (!$lines) {
+        return [];
+    }
+
+    $system = 'You are analyzing a legal proceeding transcript. '
+        . 'Based on the first segments, identify the role of each speaker. '
+        . 'Common roles in Norwegian legal proceedings: dommer (judge), advokat (lawyer), '
+        . 'forelder (parent), barn (child), sakkyndig (expert witness), '
+        . 'saksbehandler (caseworker), tolk (interpreter), vitne (witness), '
+        . 'prosessfullmektig (counsel). '
+        . 'Return ONLY valid JSON: {"SPEAKER_00":"dommer","SPEAKER_01":"forelder"}. '
+        . 'Use Norwegian role names. Use "ukjent" if role cannot be determined. '
+        . 'Only include speakers present in the input.';
+
+    try {
+        // Constructed inside the try so that a gateway set-up failure degrades
+        // to "no roles" instead of aborting an otherwise successful request.
+        $azure = new DbnAzureOpenAiGateway();
+        $text = $azure->chatText([
+            ['role' => 'system', 'content' => $system],
+            ['role' => 'user', 'content' => implode("\n", $lines)],
+        ], ['temperature' => 0.1, 'max_tokens' => 200]);
+    } catch (Throwable $e) {
+        // Log only the exception class: messages could echo transcript text.
+        error_log('dbnLabelSpeakerRoles: role labelling failed (' . $e::class . ')');
+        return [];
+    }
+    if (!is_string($text)) {
+        return [];
+    }
+
+    // Strip Markdown code fences; preg_replace() returns null on a PCRE error.
+    $cleaned = preg_replace('/^```(?:json)?\s*|\s*```$/m', '', trim($text)) ?? '';
+    $json = json_decode($cleaned, true);
+    if (!is_array($json)) {
+        // The model sometimes wraps the object in prose: try the outermost {...}.
+        $start = strpos($cleaned, '{');
+        $end = strrpos($cleaned, '}');
+        if ($start !== false && $end !== false && $end > $start) {
+            $json = json_decode(substr($cleaned, $start, $end - $start + 1), true);
+        }
+    }
+    if (!is_array($json)) {
+        return [];
+    }
+
+    // The reply is model output driven by untrusted audio: keep only speaker
+    // ids that were actually sent, mapped to non-empty string roles.
+    $roles = [];
+    foreach ($json as $speaker => $role) {
+        $speaker = (string)$speaker;
+        if (!isset($known[$speaker]) || !is_string($role)) {
+            continue;
+        }
+        $role = trim($role);
+        if ($role !== '') {
+            $roles[$speaker] = $role;
+        }
+    }
+    return $roles;
+}
diff --git a/assets/css/tools.css b/assets/css/tools.css index 2f8c4fc..3b1f3a6 100644 --- a/assets/css/tools.css +++ b/assets/css/tools.css @@ -1103,3 +1103,112 @@ p { text-transform: uppercase; letter-spacing: 0.03em; } + +/* ── Transcribe tool ─────────────────────────────────────────────── */ + +.num-speakers-input { + width: 4.5rem; + padding: 0.25rem 0.5rem; + border: 1px solid var(--line); + border-radius: 6px; + background: #fff; + color: var(--ink); + font-size: 0.85rem; +} + +.transcript-roles { + display: flex; + flex-wrap: wrap; + gap: 0.4rem; + margin-bottom: 0.75rem; +} + +.speaker-tag { + display: inline-flex; + align-items: center; + gap: 0.3rem; + font-size: 0.72rem; + font-weight: 600; + padding: 0.2rem 0.55rem; + border-radius: 4px; +} + +.speaker-tag small { + font-weight: 400; + opacity: 0.75; +} + +.speaker-tag--0 { background: #dbeafe; color: #1d4ed8; } +.speaker-tag--1 { background: #ede9fe; color: #6d28d9; } +.speaker-tag--2 { background: #dcfce7; color: #166534; } +.speaker-tag--3 { background: #fef9c3; color: #854d0e; } +.speaker-tag--4 { background: #fee2e2; color: #991b1b; } +.speaker-tag--5 { background: #e7f5f2; color: #0f766e; } + +.transcript-box { + background: var(--bg); + border: 1px solid var(--line); + border-radius: 8px; + padding: 1rem; + max-height: 400px; + overflow-y: auto; + margin-bottom: 0.75rem; +} + +.transcript-text { + white-space: pre-wrap; + word-break: break-word; + font-size: 0.875rem; + line-height: 1.65; + font-family: inherit; + margin: 0; + color: var(--ink); +} + 
+.segment-details { + border: 1px solid var(--line); + border-radius: 8px; + margin-bottom: 0.75rem; +} + +.segment-summary { + font-size: 0.8rem; + color: var(--muted); + padding: 0.6rem 1rem; + cursor: pointer; + user-select: none; +} + +.segment-list { + padding: 0.25rem 0.75rem 0.75rem; + max-height: 280px; + overflow-y: auto; +} + +.segment-row { + display: flex; + gap: 0.6rem; + align-items: baseline; + padding: 0.2rem 0; + font-size: 0.78rem; + border-bottom: 1px solid var(--bg); +} + +.segment-time { + color: var(--muted); + font-family: ui-monospace, monospace; + min-width: 7rem; + flex-shrink: 0; +} + +.segment-text { + color: var(--ink); + line-height: 1.4; +} + +.transcript-downloads { + display: flex; + flex-wrap: wrap; + gap: 0.5rem; + margin-top: 0.75rem; +} diff --git a/assets/js/tools.js b/assets/js/tools.js index a3adbaf..2adb355 100644 --- a/assets/js/tools.js +++ b/assets/js/tools.js @@ -4,6 +4,8 @@ const state = { }; let lastTimelineEvents = []; +let lastAudioFile = null; +let lastTranscriptData = null; const tools = { ask: { @@ -56,6 +58,16 @@ const tools = { usesLanguage: false, badge: 'deterministic first', }, + transcribe: { + kind: 'Audio Transcription', + title: 'Transcribe audio', + label: 'Audio file', + endpoint: 'api/transcribe.php', + payloadKey: null, + placeholder: '', + usesLanguage: false, + badge: 'Whisper / GPU', + }, }; const els = {}; @@ -91,6 +103,17 @@ document.addEventListener('DOMContentLoaded', () => { aliasSection: document.querySelector('#aliasSection'), addAliasRow: document.querySelector('#addAliasRow'), aliasRows: document.querySelector('#aliasRows'), + audioZone: document.querySelector('#audioZone'), + audioInput: document.querySelector('#audioInput'), + audioPrompt: document.querySelector('#audioPrompt'), + audioFileInfo: document.querySelector('#audioFileInfo'), + audioFileName: document.querySelector('#audioFileName'), + audioFileSize: document.querySelector('#audioFileSize'), + audioClear: 
document.querySelector('#audioClear'), + diarizeControl: document.querySelector('#diarizeControl'), + diarizeCheck: document.querySelector('#diarizeCheck'), + numSpeakersInput: document.querySelector('#numSpeakersInput'), + transcribeLangControl: document.querySelector('#transcribeLangControl'), }); els.tabs.forEach((button) => { @@ -101,8 +124,12 @@ document.addEventListener('DOMContentLoaded', () => { els.healthButton.addEventListener('click', checkHealth); setupUpload(); setupAliases(); + setupAudio(); els.results.addEventListener('click', (e) => { if (e.target.closest('#exportCsvBtn')) exportTimelineCSV(lastTimelineEvents); + if (e.target.closest('#dlTxt')) downloadTranscriptTxt(); + if (e.target.closest('#dlSrt')) downloadTranscriptSrt(); + if (e.target.closest('#dlVtt')) downloadTranscriptVtt(); }); setTool(state.activeTool); @@ -132,8 +159,14 @@ function setTool(toolName) { els.redactionControl.classList.toggle('is-hidden', toolName !== 'redact'); els.uploadZone.classList.toggle('is-hidden', toolName !== 'redact' && toolName !== 'timeline'); els.aliasSection.classList.toggle('is-hidden', toolName !== 'redact'); + els.audioZone.classList.toggle('is-hidden', toolName !== 'transcribe'); + els.diarizeControl.classList.toggle('is-hidden', toolName !== 'transcribe'); + els.transcribeLangControl.classList.toggle('is-hidden', toolName !== 'transcribe'); + els.input.classList.toggle('is-hidden', toolName === 'transcribe'); + els.inputLabel.classList.toggle('is-hidden', toolName === 'transcribe'); resetUpload(); resetAliases(); + resetAudio(); els.status.textContent = ''; renderTrace([]); } @@ -163,6 +196,12 @@ async function submitPasscode(event) { async function runTool(event) { event.preventDefault(); + + if (state.activeTool === 'transcribe') { + await runTranscribe(); + return; + } + const tool = tools[state.activeTool]; const text = els.input.value.trim(); if (!text) { @@ -389,7 +428,9 @@ async function postJson(url, payload) { function setBusy(isBusy) { const 
button = document.querySelector('#runButton'); button.disabled = isBusy; - button.textContent = isBusy ? 'Running...' : 'Run Tool'; + button.textContent = isBusy + ? (state.activeTool === 'transcribe' ? 'Transcribing...' : 'Running...') + : 'Run Tool'; } function currentLanguage() { @@ -447,6 +488,10 @@ function renderMainFinding(data) { return `
${escapeHtml(data.what_we_found || '')}
`; } +function currentTranscribeLang() { + return document.querySelector('input[name="transcribeLang"]:checked')?.value || 'auto'; +} + function renderEvidence(data) { const items = data.evidence_trail || data.sources || data.hits || []; if (!items.length) { @@ -513,6 +558,224 @@ function exportTimelineCSV(events) { URL.revokeObjectURL(url); } +async function runTranscribe() { + if (!lastAudioFile) { + els.status.textContent = 'Choose an audio file before transcribing.'; + return; + } + setBusy(true); + renderTrace([{ label: 'Sending to Whisper', detail: 'Uploading audio to cuttlefish GPU…', status: 'running' }]); + + try { + const formData = new FormData(); + formData.append('audio', lastAudioFile); + formData.append('language', currentTranscribeLang()); + if (els.diarizeCheck?.checked) { + formData.append('diarize', '1'); + const n = parseInt(els.numSpeakersInput?.value || '', 10); + if (n >= 2) formData.append('num_speakers', String(n)); + } + + const resp = await fetch('api/transcribe.php', { + method: 'POST', + credentials: 'same-origin', + body: formData, + }); + const data = await resp.json().catch(() => ({})); + if (!resp.ok || !data.ok) { + throw new Error(data.error?.message || `Transcription failed (HTTP ${resp.status}).`); + } + + lastTranscriptData = data; + renderTranscriptResults(data); + + const dur = data.duration_sec ? 
` · Audio: ${Math.round(data.duration_sec)}s` : ''; + els.status.textContent = `Done in ${data.latency_ms || 0} ms${dur}.`; + } catch (error) { + els.status.textContent = error.message; + renderTrace([{ label: 'Transcription error', detail: error.message, status: 'warning' }]); + } finally { + setBusy(false); + } +} + +function renderTranscriptResults(data) { + const speakerRoles = data.speaker_roles || {}; + const segments = data.segments || []; + const hasSpeakers = segments.some((s) => s.speaker); + + const speakerOrder = [...new Set(segments.filter((s) => s.speaker).map((s) => s.speaker))]; + + const rolesHtml = speakerOrder.length + ? `${speakerOrder.map((id, i) => { + const role = speakerRoles[id] || id; + return `${escapeHtml(role)}${escapeHtml(id)}`; + }).join('')}
` + : ''; + + const segmentsHtml = hasSpeakers + ? `${escapeHtml(data.transcript)}Remove sensitive personal data with configurable Nordic / ECHR / Global profiles.
Convert audio recordings to text with optional speaker separation and Norwegian role labelling.
+Drop audio file here, or
+MP3, WAV, OGG, M4A, FLAC, WEBM — max 200 MB
+