Transcribe: audio-to-text tool with diarization and speaker role labelling

New sixth tool in the hub. Accepts MP3/WAV/OGG/M4A/FLAC/WebM audio up to 200 MB and proxies it to Whisper on the cuttlefish GPU. Optional speaker separation (diarization) with LLM role labelling via GPT-4o-mini (dommer = judge, advokat = lawyer, forelder = parent, sakkyndig = expert witness, etc.). TXT / SRT / VTT downloads are generated client-side from the segment data.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 18:43:22 +02:00
parent bddafea049
commit d425c99e8e
4 changed files with 600 additions and 2 deletions
@@ -1103,3 +1103,112 @@ p {
  text-transform: uppercase;
  letter-spacing: 0.03em;
 }
+
+/* ── Transcribe tool ─────────────────────────────────────────────── */
+
+/* Compact fixed-width number field; presumably the speaker-count input
+   (#numSpeakersInput) — confirm against the markup. */
+.num-speakers-input {
+  width: 4.5rem;
+  padding: 0.25rem 0.5rem;
+  border: 1px solid var(--line);
+  border-radius: 6px;
+  background: #fff;
+  color: var(--ink);
+  font-size: 0.85rem;
+}
+
+/* Wrapping row of speaker/role tags that renderTranscriptResults() emits
+   above the transcript box. */
+.transcript-roles {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 0.4rem;
+  margin-bottom: 0.75rem;
+}
+
+/* Small pill label for one speaker (role name plus raw speaker id). */
+.speaker-tag {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.3rem;
+  font-size: 0.72rem;
+  font-weight: 600;
+  padding: 0.2rem 0.55rem;
+  border-radius: 4px;
+}
+
+/* The nested <small> carries the raw speaker id, de-emphasised next to the role. */
+.speaker-tag small {
+  font-weight: 400;
+  opacity: 0.75;
+}
+
+/* Six colour variants. The script picks one with (speaker index % 6), so a
+   seventh speaker reuses the first colour. */
+.speaker-tag--0 { background: #dbeafe; color: #1d4ed8; }
+.speaker-tag--1 { background: #ede9fe; color: #6d28d9; }
+.speaker-tag--2 { background: #dcfce7; color: #166534; }
+.speaker-tag--3 { background: #fef9c3; color: #854d0e; }
+.speaker-tag--4 { background: #fee2e2; color: #991b1b; }
+.speaker-tag--5 { background: #e7f5f2; color: #0f766e; }
+
+/* Scrollable frame around the full transcript; height is capped at 400px. */
+.transcript-box {
+  background: var(--bg);
+  border: 1px solid var(--line);
+  border-radius: 8px;
+  padding: 1rem;
+  max-height: 400px;
+  overflow-y: auto;
+  margin-bottom: 0.75rem;
+}
+
+/* The transcript itself is rendered in a <pre>: keep its line breaks but let
+   long lines wrap. `overflow-wrap: anywhere` is the standard replacement for
+   the deprecated `word-break: break-word` and has the same effect. */
+.transcript-text {
+  white-space: pre-wrap;
+  overflow-wrap: anywhere;
+  font-size: 0.875rem;
+  line-height: 1.65;
+  font-family: inherit;
+  margin: 0;
+  color: var(--ink);
+}
+
+/* Collapsible <details> wrapper around the per-segment list. */
+.segment-details {
+  border: 1px solid var(--line);
+  border-radius: 8px;
+  margin-bottom: 0.75rem;
+}
+
+/* Clickable <summary> line ("Segments (N)"); text selection is disabled so
+   toggling does not highlight the label. */
+.segment-summary {
+  font-size: 0.8rem;
+  color: var(--muted);
+  padding: 0.6rem 1rem;
+  cursor: pointer;
+  user-select: none;
+}
+
+/* Scrollable list of segment rows; height is capped at 280px. */
+.segment-list {
+  padding: 0.25rem 0.75rem 0.75rem;
+  max-height: 280px;
+  overflow-y: auto;
+}
+
+/* One row per segment: time range, optional speaker tag, then the text. */
+.segment-row {
+  display: flex;
+  gap: 0.6rem;
+  align-items: baseline;
+  padding: 0.2rem 0;
+  font-size: 0.78rem;
+  border-bottom: 1px solid var(--bg);
+}
+
+/* Monospace time column; min-width plus flex-shrink: 0 keep it from
+   collapsing when the segment text is long. */
+.segment-time {
+  color: var(--muted);
+  font-family: ui-monospace, monospace;
+  min-width: 7rem;
+  flex-shrink: 0;
+}
+
+.segment-text {
+  color: var(--ink);
+  line-height: 1.4;
+}
+
+/* Wrapping row holding the TXT / SRT / VTT download buttons. */
+.transcript-downloads {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 0.5rem;
+  margin-top: 0.75rem;
+}
@@ -4,6 +4,8 @@ const state = {
 };

 let lastTimelineEvents = [];
+let lastAudioFile = null;
+let lastTranscriptData = null;

 const tools = {
  ask: {
@@ -56,6 +58,16 @@ const tools = {
    usesLanguage: false,
    badge: 'deterministic first',
  },
+  transcribe: {
+    kind: 'Audio Transcription',
+    title: 'Transcribe audio',
+    label: 'Audio file',
+    endpoint: 'api/transcribe.php',
+    payloadKey: null,
+    placeholder: '',
+    usesLanguage: false,
+    badge: 'Whisper / GPU',
+  },
 };

 const els = {};
@@ -91,6 +103,17 @@ document.addEventListener('DOMContentLoaded', () => {
    aliasSection: document.querySelector('#aliasSection'),
    addAliasRow: document.querySelector('#addAliasRow'),
    aliasRows: document.querySelector('#aliasRows'),
+    audioZone: document.querySelector('#audioZone'),
+    audioInput: document.querySelector('#audioInput'),
+    audioPrompt: document.querySelector('#audioPrompt'),
+    audioFileInfo: document.querySelector('#audioFileInfo'),
+    audioFileName: document.querySelector('#audioFileName'),
+    audioFileSize: document.querySelector('#audioFileSize'),
+    audioClear: document.querySelector('#audioClear'),
+    diarizeControl: document.querySelector('#diarizeControl'),
+    diarizeCheck: document.querySelector('#diarizeCheck'),
+    numSpeakersInput: document.querySelector('#numSpeakersInput'),
+    transcribeLangControl: document.querySelector('#transcribeLangControl'),
  });

  els.tabs.forEach((button) => {
@@ -101,8 +124,12 @@ document.addEventListener('DOMContentLoaded', () => {
  els.healthButton.addEventListener('click', checkHealth);
  setupUpload();
  setupAliases();
+  setupAudio();
  els.results.addEventListener('click', (e) => {
    if (e.target.closest('#exportCsvBtn')) exportTimelineCSV(lastTimelineEvents);
+    if (e.target.closest('#dlTxt')) downloadTranscriptTxt();
+    if (e.target.closest('#dlSrt')) downloadTranscriptSrt();
+    if (e.target.closest('#dlVtt')) downloadTranscriptVtt();
  });
  setTool(state.activeTool);

@@ -132,8 +159,14 @@ function setTool(toolName) {
  els.redactionControl.classList.toggle('is-hidden', toolName !== 'redact');
  els.uploadZone.classList.toggle('is-hidden', toolName !== 'redact' && toolName !== 'timeline');
  els.aliasSection.classList.toggle('is-hidden', toolName !== 'redact');
+  els.audioZone.classList.toggle('is-hidden', toolName !== 'transcribe');
+  els.diarizeControl.classList.toggle('is-hidden', toolName !== 'transcribe');
+  els.transcribeLangControl.classList.toggle('is-hidden', toolName !== 'transcribe');
+  els.input.classList.toggle('is-hidden', toolName === 'transcribe');
+  els.inputLabel.classList.toggle('is-hidden', toolName === 'transcribe');
  resetUpload();
  resetAliases();
+  resetAudio();
  els.status.textContent = '';
  renderTrace([]);
 }
@@ -163,6 +196,12 @@ async function submitPasscode(event) {

 async function runTool(event) {
  event.preventDefault();
+
+  if (state.activeTool === 'transcribe') {
+    await runTranscribe();
+    return;
+  }
+
  const tool = tools[state.activeTool];
  const text = els.input.value.trim();
  if (!text) {
@@ -389,7 +428,9 @@ async function postJson(url, payload) {
 function setBusy(isBusy) {
  const button = document.querySelector('#runButton');
  button.disabled = isBusy;
-  button.textContent = isBusy ? 'Running...' : 'Run Tool';
+  button.textContent = isBusy
+    ? (state.activeTool === 'transcribe' ? 'Transcribing...' : 'Running...')
+    : 'Run Tool';
 }

 function currentLanguage() {
@@ -447,6 +488,10 @@ function renderMainFinding(data) {
  return `<p>${escapeHtml(data.what_we_found || '')}</p>`;
 }

+/**
+ * Reads the language selected for transcription.
+ *
+ * @returns {string} Value of the checked `transcribeLang` input, or 'auto'
+ *   when nothing is checked or the checked value is empty.
+ */
+function currentTranscribeLang() {
+  const checked = document.querySelector('input[name="transcribeLang"]:checked');
+  return (checked && checked.value) || 'auto';
+}
+
 function renderEvidence(data) {
  const items = data.evidence_trail || data.sources || data.hits || [];
  if (!items.length) {
@@ -513,6 +558,224 @@ function exportTimelineCSV(events) {
  URL.revokeObjectURL(url);
 }

+/**
+ * Uploads the selected audio file to the transcription endpoint and renders
+ * the response.
+ *
+ * Sends a multipart form with `audio` and `language`, plus `diarize` and
+ * (when the speaker-count field holds 2 or more) `num_speakers` if the
+ * diarization checkbox is ticked. On success the payload is stored in
+ * `lastTranscriptData` for the download buttons; on failure the message is
+ * shown in the status line and the trace panel. The run button is always
+ * re-enabled afterwards.
+ *
+ * @returns {Promise<void>}
+ */
+async function runTranscribe() {
+  if (!lastAudioFile) {
+    els.status.textContent = 'Choose an audio file before transcribing.';
+    return;
+  }
+  setBusy(true);
+  renderTrace([{ label: 'Sending to Whisper', detail: 'Uploading audio to cuttlefish GPU…', status: 'running' }]);
+
+  try {
+    const formData = new FormData();
+    formData.append('audio', lastAudioFile);
+    formData.append('language', currentTranscribeLang());
+    if (els.diarizeCheck?.checked) {
+      formData.append('diarize', '1');
+      // An empty or non-numeric field parses to NaN, which fails the >= 2 test.
+      const speakerCount = Number.parseInt(els.numSpeakersInput?.value || '', 10);
+      if (speakerCount >= 2) formData.append('num_speakers', String(speakerCount));
+    }
+
+    // Use the endpoint registered for this tool instead of repeating the path.
+    const resp = await fetch(tools.transcribe.endpoint, {
+      method: 'POST',
+      credentials: 'same-origin',
+      body: formData,
+    });
+    // A non-JSON body, or a JSON body that is not an object (e.g. `null`),
+    // is treated as an empty payload so the HTTP-status error below is shown
+    // instead of a raw TypeError message.
+    const parsed = await resp.json().catch(() => null);
+    const data = parsed && typeof parsed === 'object' ? parsed : {};
+    if (!resp.ok || !data.ok) {
+      throw new Error(data.error?.message || `Transcription failed (HTTP ${resp.status}).`);
+    }
+
+    lastTranscriptData = data;
+    renderTranscriptResults(data);
+
+    const dur = data.duration_sec ? ` · Audio: ${Math.round(data.duration_sec)}s` : '';
+    els.status.textContent = `Done in ${data.latency_ms || 0} ms${dur}.`;
+  } catch (error) {
+    els.status.textContent = error.message;
+    renderTrace([{ label: 'Transcription error', detail: error.message, status: 'warning' }]);
+  } finally {
+    setBusy(false);
+  }
+}
+
+/**
+ * Renders a transcription response into the results panel and the trace list.
+ *
+ * Reads `data.transcript`, `data.segments` (each with `start`, `end`, `text`
+ * and an optional `speaker`), `data.speaker_roles` (speaker id -> role label)
+ * and the optional `duration_sec`, `language`, `num_speakers` and `model`
+ * fields. Text values are passed through escapeHtml() before being placed in
+ * the markup. The download buttons carry the ids `dlTxt`, `dlSrt` and `dlVtt`,
+ * which the click listener registered on `els.results` dispatches on.
+ *
+ * @param {object} data - Parsed response from the transcription endpoint.
+ */
+function renderTranscriptResults(data) {
+  const speakerRoles = data.speaker_roles || {};
+  const segments = data.segments || [];
+  const hasSpeakers = segments.some((s) => s.speaker);
+
+  // Distinct speaker ids in order of first appearance; a speaker's position
+  // here selects its colour variant (index % 6).
+  const speakerOrder = [...new Set(segments.filter((s) => s.speaker).map((s) => s.speaker))];
+
+  // Legend of role tags; each falls back to the raw id when no role is known.
+  const rolesHtml = speakerOrder.length
+    ? `<p class="transcript-roles">${speakerOrder.map((id, i) => {
+        const role = speakerRoles[id] || id;
+        return `<span class="speaker-tag speaker-tag--${i % 6}">${escapeHtml(role)}<small>${escapeHtml(id)}</small></span>`;
+      }).join('')}</p>`
+    : '';
+
+  // The per-segment list is only rendered when at least one segment has a speaker.
+  const segmentsHtml = hasSpeakers
+    ? `<details class="segment-details"><summary class="segment-summary">Segments (${segments.length})</summary>
+        <div class="segment-list">${segments.map((seg) => {
+          const idx = speakerOrder.indexOf(seg.speaker);
+          const roleLabel = seg.speaker && speakerRoles[seg.speaker]
+            ? `${speakerRoles[seg.speaker]} (${seg.speaker})`
+            : (seg.speaker || '');
+          return `<div class="segment-row">
+            <span class="segment-time">${fmtTime(seg.start)}–${fmtTime(seg.end)}</span>
+            ${seg.speaker ? `<span class="speaker-tag speaker-tag--${idx >= 0 ? idx % 6 : 0}">${escapeHtml(roleLabel)}</span>` : ''}
+            <span class="segment-text">${escapeHtml(seg.text)}</span>
+          </div>`;
+        }).join('')}</div></details>`
+    : '';
+
+  // SRT and VTT need timed segments, so their buttons are omitted without any.
+  const dlSrtVtt = segments.length
+    ? `<button type="button" class="export-csv-btn" id="dlSrt">Download SRT</button>
+       <button type="button" class="export-csv-btn" id="dlVtt">Download VTT</button>`
+    : '';
+
+  els.results.innerHTML = `
+    <section class="result-section">
+      <h3>Transcript</h3>
+      ${rolesHtml}
+      <div class="transcript-box"><pre class="transcript-text">${escapeHtml(data.transcript)}</pre></div>
+      ${segmentsHtml}
+      <div class="transcript-downloads">
+        <button type="button" class="export-csv-btn" id="dlTxt">Download TXT</button>
+        ${dlSrtVtt}
+      </div>
+    </section>`;
+
+  // Trace panel: one entry per metadata field that is present, or a single
+  // generic "Transcribed" entry when none are.
+  const traceMeta = [];
+  if (data.duration_sec) traceMeta.push({ label: `Duration: ${Math.round(data.duration_sec)}s`, detail: '', status: 'complete' });
+  if (data.language) traceMeta.push({ label: `Language: ${data.language}`, detail: '', status: 'complete' });
+  if (data.num_speakers > 1) traceMeta.push({ label: `Speakers detected: ${data.num_speakers}`, detail: Object.entries(speakerRoles).map(([id, r]) => `${id}: ${r}`).join(', ') || '', status: 'complete' });
+  if (data.model) traceMeta.push({ label: `Model: ${data.model}`, detail: '', status: 'complete' });
+  renderTrace(traceMeta.length ? traceMeta : [{ label: 'Transcribed', detail: '', status: 'complete' }]);
+}
+
+/**
+ * Formats a position in seconds for on-screen display.
+ *
+ * @param {number} secs - Position in seconds; the fractional part is dropped.
+ * @returns {string} `MM:SS`, or `HH:MM:SS` once the value reaches one hour.
+ */
+function fmtTime(secs) {
+  const h = Math.floor(secs / 3600);
+  const m = Math.floor((secs % 3600) / 60);
+  const s = Math.floor(secs % 60);
+  const parts = h > 0 ? [pad2(h), pad2(m), pad2(s)] : [pad2(m), pad2(s)];
+  return parts.join(':');
+}
+
+/** Left-pads a value with zeros to at least two characters. */
+function pad2(n) { return String(n).padStart(2, '0'); }
+
+/**
+ * Formats a position in seconds as an SRT timestamp (`HH:MM:SS,mmm`).
+ *
+ * The value is rounded to whole milliseconds first and every field is derived
+ * from that total, so a fraction that rounds up to 1000 ms carries into the
+ * seconds (1.9996 -> `00:00:02,000`) instead of producing a four-digit
+ * millisecond field. Non-finite or negative input is treated as 0 so the
+ * output is always a well-formed timestamp.
+ *
+ * @param {number} secs - Position in seconds.
+ * @returns {string} Timestamp such as `01:02:03,450`.
+ */
+function toSrtTime(secs) {
+  const value = Number(secs);
+  const totalMs = Number.isFinite(value) && value > 0 ? Math.round(value * 1000) : 0;
+  const ms = totalMs % 1000;
+  const totalSecs = (totalMs - ms) / 1000;
+  const h = Math.floor(totalSecs / 3600);
+  const m = Math.floor((totalSecs % 3600) / 60);
+  const s = totalSecs % 60;
+  return `${pad2(h)}:${pad2(m)}:${pad2(s)},${String(ms).padStart(3, '0')}`;
+}
+
+/**
+ * Formats a position in seconds as a WebVTT timestamp (`HH:MM:SS.mmm`), i.e.
+ * the SRT form with a dot as the millisecond separator.
+ *
+ * @param {number} secs - Position in seconds.
+ * @returns {string} Timestamp such as `01:02:03.450`.
+ */
+function toVttTime(secs) {
+  return toSrtTime(secs).replace(',', '.');
+}
+
+/**
+ * Offers a Blob to the user as a file download.
+ *
+ * Creates a temporary object URL, clicks a detached <a download> element that
+ * points at it, then revokes the URL.
+ *
+ * @param {Blob} blob - Content to download.
+ * @param {string} filename - Suggested file name.
+ */
+function downloadBlob(blob, filename) {
+  const objectUrl = URL.createObjectURL(blob);
+  const link = document.createElement('a');
+  link.href = objectUrl;
+  link.download = filename;
+  link.click();
+  URL.revokeObjectURL(objectUrl);
+}
+
+/**
+ * Downloads the most recent transcript as `transcript.txt`.
+ * Does nothing when no transcription result has been stored yet.
+ */
+function downloadTranscriptTxt() {
+  if (lastTranscriptData) {
+    const textBlob = new Blob([lastTranscriptData.transcript], { type: 'text/plain' });
+    downloadBlob(textBlob, 'transcript.txt');
+  }
+}
+
+/**
+ * Downloads the most recent transcript as a SubRip file (`transcript.srt`).
+ *
+ * One numbered cue is written per segment. When a segment has a speaker, the
+ * cue text is prefixed with `[role]`, or `[speaker id]` if no role is known.
+ * Segment text is trimmed line by line and empty lines are dropped, because a
+ * blank line inside a cue would end it early. Does nothing when there are no
+ * segments.
+ */
+function downloadTranscriptSrt() {
+  if (!lastTranscriptData?.segments?.length) return;
+  const { segments, speaker_roles: roles = {} } = lastTranscriptData;
+  const cueText = (value) => String(value ?? '')
+    .split(/\r\n|\r|\n/)
+    .map((line) => line.trim())
+    .filter(Boolean)
+    .join('\n');
+  const lines = segments.map((seg, i) => {
+    const spk = seg.speaker ? `[${roles[seg.speaker] || seg.speaker}] ` : '';
+    return `${i + 1}\n${toSrtTime(seg.start)} --> ${toSrtTime(seg.end)}\n${spk}${cueText(seg.text)}\n`;
+  });
+  // Each cue already ends in a newline; joining with another one yields the
+  // blank separator line between cues.
+  downloadBlob(new Blob([lines.join('\n')], { type: 'text/srt' }), 'transcript.srt');
+}
+
+/**
+ * Downloads the most recent transcript as a WebVTT file (`transcript.vtt`).
+ *
+ * One cue is written per segment. When a segment has a speaker, the cue
+ * starts with a voice tag `<v name>` holding the role, or the speaker id if
+ * no role is known. Cue text and voice names have `&`, `<` and `>` replaced
+ * by character references: left raw, they would be parsed as markup, and a
+ * literal `-->` in the text would make the cue invalid. Text is trimmed line
+ * by line and empty lines are dropped, since a blank line ends a cue. Does
+ * nothing when there are no segments.
+ */
+function downloadTranscriptVtt() {
+  if (!lastTranscriptData?.segments?.length) return;
+  const { segments, speaker_roles: roles = {} } = lastTranscriptData;
+  const escapeVtt = (value) => String(value ?? '')
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;');
+  const cueText = (value) => String(value ?? '')
+    .split(/\r\n|\r|\n/)
+    .map((line) => line.trim())
+    .filter(Boolean)
+    .map(escapeVtt)
+    .join('\n');
+  const lines = ['WEBVTT\n'];
+  segments.forEach((seg) => {
+    const voice = seg.speaker ? String(roles[seg.speaker] || seg.speaker) : '';
+    // A voice annotation must stay on one line, so collapse any whitespace runs.
+    const spk = voice ? `<v ${escapeVtt(voice.replace(/\s+/g, ' ').trim())}>` : '';
+    lines.push(`${toVttTime(seg.start)} --> ${toVttTime(seg.end)}\n${spk}${cueText(seg.text)}\n`);
+  });
+  downloadBlob(new Blob([lines.join('\n')], { type: 'text/vtt' }), 'transcript.vtt');
+}
+
+/**
+ * Clears the current audio selection and returns the picker to its empty
+ * state: forgets the stored file, empties the file input, shows the prompt,
+ * hides the file-info panel and blanks the name and size labels. Only the
+ * stored file is cleared when the file input element is missing.
+ */
+function resetAudio() {
+  lastAudioFile = null;
+
+  const { audioInput, audioPrompt, audioFileInfo, audioFileName, audioFileSize } = els;
+  if (!audioInput) return;
+
+  audioInput.value = '';
+  if (audioPrompt) audioPrompt.classList.remove('is-hidden');
+  if (audioFileInfo) audioFileInfo.classList.add('is-hidden');
+  for (const label of [audioFileName, audioFileSize]) {
+    if (label) label.textContent = '';
+  }
+}
+
+/**
+ * Wires up the audio picker: drag-and-drop onto the zone, click-to-browse,
+ * the file input's change event and the clear button.
+ *
+ * Does nothing when the drop zone or the file input is missing. The clear
+ * button is optional: the click handler already tolerated its absence, so
+ * binding its listener is guarded as well rather than throwing during setup.
+ */
+function setupAudio() {
+  const { audioZone, audioInput, audioClear } = els;
+  if (!audioZone || !audioInput) return;
+
+  audioZone.addEventListener('dragover', (e) => {
+    // preventDefault is required for the zone to accept a drop.
+    e.preventDefault();
+    audioZone.classList.add('is-drag-over');
+  });
+
+  audioZone.addEventListener('dragleave', (e) => {
+    // Ignore dragleave events fired while moving between child elements.
+    if (!audioZone.contains(e.relatedTarget)) {
+      audioZone.classList.remove('is-drag-over');
+    }
+  });
+
+  audioZone.addEventListener('drop', (e) => {
+    e.preventDefault();
+    audioZone.classList.remove('is-drag-over');
+    // Only the first dropped file is used.
+    const f = e.dataTransfer?.files?.[0];
+    if (f) handleAudio(f);
+  });
+
+  audioZone.addEventListener('click', (e) => {
+    // Clicks on the clear button or on a <label> must not open the picker.
+    if (e.target === audioClear || audioClear?.contains(e.target)) return;
+    if (e.target.tagName === 'LABEL') return;
+    audioInput.click();
+  });
+
+  audioInput.addEventListener('change', () => {
+    const f = audioInput.files?.[0];
+    if (f) handleAudio(f);
+  });
+
+  audioClear?.addEventListener('click', () => {
+    resetAudio();
+    els.status.textContent = '';
+  });
+}
+
+/**
+ * Validates a chosen audio file and, if accepted, stores it as the pending
+ * upload and updates the picker UI.
+ *
+ * A file is rejected, with the reason shown in the status line, when its
+ * extension is not in the allow-list or it is larger than 200 MB. The
+ * extension is taken from the text after the last dot; a name with no dot
+ * has no extension and is rejected, rather than having its whole name
+ * treated as the extension.
+ *
+ * @param {File} file - File picked in the input or dropped on the zone.
+ */
+function handleAudio(file) {
+  const allowedExts = ['mp3', 'wav', 'ogg', 'oga', 'm4a', 'mp4', 'flac', 'webm', 'aac'];
+  const dotIndex = file.name.lastIndexOf('.');
+  const ext = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();
+  if (!allowedExts.includes(ext)) {
+    const shown = dotIndex === -1 ? 'no file extension' : `.${ext}`;
+    els.status.textContent = `Unsupported format: ${shown}. Use MP3, WAV, OGG, M4A, FLAC, or WebM.`;
+    return;
+  }
+  const sizeMB = file.size / 1024 / 1024;
+  if (sizeMB > 200) {
+    els.status.textContent = `File too large (${sizeMB.toFixed(1)} MB). Maximum 200 MB.`;
+    return;
+  }
+  lastAudioFile = file;
+  if (els.audioFileName) els.audioFileName.textContent = file.name;
+  if (els.audioFileSize) els.audioFileSize.textContent = `${sizeMB.toFixed(1)} MB`;
+  if (els.audioPrompt) els.audioPrompt.classList.add('is-hidden');
+  if (els.audioFileInfo) els.audioFileInfo.classList.remove('is-hidden');
+  els.status.textContent = `Ready: ${file.name} (${sizeMB.toFixed(1)} MB)`;
+}
+
 function renderEntityCounts(counts = {}) {
  const entries = Object.entries(counts).filter(([, count]) => Number(count) > 0);
  if (!entries.length) {