Per-tool pages + multi-engine transcribe with expert controls

- Split monolithic index.php into per-tool pages (ask, search, summarize, timeline, redact, transcribe), each with its own URL and bookmarkable state
- Shared shell: includes/layout.php + layout_footer.php; shared form: includes/tool_form.php used by all text-tool pages
- index.php now redirects authenticated users to ask.php; unauthenticated users see the login gate only
- transcribe.php: engine selector (GPU/OpenAI/Azure), model size (small/medium/large-v3), diarize, language, expert settings (beam, VAD, task, initial prompt)
- api/transcribe.php: engine routing — GPU (cuttlefish), OpenAI BYOK, Azure AI Speech; passes model/beam/task/vad/prompt to Whisper server
- tools.js: data-active-tool body attr drives setTool() on load; <a> nav tabs skip click listeners; null guards on form/passcodeForm; engine radio toggle shows/hides BYOK key inputs and model selector; RTF shown in status
- tools.css: styles for BYOK inputs, expert settings panel, prompt textarea

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 22:14:20 +02:00
parent d178fbf295
commit eaff2a4d86
13 changed files with 789 additions and 257 deletions
@@ -6,12 +6,21 @@ require_once __DIR__ . '/../includes/LegalTools.php';
 dbnToolsRequireMethod('POST');
 dbnToolsRequireAuth();

-$validLangs  = ['auto', 'no', 'en', 'sv', 'da', 'de', 'fr', 'es', 'pl'];
+// ── Common params ─────────────────────────────────────────────────────────────
+
+$validLangs  = ['auto', 'no', 'nn', 'en', 'sv', 'da', 'de', 'fr', 'es', 'pl', 'fi', 'nl', 'it', 'pt', 'ru', 'ar', 'tr', 'zh', 'ja', 'ko'];
 $language    = strtolower(trim((string)($_POST['language'] ?? 'auto')));
 if (!in_array($language, $validLangs, true)) $language = 'auto';

 $diarize     = !empty($_POST['diarize']) && $_POST['diarize'] !== '0';
 $numSpeakers = isset($_POST['num_speakers']) ? max(0, min(20, (int)$_POST['num_speakers'])) : 0;
+// Engine and model are whitelisted; any other value falls back to 'gpu' / 'small'.
+$engine      = in_array($_POST['engine'] ?? '', ['gpu', 'openai', 'azure'], true) ? $_POST['engine'] : 'gpu';
+$validModels = ['tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3'];
+$model       = in_array($_POST['model'] ?? '', $validModels, true) ? $_POST['model'] : 'small';
+// Beam size is clamped to the range 1–5; a missing value means 5.
+$beamSize    = max(1, min(5, (int)($_POST['beam_size'] ?? 5)));
+$task        = ($_POST['task'] ?? 'transcribe') === 'translate' ? 'translate' : 'transcribe';
+$vadFilter   = !empty($_POST['vad_filter']) && $_POST['vad_filter'] !== '0';
+// Cap the prompt at 500 characters. Truncation is done on character boundaries:
+// a byte-wise substr() can cut a multi-byte UTF-8 sequence (æ/ø/å, CJK, …) in half
+// and leave invalid UTF-8 at the end of the string. A non-string value (e.g.
+// initial_prompt[]=x) is treated as "no prompt" instead of being cast to "Array".
+$rawPrompt   = $_POST['initial_prompt'] ?? '';
+$rawPrompt   = is_string($rawPrompt) ? trim($rawPrompt) : '';
+$initPrompt  = function_exists('mb_substr') ? mb_substr($rawPrompt, 0, 500, 'UTF-8') : substr($rawPrompt, 0, 500);

 // ── Validate upload ───────────────────────────────────────────────────────────

@@ -39,88 +48,41 @@ if (!in_array($ext, $allowedExts, true)) {
    dbnToolsError("Unsupported format: .{$ext}. Use MP3, WAV, OGG, M4A, FLAC, or WebM.", 415, 'unsupported_format');
 }

-// ── Build Whisper request ─────────────────────────────────────────────────────
-
-$whisperBase = 'http://194.93.49.14:20019';
-$endpoint    = $diarize ? $whisperBase . '/transcribe/diarize' : $whisperBase . '/transcribe';
-
-$boundary = '----DBN' . bin2hex(random_bytes(8));
-$body     = "--{$boundary}\r\n";
-$body    .= 'Content-Disposition: form-data; name="file"; filename="' . addslashes(basename($file['name'])) . '"' . "\r\n";
-$body    .= "Content-Type: application/octet-stream\r\n\r\n";
-
-$fileContents = file_get_contents($file['tmp_name']);
-if ($fileContents === false) {
-    dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
+// OpenAI has a 25 MB file limit
+if ($engine === 'openai' && $file['size'] > 25 * 1024 * 1024) {
+    dbnToolsError('OpenAI Whisper API has a 25 MB file limit. Use the GPU engine for larger files.', 413, 'openai_file_too_large');
 }
-$body .= $fileContents . "\r\n";
-
-if ($language !== 'auto') {
-    $body .= "--{$boundary}\r\n";
-    $body .= "Content-Disposition: form-data; name=\"language\"\r\n\r\n";
-    $body .= $language . "\r\n";
-}
-
-if ($diarize && $numSpeakers > 1) {
-    $body .= "--{$boundary}\r\n";
-    $body .= "Content-Disposition: form-data; name=\"num_speakers\"\r\n\r\n";
-    $body .= $numSpeakers . "\r\n";
-}
-
-$body .= "--{$boundary}--\r\n";
-
-// ── Call Whisper ──────────────────────────────────────────────────────────────

 $t0 = microtime(true);

-if (function_exists('curl_init')) {
-    $ch = curl_init($endpoint);
-    curl_setopt_array($ch, [
-        CURLOPT_RETURNTRANSFER => true,
-        CURLOPT_POST           => true,
-        CURLOPT_POSTFIELDS     => $body,
-        CURLOPT_HTTPHEADER     => [
-            "Content-Type: multipart/form-data; boundary={$boundary}",
-            'Accept: application/json',
-        ],
-        CURLOPT_TIMEOUT        => 600,
-    ]);
-    $whisperBody = curl_exec($ch);
-    $httpCode    = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
-    $curlErr     = curl_error($ch);
-    curl_close($ch);
+// ── Route to engine ───────────────────────────────────────────────────────────

-    if ($whisperBody === false || $httpCode !== 200) {
-        dbnToolsError('Whisper service error (HTTP ' . $httpCode . '): ' . $curlErr, 502, 'whisper_error');
+if ($engine === 'openai') {
+    $apiKey = trim((string)($_POST['openai_key'] ?? ''));
+    if (!$apiKey || !str_starts_with($apiKey, 'sk-')) {
+        dbnToolsError('A valid OpenAI API key (sk-…) is required for the OpenAI engine.', 400, 'missing_openai_key');
    }
+    $result = transcribeViaOpenAI($file, $language, $task, $apiKey);
+
+} elseif ($engine === 'azure') {
+    $apiKey = trim((string)($_POST['azure_key'] ?? ''));
+    $region = preg_replace('/[^a-z0-9]/', '', strtolower(trim((string)($_POST['azure_region'] ?? 'norwayeast'))));
+    if (!$apiKey) {
+        dbnToolsError('An Azure Speech API key is required for the Azure engine.', 400, 'missing_azure_key');
+    }
+    $result = transcribeViaAzure($file, $language, $apiKey, $region, $diarize);
+
 } else {
-    $ctx = stream_context_create([
-        'http' => [
-            'method'        => 'POST',
-            'timeout'       => 600,
-            'header'        => "Content-Type: multipart/form-data; boundary={$boundary}\r\nAccept: application/json\r\n",
-            'content'       => $body,
-            'ignore_errors' => true,
-        ],
-    ]);
-    $whisperBody = @file_get_contents($endpoint, false, $ctx);
-
-    if ($whisperBody === false) {
-        dbnToolsError('Whisper service unreachable. The GPU may be offline.', 502, 'whisper_unreachable');
-    }
+    // GPU (default)
+    $result = transcribeViaWhisperGpu($file, $language, $diarize, $numSpeakers, $model, $beamSize, $task, $vadFilter, $initPrompt);
 }

 $latencyMs = (int)round((microtime(true) - $t0) * 1000);

-$whisper = json_decode($whisperBody, true);
-if (!is_array($whisper) || empty($whisper['text'])) {
-    dbnToolsError('Empty or invalid response from Whisper.', 502, 'whisper_empty');
-}
+// ── Speaker role labelling (GPU + diarize only) ───────────────────────────────

-// ── Speaker role labelling ────────────────────────────────────────────────────
-
-$segments    = is_array($whisper['segments'] ?? null) ? $whisper['segments'] : [];
-$numDetected = (int)($whisper['num_speakers'] ?? 1);
+$segments    = $result['segments']    ?? [];
+$numDetected = (int)($result['num_speakers'] ?? 1);

 if ($numDetected < 2 && $segments) {
    $uniqueSpeakers = array_filter(array_unique(array_column($segments, 'speaker')));
@@ -132,10 +94,12 @@ if ($diarize && $numDetected > 1 && $segments) {
    $speakerRoles = dbnLabelSpeakerRoles($segments);
 }

-// ── Respond ───────────────────────────────────────────────────────────────────
+// ── Log + respond ─────────────────────────────────────────────────────────────

 dbnToolsLogMetadata([
    'tool'       => 'transcribe',
+    'engine'     => $engine,
+    'model'      => $model,
    'language'   => $language,
    'ok'         => true,
    'latency_ms' => $latencyMs,
@@ -144,17 +108,237 @@ dbnToolsLogMetadata([
 dbnToolsRespond([
    'ok'            => true,
    'tool'          => 'transcribe',
-    'transcript'    => (string)$whisper['text'],
+    'transcript'    => (string)($result['text'] ?? ''),
    'segments'      => $segments,
    'speaker_roles' => $speakerRoles,
    'num_speakers'  => $numDetected,
-    'language'      => (string)($whisper['language'] ?? $language),
-    'duration_sec'  => round((float)($whisper['duration_seconds'] ?? 0), 2),
-    'model'         => (string)($whisper['model'] ?? 'whisper'),
+    'language'      => (string)($result['language'] ?? $language),
+    'duration_sec'  => round((float)($result['duration_seconds'] ?? 0), 2),
+    'processing_sec'=> round((float)($result['processing_seconds'] ?? 0), 2),
+    'model'         => (string)($result['model'] ?? ($engine === 'gpu' ? $model : $engine)),
+    'engine'        => $engine,
    'latency_ms'    => $latencyMs,
 ]);

-// ── Speaker role labelling helper ─────────────────────────────────────────────
+
+// ── Engine implementations ────────────────────────────────────────────────────
+
+/**
+ * Transcribe an uploaded audio file on the self-hosted Whisper GPU server.
+ *
+ * The file is sent as multipart/form-data to `/transcribe`, or to `/transcribe/diarize`
+ * when $diarize is true, together with the decoding options. Every failure is reported
+ * through dbnToolsError() (presumably terminating the request — defined outside this file
+ * section).
+ *
+ * @param array  $file        Upload entry; only 'name' and 'tmp_name' are read.
+ * @param string $language    Language code, or 'auto' to send no language field.
+ * @param bool   $diarize     Selects the diarisation endpoint.
+ * @param int    $numSpeakers Forwarded only when diarising and greater than 1.
+ * @param string $model       Forwarded as the `model` field.
+ * @param int    $beamSize    Forwarded as the `beam_size` field.
+ * @param string $task        Forwarded as the `task` field.
+ * @param bool   $vadFilter   Forwarded as `vad_filter` ('1' or '0').
+ * @param string $initPrompt  Forwarded as `initial_prompt`; omitted when empty.
+ *
+ * @return array Decoded JSON response of the server; 'text' is present and non-empty.
+ */
+function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, int $numSpeakers,
+                                  string $model, int $beamSize, string $task,
+                                  bool $vadFilter, string $initPrompt): array
+{
+    $whisperBase = 'http://194.93.49.14:20019';
+    $endpoint    = $diarize ? $whisperBase . '/transcribe/diarize' : $whisperBase . '/transcribe';
+
+    if (!is_readable($file['tmp_name'])) {
+        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
+    }
+
+    $fields = [
+        'model'          => $model,
+        'beam_size'      => (string)$beamSize,
+        'task'           => $task,
+        'vad_filter'     => $vadFilter ? '1' : '0',
+        'initial_prompt' => $initPrompt,
+    ];
+    if ($language !== 'auto') $fields['language'] = $language;
+    if ($diarize && $numSpeakers > 1) $fields['num_speakers'] = (string)$numSpeakers;
+
+    // Empty values are left out of the request entirely rather than sent as blank fields.
+    $fields = array_filter($fields, static fn(string $value): bool => $value !== '');
+
+    // CURLFile lets cURL stream the upload from disk and build the multipart body itself.
+    // Reading the file into a string and concatenating a hand-made body keeps at least two
+    // full copies of the audio in memory, which is a problem for exactly the large files
+    // this engine is meant to handle.
+    $postFields = ['file' => new CURLFile($file['tmp_name'], 'application/octet-stream', basename($file['name']))]
+                + $fields;
+
+    $ch = curl_init($endpoint);
+    curl_setopt_array($ch, [
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_POST           => true,
+        // An array payload makes cURL send multipart/form-data with its own boundary.
+        CURLOPT_POSTFIELDS     => $postFields,
+        CURLOPT_HTTPHEADER     => [
+            'Accept: application/json',
+        ],
+        CURLOPT_TIMEOUT        => 600,
+    ]);
+    $responseBody = curl_exec($ch);
+    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+    $curlErr      = curl_error($ch);
+    curl_close($ch);
+
+    if ($responseBody === false || $httpCode !== 200) {
+        // Prefer the transport error; otherwise show a short, tag-free excerpt of the reply.
+        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
+        dbnToolsError('Whisper service error (HTTP ' . $httpCode . '): ' . $detail, 502, 'whisper_error');
+    }
+
+    $data = json_decode($responseBody, true);
+    if (!is_array($data) || empty($data['text'])) {
+        dbnToolsError('Empty or invalid response from Whisper.', 502, 'whisper_empty');
+    }
+    return $data;
+}
+
+
+/**
+ * Transcribe (or translate to English) an uploaded audio file with the OpenAI audio API,
+ * using the caller-supplied API key, and normalise the reply to the internal result shape.
+ *
+ * Every failure is reported through dbnToolsError() (presumably terminating the request —
+ * defined outside this file section).
+ *
+ * @param array  $file     Upload entry; only 'name' and 'tmp_name' are read.
+ * @param string $language Language code, or 'auto' to let the API detect the language.
+ * @param string $task     'translate' selects the translations endpoint; anything else transcribes.
+ * @param string $apiKey   OpenAI API key, sent as a Bearer token.
+ *
+ * @return array Keys: text, language, duration_seconds, processing_seconds (always 0),
+ *               segments (each labelled SPEAKER_00), model.
+ */
+function transcribeViaOpenAI(array $file, string $language, string $task, string $apiKey): array
+{
+    // Translation is a separate endpoint in the OpenAI API; the transcriptions endpoint has
+    // no `task` parameter, so posting one there never produced a translation.
+    $isTranslation = $task === 'translate';
+    $endpoint      = $isTranslation
+        ? 'https://api.openai.com/v1/audio/translations'
+        : 'https://api.openai.com/v1/audio/transcriptions';
+
+    if (!is_readable($file['tmp_name'])) {
+        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
+    }
+
+    // CURLFile streams the upload from disk and lets cURL build the multipart body.
+    $postFields = [
+        'file'            => new CURLFile($file['tmp_name'], 'application/octet-stream', basename($file['name'])),
+        'model'           => 'whisper-1',
+        'response_format' => 'verbose_json',
+    ];
+    // `language` is an input-language hint of the transcriptions endpoint only.
+    if (!$isTranslation && $language !== 'auto') {
+        $postFields['language'] = $language;
+    }
+
+    $ch = curl_init($endpoint);
+    curl_setopt_array($ch, [
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_POST           => true,
+        CURLOPT_POSTFIELDS     => $postFields,
+        CURLOPT_HTTPHEADER     => [
+            "Authorization: Bearer {$apiKey}",
+            'Accept: application/json',
+        ],
+        CURLOPT_TIMEOUT => 300,
+    ]);
+    $responseBody = curl_exec($ch);
+    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+    $curlErr      = curl_error($ch);
+    curl_close($ch);
+
+    if ($responseBody === false || $httpCode !== 200) {
+        // Prefer the transport error; otherwise show a short, tag-free excerpt of the reply.
+        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
+        dbnToolsError('OpenAI API error (HTTP ' . $httpCode . '): ' . $detail, 502, 'openai_error');
+    }
+
+    $data = json_decode($responseBody, true);
+    if (!is_array($data)) {
+        dbnToolsError('Invalid response from OpenAI.', 502, 'openai_empty');
+    }
+
+    // A missing or malformed `segments` value becomes an empty list instead of a TypeError.
+    $rawSegments = is_array($data['segments'] ?? null) ? $data['segments'] : [];
+
+    // Normalise to internal shape
+    return [
+        'text'               => (string)($data['text'] ?? ''),
+        'language'           => (string)($data['language'] ?? $language),
+        'duration_seconds'   => (float)($data['duration'] ?? 0),
+        'processing_seconds' => 0,
+        // This API returns no speaker information, so every segment gets the same label.
+        'segments'           => array_map(static fn($s) => [
+            'id'      => $s['id']    ?? 0,
+            'start'   => $s['start'] ?? 0,
+            'end'     => $s['end']   ?? 0,
+            'text'    => $s['text']  ?? '',
+            'speaker' => 'SPEAKER_00',
+        ], $rawSegments),
+        'model' => 'openai/whisper-1',
+    ];
+}
+
+
+/**
+ * Transcribe an uploaded audio file with Azure AI Speech and normalise the reply to the
+ * internal result shape.
+ *
+ * The raw audio bytes are POSTed to the regional speech-to-text REST endpoint for short
+ * audio (`/speech/recognition/conversation/cognitiveservices/v1`). Every failure is
+ * reported through dbnToolsError() (presumably terminating the request — defined outside
+ * this file section).
+ *
+ * NOTE(review): Microsoft documents this endpoint as limited to 60 seconds of audio in
+ * WAV (PCM) or OGG (Opus); longer or differently encoded uploads are likely to be rejected
+ * or cut short. It is not the Batch Transcription API — confirm this is the intended one.
+ *
+ * @param array  $file     Upload entry; only 'name' and 'tmp_name' are read.
+ * @param string $language Short language code; codes without a mapping below, including
+ *                         'auto', are sent as nb-NO because the endpoint needs an explicit locale.
+ * @param string $apiKey   Azure Speech key, sent as Ocp-Apim-Subscription-Key.
+ * @param string $region   Azure region used as the host prefix (e.g. norwayeast).
+ * @param bool   $diarize  Currently unused: no diarisation is requested and every segment
+ *                         is labelled SPEAKER_00.
+ *
+ * @return array Keys: text, language (the locale sent), duration_seconds (always 0),
+ *               processing_seconds (always 0), segments (one per recognised word, when the
+ *               reply contains word timings), model.
+ */
+function transcribeViaAzure(array $file, string $language, string $apiKey,
+                             string $region, bool $diarize): array
+{
+    // An empty region would yield the invalid host "https://.stt.speech.microsoft.com".
+    if ($region === '') {
+        dbnToolsError('An Azure region (e.g. norwayeast) is required for the Azure engine.', 400, 'missing_azure_region');
+    }
+
+    // Every language the endpoint's caller accepts needs its own locale here; without an
+    // entry the audio would silently be recognised as Norwegian.
+    $langCode = match($language) {
+        'no', 'nb' => 'nb-NO',
+        'nn'       => 'nn-NO',
+        'en'       => 'en-US',
+        'sv'       => 'sv-SE',
+        'da'       => 'da-DK',
+        'de'       => 'de-DE',
+        'fr'       => 'fr-FR',
+        'es'       => 'es-ES',
+        'pl'       => 'pl-PL',
+        'fi'       => 'fi-FI',
+        'nl'       => 'nl-NL',
+        'it'       => 'it-IT',
+        'pt'       => 'pt-PT',
+        'ru'       => 'ru-RU',
+        'ar'       => 'ar-SA',
+        'tr'       => 'tr-TR',
+        'zh'       => 'zh-CN',
+        'ja'       => 'ja-JP',
+        'ko'       => 'ko-KR',
+        default    => 'nb-NO',
+    };
+
+    // Mime type map
+    $mimeMap = [
+        'wav'  => 'audio/wav',  'mp3'  => 'audio/mpeg', 'ogg'  => 'audio/ogg',
+        'oga'  => 'audio/ogg',  'm4a'  => 'audio/mp4',  'mp4'  => 'audio/mp4',
+        'flac' => 'audio/flac', 'webm' => 'audio/webm', 'aac'  => 'audio/aac',
+    ];
+    $fileExt  = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
+    $mimeType = $mimeMap[$fileExt] ?? 'audio/wav';
+
+    $endpoint = "https://{$region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1"
+              . "?language={$langCode}&format=detailed";
+
+    $fileContents = file_get_contents($file['tmp_name']);
+    if ($fileContents === false) {
+        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
+    }
+
+    $ch = curl_init($endpoint);
+    curl_setopt_array($ch, [
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_POST           => true,
+        // The request body is the audio itself, not a form.
+        CURLOPT_POSTFIELDS     => $fileContents,
+        CURLOPT_HTTPHEADER     => [
+            "Ocp-Apim-Subscription-Key: {$apiKey}",
+            "Content-Type: {$mimeType}",
+            'Accept: application/json',
+        ],
+        CURLOPT_TIMEOUT => 300,
+    ]);
+    $responseBody = curl_exec($ch);
+    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+    $curlErr      = curl_error($ch);
+    curl_close($ch);
+
+    if ($responseBody === false || $httpCode !== 200) {
+        // Prefer the transport error; otherwise show a short, tag-free excerpt of the reply.
+        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
+        dbnToolsError('Azure Speech error (HTTP ' . $httpCode . '): ' . $detail, 502, 'azure_error');
+    }
+
+    $data = json_decode($responseBody, true);
+    // With format=detailed the display text may be carried only by the best hypothesis,
+    // so fall back to NBest[0].Display when the top-level DisplayText is absent.
+    $text = is_array($data) ? (string)($data['DisplayText'] ?? $data['NBest'][0]['Display'] ?? '') : '';
+    if ($text === '') {
+        dbnToolsError('Empty or invalid response from Azure Speech.', 502, 'azure_empty');
+    }
+
+    // Normalise to internal shape
+    $words = is_array($data['NBest'][0]['Words'] ?? null) ? $data['NBest'][0]['Words'] : [];
+    $segs  = [];
+    foreach ($words as $i => $word) {
+        // Offset and Duration are divided by 10,000,000 to convert them to seconds.
+        $offset = (float)($word['Offset'] ?? 0);
+        $segs[] = [
+            'id'      => $i,
+            'start'   => round($offset / 10_000_000, 3),
+            'end'     => round(($offset + (float)($word['Duration'] ?? 0)) / 10_000_000, 3),
+            'text'    => (string)($word['Word'] ?? ''),
+            'speaker' => 'SPEAKER_00',
+        ];
+    }
+
+    return [
+        'text'               => $text,
+        'language'           => $langCode,
+        'duration_seconds'   => 0,
+        'processing_seconds' => 0,
+        'segments'           => $segs,
+        'model'              => "azure/{$langCode}",
+    ];
+}
+

 function dbnLabelSpeakerRoles(array $segments): array
 {