feat: auto-select STT engine (Azure → Google Cloud → Whisper) and show provider in results

Removes user-facing engine/model/key/beam controls. The server now picks the best available engine automatically:

1. Microsoft Azure Speech — short clips (≤1MB, no diarization, audio/*)
2. Google Cloud Speech v2 — long audio, diarization, all languages
3. OpenAI Whisper GPU — local fallback

Results display which provider was used (e.g. "Transcribed with Google Cloud Speech") via transcript-engine-badge and traceMeta.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 13:22:24 +02:00
parent c6a9cc9199
commit 08d1e3cee3
14 changed files with 2937 additions and 416 deletions
@@ -17,9 +17,8 @@ if (!in_array($language, $validLangs, true)) $language = 'auto';

 $diarize     = !empty($_POST['diarize']) && $_POST['diarize'] !== '0';
 $numSpeakers = isset($_POST['num_speakers']) ? max(0, min(20, (int)$_POST['num_speakers'])) : 0;
-$engine      = in_array($_POST['engine'] ?? '', ['gpu', 'openai', 'azure'], true) ? $_POST['engine'] : 'gpu';
 $validModels = ['tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3'];
-$model       = in_array($_POST['model'] ?? '', $validModels, true) ? $_POST['model'] : 'small';
+$gpuModel    = in_array($_POST['model'] ?? '', $validModels, true) ? $_POST['model'] : 'large-v3';
 $beamSize    = max(1, min(5, (int)($_POST['beam_size'] ?? 5)));
 $task        = ($_POST['task'] ?? 'transcribe') === 'translate' ? 'translate' : 'transcribe';
 $vadFilter   = !empty($_POST['vad_filter']) && $_POST['vad_filter'] !== '0';
@@ -51,36 +50,56 @@ if (!in_array($ext, $allowedExts, true)) {
    dbnToolsError("Unsupported format: .{$ext}. Use MP3, WAV, OGG, M4A, FLAC, or WebM.", 415, 'unsupported_format');
 }

-// OpenAI has a 25 MB file limit
-if ($engine === 'openai' && $file['size'] > 25 * 1024 * 1024) {
-    dbnToolsError('OpenAI Whisper API has a 25 MB file limit. Use the GPU engine for larger files.', 413, 'openai_file_too_large');
+$detectedMime = mime_content_type($file['tmp_name']) ?: 'application/octet-stream';
+$timeOffset   = max(0.0, (float)($_POST['time_offset'] ?? 0));
+$t0           = microtime(true);
+
+// ── Auto-cascade: Azure → GCP → Whisper GPU ───────────────────────────────────
+
+$result      = null;
+$engineUsed  = 'whisper-gpu';
+
+// 1. Microsoft Azure Speech — fast path for short, non-diarize audio clips
+$azureKey    = (string)(dbnToolsEnv('DBN_AZURE_SPEECH_KEY') ?? '');
+$azureRegion = preg_replace('/[^a-z0-9]/', '', strtolower(
+    (string)(dbnToolsEnv('DBN_AZURE_SPEECH_REGION') ?? 'norwayeast')
+));
+// $task is only ever handed to the Whisper call; neither cloud call below
+// receives it. A 'translate' request therefore has to skip both cloud engines,
+// otherwise it would come back as a plain, untranslated transcript.
+$cloudEnginesAllowed = $task !== 'translate';
+if ($cloudEnginesAllowed && $azureKey !== '' && !$diarize && $file['size'] <= 1024 * 1024 && str_starts_with($detectedMime, 'audio/')) {
+    $result = transcribeViaAzureServer($file, $language, $azureKey, $azureRegion);
+    if ($result !== null) {
+        $engineUsed = 'azure';
+    } else {
+        error_log('STT: Azure Speech skipped or failed, trying Google Cloud');
+    }
 }

-$timeOffset = max(0.0, (float)($_POST['time_offset'] ?? 0));
-$t0 = microtime(true);
-
-// ── Route to engine ───────────────────────────────────────────────────────────
-
-if ($engine === 'openai') {
-    $apiKey = trim((string)($_POST['openai_key'] ?? ''));
-    if (!$apiKey || !str_starts_with($apiKey, 'sk-')) {
-        dbnToolsError('A valid OpenAI API key (sk-…) is required for the OpenAI engine.', 400, 'missing_openai_key');
+// 2. Google Cloud Speech v2 — long audio, diarization, everything Azure can't handle
+if ($result === null && $cloudEnginesAllowed) {
+    $gcpPath = dbnToolsAiPortalRoot() . '/lib/ai/GcpSpeechClient.php';
+    // The client file is optional: when it is absent this step is skipped silently.
+    if (is_file($gcpPath)) {
+        require_once $gcpPath;
+        $gcp = GcpSpeechClient::fromConfig();
+        if ($gcp) {
+            // 'auto' is passed on as an empty language string.
+            $gcpLang = ($language === 'auto') ? '' : $language;
+            // Last two arguments: the requested speaker count twice when the caller
+            // gave one (>1), otherwise 2 and 6 — presumably min/max speaker bounds;
+            // TODO confirm against GcpSpeechClient::transcribe().
+            $result  = $gcp->transcribe(
+                $file['tmp_name'], $detectedMime, $gcpLang,
+                $diarize,
+                $numSpeakers > 1 ? $numSpeakers : 2,
+                $numSpeakers > 1 ? max($numSpeakers, 2) : 6
+            );
+            if ($result !== null) {
+                $engineUsed = 'gcp';
+            } else {
+                error_log('STT: Google Cloud Speech failed, falling back to Whisper');
+            }
+        }
    }
-    $result = transcribeViaOpenAI($file, $language, $task, $apiKey);
+}

-} elseif ($engine === 'azure') {
-    $apiKey = trim((string)($_POST['azure_key'] ?? ''));
-    if ($apiKey === '') $apiKey = (string)(dbnToolsEnv('DBN_AZURE_SPEECH_KEY') ?? '');
-    $region = preg_replace('/[^a-z0-9]/', '', strtolower(trim((string)($_POST['azure_region'] ?? ''))));
-    if ($region === '') $region = preg_replace('/[^a-z0-9]/', '', strtolower((string)(dbnToolsEnv('DBN_AZURE_SPEECH_REGION') ?? 'norwayeast')));
-    if (!$apiKey) {
-        dbnToolsError('An Azure Speech API key is required for the Azure engine.', 400, 'missing_azure_key');
-    }
-    $result = transcribeViaAzure($file, $language, $apiKey, $region, $diarize);
-
-} else {
-    // GPU (default)
-    $result = transcribeViaWhisperGpu($file, $language, $diarize, $numSpeakers, $model, $beamSize, $task, $vadFilter, $initPrompt);
+// 3. Whisper GPU — local fallback
+if ($result === null) {
+    $result     = transcribeViaWhisperGpu($file, $language, $diarize, $numSpeakers, $gpuModel, $beamSize, $task, $vadFilter, $initPrompt);
+    $engineUsed = 'whisper-gpu';
 }

 $latencyMs = (int)round((microtime(true) - $t0) * 1000);
@@ -95,7 +114,7 @@ if ($timeOffset > 0.0 && !empty($result['segments'])) {
    unset($seg);
 }

-// ── Speaker role labelling (GPU + diarize only) ───────────────────────────────
+// ── Speaker role labelling (diarize + multiple speakers only) ─────────────────

 $segments    = $result['segments']    ?? [];
 $numDetected = (int)($result['num_speakers'] ?? 1);
@@ -110,12 +129,20 @@ if ($diarize && $numDetected > 1 && $segments) {
    $speakerRoles = dbnLabelSpeakerRoles($segments);
 }

+// ── Friendly engine label ─────────────────────────────────────────────────────
+
+$engineLabel = match($engineUsed) {
+    'azure'       => 'Microsoft Azure Speech',
+    'gcp'         => 'Google Cloud Speech',
+    default       => 'OpenAI Whisper ' . $gpuModel,
+};
+
 // ── Log + respond ─────────────────────────────────────────────────────────────

 dbnToolsLogMetadata([
    'tool'       => 'transcribe',
-    'engine'     => $engine,
-    'model'      => $model,
+    'engine'     => $engineUsed,
+    'model'      => $engineLabel,
    'language'   => $language,
    'ok'         => true,
    'latency_ms' => $latencyMs,
@@ -129,16 +156,98 @@ dbnToolsRespond([
    'speaker_roles' => $speakerRoles,
    'num_speakers'  => $numDetected,
    'language'      => (string)($result['language'] ?? $language),
-    'duration_sec'  => round((float)($result['duration_seconds'] ?? 0), 2),
+    'duration_sec'  => round((float)($result['duration_seconds'] ?? $result['duration'] ?? 0), 2),
    'processing_sec'=> round((float)($result['processing_seconds'] ?? 0), 2),
-    'model'         => (string)($result['model'] ?? ($engine === 'gpu' ? $model : $engine)),
-    'engine'        => $engine,
+    'model'         => $engineLabel,
+    'engine'        => $engineUsed,
    'latency_ms'    => $latencyMs,
 ]);


 // ── Engine implementations ────────────────────────────────────────────────────

+/**
+ * Microsoft Azure Speech — short clips (the caller gates on ≤1MB, no diarization).
+ *
+ * Posts the raw upload to the regional recognition endpoint and normalises the
+ * "detailed" JSON reply to the internal result shape
+ * (text, language, duration_seconds, processing_seconds, segments).
+ *
+ * @param array  $file     Upload entry; only 'name' and 'tmp_name' are read.
+ * @param string $language Short language code; unmapped values (including 'auto') fall back to nb-NO.
+ * @param string $apiKey   Azure Speech subscription key.
+ * @param string $region   Azure region, used verbatim as the endpoint host prefix.
+ *
+ * @return array|null Normalised result, or null on any failure (unreadable or
+ *                    empty file, transport error, non-200 reply, empty
+ *                    DisplayText) so the caller can cascade to the next engine.
+ */
+function transcribeViaAzureServer(array $file, string $language, string $apiKey, string $region): ?array
+{
+    $langCode = match($language) {
+        'no', 'nb' => 'nb-NO',
+        'nn'       => 'nn-NO',
+        'en'       => 'en-US',
+        'sv'       => 'sv-SE',
+        'da'       => 'da-DK',
+        'de'       => 'de-DE',
+        'fr'       => 'fr-FR',
+        'es'       => 'es-ES',
+        'pl'       => 'pl-PL',
+        'fi'       => 'fi-FI',
+        'nl'       => 'nl-NL',
+        'it'       => 'it-IT',
+        'pt'       => 'pt-PT',
+        default    => 'nb-NO',
+    };
+
+    // Content-Type is derived from the client-supplied file extension.
+    // NOTE(review): Azure's short-audio REST docs list only WAV/PCM and OGG/Opus
+    // as accepted bodies — confirm whether other types should skip this engine
+    // up front instead of costing a failed round-trip.
+    $mimeMap = [
+        'wav'  => 'audio/wav',  'mp3'  => 'audio/mpeg', 'ogg'  => 'audio/ogg',
+        'oga'  => 'audio/ogg',  'm4a'  => 'audio/mp4',  'mp4'  => 'audio/mp4',
+        'flac' => 'audio/flac', 'webm' => 'audio/webm', 'aac'  => 'audio/aac',
+    ];
+    $fileExt  = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
+    $mimeType = $mimeMap[$fileExt] ?? 'audio/wav';
+
+    $endpoint = "https://{$region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1"
+              . "?language={$langCode}&format=detailed";
+
+    // Warning deliberately suppressed: an unreadable upload only means "try the
+    // next engine". An empty file is rejected here too, rather than posted.
+    $fileContents = @file_get_contents($file['tmp_name']);
+    if ($fileContents === false || $fileContents === '') return null;
+
+    $ch = curl_init($endpoint);
+    curl_setopt_array($ch, [
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_POST           => true,
+        CURLOPT_POSTFIELDS     => $fileContents,
+        CURLOPT_HTTPHEADER     => [
+            "Ocp-Apim-Subscription-Key: {$apiKey}",
+            "Content-Type: {$mimeType}",
+            'Accept: application/json',
+        ],
+        CURLOPT_TIMEOUT => 60,
+    ]);
+    $responseBody = curl_exec($ch);
+    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+    $curlErr      = curl_error($ch);
+    curl_close($ch);
+
+    if ($responseBody === false || $httpCode !== 200) {
+        // A transport failure has no body, so the cURL message is the only clue.
+        $detail = $curlErr !== '' ? $curlErr : substr((string)$responseBody, 0, 200);
+        error_log("STT Azure HTTP {$httpCode}: " . $detail);
+        return null;
+    }
+
+    $data = json_decode($responseBody, true);
+    if (!is_array($data) || empty($data['DisplayText'])) return null;
+
+    // Offsets and durations are divided by this to get seconds (100 ns ticks).
+    $ticksPerSecond = 10_000_000;
+
+    $text = (string)($data['DisplayText'] ?? '');
+    $segs = [];
+    foreach (($data['NBest'][0]['Words'] ?? []) as $i => $word) {
+        $segs[] = [
+            'id'    => $i,
+            'start' => round((float)($word['Offset'] ?? 0) / $ticksPerSecond, 3),
+            'end'   => round(((float)($word['Offset'] ?? 0) + (float)($word['Duration'] ?? 0)) / $ticksPerSecond, 3),
+            'text'  => (string)($word['Word'] ?? ''),
+        ];
+    }
+
+    // Report where the recognised speech ends (top-level Offset + Duration)
+    // rather than a constant 0; stays 0 when the reply carries neither field.
+    $speechEndTicks = (float)($data['Offset'] ?? 0) + (float)($data['Duration'] ?? 0);
+
+    return [
+        'text'               => $text,
+        'language'           => strtolower(explode('-', $langCode)[0]),
+        'duration_seconds'   => round($speechEndTicks / $ticksPerSecond, 3),
+        'processing_seconds' => 0,
+        'segments'           => $segs,
+    ];
+}
+
+
 function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, int $numSpeakers,
                                  string $model, int $beamSize, string $task,
                                  bool $vadFilter, string $initPrompt): array
@@ -204,158 +313,6 @@ function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, i
 }


-function transcribeViaOpenAI(array $file, string $language, string $task, string $apiKey): array
-{
-    $boundary = '----DBN' . bin2hex(random_bytes(8));
-    $body     = "--{$boundary}\r\n";
-    $body    .= 'Content-Disposition: form-data; name="file"; filename="' . addslashes(basename($file['name'])) . '"' . "\r\n";
-    $body    .= "Content-Type: application/octet-stream\r\n\r\n";
-    $body    .= file_get_contents($file['tmp_name']) . "\r\n";
-    $body    .= "--{$boundary}\r\nContent-Disposition: form-data; name=\"model\"\r\n\r\nwhisper-1\r\n";
-    $body    .= "--{$boundary}\r\nContent-Disposition: form-data; name=\"response_format\"\r\n\r\nverbose_json\r\n";
-    if ($language !== 'auto') {
-        $body .= "--{$boundary}\r\nContent-Disposition: form-data; name=\"language\"\r\n\r\n{$language}\r\n";
-    }
-    if ($task === 'translate') {
-        $body .= "--{$boundary}\r\nContent-Disposition: form-data; name=\"task\"\r\n\r\ntranslation\r\n";
-    }
-    $body .= "--{$boundary}--\r\n";
-
-    $ch = curl_init('https://api.openai.com/v1/audio/transcriptions');
-    curl_setopt_array($ch, [
-        CURLOPT_RETURNTRANSFER => true,
-        CURLOPT_POST           => true,
-        CURLOPT_POSTFIELDS     => $body,
-        CURLOPT_HTTPHEADER     => [
-            "Authorization: Bearer {$apiKey}",
-            "Content-Type: multipart/form-data; boundary={$boundary}",
-            'Accept: application/json',
-        ],
-        CURLOPT_TIMEOUT => 300,
-    ]);
-    $responseBody = curl_exec($ch);
-    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
-    $curlErr      = curl_error($ch);
-    curl_close($ch);
-
-    if ($responseBody === false || $httpCode !== 200) {
-        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
-        dbnToolsError('OpenAI API error (HTTP ' . $httpCode . '): ' . $detail, 502, 'openai_error');
-    }
-
-    $data = json_decode($responseBody, true);
-    if (!is_array($data)) {
-        dbnToolsError('Invalid response from OpenAI.', 502, 'openai_empty');
-    }
-
-    // Normalise to internal shape
-    return [
-        'text'               => (string)($data['text'] ?? ''),
-        'language'           => (string)($data['language'] ?? $language),
-        'duration_seconds'   => (float)($data['duration'] ?? 0),
-        'processing_seconds' => 0,
-        'segments'           => array_map(fn($s) => [
-            'id'      => $s['id']    ?? 0,
-            'start'   => $s['start'] ?? 0,
-            'end'     => $s['end']   ?? 0,
-            'text'    => $s['text']  ?? '',
-            'speaker' => 'SPEAKER_00',
-        ], $data['segments'] ?? []),
-        'model' => 'openai/whisper-1',
-    ];
-}
-
-
-function transcribeViaAzure(array $file, string $language, string $apiKey,
-                             string $region, bool $diarize): array
-{
-    // Azure Batch Transcription — POST audio directly for short-form (<60 min)
-    // Uses the simple REST endpoint for synchronous short audio transcription.
-    $langCode = match($language) {
-        'no', 'nb' => 'nb-NO',
-        'nn'       => 'nn-NO',
-        'en'       => 'en-US',
-        'sv'       => 'sv-SE',
-        'da'       => 'da-DK',
-        'de'       => 'de-DE',
-        'fr'       => 'fr-FR',
-        'es'       => 'es-ES',
-        'pl'       => 'pl-PL',
-        'fi'       => 'fi-FI',
-        'nl'       => 'nl-NL',
-        'it'       => 'it-IT',
-        'pt'       => 'pt-PT',
-        default    => 'nb-NO',
-    };
-
-    // Mime type map
-    $mimeMap = [
-        'wav'  => 'audio/wav',  'mp3'  => 'audio/mpeg', 'ogg'  => 'audio/ogg',
-        'oga'  => 'audio/ogg',  'm4a'  => 'audio/mp4',  'mp4'  => 'audio/mp4',
-        'flac' => 'audio/flac', 'webm' => 'audio/webm', 'aac'  => 'audio/aac',
-    ];
-    $fileExt  = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
-    $mimeType = $mimeMap[$fileExt] ?? 'audio/wav';
-
-    $endpoint = "https://{$region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1"
-              . "?language={$langCode}&format=detailed";
-
-    $fileContents = file_get_contents($file['tmp_name']);
-    if ($fileContents === false) {
-        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
-    }
-
-    $ch = curl_init($endpoint);
-    curl_setopt_array($ch, [
-        CURLOPT_RETURNTRANSFER => true,
-        CURLOPT_POST           => true,
-        CURLOPT_POSTFIELDS     => $fileContents,
-        CURLOPT_HTTPHEADER     => [
-            "Ocp-Apim-Subscription-Key: {$apiKey}",
-            "Content-Type: {$mimeType}",
-            'Accept: application/json',
-        ],
-        CURLOPT_TIMEOUT => 300,
-    ]);
-    $responseBody = curl_exec($ch);
-    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
-    $curlErr      = curl_error($ch);
-    curl_close($ch);
-
-    if ($responseBody === false || $httpCode !== 200) {
-        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
-        dbnToolsError('Azure Speech error (HTTP ' . $httpCode . '): ' . $detail, 502, 'azure_error');
-    }
-
-    $data = json_decode($responseBody, true);
-    if (!is_array($data) || empty($data['DisplayText'])) {
-        dbnToolsError('Empty or invalid response from Azure Speech.', 502, 'azure_empty');
-    }
-
-    // Normalise to internal shape
-    $text = (string)($data['DisplayText'] ?? '');
-    $segs = [];
-    foreach (($data['NBest'][0]['Words'] ?? []) as $i => $word) {
-        $segs[] = [
-            'id'      => $i,
-            'start'   => round((float)($word['Offset'] ?? 0) / 10_000_000, 3),
-            'end'     => round(((float)($word['Offset'] ?? 0) + (float)($word['Duration'] ?? 0)) / 10_000_000, 3),
-            'text'    => (string)($word['Word'] ?? ''),
-            'speaker' => 'SPEAKER_00',
-        ];
-    }
-
-    return [
-        'text'               => $text,
-        'language'           => $langCode,
-        'duration_seconds'   => 0,
-        'processing_seconds' => 0,
-        'segments'           => $segs,
-        'model'              => "azure/{$langCode}",
-    ];
-}
-
-
 function dbnLabelSpeakerRoles(array $segments): array
 {
    $sample = array_slice(