feat: auto-select STT engine (Azure → Google Cloud → Whisper) and show provider in results
Removes the user-facing engine/model/key/beam controls. The server now picks the best available engine automatically:
1. Microsoft Azure Speech — short clips (≤1 MB, no diarization, audio/*)
2. Google Cloud Speech v2 — long audio, diarization, all languages
3. OpenAI Whisper GPU — local fallback
Results display which provider was used (e.g. "Transcribed with Google Cloud Speech") via transcript-engine-badge and traceMeta.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+143
-186
@@ -17,9 +17,8 @@ if (!in_array($language, $validLangs, true)) $language = 'auto';
|
||||
|
||||
$diarize = !empty($_POST['diarize']) && $_POST['diarize'] !== '0';
|
||||
$numSpeakers = isset($_POST['num_speakers']) ? max(0, min(20, (int)$_POST['num_speakers'])) : 0;
|
||||
$engine = in_array($_POST['engine'] ?? '', ['gpu', 'openai', 'azure'], true) ? $_POST['engine'] : 'gpu';
|
||||
$validModels = ['tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3'];
|
||||
$model = in_array($_POST['model'] ?? '', $validModels, true) ? $_POST['model'] : 'small';
|
||||
$gpuModel = in_array($_POST['model'] ?? '', $validModels, true) ? $_POST['model'] : 'large-v3';
|
||||
$beamSize = max(1, min(5, (int)($_POST['beam_size'] ?? 5)));
|
||||
$task = ($_POST['task'] ?? 'transcribe') === 'translate' ? 'translate' : 'transcribe';
|
||||
$vadFilter = !empty($_POST['vad_filter']) && $_POST['vad_filter'] !== '0';
|
||||
@@ -51,36 +50,56 @@ if (!in_array($ext, $allowedExts, true)) {
|
||||
dbnToolsError("Unsupported format: .{$ext}. Use MP3, WAV, OGG, M4A, FLAC, or WebM.", 415, 'unsupported_format');
|
||||
}
|
||||
|
||||
// OpenAI has a 25 MB file limit
|
||||
if ($engine === 'openai' && $file['size'] > 25 * 1024 * 1024) {
|
||||
dbnToolsError('OpenAI Whisper API has a 25 MB file limit. Use the GPU engine for larger files.', 413, 'openai_file_too_large');
|
||||
$detectedMime = mime_content_type($file['tmp_name']) ?: 'application/octet-stream';
|
||||
$timeOffset = max(0.0, (float)($_POST['time_offset'] ?? 0));
|
||||
$t0 = microtime(true);
|
||||
|
||||
// ── Auto-cascade: Azure → GCP → Whisper GPU ───────────────────────────────────
|
||||
|
||||
$result = null;
|
||||
$engineUsed = 'whisper-gpu';
|
||||
|
||||
// 1. Microsoft Azure Speech — fast path for short, non-diarize audio clips
|
||||
$azureKey = (string)(dbnToolsEnv('DBN_AZURE_SPEECH_KEY') ?? '');
|
||||
$azureRegion = preg_replace('/[^a-z0-9]/', '', strtolower(
|
||||
(string)(dbnToolsEnv('DBN_AZURE_SPEECH_REGION') ?? 'norwayeast')
|
||||
));
|
||||
if ($azureKey !== '' && !$diarize && $file['size'] <= 1024 * 1024 && str_starts_with($detectedMime, 'audio/')) {
|
||||
$result = transcribeViaAzureServer($file, $language, $azureKey, $azureRegion);
|
||||
if ($result !== null) {
|
||||
$engineUsed = 'azure';
|
||||
} else {
|
||||
error_log('STT: Azure Speech skipped or failed, trying Google Cloud');
|
||||
}
|
||||
}
|
||||
|
||||
$timeOffset = max(0.0, (float)($_POST['time_offset'] ?? 0));
|
||||
$t0 = microtime(true);
|
||||
|
||||
// ── Route to engine ───────────────────────────────────────────────────────────
|
||||
|
||||
if ($engine === 'openai') {
|
||||
$apiKey = trim((string)($_POST['openai_key'] ?? ''));
|
||||
if (!$apiKey || !str_starts_with($apiKey, 'sk-')) {
|
||||
dbnToolsError('A valid OpenAI API key (sk-…) is required for the OpenAI engine.', 400, 'missing_openai_key');
|
||||
// 2. Google Cloud Speech v2 — long audio, diarization, everything Azure can't handle
|
||||
if ($result === null) {
|
||||
$gcpPath = dbnToolsAiPortalRoot() . '/lib/ai/GcpSpeechClient.php';
|
||||
if (is_file($gcpPath)) {
|
||||
require_once $gcpPath;
|
||||
$gcp = GcpSpeechClient::fromConfig();
|
||||
if ($gcp) {
|
||||
$gcpLang = ($language === 'auto') ? '' : $language;
|
||||
$result = $gcp->transcribe(
|
||||
$file['tmp_name'], $detectedMime, $gcpLang,
|
||||
$diarize,
|
||||
$numSpeakers > 1 ? $numSpeakers : 2,
|
||||
$numSpeakers > 1 ? max($numSpeakers, 2) : 6
|
||||
);
|
||||
if ($result !== null) {
|
||||
$engineUsed = 'gcp';
|
||||
} else {
|
||||
error_log('STT: Google Cloud Speech failed, falling back to Whisper');
|
||||
}
|
||||
}
|
||||
}
|
||||
$result = transcribeViaOpenAI($file, $language, $task, $apiKey);
|
||||
}
|
||||
|
||||
} elseif ($engine === 'azure') {
|
||||
$apiKey = trim((string)($_POST['azure_key'] ?? ''));
|
||||
if ($apiKey === '') $apiKey = (string)(dbnToolsEnv('DBN_AZURE_SPEECH_KEY') ?? '');
|
||||
$region = preg_replace('/[^a-z0-9]/', '', strtolower(trim((string)($_POST['azure_region'] ?? ''))));
|
||||
if ($region === '') $region = preg_replace('/[^a-z0-9]/', '', strtolower((string)(dbnToolsEnv('DBN_AZURE_SPEECH_REGION') ?? 'norwayeast')));
|
||||
if (!$apiKey) {
|
||||
dbnToolsError('An Azure Speech API key is required for the Azure engine.', 400, 'missing_azure_key');
|
||||
}
|
||||
$result = transcribeViaAzure($file, $language, $apiKey, $region, $diarize);
|
||||
|
||||
} else {
|
||||
// GPU (default)
|
||||
$result = transcribeViaWhisperGpu($file, $language, $diarize, $numSpeakers, $model, $beamSize, $task, $vadFilter, $initPrompt);
|
||||
// 3. Whisper GPU — local fallback
|
||||
if ($result === null) {
|
||||
$result = transcribeViaWhisperGpu($file, $language, $diarize, $numSpeakers, $gpuModel, $beamSize, $task, $vadFilter, $initPrompt);
|
||||
$engineUsed = 'whisper-gpu';
|
||||
}
|
||||
|
||||
$latencyMs = (int)round((microtime(true) - $t0) * 1000);
|
||||
@@ -95,7 +114,7 @@ if ($timeOffset > 0.0 && !empty($result['segments'])) {
|
||||
unset($seg);
|
||||
}
|
||||
|
||||
// ── Speaker role labelling (GPU + diarize only) ───────────────────────────────
|
||||
// ── Speaker role labelling (diarize + multiple speakers only) ─────────────────
|
||||
|
||||
$segments = $result['segments'] ?? [];
|
||||
$numDetected = (int)($result['num_speakers'] ?? 1);
|
||||
@@ -110,12 +129,20 @@ if ($diarize && $numDetected > 1 && $segments) {
|
||||
$speakerRoles = dbnLabelSpeakerRoles($segments);
|
||||
}
|
||||
|
||||
// ── Friendly engine label ─────────────────────────────────────────────────────
|
||||
|
||||
$engineLabel = match($engineUsed) {
|
||||
'azure' => 'Microsoft Azure Speech',
|
||||
'gcp' => 'Google Cloud Speech',
|
||||
default => 'OpenAI Whisper ' . $gpuModel,
|
||||
};
|
||||
|
||||
// ── Log + respond ─────────────────────────────────────────────────────────────
|
||||
|
||||
dbnToolsLogMetadata([
|
||||
'tool' => 'transcribe',
|
||||
'engine' => $engine,
|
||||
'model' => $model,
|
||||
'engine' => $engineUsed,
|
||||
'model' => $engineLabel,
|
||||
'language' => $language,
|
||||
'ok' => true,
|
||||
'latency_ms' => $latencyMs,
|
||||
@@ -129,16 +156,98 @@ dbnToolsRespond([
|
||||
'speaker_roles' => $speakerRoles,
|
||||
'num_speakers' => $numDetected,
|
||||
'language' => (string)($result['language'] ?? $language),
|
||||
'duration_sec' => round((float)($result['duration_seconds'] ?? 0), 2),
|
||||
'duration_sec' => round((float)($result['duration_seconds'] ?? $result['duration'] ?? 0), 2),
|
||||
'processing_sec'=> round((float)($result['processing_seconds'] ?? 0), 2),
|
||||
'model' => (string)($result['model'] ?? ($engine === 'gpu' ? $model : $engine)),
|
||||
'engine' => $engine,
|
||||
'model' => $engineLabel,
|
||||
'engine' => $engineUsed,
|
||||
'latency_ms' => $latencyMs,
|
||||
]);
|
||||
|
||||
|
||||
// ── Engine implementations ────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Microsoft Azure Speech — short clips (≤1MB, no diarization).
 *
 * Posts the raw upload bytes to the regional Azure Speech recognition endpoint
 * and normalises the JSON reply to the internal transcript shape.
 *
 * Returns null on any failure (unreadable file, cURL/HTTP error, unparseable or
 * empty reply) so the caller can cascade to the next engine. Nothing here
 * aborts the request.
 *
 * @param array  $file     Upload entry; only 'tmp_name' and 'name' are read.
 * @param string $language Short language code. Anything not in the map below
 *                         (including 'auto') is sent to Azure as nb-NO.
 * @param string $apiKey   Azure Speech subscription key.
 * @param string $region   Azure region, used verbatim as the endpoint host prefix.
 *
 * @return array{text: string, language: string, duration_seconds: int, processing_seconds: int, segments: list<array{id: int, start: float, end: float, text: string}>}|null
 */
function transcribeViaAzureServer(array $file, string $language, string $apiKey, string $region): ?array
{
    $langCode = match ($language) {
        'no', 'nb' => 'nb-NO',
        'nn'       => 'nn-NO',
        'en'       => 'en-US',
        'sv'       => 'sv-SE',
        'da'       => 'da-DK',
        'de'       => 'de-DE',
        'fr'       => 'fr-FR',
        'es'       => 'es-ES',
        'pl'       => 'pl-PL',
        'fi'       => 'fi-FI',
        'nl'       => 'nl-NL',
        'it'       => 'it-IT',
        'pt'       => 'pt-PT',
        default    => 'nb-NO',
    };

    // The Content-Type sent to Azure is derived from the client-supplied file
    // extension; unknown extensions are declared as WAV.
    $mimeMap = [
        'wav'  => 'audio/wav',  'mp3'  => 'audio/mpeg', 'ogg' => 'audio/ogg',
        'oga'  => 'audio/ogg',  'm4a'  => 'audio/mp4',  'mp4' => 'audio/mp4',
        'flac' => 'audio/flac', 'webm' => 'audio/webm', 'aac' => 'audio/aac',
    ];
    $fileExt  = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
    $mimeType = $mimeMap[$fileExt] ?? 'audio/wav';

    $endpoint = "https://{$region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1"
              . "?language={$langCode}&format=detailed";

    // Guard explicitly instead of silencing file_get_contents() with '@'.
    $tmpPath = (string)($file['tmp_name'] ?? '');
    if ($tmpPath === '' || !is_readable($tmpPath)) return null;

    $fileContents = file_get_contents($tmpPath);
    if ($fileContents === false) return null;

    $ch = curl_init($endpoint);
    if ($ch === false) {
        error_log('STT Azure: could not initialise cURL');
        return null;
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $fileContents,
        CURLOPT_HTTPHEADER     => [
            "Ocp-Apim-Subscription-Key: {$apiKey}",
            "Content-Type: {$mimeType}",
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 60,
    ]);
    $responseBody = curl_exec($ch);
    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr      = curl_error($ch);
    curl_close($ch);

    if ($responseBody === false || $httpCode !== 200) {
        // On transport failures (timeout, DNS, TLS) the body is false, so log
        // the cURL error text rather than an empty string.
        $detail = $curlErr !== '' ? $curlErr : substr((string)$responseBody, 0, 200);
        error_log("STT Azure HTTP {$httpCode}: " . $detail);
        return null;
    }

    $data = json_decode($responseBody, true);
    if (!is_array($data)) return null;

    $best = is_array($data['NBest'][0] ?? null) ? $data['NBest'][0] : [];

    // The request asks for format=detailed. Per Azure's REST documentation the
    // detailed format carries the display text as NBest[].Display, so accept
    // that as well as a top-level DisplayText.
    $text = (string)($data['DisplayText'] ?? '');
    if ($text === '') {
        $text = (string)($best['Display'] ?? '');
    }
    if ($text === '') return null;

    // Word offsets/durations are divided by 10,000,000 to convert Azure's
    // 100-nanosecond ticks into seconds. 'Words' may be absent, in which case
    // the segment list stays empty.
    $segs = [];
    foreach (($best['Words'] ?? []) as $i => $word) {
        $offset   = (float)($word['Offset'] ?? 0);
        $duration = (float)($word['Duration'] ?? 0);
        $segs[] = [
            'id'    => $i,
            'start' => round($offset / 10_000_000, 3),
            'end'   => round(($offset + $duration) / 10_000_000, 3),
            'text'  => (string)($word['Word'] ?? ''),
        ];
    }

    return [
        'text'               => $text,
        // Only the primary subtag ("nb" from "nb-NO") is reported back.
        'language'           => strtolower(explode('-', $langCode)[0]),
        'duration_seconds'   => 0,
        'processing_seconds' => 0,
        'segments'           => $segs,
    ];
}
|
||||
|
||||
|
||||
function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, int $numSpeakers,
|
||||
string $model, int $beamSize, string $task,
|
||||
bool $vadFilter, string $initPrompt): array
|
||||
@@ -204,158 +313,6 @@ function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, i
|
||||
}
|
||||
|
||||
|
||||
/**
 * Send an upload to the hosted OpenAI Whisper API and normalise the reply to
 * the internal transcript shape.
 *
 * Transcription and translation are separate OpenAI endpoints, so
 * $task === 'translate' posts to /v1/audio/translations and any other task
 * posts to /v1/audio/transcriptions. The language hint is only sent when
 * transcribing.
 *
 * Failures are reported through dbnToolsError(), which is defined elsewhere —
 * presumably it ends the request; the code after each call assumes it does not
 * return (TODO confirm).
 *
 * @param array  $file     Upload entry; only 'tmp_name' and 'name' are read.
 * @param string $language Language code, or 'auto' to let the API detect it.
 * @param string $task     'translate' selects translation; anything else transcribes.
 * @param string $apiKey   OpenAI API key, sent as a Bearer token.
 *
 * @return array{text: string, language: string, duration_seconds: float, processing_seconds: int, segments: list<array<string, mixed>>, model: string}
 */
function transcribeViaOpenAI(array $file, string $language, string $task, string $apiKey): array
{
    $tmpPath = (string)($file['tmp_name'] ?? '');
    if ($tmpPath === '' || !is_readable($tmpPath)) {
        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
    }

    $translate = ($task === 'translate');
    $endpoint  = $translate
        ? 'https://api.openai.com/v1/audio/translations'
        : 'https://api.openai.com/v1/audio/transcriptions';

    // Passing an array with a CURLFile lets cURL build the multipart body
    // (boundary, quoting of the filename) and stream the file from disk, so no
    // Content-Type header is set by hand.
    $fields = [
        'file'            => new CURLFile($tmpPath, 'application/octet-stream', basename($file['name'])),
        'model'           => 'whisper-1',
        'response_format' => 'verbose_json',
    ];
    if (!$translate && $language !== 'auto') {
        $fields['language'] = $language;
    }

    $ch = curl_init($endpoint);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $fields,
        CURLOPT_HTTPHEADER     => [
            "Authorization: Bearer {$apiKey}",
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 300,
    ]);
    $responseBody = curl_exec($ch);
    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr      = curl_error($ch);
    curl_close($ch);

    if ($responseBody === false || $httpCode !== 200) {
        // Prefer the transport error; otherwise show a tag-stripped excerpt of the body.
        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
        dbnToolsError('OpenAI API error (HTTP ' . $httpCode . '): ' . $detail, 502, 'openai_error');
    }

    $data = json_decode($responseBody, true);
    if (!is_array($data)) {
        dbnToolsError('Invalid response from OpenAI.', 502, 'openai_empty');
    }

    // Tolerate a missing or malformed 'segments' entry instead of letting
    // array_map() throw on a non-array.
    $rawSegments = is_array($data['segments'] ?? null) ? $data['segments'] : [];

    // Normalise to internal shape. This API path carries no speaker
    // information, so every segment is attributed to a single fixed speaker.
    return [
        'text'               => (string)($data['text'] ?? ''),
        'language'           => (string)($data['language'] ?? $language),
        'duration_seconds'   => (float)($data['duration'] ?? 0),
        'processing_seconds' => 0,
        'segments'           => array_map(static fn ($s): array => [
            'id'      => $s['id'] ?? 0,
            'start'   => $s['start'] ?? 0,
            'end'     => $s['end'] ?? 0,
            'text'    => $s['text'] ?? '',
            'speaker' => 'SPEAKER_00',
        ], $rawSegments),
        'model'              => 'openai/whisper-1',
    ];
}
|
||||
|
||||
|
||||
/**
 * Transcribe an upload with Microsoft Azure Speech via the synchronous REST
 * recognition endpoint (not the Batch Transcription API).
 *
 * The raw file bytes are POSTed in a single request. NOTE(review): Azure
 * documents this endpoint as intended for short audio (about 60 seconds per
 * request) — confirm callers do not route long recordings here.
 *
 * Failures are reported through dbnToolsError(), which is defined elsewhere —
 * presumably it ends the request; the code after each call assumes it does not
 * return (TODO confirm).
 *
 * @param array  $file     Upload entry; only 'tmp_name' and 'name' are read.
 * @param string $language Short language code. Anything not in the map below
 *                         (including 'auto') is sent to Azure as nb-NO.
 * @param string $apiKey   Azure Speech subscription key.
 * @param string $region   Azure region, used verbatim as the endpoint host prefix.
 * @param bool   $diarize  Accepted for signature compatibility; not used by
 *                         this function — every segment is labelled SPEAKER_00.
 *
 * @return array{text: string, language: string, duration_seconds: int, processing_seconds: int, segments: list<array{id: int, start: float, end: float, text: string, speaker: string}>, model: string}
 */
function transcribeViaAzure(array $file, string $language, string $apiKey,
                            string $region, bool $diarize): array
{
    $langCode = match ($language) {
        'no', 'nb' => 'nb-NO',
        'nn'       => 'nn-NO',
        'en'       => 'en-US',
        'sv'       => 'sv-SE',
        'da'       => 'da-DK',
        'de'       => 'de-DE',
        'fr'       => 'fr-FR',
        'es'       => 'es-ES',
        'pl'       => 'pl-PL',
        'fi'       => 'fi-FI',
        'nl'       => 'nl-NL',
        'it'       => 'it-IT',
        'pt'       => 'pt-PT',
        default    => 'nb-NO',
    };

    // Content-Type is derived from the client-supplied file extension;
    // unknown extensions are declared as WAV.
    $mimeMap = [
        'wav'  => 'audio/wav',  'mp3'  => 'audio/mpeg', 'ogg' => 'audio/ogg',
        'oga'  => 'audio/ogg',  'm4a'  => 'audio/mp4',  'mp4' => 'audio/mp4',
        'flac' => 'audio/flac', 'webm' => 'audio/webm', 'aac' => 'audio/aac',
    ];
    $fileExt  = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
    $mimeType = $mimeMap[$fileExt] ?? 'audio/wav';

    $endpoint = "https://{$region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1"
              . "?language={$langCode}&format=detailed";

    $fileContents = file_get_contents($file['tmp_name']);
    if ($fileContents === false) {
        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
    }

    $ch = curl_init($endpoint);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $fileContents,
        CURLOPT_HTTPHEADER     => [
            "Ocp-Apim-Subscription-Key: {$apiKey}",
            "Content-Type: {$mimeType}",
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 300,
    ]);
    $responseBody = curl_exec($ch);
    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr      = curl_error($ch);
    curl_close($ch);

    if ($responseBody === false || $httpCode !== 200) {
        // Prefer the transport error; otherwise show a tag-stripped excerpt of the body.
        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
        dbnToolsError('Azure Speech error (HTTP ' . $httpCode . '): ' . $detail, 502, 'azure_error');
    }

    $data = json_decode($responseBody, true);
    $best = (is_array($data) && is_array($data['NBest'][0] ?? null)) ? $data['NBest'][0] : [];

    // The request asks for format=detailed. Per Azure's REST documentation the
    // detailed format carries the display text as NBest[].Display, so accept
    // that as well as a top-level DisplayText before declaring the reply empty.
    $text = is_array($data) ? (string)($data['DisplayText'] ?? '') : '';
    if ($text === '') {
        $text = (string)($best['Display'] ?? '');
    }
    if ($text === '') {
        dbnToolsError('Empty or invalid response from Azure Speech.', 502, 'azure_empty');
    }

    // Normalise to internal shape. Offsets/durations are divided by 10,000,000
    // to convert Azure's 100-nanosecond ticks into seconds.
    $segs = [];
    foreach (($best['Words'] ?? []) as $i => $word) {
        $offset   = (float)($word['Offset'] ?? 0);
        $duration = (float)($word['Duration'] ?? 0);
        $segs[] = [
            'id'      => $i,
            'start'   => round($offset / 10_000_000, 3),
            'end'     => round(($offset + $duration) / 10_000_000, 3),
            'text'    => (string)($word['Word'] ?? ''),
            'speaker' => 'SPEAKER_00',
        ];
    }

    return [
        'text'               => $text,
        // The full locale sent to Azure (e.g. "nb-NO") is reported back.
        'language'           => $langCode,
        'duration_seconds'   => 0,
        'processing_seconds' => 0,
        'segments'           => $segs,
        'model'              => "azure/{$langCode}",
    ];
}
|
||||
|
||||
|
||||
function dbnLabelSpeakerRoles(array $segments): array
|
||||
{
|
||||
$sample = array_slice(
|
||||
|
||||
Reference in New Issue
Block a user