= 0) {
    header('X-Credits-Remaining: ' . $ftRemaining);
}

// Remove the execution time limit and keep running if the client disconnects,
// so a slow transcription is not cut short.
set_time_limit(0);
ignore_user_abort(true);

// ── Common params ─────────────────────────────────────────────────────────────
// Every request parameter is clamped or whitelisted here; anything unexpected
// falls back to a safe default instead of raising an error.
$validLangs = ['auto', 'no', 'nn', 'en', 'sv', 'da', 'de', 'fr', 'es', 'pl', 'uk', 'fi', 'nl', 'it', 'pt', 'ru', 'ar', 'tr', 'zh', 'ja', 'ko'];
$language   = strtolower(trim((string)($_POST['language'] ?? 'auto')));
if (!in_array($language, $validLangs, true)) {
    $language = 'auto';
}

$diarize     = !empty($_POST['diarize']) && $_POST['diarize'] !== '0';
$numSpeakers = isset($_POST['num_speakers']) ? max(0, min(20, (int)$_POST['num_speakers'])) : 0;

$validModels = ['tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3'];
$gpuModel    = in_array($_POST['model'] ?? '', $validModels, true) ? $_POST['model'] : 'large-v3';
$beamSize    = max(1, min(5, (int)($_POST['beam_size'] ?? 5)));
$task        = ($_POST['task'] ?? 'transcribe') === 'translate' ? 'translate' : 'transcribe';
$vadFilter   = !empty($_POST['vad_filter']) && $_POST['vad_filter'] !== '0';

// Cap the prompt at 500 bytes. A plain substr() may cut a multi-byte UTF-8
// character in half and leave invalid UTF-8 behind; mb_strcut() keeps the same
// byte limit but only cuts on character boundaries. substr() remains the
// fallback when the mbstring extension is not loaded.
$initPrompt = trim((string)($_POST['initial_prompt'] ?? ''));
$initPrompt = function_exists('mb_strcut')
    ? mb_strcut($initPrompt, 0, 500, 'UTF-8')
    : substr($initPrompt, 0, 500);

$allowedPostModels = ['', 'gpt-4o-mini', 'gpt-4o'];
$postModel         = in_array($_POST['post_model'] ?? '', $allowedPostModels, true)
    ? (string)($_POST['post_model'] ?? '')
    : '';

// ── Validate upload (or load from stored audio corpus) ────────────────────────
$storedAudioTmp = null;
if (empty($_FILES['audio']) || $_FILES['audio']['error'] !== UPLOAD_ERR_OK) {
    // Check if the user picked a previously saved audio document
    $audioDocId = (int)($_POST['audio_doc_id'] ?? 0);
    if ($audioDocId > 0) {
        $clientId = dbnToolsClientIdFromSession();
        if ($clientId <= 0) {
            dbnToolsError('No audio file received and no valid session for stored audio.', 400, 'upload_error');
        }

        // The lookup is scoped to the session's client id, so a document id
        // belonging to another client does not match.
        $db  = dbnToolsDb();
        $row = $db->prepare(
            'SELECT audio_storage_path, title
               FROM client_documents
              WHERE id = ? AND client_id = ? AND source_type = ? AND status = ?
              LIMIT 1'
        );
        $row->execute([$audioDocId, $clientId, 'audio', 'ready']);
        $audioRow = $row->fetch(PDO::FETCH_ASSOC);

        if (
            !$audioRow
            || empty($audioRow['audio_storage_path'])
            || !is_readable((string)$audioRow['audio_storage_path'])
        ) {
            dbnToolsError('Stored audio file not found or not readable.', 404, 'audio_not_found');
        }

        // Synthesise a $_FILES-compatible entry pointing at the stored file
        $storedAudioTmp  = $audioRow['audio_storage_path'];
        $_FILES['audio'] = [
            'name'     => basename($storedAudioTmp),
            'tmp_name' => $storedAudioTmp,
            'error'    => UPLOAD_ERR_OK,
            'size'     => (int)filesize($storedAudioTmp),
            'type'     => mime_content_type($storedAudioTmp) ?: 'application/octet-stream',
        ];
    } else {
        $code = $_FILES['audio']['error'] ?? -1;
        $map  = [
            UPLOAD_ERR_INI_SIZE  => 'File exceeds server upload limit.',
            UPLOAD_ERR_FORM_SIZE => 'File exceeds form size limit.',
            UPLOAD_ERR_PARTIAL   => 'File was only partially uploaded.',
            UPLOAD_ERR_NO_FILE   => 'No audio file received.',
        ];
        dbnToolsError($map[$code] ?? "Upload error (code {$code}).", 400, 'upload_error');
    }
}

$file = $_FILES['audio'];

$maxBytes = 200 * 1024 * 1024;
if ($file['size'] > $maxBytes) {
    dbnToolsError('File too large. Maximum 200 MB.', 413, 'file_too_large');
}

$allowedExts = ['mp3', 'wav', 'ogg', 'oga', 'm4a', 'mp4', 'flac', 'webm', 'aac'];
$ext         = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
if (!in_array($ext, $allowedExts, true)) {
    dbnToolsError("Unsupported format: .{$ext}. Use MP3, WAV, OGG, M4A, FLAC, or WebM.", 415, 'unsupported_format');
}

$detectedMime = mime_content_type($file['tmp_name']) ?: 'application/octet-stream';
$timeOffset   = max(0.0, (float)($_POST['time_offset'] ?? 0));
$t0           = microtime(true);

// ── Auto-cascade: Azure → GCP → Whisper GPU ───────────────────────────────────
$result     = null;
$engineUsed = 'whisper-gpu';

// Neither cloud call below is given $task, so a "translate" request served by
// Azure or Google Cloud would come back untranslated. Send those requests
// straight to Whisper, the only engine that receives the task.
$cloudEnginesAllowed = ($task !== 'translate');

// 1. Microsoft Azure Speech — fast path for short, non-diarize audio clips
$azureKey    = (string)(dbnToolsEnv('DBN_AZURE_SPEECH_KEY') ?? '');
$azureRegion = preg_replace('/[^a-z0-9]/', '', strtolower(
    (string)(dbnToolsEnv('DBN_AZURE_SPEECH_REGION') ?? 'norwayeast')
));

if (
    $cloudEnginesAllowed
    && $azureKey !== ''
    && !$diarize
    && $file['size'] <= 1024 * 1024
    && str_starts_with($detectedMime, 'audio/')
) {
    $result = transcribeViaAzureServer($file, $language, $azureKey, $azureRegion);
    if ($result !== null) {
        $engineUsed = 'azure';
    } else {
        error_log('STT: Azure Speech skipped or failed, trying Google Cloud');
    }
}

// 2. Google Cloud Speech v2 — long audio, diarization, everything Azure can't handle
if ($result === null && $cloudEnginesAllowed) {
    require_once __DIR__ . '/../includes/GcpSpeechClient.php';
    $gcp = GcpSpeechClient::fromConfig();
    if ($gcp) {
        $gcpLang = ($language === 'auto') ? '' : $language;
        // With an explicit speaker count, use it for both speaker arguments;
        // otherwise pass 2 and 6.
        $result = $gcp->transcribe(
            $file['tmp_name'],
            $detectedMime,
            $gcpLang,
            $diarize,
            $numSpeakers > 1 ? $numSpeakers : 2,
            $numSpeakers > 1 ? $numSpeakers : 6
        );
        if ($result !== null) {
            $engineUsed = 'gcp';
        } else {
            error_log('STT: Google Cloud Speech failed, falling back to Whisper');
        }
    }
}

// 3. Whisper GPU — local fallback
if ($result === null) {
    $result     = transcribeViaWhisperGpu($file, $language, $diarize, $numSpeakers, $gpuModel, $beamSize, $task, $vadFilter, $initPrompt);
    $engineUsed = 'whisper-gpu';
}

$latencyMs = (int)round((microtime(true) - $t0) * 1000);

// ── Shift segment timestamps for multi-clip sessions ─────────────────────────
if ($timeOffset > 0.0 && !empty($result['segments'])) {
    foreach ($result['segments'] as &$seg) {
        $seg['start'] = round(($seg['start'] ?? 0) + $timeOffset, 3);
        $seg['end']   = round(($seg['end'] ?? 0) + $timeOffset, 3);
    }
    unset($seg); // drop the dangling reference left by the by-reference loop
}

// ── Optional GPT cleanup pass ─────────────────────────────────────────────────
// Only the flat transcript text is rewritten; the segments keep the raw text.
$cleanedBy = null;
if ($postModel !== '' && !empty($result['text'])) {
    $cleaned = dbnCleanupTranscript($result['text'], $language, $initPrompt, $postModel);
    if ($cleaned !== null) {
        $result['text'] = $cleaned;
        $cleanedBy      = $postModel;
    }
}

// ── Speaker role labelling (diarize + multiple speakers only) ─────────────────
$segments    = $result['segments'] ?? [];
$numDetected = (int)($result['num_speakers'] ?? 1);
if ($numDetected < 2 && $segments) {
    // Count distinct speaker labels. The filter is explicit because a bare
    // array_filter() would also discard a legitimate speaker id of 0 / "0".
    $uniqueSpeakers = array_filter(
        array_unique(array_column($segments, 'speaker')),
        static fn ($speaker): bool => $speaker !== null && $speaker !== ''
    );
    if (count($uniqueSpeakers) > 1) {
        $numDetected = count($uniqueSpeakers);
    }
}

$speakerRoles = null;
if ($diarize && $numDetected > 1 && $segments) {
    $labelDeployment = $postModel ?: 'gpt-4o-mini';
    $speakerRoles    = dbnLabelSpeakerRoles($segments, $labelDeployment);
}

// ── Friendly engine label ─────────────────────────────────────────────────────
$engineLabel = match ($engineUsed) {
    'azure' => 'Microsoft Azure Speech',
    'gcp'   => 'Google Cloud Speech',
    default => 'OpenAI Whisper ' . $gpuModel,
};

// ── Log + respond ─────────────────────────────────────────────────────────────
dbnToolsLogMetadata([
    'tool'       => 'transcribe',
    'engine'     => $engineUsed,
    'model'      => $engineLabel,
    'language'   => $language,
    'ok'         => true,
    'latency_ms' => $latencyMs,
]);

dbnToolsRespond([
    'ok'            => true,
    'tool'          => 'transcribe',
    'transcript'    => (string)($result['text'] ?? ''),
    'segments'      => $segments,
    'speaker_roles' => $speakerRoles,
    'num_speakers'  => $numDetected,
    'language'      => (string)($result['language'] ?? $language),
    'duration_sec'  => round((float)($result['duration_seconds'] ?? $result['duration'] ?? 0), 2),
    'processing_sec'=> round((float)($result['processing_seconds'] ??
0), 2),
    'model'         => $engineLabel,
    'engine'        => $engineUsed,
    'latency_ms'    => $latencyMs,
    'cleaned_by'    => $cleanedBy,
]);

// ── Engine implementations ────────────────────────────────────────────────────

/**
 * Microsoft Azure Speech — short clips (≤1MB, no diarization).
 * Returns null on any failure so the caller can cascade to the next engine.
 *
 * An explicitly requested language that has no Azure locale mapping here also
 * yields null, rather than being recognised as Norwegian.
 *
 * @param array  $file     $_FILES-style entry; 'name' and 'tmp_name' are read.
 * @param string $language Requested language code, 'auto' or '' for the default locale.
 * @param string $apiKey   Value sent in the Ocp-Apim-Subscription-Key header.
 * @param string $region   Host prefix of the endpoint URL.
 * @return array|null Keys: text, language, duration_seconds, processing_seconds,
 *                    segments (one entry per recognised word, times in seconds).
 */
function transcribeViaAzureServer(array $file, string $language, string $apiKey, string $region): ?array
{
    $langCode = match ($language) {
        'no', 'nb', 'auto', '' => 'nb-NO', // default locale when nothing specific is requested
        'nn' => 'nn-NO',
        'en' => 'en-US',
        'sv' => 'sv-SE',
        'da' => 'da-DK',
        'de' => 'de-DE',
        'fr' => 'fr-FR',
        'es' => 'es-ES',
        'pl' => 'pl-PL',
        'fi' => 'fi-FI',
        'nl' => 'nl-NL',
        'it' => 'it-IT',
        'pt' => 'pt-PT',
        default => null,
    };
    if ($langCode === null) {
        // No locale for this language: let the caller try the next engine.
        return null;
    }

    $mimeMap = [
        'wav'  => 'audio/wav',
        'mp3'  => 'audio/mpeg',
        'ogg'  => 'audio/ogg',
        'oga'  => 'audio/ogg',
        'm4a'  => 'audio/mp4',
        'mp4'  => 'audio/mp4',
        'flac' => 'audio/flac',
        'webm' => 'audio/webm',
        'aac'  => 'audio/aac',
    ];
    $fileExt  = strtolower(pathinfo((string)$file['name'], PATHINFO_EXTENSION));
    $mimeType = $mimeMap[$fileExt] ?? 'audio/wav';

    $endpoint = "https://{$region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1"
        . "?language={$langCode}&format=detailed";

    $audioPath = (string)$file['tmp_name'];
    if (!is_file($audioPath) || !is_readable($audioPath)) {
        return null;
    }
    $fileContents = file_get_contents($audioPath);
    if ($fileContents === false) {
        return null;
    }

    $ch = curl_init($endpoint);
    if ($ch === false) {
        return null;
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $fileContents,
        CURLOPT_HTTPHEADER     => [
            "Ocp-Apim-Subscription-Key: {$apiKey}",
            "Content-Type: {$mimeType}",
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 60,
    ]);
    $responseBody = curl_exec($ch);
    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr      = curl_error($ch);
    curl_close($ch);

    if ($responseBody === false || $httpCode !== 200) {
        // On a transport failure there is no body, so log the cURL error instead.
        $detail = $responseBody === false ? $curlErr : substr((string)$responseBody, 0, 200);
        error_log("STT Azure HTTP {$httpCode}: " . $detail);
        return null;
    }

    $data = json_decode((string)$responseBody, true);
    if (!is_array($data)) {
        return null;
    }

    // Prefer the top-level DisplayText; fall back to the best hypothesis'
    // Display field, which the detailed format carries per NBest entry.
    $text = (string)($data['DisplayText'] ?? $data['NBest'][0]['Display'] ?? '');
    if ($text === '') {
        return null;
    }

    // Offsets and durations are divided by 10,000,000 to convert them to seconds.
    $ticksPerSecond = 10_000_000;
    $segs           = [];
    foreach (($data['NBest'][0]['Words'] ?? []) as $i => $word) {
        $offset   = (float)($word['Offset'] ?? 0);
        $duration = (float)($word['Duration'] ?? 0);
        $segs[]   = [
            'id'    => $i,
            'start' => round($offset / $ticksPerSecond, 3),
            'end'   => round(($offset + $duration) / $ticksPerSecond, 3),
            'text'  => (string)($word['Word'] ?? ''),
        ];
    }

    return [
        'text'               => $text,
        'language'           => strtolower(explode('-', $langCode)[0]),
        'duration_seconds'   => 0,
        'processing_seconds' => 0,
        'segments'           => $segs,
    ];
}

/**
 * Local Whisper service — last engine in the cascade.
 *
 * Posts the audio file plus options as multipart/form-data to the Whisper
 * service on 127.0.0.1:20019 and returns its decoded JSON response. Every
 * failure is reported through dbnToolsError().
 * NOTE(review): this relies on dbnToolsError() ending the request — confirm.
 *
 * The file is attached as a CURLFile so cURL reads it from disk. Building the
 * multipart body by hand kept the whole upload (up to 200 MB) in memory twice.
 *
 * @param array  $file        $_FILES-style entry; 'name' and 'tmp_name' are read.
 * @param string $language    Language code; 'auto' omits the language field.
 * @param bool   $diarize     Use the /transcribe/diarize endpoint.
 * @param int    $numSpeakers Sent only when diarizing and greater than 1.
 * @param string $model       Whisper model name.
 * @param int    $beamSize    Beam size.
 * @param string $task        'transcribe' or 'translate'.
 * @param bool   $vadFilter   Sent as '1' or '0'.
 * @param string $initPrompt  Initial prompt; omitted when empty.
 * @return array Decoded response; always has a non-empty 'text' entry.
 */
function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, int $numSpeakers, string $model, int $beamSize, string $task, bool $vadFilter, string $initPrompt): array
{
    $whisperBase = 'http://127.0.0.1:20019';
    $endpoint    = $whisperBase . ($diarize ? '/transcribe/diarize' : '/transcribe');

    $audioPath = (string)$file['tmp_name'];
    if (!is_file($audioPath) || !is_readable($audioPath)) {
        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
    }

    // The file name comes from the client. Replace control characters, quotes
    // and backslashes so it cannot break out of the multipart part header.
    $postedName = preg_replace('/[\x00-\x1F\x7F"\\\\]/', '_', basename((string)$file['name']));
    if ($postedName === null || $postedName === '') {
        $postedName = 'audio';
    }

    $options = [
        'model'          => $model,
        'beam_size'      => (string)$beamSize,
        'task'           => $task,
        'vad_filter'     => $vadFilter ? '1' : '0',
        'initial_prompt' => $initPrompt,
    ];
    if ($language !== 'auto') {
        $options['language'] = $language;
    }
    if ($diarize && $numSpeakers > 1) {
        $options['num_speakers'] = (string)$numSpeakers;
    }

    // File part first, then every non-empty option, in the order defined above.
    $postFields = ['file' => new CURLFile($audioPath, 'application/octet-stream', $postedName)];
    foreach ($options as $name => $value) {
        if ($value === '') {
            continue;
        }
        $postFields[$name] = $value;
    }

    $ch = curl_init($endpoint);
    if ($ch === false) {
        dbnToolsError('Whisper service error (HTTP 0): could not initialise cURL', 502, 'whisper_error');
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        // An array payload makes cURL build the multipart body and set the
        // Content-Type header (including the boundary) itself.
        CURLOPT_POSTFIELDS     => $postFields,
        CURLOPT_HTTPHEADER     => ['Accept: application/json'],
        CURLOPT_TIMEOUT        => 600,
    ]);
    $responseBody = curl_exec($ch);
    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr      = curl_error($ch);
    curl_close($ch);

    if ($responseBody === false || $httpCode !== 200) {
        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
        dbnToolsError('Whisper service error (HTTP ' . $httpCode . '): ' . $detail, 502, 'whisper_error');
    }

    $data = json_decode((string)$responseBody, true);
    if (!is_array($data) || empty($data['text'])) {
        dbnToolsError('Empty or invalid response from Whisper.', 502, 'whisper_empty');
    }

    return $data;
}

/**
 * Ask a chat model to guess the role of each diarized speaker.
 *
 * Sends the first 20 segments that carry a 'speaker' key, formatted as
 * "[SPEAKER] text" lines, and expects a JSON object mapping speaker label to
 * role name. Best-effort: any failure is logged and yields an empty array.
 *
 * @param array  $segments   Transcript segments; 'speaker' and 'text' are read.
 * @param string $deployment Deployment name passed to the gateway.
 * @return array Speaker label => role name; empty when nothing could be labelled.
 */
function dbnLabelSpeakerRoles(array $segments, string $deployment = 'gpt-4o-mini'): array
{
    $sample = array_slice(
        array_values(array_filter($segments, static fn ($s): bool => isset($s['speaker']))),
        0,
        20
    );
    if (!$sample) {
        return [];
    }

    $lines = array_map(
        static fn ($s): string => "[{$s['speaker']}] " . trim((string)($s['text'] ?? '')),
        $sample
    );

    $system = 'Label speakers in this Norwegian legal transcript. '
        . 'Return ONLY valid JSON: {"SPEAKER_00":"dommer","SPEAKER_01":"forelder"}. '
        . 'Norwegian role names only — dommer, advokat, forelder, barn, sakkyndig, '
        . 'saksbehandler, tolk, vitne, prosessfullmektig. Use "ukjent" if unclear.';

    try {
        // The gateway is built inside the try so that a constructor failure
        // cannot abort a transcription that has already succeeded.
        $azure = (new DbnAzureOpenAiGateway())->withDeployment($deployment);
        $text  = $azure->chatText([
            ['role' => 'system', 'content' => $system],
            ['role' => 'user', 'content' => implode("\n", $lines)],
        ], ['temperature' => 0.1, 'max_tokens' => 200]);

        // Strip an optional Markdown code fence around the JSON.
        $cleaned = preg_replace('/^```(?:json)?\s*|\s*```$/m', '', trim((string)$text));
        $json    = json_decode((string)$cleaned, true);
        if (!is_array($json)) {
            return [];
        }

        // Keep only label => role-name pairs; drop nested or non-string values.
        return array_filter($json, static fn ($role): bool => is_string($role));
    } catch (Throwable $e) {
        error_log('STT: speaker role labelling failed: ' . $e->getMessage());
        return [];
    }
}

/**
 * Optional post-processing pass that asks a chat model to fix transcription
 * errors and punctuation without changing meaning or language.
 *
 * Best-effort: returns null when the model call fails, returns nothing, or
 * returns a text that looks truncated. A null return signals that no cleaned
 * text is available.
 *
 * @param string $text       Raw transcript.
 * @param string $language   Requested language code; selects the language name in the prompt.
 * @param string $vocabulary Optional domain terms to mention in the prompt.
 * @param string $deployment Deployment name passed to the gateway.
 * @return string|null Cleaned transcript, or null when no cleanup is available.
 */
function dbnCleanupTranscript(string $text, string $language, string $vocabulary, string $deployment): ?string
{
    // Covers every language code the endpoint accepts; 'auto' and anything
    // unknown keep the Norwegian default.
    $langName = match ($language) {
        'no', 'nb', 'nn' => 'Norwegian',
        'en' => 'English',
        'pl' => 'Polish',
        'uk' => 'Ukrainian',
        'sv' => 'Swedish',
        'da' => 'Danish',
        'de' => 'German',
        'fr' => 'French',
        'es' => 'Spanish',
        'fi' => 'Finnish',
        'nl' => 'Dutch',
        'it' => 'Italian',
        'pt' => 'Portuguese',
        'ru' => 'Russian',
        'ar' => 'Arabic',
        'tr' => 'Turkish',
        'zh' => 'Chinese',
        'ja' => 'Japanese',
        'ko' => 'Korean',
        default => 'Norwegian',
    };

    $vocabHint = $vocabulary !== '' ? " Domain terms to preserve correctly: {$vocabulary}." : '';
    $system    = "Fix transcription errors in this {$langName} text.{$vocabHint} "
        . "Correct mishearing errors, run-on sentences, and punctuation. "
        . "Preserve all meaning and the original language exactly. "
        . "Return only the corrected transcript text, no commentary.";

    $maxTokens = 4096;

    try {
        $azure  = (new DbnAzureOpenAiGateway())->withDeployment($deployment);
        $result = $azure->chatText(
            [['role' => 'system', 'content' => $system], ['role' => 'user', 'content' => $text]],
            ['temperature' => 0.1, 'max_tokens' => $maxTokens]
        );
    } catch (Throwable $e) {
        error_log('STT: transcript cleanup failed: ' . $e->getMessage());
        return null;
    }

    $cleaned = trim((string)($result ?? ''));
    if ($cleaned === '') {
        return null;
    }

    // The reply is capped at $maxTokens, so a long transcript can come back cut
    // off. Heuristic guard: for inputs longer than $maxTokens bytes, reject a
    // reply that is much shorter than the input instead of replacing the full
    // transcript with a truncated one. Shorter inputs are not checked, on the
    // assumption that a token is at least one byte long.
    $inputBytes = strlen($text);
    if ($inputBytes > $maxTokens && strlen($cleaned) < 0.75 * $inputBytes) {
        error_log('STT: transcript cleanup discarded, output looks truncated');
        return null;
    }

    return $cleaned;
}