# Review summary for this patch. These "#" lines sit before the first "diff --git"
# header, so they are outside every hunk.
#
# NOTE(review): this copy of the patch has no line breaks or indentation, and the
# added file's content starts mid-array at "'nb-NO', 'nb' => 'nb-NO'". Whatever
# preceded that (presumably the PHP open tag, the class header, the $projectId /
# $apiKey properties and the start of LOCALE_MAP) is not present here. Re-export
# the patch from the repository before applying it.
#
# api/transcribe.php
#   Step 2 (Google Cloud Speech) previously built the client path from
#   dbnToolsAiPortalRoot() . '/lib/ai/GcpSpeechClient.php' and only loaded it when
#   is_file() succeeded. It now does an unconditional
#   require_once __DIR__ . '/../includes/GcpSpeechClient.php'.
#   The rest is the same logic one nesting level shallower: if
#   GcpSpeechClient::fromConfig() returns a client, 'auto' is mapped to an empty
#   language, transcribe() is called with the same arguments as before, and
#   $engineUsed becomes 'gcp' on a non-null result; a null result is logged as a
#   fallback to Whisper.
#
# includes/GcpSpeechClient.php (new file)
#   __construct(projectId, apiKey)
#       Stores both values on the instance.
#   fromConfig(): ?self
#       Reads GCP_PROJECT_ID and GCP_API_KEY from the environment. If either is
#       missing it includes /etc/bnl/gcp.php (errors suppressed with @) and, when
#       that returns an array, fills the missing value(s) from its 'project_id' /
#       'api_key' entries. Returns null unless both values end up non-empty.
#   transcribe(audioPath, mimeType, language, diarize, minSpeakers, maxSpeakers,
#              timeoutSec): ?array
#       Builds a recognize request: autoDecodingConfig, model 'long', automatic
#       punctuation, languageCodes from resolveLocale() (plus 'nn-NO' when the
#       locale is 'nb-NO'), and a diarizationConfig when $diarize is true with
#       minSpeakerCount = max(2, minSpeakers) and maxSpeakerCount never below
#       that minimum. The audio file is read whole and sent base64-encoded as
#       JSON via POST to
#       .../v2/projects/{project}/locations/global/recognizers/_:recognize?key=...
#       Returns null (logging in most cases) on an unreadable/empty file, a curl
#       error, a non-200 status, non-array JSON, or an empty 'results' list. If
#       the response carries a 'name' containing '/operations/', the result is
#       taken from pollOperation() instead. $mimeType is accepted but not used
#       in the visible body.
#   pollOperation(operationName, timeoutSec): ?array
#       GETs https://speech.googleapis.com/v2/{operationName}?key=... in a loop
#       until time() reaches the deadline. It sleeps before each request: 5 s
#       first, growing by 5 s per iteration up to 15 s. Responses that are not
#       HTTP 200 or not a JSON array are skipped. Once 'done' is truthy it
#       returns $data['response'] ?? null. On timeout it logs and returns null.
#   normalizeResults(gcpResults, locale): ?array
#       Uses only alternatives[0] of each result. Transcripts are joined with a
#       single space; null is returned when the joined text is empty. Words are
#       merged across results and grouped into segments: a new segment starts at
#       the first word, or when a word has a speaker label that differs from the
#       current segment's speaker. Raw labels are renamed SPEAKER_00, SPEAKER_01,
#       ... in order of first appearance. Returns the keys 'text', 'language'
#       (lower-cased part of the locale before '-'), 'duration', 'segments' and
#       'num_speakers' (at least 1).
#   offsetToSec(offset): float
#       Strips trailing 's' characters and casts the remainder to float.
#   resolveLocale(language): string
#       '' maps to 'nb-NO'; otherwise LOCALE_MAP[language] if present, else the
#       input is passed through unchanged.
#
# Reviewer notes (to confirm before merge):
#   - NOTE(review): with the is_file() guard gone, a missing
#     includes/GcpSpeechClient.php is a fatal require_once error instead of a
#     silent skip to the next engine. Fine if this file always ships; confirm.
#   - NOTE(review): $timeoutSec is only handed to pollOperation(); the initial
#     POST uses a fixed CURLOPT_TIMEOUT of 60. Confirm 60 s is enough for the
#     largest base64 upload expected here.
#   - NOTE(review): pollOperation() sleeps before checking, so the total wait can
#     exceed $timeoutSec by up to one interval plus the 15 s request timeout.
#     It also retries every non-200 status until the deadline, and a finished
#     operation without a 'response' key yields null with no log line.
#   - NOTE(review): in normalizeResults() the "elseif ($segments)" duration
#     branch looks unreachable, because $segments is only filled from $allWords.
#   - NOTE(review): the request sets no word-time-offset feature. Confirm the API
#     still returns startOffset/endOffset on words; otherwise every segment
#     start/end and the duration fall back to 0.0.
#   - NOTE(review): the API key travels in the URL query string. Consider the
#     request-header form if access logs or proxies may record URLs.
#   - NOTE(review): json_encode() and the file read are not checked for failure
#     beyond the empty-string test on the base64 output.
#
diff --git a/api/transcribe.php b/api/transcribe.php index 044fcfd..0345db9 100644 --- a/api/transcribe.php +++ b/api/transcribe.php @@ -75,23 +75,20 @@ if ($azureKey !== '' && !$diarize && $file['size'] <= 1024 * 1024 && str_starts_ // 2. Google Cloud Speech v2 — long audio, diarization, everything Azure can't handle if ($result === null) { - $gcpPath = dbnToolsAiPortalRoot() . '/lib/ai/GcpSpeechClient.php'; - if (is_file($gcpPath)) { - require_once $gcpPath; - $gcp = GcpSpeechClient::fromConfig(); - if ($gcp) { - $gcpLang = ($language === 'auto') ? '' : $language; - $result = $gcp->transcribe( - $file['tmp_name'], $detectedMime, $gcpLang, - $diarize, - $numSpeakers > 1 ? $numSpeakers : 2, - $numSpeakers > 1 ? max($numSpeakers, 2) : 6 - ); - if ($result !== null) { - $engineUsed = 'gcp'; - } else { - error_log('STT: Google Cloud Speech failed, falling back to Whisper'); - } + require_once __DIR__ . '/../includes/GcpSpeechClient.php'; + $gcp = GcpSpeechClient::fromConfig(); + if ($gcp) { + $gcpLang = ($language === 'auto') ? '' : $language; + $result = $gcp->transcribe( + $file['tmp_name'], $detectedMime, $gcpLang, + $diarize, + $numSpeakers > 1 ? $numSpeakers : 2, + $numSpeakers > 1 ? 
max($numSpeakers, 2) : 6 + ); + if ($result !== null) { + $engineUsed = 'gcp'; + } else { + error_log('STT: Google Cloud Speech failed, falling back to Whisper'); } } } diff --git a/includes/GcpSpeechClient.php b/includes/GcpSpeechClient.php new file mode 100644 index 0000000..adaab22 --- /dev/null +++ b/includes/GcpSpeechClient.php @@ -0,0 +1,232 @@ + 'nb-NO', 'nb' => 'nb-NO', 'nn' => 'nn-NO', + 'en' => 'en-US', 'sv' => 'sv-SE', 'da' => 'da-DK', + 'de' => 'de-DE', 'fr' => 'fr-FR', + ]; + + public function __construct(string $projectId, string $apiKey) { + $this->projectId = $projectId; + $this->apiKey = $apiKey; + } + + public static function fromConfig(): ?self { + $projectId = getenv('GCP_PROJECT_ID') ?: null; + $apiKey = getenv('GCP_API_KEY') ?: null; + + if (!$projectId || !$apiKey) { + $cfg = @include '/etc/bnl/gcp.php'; + if (is_array($cfg)) { + if (!$projectId && !empty($cfg['project_id'])) $projectId = (string)$cfg['project_id']; + if (!$apiKey && !empty($cfg['api_key'])) $apiKey = (string)$cfg['api_key']; + } + } + + if (!$projectId || !$apiKey) return null; + return new self($projectId, $apiKey); + } + + /** + * Transcribe an audio file using GCP Speech-to-Text v2. + * + * Returns a Whisper-compatible array on success: + * ['text', 'language', 'duration', 'segments', 'num_speakers'] + * Returns null on any failure (caller should fall back to Whisper). 
+ */ + public function transcribe( + string $audioPath, + string $mimeType, + string $language, + bool $diarize, + int $minSpeakers = 2, + int $maxSpeakers = 6, + int $timeoutSec = 270 + ): ?array { + $locale = $this->resolveLocale($language); + + $features = ['enableAutomaticPunctuation' => true]; + if ($diarize) { + $features['diarizationConfig'] = [ + 'minSpeakerCount' => max(2, $minSpeakers), + 'maxSpeakerCount' => max(max(2, $minSpeakers), $maxSpeakers), + ]; + } + + $langCodes = [$locale]; + // Add Nynorsk as secondary when processing Norwegian content + if ($locale === 'nb-NO') $langCodes[] = 'nn-NO'; + + $config = [ + 'autoDecodingConfig' => (object)[], + 'languageCodes' => $langCodes, + 'model' => 'long', + 'features' => $features, + ]; + + $audioContent = base64_encode((string)file_get_contents($audioPath)); + if ($audioContent === '') { + error_log("GcpSpeechClient: failed to read audio file"); + return null; + } + + $body = json_encode(['config' => $config, 'content' => $audioContent]); + + $url = sprintf( + 'https://speech.googleapis.com/v2/projects/%s/locations/global/recognizers/_:recognize?key=%s', + rawurlencode($this->projectId), + rawurlencode($this->apiKey) + ); + + $ch = curl_init($url); + curl_setopt_array($ch, [ + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => $body, + CURLOPT_HTTPHEADER => ['Content-Type: application/json'], + CURLOPT_RETURNTRANSFER => true, + CURLOPT_TIMEOUT => 60, + ]); + $response = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + $curlErr = curl_error($ch); + curl_close($ch); + + if ($curlErr || !is_string($response)) { + error_log("GcpSpeechClient: curl error: {$curlErr}"); + return null; + } + if ($httpCode !== 200) { + error_log("GcpSpeechClient: HTTP {$httpCode}: " . 
substr($response, 0, 300)); + return null; + } + + $data = json_decode($response, true); + if (!is_array($data)) return null; + + // Long audio: GCP returns an operation name to poll + if (isset($data['name']) && str_contains((string)$data['name'], '/operations/')) { + $data = $this->pollOperation((string)$data['name'], $timeoutSec); + if ($data === null) return null; + } + + if (empty($data['results']) || !is_array($data['results'])) { + error_log("GcpSpeechClient: no results in response"); + return null; + } + + return $this->normalizeResults($data['results'], $locale); + } + + private function pollOperation(string $operationName, int $timeoutSec): ?array { + $url = "https://speech.googleapis.com/v2/{$operationName}?key=" . rawurlencode($this->apiKey); + $deadline = time() + $timeoutSec; + $interval = 5; + + while (time() < $deadline) { + sleep($interval); + $interval = min($interval + 5, 15); + + $ch = curl_init($url); + curl_setopt_array($ch, [ + CURLOPT_RETURNTRANSFER => true, + CURLOPT_TIMEOUT => 15, + ]); + $response = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($httpCode !== 200 || !is_string($response)) continue; + + $data = json_decode($response, true); + if (!is_array($data)) continue; + if (!empty($data['done'])) { + return $data['response'] ?? null; + } + } + + error_log("GcpSpeechClient: operation timed out after {$timeoutSec}s"); + return null; + } + + private function normalizeResults(array $gcpResults, string $locale): ?array { + $transcriptParts = []; + $allWords = []; + + foreach ($gcpResults as $result) { + $alt = $result['alternatives'][0] ?? 
null; + if (!$alt) continue; + if (!empty($alt['transcript'])) $transcriptParts[] = (string)$alt['transcript']; + if (!empty($alt['words'])) $allWords = array_merge($allWords, $alt['words']); + } + + $transcript = implode(' ', $transcriptParts); + if ($transcript === '') return null; + + // Group consecutive same-speaker words into segments + $segments = []; + $speakerMap = []; // raw speakerLabel → 'SPEAKER_XX' + $curSegment = null; + + foreach ($allWords as $word) { + $rawLabel = (string)($word['speakerLabel'] ?? ''); + if ($rawLabel !== '' && !isset($speakerMap[$rawLabel])) { + $speakerMap[$rawLabel] = sprintf('SPEAKER_%02d', count($speakerMap)); + } + $speakerKey = $rawLabel !== '' ? $speakerMap[$rawLabel] : null; + + $start = isset($word['startOffset']) ? $this->offsetToSec((string)$word['startOffset']) : 0.0; + $end = isset($word['endOffset']) ? $this->offsetToSec((string)$word['endOffset']) : $start; + $text = (string)($word['word'] ?? ''); + + $newSegment = $curSegment === null + || ($speakerKey !== null && $speakerKey !== ($curSegment['speaker'] ?? null)); + + if ($newSegment) { + if ($curSegment !== null) $segments[] = $curSegment; + $curSegment = ['text' => $text, 'start' => round($start, 3), 'end' => round($end, 3)]; + if ($speakerKey !== null) $curSegment['speaker'] = $speakerKey; + } else { + $curSegment['text'] .= ' ' . $text; + $curSegment['end'] = round($end, 3); + } + } + if ($curSegment !== null) $segments[] = $curSegment; + + // Duration from last word or last segment + $durationSec = 0.0; + if ($allWords) { + $last = end($allWords); + $durationSec = $this->offsetToSec((string)($last['endOffset'] ?? '0s')); + } elseif ($segments) { + $durationSec = (float)(end($segments)['end'] ?? 
0.0); + } + + return [ + 'text' => $transcript, + 'language' => strtolower(explode('-', $locale)[0]), + 'duration' => $durationSec, + 'segments' => $segments, + 'num_speakers' => max(1, count($speakerMap)), + ]; + } + + /** Convert GCP offset string like "1.200s" to float seconds. */ + private function offsetToSec(string $offset): float { + return (float)rtrim($offset, 's'); + } + + private function resolveLocale(string $language): string { + if ($language === '') return 'nb-NO'; + return self::LOCALE_MAP[$language] ?? $language; + } +}