Files
daveadmin 568314c554 fix: wire GCP Speech client into tools transcribe (was using unreachable ai-portal path)
Copies GcpSpeechClient into the tools repo so it's deployed with the code;
removes the broken dbnToolsAiPortalRoot() path that resolved to a nonexistent
/home/dobetternorge/ai-portal directory. Also restarted the CPU Whisper
service which had a stuck CLOSE_WAIT socket causing silent fetch failures.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 13:43:28 +02:00

233 lines
8.3 KiB
PHP

<?php
/**
* Google Cloud Speech-to-Text v2 — no SDK, raw curl + API key auth.
*
* Credential priority:
* 1. GCP_PROJECT_ID / GCP_API_KEY env vars
* 2. /etc/bnl/gcp.php (array with 'project_id' and 'api_key')
*/
class GcpSpeechClient {
    /** Optional credential file; expected to return an array with 'project_id' and 'api_key'. */
    private const CONFIG_PATH = '/etc/bnl/gcp.php';

    /** Upper bound, in seconds, for the initial recognize request. */
    private const RECOGNIZE_TIMEOUT_SEC = 60;

    /** Short language codes accepted from callers, mapped to the locale sent to GCP. */
    private const LOCALE_MAP = [
        'no' => 'nb-NO', 'nb' => 'nb-NO', 'nn' => 'nn-NO',
        'en' => 'en-US', 'sv' => 'sv-SE', 'da' => 'da-DK',
        'de' => 'de-DE', 'fr' => 'fr-FR',
    ];

    private string $projectId;
    private string $apiKey;

    /**
     * @param string $projectId GCP project id, placed in the request URL path.
     * @param string $apiKey    API key, sent as the `key` query parameter.
     */
    public function __construct(string $projectId, string $apiKey) {
        $this->projectId = $projectId;
        $this->apiKey = $apiKey;
    }

    /**
     * Build a client from the environment, falling back to the config file.
     *
     * Environment variables win; the config file only fills in whichever
     * value is still missing.
     *
     * @return self|null Null when either credential is unavailable.
     */
    public static function fromConfig(): ?self {
        $projectId = getenv('GCP_PROJECT_ID') ?: null;
        $apiKey = getenv('GCP_API_KEY') ?: null;

        // Check readability up front rather than silencing include with '@',
        // so that real errors inside the config file are not hidden.
        if ((!$projectId || !$apiKey) && is_readable(self::CONFIG_PATH)) {
            $cfg = include self::CONFIG_PATH;
            if (is_array($cfg)) {
                if (!$projectId && !empty($cfg['project_id'])) {
                    $projectId = (string)$cfg['project_id'];
                }
                if (!$apiKey && !empty($cfg['api_key'])) {
                    $apiKey = (string)$cfg['api_key'];
                }
            }
        }

        if (!$projectId || !$apiKey) {
            return null;
        }
        return new self($projectId, $apiKey);
    }

    /**
     * Transcribe an audio file using GCP Speech-to-Text v2.
     *
     * Returns a Whisper-compatible array on success:
     * ['text', 'language', 'duration', 'segments', 'num_speakers']
     * Returns null on any failure (caller should fall back to Whisper).
     *
     * @param string $audioPath   Path of the audio file to upload.
     * @param string $mimeType    Currently unused: the request asks GCP to auto-detect the encoding.
     * @param string $language    Short code from LOCALE_MAP or a full locale; '' selects 'nb-NO'.
     * @param bool   $diarize     Whether to request speaker diarization.
     * @param int    $minSpeakers Lower speaker bound for diarization (raised to at least 2).
     * @param int    $maxSpeakers Upper speaker bound for diarization (raised to at least the lower bound).
     * @param int    $timeoutSec  Budget for polling a long-running operation; also caps the
     *                            initial request timeout when smaller than 60 seconds.
     */
    public function transcribe(
        string $audioPath,
        string $mimeType,
        string $language,
        bool $diarize,
        int $minSpeakers = 2,
        int $maxSpeakers = 6,
        int $timeoutSec = 270
    ): ?array {
        $locale = $this->resolveLocale($language);

        $features = ['enableAutomaticPunctuation' => true];
        if ($diarize) {
            $minCount = max(2, $minSpeakers);
            $features['diarizationConfig'] = [
                'minSpeakerCount' => $minCount,
                'maxSpeakerCount' => max($minCount, $maxSpeakers),
            ];
        }

        $langCodes = [$locale];
        // Add Nynorsk as secondary when processing Norwegian content
        if ($locale === 'nb-NO') {
            $langCodes[] = 'nn-NO';
        }

        $config = [
            'autoDecodingConfig' => (object)[],
            'languageCodes' => $langCodes,
            'model' => 'long',
            'features' => $features,
        ];

        $raw = file_get_contents($audioPath);
        if ($raw === false || $raw === '') {
            error_log("GcpSpeechClient: failed to read audio file");
            return null;
        }
        $audioContent = base64_encode($raw);
        unset($raw); // the audio can be large; do not keep the raw copy around

        $body = json_encode(['config' => $config, 'content' => $audioContent]);
        unset($audioContent);
        if ($body === false) {
            // e.g. a caller-supplied language string that is not valid UTF-8
            error_log('GcpSpeechClient: failed to encode request: ' . json_last_error_msg());
            return null;
        }

        $url = sprintf(
            'https://speech.googleapis.com/v2/projects/%s/locations/global/recognizers/_:recognize?key=%s',
            rawurlencode($this->projectId),
            rawurlencode($this->apiKey)
        );

        $ch = curl_init($url);
        if ($ch === false) {
            error_log('GcpSpeechClient: curl error: could not initialise handle');
            return null;
        }
        curl_setopt_array($ch, [
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            CURLOPT_RETURNTRANSFER => true,
            // Never wait longer than the caller's budget; floor at 1 because
            // a CURLOPT_TIMEOUT of 0 means "no timeout".
            CURLOPT_TIMEOUT => max(1, min(self::RECOGNIZE_TIMEOUT_SEC, $timeoutSec)),
        ]);
        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlErr = curl_error($ch);
        curl_close($ch);

        if ($curlErr || !is_string($response)) {
            error_log("GcpSpeechClient: curl error: {$curlErr}");
            return null;
        }
        if ($httpCode !== 200) {
            error_log("GcpSpeechClient: HTTP {$httpCode}: " . substr($response, 0, 300));
            return null;
        }

        $data = json_decode($response, true);
        if (!is_array($data)) {
            return null;
        }

        // Long audio: GCP returns an operation name to poll
        if (isset($data['name']) && str_contains((string)$data['name'], '/operations/')) {
            $data = $this->pollOperation((string)$data['name'], $timeoutSec);
            if ($data === null) {
                return null;
            }
        }

        if (empty($data['results']) || !is_array($data['results'])) {
            error_log("GcpSpeechClient: no results in response");
            return null;
        }
        return $this->normalizeResults($data['results'], $locale);
    }

    /**
     * Poll a long-running operation until it finishes or the budget runs out.
     *
     * Waits 5s before the first poll, then backs off in 5s steps up to 15s.
     * Transport errors, non-JSON bodies and retryable statuses are retried;
     * a 4xx other than 408/429 aborts immediately since retrying cannot help.
     *
     * @return array|null The operation's 'response' payload, or null on
     *                    error, timeout, or a missing/non-array response.
     */
    private function pollOperation(string $operationName, int $timeoutSec): ?array {
        $url = "https://speech.googleapis.com/v2/{$operationName}?key=" . rawurlencode($this->apiKey);
        $deadline = time() + $timeoutSec;
        $interval = 5;

        while (($remaining = $deadline - time()) > 0) {
            // Cap the sleep so we never overshoot the caller's deadline.
            sleep(min($interval, $remaining));
            $interval = min($interval + 5, 15);

            $ch = curl_init($url);
            if ($ch === false) {
                continue;
            }
            curl_setopt_array($ch, [
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT => 15,
            ]);
            $response = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            if (!is_string($response)) {
                continue; // transport failure: try again
            }
            $isClientError = $httpCode >= 400 && $httpCode < 500;
            if ($isClientError && $httpCode !== 408 && $httpCode !== 429) {
                error_log("GcpSpeechClient: operation poll HTTP {$httpCode}: " . substr($response, 0, 300));
                return null;
            }
            if ($httpCode !== 200) {
                continue;
            }

            $data = json_decode($response, true);
            if (!is_array($data) || empty($data['done'])) {
                continue;
            }

            if (isset($data['error'])) {
                // A finished operation may carry an error instead of a response.
                error_log('GcpSpeechClient: operation failed: ' . substr((string)json_encode($data['error']), 0, 300));
                return null;
            }
            $payload = $data['response'] ?? null;
            return is_array($payload) ? $payload : null;
        }

        error_log("GcpSpeechClient: operation timed out after {$timeoutSec}s");
        return null;
    }

    /**
     * Convert GCP recognition results into the Whisper-compatible shape.
     *
     * When word-level data is present, consecutive words of the same speaker
     * are grouped into one segment. When it is absent, one segment is emitted
     * per result using the result's 'resultEndOffset', so segments and
     * duration are still populated.
     *
     * @param array  $gcpResults The 'results' list from the API response.
     * @param string $locale     Locale that was requested; its language part is reported back.
     * @return array|null Null when no result carries any transcript text.
     */
    private function normalizeResults(array $gcpResults, string $locale): ?array {
        $transcriptParts = [];
        $wordBatches = [];
        $resultSegments = []; // fallback segments, one per result
        $prevResultEnd = 0.0;

        foreach ($gcpResults as $result) {
            if (!is_array($result)) {
                continue;
            }
            $alt = $result['alternatives'][0] ?? null;
            if (!is_array($alt) || !$alt) {
                continue;
            }

            $resultEnd = isset($result['resultEndOffset'])
                ? $this->offsetToSec((string)$result['resultEndOffset'])
                : $prevResultEnd;

            // Compare against '' explicitly: empty() would also drop a literal "0".
            $text = (string)($alt['transcript'] ?? '');
            if ($text !== '') {
                $transcriptParts[] = $text;
                $resultSegments[] = [
                    'text' => trim($text),
                    'start' => round($prevResultEnd, 3),
                    'end' => round(max($prevResultEnd, $resultEnd), 3),
                ];
            }
            $prevResultEnd = max($prevResultEnd, $resultEnd);

            if (!empty($alt['words']) && is_array($alt['words'])) {
                $wordBatches[] = $alt['words'];
            }
        }

        $transcript = implode(' ', $transcriptParts);
        if ($transcript === '') {
            return null;
        }

        // Merge once instead of array_merge() per result (which is quadratic).
        $allWords = $wordBatches ? array_merge(...$wordBatches) : [];

        // Group consecutive same-speaker words into segments
        $segments = [];
        $speakerMap = []; // raw speakerLabel → 'SPEAKER_XX'
        $curSegment = null;
        $lastWordEnd = 0.0;

        foreach ($allWords as $word) {
            if (!is_array($word)) {
                continue;
            }
            $rawLabel = (string)($word['speakerLabel'] ?? '');
            if ($rawLabel !== '' && !isset($speakerMap[$rawLabel])) {
                $speakerMap[$rawLabel] = sprintf('SPEAKER_%02d', count($speakerMap));
            }
            $speakerKey = $rawLabel !== '' ? $speakerMap[$rawLabel] : null;

            $start = isset($word['startOffset']) ? $this->offsetToSec((string)$word['startOffset']) : 0.0;
            $end = isset($word['endOffset']) ? $this->offsetToSec((string)$word['endOffset']) : $start;
            $text = (string)($word['word'] ?? '');
            $lastWordEnd = max($lastWordEnd, $end);

            // An unlabeled word never opens a new segment; it joins the current one.
            $newSegment = $curSegment === null
                || ($speakerKey !== null && $speakerKey !== ($curSegment['speaker'] ?? null));

            if ($newSegment) {
                if ($curSegment !== null) {
                    $segments[] = $curSegment;
                }
                $curSegment = ['text' => $text, 'start' => round($start, 3), 'end' => round($end, 3)];
                if ($speakerKey !== null) {
                    $curSegment['speaker'] = $speakerKey;
                }
            } else {
                $curSegment['text'] .= ' ' . $text;
                $curSegment['end'] = round($end, 3);
            }
        }
        if ($curSegment !== null) {
            $segments[] = $curSegment;
        }

        // Prefer word timing; otherwise fall back to result-level offsets.
        if ($segments) {
            $durationSec = $lastWordEnd;
        } else {
            $segments = $resultSegments;
            $durationSec = $prevResultEnd;
        }

        return [
            'text' => $transcript,
            'language' => strtolower(explode('-', $locale)[0]),
            'duration' => $durationSec,
            'segments' => $segments,
            'num_speakers' => max(1, count($speakerMap)),
        ];
    }

    /** Convert GCP offset string like "1.200s" to float seconds. */
    private function offsetToSec(string $offset): float {
        return (float)rtrim($offset, 's');
    }

    /**
     * Map a caller-supplied language to the locale sent to GCP.
     *
     * '' defaults to 'nb-NO'; codes not in LOCALE_MAP are passed through unchanged.
     */
    private function resolveLocale(string $language): string {
        if ($language === '') {
            return 'nb-NO';
        }
        return self::LOCALE_MAP[$language] ?? $language;
    }
}