fix: wire GCP Speech client into tools transcribe (was using unreachable ai-portal path)
Copies GcpSpeechClient into the tools repo so it's deployed with the code; removes the broken dbnToolsAiPortalRoot() path that resolved to a nonexistent /home/dobetternorge/ai-portal directory. Also restarted the CPU Whisper service which had a stuck CLOSE_WAIT socket causing silent fetch failures. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+1
-4
@@ -75,9 +75,7 @@ if ($azureKey !== '' && !$diarize && $file['size'] <= 1024 * 1024 && str_starts_
|
||||
|
||||
// 2. Google Cloud Speech v2 — long audio, diarization, everything Azure can't handle
|
||||
if ($result === null) {
|
||||
$gcpPath = dbnToolsAiPortalRoot() . '/lib/ai/GcpSpeechClient.php';
|
||||
if (is_file($gcpPath)) {
|
||||
require_once $gcpPath;
|
||||
require_once __DIR__ . '/../includes/GcpSpeechClient.php';
|
||||
$gcp = GcpSpeechClient::fromConfig();
|
||||
if ($gcp) {
|
||||
$gcpLang = ($language === 'auto') ? '' : $language;
|
||||
@@ -93,7 +91,6 @@ if ($result === null) {
|
||||
error_log('STT: Google Cloud Speech failed, falling back to Whisper');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Whisper GPU — local fallback
|
||||
|
||||
@@ -0,0 +1,232 @@
|
||||
<?php
|
||||
/**
 * Google Cloud Speech-to-Text v2 — no SDK, raw curl + API key auth.
 *
 * Credential priority:
 *   1. GCP_PROJECT_ID / GCP_API_KEY env vars
 *   2. /etc/bnl/gcp.php (array with 'project_id' and 'api_key')
 *
 * Every failure path logs via error_log() and returns null so the caller can
 * fall back to another transcription backend.
 */
class GcpSpeechClient {
    private const API_BASE = 'https://speech.googleapis.com/v2/';
    private const CONFIG_FILE = '/etc/bnl/gcp.php';
    private const DEFAULT_LOCALE = 'nb-NO';

    /** Timeout (seconds) for the initial recognize request. */
    private const RECOGNIZE_TIMEOUT_SEC = 60;
    /** Timeout (seconds) for a single operation status request. */
    private const POLL_REQUEST_TIMEOUT_SEC = 15;
    /** Upper bound (seconds) for the growing delay between status requests. */
    private const POLL_MAX_INTERVAL_SEC = 15;

    /** Short language codes accepted from callers → BCP-47 locales sent to the API. */
    private const LOCALE_MAP = [
        'no' => 'nb-NO', 'nb' => 'nb-NO', 'nn' => 'nn-NO',
        'en' => 'en-US', 'sv' => 'sv-SE', 'da' => 'da-DK',
        'de' => 'de-DE', 'fr' => 'fr-FR',
    ];

    private string $projectId;
    private string $apiKey;

    /**
     * @param string $projectId GCP project that owns the recognizer.
     * @param string $apiKey    API key used to authenticate every request.
     */
    public function __construct(string $projectId, string $apiKey) {
        $this->projectId = $projectId;
        $this->apiKey    = $apiKey;
    }

    /**
     * Build a client from the environment, falling back to the config file.
     *
     * Environment variables win; the config file only fills in values that are
     * still missing.
     *
     * @return self|null Null when either the project id or the API key is missing.
     */
    public static function fromConfig(): ?self {
        $projectId = getenv('GCP_PROJECT_ID') ?: null;
        $apiKey    = getenv('GCP_API_KEY') ?: null;

        if (!$projectId || !$apiKey) {
            // The file is optional; suppression keeps a missing or unreadable
            // file from emitting a warning on every request.
            $cfg = @include self::CONFIG_FILE;
            if (is_array($cfg)) {
                if (!$projectId && !empty($cfg['project_id'])) {
                    $projectId = (string)$cfg['project_id'];
                }
                if (!$apiKey && !empty($cfg['api_key'])) {
                    $apiKey = (string)$cfg['api_key'];
                }
            }
        }

        if (!$projectId || !$apiKey) {
            return null;
        }
        return new self($projectId, $apiKey);
    }

    /**
     * Transcribe an audio file using GCP Speech-to-Text v2.
     *
     * Returns a Whisper-compatible array on success:
     *   ['text', 'language', 'duration', 'segments', 'num_speakers']
     * Returns null on any failure (caller should fall back to Whisper).
     *
     * NOTE(review): this calls the synchronous `recognize` method and inlines
     * the whole file as base64; Google documents size/length limits for that
     * method — confirm that long recordings are actually accepted here.
     *
     * @param string $audioPath   Path of the audio file to upload.
     * @param string $mimeType    Currently unused: the request relies on
     *                            autoDecodingConfig. Kept for interface compatibility.
     * @param string $language    Short code ('no', 'en', …), a full locale, or ''
     *                            for the default (nb-NO).
     * @param bool   $diarize     Whether to request speaker diarization.
     * @param int    $minSpeakers Lower speaker bound (clamped to at least 2).
     * @param int    $maxSpeakers Upper speaker bound (never below the lower bound).
     * @param int    $timeoutSec  Budget for polling a long-running operation.
     *
     * @return array{text:string,language:string,duration:float,segments:array,num_speakers:int}|null
     */
    public function transcribe(
        string $audioPath,
        string $mimeType,
        string $language,
        bool $diarize,
        int $minSpeakers = 2,
        int $maxSpeakers = 6,
        int $timeoutSec = 270
    ): ?array {
        $audio = file_get_contents($audioPath);
        if ($audio === false || $audio === '') {
            error_log("GcpSpeechClient: failed to read audio file");
            return null;
        }

        $locale = $this->resolveLocale($language);

        $features = [
            'enableAutomaticPunctuation' => true,
            // Without this the API returns no per-word timing, which leaves
            // 'segments' empty and 'duration' at zero for non-diarized audio.
            'enableWordTimeOffsets'      => true,
        ];
        if ($diarize) {
            $minCount = max(2, $minSpeakers);
            $features['diarizationConfig'] = [
                'minSpeakerCount' => $minCount,
                'maxSpeakerCount' => max($minCount, $maxSpeakers),
            ];
        }

        $langCodes = [$locale];
        // Add Nynorsk as secondary when processing Norwegian content
        if ($locale === 'nb-NO') {
            $langCodes[] = 'nn-NO';
        }

        $config = [
            'autoDecodingConfig' => (object)[], // must serialize as {} rather than []
            'languageCodes'      => $langCodes,
            'model'              => 'long',
            'features'           => $features,
        ];

        $body = json_encode(['config' => $config, 'content' => base64_encode($audio)]);
        unset($audio); // the encoded copy is all that is needed from here on
        if ($body === false) {
            error_log("GcpSpeechClient: failed to encode request: " . json_last_error_msg());
            return null;
        }

        $url = self::API_BASE . sprintf(
            'projects/%s/locations/global/recognizers/_:recognize',
            rawurlencode($this->projectId)
        );

        [$httpCode, $response, $curlErr] = $this->sendRequest($url, $body, self::RECOGNIZE_TIMEOUT_SEC);

        if ($curlErr !== '' || !is_string($response)) {
            error_log("GcpSpeechClient: curl error: {$curlErr}");
            return null;
        }
        if ($httpCode !== 200) {
            error_log("GcpSpeechClient: HTTP {$httpCode}: " . substr($response, 0, 300));
            return null;
        }

        $data = json_decode($response, true);
        if (!is_array($data)) {
            error_log("GcpSpeechClient: response is not valid JSON");
            return null;
        }

        // Long audio: GCP returns an operation name to poll
        if (isset($data['name']) && str_contains((string)$data['name'], '/operations/')) {
            $data = $this->pollOperation((string)$data['name'], $timeoutSec);
            if ($data === null) {
                return null;
            }
        }

        if (empty($data['results']) || !is_array($data['results'])) {
            error_log("GcpSpeechClient: no results in response");
            return null;
        }

        return $this->normalizeResults($data['results'], $locale);
    }

    /**
     * Perform one HTTP call against the Speech API.
     *
     * The API key travels in the X-Goog-Api-Key header rather than the query
     * string so it does not end up in URLs recorded by proxies or logs.
     *
     * @param string      $url        Absolute request URL.
     * @param string|null $jsonBody   JSON payload for a POST; null issues a GET.
     * @param int         $timeoutSec Total transfer timeout in seconds.
     *
     * @return array{0:int,1:?string,2:string} [HTTP status, body or null, curl error text]
     */
    private function sendRequest(string $url, ?string $jsonBody, int $timeoutSec): array {
        $ch = curl_init($url);
        if ($ch === false) {
            return [0, null, 'curl_init failed'];
        }

        $headers = ['X-Goog-Api-Key: ' . $this->apiKey];
        $options = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => $timeoutSec,
        ];
        if ($jsonBody !== null) {
            $headers[] = 'Content-Type: application/json';
            $options[CURLOPT_POST]       = true;
            $options[CURLOPT_POSTFIELDS] = $jsonBody;
        }
        $options[CURLOPT_HTTPHEADER] = $headers;

        curl_setopt_array($ch, $options);
        $response = curl_exec($ch);
        $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlErr  = curl_error($ch);
        curl_close($ch);

        return [$httpCode, is_string($response) ? $response : null, $curlErr];
    }

    /**
     * Poll a long-running operation until it completes or the budget runs out.
     *
     * The delay starts at 5 s and grows by 5 s per attempt up to
     * POLL_MAX_INTERVAL_SEC; it is clamped so the loop never sleeps past the
     * deadline.
     *
     * @param string $operationName Operation resource name returned by the API.
     * @param int    $timeoutSec    Total time budget in seconds.
     *
     * @return array|null The operation's `response` payload, or null on
     *                    failure, operation error, or timeout.
     */
    private function pollOperation(string $operationName, int $timeoutSec): ?array {
        $url      = self::API_BASE . $operationName;
        $deadline = time() + $timeoutSec;
        $interval = 5;

        while (($remaining = $deadline - time()) > 0) {
            sleep(min($interval, $remaining));
            $interval = min($interval + 5, self::POLL_MAX_INTERVAL_SEC);

            [$httpCode, $response] = $this->sendRequest($url, null, self::POLL_REQUEST_TIMEOUT_SEC);

            if ($httpCode !== 200 || !is_string($response)) {
                // Client errors other than timeout / rate limiting will not
                // resolve by retrying, so stop instead of burning the budget.
                $permanent = $httpCode >= 400 && $httpCode < 500
                    && $httpCode !== 408 && $httpCode !== 429;
                if ($permanent) {
                    error_log("GcpSpeechClient: operation poll failed with HTTP {$httpCode}");
                    return null;
                }
                continue;
            }

            $data = json_decode($response, true);
            if (!is_array($data) || empty($data['done'])) {
                continue;
            }

            if (isset($data['error'])) {
                $message = is_array($data['error']) ? (string)($data['error']['message'] ?? '') : '';
                error_log("GcpSpeechClient: operation failed: " . substr($message, 0, 300));
                return null;
            }

            $payload = $data['response'] ?? null;
            if (!is_array($payload)) {
                error_log("GcpSpeechClient: operation finished without a response");
                return null;
            }
            return $payload;
        }

        error_log("GcpSpeechClient: operation timed out after {$timeoutSec}s");
        return null;
    }

    /**
     * Convert the API's `results` list into the Whisper-compatible shape.
     *
     * Segmentation rules:
     *   - Words carrying a speaker label are grouped into runs of the same
     *     speaker, even across result boundaries.
     *   - Words without a speaker label start a new segment at every result
     *     boundary, so undiarized audio is not collapsed into one segment.
     *   - A result with a transcript but no word list becomes one segment that
     *     spans from the previous end time to its `resultEndOffset` (the start
     *     is therefore an approximation).
     *
     * @param array  $gcpResults Decoded `results` array from the API response.
     * @param string $locale     Locale that was requested; its language part is
     *                           reported as 'language'.
     *
     * @return array|null Null when no result carried any transcript text.
     */
    private function normalizeResults(array $gcpResults, string $locale): ?array {
        $transcriptParts = [];
        $segments        = [];
        $speakerMap      = []; // raw speakerLabel → 'SPEAKER_XX'
        $curSegment      = null;
        $durationSec     = 0.0;

        foreach ($gcpResults as $result) {
            $alt = is_array($result) ? ($result['alternatives'][0] ?? null) : null;
            if (!is_array($alt)) {
                continue;
            }

            $resultText = (string)($alt['transcript'] ?? '');
            if ($resultText !== '') {
                $transcriptParts[] = $resultText;
            }

            $words = (!empty($alt['words']) && is_array($alt['words'])) ? $alt['words'] : [];

            if (!$words) {
                if ($resultText === '') {
                    continue;
                }
                // No word timing available: fall back to one segment per result.
                if ($curSegment !== null) {
                    $segments[] = $curSegment;
                    $curSegment = null;
                }
                $start = $durationSec;
                $end   = isset($result['resultEndOffset'])
                    ? max($start, $this->offsetToSec((string)$result['resultEndOffset']))
                    : $start;
                $segments[]  = ['text' => trim($resultText), 'start' => round($start, 3), 'end' => round($end, 3)];
                $durationSec = $end;
                continue;
            }

            $firstWordOfResult = true;
            foreach ($words as $word) {
                if (!is_array($word)) {
                    continue;
                }

                $rawLabel   = (string)($word['speakerLabel'] ?? '');
                $speakerKey = null;
                if ($rawLabel !== '') {
                    if (!isset($speakerMap[$rawLabel])) {
                        $speakerMap[$rawLabel] = sprintf('SPEAKER_%02d', count($speakerMap));
                    }
                    $speakerKey = $speakerMap[$rawLabel];
                }

                $start = isset($word['startOffset']) ? $this->offsetToSec((string)$word['startOffset']) : 0.0;
                $end   = isset($word['endOffset']) ? $this->offsetToSec((string)$word['endOffset']) : $start;
                $text  = (string)($word['word'] ?? '');

                $speakerChanged   = $speakerKey !== null && $speakerKey !== ($curSegment['speaker'] ?? null);
                $unlabelledBreak  = $firstWordOfResult && $speakerKey === null && !isset($curSegment['speaker']);
                $firstWordOfResult = false;

                if ($curSegment === null || $speakerChanged || $unlabelledBreak) {
                    if ($curSegment !== null) {
                        $segments[] = $curSegment;
                    }
                    $curSegment = ['text' => $text, 'start' => round($start, 3), 'end' => round($end, 3)];
                    if ($speakerKey !== null) {
                        $curSegment['speaker'] = $speakerKey;
                    }
                } else {
                    $curSegment['text'] .= ' ' . $text;
                    $curSegment['end']   = round($end, 3);
                }

                $durationSec = max($durationSec, $end);
            }
        }
        if ($curSegment !== null) {
            $segments[] = $curSegment;
        }

        $transcript = implode(' ', $transcriptParts);
        if ($transcript === '') {
            return null;
        }

        return [
            'text'         => $transcript,
            'language'     => strtolower(explode('-', $locale)[0]),
            'duration'     => $durationSec,
            'segments'     => $segments,
            'num_speakers' => max(1, count($speakerMap)),
        ];
    }

    /** Convert GCP offset string like "1.200s" to float seconds. */
    private function offsetToSec(string $offset): float {
        return (float)rtrim($offset, 's');
    }

    /**
     * Map a caller-supplied language to the locale sent to the API.
     *
     * Empty input selects the default locale. Known short codes are matched
     * case-insensitively; anything else is passed through (trimmed) unchanged.
     */
    private function resolveLocale(string $language): string {
        $language = trim($language);
        if ($language === '') {
            return self::DEFAULT_LOCALE;
        }
        return self::LOCALE_MAP[strtolower($language)] ?? $language;
    }
}
|
||||
Reference in New Issue
Block a user