fix: wire GCP Speech client into tools transcribe (was using unreachable ai-portal path)
Copies GcpSpeechClient into the tools repo so it's deployed with the code; removes the broken dbnToolsAiPortalRoot() path that resolved to a nonexistent /home/dobetternorge/ai-portal directory. Also restarted the CPU Whisper service which had a stuck CLOSE_WAIT socket causing silent fetch failures. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+1
-4
@@ -75,9 +75,7 @@ if ($azureKey !== '' && !$diarize && $file['size'] <= 1024 * 1024 && str_starts_
|
|||||||
|
|
||||||
// 2. Google Cloud Speech v2 — long audio, diarization, everything Azure can't handle
|
// 2. Google Cloud Speech v2 — long audio, diarization, everything Azure can't handle
|
||||||
if ($result === null) {
|
if ($result === null) {
|
||||||
$gcpPath = dbnToolsAiPortalRoot() . '/lib/ai/GcpSpeechClient.php';
|
require_once __DIR__ . '/../includes/GcpSpeechClient.php';
|
||||||
if (is_file($gcpPath)) {
|
|
||||||
require_once $gcpPath;
|
|
||||||
$gcp = GcpSpeechClient::fromConfig();
|
$gcp = GcpSpeechClient::fromConfig();
|
||||||
if ($gcp) {
|
if ($gcp) {
|
||||||
$gcpLang = ($language === 'auto') ? '' : $language;
|
$gcpLang = ($language === 'auto') ? '' : $language;
|
||||||
@@ -93,7 +91,6 @@ if ($result === null) {
|
|||||||
error_log('STT: Google Cloud Speech failed, falling back to Whisper');
|
error_log('STT: Google Cloud Speech failed, falling back to Whisper');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. Whisper GPU — local fallback
|
// 3. Whisper GPU — local fallback
|
||||||
|
|||||||
@@ -0,0 +1,232 @@
|
|||||||
|
<?php
|
||||||
|
/**
|
||||||
|
* Google Cloud Speech-to-Text v2 — no SDK, raw curl + API key auth.
|
||||||
|
*
|
||||||
|
* Credential priority:
|
||||||
|
* 1. GCP_PROJECT_ID / GCP_API_KEY env vars
|
||||||
|
* 2. /etc/bnl/gcp.php (array with 'project_id' and 'api_key')
|
||||||
|
*/
|
||||||
|
class GcpSpeechClient {
    /** Credentials file consulted when the environment variables are missing. */
    private const CONFIG_PATH = '/etc/bnl/gcp.php';

    /** Base URL of the Speech-to-Text v2 REST API. */
    private const API_BASE = 'https://speech.googleapis.com/v2';

    /** curl timeout (seconds) for the initial recognize request. */
    private const REQUEST_TIMEOUT_SEC = 60;

    /** curl timeout (seconds) for a single operation-status poll. */
    private const POLL_REQUEST_TIMEOUT_SEC = 15;

    /** Short language code => locale code sent to the API. */
    private const LOCALE_MAP = [
        'no' => 'nb-NO', 'nb' => 'nb-NO', 'nn' => 'nn-NO',
        'en' => 'en-US', 'sv' => 'sv-SE', 'da' => 'da-DK',
        'de' => 'de-DE', 'fr' => 'fr-FR',
    ];

    private string $projectId;
    private string $apiKey;

    /**
     * @param string $projectId GCP project id, placed in the request URL path.
     * @param string $apiKey    API key, sent as the `key` query parameter.
     */
    public function __construct(string $projectId, string $apiKey) {
        $this->projectId = $projectId;
        $this->apiKey    = $apiKey;
    }

    /**
     * Build a client from the environment, falling back to the config file.
     *
     * GCP_PROJECT_ID / GCP_API_KEY take precedence; any value still missing is
     * read from the 'project_id' / 'api_key' keys of the array returned by
     * the config file.
     *
     * @return self|null Null when either credential cannot be resolved.
     */
    public static function fromConfig(): ?self {
        $projectId = getenv('GCP_PROJECT_ID') ?: null;
        $apiKey    = getenv('GCP_API_KEY') ?: null;

        // Only touch the file when something is missing, and only when it is
        // actually there: an explicit check replaces error suppression.
        if ((!$projectId || !$apiKey) && is_file(self::CONFIG_PATH) && is_readable(self::CONFIG_PATH)) {
            $cfg = include self::CONFIG_PATH;
            if (is_array($cfg)) {
                if (!$projectId && !empty($cfg['project_id'])) $projectId = (string)$cfg['project_id'];
                if (!$apiKey && !empty($cfg['api_key'])) $apiKey = (string)$cfg['api_key'];
            }
        }

        if (!$projectId || !$apiKey) return null;
        return new self($projectId, $apiKey);
    }

    /**
     * Transcribe an audio file using GCP Speech-to-Text v2.
     *
     * Returns a Whisper-compatible array on success:
     *   ['text', 'language', 'duration', 'segments', 'num_speakers']
     * Returns null on any failure (caller should fall back to Whisper); the
     * reason is written to the error log.
     *
     * @param string $audioPath   Path of the audio file; its bytes are sent inline, base64-encoded.
     * @param string $mimeType    Currently unused: the request asks the API to auto-detect the encoding.
     * @param string $language    Short code ('no', 'en', ...) or a locale; '' selects nb-NO.
     * @param bool   $diarize     Whether to request speaker diarization.
     * @param int    $minSpeakers Lower speaker bound (raised to at least 2).
     * @param int    $maxSpeakers Upper speaker bound (raised to at least the lower bound).
     * @param int    $timeoutSec  Budget in seconds for polling a long-running operation.
     */
    public function transcribe(
        string $audioPath,
        string $mimeType,
        string $language,
        bool $diarize,
        int $minSpeakers = 2,
        int $maxSpeakers = 6,
        int $timeoutSec = 270
    ): ?array {
        $locale = $this->resolveLocale($language);

        $audio = $this->readAudio($audioPath);
        if ($audio === null) {
            error_log("GcpSpeechClient: failed to read audio file");
            return null;
        }

        $body = json_encode([
            'config'  => $this->buildConfig($locale, $diarize, $minSpeakers, $maxSpeakers),
            'content' => base64_encode($audio),
        ]);
        if (!is_string($body)) {
            error_log('GcpSpeechClient: failed to encode request body: ' . json_last_error_msg());
            return null;
        }

        $url = sprintf(
            '%s/projects/%s/locations/global/recognizers/_:recognize?key=%s',
            self::API_BASE,
            rawurlencode($this->projectId),
            rawurlencode($this->apiKey)
        );

        [$httpCode, $response, $curlErr] = $this->httpRequest($url, self::REQUEST_TIMEOUT_SEC, $body);

        if ($curlErr !== '' || $response === null) {
            error_log("GcpSpeechClient: curl error: {$curlErr}");
            return null;
        }
        if ($httpCode !== 200) {
            error_log("GcpSpeechClient: HTTP {$httpCode}: " . substr($response, 0, 300));
            return null;
        }

        $data = json_decode($response, true);
        if (!is_array($data)) {
            error_log("GcpSpeechClient: response is not valid JSON");
            return null;
        }

        // Long audio: GCP returns an operation name to poll.
        // NOTE(review): confirm the synchronous :recognize endpoint can actually
        // answer with an operation; this branch may be unreachable in practice.
        $operationName = $data['name'] ?? null;
        if (is_string($operationName) && str_contains($operationName, '/operations/')) {
            $data = $this->pollOperation($operationName, $timeoutSec);
            if ($data === null) return null;
        }

        if (empty($data['results']) || !is_array($data['results'])) {
            error_log("GcpSpeechClient: no results in response");
            return null;
        }

        return $this->normalizeResults($data['results'], $locale);
    }

    /**
     * Read the audio file, returning null when it is missing, unreadable or empty.
     * Checking readability first avoids the PHP warning file_get_contents() would emit.
     */
    private function readAudio(string $audioPath): ?string {
        if (!is_file($audioPath) || !is_readable($audioPath)) return null;

        $bytes = file_get_contents($audioPath);
        return (is_string($bytes) && $bytes !== '') ? $bytes : null;
    }

    /** Assemble the recognition config object of the request body. */
    private function buildConfig(string $locale, bool $diarize, int $minSpeakers, int $maxSpeakers): array {
        $features = ['enableAutomaticPunctuation' => true];
        if ($diarize) {
            $lowerBound = max(2, $minSpeakers);
            $features['diarizationConfig'] = [
                'minSpeakerCount' => $lowerBound,
                // Never send an upper bound below the lower bound.
                'maxSpeakerCount' => max($lowerBound, $maxSpeakers),
            ];
        }

        $langCodes = [$locale];
        // Add Nynorsk as secondary when processing Norwegian content
        if ($locale === 'nb-NO') $langCodes[] = 'nn-NO';

        return [
            // Cast to object so it serialises as {} rather than [].
            'autoDecodingConfig' => (object)[],
            'languageCodes'      => $langCodes,
            'model'              => 'long',
            'features'           => $features,
        ];
    }

    /**
     * Perform one HTTP request with curl: a JSON POST when $jsonBody is given, else a GET.
     *
     * @return array{0:int, 1:?string, 2:string} [HTTP status, body or null on transport failure, curl error text]
     */
    private function httpRequest(string $url, int $timeoutSec, ?string $jsonBody = null): array {
        $ch = curl_init($url);
        if ($ch === false) return [0, null, 'curl_init failed'];

        $options = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => $timeoutSec,
        ];
        if ($jsonBody !== null) {
            $options[CURLOPT_POST]       = true;
            $options[CURLOPT_POSTFIELDS] = $jsonBody;
            $options[CURLOPT_HTTPHEADER] = ['Content-Type: application/json'];
        }
        curl_setopt_array($ch, $options);

        $response = curl_exec($ch);
        $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlErr  = curl_error($ch);
        // No curl_close(): on PHP 8 the handle is an object released when $ch goes out of scope.

        return [$httpCode, is_string($response) ? $response : null, $curlErr];
    }

    /**
     * Poll a long-running operation until it completes or the budget runs out.
     *
     * The wait between polls starts at 5s and grows by 5s up to 15s, but is
     * never longer than the time remaining. Transient failures (transport
     * errors, 5xx, 408, 429, unparsable bodies) are retried; other 4xx
     * responses and operations that finish with an error abort immediately.
     *
     * @return array|null The operation's 'response' payload, or null on failure/timeout.
     */
    private function pollOperation(string $operationName, int $timeoutSec): ?array {
        $url      = self::API_BASE . "/{$operationName}?key=" . rawurlencode($this->apiKey);
        $deadline = time() + $timeoutSec;
        $interval = 5;

        while (($remaining = $deadline - time()) > 0) {
            sleep(min($interval, $remaining));
            $interval = min($interval + 5, 15);

            [$httpCode, $response] = $this->httpRequest($url, self::POLL_REQUEST_TIMEOUT_SEC);
            if ($response === null) continue;

            $permanentClientError = $httpCode >= 400 && $httpCode < 500
                && $httpCode !== 408 && $httpCode !== 429;
            if ($permanentClientError) {
                error_log("GcpSpeechClient: operation poll failed with HTTP {$httpCode}: " . substr($response, 0, 300));
                return null;
            }
            if ($httpCode !== 200) continue;

            $data = json_decode($response, true);
            if (!is_array($data) || empty($data['done'])) continue;

            if (!empty($data['error'])) {
                error_log('GcpSpeechClient: operation failed: ' . substr((string)json_encode($data['error']), 0, 300));
                return null;
            }

            $payload = $data['response'] ?? null;
            return is_array($payload) ? $payload : null;
        }

        error_log("GcpSpeechClient: operation timed out after {$timeoutSec}s");
        return null;
    }

    /**
     * Convert the API's result list into the Whisper-compatible shape.
     *
     * Transcript pieces are trimmed and joined with single spaces. Word-level
     * entries, when present, are grouped into segments of consecutive words
     * from the same speaker; speaker labels are renumbered SPEAKER_00,
     * SPEAKER_01, ... in order of first appearance. Duration is the end of the
     * last word, falling back to the largest 'resultEndOffset' among the
     * results when no word timing is available.
     *
     * @param array  $gcpResults The 'results' list of the API response.
     * @param string $locale     Locale the request was made with.
     * @return array{text:string, language:string, duration:float, segments:array, num_speakers:int}|null
     *         Null when no transcript text was found.
     */
    private function normalizeResults(array $gcpResults, string $locale): ?array {
        $transcriptParts = [];
        $allWords        = [];
        $lastResultEnd   = 0.0;

        foreach ($gcpResults as $result) {
            if (!is_array($result)) continue;
            $lastResultEnd = max($lastResultEnd, $this->readOffset($result, 'resultEndOffset', 0.0));

            $alt = $result['alternatives'][0] ?? null;
            if (!is_array($alt)) continue;

            $piece = is_scalar($alt['transcript'] ?? null) ? trim((string)$alt['transcript']) : '';
            if ($piece !== '') $transcriptParts[] = $piece;

            if (!empty($alt['words']) && is_array($alt['words'])) {
                foreach ($alt['words'] as $word) {
                    if (is_array($word)) $allWords[] = $word;
                }
            }
        }

        $transcript = implode(' ', $transcriptParts);
        if ($transcript === '') return null;

        // Group consecutive same-speaker words into segments
        $segments   = [];
        $speakerMap = []; // raw speakerLabel → 'SPEAKER_XX'
        $curSegment = null;

        foreach ($allWords as $word) {
            $rawLabel = is_scalar($word['speakerLabel'] ?? null) ? (string)$word['speakerLabel'] : '';
            if ($rawLabel !== '' && !isset($speakerMap[$rawLabel])) {
                $speakerMap[$rawLabel] = sprintf('SPEAKER_%02d', count($speakerMap));
            }
            $speakerKey = $rawLabel !== '' ? $speakerMap[$rawLabel] : null;

            $start = $this->readOffset($word, 'startOffset', 0.0);
            $end   = $this->readOffset($word, 'endOffset', $start);
            $text  = is_scalar($word['word'] ?? null) ? (string)$word['word'] : '';

            // An unlabelled word never opens a new segment; it joins the current one.
            $newSegment = $curSegment === null
                || ($speakerKey !== null && $speakerKey !== ($curSegment['speaker'] ?? null));

            if ($newSegment) {
                if ($curSegment !== null) $segments[] = $curSegment;
                $curSegment = ['text' => $text, 'start' => round($start, 3), 'end' => round($end, 3)];
                if ($speakerKey !== null) $curSegment['speaker'] = $speakerKey;
            } else {
                $curSegment['text'] .= ' ' . $text;
                $curSegment['end']   = round($end, 3);
            }
        }
        if ($curSegment !== null) $segments[] = $curSegment;

        // Duration from the last word, else from the results' end offsets.
        $durationSec = 0.0;
        if ($allWords) {
            $durationSec = $this->readOffset($allWords[count($allWords) - 1], 'endOffset', 0.0);
        }
        if ($durationSec <= 0.0) $durationSec = $lastResultEnd;

        return [
            'text'         => $transcript,
            'language'     => strtolower(explode('-', $locale)[0]),
            'duration'     => $durationSec,
            'segments'     => $segments,
            'num_speakers' => max(1, count($speakerMap)),
        ];
    }

    /** Read an offset field from a decoded node, or $default when it is absent or not scalar. */
    private function readOffset(array $node, string $key, float $default): float {
        $raw = $node[$key] ?? null;
        return is_scalar($raw) ? $this->offsetToSec((string)$raw) : $default;
    }

    /** Convert GCP offset string like "1.200s" to float seconds. */
    private function offsetToSec(string $offset): float {
        return (float)rtrim($offset, 's');
    }

    /**
     * Map a caller-supplied language to the locale sent to the API.
     *
     * '' selects nb-NO. Known short codes are matched case-insensitively;
     * anything else is passed through unchanged.
     */
    private function resolveLocale(string $language): string {
        if ($language === '') return 'nb-NO';
        return self::LOCALE_MAP[strtolower(trim($language))] ?? $language;
    }
}
|
||||||
Reference in New Issue
Block a user