Files
dobetternorge-tools/api/transcribe.php
T
daveadmin 26f4e2231b feat(transcribe): Norwegian defaults, vocabulary presets, multi-file court day queue
- Default language → nb (Bokmål); auto-detect demoted with warning note
- Default model → large-v3; VAD filter on by default
- Vocabulary prompt promoted to main form with 4 preset buttons
  (Barnerett/CPS, Rettssak/tingrett, Generell norsk, Egendefinert)
- Multi-file upload queue: drop/select multiple clips, numbered list UI
- Sequential queue processing with cumulative time_offset per clip
- Backend shifts segment timestamps so SRT/VTT covers full court day
- Merged transcript + segments across all clips for single download

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 22:20:11 +02:00

386 lines
16 KiB
PHP

<?php
declare(strict_types=1);
require_once __DIR__ . '/../includes/LegalTools.php';
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();
// ── Common params ─────────────────────────────────────────────────────────────
// Every option is read from the POST body and clamped to a known value;
// unrecognised input falls back to the default instead of raising an error.
$validLangs = ['auto', 'no', 'nn', 'en', 'sv', 'da', 'de', 'fr', 'es', 'pl', 'fi', 'nl', 'it', 'pt', 'ru', 'ar', 'tr', 'zh', 'ja', 'ko'];
$language = strtolower(trim((string)($_POST['language'] ?? 'auto')));
// 'nb' (Bokmål) is accepted as an alias of 'no'. Without this it is not in the
// whitelist and would silently degrade to auto-detect; transcribeViaAzure()
// already treats 'no' and 'nb' as the same locale.
if ($language === 'nb') {
    $language = 'no';
}
if (!in_array($language, $validLangs, true)) {
    $language = 'auto';
}
$diarize = !empty($_POST['diarize']) && $_POST['diarize'] !== '0';
// Speaker-count hint, clamped to 0..20 (0 = not supplied).
$numSpeakers = isset($_POST['num_speakers']) ? max(0, min(20, (int)$_POST['num_speakers'])) : 0;
$engine = in_array($_POST['engine'] ?? '', ['gpu', 'openai', 'azure'], true) ? $_POST['engine'] : 'gpu';
$validModels = ['tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3'];
$model = in_array($_POST['model'] ?? '', $validModels, true) ? $_POST['model'] : 'small';
$beamSize = max(1, min(5, (int)($_POST['beam_size'] ?? 5)));
$task = ($_POST['task'] ?? 'transcribe') === 'translate' ? 'translate' : 'transcribe';
$vadFilter = !empty($_POST['vad_filter']) && $_POST['vad_filter'] !== '0';
$initPrompt = trim((string)($_POST['initial_prompt'] ?? ''));
// Cap the vocabulary prompt at 500 bytes. mb_strcut() cuts on a character
// boundary, so a multi-byte letter (æ, ø, å …) is never split into invalid
// UTF-8; plain substr() remains the fallback when mbstring is not loaded.
$initPrompt = function_exists('mb_strcut')
    ? mb_strcut($initPrompt, 0, 500, 'UTF-8')
    : substr($initPrompt, 0, 500);
// ── Validate upload ───────────────────────────────────────────────────────────
// NOTE(review): this section relies on dbnToolsError() terminating the request —
// confirm in LegalTools.php.
$upload = $_FILES['audio'] ?? null;
// A field submitted as "audio[]" makes PHP report name/error/size as arrays.
// Reject that shape explicitly: using the array as an offset into the message
// map below would raise a TypeError on PHP 8 instead of a clean 400.
if (is_array($upload) && is_array($upload['error'] ?? null)) {
    dbnToolsError('Upload error: send exactly one audio file per request.', 400, 'upload_error');
}
if (empty($upload) || $upload['error'] !== UPLOAD_ERR_OK) {
    $code = $upload['error'] ?? -1;
    // Friendly messages for the common PHP upload error codes.
    $map = [
        UPLOAD_ERR_INI_SIZE  => 'File exceeds server upload limit.',
        UPLOAD_ERR_FORM_SIZE => 'File exceeds form size limit.',
        UPLOAD_ERR_PARTIAL   => 'File was only partially uploaded.',
        UPLOAD_ERR_NO_FILE   => 'No audio file received.',
    ];
    dbnToolsError($map[$code] ?? "Upload error (code {$code}).", 400, 'upload_error');
}
$file = $_FILES['audio'];
// Hard ceiling applied to every engine.
$maxBytes = 200 * 1024 * 1024;
if ($file['size'] > $maxBytes) {
    dbnToolsError('File too large. Maximum 200 MB.', 413, 'file_too_large');
}
// The format check is based on the client-supplied file extension only.
$allowedExts = ['mp3', 'wav', 'ogg', 'oga', 'm4a', 'mp4', 'flac', 'webm', 'aac'];
$ext = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
if (!in_array($ext, $allowedExts, true)) {
    dbnToolsError("Unsupported format: .{$ext}. Use MP3, WAV, OGG, M4A, FLAC, or WebM.", 415, 'unsupported_format');
}
// OpenAI has a 25 MB file limit
if ($engine === 'openai' && $file['size'] > 25 * 1024 * 1024) {
    dbnToolsError('OpenAI Whisper API has a 25 MB file limit. Use the GPU engine for larger files.', 413, 'openai_file_too_large');
}
// Seconds added to every segment timestamp when this clip is one part of a
// longer multi-clip session. Negative values mean "no offset", as before.
// Non-finite values are rejected too: "1e999" casts to INF, which would turn
// every shifted timestamp into INF (a value json_encode() refuses to encode).
$timeOffset = (float)($_POST['time_offset'] ?? 0);
if (!is_finite($timeOffset) || $timeOffset < 0.0) {
    $timeOffset = 0.0;
}
$t0 = microtime(true);
// ── Route to engine ───────────────────────────────────────────────────────────
if ($engine === 'openai') {
    $apiKey = trim((string)($_POST['openai_key'] ?? ''));
    if (!$apiKey || !str_starts_with($apiKey, 'sk-')) {
        dbnToolsError('A valid OpenAI API key (sk-…) is required for the OpenAI engine.', 400, 'missing_openai_key');
    }
    $result = transcribeViaOpenAI($file, $language, $task, $apiKey);
} elseif ($engine === 'azure') {
    $apiKey = trim((string)($_POST['azure_key'] ?? ''));
    // The region becomes part of the request host name, so keep only [a-z0-9].
    $region = (string)preg_replace('/[^a-z0-9]/', '', strtolower(trim((string)($_POST['azure_region'] ?? 'norwayeast'))));
    // A blank form field arrives as '' (not null) and therefore bypasses the ??
    // default above; fall back here so the host never becomes "https://.stt.…".
    if ($region === '') {
        $region = 'norwayeast';
    }
    if (!$apiKey) {
        dbnToolsError('An Azure Speech API key is required for the Azure engine.', 400, 'missing_azure_key');
    }
    $result = transcribeViaAzure($file, $language, $apiKey, $region, $diarize);
} else {
    // GPU (default)
    $result = transcribeViaWhisperGpu($file, $language, $diarize, $numSpeakers, $model, $beamSize, $task, $vadFilter, $initPrompt);
}
// Wall-clock time spent in the selected engine, in whole milliseconds.
$elapsedSec = microtime(true) - $t0;
$latencyMs  = (int)round($elapsedSec * 1000);
// ── Shift segment timestamps for multi-clip sessions ─────────────────────────
// Each segment's start/end is moved forward by the clip's offset and rounded
// to millisecond precision; missing start/end values count as 0.
$hasSegments = !empty($result['segments']);
if ($hasSegments && $timeOffset > 0.0) {
    foreach ($result['segments'] as $idx => $unused) {
        $shiftedStart = round(($result['segments'][$idx]['start'] ?? 0) + $timeOffset, 3);
        $result['segments'][$idx]['start'] = $shiftedStart;
        $shiftedEnd = round(($result['segments'][$idx]['end'] ?? 0) + $timeOffset, 3);
        $result['segments'][$idx]['end'] = $shiftedEnd;
    }
}
// ── Speaker role labelling (GPU + diarize only) ───────────────────────────────
$segments    = $result['segments'] ?? [];
$numDetected = (int)($result['num_speakers'] ?? 1);
// When the engine reports fewer than two speakers, count the distinct
// non-empty speaker labels on the segments and use that if it is higher.
if ($segments && $numDetected < 2) {
    $speakerLabels  = array_column($segments, 'speaker');
    $distinctLabels = array_filter(array_unique($speakerLabels));
    $distinctCount  = count($distinctLabels);
    if ($distinctCount > 1) {
        $numDetected = $distinctCount;
    }
}
// Roles are only inferred for diarized requests with more than one speaker;
// otherwise the field stays null.
$speakerRoles = ($diarize && $numDetected > 1 && $segments)
    ? dbnLabelSpeakerRoles($segments)
    : null;
// ── Log + respond ─────────────────────────────────────────────────────────────
// Metadata record for this request: engine settings and timing only.
$logEntry = [
    'tool'       => 'transcribe',
    'engine'     => $engine,
    'model'      => $model,
    'language'   => $language,
    'ok'         => true,
    'latency_ms' => $latencyMs,
];
dbnToolsLogMetadata($logEntry);

// When the engine did not name its model, report the requested Whisper model
// for the GPU engine and the engine name for the hosted ones.
$fallbackModel = $engine === 'gpu' ? $model : $engine;
$durationSec   = round((float)($result['duration_seconds'] ?? 0), 2);
$processingSec = round((float)($result['processing_seconds'] ?? 0), 2);

$payload = [
    'ok'            => true,
    'tool'          => 'transcribe',
    'transcript'    => (string)($result['text'] ?? ''),
    'segments'      => $segments,
    'speaker_roles' => $speakerRoles,
    'num_speakers'  => $numDetected,
    'language'      => (string)($result['language'] ?? $language),
    'duration_sec'  => $durationSec,
    'processing_sec'=> $processingSec,
    'model'         => (string)($result['model'] ?? $fallbackModel),
    'engine'        => $engine,
    'latency_ms'    => $latencyMs,
];
dbnToolsRespond($payload);
// ── Engine implementations ────────────────────────────────────────────────────
/**
 * Sends the uploaded audio to the self-hosted Whisper HTTP service and returns
 * its decoded JSON response.
 *
 * The upload is attached as a CURLFile so cURL streams it from disk. Building
 * the multipart body by hand required the whole file in memory twice (once
 * read, once concatenated), which is fragile for uploads of up to 200 MB.
 *
 * Failures are reported through dbnToolsError().
 * NOTE(review): this function assumes dbnToolsError() ends the request — confirm.
 * NOTE(review): the service address is a hard-coded IP reached over plain HTTP;
 * consider moving it to configuration and protecting the transport.
 *
 * @param array  $file        One $_FILES entry; 'tmp_name' and 'name' are used.
 * @param string $language    Language code, or 'auto' to omit the field.
 * @param bool   $diarize     Use the /transcribe/diarize endpoint when true.
 * @param int    $numSpeakers Speaker-count hint; sent only when diarizing and > 1.
 * @param string $model       Whisper model name.
 * @param int    $beamSize    Beam size forwarded to the service.
 * @param string $task        'transcribe' or 'translate'.
 * @param bool   $vadFilter   Forwarded as '1' / '0'.
 * @param string $initPrompt  Optional vocabulary prompt; omitted when empty.
 *
 * @return array Decoded JSON from the service; always has a non-empty 'text'.
 */
function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, int $numSpeakers,
                                 string $model, int $beamSize, string $task,
                                 bool $vadFilter, string $initPrompt): array
{
    $whisperBase = 'http://194.93.49.14:20019';
    $endpoint    = $diarize ? $whisperBase . '/transcribe/diarize' : $whisperBase . '/transcribe';

    if (!is_readable($file['tmp_name'])) {
        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
    }

    $fields = [
        'model'          => $model,
        'beam_size'      => (string)$beamSize,
        'task'           => $task,
        'vad_filter'     => $vadFilter ? '1' : '0',
        'initial_prompt' => $initPrompt,
    ];
    if ($language !== 'auto') {
        $fields['language'] = $language;
    }
    if ($diarize && $numSpeakers > 1) {
        $fields['num_speakers'] = (string)$numSpeakers;
    }

    // Empty values are left out of the request entirely, as before.
    $fields = array_filter($fields, static fn(string $value): bool => $value !== '');

    // Passing an array lets cURL build the multipart body (and its Content-Type
    // header with boundary) itself and escape the client-supplied file name.
    $postFields = ['file' => new CURLFile(
        $file['tmp_name'],
        'application/octet-stream',
        basename($file['name'])
    )] + $fields;

    $ch = curl_init($endpoint);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $postFields,
        CURLOPT_HTTPHEADER     => ['Accept: application/json'],
        CURLOPT_TIMEOUT        => 600,
    ]);
    $responseBody = curl_exec($ch);
    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr      = curl_error($ch);
    curl_close($ch);

    if ($responseBody === false || $httpCode !== 200) {
        // Prefer the transport error; otherwise a tag-stripped excerpt of the body.
        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
        dbnToolsError('Whisper service error (HTTP ' . $httpCode . '): ' . $detail, 502, 'whisper_error');
    }

    $data = json_decode($responseBody, true);
    if (!is_array($data) || empty($data['text'])) {
        dbnToolsError('Empty or invalid response from Whisper.', 502, 'whisper_empty');
    }
    return $data;
}
/**
 * Transcribes (or translates to English) the uploaded audio with OpenAI's
 * hosted whisper-1 model and normalises the reply to this file's internal shape.
 *
 * OpenAI exposes translation as a separate endpoint (/v1/audio/translations);
 * the transcriptions endpoint has no "task" field, so a translate request sent
 * there comes back as a plain transcription. The translations endpoint takes no
 * language parameter, so the language hint is sent for transcriptions only.
 *
 * Failures are reported through dbnToolsError().
 * NOTE(review): this function assumes dbnToolsError() ends the request — confirm.
 *
 * @param array  $file     One $_FILES entry; 'tmp_name' and 'name' are used.
 * @param string $language Language code, or 'auto' to let the API detect it.
 * @param string $task     'translate' selects the translations endpoint.
 * @param string $apiKey   OpenAI API key, sent as a Bearer token.
 *
 * @return array{text:string,language:string,duration_seconds:float,processing_seconds:int,segments:array,model:string}
 */
function transcribeViaOpenAI(array $file, string $language, string $task, string $apiKey): array
{
    // Same guard as the other engines: never post an unreadable/empty upload.
    if (!is_readable($file['tmp_name'])) {
        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
    }

    $translate = $task === 'translate';
    $endpoint  = 'https://api.openai.com/v1/audio/' . ($translate ? 'translations' : 'transcriptions');

    // cURL builds the multipart body from this array and streams the file from
    // disk; the original file name is kept because it carries the extension.
    $postFields = [
        'file'            => new CURLFile($file['tmp_name'], 'application/octet-stream', basename($file['name'])),
        'model'           => 'whisper-1',
        'response_format' => 'verbose_json',
    ];
    if (!$translate && $language !== 'auto') {
        $postFields['language'] = $language;
    }

    $ch = curl_init($endpoint);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $postFields,
        CURLOPT_HTTPHEADER     => [
            "Authorization: Bearer {$apiKey}",
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 300,
    ]);
    $responseBody = curl_exec($ch);
    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr      = curl_error($ch);
    curl_close($ch);

    if ($responseBody === false || $httpCode !== 200) {
        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
        dbnToolsError('OpenAI API error (HTTP ' . $httpCode . '): ' . $detail, 502, 'openai_error');
    }

    $data = json_decode($responseBody, true);
    if (!is_array($data)) {
        dbnToolsError('Invalid response from OpenAI.', 502, 'openai_empty');
    }

    // Anything other than a list of segments is treated as "no segments".
    $rawSegments = is_array($data['segments'] ?? null) ? $data['segments'] : [];

    // Normalise to internal shape. The API does not diarize, so every segment
    // is attributed to a single placeholder speaker.
    return [
        'text'               => (string)($data['text'] ?? ''),
        'language'           => (string)($data['language'] ?? $language),
        'duration_seconds'   => (float)($data['duration'] ?? 0),
        'processing_seconds' => 0,
        'segments'           => array_map(fn($s) => [
            'id'      => $s['id'] ?? 0,
            'start'   => $s['start'] ?? 0,
            'end'     => $s['end'] ?? 0,
            'text'    => $s['text'] ?? '',
            'speaker' => 'SPEAKER_00',
        ], $rawSegments),
        'model'              => 'openai/whisper-1',
    ];
}
/**
 * Transcribes the uploaded audio with the Azure Speech REST endpoint
 * (/speech/recognition/conversation/cognitiveservices/v1) and normalises the
 * reply to this file's internal shape.
 *
 * The audio is posted as the raw request body in one synchronous call; this is
 * not the asynchronous Batch Transcription API.
 * NOTE(review): Microsoft documents this endpoint as the REST API for short
 * audio (about 60 seconds, WAV/PCM or OGG/Opus). Confirm how longer clips and
 * MP3/M4A/FLAC uploads behave before relying on it for full recordings.
 * NOTE(review): the whole file is read into memory before sending.
 * NOTE(review): this function assumes dbnToolsError() ends the request — confirm.
 *
 * @param array  $file     One $_FILES entry; 'tmp_name' and 'name' are used.
 * @param string $language Two-letter code from the request; 'auto' and unmapped
 *                         codes fall back to nb-NO.
 * @param string $apiKey   Azure Speech subscription key.
 * @param string $region   Azure region, used as the host-name prefix.
 * @param bool   $diarize  Currently unused; every segment is SPEAKER_00.
 *
 * @return array{text:string,language:string,duration_seconds:int,processing_seconds:int,segments:array,model:string}
 */
function transcribeViaAzure(array $file, string $language, string $apiKey,
                            string $region, bool $diarize): array
{
    // Map the request's language code to an Azure locale. Every code accepted by
    // the endpoint's whitelist needs an entry here; previously ru/ar/tr/zh/ja/ko
    // fell through to the Norwegian default and were recognised as Norwegian.
    $langCode = match($language) {
        'no', 'nb' => 'nb-NO',
        'nn' => 'nn-NO',
        'en' => 'en-US',
        'sv' => 'sv-SE',
        'da' => 'da-DK',
        'de' => 'de-DE',
        'fr' => 'fr-FR',
        'es' => 'es-ES',
        'pl' => 'pl-PL',
        'fi' => 'fi-FI',
        'nl' => 'nl-NL',
        'it' => 'it-IT',
        'pt' => 'pt-PT',
        'ru' => 'ru-RU',
        'ar' => 'ar-SA',
        'tr' => 'tr-TR',
        'zh' => 'zh-CN',
        'ja' => 'ja-JP',
        'ko' => 'ko-KR',
        default => 'nb-NO',
    };

    // Mime type map, keyed by the client-supplied file extension.
    $mimeMap = [
        'wav' => 'audio/wav', 'mp3' => 'audio/mpeg', 'ogg' => 'audio/ogg',
        'oga' => 'audio/ogg', 'm4a' => 'audio/mp4', 'mp4' => 'audio/mp4',
        'flac' => 'audio/flac', 'webm' => 'audio/webm', 'aac' => 'audio/aac',
    ];
    $fileExt  = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
    $mimeType = $mimeMap[$fileExt] ?? 'audio/wav';

    $endpoint = "https://{$region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1"
        . "?language={$langCode}&format=detailed";

    $fileContents = file_get_contents($file['tmp_name']);
    if ($fileContents === false) {
        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
    }

    $ch = curl_init($endpoint);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $fileContents,
        CURLOPT_HTTPHEADER     => [
            "Ocp-Apim-Subscription-Key: {$apiKey}",
            "Content-Type: {$mimeType}",
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 300,
    ]);
    $responseBody = curl_exec($ch);
    $httpCode     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr      = curl_error($ch);
    curl_close($ch);

    if ($responseBody === false || $httpCode !== 200) {
        $detail = $curlErr ?: (is_string($responseBody) ? substr(strip_tags($responseBody), 0, 300) : '');
        dbnToolsError('Azure Speech error (HTTP ' . $httpCode . '): ' . $detail, 502, 'azure_error');
    }

    $data = json_decode($responseBody, true);
    if (!is_array($data) || empty($data['DisplayText'])) {
        dbnToolsError('Empty or invalid response from Azure Speech.', 502, 'azure_empty');
    }

    // Normalise to internal shape: one segment per word of the best hypothesis.
    // Offset and Duration are divided by 10,000,000 to express them in seconds.
    $text = (string)($data['DisplayText'] ?? '');
    $segs = [];
    foreach (($data['NBest'][0]['Words'] ?? []) as $i => $word) {
        $segs[] = [
            'id'      => $i,
            'start'   => round((float)($word['Offset'] ?? 0) / 10_000_000, 3),
            'end'     => round(((float)($word['Offset'] ?? 0) + (float)($word['Duration'] ?? 0)) / 10_000_000, 3),
            'text'    => (string)($word['Word'] ?? ''),
            'speaker' => 'SPEAKER_00',
        ];
    }

    return [
        'text'               => $text,
        'language'           => $langCode,
        'duration_seconds'   => 0,
        'processing_seconds' => 0,
        'segments'           => $segs,
        'model'              => "azure/{$langCode}",
    ];
}
/**
 * Asks the chat model to guess each speaker's courtroom role from the first
 * speaker-tagged segments of a transcript.
 *
 * This is best effort: any failure (gateway error, unparsable reply) yields an
 * empty array rather than an error. The model's reply is treated as untrusted
 * and reduced to a flat map of speaker labels that were actually sent to it,
 * each with a non-empty string role, before it is returned.
 *
 * @param array $segments Transcript segments; entries without a scalar
 *                        'speaker' value are ignored.
 *
 * @return array<string,string> Speaker label => Norwegian role name; empty when
 *                              nothing could be determined.
 */
function dbnLabelSpeakerRoles(array $segments): array
{
    // Only the first 20 speaker-tagged segments are shown to the model.
    $sample = array_slice(
        array_values(array_filter(
            $segments,
            fn($s) => is_array($s) && isset($s['speaker']) && is_scalar($s['speaker'])
        )),
        0, 20
    );
    if (!$sample) return [];

    $lines = array_map(fn($s) => "[{$s['speaker']}] " . trim((string)($s['text'] ?? '')), $sample);
    // Labels the model was shown; used below to discard invented keys.
    $knownSpeakers = array_flip(array_map(fn($s) => (string)$s['speaker'], $sample));

    $azure  = new DbnAzureOpenAiGateway();
    $system = 'You are analyzing a legal proceeding transcript. '
        . 'Based on the first segments, identify the role of each speaker. '
        . 'Common roles in Norwegian legal proceedings: dommer (judge), advokat (lawyer), '
        . 'forelder (parent), barn (child), sakkyndig (expert witness), '
        . 'saksbehandler (caseworker), tolk (interpreter), vitne (witness), '
        . 'prosessfullmektig (counsel). '
        . 'Return ONLY valid JSON: {"SPEAKER_00":"dommer","SPEAKER_01":"forelder"}. '
        . 'Use Norwegian role names. Use "ukjent" if role cannot be determined. '
        . 'Only include speakers present in the input.';
    try {
        $text = $azure->chatText([
            ['role' => 'system', 'content' => $system],
            ['role' => 'user', 'content' => implode("\n", $lines)],
        ], ['temperature' => 0.1, 'max_tokens' => 200]);
        // Strip a Markdown code fence if the model wrapped its JSON in one.
        $cleaned = preg_replace('/^```(?:json)?\s*|\s*```$/m', '', trim($text));
        $json    = json_decode($cleaned, true);
        if (!is_array($json)) {
            return [];
        }

        $roles = [];
        foreach ($json as $speaker => $role) {
            if (!is_string($role) || !isset($knownSpeakers[$speaker])) {
                continue;
            }
            $role = trim($role);
            if ($role !== '') {
                $roles[$speaker] = $role;
            }
        }
        return $roles;
    } catch (Throwable) {
        // Deliberate: role labels are optional and must never fail the request.
        return [];
    }
}