Transcribe: audio-to-text tool with diarization and speaker role labelling
New sixth tool in the hub. Accepts MP3/WAV/OGG/M4A/FLAC/WEBM up to 200 MB, proxies to Whisper on cuttlefish GPU. Optional speaker separation with LLM role labelling (dommer, advokat, forelder, sakkyndig, etc. via GPT-4o-mini). Client-side TXT / SRT / VTT download from segment data. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,190 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/../includes/LegalTools.php';
|
||||
|
||||
// Endpoint guards: the request must be a POST and must pass the auth check.
// NOTE(review): both helpers come from LegalTools.php and presumably end the
// request themselves on failure — confirm.
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Language hint for Whisper. Anything outside the whitelist falls back to 'auto'.
$validLangs = ['auto', 'no', 'en', 'sv', 'da', 'de', 'fr', 'es', 'pl'];
$rawLanguage = $_POST['language'] ?? 'auto';
// A non-string value (e.g. language[]=x) can never be a valid code; treating it
// as 'auto' avoids the "Array to string conversion" warning a blind cast raises.
$language = is_string($rawLanguage) ? strtolower(trim($rawLanguage)) : 'auto';
if (!in_array($language, $validLangs, true)) {
    $language = 'auto';
}

// empty() already treats a missing key, '' and '0' as "off", so no extra
// comparison against '0' is needed.
$diarize = !empty($_POST['diarize']);

// Optional speaker-count hint, clamped to the range 0..20. Missing or
// non-scalar input yields 0.
$rawSpeakers = $_POST['num_speakers'] ?? 0;
$numSpeakers = is_scalar($rawSpeakers) ? max(0, min(20, (int)$rawSpeakers)) : 0;
|
||||
|
||||
// ── Validate upload ───────────────────────────────────────────────────────────
// NOTE(review): this section relies on dbnToolsError() terminating the request
// (it is defined in LegalTools.php, not visible here) — confirm.

$upload = $_FILES['audio'] ?? null;
$uploadError = is_array($upload) ? ($upload['error'] ?? -1) : -1;

// When the field is submitted as audio[] PHP reports 'error' (and every other
// entry) as an array. Using that array as a lookup key below would throw a
// TypeError, so reject the request up front with a regular 400.
if (!is_int($uploadError)) {
    dbnToolsError('Invalid upload: send exactly one audio file.', 400, 'upload_error');
}

if ($uploadError !== UPLOAD_ERR_OK) {
    $code = $uploadError;
    $map = [
        UPLOAD_ERR_INI_SIZE  => 'File exceeds server upload limit.',
        UPLOAD_ERR_FORM_SIZE => 'File exceeds form size limit.',
        UPLOAD_ERR_PARTIAL   => 'File was only partially uploaded.',
        UPLOAD_ERR_NO_FILE   => 'No audio file received.',
    ];
    // Codes without a dedicated message (including -1 for a missing field)
    // fall through to the generic text.
    dbnToolsError($map[$code] ?? "Upload error (code {$code}).", 400, 'upload_error');
}

$file = $upload;
$maxBytes = 200 * 1024 * 1024;

if ((int)($file['size'] ?? 0) > $maxBytes) {
    dbnToolsError('File too large. Maximum 200 MB.', 413, 'file_too_large');
}

// The format check is based on the client-supplied file extension only.
$allowedExts = ['mp3', 'wav', 'ogg', 'oga', 'm4a', 'mp4', 'flac', 'webm', 'aac'];
$ext = strtolower(pathinfo((string)($file['name'] ?? ''), PATHINFO_EXTENSION));
if (!in_array($ext, $allowedExts, true)) {
    dbnToolsError("Unsupported format: .{$ext}. Use MP3, WAV, OGG, M4A, FLAC, or WebM.", 415, 'unsupported_format');
}
|
||||
|
||||
// ── Build Whisper request ─────────────────────────────────────────────────────

// NOTE(review): the audio is sent over plain HTTP to a hard-coded address;
// consider moving the base URL to configuration and using TLS.
$whisperBase = 'http://194.93.49.14:20019';
$endpoint = $whisperBase . ($diarize ? '/transcribe/diarize' : '/transcribe');

// The client-supplied filename ends up inside a multipart header. Replace
// control characters (CR/LF would allow header injection), double quotes and
// backslashes; addslashes() alone does not neutralise line breaks.
$uploadName = (string)preg_replace('/[\x00-\x1F\x7F"\\\\]+/', '_', basename((string)$file['name']));
if ($uploadName === '') {
    $uploadName = 'audio';
}

if (!is_readable($file['tmp_name'])) {
    dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
}

// Optional form fields, sent after the file part in this order.
$formFields = [];
if ($language !== 'auto') {
    $formFields['language'] = $language;
}
if ($diarize && $numSpeakers > 1) {
    $formFields['num_speakers'] = (string)$numSpeakers;
}

$useCurl = function_exists('curl_init');
$boundary = '';
$body = '';

if (!$useCurl) {
    // Fallback only: the stream wrapper needs the complete multipart body as a
    // string, so the whole upload has to be held in memory here.
    $boundary = '----DBN' . bin2hex(random_bytes(8));

    $fileContents = file_get_contents($file['tmp_name']);
    if ($fileContents === false) {
        dbnToolsError('Could not read uploaded file.', 500, 'file_read_error');
    }

    $body = "--{$boundary}\r\n"
        . "Content-Disposition: form-data; name=\"file\"; filename=\"{$uploadName}\"\r\n"
        . "Content-Type: application/octet-stream\r\n\r\n"
        . $fileContents . "\r\n";
    unset($fileContents); // drop the second copy of the audio as early as possible

    foreach ($formFields as $fieldName => $fieldValue) {
        $body .= "--{$boundary}\r\n"
            . "Content-Disposition: form-data; name=\"{$fieldName}\"\r\n\r\n"
            . $fieldValue . "\r\n";
    }

    $body .= "--{$boundary}--\r\n";
}

// ── Call Whisper ──────────────────────────────────────────────────────────────

$t0 = microtime(true);
$httpCode = 0;

if ($useCurl) {
    $ch = curl_init($endpoint);
    if ($ch === false) {
        dbnToolsError('Whisper service error (HTTP 0): could not initialise cURL', 502, 'whisper_error');
    }

    // CURLFile lets cURL stream the upload from disk instead of requiring the
    // (up to 200 MB) file to be loaded into a PHP string. cURL generates the
    // multipart boundary and Content-Type header itself for array POSTFIELDS.
    $postFields = ['file' => new CURLFile($file['tmp_name'], 'application/octet-stream', $uploadName)]
        + $formFields;

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $postFields,
        CURLOPT_HTTPHEADER     => ['Accept: application/json'],
        // Fail fast when the host is down instead of waiting out the full timeout.
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_TIMEOUT        => 600,
    ]);
    $whisperBody = curl_exec($ch);
    $httpCode = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr = curl_error($ch);
    unset($ch);

    if ($whisperBody === false || $httpCode !== 200) {
        dbnToolsError('Whisper service error (HTTP ' . $httpCode . '): ' . $curlErr, 502, 'whisper_error');
    }
} else {
    $ctx = stream_context_create([
        'http' => [
            'method'        => 'POST',
            'timeout'       => 600,
            'header'        => "Content-Type: multipart/form-data; boundary={$boundary}\r\nAccept: application/json\r\n",
            'content'       => $body,
            'ignore_errors' => true,
        ],
    ]);
    // Warnings are suppressed on purpose; failure is reported through the
    // false return value checked below.
    $whisperBody = @file_get_contents($endpoint, false, $ctx);
    unset($body);

    if ($whisperBody === false) {
        dbnToolsError('Whisper service unreachable. The GPU may be offline.', 502, 'whisper_unreachable');
    }

    // 'ignore_errors' returns the body for non-2xx responses too, so read the
    // status line explicitly. With redirects several status lines are present;
    // the last one belongs to the final response.
    $responseHeaders = function_exists('http_get_last_response_headers')
        ? (http_get_last_response_headers() ?? [])
        : ($http_response_header ?? []);
    foreach ($responseHeaders as $headerLine) {
        if (preg_match('#^HTTP/\S+\s+(\d{3})#', (string)$headerLine, $statusMatch) === 1) {
            $httpCode = (int)$statusMatch[1];
        }
    }
    if ($httpCode !== 0 && $httpCode !== 200) {
        dbnToolsError('Whisper service error (HTTP ' . $httpCode . ').', 502, 'whisper_error');
    }
}

$latencyMs = (int)round((microtime(true) - $t0) * 1000);

$whisper = json_decode($whisperBody, true);
// Require a non-empty string transcript. empty() would also reject the valid
// transcript "0" and would let non-string values through.
$transcriptText = is_array($whisper) ? ($whisper['text'] ?? null) : null;
if (!is_string($transcriptText) || $transcriptText === '') {
    dbnToolsError('Empty or invalid response from Whisper.', 502, 'whisper_empty');
}
|
||||
|
||||
// ── Speaker role labelling ────────────────────────────────────────────────────

// Use the segment list only when Whisper returned one as an array.
$segments = [];
if (isset($whisper['segments']) && is_array($whisper['segments'])) {
    $segments = $whisper['segments'];
}

// Speaker count as reported by Whisper, defaulting to a single speaker.
$numDetected = (int)($whisper['num_speakers'] ?? 1);

// When fewer than two speakers are reported, count the distinct non-empty
// speaker labels in the segments and use that count if it is larger than one.
if ($segments && $numDetected < 2) {
    $speakerLabels = array_column($segments, 'speaker');
    $distinctCount = count(array_filter(array_unique($speakerLabels)));
    if ($distinctCount > 1) {
        $numDetected = $distinctCount;
    }
}

// Role labelling runs only for diarized requests with more than one speaker
// and at least one segment; otherwise the value stays null.
$speakerRoles = ($diarize && $numDetected > 1 && $segments)
    ? dbnLabelSpeakerRoles($segments)
    : null;
|
||||
|
||||
// ── Respond ───────────────────────────────────────────────────────────────────

// Log metadata for this successful run: tool name, language hint and latency.
$logEntry = [
    'tool'       => 'transcribe',
    'language'   => $language,
    'ok'         => true,
    'latency_ms' => $latencyMs,
];
dbnToolsLogMetadata($logEntry);

// Values taken from the Whisper response, with fallbacks when a key is absent.
$reportedLanguage = $whisper['language'] ?? $language;
$reportedDuration = $whisper['duration_seconds'] ?? 0;
$reportedModel = $whisper['model'] ?? 'whisper';

$payload = [
    'ok'            => true,
    'tool'          => 'transcribe',
    'transcript'    => (string)$whisper['text'],
    'segments'      => $segments,
    'speaker_roles' => $speakerRoles,
    'num_speakers'  => $numDetected,
    'language'      => (string)$reportedLanguage,
    'duration_sec'  => round((float)$reportedDuration, 2),
    'model'         => (string)$reportedModel,
    'latency_ms'    => $latencyMs,
];
dbnToolsRespond($payload);
|
||||
|
||||
// ── Speaker role labelling helper ─────────────────────────────────────────────

/**
 * Asks the chat gateway to guess the role of each diarized speaker.
 *
 * Only the first 20 segments that carry a usable speaker label are sent as
 * context. Labelling is best-effort: any failure (gateway construction, the
 * chat call, unparsable output) yields an empty array instead of an exception.
 *
 * @param array $segments Transcript segments; entries used here are arrays
 *                        with a scalar 'speaker' and an optional 'text'.
 * @return array<string, string> Map of speaker label to role name, restricted
 *                               to speakers that appeared in the sampled
 *                               segments; empty when nothing could be labelled.
 */
function dbnLabelSpeakerRoles(array $segments): array
{
    // Skip malformed entries so the string interpolation below cannot hit a
    // non-array segment or a non-scalar speaker label.
    $sample = array_slice(
        array_values(array_filter(
            $segments,
            static fn ($s): bool => is_array($s) && isset($s['speaker']) && is_scalar($s['speaker']),
        )),
        0,
        20,
    );
    if ($sample === []) {
        return [];
    }

    $lines = array_map(
        static fn (array $s): string => "[{$s['speaker']}] "
            . (is_scalar($s['text'] ?? null) ? trim((string)$s['text']) : ''),
        $sample,
    );
    $knownSpeakers = array_map(
        static fn (array $s): string => (string)$s['speaker'],
        $sample,
    );

    $system = 'You are analyzing a legal proceeding transcript. '
        . 'Based on the first segments, identify the role of each speaker. '
        . 'Common roles in Norwegian legal proceedings: dommer (judge), advokat (lawyer), '
        . 'forelder (parent), barn (child), sakkyndig (expert witness), '
        . 'saksbehandler (caseworker), tolk (interpreter), vitne (witness), '
        . 'prosessfullmektig (counsel). '
        . 'Return ONLY valid JSON: {"SPEAKER_00":"dommer","SPEAKER_01":"forelder"}. '
        . 'Use Norwegian role names. Use "ukjent" if role cannot be determined. '
        . 'Only include speakers present in the input.';

    try {
        // Constructed inside the try block so that a gateway set-up failure
        // degrades to "no roles" rather than aborting the whole transcription.
        $azure = new DbnAzureOpenAiGateway();
        $text = $azure->chatText([
            ['role' => 'system', 'content' => $system],
            ['role' => 'user',   'content' => implode("\n", $lines)],
        ], ['temperature' => 0.1, 'max_tokens' => 200]);

        // Strip optional Markdown code fences around the JSON answer.
        $cleaned = preg_replace('/^```(?:json)?\s*|\s*```$/m', '', trim((string)$text));
        $decoded = json_decode((string)$cleaned, true);
    } catch (Throwable) {
        // Deliberate best-effort: role labels are optional extra information.
        return [];
    }

    if (!is_array($decoded)) {
        return [];
    }

    // The model output is untrusted: keep only non-empty string roles for
    // speakers that were actually part of the prompt.
    $roles = [];
    foreach ($decoded as $speaker => $role) {
        if (!is_string($role) || !in_array((string)$speaker, $knownSpeakers, true)) {
            continue;
        }
        $role = trim($role);
        if ($role !== '') {
            $roles[(string)$speaker] = $role;
        }
    }

    return $roles;
}
|
||||
Reference in New Issue
Block a user