feat(transcribe): GPT cleanup pass + advanced options i18n

Adds optional post-transcription cleanup via GPT-4o/GPT-4o-mini to fix
mishearing errors, punctuation, and domain terms. Speaker role labelling
now accepts a deployment param. Adds i18n strings for advanced options
panel (task, VAD filter, Whisper model, AI cleanup) in all four languages.
Updates BvjAnalyzerAgent and DeepResearchAgent.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-18 07:23:01 +02:00
parent e32ee60e78
commit c4362738c1
5 changed files with 345 additions and 112 deletions
+57 -11
View File
@@ -27,6 +27,11 @@ $task = ($_POST['task'] ?? 'transcribe') === 'translate' ? 'translate' :
$vadFilter = !empty($_POST['vad_filter']) && $_POST['vad_filter'] !== '0';
$initPrompt = substr(trim((string)($_POST['initial_prompt'] ?? '')), 0, 500);
// Whitelist of cleanup deployments the client may request; '' means "no cleanup".
// Any value outside the list (or a missing field) collapses to '' so an arbitrary
// deployment name from the request never reaches the gateway.
$allowedPostModels = ['', 'gpt-4o-mini', 'gpt-4o'];
$postModel = in_array($_POST['post_model'] ?? '', $allowedPostModels, true)
? (string)($_POST['post_model'] ?? '')
: '';
// ── Validate upload ───────────────────────────────────────────────────────────
if (empty($_FILES['audio']) || $_FILES['audio']['error'] !== UPLOAD_ERR_OK) {
@@ -114,6 +119,17 @@ if ($timeOffset > 0.0 && !empty($result['segments'])) {
unset($seg);
}
// ── Optional GPT cleanup pass ─────────────────────────────────────────────────
// Runs only when a cleanup model was selected and the engine produced text.
// $cleanedBy stays null unless the cleanup call returned a usable transcript,
// so a failed cleanup leaves the raw transcript in place.
// NOTE(review): only $result['text'] is replaced here; $result['segments'] keep the
// raw engine text, so the full text and per-segment text can diverge — confirm intended.
$cleanedBy = null;
if ($postModel !== '' && !empty($result['text'])) {
$cleaned = dbnCleanupTranscript($result['text'], $language, $initPrompt, $postModel);
if ($cleaned !== null) {
$result['text'] = $cleaned;
$cleanedBy = $postModel;
}
}
// ── Speaker role labelling (diarize + multiple speakers only) ─────────────────
$segments = $result['segments'] ?? [];
@@ -126,7 +142,8 @@ if ($numDetected < 2 && $segments) {
$speakerRoles = null;
if ($diarize && $numDetected > 1 && $segments) {
$speakerRoles = dbnLabelSpeakerRoles($segments);
$labelDeployment = $postModel ?: 'gpt-4o-mini';
$speakerRoles = dbnLabelSpeakerRoles($segments, $labelDeployment);
}
// ── Friendly engine label ─────────────────────────────────────────────────────
@@ -161,6 +178,7 @@ dbnToolsRespond([
'model' => $engineLabel,
'engine' => $engineUsed,
'latency_ms' => $latencyMs,
'cleaned_by' => $cleanedBy,
]);
@@ -313,7 +331,7 @@ function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, i
}
function dbnLabelSpeakerRoles(array $segments): array
function dbnLabelSpeakerRoles(array $segments, string $deployment = 'gpt-4o-mini'): array
{
$sample = array_slice(
array_values(array_filter($segments, fn($s) => isset($s['speaker']))),
@@ -322,16 +340,11 @@ function dbnLabelSpeakerRoles(array $segments): array
if (!$sample) return [];
$lines = array_map(fn($s) => "[{$s['speaker']}] " . trim((string)($s['text'] ?? '')), $sample);
$azure = new DbnAzureOpenAiGateway();
$system = 'You are analyzing a legal proceeding transcript. '
. 'Based on the first segments, identify the role of each speaker. '
. 'Common roles in Norwegian legal proceedings: dommer (judge), advokat (lawyer), '
. 'forelder (parent), barn (child), sakkyndig (expert witness), '
. 'saksbehandler (caseworker), tolk (interpreter), vitne (witness), '
. 'prosessfullmektig (counsel). '
$azure = (new DbnAzureOpenAiGateway())->withDeployment($deployment);
$system = 'Label speakers in this Norwegian legal transcript. '
. 'Return ONLY valid JSON: {"SPEAKER_00":"dommer","SPEAKER_01":"forelder"}. '
. 'Use Norwegian role names. Use "ukjent" if role cannot be determined. '
. 'Only include speakers present in the input.';
. 'Norwegian role names only — dommer, advokat, forelder, barn, sakkyndig, '
. 'saksbehandler, tolk, vitne, prosessfullmektig. Use "ukjent" if unclear.';
try {
$text = $azure->chatText([
@@ -345,3 +358,36 @@ function dbnLabelSpeakerRoles(array $segments): array
return [];
}
}
/**
 * Runs an optional LLM cleanup pass over a finished transcript.
 *
 * The transcript is sent to the given deployment with a system prompt asking the
 * model to fix mishearings, run-on sentences and punctuation while preserving
 * meaning and language. The function is best-effort: any failure yields null so
 * the caller can keep the raw transcript.
 *
 * @param string $text        Raw transcript text.
 * @param string $language    Language code; codes not listed below are described
 *                            to the model as Norwegian.
 * @param string $vocabulary  Optional domain terms to mention in the prompt; '' omits the hint.
 * @param string $deployment  Deployment name handed to withDeployment().
 *
 * @return string|null Cleaned transcript, or null when the input is blank, the
 *                     model returns nothing usable, the output looks truncated,
 *                     or the gateway call throws.
 */
function dbnCleanupTranscript(string $text, string $language, string $vocabulary, string $deployment): ?string
{
    // Nothing to clean — avoid a pointless (and billable) model call.
    if (trim($text) === '') {
        return null;
    }

    $langName = match($language) {
        'no', 'nb', 'nn' => 'Norwegian',
        'en' => 'English',
        'pl' => 'Polish',
        'uk' => 'Ukrainian',
        'sv' => 'Swedish',
        'da' => 'Danish',
        'de' => 'German',
        'fr' => 'French',
        default => 'Norwegian',
    };

    $vocabHint = $vocabulary !== '' ? " Domain terms to preserve correctly: {$vocabulary}." : '';
    $system = "Fix transcription errors in this {$langName} text.{$vocabHint} "
        . "Correct mishearing errors, run-on sentences, and punctuation. "
        . "Preserve all meaning and the original language exactly. "
        . "Return only the corrected transcript text, no commentary.";

    try {
        $azure = (new DbnAzureOpenAiGateway())->withDeployment($deployment);
        $result = $azure->chatText(
            [['role' => 'system', 'content' => $system],
             ['role' => 'user', 'content' => $text]],
            ['temperature' => 0.1, 'max_tokens' => 4096]
        );

        $cleaned = is_string($result) ? trim($result) : '';
        if ($cleaned === '') {
            return null;
        }

        // The reply is capped at max_tokens, so a long transcript can come back cut
        // off. A cleanup pass should not shrink the text dramatically; if it did,
        // treat the reply as truncated and let the caller keep the raw transcript
        // rather than silently losing content.
        if (strlen($cleaned) < 0.5 * strlen(trim($text))) {
            error_log(sprintf(
                'dbnCleanupTranscript: output (%d bytes) much shorter than input (%d bytes) — discarding as likely truncated',
                strlen($cleaned),
                strlen($text)
            ));
            return null;
        }

        return $cleaned;
    } catch (Throwable $e) {
        // Best-effort step: record the failure instead of swallowing it silently.
        error_log('dbnCleanupTranscript failed: ' . $e->getMessage());
        return null;
    }
}
+61
View File
@@ -441,6 +441,20 @@ const TRANSCRIBE_I18N = {
traceProcessingDetail: () => 'Processing audio. Large files may take 1–3 minutes.',
traceStillLabel: (clip) => `${clip} — still processing…`,
traceStillDetail: (e) => { const m = Math.floor(e / 60), s = e % 60; return m > 0 ? `${m}m ${s}s elapsed — working through the audio.` : `${e}s elapsed — processing.`; },
advancedOptions: 'Advanced options',
task: 'Task',
taskTranscribe: 'Transcribe',
taskTranslate: 'Translate to English',
vadFilter: 'VAD filter',
vadFilterLabel: 'Remove silence / noise',
vadFilterHint: 'Improves accuracy on recordings with long pauses.',
whisperModel: 'Whisper model',
whisperModelHint: 'Used when Azure/GCP unavailable. large-v3 is the default.',
postModel: 'AI cleanup',
postModelNone: 'None',
postModelMini: 'GPT-4o Mini',
postModelFull: 'GPT-4o',
postModelHint: 'Fixes errors, punctuation, and domain terms after transcription.',
},
no: {
transcribeLang: 'Språk i lydfil',
@@ -481,6 +495,20 @@ const TRANSCRIBE_I18N = {
traceProcessingLabel: (clip) => `${clip} — transkriberer`,
traceProcessingDetail: () => 'Behandler lyden. Store filer tar 1–3 minutter.', traceStillLabel: (clip) => `${clip} — behandler fortsatt…`,
traceStillDetail: (e) => { const m = Math.floor(e / 60), s = e % 60; return m > 0 ? `${m} min ${s}s gått — jobber gjennom lyden.` : `${e}s gått — behandler.`; },
advancedOptions: 'Avanserte valg',
task: 'Oppgave',
taskTranscribe: 'Transkriber',
taskTranslate: 'Oversett til engelsk',
vadFilter: 'VAD-filter',
vadFilterLabel: 'Fjern stillhet / støy',
vadFilterHint: 'Forbedrer nøyaktigheten ved opptak med lange pauser.',
whisperModel: 'Whisper-modell',
whisperModelHint: 'Brukes når Azure/GCP ikke er tilgjengelig. large-v3 er standard.',
postModel: 'AI-opprydding',
postModelNone: 'Ingen',
postModelMini: 'GPT-4o Mini',
postModelFull: 'GPT-4o',
postModelHint: 'Retter feil, tegnsetting og fagtermer etter transkripsjon.',
},
uk: {
transcribeLang: 'Мова аудіо',
@@ -521,6 +549,20 @@ const TRANSCRIBE_I18N = {
traceProcessingLabel: (clip) => `${clip} — транскрибування`,
traceProcessingDetail: () => 'Обробка аудіо. Великі файли займають 1–3 хвилини.', traceStillLabel: (clip) => `${clip} — ще обробляється…`,
traceStillDetail: (e) => { const m = Math.floor(e / 60), s = e % 60; return m > 0 ? `Минуло ${m} хв ${s} с — обробка.` : `Минуло ${e} с — обробка.`; },
advancedOptions: 'Розширені параметри',
task: 'Завдання',
taskTranscribe: 'Транскрибувати',
taskTranslate: 'Перекласти на англійську',
vadFilter: 'VAD-фільтр',
vadFilterLabel: 'Видалити тишу / шум',
vadFilterHint: 'Покращує точність для записів з довгими паузами.',
whisperModel: 'Модель Whisper',
whisperModelHint: 'Використовується, якщо Azure/GCP недоступні. large-v3 за замовчуванням.',
postModel: 'AI-очищення',
postModelNone: 'Без',
postModelMini: 'GPT-4o Mini',
postModelFull: 'GPT-4o',
postModelHint: 'Виправляє помилки, пунктуацію та терміни після транскрипції.',
},
pl: {
transcribeLang: 'Język audio',
@@ -561,6 +603,20 @@ const TRANSCRIBE_I18N = {
traceProcessingLabel: (clip) => `${clip} — transkrybowanie`,
traceProcessingDetail: () => 'Przetwarzanie audio. Duże pliki zajmują 1–3 minuty.', traceStillLabel: (clip) => `${clip} — nadal przetwarza…`,
traceStillDetail: (e) => { const m = Math.floor(e / 60), s = e % 60; return m > 0 ? `Minęło ${m} min ${s} s — przetwarzanie audio.` : `Minęło ${e} s — przetwarzanie.`; },
advancedOptions: 'Opcje zaawansowane',
task: 'Zadanie',
taskTranscribe: 'Transkrypcja',
taskTranslate: 'Przetłumacz na angielski',
vadFilter: 'Filtr VAD',
vadFilterLabel: 'Usuń ciszę / szum',
vadFilterHint: 'Poprawia dokładność nagrań z długimi przerwami.',
whisperModel: 'Model Whisper',
whisperModelHint: 'Używany gdy Azure/GCP niedostępne. large-v3 jest domyślny.',
postModel: 'Korekta AI',
postModelNone: 'Brak',
postModelMini: 'GPT-4o Mini',
postModelFull: 'GPT-4o',
postModelHint: 'Poprawia błędy, interpunkcję i terminy po transkrypcji.',
},
};
@@ -1515,6 +1571,10 @@ async function runTranscribe() {
formData.append('time_offset', String(cumulativeOffset));
if (vadFilter) formData.append('vad_filter', '1');
if (initPrompt) formData.append('initial_prompt', initPrompt);
const whisperModel = document.getElementById('whisperModelSelect')?.value;
if (whisperModel) formData.append('model', whisperModel);
const postModel = document.querySelector('input[name="post_model"]:checked')?.value;
if (postModel) formData.append('post_model', postModel);
if (diarize) {
formData.append('diarize', '1');
if (numSpeakers >= 2) formData.append('num_speakers', String(numSpeakers));
@@ -1650,6 +1710,7 @@ function renderTranscriptResults(data) {
if (data.language) traceMeta.push({ label: `Language: ${data.language}`, detail: '', status: 'complete' });
if (data.num_speakers > 1) traceMeta.push({ label: `Speakers detected: ${data.num_speakers}`, detail: Object.entries(speakerRoles).map(([id, r]) => `${id}: ${r}`).join(', ') || '', status: 'complete' });
if (data.model) traceMeta.push({ label: data.model, detail: '', status: 'complete' });
if (data.cleaned_by) traceMeta.push({ label: `Cleaned by ${data.cleaned_by}`, detail: '', status: 'complete' });
renderTrace(traceMeta.length ? traceMeta : [{ label: 'Transcribed', detail: '', status: 'complete' }]);
}
+128 -59
View File
@@ -493,7 +493,7 @@ PROMPT;
private function extractParties(string $docText, string $language): array
{
$locale = dbnToolsLanguageName($language);
$excerpt = mb_substr($docText, 0, 12000, 'UTF-8');
$excerpt = mb_substr($docText, 0, 20000, 'UTF-8');
$prompt = <<<PROMPT
You are analysing a Norwegian child welfare (Barnevernet) document.
@@ -502,15 +502,16 @@ Identify ALL named parties — every person or institution referred to by name o
Respond in {$locale}. Return a JSON object with a single key "parties" containing an array of objects.
Each object must have these four fields:
- "name": full name or institution name (string)
- "role": their role in the case, e.g. Biological mother, Child, Barnevernarbeider, Saksbehandler, Melder, Politi, Lege, Advokat, Foster carer, Rusklinikk
- "role": their role in the case, e.g. Biological mother, Biological father, Child, Barnevernarbeider, Saksbehandler, Leder, Melder, Politi, Lege, Psykolog, Advokat, Talsperson for barnet, Tilsynsfører, Sakkyndig, Foster carer (fosterforelder), Rusklinikk, Statsforvalter
- "organization": employer or institution if mentioned, otherwise null
- "relationship_to_child": relationship to the child in the document, e.g. Mother, Father, Caseworker, Melder, or null
- "relationship_to_child": relationship to the child in the document, e.g. Mother, Father, Sibling, Caseworker, Melder, Supervisor, or null
Rules:
- Include every named person and named institution — even peripheral ones.
- Include Barnevernvakta (bvv) as an institution even if no individual caseworkers are named.
- If a name appears to be redacted or anonymised (e.g. "mor", "far", "barnet", initials like "A.B."), include them with role inferred from context.
- Do not invent parties not present in the text.
- Maximum 20 parties.
- Maximum 25 parties.
Document text:
{$excerpt}
@@ -520,14 +521,14 @@ PROMPT;
$raw = $this->azure->chatText([
['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
['role' => 'user', 'content' => $prompt],
], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 1500, 'timeout' => 40]);
], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 2000, 'timeout' => 45]);
$json = $this->azure->decodeJsonObject($raw);
if (is_array($json) && is_array($json['parties'] ?? null)) {
return array_slice($json['parties'], 0, 20);
return array_slice($json['parties'], 0, 25);
}
// Fallback: model returned an array at root level instead of {parties:[...]}
if (is_array($json) && isset($json[0]['name'])) {
return array_slice($json, 0, 20);
return array_slice($json, 0, 25);
}
error_log('BVJ extractParties unexpected structure: ' . substr($raw, 0, 300));
} catch (Throwable $e) {
@@ -541,7 +542,7 @@ PROMPT;
private function extractTimeline(string $docText, string $language): array
{
$locale = dbnToolsLanguageName($language);
$excerpt = mb_substr($docText, 0, 12000, 'UTF-8');
$excerpt = mb_substr($docText, 0, 20000, 'UTF-8');
$prompt = <<<PROMPT
Build a chronological timeline from this Norwegian child welfare (Barnevernet) document in {$locale}.
@@ -557,14 +558,24 @@ IMPORTANT — Norwegian date and time formats to recognise:
- Diary/log format: lines beginning with a date or time are always events.
- Two-digit years: interpret as 20YY (20 → 2020, 21 → 2021).
Barnevernet-specific events that are ALWAYS high significance:
- Akuttvedtak (emergency placement) under §4-6 or §4-25
- Omsorgsovertakelse (care order) under §4-12
- Police involvement or assistance (politibistand)
- Formal decision (vedtak) or court order (kjennelse)
- Deadline breaches: bekymringsmelding not processed within 7 days; investigation not opened within 6 weeks
- Forhandlingsmøte (negotiation hearing) or Fylkesnemnda hearing
- Supervised contact visits (samvær) being reduced or denied
- Placement in foster care or institution (fosterhjem, institusjon)
For each event provide:
- "date": ISO 8601 date (YYYY-MM-DD) if determinable, otherwise best-effort description
- "time_of_day": HH:MM if present, otherwise null
- "actor": person, institution, or party involved
- "action": concise description (≤ 80 chars) of what happened
- "significance": high (acute measure, removal, police involvement, formal decision) | medium (home visit, phone call, meeting) | low (minor update, note)
- "significance": high (acute measure, removal, police involvement, formal decision, statutory deadline breach) | medium (home visit, phone call, meeting, assessment) | low (minor update, note)
Sort chronologically. Maximum 30 events.
Sort chronologically. Maximum 40 events.
Document text:
{$excerpt}
@@ -579,10 +590,10 @@ PROMPT;
$raw = $this->azure->chatText([
['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
['role' => 'user', 'content' => $prompt],
], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 3000, 'timeout' => 45]);
], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 4000, 'timeout' => 55]);
$json = $this->azure->decodeJsonObject($raw);
if (is_array($json) && is_array($json['events'] ?? null)) {
return array_slice($json['events'], 0, 30);
return array_slice($json['events'], 0, 40);
}
} catch (Throwable $e) {
error_log('BVJ extractTimeline failed: ' . $e->getMessage());
@@ -602,50 +613,82 @@ PROMPT;
): array {
$locale = dbnToolsLanguageName($language);
$docType = $docMeta['doc_type'] ?? 'BVJ document';
$docDate = $docMeta['doc_date'] ?? 'unknown date';
$authority = $docMeta['issuing_authority'] ?? 'the municipality';
$roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party';
// Summarise the top events to give the model context
// Summarise high-significance events first, then others
$highEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
$otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
$topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 12);
$eventSummary = '';
$highEvents = array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high');
$topEvents = array_slice(array_merge(array_values($highEvents),
array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'))), 0, 8);
foreach ($topEvents as $ev) {
$eventSummary .= sprintf("- %s: %s (%s)\n", $ev['date'] ?? '?', $ev['action'] ?? '', $ev['actor'] ?? '');
$sig = ($ev['significance'] ?? 'low') === 'high' ? '[HIGH] ' : '';
$eventSummary .= sprintf("- %s %s%s (%s)\n",
$ev['date'] ?? '?', $sig, $ev['action'] ?? '', $ev['actor'] ?? '');
}
// Summarise parties
$partyList = '';
foreach (array_slice($parties, 0, 8) as $p) {
$partyList .= sprintf("- %s (%s)\n", $p['name'] ?? '', $p['role'] ?? '');
foreach (array_slice($parties, 0, 10) as $p) {
$org = !empty($p['organization']) ? ' at ' . $p['organization'] : '';
$partyList .= sprintf("- %s (%s%s)\n", $p['name'] ?? '?', $p['role'] ?? '?', $org);
}
$angleGuidance = match (true) {
$count >= 5 => <<<ANGLES
Cover these five distinct legal angles (one per question):
1. Statutory rights and obligations under Barnevernloven (e.g. §4-2, §4-6, §4-12) specific to the measures taken
2. ECHR Article 8 proportionality and procedural safeguards — cite the specific measures and dates from this case
3. Procedural obligations BVV must fulfil (advance notice, documentation, hearing rights) — anchor to documented events
4. Bufdir/Statsforvalter guidance on investigation standards and thresholds for intervention
5. Norwegian appellate court decisions on comparable measures and family circumstances
ANGLES,
$count === 4 => <<<ANGLES
Cover these four distinct legal angles (one per question):
1. Statutory rights under Barnevernloven anchored to the specific measures and dates in this case
2. ECHR Article 8 proportionality of the specific intervention and any procedural violations
3. BVV's procedural obligations — documentation, notice, and hearing rights — as evidenced by the timeline
4. Bufdir guidance and Norwegian court decisions on comparable fact patterns
ANGLES,
default => <<<ANGLES
Cover three distinct legal angles (one per question):
1. Statutory rights under Barnevernloven for the specific type of measure documented
2. ECHR Article 8 proportionality and procedural safeguards
3. BVV's procedural obligations and whether the documented timeline shows any breach
ANGLES,
};
$prompt = <<<PROMPT
You are a Norwegian family-law research assistant building a case for: {$roleStr}.
A {$docType} has been uploaded. Key events:
Case facts extracted from the uploaded document:
- Document type: {$docType}
- Date: {$docDate}
- Issuing authority: {$authority}
- Key events (chronological):
{$eventSummary}
Key parties:
- Key parties:
{$partyList}
Generate exactly {$count} targeted sub-questions to research the legal corpus for arguments that SUPPORT {$roleStr}'s position. Each question should explore a different angle:
1. Statutory rights and obligations (Barnevernloven, Barneloven)
2. ECHR Article 8 and 9 precedents vs Norway
3. Procedural requirements BVV must follow (notice, documentation, proportionality)
4. Bufdir guidance on case handling standards
5. Norwegian court decisions on similar fact patterns
Generate exactly {$count} sub-questions to search the Norwegian legal corpus for arguments that SUPPORT {$roleStr}'s position.
{$angleGuidance}
CRITICAL: Every question MUST embed specific facts from this case — use the actual authority name, document date, type of measure, and parties where relevant. Generic questions ("What are parental rights?") are useless for retrieval. Specific questions ("What notice requirements must {$authority} meet before issuing an emergency placement under Barnevernloven §4-6?") are highly effective.
Return JSON only in {$locale}:
{
"sub_questions": [
{"id":"q1","question":"...","rationale":"how this angle strengthens {$roleStr}'s position (≤ 120 chars)"}
{"id":"q1","question":"...","rationale":"why this angle strengthens {$roleStr}'s position (≤ 120 chars)"}
]
}
Rules:
- Exactly {$count} sub-questions, no more no fewer.
- Every question must be answerable from Norwegian family-law, child-welfare, or ECHR sources.
- Each question must cover a DIFFERENT legal angle.
- Questions must be self-contained without needing the raw document.
- Exactly {$count} sub-questions.
- Each question targets a DIFFERENT legal angle.
- Include specific case details (authority, date, measure type) in each question.
- Questions must be self-contained and answerable from Norwegian family-law, child-welfare, or ECHR sources.
- Respond in {$locale}.
PROMPT;
@@ -734,16 +777,16 @@ PROMPT;
// Build parties summary (top 12)
$partiesSummary = '';
foreach (array_slice($parties, 0, 8) as $i => $p) {
foreach (array_slice($parties, 0, 12) as $i => $p) {
$org = $p['organization'] ? ' (' . $p['organization'] . ')' : '';
$rel = $p['relationship_to_child'] ? ' — rel: ' . $p['relationship_to_child'] : '';
$partiesSummary .= sprintf("%d. %s — %s%s%s\n", $i + 1, $p['name'] ?? '', $p['role'] ?? '', $org, $rel);
}
// Build timeline summary (top 15 most significant events)
// Build timeline summary (top 20 most significant events)
$highEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
$otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
$topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 15);
$topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 20);
$timelineSummary = '';
foreach ($topEvents as $ev) {
$time = $ev['time_of_day'] ? ' kl.' . $ev['time_of_day'] : '';
@@ -783,14 +826,17 @@ PROMPT;
? "\n== ADDITIONAL CONTEXT FROM ADVOCATE ==\n{$additionalNotes}\n"
: '';
$docExcerpt = mb_substr($docText, 0, 3000, 'UTF-8');
$docExcerpt = mb_substr($docText, 0, 8000, 'UTF-8');
$prompt = <<<PROMPT
You are Do Better Norge Legal Tools producing a structured Barnevernet case analysis brief.
You are representing: {$roleStr}
You are Do Better Norge Legal Tools. Produce a structured Barnevernet case analysis for: {$roleStr}.
HALLUCINATION RULES — READ FIRST:
- You may ONLY cite statute sections (§), ECHR article numbers, ECHR application numbers, case names, and Bufdir/Statsforvalter circular references that appear verbatim in the numbered corpus sources below.
- Do NOT cite statute sections, case names, or ECHR applications from your training memory — they may be misremembered or no longer in force.
- If no source supports a claim, omit the claim rather than invent support.
- Every factual legal claim in advocacy_brief MUST end with at least one [n] or [DOC] citation. Unsupported claims are a liability for the client.
Ground every claim in the numbered corpus sources below using [n] markers, OR in the uploaded document using [DOC].
Do NOT invent statutes, paragraph numbers, case names, ECHR applications, dates, or parties.
Return valid JSON only. No markdown fences.
== DOCUMENT METADATA ==
@@ -805,51 +851,74 @@ Child: {$childInfo}
== TIMELINE (from document) ==
{$timelineSummary}
== CORPUS SOURCES ({$sourceCount} numbered) ==
== CORPUS SOURCES ({$sourceCount} numbered — cite as [n]) ==
{$sourcesText}
{$notesSection}
{$subQText}
== DOCUMENT EXCERPT (first 3000 chars — use [DOC] to cite) ==
== DOCUMENT EXCERPT (first 8000 chars — cite as [DOC]) ==
{$docExcerpt}
Return JSON in {$locale}:
== ADVOCACY BRIEF FORMAT ==
Write the advocacy_brief as a Markdown document with these sections:
## Case Overview
Summarise what happened: document type, issuing authority, key events from the timeline. Every factual statement must cite [DOC].
## {$roleStr}'s Core Legal Position
The strongest statutory and ECHR arguments in favour of {$roleStr}. Cite [n] for each legal point. Only cite statutes and cases that appear in the corpus sources above.
## Procedural Compliance Issues
Where BVV/the authority may have failed their own procedural obligations. Ground each point in a specific documented action from [DOC] and the applicable statute or guidance from [n].
## Client Strengths
3-6 factual and legal advantages for {$roleStr}, each anchored with [n] or [DOC].
## Counter-Arguments and Responses
The most likely opposing arguments and how to rebut them. Cite [n] for rebuttal sources.
## Recommended Next Steps
2-4 concrete legal actions {$roleStr} should take now.
End with one line: "*This brief is AI-assisted and for discussion purposes only — verify all legal references with a qualified Norwegian family-law lawyer.*"
Target length: 600-1000 words.
== JSON OUTPUT ==
{
"advocacy_brief": "Partisan legal brief in Markdown. Structure:\n## Case Overview\n(What happened according to [DOC] — doc type, authority, key events)\n\n## {$roleStr}'s Core Legal Position\n(Strongest statutory and ECHR arguments — cite [n] and [DOC])\n\n## Procedural Compliance Issues\n(Where BVV may have failed their own procedural obligations — cite [DOC][n])\n\n## Client Strengths\n(Factual and legal advantages for {$roleStr} — cite [n][DOC])\n\n## Counter-Arguments and Responses\n(Likely opposing arguments and how to rebut — cite [n])\n\n## Recommended Next Steps\n(Concrete legal actions)\n\nEnd with a one-line disclaimer. Length: 500-1000 words.",
"advocacy_brief": "<the Markdown brief following the format above>",
"procedural_red_flags": [
{
"description": "Concise description of the potential procedural violation",
"legal_basis": "Statute or ECHR article potentially violated, e.g. Barnevernloven §6-1, ECHR Art.8",
"severity": "high",
"legal_basis": "Statute or ECHR article from a corpus source — e.g. Barnevernloven §4-2 [3]",
"severity": "high|medium|low",
"source_refs": ["[n]", "[DOC]"],
"what_to_check": "Specific document text or action requiring legal verification"
"what_to_check": "Exact document text or action to verify with a lawyer"
}
],
"client_strengths": ["3-6 items anchored with [n] or [DOC]"],
"opposing_weaknesses": ["2-5 vulnerabilities in BVV or opposing party position — omit if unsupported by sources"],
"what_we_found": "2-sentence plain-language summary of the most critical finding",
"what_remains_uncertain": ["3-5 specific gaps — missing information, unclear authority, conflicting sources"],
"next_practical_step": "The single most important concrete legal action for {$roleStr}"
"client_strengths": ["3-6 items, each ending with [n] or [DOC]"],
"opposing_weaknesses": ["2-5 documented vulnerabilities in BVV or opposing position — OMIT if not supported by at least one [n]"],
"what_we_found": "2-sentence plain-language summary of the single most critical finding",
"what_remains_uncertain": ["3-5 specific information gaps or legal questions that need clarification"],
"next_practical_step": "The single most important concrete legal action for {$roleStr} to take within the next 7 days"
}
Rules:
- Every factual claim in advocacy_brief must end with [n] or [DOC].
- procedural_red_flags must be grounded in documented BVV actions — no speculation.
- severity: high = likely violation of a codified right; medium = procedural irregularity; low = best-practice gap.
- If no corpus source supports a claimed weakness, omit it from opposing_weaknesses.
- Cite statute sections and ECHR articles as they appear in the corpus excerpts.
- severity: high = likely violation of a codified statutory right or ECHR guarantee; medium = procedural irregularity; low = best-practice gap only.
- procedural_red_flags must be grounded in documented BVV actions visible in [DOC] or the timeline.
- If fewer than 2 corpus sources support opposing_weaknesses, return an empty array.
- Respond in {$locale}.
PROMPT;
$sysPrompt = 'You return valid JSON only. No markdown fences.';
$sysPrompt = 'You return valid JSON only. No markdown fences. Every legal citation must come from the provided corpus sources, not from training memory.';
$messages = [
['role' => 'system', 'content' => $sysPrompt],
['role' => 'user', 'content' => $prompt],
];
$opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3000, 'timeout' => 200];
$opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 4500, 'timeout' => 240];
$deployLabel = match ($engine) {
'gpu' => 'GPU (cuttlefish)',
+55 -35
View File
@@ -91,7 +91,7 @@ final class DbnDeepResearchAgent
// STEP 2: Query expansion
$emitRunning('expansion', 'Query expansion', 'Generating sub-questions…');
$stepStart = microtime(true);
$expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language, $advocateRole);
$expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $interpretation['key_signals'], $controls['sub_q_count'], $language, $advocateRole);
$this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
$subQuestions = $expansion['questions'];
$expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
@@ -323,7 +323,8 @@ final class DbnDeepResearchAgent
$controls['temperature'],
$advocateRole,
$priorContext,
$branchNotes
$branchNotes,
$interpretation['key_signals'] ?? []
);
$this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
$emitStep(
@@ -406,7 +407,7 @@ final class DbnDeepResearchAgent
'chunk_limit' => max(4, min(10, (int)($controls['chunk_limit'] ?? 6))),
'similarity_threshold' => max(0.2, min(0.6, (float)($controls['similarity_threshold'] ?? 0.30))),
'reranker_top_k' => max(8, min(14, (int)($controls['reranker_top_k'] ?? 12))),
'temperature' => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.15))),
'temperature' => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.10))),
];
}
@@ -472,7 +473,7 @@ Input:
In {$locale}, produce JSON with:
{
"brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
"brief": "1-3 sentence description of what the user is trying to research (≤ 300 chars)",
"key_signals": ["short keywords or terms that should drive retrieval"]
}
PROMPT;
@@ -483,19 +484,20 @@ PROMPT;
if ($language === 'no' || $advocateRole !== '') {
$resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
'model' => 'dbn-legal-agent', 'json' => true,
'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 40,
'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 40,
]);
$raw = (string)($resp['choices'][0]['message']['content'] ?? '');
} else {
$raw = $this->azure->chatText([$sysMsg, $userMsg],
['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]);
['json' => true, 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 30]);
}
$json = $this->azure->decodeJsonObject($raw);
if (is_array($json) && !empty($json['brief'])) {
$signals = $json['key_signals'] ?? [];
$signalText = is_array($signals) ? implode(', ', array_slice($signals, 0, 6)) : '';
$signals = is_array($json['key_signals'] ?? null) ? array_slice($json['key_signals'], 0, 8) : [];
$signalText = $signals ? implode(', ', $signals) : '';
return [
'brief' => (string)$json['brief'],
'key_signals' => $signals,
'detail' => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''),
];
}
@@ -505,13 +507,17 @@ PROMPT;
return [
'brief' => '',
'key_signals' => [],
'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
];
}
private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language, string $advocateRole = ''): array
private function expandQueries(string $seedDescription, string $brief, array $keySignals, int $targetCount, string $language, string $advocateRole = ''): array
{
$locale = dbnToolsLanguageName($language);
$anchorsLine = !empty($keySignals)
? "\nKey retrieval anchors (incorporate these terms into your sub-questions where relevant):\n" . implode(', ', $keySignals) . "\n"
: '';
if ($advocateRole !== '') {
$prompt = <<<PROMPT
@@ -521,10 +527,11 @@ Generate exactly {$targetCount} targeted sub-questions designed to find:
2. Procedural rights and obligations the opposing party must satisfy — failures here help {$advocateRole}.
3. Case law that exposes weaknesses in the opposing party's likely arguments.
4. Specific articles, paragraphs, or judgments {$advocateRole}'s representative should cite.
5. Specific documentation and procedural obligations Barnevernet or the opposing authority must fulfil — procedural or evidentiary failures that Norwegian courts have used to rule in favour of parents or children.
Research brief:
{$brief}
{$anchorsLine}
Raw input:
{$seedDescription}
@@ -538,7 +545,8 @@ Return JSON only in {$locale}:
Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Every question must be answerable from Norwegian family-law, child-welfare, or ECHR/Hague sources.
- Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame).
- Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame, Barnevernet procedural obligation).
- Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
- Sub-questions must be self-contained — readable without the raw input.
- Write the questions in {$locale}.
PROMPT;
@@ -548,7 +556,7 @@ You are decomposing a Do Better Norge legal-research request into {$targetCount}
Research brief:
{$brief}
{$anchorsLine}
Raw input:
{$seedDescription}
@@ -563,6 +571,7 @@ Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
- Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
- Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
- Sub-questions must be self-contained — readable without seeing the seed text.
- Write the questions in {$locale}.
PROMPT;
@@ -667,7 +676,7 @@ PROMPT;
'title' => 'uploaded: ' . $entry['meta']['filename'],
'section' => null,
'package_or_corpus' => 'Your upload',
'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 620),
'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 950),
'chunk_text' => $entry['meta']['text'],
'similarity' => round($sim, 4),
'reranker_score' => null,
@@ -709,7 +718,7 @@ PROMPT;
'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
'section' => $chunk['section_title'] ?? null,
'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620),
'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 950),
'chunk_text' => (string)($chunk['content'] ?? ''),
'similarity' => $similarity,
'reranker_score' => $rerankerScore,
@@ -940,7 +949,8 @@ PROMPT;
float $temperature,
string $advocateRole = '',
?array $priorContext = null,
string $branchNotes = ''
string $branchNotes = '',
array $keySignals = []
): array {
$locale = dbnToolsLanguageName($language);
@@ -1014,41 +1024,49 @@ PROMPT;
? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
: '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';
$keySignalsLine = !empty($keySignals)
? "\nKey retrieval signals (statutory/factual terms that drove corpus search — ground your brief in these where sources permit):\n" . implode(', ', $keySignals) . "\n"
: '';
if ($advocateRole !== '') {
$prompt = <<<PROMPT
You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
Your client: {$advocateRole}
{$priorContextSection}
You MUST ground every claim in the numbered sources below using inline `[n]` citation markers. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
User input:
{$seedDescription}
Research brief:
{$brief}
{$keySignalsLine}
{$subQText}
Sources ({$sourceCount} numbered):
{$sourcesText}
Return JSON only in {$locale}:
{
"brief_markdown": "Partisan but factually grounded advocate brief. {$lengthGuidance} Structure: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Identified weaknesses in the opposing party's position with [n] citations, (4) Procedural rights and obligations {$advocateRole} should assert. End with a one-line caveat that this is legal preparation support, not final legal advice.",
"client_strengths": ["3-6 strings — the strongest factual/legal points for {$advocateRole}, each anchored to at least one [n] source"],
"opposing_weaknesses": ["2-5 strings — vulnerabilities in the opposing position supported by retrieved sources. Omit this array entirely if evidence is thin — do NOT invent weaknesses."],
"what_we_found": "2-sentence summary of the most relevant retrieved authority for {$advocateRole}",
"what_remains_uncertain": ["3-5 gaps where evidence is insufficient or law is unclear — be honest"],
"next_practical_step": "one concrete action for {$advocateRole} to take next (legal filing, evidence gathering, consultation type, etc.)"
}
Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer citing statute sections (e.g. "Barneloven §43") and case names verbatim from source excerpts.
Rules — read ALL of these before writing a single word of output:
- Every factual claim must end with one or more `[n]` markers. A citation is valid ONLY when that source's excerpt explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
- Do NOT invent statute sections, case names, paragraph numbers, dates, or parties. Copy statute references (e.g. §43, §4-12) and ECHR citations verbatim from the excerpt text — never infer a section number that does not appear in an excerpt.
- If no source supports a point, omit the point entirely — do NOT speculate.
- Legal hierarchy: when multiple sources support a claim, prefer the highest-authority source — statute (Barneloven/Barnevernsloven/etc.) > Høyesterett decision > ECHR Grand Chamber > ECHR regular chamber > lower courts > Bufdir guidance.
- Citation self-check: before writing each [n] marker, confirm that source [n] exists in the list and its excerpt actually supports the specific claim being made.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- `opposing_weaknesses` must be omitted or empty when no retrieved source actually supports the identified weakness.
- `opposing_weaknesses`: OMIT this field by default. Populate it only when ≥2 retrieved sources explicitly support the identified weakness. Do not speculate or infer weaknesses from thin evidence.
- `brief_markdown` must be {$lengthGuidance} Structure it as: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Procedural rights and obligations {$advocateRole} should assert, (4) Opposing weaknesses — only if `opposing_weaknesses` is non-empty. End with a one-line caveat that this is legal preparation support, not final legal advice.
- `client_strengths`: 3-6 items, each must include at least one [n] citation.
- `what_remains_uncertain`: 3-5 honest gaps where evidence is insufficient or law is unclear.
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
Return JSON:
{
"brief_markdown": "<advocate brief>",
"client_strengths": ["<strength with [n]>"],
"opposing_weaknesses": ["<weakness with [n]>"],
"what_we_found": "<2-sentence summary of the most relevant retrieved authority for {$advocateRole}>",
"what_remains_uncertain": ["<gap>"],
"next_practical_step": "<one concrete action for {$advocateRole} to take next>"
}
PROMPT;
} else {
$prompt = <<<PROMPT
@@ -1074,8 +1092,9 @@ Return JSON only in {$locale}:
Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- A `[n]` citation is only valid when the excerpt for source [n] explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer pinpointing statute sections (e.g. "Barneloven §43") and case names verbatim from the source excerpts.
- Copy statute section numbers (e.g. §43, §4-12) and ECHR case citations verbatim from the excerpt text — never rephrase or infer a section number that does not appear in an excerpt.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
@@ -1083,10 +1102,11 @@ PROMPT;
}
$messages = [
['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences. Every legal claim must be supported by a source from the numbered list. Do not invent statute sections, case names, paragraph numbers, or dates. If no source supports a point, omit it entirely.'],
['role' => 'user', 'content' => $prompt],
];
$opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3200, 'timeout' => 180];
$synthTemp = ($advocateRole !== '') ? min($temperature, 0.20) : $temperature;
$opts = ['json' => true, 'temperature' => $synthTemp, 'max_tokens' => 4000, 'timeout' => 180];
try {
if ($engine === 'dbn_legal') {
+37
View File
@@ -48,6 +48,43 @@ require_once __DIR__ . '/includes/layout.php';
<p class="upload-hint" data-i18n="vocabHint">Helps Whisper recognise technical terms. Not included in the transcript.</p>
</div>
<!--
  Advanced transcription options (collapsed by default).
  Field names submitted with the form: task, vad_filter, whisper_model, post_model.
  Visible captions are plain <span class="control-label"> elements, so each control
  or control group is tied to its caption with aria-labelledby and to its hint with
  aria-describedby; without this the select has no accessible name and the radio
  sets are announced without a group label.
-->
<details id="advancedOptions" class="expert-field">
  <summary data-i18n="advancedOptions">Advanced options</summary>
  <!-- Whisper task: plain transcription, or translation into English. -->
  <div class="control-row" id="taskControl" role="radiogroup" aria-labelledby="advTaskLabel">
    <span class="control-label" id="advTaskLabel" data-i18n="task">Task</span>
    <label><input type="radio" name="task" value="transcribe" checked> <span data-i18n="taskTranscribe">Transcribe</span></label>
    <label><input type="radio" name="task" value="translate"> <span data-i18n="taskTranslate">Translate to English</span></label>
  </div>
  <!-- Voice-activity-detection filter toggle. -->
  <div class="control-row">
    <span class="control-label" data-i18n="vadFilter">VAD filter</span>
    <label><input type="checkbox" id="vadFilterCheck" name="vad_filter" aria-describedby="advVadFilterHint"> <span data-i18n="vadFilterLabel">Remove silence / noise</span></label>
    <small class="control-hint" id="advVadFilterHint" data-i18n="vadFilterHint">Improves accuracy on recordings with long pauses.</small>
  </div>
  <!-- Whisper model selection; large-v3 is preselected. -->
  <div class="control-row" id="whisperModelControl">
    <span class="control-label" id="advWhisperModelLabel" data-i18n="whisperModel">Whisper model</span>
    <select id="whisperModelSelect" name="whisper_model" aria-labelledby="advWhisperModelLabel" aria-describedby="advWhisperModelHint">
      <option value="large-v3" selected>large-v3 (best)</option>
      <option value="large-v2">large-v2</option>
      <option value="medium">medium (faster)</option>
      <option value="small">small</option>
      <option value="base">base</option>
      <option value="tiny">tiny</option>
    </select>
    <small class="control-hint" id="advWhisperModelHint" data-i18n="whisperModelHint">Used when Azure/GCP unavailable. large-v3 is the default.</small>
  </div>
  <!-- Optional post-transcription cleanup model; the empty value means no cleanup pass. -->
  <div class="control-row" id="postModelControl" role="radiogroup" aria-labelledby="advPostModelLabel" aria-describedby="advPostModelHint">
    <span class="control-label" id="advPostModelLabel" data-i18n="postModel">AI cleanup</span>
    <label><input type="radio" name="post_model" value="" checked> <span data-i18n="postModelNone">None</span></label>
    <label><input type="radio" name="post_model" value="gpt-4o-mini"> <span data-i18n="postModelMini">GPT-4o Mini</span></label>
    <label><input type="radio" name="post_model" value="gpt-4o"> <span data-i18n="postModelFull">GPT-4o</span></label>
    <small class="control-hint" id="advPostModelHint" data-i18n="postModelHint">Fixes errors, punctuation, and domain terms after transcription.</small>
  </div>
</details>
<div class="upload-zone" id="audioZone" role="region" aria-label="Audio upload" data-i18n-aria="uploadAria">
<input type="file" id="audioInput" accept="audio/*,video/mp4,video/webm" multiple aria-label="Choose audio files">
<div id="audioPrompt" class="upload-prompt">