feat(transcribe): GPT cleanup pass + advanced options i18n

Adds an optional post-transcription cleanup pass via GPT-4o/GPT-4o-mini that
corrects misheard words, punctuation, and domain terms. Speaker role labelling
now accepts a deployment parameter. Adds i18n strings for the advanced options
panel (task, VAD filter, Whisper model, AI cleanup) in all four languages.
Updates BvjAnalyzerAgent and DeepResearchAgent.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-18 07:23:01 +02:00
parent e32ee60e78
commit c4362738c1
5 changed files with 345 additions and 112 deletions
+59 -39
View File
@@ -91,7 +91,7 @@ final class DbnDeepResearchAgent
// STEP 2: Query expansion
$emitRunning('expansion', 'Query expansion', 'Generating sub-questions…');
$stepStart = microtime(true);
$expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language, $advocateRole);
$expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $interpretation['key_signals'], $controls['sub_q_count'], $language, $advocateRole);
$this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
$subQuestions = $expansion['questions'];
$expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
@@ -323,7 +323,8 @@ final class DbnDeepResearchAgent
$controls['temperature'],
$advocateRole,
$priorContext,
$branchNotes
$branchNotes,
$interpretation['key_signals'] ?? []
);
$this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
$emitStep(
@@ -406,7 +407,7 @@ final class DbnDeepResearchAgent
'chunk_limit' => max(4, min(10, (int)($controls['chunk_limit'] ?? 6))),
'similarity_threshold' => max(0.2, min(0.6, (float)($controls['similarity_threshold'] ?? 0.30))),
'reranker_top_k' => max(8, min(14, (int)($controls['reranker_top_k'] ?? 12))),
'temperature' => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.15))),
'temperature' => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.10))),
];
}
@@ -472,7 +473,7 @@ Input:
In {$locale}, produce JSON with:
{
"brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
"brief": "1-3 sentence description of what the user is trying to research (≤ 300 chars)",
"key_signals": ["short keywords or terms that should drive retrieval"]
}
PROMPT;
@@ -483,20 +484,21 @@ PROMPT;
if ($language === 'no' || $advocateRole !== '') {
$resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
'model' => 'dbn-legal-agent', 'json' => true,
'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 40,
'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 40,
]);
$raw = (string)($resp['choices'][0]['message']['content'] ?? '');
} else {
$raw = $this->azure->chatText([$sysMsg, $userMsg],
['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]);
['json' => true, 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 30]);
}
$json = $this->azure->decodeJsonObject($raw);
if (is_array($json) && !empty($json['brief'])) {
$signals = $json['key_signals'] ?? [];
$signalText = is_array($signals) ? implode(', ', array_slice($signals, 0, 6)) : '';
$signals = is_array($json['key_signals'] ?? null) ? array_slice($json['key_signals'], 0, 8) : [];
$signalText = $signals ? implode(', ', $signals) : '';
return [
'brief' => (string)$json['brief'],
'detail' => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''),
'brief' => (string)$json['brief'],
'key_signals' => $signals,
'detail' => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''),
];
}
} catch (Throwable $e) {
@@ -504,14 +506,18 @@ PROMPT;
}
return [
'brief' => '',
'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
'brief' => '',
'key_signals' => [],
'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
];
}
private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language, string $advocateRole = ''): array
private function expandQueries(string $seedDescription, string $brief, array $keySignals, int $targetCount, string $language, string $advocateRole = ''): array
{
$locale = dbnToolsLanguageName($language);
$anchorsLine = !empty($keySignals)
? "\nKey retrieval anchors (incorporate these terms into your sub-questions where relevant):\n" . implode(', ', $keySignals) . "\n"
: '';
if ($advocateRole !== '') {
$prompt = <<<PROMPT
@@ -521,10 +527,11 @@ Generate exactly {$targetCount} targeted sub-questions designed to find:
2. Procedural rights and obligations the opposing party must satisfy — failures here help {$advocateRole}.
3. Case law that exposes weaknesses in the opposing party's likely arguments.
4. Specific articles, paragraphs, or judgments {$advocateRole}'s representative should cite.
5. Specific documentation and procedural obligations Barnevernet or the opposing authority must fulfil — procedural or evidentiary failures that Norwegian courts have used to rule in favour of parents or children.
Research brief:
{$brief}
{$anchorsLine}
Raw input:
{$seedDescription}
@@ -538,7 +545,8 @@ Return JSON only in {$locale}:
Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Every question must be answerable from Norwegian family-law, child-welfare, or ECHR/Hague sources.
- Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame).
- Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame, Barnevernet procedural obligation).
- Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
- Sub-questions must be self-contained — readable without the raw input.
- Write the questions in {$locale}.
PROMPT;
@@ -548,7 +556,7 @@ You are decomposing a Do Better Norge legal-research request into {$targetCount}
Research brief:
{$brief}
{$anchorsLine}
Raw input:
{$seedDescription}
@@ -563,6 +571,7 @@ Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
- Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
- Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
- Sub-questions must be self-contained — readable without seeing the seed text.
- Write the questions in {$locale}.
PROMPT;
@@ -667,7 +676,7 @@ PROMPT;
'title' => 'uploaded: ' . $entry['meta']['filename'],
'section' => null,
'package_or_corpus' => 'Your upload',
'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 620),
'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 950),
'chunk_text' => $entry['meta']['text'],
'similarity' => round($sim, 4),
'reranker_score' => null,
@@ -709,7 +718,7 @@ PROMPT;
'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
'section' => $chunk['section_title'] ?? null,
'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620),
'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 950),
'chunk_text' => (string)($chunk['content'] ?? ''),
'similarity' => $similarity,
'reranker_score' => $rerankerScore,
@@ -940,7 +949,8 @@ PROMPT;
float $temperature,
string $advocateRole = '',
?array $priorContext = null,
string $branchNotes = ''
string $branchNotes = '',
array $keySignals = []
): array {
$locale = dbnToolsLanguageName($language);
@@ -1014,41 +1024,49 @@ PROMPT;
? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
: '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';
$keySignalsLine = !empty($keySignals)
? "\nKey retrieval signals (statutory/factual terms that drove corpus search — ground your brief in these where sources permit):\n" . implode(', ', $keySignals) . "\n"
: '';
if ($advocateRole !== '') {
$prompt = <<<PROMPT
You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
Your client: {$advocateRole}
{$priorContextSection}
You MUST ground every claim in the numbered sources below using inline `[n]` citation markers. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
User input:
{$seedDescription}
Research brief:
{$brief}
{$keySignalsLine}
{$subQText}
Sources ({$sourceCount} numbered):
{$sourcesText}
Return JSON only in {$locale}:
{
"brief_markdown": "Partisan but factually grounded advocate brief. {$lengthGuidance} Structure: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Identified weaknesses in the opposing party's position with [n] citations, (4) Procedural rights and obligations {$advocateRole} should assert. End with a one-line caveat that this is legal preparation support, not final legal advice.",
"client_strengths": ["3-6 strings — the strongest factual/legal points for {$advocateRole}, each anchored to at least one [n] source"],
"opposing_weaknesses": ["2-5 strings — vulnerabilities in the opposing position supported by retrieved sources. Omit this array entirely if evidence is thin — do NOT invent weaknesses."],
"what_we_found": "2-sentence summary of the most relevant retrieved authority for {$advocateRole}",
"what_remains_uncertain": ["3-5 gaps where evidence is insufficient or law is unclear — be honest"],
"next_practical_step": "one concrete action for {$advocateRole} to take next (legal filing, evidence gathering, consultation type, etc.)"
}
Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer citing statute sections (e.g. "Barneloven §43") and case names verbatim from source excerpts.
Rules — read ALL of these before writing a single word of output:
- Every factual claim must end with one or more `[n]` markers. A citation is valid ONLY when that source's excerpt explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
- Do NOT invent statute sections, case names, paragraph numbers, dates, or parties. Copy statute references (e.g. §43, §4-12) and ECHR citations verbatim from the excerpt text — never infer a section number that does not appear in an excerpt.
- If no source supports a point, omit the point entirely — do NOT speculate.
- Legal hierarchy: when multiple sources support a claim, prefer the highest-authority source — statute (Barneloven/Barnevernsloven/etc.) > Høyesterett decision > ECHR Grand Chamber > ECHR regular chamber > lower courts > Bufdir guidance.
- Citation self-check: before writing each [n] marker, confirm that source [n] exists in the list and its excerpt actually supports the specific claim being made.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- `opposing_weaknesses` must be omitted or empty when no retrieved source actually supports the identified weakness.
- `opposing_weaknesses`: OMIT this field by default. Populate it only when ≥2 retrieved sources explicitly support the identified weakness. Do not speculate or infer weaknesses from thin evidence.
- `brief_markdown` must be {$lengthGuidance} Structure it as: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Procedural rights and obligations {$advocateRole} should assert, (4) Opposing weaknesses — only if `opposing_weaknesses` is non-empty. End with a one-line caveat that this is legal preparation support, not final legal advice.
- `client_strengths`: 3-6 items, each must include at least one [n] citation.
- `what_remains_uncertain`: 3-5 honest gaps where evidence is insufficient or law is unclear.
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
Return JSON:
{
"brief_markdown": "<advocate brief>",
"client_strengths": ["<strength with [n]>"],
"opposing_weaknesses": ["<weakness with [n]>"],
"what_we_found": "<2-sentence summary of the most relevant retrieved authority for {$advocateRole}>",
"what_remains_uncertain": ["<gap>"],
"next_practical_step": "<one concrete action for {$advocateRole} to take next>"
}
PROMPT;
} else {
$prompt = <<<PROMPT
@@ -1074,8 +1092,9 @@ Return JSON only in {$locale}:
Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- A `[n]` citation is only valid when the excerpt for source [n] explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer pinpointing statute sections (e.g. "Barneloven §43") and case names verbatim from the source excerpts.
- Copy statute section numbers (e.g. §43, §4-12) and ECHR case citations verbatim from the excerpt text — never rephrase or infer a section number that does not appear in an excerpt.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
@@ -1083,10 +1102,11 @@ PROMPT;
}
$messages = [
['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences. Every legal claim must be supported by a source from the numbered list. Do not invent statute sections, case names, paragraph numbers, or dates. If no source supports a point, omit it entirely.'],
['role' => 'user', 'content' => $prompt],
];
$opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3200, 'timeout' => 180];
$synthTemp = ($advocateRole !== '') ? min($temperature, 0.20) : $temperature;
$opts = ['json' => true, 'temperature' => $synthTemp, 'max_tokens' => 4000, 'timeout' => 180];
try {
if ($engine === 'dbn_legal') {