feat(transcribe): GPT cleanup pass + advanced options i18n

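Adds an optional post-transcription cleanup pass via GPT-4o/GPT-4o-mini to fix misheard words, punctuation, and domain terms. Speaker role labelling now accepts a deployment parameter. Adds i18n strings for the advanced options panel (task, VAD filter, Whisper model, AI cleanup) in all four languages. Updates BvjAnalyzerAgent and DbnDeepResearchAgent; in the latter, the interpretation step's key_signals are now passed into the query-expansion and synthesis prompts, the synthesis citation rules are tightened, and the source excerpt limit is raised from 620 to 950. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>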
2026-05-18 07:23:01 +02:00
parent e32ee60e78
commit c4362738c1
5 changed files with 345 additions and 112 deletions
@@ -91,7 +91,7 @@ final class DbnDeepResearchAgent
        // STEP 2: Query expansion
        $emitRunning('expansion', 'Query expansion', 'Generating sub-questions…');
        $stepStart = microtime(true);
-        $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language, $advocateRole);
+        $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $interpretation['key_signals'], $controls['sub_q_count'], $language, $advocateRole);
        $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
        $subQuestions = $expansion['questions'];
        $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
@@ -323,7 +323,8 @@ final class DbnDeepResearchAgent
            $controls['temperature'],
            $advocateRole,
            $priorContext,
-            $branchNotes
+            $branchNotes,
+            $interpretation['key_signals'] ?? []
        );
        $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
        $emitStep(
@@ -406,7 +407,7 @@ final class DbnDeepResearchAgent
            'chunk_limit'          => max(4, min(10, (int)($controls['chunk_limit'] ?? 6))),
            'similarity_threshold' => max(0.2, min(0.6, (float)($controls['similarity_threshold'] ?? 0.30))),
            'reranker_top_k'       => max(8, min(14, (int)($controls['reranker_top_k'] ?? 12))),
-            'temperature'          => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.15))),
+            'temperature'          => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.10))),
        ];
    }

@@ -472,7 +473,7 @@ Input:

 In {$locale}, produce JSON with:
 {
-  "brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
+  "brief": "1-3 sentence description of what the user is trying to research (≤ 300 chars)",
  "key_signals": ["short keywords or terms that should drive retrieval"]
 }
 PROMPT;
@@ -483,20 +484,21 @@ PROMPT;
            if ($language === 'no' || $advocateRole !== '') {
                $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
                    'model' => 'dbn-legal-agent', 'json' => true,
-                    'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 40,
+                    'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 40,
                ]);
                $raw = (string)($resp['choices'][0]['message']['content'] ?? '');
            } else {
                $raw = $this->azure->chatText([$sysMsg, $userMsg],
-                    ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]);
+                    ['json' => true, 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 30]);
            }
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json) && !empty($json['brief'])) {
-                $signals = $json['key_signals'] ?? [];
-                $signalText = is_array($signals) ? implode(', ', array_slice($signals, 0, 6)) : '';
+                $signals = is_array($json['key_signals'] ?? null) ? array_slice($json['key_signals'], 0, 8) : [];
+                $signalText = $signals ? implode(', ', $signals) : '';
                return [
-                    'brief' => (string)$json['brief'],
-                    'detail' => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''),
+                    'brief'       => (string)$json['brief'],
+                    'key_signals' => $signals,
+                    'detail'      => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''),
                ];
            }
        } catch (Throwable $e) {
@@ -504,14 +506,18 @@ PROMPT;
        }

        return [
-            'brief' => '',
-            'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
+            'brief'       => '',
+            'key_signals' => [],
+            'detail'      => 'Interpretation step skipped — proceeding with raw seed input.',
        ];
    }

-    private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language, string $advocateRole = ''): array
+    private function expandQueries(string $seedDescription, string $brief, array $keySignals, int $targetCount, string $language, string $advocateRole = ''): array
    {
        $locale = dbnToolsLanguageName($language);
+        $anchorsLine = !empty($keySignals)
+            ? "\nKey retrieval anchors (incorporate these terms into your sub-questions where relevant):\n" . implode(', ', $keySignals) . "\n"
+            : '';

        if ($advocateRole !== '') {
            $prompt = <<<PROMPT
@@ -521,10 +527,11 @@ Generate exactly {$targetCount} targeted sub-questions designed to find:
 2. Procedural rights and obligations the opposing party must satisfy — failures here help {$advocateRole}.
 3. Case law that exposes weaknesses in the opposing party's likely arguments.
 4. Specific articles, paragraphs, or judgments {$advocateRole}'s representative should cite.
+5. Specific documentation and procedural obligations Barnevernet or the opposing authority must fulfil — procedural or evidentiary failures that Norwegian courts have used to rule in favour of parents or children.

 Research brief:
 {$brief}
-
+{$anchorsLine}
 Raw input:
 {$seedDescription}

@@ -538,7 +545,8 @@ Return JSON only in {$locale}:
 Rules:
 - Exactly {$targetCount} sub-questions, no more, no fewer.
 - Every question must be answerable from Norwegian family-law, child-welfare, or ECHR/Hague sources.
- Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame).
+- Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame, Barnevernet procedural obligation).
+- Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
 - Sub-questions must be self-contained — readable without the raw input.
 - Write the questions in {$locale}.
 PROMPT;
@@ -548,7 +556,7 @@ You are decomposing a Do Better Norge legal-research request into {$targetCount}

 Research brief:
 {$brief}
-
+{$anchorsLine}
 Raw input:
 {$seedDescription}

@@ -563,6 +571,7 @@ Rules:
 - Exactly {$targetCount} sub-questions, no more, no fewer.
 - Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
 - Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
+- Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
 - Sub-questions must be self-contained — readable without seeing the seed text.
 - Write the questions in {$locale}.
 PROMPT;
@@ -667,7 +676,7 @@ PROMPT;
                'title'             => 'uploaded: ' . $entry['meta']['filename'],
                'section'           => null,
                'package_or_corpus' => 'Your upload',
-                'excerpt'           => dbnToolsExcerpt($entry['meta']['text'], 620),
+                'excerpt'           => dbnToolsExcerpt($entry['meta']['text'], 950),
                'chunk_text'        => $entry['meta']['text'],
                'similarity'        => round($sim, 4),
                'reranker_score'    => null,
@@ -709,7 +718,7 @@ PROMPT;
            'title'             => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
            'section'           => $chunk['section_title'] ?? null,
            'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
-            'excerpt'           => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620),
+            'excerpt'           => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 950),
            'chunk_text'        => (string)($chunk['content'] ?? ''),
            'similarity'        => $similarity,
            'reranker_score'    => $rerankerScore,
@@ -940,7 +949,8 @@ PROMPT;
        float   $temperature,
        string  $advocateRole = '',
        ?array  $priorContext = null,
-        string  $branchNotes = ''
+        string  $branchNotes = '',
+        array   $keySignals = []
    ): array {
        $locale = dbnToolsLanguageName($language);

@@ -1014,41 +1024,49 @@ PROMPT;
            ? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
            : '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';

+        $keySignalsLine = !empty($keySignals)
+            ? "\nKey retrieval signals (statutory/factual terms that drove corpus search — ground your brief in these where sources permit):\n" . implode(', ', $keySignals) . "\n"
+            : '';
+
        if ($advocateRole !== '') {
            $prompt = <<<PROMPT
 You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
 Your client: {$advocateRole}
 {$priorContextSection}
-You MUST ground every claim in the numbered sources below using inline `[n]` citation markers. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
-
 User input:
 {$seedDescription}

 Research brief:
 {$brief}
+{$keySignalsLine}
 {$subQText}

 Sources ({$sourceCount} numbered):
 {$sourcesText}

-Return JSON only in {$locale}:
-{
-  "brief_markdown": "Partisan but factually grounded advocate brief. {$lengthGuidance} Structure: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Identified weaknesses in the opposing party's position with [n] citations, (4) Procedural rights and obligations {$advocateRole} should assert. End with a one-line caveat that this is legal preparation support, not final legal advice.",
-  "client_strengths": ["3-6 strings — the strongest factual/legal points for {$advocateRole}, each anchored to at least one [n] source"],
-  "opposing_weaknesses": ["2-5 strings — vulnerabilities in the opposing position supported by retrieved sources. Omit this array entirely if evidence is thin — do NOT invent weaknesses."],
-  "what_we_found": "2-sentence summary of the most relevant retrieved authority for {$advocateRole}",
-  "what_remains_uncertain": ["3-5 gaps where evidence is insufficient or law is unclear — be honest"],
-  "next_practical_step": "one concrete action for {$advocateRole} to take next (legal filing, evidence gathering, consultation type, etc.)"
-}
-
-Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer citing statute sections (e.g. "Barneloven §43") and case names verbatim from source excerpts.
+Rules — read ALL of these before writing a single word of output:
+- Every factual claim must end with one or more `[n]` markers. A citation is valid ONLY when that source's excerpt explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
+- Do NOT invent statute sections, case names, paragraph numbers, dates, or parties. Copy statute references (e.g. §43, §4-12) and ECHR citations verbatim from the excerpt text — never infer a section number that does not appear in an excerpt.
+- If no source supports a point, omit the point entirely — do NOT speculate.
+- Legal hierarchy: when multiple sources support a claim, prefer the highest-authority source — statute (Barneloven/Barnevernsloven/etc.) > Høyesterett decision > ECHR Grand Chamber > ECHR regular chamber > lower courts > Bufdir guidance.
+- Citation self-check: before writing each [n] marker, confirm that source [n] exists in the list and its excerpt actually supports the specific claim being made.
 - When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- `opposing_weaknesses` must be omitted or empty when no retrieved source actually supports the identified weakness.
+- `opposing_weaknesses`: OMIT this field by default. Populate it only when ≥2 retrieved sources explicitly support the identified weakness. Do not speculate or infer weaknesses from thin evidence.
+- `brief_markdown` must be {$lengthGuidance} Structure it as: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Procedural rights and obligations {$advocateRole} should assert, (4) Opposing weaknesses — only if `opposing_weaknesses` is non-empty. End with a one-line caveat that this is legal preparation support, not final legal advice.
+- `client_strengths`: 3-6 items, each must include at least one [n] citation.
+- `what_remains_uncertain`: 3-5 honest gaps where evidence is insufficient or law is unclear.
 - Respond in {$locale}.
 - Output valid JSON only — no markdown fences around the JSON object itself.
+
+Return JSON:
+{
+  "brief_markdown": "<advocate brief>",
+  "client_strengths": ["<strength with [n]>"],
+  "opposing_weaknesses": ["<weakness with [n]>"],
+  "what_we_found": "<2-sentence summary of the most relevant retrieved authority for {$advocateRole}>",
+  "what_remains_uncertain": ["<gap>"],
+  "next_practical_step": "<one concrete action for {$advocateRole} to take next>"
+}
 PROMPT;
        } else {
            $prompt = <<<PROMPT
@@ -1074,8 +1092,9 @@ Return JSON only in {$locale}:

 Rules:
 - Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
+- A `[n]` citation is only valid when the excerpt for source [n] explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
 - If no source supports a point, omit the point — DO NOT speculate.
- Prefer pinpointing statute sections (e.g. "Barneloven §43") and case names verbatim from the source excerpts.
+- Copy statute section numbers (e.g. §43, §4-12) and ECHR case citations verbatim from the excerpt text — never rephrase or infer a section number that does not appear in an excerpt.
 - When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
 - Respond in {$locale}.
 - Output valid JSON only — no markdown fences around the JSON object itself.
@@ -1083,10 +1102,11 @@ PROMPT;
        }

        $messages = [
-            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
+            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences. Every legal claim must be supported by a source from the numbered list. Do not invent statute sections, case names, paragraph numbers, or dates. If no source supports a point, omit it entirely.'],
            ['role' => 'user',   'content' => $prompt],
        ];
-        $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3200, 'timeout' => 180];
+        $synthTemp = ($advocateRole !== '') ? min($temperature, 0.20) : $temperature;
+        $opts = ['json' => true, 'temperature' => $synthTemp, 'max_tokens' => 4000, 'timeout' => 180];

        try {
            if ($engine === 'dbn_legal') {