Fix legal analysis issue extraction for long documents

2026-05-25 19:06:23 +02:00
parent 190f639784
commit 8205a22205
3 changed files with 263 additions and 1 deletions
@@ -40,7 +40,7 @@ final class DbnLegalAnalysisAgent
     *
     * @return array<int,array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string}>
     */
-    public function extractIssues(string $text, string $language, string $docType): array
+    private function extractIssuesFromSingleChunk(string $text, string $language, string $docType): array
    {
        $locale = dbnToolsLanguageName($language);
        $text   = mb_substr($text, 0, 24000, 'UTF-8'); // keep prompt within 4o-mini context
@@ -118,6 +118,239 @@ PROMPT;
        return $clean;
    }

+    /**
+     * Pass 1 - collect distinct legal issues across representative document windows.
+     *
+     * Every window returned by issueExtractionChunks() is handed to the single-chunk
+     * extractor. A window whose extraction throws is logged and contributes nothing.
+     * Issues are de-duplicated as they are collected, collection stops as soon as
+     * self::MAX_ISSUES is reached, and the surviving issues are renumbered from 1.
+     * When nothing was collected and the text looks like a substantive family-law
+     * document, generic fallback issues are used instead.
+     *
+     * @return array<int,array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string}>
+     */
+    public function extractIssues(string $text, string $language, string $docType): array
+    {
+        $text = trim($text);
+        if ($text === '') {
+            return [];
+        }
+
+        $collected = [];
+        $windows = $this->issueExtractionChunks($text);
+
+        foreach ($windows as $window) {
+            // Best effort: one failing window must not abort the whole analysis.
+            $found = [];
+            try {
+                $found = $this->extractIssuesFromSingleChunk((string)$window['text'], $language, $docType);
+            } catch (Throwable $error) {
+                error_log('legal-analysis issue extraction failed for ' . (string)$window['label'] . ': ' . $error->getMessage());
+            }
+
+            foreach ($found as $entry) {
+                $this->appendUniqueIssue($collected, $entry);
+                if (count($collected) >= self::MAX_ISSUES) {
+                    // Enough issues gathered: leave both loops.
+                    break 2;
+                }
+            }
+        }
+
+        if ($collected === [] && $this->looksLikeSubstantiveFamilyLawDocument($text, $docType)) {
+            $collected = $this->fallbackLegalIssues($language, $docType, $text);
+        }
+
+        // Renumber sequentially so ids are contiguous after de-duplication.
+        foreach (array_keys($collected) as $position) {
+            $collected[$position]['id'] = $position + 1;
+        }
+
+        return array_slice($collected, 0, self::MAX_ISSUES);
+    }
+
+    /**
+     * Select up to six text windows of a document for issue extraction.
+     *
+     * Documents of at most 28000 characters (window + 4000) are returned as one
+     * "full document" chunk. Longer documents are covered by 24000-character
+     * windows: three fixed anchors (beginning, middle, end, each with score 1)
+     * plus up to four windows chosen by keyword frequency, sampled every 10000
+     * characters. A window is skipped when its offset lies within 6000 characters
+     * of a window that was already added, so the order of additions matters.
+     * The result is sorted by score (descending), then by offset (ascending).
+     *
+     * NOTE(review): on the single-chunk path the text may be up to 28000 characters,
+     * while extractIssuesFromSingleChunk() truncates its input to 24000 - confirm
+     * that dropping the tail is intended.
+     *
+     * @return array<int,array{label:string,text:string,offset:int,score:int}>
+     */
+    private function issueExtractionChunks(string $text): array
+    {
+        $len = mb_strlen($text, 'UTF-8');
+        $window = 24000;
+        if ($len <= $window + 4000) {
+            return [[
+                'label' => 'full document',
+                'text' => $text,
+                'offset' => 0,
+                'score' => 0,
+            ]];
+        }
+
+        $chunks = [];
+        $add = function (string $label, int $offset, int $score = 0) use (&$chunks, $text, $len, $window): void {
+            $offset = max(0, min($offset, max(0, $len - $window))); // clamp so a full window fits inside the text
+            foreach ($chunks as $existing) {
+                if (abs((int)$existing['offset'] - $offset) < 6000) { // too close to an already selected window
+                    return;
+                }
+            }
+            $chunks[] = [
+                'label' => $label,
+                'text' => mb_substr($text, $offset, $window, 'UTF-8'),
+                'offset' => $offset,
+                'score' => $score,
+            ];
+        };
+
+        $add('beginning of document', 0, 1);
+        $add('middle of document', (int)floor(($len - $window) / 2), 1);
+        $add('end of document', $len - $window, 1);
+
+        $keywords = [
+            'samvaer', 'samvær', 'omsorg', 'barnevern', 'sakkyndig', 'risiko',
+            'tilknytning', 'rus', 'vold', 'emk', 'barnets beste', 'foreldre',
+            'bekymring', 'kontakt', 'plassering', 'fylkesnemnd', 'retten',
+        ];
+
+        $candidates = [];
+        for ($offset = 0; $offset < $len; $offset += 10000) { // sample a candidate window every 10000 characters
+            $chunk = mb_substr($text, $offset, $window, 'UTF-8');
+            if ($chunk === '') {
+                break;
+            }
+            $lower = mb_strtolower($chunk, 'UTF-8');
+            $score = 0;
+            foreach ($keywords as $kw) {
+                $score += substr_count($lower, mb_strtolower($kw, 'UTF-8')); // substring matches, not whole words
+            }
+            if ($score > 0) {
+                $candidates[] = ['offset' => $offset, 'score' => $score];
+            }
+            if ($offset + $window >= $len) { // this window already reaches the end of the text
+                break;
+            }
+        }
+
+        usort($candidates, static fn(array $a, array $b): int => ($b['score'] <=> $a['score'])); // highest keyword count first
+        foreach (array_slice($candidates, 0, 4) as $candidate) {
+            $add('keyword-heavy legal section', (int)$candidate['offset'], (int)$candidate['score']);
+            if (count($chunks) >= 6) {
+                break;
+            }
+        }
+
+        usort($chunks, static function (array $a, array $b): int {
+            if ($a['score'] !== $b['score']) {
+                return $b['score'] <=> $a['score']; // higher score first
+            }
+            return $a['offset'] <=> $b['offset']; // ties keep document order
+        });
+
+        return array_slice($chunks, 0, 6);
+    }
+
+    /**
+     * Append $candidate to $issues unless an equivalent question is already listed.
+     *
+     * Two questions count as equivalent when their normalised keys are identical
+     * or their word-overlap similarity is at least 0.58.
+     *
+     * @param array<int,array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string}> $issues
+     * @param array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string} $candidate
+     */
+    private function appendUniqueIssue(array &$issues, array $candidate): void
+    {
+        $newKey = $this->issueDedupeKey((string)$candidate['question']);
+
+        foreach ($issues as $known) {
+            $knownKey = $this->issueDedupeKey((string)$known['question']);
+
+            // Exact match after normalisation.
+            if ($newKey === $knownKey) {
+                return;
+            }
+
+            // Near-duplicate by word overlap.
+            if ($this->issueSimilarity($newKey, $knownKey) >= 0.58) {
+                return;
+            }
+        }
+
+        $issues[] = $candidate;
+    }
+
+    /**
+     * Normalise a question into a comparison key: lower-cased, every run of
+     * characters that is neither letter, digit nor whitespace replaced by a
+     * space, and whitespace collapsed and trimmed.
+     */
+    private function issueDedupeKey(string $question): string
+    {
+        $lowered = mb_strtolower($question, 'UTF-8');
+
+        // preg_replace() yields null on failure; keep the previous value then.
+        $stripped = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $lowered);
+        if ($stripped === null) {
+            $stripped = $lowered;
+        }
+
+        $collapsed = preg_replace('/\s+/u', ' ', trim($stripped));
+
+        return $collapsed ?? $stripped;
+    }
+
+    /**
+     * Jaccard overlap of the distinct words longer than three characters in two
+     * normalised question keys. Returns 0.0 when either side has no such words.
+     */
+    private function issueSimilarity(string $a, string $b): float
+    {
+        // Split on whitespace, keep only words longer than three characters, drop repeats.
+        $significantWords = static function (string $key): array {
+            $tokens = preg_split('/\s+/u', $key) ?: [];
+            $longEnough = array_filter($tokens, static fn($token) => mb_strlen($token, 'UTF-8') > 3);
+
+            return array_values(array_unique($longEnough));
+        };
+
+        $left = $significantWords($a);
+        $right = $significantWords($b);
+        if ($left === [] || $right === []) {
+            return 0.0;
+        }
+
+        $shared = count(array_intersect($left, $right));
+        // Both lists are duplicate-free, so |A ∪ B| = |A| + |B| - |A ∩ B|.
+        $combined = count($left) + count($right) - $shared;
+        if ($combined <= 0) {
+            return 0.0;
+        }
+
+        return $shared / $combined;
+    }
+
+    /**
+     * Heuristic: does this text look like a long family-law / child-welfare document?
+     *
+     * Texts shorter than 8000 characters never qualify. Longer texts qualify when
+     * the declared document type is a known family-law type, or when at least two
+     * of the listed keywords occur in the text (case-insensitive substring match).
+     *
+     * @param string $text    Document text.
+     * @param string $docType Declared document type.
+     */
+    private function looksLikeSubstantiveFamilyLawDocument(string $text, string $docType): bool
+    {
+        if (mb_strlen($text, 'UTF-8') < 8000) {
+            return false;
+        }
+
+        // The contact/visitation type used to be stored as double-encoded UTF-8 and
+        // could never match a correctly encoded value. Both the proper spelling and
+        // the ASCII transliteration used elsewhere in this file are accepted.
+        $familyLawDocTypes = ['barnevernet', 'adopsjon', 'emergency', 'samvær', 'samvaer', 'fylkesnemnd'];
+        if (in_array($docType, $familyLawDocTypes, true)) {
+            return true;
+        }
+
+        $lower = mb_strtolower($text, 'UTF-8');
+        $hits = 0;
+        // Keywords are already lower-case, so they are compared as written.
+        foreach (['sakkyndig', 'barnevern', 'barnets beste', 'samvær', 'samvaer', 'omsorg', 'tilknytning', 'emk', 'fylkesnemnd'] as $kw) {
+            if (str_contains($lower, $kw)) {
+                $hits++;
+            }
+        }
+
+        return $hits >= 2;
+    }
+
+    /**
+     * Build three generic issues for use when extraction returned nothing.
+     *
+     * Norwegian wording is used when $language is 'no', English otherwise. The
+     * first issue's context ends with the first 300 characters of the
+     * whitespace-collapsed text; every issue carries the given $docType.
+     *
+     * @return array<int,array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string}>
+     */
+    private function fallbackLegalIssues(string $language, string $docType, string $text): array
+    {
+        $trimmed = trim($text);
+        $flattened = preg_replace('/\s+/u', ' ', $trimmed) ?? $trimmed;
+        $context = mb_substr($flattened, 0, 300, 'UTF-8');
+
+        // Each row is [question, brief_context, severity_hint].
+        if ($language === 'no') {
+            $rows = [
+                [
+                    'Hvordan skal barnets beste og samvaer vurderes etter norsk rett?',
+                    'Langt familie- eller barnevernsdokument der modellen ikke identifiserte strukturerte spørsmål. Utdrag: ' . $context,
+                    'high',
+                ],
+                [
+                    'Er den sakkyndige vurderingen og bevisgrunnlaget tilstrekkelig for konklusjonene?',
+                    'Dokumentet ser ut til å inneholde sakkyndige eller faktiske vurderinger som bør testes juridisk.',
+                    'medium',
+                ],
+                [
+                    'Er saksbehandling, kontradiksjon og offentlige plikter oppfylt etter norsk rett og EMK?',
+                    'Lang sak bør vurderes for prosessuelle rettigheter, dokumentasjonsplikt og forholdsmessighet.',
+                    'medium',
+                ],
+            ];
+        } else {
+            $rows = [
+                [
+                    'How should the child best-interests and contact/visitation assessment be reviewed under Norwegian law?',
+                    'Long family-law or child-welfare document where the model did not return structured issues. Excerpt: ' . $context,
+                    'high',
+                ],
+                [
+                    'Is the expert assessment and evidentiary basis sufficient for the conclusions reached?',
+                    'The document appears to contain expert or factual assessments that require legal testing.',
+                    'medium',
+                ],
+                [
+                    'Were procedural fairness, contradiction rights, and public-authority duties satisfied under Norwegian law and ECHR?',
+                    'A long case file should be checked for procedural rights, documentation duties, and proportionality.',
+                    'medium',
+                ],
+            ];
+        }
+
+        $issues = [];
+        foreach ($rows as $index => [$question, $briefContext, $severity]) {
+            $issues[] = [
+                'id' => $index + 1,
+                'question' => $question,
+                'brief_context' => $briefContext,
+                'doc_type' => $docType,
+                'severity_hint' => $severity,
+            ];
+        }
+
+        return $issues;
+    }
+
    /**
     * Pass 2 — single targeted question to dbn-legal-agent-v3 with corpus context.
     * Ocelot-only. Capped at 350 tokens / 60s to avoid the documented loop bug.