From 8205a222056d52cb1f484c2e50dca120533c42ba Mon Sep 17 00:00:00 2001 From: davegilligan Date: Mon, 25 May 2026 19:06:23 +0200 Subject: [PATCH] Fix legal analysis issue extraction for long documents --- api/legal-analysis.php | 9 ++ assets/js/legal-analysis.js | 20 +++ includes/LegalAnalysisAgent.php | 235 +++++++++++++++++++++++++++++++- 3 files changed, 263 insertions(+), 1 deletion(-) diff --git a/api/legal-analysis.php b/api/legal-analysis.php index c8f00ff..6f5bc05 100644 --- a/api/legal-analysis.php +++ b/api/legal-analysis.php @@ -118,6 +118,14 @@ try { $emit('progress', ['step' => 'synthesising', 'detail' => 'Synthesising overall assessment…']); $synth = $agent->synthesise($answered, $language, $docType); + $legalCheck = []; + try { + $legalCheck = dbnToolsRunLegalCheck( + mb_strimwidth((string)($synth['overall_assessment'] ?? ''), 0, 800), + $docType + ); + } catch (Throwable) {} + $result = [ 'ok' => true, 'issues' => $answered, @@ -126,6 +134,7 @@ try { 'disclaimer' => $synth['disclaimer'], 'doc_type' => $docType, 'model' => 'dbn-legal-agent-v3', + 'legal_check' => $legalCheck, 'latency_ms' => (int)round((microtime(true) - $startTime) * 1000), ]; if ($ftRemaining >= 0) { diff --git a/assets/js/legal-analysis.js b/assets/js/legal-analysis.js index c9c1121..a12cfe8 100644 --- a/assets/js/legal-analysis.js +++ b/assets/js/legal-analysis.js @@ -353,6 +353,9 @@ if (result.disclaimer) { topHtml += '

' + esc(result.disclaimer) + '

'; } + if (Array.isArray(result.legal_check) && result.legal_check.length) { + topHtml += renderLegalCheck(result.legal_check); + } topHtml += ''; } @@ -383,6 +386,23 @@ setStatus(''); } + function renderLegalCheck(findings) { + return ''; + } + // ── Helpers ─────────────────────────────────────────────────────────────── function setBusy(on) { if (runBtn) runBtn.disabled = on;
diff --git a/includes/LegalAnalysisAgent.php b/includes/LegalAnalysisAgent.php
index 1e73edf..bfff63a 100644
--- a/includes/LegalAnalysisAgent.php
+++ b/includes/LegalAnalysisAgent.php
@@ -40,7 +40,7 @@ final class DbnLegalAnalysisAgent
      *
      * @return array
      */
-    public function extractIssues(string $text, string $language, string $docType): array
+    private function extractIssuesFromSingleChunk(string $text, string $language, string $docType): array
     {
         $locale = dbnToolsLanguageName($language);
         $text = mb_substr($text, 0, 24000, 'UTF-8'); // keep prompt within 4o-mini context
@@ -118,6 +118,303 @@ PROMPT;
         return $clean;
     }
 
+    /**
+     * Pass 1 - extract distinct legal issues from representative document windows.
+     *
+     * Each window chosen by issueExtractionChunks() is analysed separately and the
+     * results are merged through appendUniqueIssue(), which drops near-duplicate
+     * questions. A window whose extraction throws is logged and skipped instead of
+     * aborting the pass. When no issue was extracted and the text looks like a
+     * substantive family-law document, fallbackLegalIssues() supplies generic issues.
+     *
+     * @param string $text     Document text; leading and trailing whitespace is trimmed.
+     * @param string $language Language code passed on to the extractor and the fallback.
+     * @param string $docType  Document type passed on to the extractor and the fallback.
+     *
+     * @return array<int, array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string}>
+     *         At most self::MAX_ISSUES issues with ids renumbered from 1; empty for blank text.
+     */
+    public function extractIssues(string $text, string $language, string $docType): array
+    {
+        $text = trim($text);
+        if ($text === '') {
+            return [];
+        }
+
+        $allIssues = [];
+        foreach ($this->issueExtractionChunks($text) as $chunk) {
+            try {
+                $chunkIssues = $this->extractIssuesFromSingleChunk((string)$chunk['text'], $language, $docType);
+            } catch (Throwable $e) {
+                // Best effort: one failing window must not discard issues found in the others.
+                error_log('legal-analysis issue extraction failed for ' . (string)$chunk['label'] . ': ' . $e->getMessage());
+                $chunkIssues = [];
+            }
+
+            foreach ($chunkIssues as $issue) {
+                $this->appendUniqueIssue($allIssues, $issue);
+                if (count($allIssues) >= self::MAX_ISSUES) {
+                    // Cap reached: stop scanning this window and every remaining one.
+                    break 2;
+                }
+            }
+        }
+
+        if (!$allIssues && $this->looksLikeSubstantiveFamilyLawDocument($text, $docType)) {
+            $allIssues = $this->fallbackLegalIssues($language, $docType, $text);
+        }
+
+        // Renumber so ids are sequential in the merged list, whichever window produced them.
+        foreach ($allIssues as $idx => &$issue) {
+            $issue['id'] = $idx + 1;
+        }
+        unset($issue); // release the reference left behind by the by-reference foreach
+
+        return array_slice($allIssues, 0, self::MAX_ISSUES);
+    }
+
+    /**
+     * Choose up to six windows of at most 24000 characters for issue extraction.
+     *
+     * A text that fits in one window is returned whole. Longer texts always get a
+     * beginning and an end window, a middle window when it does not start within
+     * 6000 characters of an existing window, and up to four windows ranked by
+     * family-law keyword count. The result is ordered by score (highest first),
+     * then by offset.
+     *
+     * @return array<int, array{label:string,text:string,offset:int,score:int}>
+     */
+    private function issueExtractionChunks(string $text): array
+    {
+        $len = mb_strlen($text, 'UTF-8');
+        $window = 24000;
+        // extractIssuesFromSingleChunk() cuts its input to 24000 characters, so only a
+        // text that fits in one window may be passed as the "full document"; anything
+        // longer would silently lose its tail.
+        if ($len <= $window) {
+            return [[
+                'label' => 'full document',
+                'text' => $text,
+                'offset' => 0,
+                'score' => 0,
+            ]];
+        }
+
+        $chunks = [];
+        // Registers a window unless one already starts nearby; $force narrows "nearby"
+        // to the exact same offset for windows that must be kept.
+        $add = function (string $label, int $offset, int $score = 0, bool $force = false) use (&$chunks, $text, $len, $window): void {
+            $offset = max(0, min($offset, max(0, $len - $window)));
+            $minGap = $force ? 1 : 6000;
+            foreach ($chunks as $existing) {
+                if (abs((int)$existing['offset'] - $offset) < $minGap) {
+                    return;
+                }
+            }
+            $chunks[] = [
+                'label' => $label,
+                'text' => mb_substr($text, $offset, $window, 'UTF-8'),
+                'offset' => $offset,
+                'score' => $score,
+            ];
+        };
+
+        $add('beginning of document', 0, 1);
+        $add('middle of document', (int)floor(($len - $window) / 2), 1);
+        // Forced so that a text only slightly longer than one window still has its
+        // final characters analysed.
+        $add('end of document', $len - $window, 1, true);
+
+        $keywords = [
+            'samvaer', 'samvær', 'omsorg', 'barnevern', 'sakkyndig', 'risiko',
+            'tilknytning', 'rus', 'vold', 'emk', 'barnets beste', 'foreldre',
+            'bekymring', 'kontakt', 'plassering', 'fylkesnemnd', 'retten',
+        ];
+
+        $candidates = [];
+        // Slide a window forward in 10000-character steps and score each by keyword hits.
+        for ($offset = 0; $offset < $len; $offset += 10000) {
+            $chunk = mb_substr($text, $offset, $window, 'UTF-8');
+            if ($chunk === '') {
+                break;
+            }
+            $lower = mb_strtolower($chunk, 'UTF-8');
+            $score = 0;
+            foreach ($keywords as $kw) {
+                $score += substr_count($lower, mb_strtolower($kw, 'UTF-8'));
+            }
+            if ($score > 0) {
+                $candidates[] = ['offset' => $offset, 'score' => $score];
+            }
+            if ($offset + $window >= $len) {
+                break;
+            }
+        }
+
+        usort($candidates, static fn(array $a, array $b): int => ($b['score'] <=> $a['score']));
+        foreach (array_slice($candidates, 0, 4) as $candidate) {
+            $add('keyword-heavy legal section', (int)$candidate['offset'], (int)$candidate['score']);
+            if (count($chunks) >= 6) {
+                break;
+            }
+        }
+
+        usort($chunks, static function (array $a, array $b): int {
+            if ($a['score'] !== $b['score']) {
+                return $b['score'] <=> $a['score'];
+            }
+            return $a['offset'] <=> $b['offset'];
+        });
+
+        return array_slice($chunks, 0, 6);
+    }
+
+    /**
+     * Append $candidate to $issues unless an equivalent question is already listed.
+     *
+     * Questions are equivalent when their normalised keys are identical or their
+     * issueSimilarity() score is at least 0.58.
+     *
+     * @param array<int, array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string}> $issues Accumulator, modified in place.
+     * @param array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string} $candidate
+     */
+    private function appendUniqueIssue(array &$issues, array $candidate): void
+    {
+        $candidateKey = $this->issueDedupeKey((string)$candidate['question']);
+        foreach ($issues as $existing) {
+            $existingKey = $this->issueDedupeKey((string)$existing['question']);
+            if ($candidateKey === $existingKey || $this->issueSimilarity($candidateKey, $existingKey) >= 0.58) {
+                return;
+            }
+        }
+        $issues[] = $candidate;
+    }
+
+    /**
+     * Normalise a question for comparison: lower-case it, replace every run of characters
+     * that are not letters, digits or whitespace with one space, and collapse whitespace.
+     */
+    private function issueDedupeKey(string $question): string
+    {
+        $question = mb_strtolower($question, 'UTF-8');
+        $question = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $question) ?? $question;
+        $question = preg_replace('/\s+/u', ' ', trim($question)) ?? $question;
+        return $question;
+    }
+
+    /**
+     * Jaccard similarity of two normalised keys: shared words divided by all distinct
+     * words, counting only words longer than three characters.
+     *
+     * @return float Between 0.0 and 1.0; 0.0 when either key has no such word.
+     */
+    private function issueSimilarity(string $a, string $b): float
+    {
+        $significantWords = static fn(string $key): array => array_values(array_unique(array_filter(
+            preg_split('/\s+/u', $key) ?: [],
+            static fn($w) => mb_strlen($w, 'UTF-8') > 3
+        )));
+
+        $aWords = $significantWords($a);
+        $bWords = $significantWords($b);
+        if (!$aWords || !$bWords) {
+            return 0.0;
+        }
+        $intersection = count(array_intersect($aWords, $bWords));
+        $union = count(array_unique(array_merge($aWords, $bWords)));
+        return $union > 0 ? $intersection / $union : 0.0;
+    }
+
+    /**
+     * Heuristic gate for the generic fallback issues.
+     *
+     * Texts shorter than 8000 characters never qualify. Longer texts qualify when
+     * $docType is one of the listed case types or when at least two of the listed
+     * keywords occur in the text.
+     */
+    private function looksLikeSubstantiveFamilyLawDocument(string $text, string $docType): bool
+    {
+        if (mb_strlen($text, 'UTF-8') < 8000) {
+            return false;
+        }
+        if (in_array($docType, ['barnevernet', 'adopsjon', 'emergency', 'samvær', 'fylkesnemnd'], true)) {
+            return true;
+        }
+
+        $lower = mb_strtolower($text, 'UTF-8');
+        $hits = 0;
+        foreach (['sakkyndig', 'barnevern', 'barnets beste', 'samvær', 'samvaer', 'omsorg', 'tilknytning', 'emk', 'fylkesnemnd'] as $kw) {
+            if (str_contains($lower, mb_strtolower($kw, 'UTF-8'))) {
+                $hits++;
+            }
+        }
+        return $hits >= 2;
+    }
+
+    /**
+     * Generic issues returned when extraction produced nothing for a long family-law text.
+     *
+     * The first issue's brief_context ends with the first 300 characters of $text after
+     * whitespace runs are collapsed. Norwegian wording is used when $language is 'no',
+     * English otherwise.
+     *
+     * @return array<int, array{id:int,question:string,brief_context:string,doc_type:string,severity_hint:string}>
+     */
+    private function fallbackLegalIssues(string $language, string $docType, string $text): array
+    {
+        $context = mb_substr(preg_replace('/\s+/u', ' ', trim($text)) ?? trim($text), 0, 300, 'UTF-8');
+
+        if ($language === 'no') {
+            return [
+                [
+                    'id' => 1,
+                    'question' => 'Hvordan skal barnets beste og samvær vurderes etter norsk rett?',
+                    'brief_context' => 'Langt familie- eller barnevernsdokument der modellen ikke identifiserte strukturerte spørsmål. Utdrag: ' . $context,
+                    'doc_type' => $docType,
+                    'severity_hint' => 'high',
+                ],
+                [
+                    'id' => 2,
+                    'question' => 'Er den sakkyndige vurderingen og bevisgrunnlaget tilstrekkelig for konklusjonene?',
+                    'brief_context' => 'Dokumentet ser ut til å inneholde sakkyndige eller faktiske vurderinger som bør testes juridisk.',
+                    'doc_type' => $docType,
+                    'severity_hint' => 'medium',
+                ],
+                [
+                    'id' => 3,
+                    'question' => 'Er saksbehandling, kontradiksjon og offentlige plikter oppfylt etter norsk rett og EMK?',
+                    'brief_context' => 'Lang sak bør vurderes for prosessuelle rettigheter, dokumentasjonsplikt og forholdsmessighet.',
+                    'doc_type' => $docType,
+                    'severity_hint' => 'medium',
+                ],
+            ];
+        }
+
+        return [
+            [
+                'id' => 1,
+                'question' => 'How should the child best-interests and contact/visitation assessment be reviewed under Norwegian law?',
+                'brief_context' => 'Long family-law or child-welfare document where the model did not return structured issues. Excerpt: ' . $context,
+                'doc_type' => $docType,
+                'severity_hint' => 'high',
+            ],
+            [
+                'id' => 2,
+                'question' => 'Is the expert assessment and evidentiary basis sufficient for the conclusions reached?',
+                'brief_context' => 'The document appears to contain expert or factual assessments that require legal testing.',
+                'doc_type' => $docType,
+                'severity_hint' => 'medium',
+            ],
+            [
+                'id' => 3,
+                'question' => 'Were procedural fairness, contradiction rights, and public-authority duties satisfied under Norwegian law and ECHR?',
+                'brief_context' => 'A long case file should be checked for procedural rights, documentation duties, and proportionality.',
+                'doc_type' => $docType,
+                'severity_hint' => 'medium',
+            ],
+        ];
+    }
+
     /**
      * Pass 2 — single targeted question to dbn-legal-agent-v3 with corpus context.
      * Ocelot-only. Capped at 350 tokens / 60s to avoid the documented loop bug.