From 3ad8f4843c00d3f1dc22a96879076cc259a6d6a5 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Mon, 25 May 2026 11:14:21 +0200 Subject: [PATCH] Harden timeline quick extraction --- assets/js/tools.js | 5 + includes/AzureOpenAiGateway.php | 11 +- includes/LegalTools.php | 177 ++++++++++++++++++++++++++++++-- 3 files changed, 185 insertions(+), 8 deletions(-) diff --git a/assets/js/tools.js b/assets/js/tools.js index edbcacf..39b6c1a 100644 --- a/assets/js/tools.js +++ b/assets/js/tools.js @@ -388,6 +388,7 @@ const TIMELINE_I18N = { let lastTimelineEvents = []; let lastTimelineEventsOriginal = []; let lastTimelineWhatWeFound = ''; +let lastTimelineInputDateHintCount = null; let activeActorFilters = new Set(); let timelineSearchTerm = ''; let showSources = true; @@ -1602,6 +1603,7 @@ function renderMainFinding(data) { lastTimelineEventsOriginal = data.events || []; lastTimelineEvents = [...lastTimelineEventsOriginal]; lastTimelineWhatWeFound = data.what_we_found || ''; + lastTimelineInputDateHintCount = data.trace_metadata?.input_date_hint_count ?? null; activeActorFilters = new Set(); timelineSearchTerm = ''; showSources = true; @@ -1728,6 +1730,9 @@ function applyTimelineFilters() { function renderTimeline(events, grouped = false) { if (!events.length) { + if (lastTimelineInputDateHintCount === 0) { + return '

No recognizable dates were found in the extracted text. Check that the upload is text-searchable, or paste the relevant dated section and run again.

'; + } return '

No matching events.

'; } const MONTH_NAMES = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']; diff --git a/includes/AzureOpenAiGateway.php b/includes/AzureOpenAiGateway.php index 5fd92fb..275bd13 100644 --- a/includes/AzureOpenAiGateway.php +++ b/includes/AzureOpenAiGateway.php @@ -149,13 +149,20 @@ final class DbnAzureOpenAiGateway public function decodeJsonObject(string $content): ?array { $content = trim($content); + $content = (string)preg_replace('/^```(?:json)?\s*\n?/i', '', $content); + $content = (string)preg_replace('/\n?```\s*$/', '', $content); + $content = trim($content); + $decoded = json_decode($content, true); if (is_array($decoded)) { return $decoded; } - if (preg_match('/\{(?:[^{}]|(?R))*\}/s', $content, $match)) { - $decoded = json_decode($match[0], true); + $start = strpos($content, '{'); + $end = strrpos($content, '}'); + if ($start !== false && $end !== false && $end > $start) { + $candidate = substr($content, $start, $end - $start + 1); + $decoded = json_decode($candidate, true); if (is_array($decoded)) { return $decoded; } diff --git a/includes/LegalTools.php b/includes/LegalTools.php index 4a112b0..2408d94 100644 --- a/includes/LegalTools.php +++ b/includes/LegalTools.php @@ -361,6 +361,7 @@ PROMPT; $onProgress && $onProgress("Preparing document\u{2026}"); $locale = dbnToolsLanguageName($language); + $inputDateHintCount = $this->timelineDateHintCount($text); $focusInstruction = match ($focus) { 'deadlines' => "\nFocus specifically on: legal deadlines, filing dates, response windows, appeal periods, and statutory time limits. Deprioritise narrative events with no legal deadline significance.", @@ -445,6 +446,42 @@ Return JSON only: } PROMPT; + if ($engine === 'nova_lite') { + $prompt = <<legalJsonSystemPrompt($language); $messages = [ ['role' => 'system', 'content' => $system], @@ -464,23 +501,45 @@ PROMPT; $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOptions); } } catch (Throwable $e) { - dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error'); + $msg = $e->getMessage(); + if (preg_match('/timed?\s*out|timeout|operation timed out/i', $msg)) { + dbnToolsAbort('The model timed out. Try Quick mode, a smaller file, or fewer selected documents.', 504, 'llm_timeout'); + } + dbnToolsAbort('LLM request failed: ' . $msg, 502, 'llm_error'); } $onProgress && $onProgress("Parsing events\u{2026}"); $raw = (string)($response['choices'][0]['message']['content'] ?? ''); - if ($engine === 'nova_lite') { - $raw = (string)preg_replace('/^```(?:json)?\s*\n?/m', '', $raw); - $raw = (string)preg_replace('/\n?```\s*$/m', '', $raw); - $raw = trim($raw); - } $json = $this->azure->decodeJsonObject($raw); if (!$json) { dbnToolsAbort('The selected engine did not return valid structured JSON.', 502, 'llm_invalid_json'); } $events = is_array($json['events'] ?? null) ? $json['events'] : []; + $usedFallbackExtractor = false; + if (!$events && $inputDateHintCount > 0) { + $fallbackEvents = $this->fallbackTimelineEvents($text); + if ($fallbackEvents) { + $events = $fallbackEvents; + $usedFallbackExtractor = true; + $uncertain = is_array($json['what_remains_uncertain'] ?? null) ? $json['what_remains_uncertain'] : []; + array_unshift($uncertain, 'The selected engine returned no events, so a deterministic date-line fallback extracted visible dated lines. Review these medium-confidence entries against the original file.'); + $json['what_remains_uncertain'] = $uncertain; + $json['what_we_found'] = count($events) . ' date-like event(s) extracted by fallback after the selected engine returned no events.'; + $json['next_practical_step'] = 'Review each fallback event against the original uploaded document and rerun with Standard or Deep if you need fuller actor/event interpretation.'; + } + } + if (!$events && $inputDateHintCount === 0) { + $json['what_we_found'] = (string)($json['what_we_found'] ?? 'No recognizable dates were found in the extracted text from this upload.'); + if (trim((string)$json['what_we_found']) === '') { + $json['what_we_found'] = 'No recognizable dates were found in the extracted text from this upload.'; + } + $json['next_practical_step'] = (string)($json['next_practical_step'] ?? 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.'); + if (trim((string)$json['next_practical_step']) === '') { + $json['next_practical_step'] = 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.'; + } + } // Post-filter: confidence if ($confidenceFilter === 'high_medium') { @@ -523,11 +582,117 @@ PROMPT; 'chunk_count' => count($events), 'source_count' => 1, 'deployment' => $engineLabel, + 'input_date_hint_count' => $inputDateHintCount, + 'used_fallback_extractor' => $usedFallbackExtractor, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } + private function timelineDateHintCount(string $text): int + { + preg_match_all('/(?= 80) { + break; + } + $line = trim((string)preg_replace('/\s+/u', ' ', $line)); + if ($line === '') { + continue; + } + if (preg_match('/\b(20\d{2}|19\d{2})\b/u', $line, $ym)) { + $lastYear = (int)$ym[1]; + } + if (!preg_match_all('/(?= 80) { + break 2; + } + $day = (int)$m[1][0]; + $month = (int)$m[2][0]; + if ($day < 1 || $day > 31 || $month < 1 || $month > 12) { + continue; + } + $yearRaw = $m[3][0] ?? ''; + $year = null; + if ($yearRaw !== '') { + $year = strlen($yearRaw) === 2 ? 2000 + (int)$yearRaw : (int)$yearRaw; + $lastYear = $year; + } elseif ($lastYear !== null) { + $year = $lastYear; + } + $date = $year !== null + ? sprintf('%04d-%02d-%02d', $year, $month, $day) + : sprintf('%02d.%02d. (year unknown)', $day, $month); + $time = null; + if (preg_match('/\bkl\.?\s*(\d{1,2})[:.](\d{2})\b|\b(\d{1,2}):(\d{2})\b/u', $line, $tm)) { + $hour = (int)($tm[1] !== '' ? $tm[1] : $tm[3]); + $min = (int)($tm[2] !== '' ? $tm[2] : $tm[4]); + if ($hour >= 0 && $hour <= 23 && $min >= 0 && $min <= 59) { + $time = sprintf('%02d:%02d', $hour, $min); + } + } + $eventText = trim(preg_replace('/^\s*[-*#\s]*/u', '', $line)); + $eventText = trim(preg_replace('/^' . preg_quote($m[0][0], '/') . '\s*(?:kl\.?\s*\d{1,2}[:.]\d{2})?\s*[:\-–—]?\s*/u', '', $eventText)); + if ($eventText === '') { + $eventText = 'Dated event found in uploaded text.'; + } + $events[] = [ + 'date' => $date, + 'end_date' => null, + 'time' => $time, + 'date_type' => $year !== null ? 'absolute' : 'relative', + 'actor' => $this->fallbackTimelineActor($line), + 'event' => mb_substr($eventText, 0, 240, 'UTF-8'), + 'source_excerpt' => mb_substr($line, 0, 300, 'UTF-8'), + 'confidence' => 'medium', + ]; + } + } + + usort($events, static function (array $a, array $b): int { + $ad = (string)($a['date'] ?? ''); + $bd = (string)($b['date'] ?? ''); + $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99'; + $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99'; + return strcmp($ai, $bi); + }); + + return $events; + } + + private function fallbackTimelineActor(string $line): string + { + $actors = [ + '/barnevern(?:s?tjenesten)?|bv\b/iu' => 'Barnevernstjenesten', + '/fylkesnemnda/iu' => 'Fylkesnemnda', + '/statsforvalter(?:en)?/iu' => 'Statsforvalteren', + '/tingrett/iu' => 'Tingrett', + '/lagmannsrett/iu' => 'Lagmannsrett', + '/høyesterett|høyesterett/iu' => 'Høyesterett', + '/\bnav\b/iu' => 'NAV', + '/\bbup\b/iu' => 'BUP', + '/\bppt\b/iu' => 'PPT', + ]; + foreach ($actors as $pattern => $actor) { + if (preg_match($pattern, $line)) { + return $actor; + } + } + return 'unknown'; + } + public function redact( string $text, string $mode = 'standard',