Harden timeline quick extraction

This commit is contained in:
2026-05-25 11:14:21 +02:00
parent 983c423740
commit 3ad8f4843c
3 changed files with 185 additions and 8 deletions
+5
View File
@@ -388,6 +388,7 @@ const TIMELINE_I18N = {
let lastTimelineEvents = [];
let lastTimelineEventsOriginal = [];
let lastTimelineWhatWeFound = '';
let lastTimelineInputDateHintCount = null;
let activeActorFilters = new Set();
let timelineSearchTerm = '';
let showSources = true;
@@ -1602,6 +1603,7 @@ function renderMainFinding(data) {
lastTimelineEventsOriginal = data.events || [];
lastTimelineEvents = [...lastTimelineEventsOriginal];
lastTimelineWhatWeFound = data.what_we_found || '';
lastTimelineInputDateHintCount = data.trace_metadata?.input_date_hint_count ?? null;
activeActorFilters = new Set();
timelineSearchTerm = '';
showSources = true;
@@ -1728,6 +1730,9 @@ function applyTimelineFilters() {
function renderTimeline(events, grouped = false) {
if (!events.length) {
if (lastTimelineInputDateHintCount === 0) {
return '<p class="timeline-empty">No recognizable dates were found in the extracted text. Check that the upload is text-searchable, or paste the relevant dated section and run again.</p>';
}
return '<p class="timeline-empty">No matching events.</p>';
}
const MONTH_NAMES = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'];
+9 -2
View File
@@ -149,13 +149,20 @@ final class DbnAzureOpenAiGateway
public function decodeJsonObject(string $content): ?array
{
$content = trim($content);
$content = (string)preg_replace('/^```(?:json)?\s*\n?/i', '', $content);
$content = (string)preg_replace('/\n?```\s*$/', '', $content);
$content = trim($content);
$decoded = json_decode($content, true);
if (is_array($decoded)) {
return $decoded;
}
if (preg_match('/\{(?:[^{}]|(?R))*\}/s', $content, $match)) {
$decoded = json_decode($match[0], true);
$start = strpos($content, '{');
$end = strrpos($content, '}');
if ($start !== false && $end !== false && $end > $start) {
$candidate = substr($content, $start, $end - $start + 1);
$decoded = json_decode($candidate, true);
if (is_array($decoded)) {
return $decoded;
}
+171 -6
View File
@@ -361,6 +361,7 @@ PROMPT;
$onProgress && $onProgress("Preparing document\u{2026}");
$locale = dbnToolsLanguageName($language);
$inputDateHintCount = $this->timelineDateHintCount($text);
$focusInstruction = match ($focus) {
'deadlines' => "\nFocus specifically on: legal deadlines, filing dates, response windows, appeal periods, and statutory time limits. Deprioritise narrative events with no legal deadline significance.",
@@ -445,6 +446,42 @@ Return JSON only:
}
PROMPT;
if ($engine === 'nova_lite') {
$prompt = <<<PROMPT
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
Extract dated lines and temporal references from uploaded or pasted case text. Focus on recall: if a line starts with or contains a date, include it as an event.{$focusInstruction}{$backgroundInstruction}{$relativeInstruction}
Recognise Norwegian formats:
- DD.MM.YYYY, DD.MM.YY, D.M.YY
- DD.MM. or D.M. without a year; infer the nearest year from nearby text when possible
- "den DD. month YYYY" and Norwegian month names
- optional times such as "kl. 09.00" or "14:30"
For every event return:
- date as YYYY-MM-DD when determinable, otherwise a short human-readable date
- end_date as null unless the source states a period
- time as HH:MM or null
- date_type: absolute, relative, recurring, conditional, or period
- actor: the named person/institution or "unknown"
- event: concise description
- source_excerpt: the exact source words that show the date and event
- confidence: high, medium, or low
Pasted text:
{$text}
Return JSON only:
{
"what_we_found": "event count, date range, main actors, notable gaps",
"events": [{"date":"YYYY-MM-DD","end_date":null,"time":null,"date_type":"absolute","actor":"unknown","event":"...","source_excerpt":"...","confidence":"high"}],
"evidence_trail": [{"title":"Pasted text","excerpt":"Processed in-memory only; not stored."}],
"what_remains_uncertain": [],
"next_practical_step": "..."
}
PROMPT;
}
$system = $this->legalJsonSystemPrompt($language);
$messages = [
['role' => 'system', 'content' => $system],
@@ -464,23 +501,45 @@ PROMPT;
$response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOptions);
}
} catch (Throwable $e) {
dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
$msg = $e->getMessage();
if (preg_match('/timed?\s*out|timeout|operation timed out/i', $msg)) {
dbnToolsAbort('The model timed out. Try Quick mode, a smaller file, or fewer selected documents.', 504, 'llm_timeout');
}
dbnToolsAbort('LLM request failed: ' . $msg, 502, 'llm_error');
}
$onProgress && $onProgress("Parsing events\u{2026}");
$raw = (string)($response['choices'][0]['message']['content'] ?? '');
if ($engine === 'nova_lite') {
$raw = (string)preg_replace('/^```(?:json)?\s*\n?/m', '', $raw);
$raw = (string)preg_replace('/\n?```\s*$/m', '', $raw);
$raw = trim($raw);
}
$json = $this->azure->decodeJsonObject($raw);
if (!$json) {
dbnToolsAbort('The selected engine did not return valid structured JSON.', 502, 'llm_invalid_json');
}
$events = is_array($json['events'] ?? null) ? $json['events'] : [];
$usedFallbackExtractor = false;
if (!$events && $inputDateHintCount > 0) {
$fallbackEvents = $this->fallbackTimelineEvents($text);
if ($fallbackEvents) {
$events = $fallbackEvents;
$usedFallbackExtractor = true;
$uncertain = is_array($json['what_remains_uncertain'] ?? null) ? $json['what_remains_uncertain'] : [];
array_unshift($uncertain, 'The selected engine returned no events, so a deterministic date-line fallback extracted visible dated lines. Review these medium-confidence entries against the original file.');
$json['what_remains_uncertain'] = $uncertain;
$json['what_we_found'] = count($events) . ' date-like event(s) extracted by fallback after the selected engine returned no events.';
$json['next_practical_step'] = 'Review each fallback event against the original uploaded document and rerun with Standard or Deep if you need fuller actor/event interpretation.';
}
}
if (!$events && $inputDateHintCount === 0) {
$json['what_we_found'] = (string)($json['what_we_found'] ?? 'No recognizable dates were found in the extracted text from this upload.');
if (trim((string)$json['what_we_found']) === '') {
$json['what_we_found'] = 'No recognizable dates were found in the extracted text from this upload.';
}
$json['next_practical_step'] = (string)($json['next_practical_step'] ?? 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.');
if (trim((string)$json['next_practical_step']) === '') {
$json['next_practical_step'] = 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.';
}
}
// Post-filter: confidence
if ($confidenceFilter === 'high_medium') {
@@ -523,11 +582,117 @@ PROMPT;
'chunk_count' => count($events),
'source_count' => 1,
'deployment' => $engineLabel,
'input_date_hint_count' => $inputDateHintCount,
'used_fallback_extractor' => $usedFallbackExtractor,
],
'disclaimer' => dbnToolsDisclaimer($language),
];
}
/**
 * Count date-like tokens in the input text.
 *
 * Two shapes are counted:
 *  - numeric dates such as "12.03.2024", "12.03.24" or the year-less "12.03."
 *  - a day number followed by a Norwegian month name, e.g. "12. mars"
 *
 * This is a cheap hint, not a validation: day/month ranges are not checked.
 *
 * @param string $text Extracted or pasted document text.
 * @return int Number of date-like matches (0 when none are found).
 */
private function timelineDateHintCount(string $text): int
{
    // Patterns are stored without modifiers so they can be tried in UTF-8
    // mode first and, if that fails, again in byte mode.
    $patterns = [
        ['/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/', ''],
        ['/\b\d{1,2}\.\s*(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)\b/', 'i'],
    ];

    $total = 0;
    foreach ($patterns as [$pattern, $modifiers]) {
        $found = preg_match_all($pattern . $modifiers . 'u', $text);
        if ($found === false) {
            // preg_match_all() returns false when the subject is not valid
            // UTF-8 under /u. Reporting that as "0 dates" would mislead the
            // caller, so retry without /u; the patterns only need ASCII
            // digits, dots and month names.
            $found = preg_match_all($pattern . $modifiers, $text);
        }
        $total += (int)$found;
    }

    return $total;
}
/**
 * Deterministic date-line extractor used when the model returns no events.
 *
 * Scans the text line by line for numeric dates (D.M., D.M.YY, D.M.YYYY) and
 * turns every calendar-valid match into a medium-confidence event whose
 * description is the line itself. At most 80 events are produced.
 *
 * Year handling:
 *  - an explicit four-digit year must lie in 1900-2099 (the same range the
 *    line-level year scan recognises); anything else is not treated as a date
 *  - a two-digit year is read as 20YY unless that would be more than ten
 *    years in the future, in which case it is read as 19YY
 *  - a year-less date inherits the most recent year seen so far; if no year
 *    is known, or the inherited year makes the date impossible (29.02. in a
 *    non-leap year), the date is emitted as "DD.MM. (year unknown)"
 *
 * @param string $text Extracted or pasted document text.
 * @return array<int, array<string, mixed>> Events sorted by ISO date, with
 *         year-unknown entries last. Keys: date, end_date, time, date_type,
 *         actor, event, source_excerpt, confidence.
 */
private function fallbackTimelineEvents(string $text): array
{
    $maxEvents = 80;
    // Latest year a two-digit year may resolve to before it is read as 19YY.
    $latestTwoDigitYear = (int)date('Y') + 10;

    $lines = preg_split('/\R/u', $text) ?: [];
    $events = [];
    $lastYear = null;

    foreach ($lines as $line) {
        if (count($events) >= $maxEvents) {
            break;
        }
        $line = trim((string)preg_replace('/\s+/u', ' ', $line));
        if ($line === '') {
            continue;
        }
        // Remember the first 19xx/20xx year on the line as context for
        // year-less dates on this and following lines.
        if (preg_match('/\b(20\d{2}|19\d{2})\b/u', $line, $ym)) {
            $lastYear = (int)$ym[1];
        }
        if (!preg_match_all('/(?<!\d)(\d{1,2})\.(\d{1,2})\.(?:(\d{2,4}))?(?!\d)/u', $line, $matches, PREG_SET_ORDER)) {
            continue;
        }

        // Time, bullet-stripped text and actor depend only on the line, so
        // they are computed once rather than once per date match.
        $time = null;
        if (preg_match('/\bkl\.?\s*(\d{1,2})[:.](\d{2})\b|\b(\d{1,2}):(\d{2})\b/u', $line, $tm)) {
            // Groups 1/2 belong to the "kl." form, 3/4 to the bare HH:MM form.
            $hour = (int)($tm[1] !== '' ? $tm[1] : $tm[3]);
            $min = (int)($tm[2] !== '' ? $tm[2] : $tm[4]);
            if ($hour <= 23 && $min <= 59) {
                $time = sprintf('%02d:%02d', $hour, $min);
            }
        }
        // preg_replace() can return null on a PCRE error; cast before trim().
        $unbulleted = trim((string)preg_replace('/^\s*[-*#\s]*/u', '', $line));
        $actor = $this->fallbackTimelineActor($line);

        foreach ($matches as $m) {
            if (count($events) >= $maxEvents) {
                break 2;
            }
            $day = (int)$m[1];
            $month = (int)$m[2];
            // The optional year group is absent from the match array when it
            // did not participate.
            $yearRaw = $m[3] ?? '';
            $hasExplicitYear = $yearRaw !== '';

            $year = null;
            if ($hasExplicitYear) {
                $year = (int)$yearRaw;
                if (strlen($yearRaw) === 2) {
                    $year += 2000;
                    if ($year > $latestTwoDigitYear) {
                        $year -= 100;
                    }
                } elseif ($year < 1900 || $year > 2099) {
                    // Three-digit or otherwise implausible year: not a date.
                    continue;
                }
            } elseif ($lastYear !== null) {
                $year = $lastYear;
            }

            if ($year !== null && !checkdate($month, $day, $year)) {
                if ($hasExplicitYear) {
                    // Explicit but impossible calendar date (e.g. 30.02.2024).
                    continue;
                }
                // The inherited year does not fit; keep the date without it.
                $year = null;
            }
            // Year-less dates are validated against a leap year so 29.02. is kept.
            if ($year === null && !checkdate($month, $day, 2000)) {
                continue;
            }
            if ($hasExplicitYear) {
                $lastYear = $year;
            }

            $date = $year !== null
                ? sprintf('%04d-%02d-%02d', $year, $month, $day)
                : sprintf('%02d.%02d. (year unknown)', $day, $month);

            // Drop a leading "<date> [kl. HH:MM] [:-]" prefix from the description.
            $eventText = trim((string)preg_replace('/^' . preg_quote($m[0], '/') . '\s*(?:kl\.?\s*\d{1,2}[:.]\d{2})?\s*[:\-–—]?\s*/u', '', $unbulleted));
            if ($eventText === '') {
                $eventText = 'Dated event found in uploaded text.';
            }

            $events[] = [
                'date' => $date,
                'end_date' => null,
                'time' => $time,
                'date_type' => $year !== null ? 'absolute' : 'relative',
                'actor' => $actor,
                'event' => mb_substr($eventText, 0, 240, 'UTF-8'),
                'source_excerpt' => mb_substr($line, 0, 300, 'UTF-8'),
                'confidence' => 'medium',
            ];
        }
    }

    // ISO dates sort lexicographically; anything else is pushed to the end.
    usort($events, static function (array $a, array $b): int {
        $ad = (string)($a['date'] ?? '');
        $bd = (string)($b['date'] ?? '');
        $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99';
        $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99';
        return strcmp($ai, $bi);
    });

    return $events;
}
/**
 * Guess which institution a line refers to, using keyword matching.
 *
 * Patterns are tried in the order listed and the first match wins, so a line
 * naming several institutions is attributed to the earliest entry in the table.
 *
 * @param string $line A single line of source text.
 * @return string Canonical actor name, or "unknown" when no keyword matches.
 */
private function fallbackTimelineActor(string $line): string
{
    $actors = [
        // "bv" is bounded on both sides, like the other abbreviations below,
        // so it no longer matches the tail of a longer token.
        '/barnevern(?:s?tjenesten)?|\bbv\b/iu' => 'Barnevernstjenesten',
        '/fylkesnemnda/iu' => 'Fylkesnemnda',
        '/statsforvalter(?:en)?/iu' => 'Statsforvalteren',
        '/tingrett/iu' => 'Tingrett',
        '/lagmannsrett/iu' => 'Lagmannsrett',
        // Also accepts the ASCII spelling "hoyesterett".
        '/h[øo]yesterett/iu' => 'Høyesterett',
        '/\bnav\b/iu' => 'NAV',
        '/\bbup\b/iu' => 'BUP',
        '/\bppt\b/iu' => 'PPT',
    ];

    foreach ($actors as $pattern => $actor) {
        if (preg_match($pattern, $line) === 1) {
            return $actor;
        }
    }

    return 'unknown';
}
public function redact(
string $text,
string $mode = 'standard',