Harden timeline quick extraction
This commit is contained in:
@@ -388,6 +388,7 @@ const TIMELINE_I18N = {
|
||||
let lastTimelineEvents = [];
|
||||
let lastTimelineEventsOriginal = [];
|
||||
let lastTimelineWhatWeFound = '';
|
||||
let lastTimelineInputDateHintCount = null;
|
||||
let activeActorFilters = new Set();
|
||||
let timelineSearchTerm = '';
|
||||
let showSources = true;
|
||||
@@ -1602,6 +1603,7 @@ function renderMainFinding(data) {
|
||||
lastTimelineEventsOriginal = data.events || [];
|
||||
lastTimelineEvents = [...lastTimelineEventsOriginal];
|
||||
lastTimelineWhatWeFound = data.what_we_found || '';
|
||||
lastTimelineInputDateHintCount = data.trace_metadata?.input_date_hint_count ?? null;
|
||||
activeActorFilters = new Set();
|
||||
timelineSearchTerm = '';
|
||||
showSources = true;
|
||||
@@ -1728,6 +1730,9 @@ function applyTimelineFilters() {
|
||||
|
||||
function renderTimeline(events, grouped = false) {
|
||||
if (!events.length) {
|
||||
if (lastTimelineInputDateHintCount === 0) {
|
||||
return '<p class="timeline-empty">No recognizable dates were found in the extracted text. Check that the upload is text-searchable, or paste the relevant dated section and run again.</p>';
|
||||
}
|
||||
return '<p class="timeline-empty">No matching events.</p>';
|
||||
}
|
||||
const MONTH_NAMES = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'];
|
||||
|
||||
@@ -149,13 +149,20 @@ final class DbnAzureOpenAiGateway
|
||||
public function decodeJsonObject(string $content): ?array
|
||||
{
|
||||
$content = trim($content);
|
||||
$content = (string)preg_replace('/^```(?:json)?\s*\n?/i', '', $content);
|
||||
$content = (string)preg_replace('/\n?```\s*$/', '', $content);
|
||||
$content = trim($content);
|
||||
|
||||
$decoded = json_decode($content, true);
|
||||
if (is_array($decoded)) {
|
||||
return $decoded;
|
||||
}
|
||||
|
||||
if (preg_match('/\{(?:[^{}]|(?R))*\}/s', $content, $match)) {
|
||||
$decoded = json_decode($match[0], true);
|
||||
$start = strpos($content, '{');
|
||||
$end = strrpos($content, '}');
|
||||
if ($start !== false && $end !== false && $end > $start) {
|
||||
$candidate = substr($content, $start, $end - $start + 1);
|
||||
$decoded = json_decode($candidate, true);
|
||||
if (is_array($decoded)) {
|
||||
return $decoded;
|
||||
}
|
||||
|
||||
+171
-6
@@ -361,6 +361,7 @@ PROMPT;
|
||||
$onProgress && $onProgress("Preparing document\u{2026}");
|
||||
|
||||
$locale = dbnToolsLanguageName($language);
|
||||
$inputDateHintCount = $this->timelineDateHintCount($text);
|
||||
|
||||
$focusInstruction = match ($focus) {
|
||||
'deadlines' => "\nFocus specifically on: legal deadlines, filing dates, response windows, appeal periods, and statutory time limits. Deprioritise narrative events with no legal deadline significance.",
|
||||
@@ -445,6 +446,42 @@ Return JSON only:
|
||||
}
|
||||
PROMPT;
|
||||
|
||||
if ($engine === 'nova_lite') {
|
||||
$prompt = <<<PROMPT
|
||||
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
|
||||
|
||||
Extract dated lines and temporal references from uploaded or pasted case text. Focus on recall: if a line starts with or contains a date, include it as an event.{$focusInstruction}{$backgroundInstruction}{$relativeInstruction}
|
||||
|
||||
Recognise Norwegian formats:
|
||||
- DD.MM.YYYY, DD.MM.YY, D.M.YY
|
||||
- DD.MM. or D.M. without a year; infer the nearest year from nearby text when possible
|
||||
- "den DD. month YYYY" and Norwegian month names
|
||||
- optional times such as "kl. 09.00" or "14:30"
|
||||
|
||||
For every event return:
|
||||
- date as YYYY-MM-DD when determinable, otherwise a short human-readable date
|
||||
- end_date as null unless the source states a period
|
||||
- time as HH:MM or null
|
||||
- date_type: absolute, relative, recurring, conditional, or period
|
||||
- actor: the named person/institution or "unknown"
|
||||
- event: concise description
|
||||
- source_excerpt: the exact source words that show the date and event
|
||||
- confidence: high, medium, or low
|
||||
|
||||
Pasted text:
|
||||
{$text}
|
||||
|
||||
Return JSON only:
|
||||
{
|
||||
"what_we_found": "event count, date range, main actors, notable gaps",
|
||||
"events": [{"date":"YYYY-MM-DD","end_date":null,"time":null,"date_type":"absolute","actor":"unknown","event":"...","source_excerpt":"...","confidence":"high"}],
|
||||
"evidence_trail": [{"title":"Pasted text","excerpt":"Processed in-memory only; not stored."}],
|
||||
"what_remains_uncertain": [],
|
||||
"next_practical_step": "..."
|
||||
}
|
||||
PROMPT;
|
||||
}
|
||||
|
||||
$system = $this->legalJsonSystemPrompt($language);
|
||||
$messages = [
|
||||
['role' => 'system', 'content' => $system],
|
||||
@@ -464,23 +501,45 @@ PROMPT;
|
||||
$response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOptions);
|
||||
}
|
||||
} catch (Throwable $e) {
|
||||
dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
|
||||
$msg = $e->getMessage();
|
||||
if (preg_match('/timed?\s*out|timeout|operation timed out/i', $msg)) {
|
||||
dbnToolsAbort('The model timed out. Try Quick mode, a smaller file, or fewer selected documents.', 504, 'llm_timeout');
|
||||
}
|
||||
dbnToolsAbort('LLM request failed: ' . $msg, 502, 'llm_error');
|
||||
}
|
||||
|
||||
$onProgress && $onProgress("Parsing events\u{2026}");
|
||||
|
||||
$raw = (string)($response['choices'][0]['message']['content'] ?? '');
|
||||
if ($engine === 'nova_lite') {
|
||||
$raw = (string)preg_replace('/^```(?:json)?\s*\n?/m', '', $raw);
|
||||
$raw = (string)preg_replace('/\n?```\s*$/m', '', $raw);
|
||||
$raw = trim($raw);
|
||||
}
|
||||
$json = $this->azure->decodeJsonObject($raw);
|
||||
if (!$json) {
|
||||
dbnToolsAbort('The selected engine did not return valid structured JSON.', 502, 'llm_invalid_json');
|
||||
}
|
||||
|
||||
$events = is_array($json['events'] ?? null) ? $json['events'] : [];
|
||||
$usedFallbackExtractor = false;
|
||||
if (!$events && $inputDateHintCount > 0) {
|
||||
$fallbackEvents = $this->fallbackTimelineEvents($text);
|
||||
if ($fallbackEvents) {
|
||||
$events = $fallbackEvents;
|
||||
$usedFallbackExtractor = true;
|
||||
$uncertain = is_array($json['what_remains_uncertain'] ?? null) ? $json['what_remains_uncertain'] : [];
|
||||
array_unshift($uncertain, 'The selected engine returned no events, so a deterministic date-line fallback extracted visible dated lines. Review these medium-confidence entries against the original file.');
|
||||
$json['what_remains_uncertain'] = $uncertain;
|
||||
$json['what_we_found'] = count($events) . ' date-like event(s) extracted by fallback after the selected engine returned no events.';
|
||||
$json['next_practical_step'] = 'Review each fallback event against the original uploaded document and rerun with Standard or Deep if you need fuller actor/event interpretation.';
|
||||
}
|
||||
}
|
||||
if (!$events && $inputDateHintCount === 0) {
|
||||
$json['what_we_found'] = (string)($json['what_we_found'] ?? 'No recognizable dates were found in the extracted text from this upload.');
|
||||
if (trim((string)$json['what_we_found']) === '') {
|
||||
$json['what_we_found'] = 'No recognizable dates were found in the extracted text from this upload.';
|
||||
}
|
||||
$json['next_practical_step'] = (string)($json['next_practical_step'] ?? 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.');
|
||||
if (trim((string)$json['next_practical_step']) === '') {
|
||||
$json['next_practical_step'] = 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.';
|
||||
}
|
||||
}
|
||||
|
||||
// Post-filter: confidence
|
||||
if ($confidenceFilter === 'high_medium') {
|
||||
@@ -523,11 +582,117 @@ PROMPT;
|
||||
'chunk_count' => count($events),
|
||||
'source_count' => 1,
|
||||
'deployment' => $engineLabel,
|
||||
'input_date_hint_count' => $inputDateHintCount,
|
||||
'used_fallback_extractor' => $usedFallbackExtractor,
|
||||
],
|
||||
'disclaimer' => dbnToolsDisclaimer($language),
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Counts date-like hints in the raw input text.
 *
 * Two kinds of hint are counted and summed:
 *  - numeric dates such as "12.03.2024", "1.2.24" or the yearless "12.03."
 *  - a day number followed by a Norwegian month name, e.g. "12. mars"
 *
 * @param string $text Text to scan.
 *
 * @return int Total number of matches across both patterns.
 */
private function timelineDateHintCount(string $text): int
{
    $hintPatterns = [
        '/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/u',
        '/\b\d{1,2}\.\s*(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)\b/iu',
    ];

    $total = 0;
    foreach ($hintPatterns as $pattern) {
        $found = [];
        preg_match_all($pattern, $text, $found);
        // Index 0 holds the full-pattern matches; fall back to an empty list if it is absent.
        $total += count($found[0] ?? []);
    }

    return $total;
}
|
||||
|
||||
/**
 * Deterministic fallback extractor: turns lines containing numeric dates
 * (DD.MM.YYYY, DD.MM.YY, or yearless DD.MM.) into timeline events.
 *
 * Behaviour:
 *  - At most 80 events are produced.
 *  - A four-digit 19xx/20xx year seen on a line, or an explicit year on a
 *    matched date, is remembered and reused for later yearless dates.
 *  - Two-digit years are read as 20xx unless that would land more than
 *    20 years in the future, in which case they are read as 19xx.
 *  - Matches with a three-digit "year" or an impossible calendar date
 *    (e.g. 31.02.) are skipped.
 *  - The first clock time found on the line ("kl. 09.00" or "14:30") is
 *    attached to every event taken from that line.
 *  - Events are sorted by ISO date; entries without a full ISO date sort last.
 *
 * @param string $text Text to scan line by line.
 *
 * @return list<array{date:string,end_date:null,time:?string,date_type:string,actor:string,event:string,source_excerpt:string,confidence:string}>
 */
private function fallbackTimelineEvents(string $text): array
{
    $maxEvents = 80;
    // Two-digit years resolving beyond this year are treated as 19xx.
    $centuryPivot = (int)date('Y') + 20;

    $lines = preg_split('/\R/u', $text) ?: [];
    $events = [];
    $lastYear = null;

    foreach ($lines as $line) {
        if (count($events) >= $maxEvents) {
            break;
        }

        // Collapse runs of whitespace so excerpts and prefix stripping are predictable.
        $line = trim((string)preg_replace('/\s+/u', ' ', $line));
        if ($line === '') {
            continue;
        }

        // Remember any explicit four-digit year so yearless dates nearby can reuse it.
        if (preg_match('/\b(20\d{2}|19\d{2})\b/u', $line, $ym)) {
            $lastYear = (int)$ym[1];
        }

        if (!preg_match_all('/(?<!\d)(\d{1,2})\.(\d{1,2})\.(?:(\d{2,4}))?(?!\d)/u', $line, $matches, PREG_SET_ORDER)) {
            continue;
        }

        // Everything below up to the inner loop depends only on the line, not on the individual match.
        $time = null;
        if (preg_match('/\bkl\.?\s*(\d{1,2})[:.](\d{2})\b|\b(\d{1,2}):(\d{2})\b/u', $line, $tm)) {
            // Groups 1/2 belong to the "kl." form, groups 3/4 to the plain HH:MM form.
            $viaKl = ($tm[1] ?? '') !== '';
            $hour = (int)($viaKl ? $tm[1] : ($tm[3] ?? ''));
            $min = (int)($viaKl ? $tm[2] : ($tm[4] ?? ''));
            if ($hour <= 23 && $min <= 59) {
                $time = sprintf('%02d:%02d', $hour, $min);
            }
        }
        $actor = $this->fallbackTimelineActor($line);
        $excerpt = mb_substr($line, 0, 300, 'UTF-8');
        // Strip leading list/heading markers ("-", "*", "#") before building the event text.
        $lineBody = trim((string)preg_replace('/^\s*[-*#\s]*/u', '', $line));

        foreach ($matches as $m) {
            if (count($events) >= $maxEvents) {
                break 2;
            }

            $day = (int)$m[1];
            $month = (int)$m[2];
            // Trailing unmatched groups are omitted by PCRE, hence the default.
            $yearRaw = $m[3] ?? '';
            $year = null;

            if ($yearRaw !== '') {
                $digits = strlen($yearRaw);
                if ($digits === 3) {
                    // "12.03.202" is not a usable year; emitting 0202-03-12 would be misleading.
                    continue;
                }
                $year = (int)$yearRaw;
                if ($digits === 2) {
                    $year += 2000;
                    if ($year > $centuryPivot) {
                        $year -= 100;
                    }
                }
            } elseif ($lastYear !== null && checkdate($month, $day, $lastYear)) {
                // Only borrow the remembered year when it yields a real calendar date.
                $year = $lastYear;
            }

            // Validate against the resolved year, or a leap year when the year is unknown
            // so that a yearless 29.02. is still accepted.
            if (!checkdate($month, $day, $year ?? 2000)) {
                continue;
            }
            if ($yearRaw !== '') {
                $lastYear = $year;
            }

            $date = $year !== null
                ? sprintf('%04d-%02d-%02d', $year, $month, $day)
                : sprintf('%02d.%02d. (year unknown)', $day, $month);

            // Drop the date (and an optional "kl. HH.MM" plus separator) when it leads the line.
            $eventText = trim((string)preg_replace(
                '/^' . preg_quote($m[0], '/') . '\s*(?:kl\.?\s*\d{1,2}[:.]\d{2})?\s*[:\-–—]?\s*/u',
                '',
                $lineBody
            ));
            if ($eventText === '') {
                $eventText = 'Dated event found in uploaded text.';
            }

            $events[] = [
                'date' => $date,
                'end_date' => null,
                'time' => $time,
                'date_type' => $year !== null ? 'absolute' : 'relative',
                'actor' => $actor,
                'event' => mb_substr($eventText, 0, 240, 'UTF-8'),
                'source_excerpt' => $excerpt,
                'confidence' => 'medium',
            ];
        }
    }

    usort($events, static function (array $a, array $b): int {
        $ad = (string)($a['date'] ?? '');
        $bd = (string)($b['date'] ?? '');
        // Anything that is not a full ISO date gets a sentinel that sorts after every real date.
        $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99';
        $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99';
        return strcmp($ai, $bi);
    });

    return $events;
}
|
||||
|
||||
/**
 * Guesses the acting institution mentioned on a line.
 *
 * Patterns are tried in the order listed and the first one that matches
 * anywhere in the line wins. Abbreviations (BV, NAV, BUP, PPT) must appear
 * as whole words so they are not picked up inside unrelated tokens.
 *
 * @param string $line A single line of source text.
 *
 * @return string Canonical actor name, or "unknown" when nothing matches.
 */
private function fallbackTimelineActor(string $line): string
{
    $actors = [
        '/barnevern(?:s?tjenesten)?|\bbv\b/iu' => 'Barnevernstjenesten',
        '/fylkesnemnda/iu' => 'Fylkesnemnda',
        '/statsforvalter(?:en)?/iu' => 'Statsforvalteren',
        '/tingrett/iu' => 'Tingrett',
        '/lagmannsrett/iu' => 'Lagmannsrett',
        '/høyesterett/iu' => 'Høyesterett',
        '/\bnav\b/iu' => 'NAV',
        '/\bbup\b/iu' => 'BUP',
        '/\bppt\b/iu' => 'PPT',
    ];

    foreach ($actors as $pattern => $actor) {
        // preg_match() returns 1 on a match; 0 or false (error) both fall through.
        if (preg_match($pattern, $line) === 1) {
            return $actor;
        }
    }

    return 'unknown';
}
|
||||
|
||||
public function redact(
|
||||
string $text,
|
||||
string $mode = 'standard',
|
||||
|
||||
Reference in New Issue
Block a user