From e32ee60e780e6f7566da9856f62c61ad74cdd2b0 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Mon, 18 May 2026 07:11:31 +0200 Subject: [PATCH] =?UTF-8?q?feat(timeline):=20tighten=20prompt=20for=20accu?= =?UTF-8?q?racy=20=E2=80=94=20year=20inference,=20month=20names,=20actor?= =?UTF-8?q?=20normalization,=20confidence=20calibration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add 4-step year inference rule for DD.MM. entries (scan backward/forward for anchor year) - Add Norwegian month-name formats (18. september, den 18. september 2025, etc.) with month lookup table - Add $relativeInstruction to tell LLM upfront when relative dates are excluded (not just PHP-filtered post-hoc) - Define confidence calibration criteria explicitly (high/medium/low) - Improve source_excerpt guidance: most diagnostic phrase, not just any verbatim phrase - Add actor normalization for Norwegian institutions (Barnevernstjenesten, Fylkesnemnda, Statsforvalteren, etc.) - Add deduplication rule for events appearing across multiple documents - Add end_date field for date_type=period events - Improve what_we_found schema hint to require count/range/actors/gaps - Increase max_tokens to 8000 for azure_full (gpt-4o) to avoid truncation on large documents - Tighten system prompt with Norwegian CPS legal chain context Co-Authored-By: Claude Sonnet 4.6 --- includes/LegalTools.php | 112 ++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 34 deletions(-) diff --git a/includes/LegalTools.php b/includes/LegalTools.php index b60a46a..ecef6e7 100644 --- a/includes/LegalTools.php +++ b/includes/LegalTools.php @@ -312,45 +312,68 @@ PROMPT; ? "\nAlso extract BACKGROUND and NARRATIVE events: dates embedded in contextual paragraphs, historical facts, year-only references, and approximate years (e.g. \"rundt 2011/2012\", \"David ble født den 30.07.2015\", \"familien i 2015\"). These are valid timeline events even when they appear in introductory or background text — do NOT skip them." : "\nDo NOT include purely historical background or narrative context dates. Focus only on operational events, deadlines, and milestones that are directly actionable in the case."; + $relativeInstruction = $includeRelative + ? '' + : "\nDo NOT extract relative, recurring, or conditional date references — extract only events with determinable absolute dates (date_type=absolute)."; + $prompt = << 'system', 'content' => $system], ['role' => 'user', 'content' => $prompt], ]; - $chatOptions = ['json' => true, 'temperature' => 0.1, 'max_tokens' => 4000, 'timeout' => 120]; + $chatOptions = ['json' => true, 'temperature' => 0.1, 'max_tokens' => ($engine === 'azure_full' ? 8000 : 4000), 'timeout' => 120]; $deployLabel = $this->azure->chatDeployment(); try { @@ -519,7 +542,15 @@ PROMPT; if (!preg_match('/^\[[A-Za-z0-9_\- ]+(?::\s*[^\]]+)?\]$/', $tag)) { $tag = '[IDENTIFIER]'; } - if (str_contains($finalRedacted, $original)) { + // Try word-boundary match first to avoid partial-word substitutions (e.g. "Per" inside "Persson") + $escaped = preg_quote($original, '/'); + $replaced = preg_replace('/\b' . $escaped . '\b/u', $tag, $finalRedacted); + if ($replaced !== null && $replaced !== $finalRedacted) { + $finalRedacted = $replaced; + $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1; + $applied++; + } elseif (str_contains($finalRedacted, $original)) { + // Fallback for names adjacent to punctuation or non-word characters $finalRedacted = str_replace($original, $tag, $finalRedacted); $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1; $applied++; @@ -607,7 +638,8 @@ PROMPT; { $locale = dbnToolsLanguageName($language); return <<