From f183678f3560584a8cc45d8540436e9e3a3d3df5 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Fri, 15 May 2026 01:58:35 +0200 Subject: [PATCH] Redact: catch soft dates (years, month+year, ranges, prepositions) Adds Nordic-pack regex patterns for: - DD.MM.YYYY / DD/MM/YYYY / YYYY-MM-DD - Year ranges (2011/2012, 2018-2019) - Month + year (Norwegian + English, with optional day) - Year preceded by temporal preposition (i 2015, fra 2019, rundt 2018) Also renames the entity toggle from "Dates of birth" to "Dates" (broader scope) in all four languages, and expands the LLM prompt so soft date references in free text are caught even when regex misses them. Co-Authored-By: Claude Sonnet 4.6 --- assets/js/tools.js | 8 ++++---- includes/LegalTools.php | 31 ++++++++++++++++++++++++------- redact.php | 2 +- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/assets/js/tools.js b/assets/js/tools.js index babe7c4..07bb41c 100644 --- a/assets/js/tools.js +++ b/assets/js/tools.js @@ -25,7 +25,7 @@ const REDACT_I18N = { redactEntityNames: 'Names', redactEntityOrgs: 'Organisations', redactEntityPlaces: 'Places', - redactEntityDob: 'Dates of birth', + redactEntityDob: 'Dates', redactOfficials: 'Officials', redactKeepOfficials: 'Keep official names (judges, experts)', redactOfficialsHint: 'When checked, judges, expert witnesses and caseworkers keep their names in a labelled tag: [JUDGE: Andersen]. Uncheck to replace all names with generic role tags.', @@ -79,7 +79,7 @@ const REDACT_I18N = { redactEntityNames: 'Navn', redactEntityOrgs: 'Organisasjoner', redactEntityPlaces: 'Steder', - redactEntityDob: 'Fødselsdatoer', + redactEntityDob: 'Datoer', redactOfficials: 'Offisielle', redactKeepOfficials: 'Behold offisielle navn (dommere, sakkyndige)', redactOfficialsHint: 'Når avkrysset beholder dommere, sakkyndige og saksbehandlere sine navn i en merket tagg: [DOMMER: Andersen]. Fjern haken for å erstatte alle navn med generiske rolletaggar.', @@ -133,7 +133,7 @@ const REDACT_I18N = { redactEntityNames: 'Імена', redactEntityOrgs: 'Організації', redactEntityPlaces: 'Місця', - redactEntityDob: 'Дати народження', + redactEntityDob: 'Дати', redactOfficials: 'Офіційні особи', redactKeepOfficials: 'Зберігати офіційні імена (судді, експерти)', redactOfficialsHint: 'Якщо позначено, судді, експерти та соціальні працівники зберігають свої імена у позначеному тезі: [СУДДЯ: Andersen].', @@ -187,7 +187,7 @@ const REDACT_I18N = { redactEntityNames: 'Imiona', redactEntityOrgs: 'Organizacje', redactEntityPlaces: 'Miejsca', - redactEntityDob: 'Daty urodzenia', + redactEntityDob: 'Daty', redactOfficials: 'Urzędnicy', redactKeepOfficials: 'Zachowaj oficjalne nazwy (sędziowie, eksperci)', redactOfficialsHint: 'Gdy zaznaczone, sędziowie, biegli i pracownicy socjalni zachowują swoje nazwiska w oznaczonym tagu: [SĘDZIA: Andersen].', diff --git a/includes/LegalTools.php b/includes/LegalTools.php index cea9515..2b3a24c 100644 --- a/includes/LegalTools.php +++ b/includes/LegalTools.php @@ -281,11 +281,12 @@ PROMPT; public function timeline( string $text, - string $language = 'en', - string $engine = 'azure_mini', - string $focus = 'all', - string $confidenceFilter = 'all', - bool $includeRelative = true + string $language = 'en', + string $engine = 'azure_mini', + string $focus = 'all', + string $confidenceFilter = 'all', + bool $includeRelative = true, + bool $includeBackground = true ): array { $text = $this->requirePasteText($text); $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini'; @@ -304,10 +305,14 @@ PROMPT; default => '', }; + $backgroundInstruction = $includeBackground + ? "\nAlso extract BACKGROUND and NARRATIVE events: dates embedded in contextual paragraphs, historical facts, year-only references, and approximate years (e.g. \"rundt 2011/2012\", \"David ble født den 30.07.2015\", \"familien i 2015\"). These are valid timeline events even when they appear in introductory or background text — do NOT skip them." + : "\nDo NOT include purely historical background or narrative context dates. Focus only on operational events, deadlines, and milestones that are directly actionable in the case."; + $prompt = << '/(? '[FNR]', 'type' => 'fødselsnummer'], ['pattern' => '/(? '[PHONE]', 'type' => 'phone'], ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'], + // Dates — must precede generic numeric patterns + // Year range (e.g. 2011/2012, 2018-2019) + ['pattern' => '/(? '[DATE]', 'type' => 'date'], + // Norwegian DD.MM.YYYY and DD/MM/YYYY + ['pattern' => '/(? '[DATE]', 'type' => 'date'], + // ISO YYYY-MM-DD + ['pattern' => '/(? '[DATE]', 'type' => 'date'], + // DD. Month YYYY (e.g. "30. juli 2015") and Month YYYY (Norwegian + English) + ['pattern' => '/\b(?:\d{1,2}\.?\s+)?(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember|January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+(?:19|20)\d{2}\b/iu', 'replacement' => '[DATE]', 'type' => 'date'], + // Year after Norwegian/English temporal preposition (lookbehind keeps preposition) + ['pattern' => '/(?<=\b(?:i|fra|siden|innen|før|etter|rundt|omkring|cirka|in|from|since|until|before|after|around|circa)\s)(?:19|20)\d{2}(?![\d\/\-])/iu', 'replacement' => '[DATE]', 'type' => 'date'], ]; if ($region === 'nordic') { @@ -991,7 +1007,7 @@ PROMPT; $system = << Names - +