Redact: catch soft dates (years, month+year, ranges, prepositions)
Adds Nordic-pack regex patterns for:
- DD.MM.YYYY / DD/MM/YYYY / YYYY-MM-DD
- Year ranges (2011/2012, 2018-2019)
- Month + year (Norwegian + English, with optional day)
- Year preceded by temporal preposition (i 2015, fra 2019, rundt 2018)

Also renames the entity toggle from "Dates of birth" to "Dates" (broader scope) in all four languages, and expands the LLM prompt so soft date references in free text are caught even when regex misses them.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+4
-4
@@ -25,7 +25,7 @@ const REDACT_I18N = {
|
||||
redactEntityNames: 'Names',
|
||||
redactEntityOrgs: 'Organisations',
|
||||
redactEntityPlaces: 'Places',
|
||||
redactEntityDob: 'Dates of birth',
|
||||
redactEntityDob: 'Dates',
|
||||
redactOfficials: 'Officials',
|
||||
redactKeepOfficials: 'Keep official names (judges, experts)',
|
||||
redactOfficialsHint: 'When checked, judges, expert witnesses and caseworkers keep their names in a labelled tag: [JUDGE: Andersen]. Uncheck to replace all names with generic role tags.',
|
||||
@@ -79,7 +79,7 @@ const REDACT_I18N = {
|
||||
redactEntityNames: 'Navn',
|
||||
redactEntityOrgs: 'Organisasjoner',
|
||||
redactEntityPlaces: 'Steder',
|
||||
redactEntityDob: 'Fødselsdatoer',
|
||||
redactEntityDob: 'Datoer',
|
||||
redactOfficials: 'Offisielle',
|
||||
redactKeepOfficials: 'Behold offisielle navn (dommere, sakkyndige)',
|
||||
redactOfficialsHint: 'Når avkrysset beholder dommere, sakkyndige og saksbehandlere sine navn i en merket tagg: [DOMMER: Andersen]. Fjern haken for å erstatte alle navn med generiske rolletaggar.',
|
||||
@@ -133,7 +133,7 @@ const REDACT_I18N = {
|
||||
redactEntityNames: 'Імена',
|
||||
redactEntityOrgs: 'Організації',
|
||||
redactEntityPlaces: 'Місця',
|
||||
redactEntityDob: 'Дати народження',
|
||||
redactEntityDob: 'Дати',
|
||||
redactOfficials: 'Офіційні особи',
|
||||
redactKeepOfficials: 'Зберігати офіційні імена (судді, експерти)',
|
||||
redactOfficialsHint: 'Якщо позначено, судді, експерти та соціальні працівники зберігають свої імена у позначеному тезі: [СУДДЯ: Andersen].',
|
||||
@@ -187,7 +187,7 @@ const REDACT_I18N = {
|
||||
redactEntityNames: 'Imiona',
|
||||
redactEntityOrgs: 'Organizacje',
|
||||
redactEntityPlaces: 'Miejsca',
|
||||
redactEntityDob: 'Daty urodzenia',
|
||||
redactEntityDob: 'Daty',
|
||||
redactOfficials: 'Urzędnicy',
|
||||
redactKeepOfficials: 'Zachowaj oficjalne nazwy (sędziowie, eksperci)',
|
||||
redactOfficialsHint: 'Gdy zaznaczone, sędziowie, biegli i pracownicy socjalni zachowują swoje nazwiska w oznaczonym tagu: [SĘDZIA: Andersen].',
|
||||
|
||||
+24
-7
@@ -281,11 +281,12 @@ PROMPT;
|
||||
|
||||
public function timeline(
|
||||
string $text,
|
||||
string $language = 'en',
|
||||
string $engine = 'azure_mini',
|
||||
string $focus = 'all',
|
||||
string $confidenceFilter = 'all',
|
||||
bool $includeRelative = true
|
||||
string $language = 'en',
|
||||
string $engine = 'azure_mini',
|
||||
string $focus = 'all',
|
||||
string $confidenceFilter = 'all',
|
||||
bool $includeRelative = true,
|
||||
bool $includeBackground = true
|
||||
): array {
|
||||
$text = $this->requirePasteText($text);
|
||||
$engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
|
||||
@@ -304,10 +305,14 @@ PROMPT;
|
||||
default => '',
|
||||
};
|
||||
|
||||
$backgroundInstruction = $includeBackground
|
||||
? "\nAlso extract BACKGROUND and NARRATIVE events: dates embedded in contextual paragraphs, historical facts, year-only references, and approximate years (e.g. \"rundt 2011/2012\", \"David ble født den 30.07.2015\", \"familien i 2015\"). These are valid timeline events even when they appear in introductory or background text — do NOT skip them."
|
||||
: "\nDo NOT include purely historical background or narrative context dates. Focus only on operational events, deadlines, and milestones that are directly actionable in the case.";
|
||||
|
||||
$prompt = <<<PROMPT
|
||||
Build a chronological timeline from the pasted text in {$locale}.
|
||||
|
||||
Extract ALL dates, deadlines, milestones, and temporal references.{$focusInstruction}
|
||||
Extract ALL dates, deadlines, milestones, and temporal references.{$focusInstruction}{$backgroundInstruction}
|
||||
|
||||
IMPORTANT — Norwegian date formats to recognise:
|
||||
- DD.MM.YY (e.g. 18.09.25 = 2025-09-18, 09.04.25 = 2025-04-09)
|
||||
@@ -871,6 +876,17 @@ PROMPT;
|
||||
['pattern' => '/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'replacement' => '[FNR]', 'type' => 'fødselsnummer'],
|
||||
['pattern' => '/(?<!\d)(?:\+47[\s.\-]?)?(?:\d[\s.\-]?){8}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
|
||||
['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
|
||||
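// Dates — must precede generic numeric patterns.
// NOTE(review): these entries are listed AFTER the [FNR] and [PHONE] patterns; if patterns are applied in array order, the phone pattern (8 digits with optional space/dot/hyphen separators) can consume 30.07.2015, 2015-07-30 or 2018-2019 before the date patterns run — confirm the application order.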
|
||||
// Year range (e.g. 2011/2012, 2018-2019)
|
||||
['pattern' => '/(?<!\d)(?:19|20)\d{2}\s*[\/\-–—]\s*(?:19|20)?\d{2}(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
|
||||
// Norwegian DD.MM.YYYY and DD/MM/YYYY
|
||||
['pattern' => '/(?<!\d)(?:0?[1-9]|[12]\d|3[01])[.\/](?:0?[1-9]|1[0-2])[.\/](?:19|20)\d{2}(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
|
||||
// ISO YYYY-MM-DD
|
||||
['pattern' => '/(?<!\d)(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
|
||||
// DD. Month YYYY (e.g. "30. juli 2015") and Month YYYY (Norwegian + English)
|
||||
['pattern' => '/\b(?:\d{1,2}\.?\s+)?(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember|January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+(?:19|20)\d{2}\b/iu', 'replacement' => '[DATE]', 'type' => 'date'],
|
||||
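// Year after Norwegian/English temporal preposition (lookbehind keeps preposition).
// NOTE(review): the preposition alternatives inside the lookbehind differ in length; PCRE2 releases before 10.43 reject a variable-length lookbehind branch — confirm the runtime's PCRE version, or match the preposition normally and reset the match start with \K.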
|
||||
['pattern' => '/(?<=\b(?:i|fra|siden|innen|før|etter|rundt|omkring|cirka|in|from|since|until|before|after|around|circa)\s)(?:19|20)\d{2}(?![\d\/\-])/iu', 'replacement' => '[DATE]', 'type' => 'date'],
|
||||
];
|
||||
|
||||
if ($region === 'nordic') {
|
||||
@@ -991,7 +1007,7 @@ PROMPT;
|
||||
$system = <<<PROMPT
|
||||
You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS].
|
||||
|
||||
Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates of birth, and identifying descriptions.
|
||||
Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates and dates of birth (including soft references like "i 2015", "august 2018", "rundt 2011/2012", "spring of 2019"), and identifying descriptions.
|
||||
|
||||
STEP 1 — For person names: identify each individual and infer their role or relationship from context.
|
||||
Assign each person a consistent contextual tag used for every occurrence of their name:
|
||||
@@ -1008,6 +1024,7 @@ Allowed types and their tag format:
|
||||
org → [ORG]
|
||||
place → [PLACE]
|
||||
date_of_birth → [DOB]
|
||||
date → [DATE] (years, year ranges, month+year, soft temporal references — e.g. "i 2015" → "i [DATE]", "rundt 2011/2012" → "rundt [DATE]")
|
||||
other → [IDENTIFIER]
|
||||
|
||||
Rules:
|
||||
|
||||
+1
-1
@@ -48,7 +48,7 @@ require_once __DIR__ . '/includes/layout.php';
|
||||
<label><input type="checkbox" name="redactNames" id="redactNames" checked> <span data-i18n="redactEntityNames">Names</span></label>
|
||||
<label><input type="checkbox" name="redactOrgs" id="redactOrgs" checked> <span data-i18n="redactEntityOrgs">Organisations</span></label>
|
||||
<label><input type="checkbox" name="redactPlaces" id="redactPlaces" checked> <span data-i18n="redactEntityPlaces">Places</span></label>
|
||||
<label><input type="checkbox" name="redactDob" id="redactDob" checked> <span data-i18n="redactEntityDob">Dates of birth</span></label>
|
||||
<label><input type="checkbox" name="redactDob" id="redactDob" checked> <span data-i18n="redactEntityDob">Dates</span></label>
|
||||
</div>
|
||||
|
||||
<div class="control-row" id="redactOfficialsControl">
|
||||
|
||||
Reference in New Issue
Block a user