Two-pass PII redaction with multi-country pattern packs
Pass 1: deterministic regex with Nordic/European/ECHR/Global packs covering fødselsnummer, Swedish personnummer, Danish/Finnish CPR, UK NI, French INSEE, IBAN, EU phones, ECHR application numbers, DOB, and national ID label patterns. Pass 2: LLM semantic scan (Azure OpenAI) finds names, orgs, places and identifying descriptions missed by regex. Runs on pre-redacted text so no raw PII reaches the LLM. Adds region selector (Nordic/European/ECHR/Global) to the Redact UI. Falls back gracefully when Azure is not yet configured. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+208
-39
@@ -298,37 +298,93 @@ PROMPT;
|
||||
];
|
||||
}
|
||||
|
||||
public function redact(string $text, string $mode = 'standard'): array
|
||||
public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en'): array
|
||||
{
|
||||
$text = $this->requirePasteText($text);
|
||||
$mode = $mode === 'strict' ? 'strict' : 'standard';
|
||||
[$redacted, $entities] = $this->deterministicRedaction($text, $mode);
|
||||
$mode = $mode === 'strict' ? 'strict' : 'standard';
|
||||
$region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';
|
||||
|
||||
// Pass 1 — deterministic regex
|
||||
[$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region);
|
||||
$pass1Total = array_sum($pass1Counts);
|
||||
$pass1Detail = $pass1Total
|
||||
? implode(', ', array_map(
|
||||
fn($k, $v) => "{$k}: {$v}",
|
||||
array_keys(array_filter($pass1Counts, fn($v): bool => $v > 0)),
|
||||
array_filter($pass1Counts, fn($v): bool => $v > 0)
|
||||
))
|
||||
: 'none detected';
|
||||
|
||||
$categories = array_keys(array_filter($entities, fn(int $count): bool => $count > 0));
|
||||
$trace = [
|
||||
$this->trace('Query interpretation', 'Detect and redact sensitive identifiers from pasted text.', 'complete'),
|
||||
$this->trace('Search tools used', 'Deterministic Norwegian privacy patterns first; no text was stored.', 'complete'),
|
||||
$this->trace('Evidence found', count($categories) ? 'Detected categories: ' . implode(', ', $categories) . '.' : 'No deterministic sensitive categories were detected.', count($categories) ? 'complete' : 'warning'),
|
||||
$this->trace('Citation confidence', 'High for emails and fødselsnummer-like values; medium for addresses and names.', 'complete'),
|
||||
$this->trace('Uncertainty / missing evidence', 'Contextual names may need human review, especially in standard mode.', 'warning'),
|
||||
$this->trace('Next practical step', 'Review the redacted output before sharing it outside the case team.', 'complete'),
|
||||
$this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}.", 'complete'),
|
||||
$this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'),
|
||||
];
|
||||
|
||||
// Pass 2 — LLM semantic scan
|
||||
$finalRedacted = $preRedacted;
|
||||
$pass2Counts = [];
|
||||
$llmDeployment = null;
|
||||
|
||||
$llmResult = $this->llmRedactionPass($preRedacted, $language);
|
||||
|
||||
if (!empty($llmResult['skipped'])) {
|
||||
$trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning');
|
||||
} elseif (!empty($llmResult['error'])) {
|
||||
$trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning');
|
||||
} else {
|
||||
$entities = $llmResult['entities'] ?? [];
|
||||
$llmDeployment = $llmResult['deployment'] ?? null;
|
||||
$applied = 0;
|
||||
|
||||
foreach ($entities as $entity) {
|
||||
if (!is_array($entity)) {
|
||||
continue;
|
||||
}
|
||||
$original = (string)($entity['original'] ?? '');
|
||||
$type = (string)($entity['type'] ?? 'other');
|
||||
$tag = (string)($entity['tag'] ?? '[IDENTIFIER]');
|
||||
if ($original === '' || str_starts_with($original, '[')) {
|
||||
continue;
|
||||
}
|
||||
if (!in_array($tag, ['[PERSON]', '[ORG]', '[PLACE]', '[DOB]', '[IDENTIFIER]'], true)) {
|
||||
$tag = '[IDENTIFIER]';
|
||||
}
|
||||
if (str_contains($finalRedacted, $original)) {
|
||||
$finalRedacted = str_replace($original, $tag, $finalRedacted);
|
||||
$pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1;
|
||||
$applied++;
|
||||
}
|
||||
}
|
||||
|
||||
$pass2Detail = $applied > 0
|
||||
? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts))
|
||||
: 'no additional entities found';
|
||||
|
||||
$trace[] = $this->trace('Pass 2 — LLM semantic scan', "Azure reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete');
|
||||
}
|
||||
|
||||
$allCounts = array_merge($pass1Counts, $pass2Counts);
|
||||
$categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0));
|
||||
|
||||
$trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning');
|
||||
$trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete');
|
||||
|
||||
return [
|
||||
'tool' => 'redact',
|
||||
'mode' => $mode,
|
||||
'what_we_found' => 'Redacted deterministic privacy patterns from the pasted text.',
|
||||
'redacted_text' => $redacted,
|
||||
'tool' => 'redact',
|
||||
'mode' => $mode,
|
||||
'region' => $region,
|
||||
'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment ? ' and LLM semantic scan' : '') . '.',
|
||||
'redacted_text' => $finalRedacted,
|
||||
'detected_entity_categories' => $categories,
|
||||
'entity_counts' => $entities,
|
||||
'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
|
||||
'what_remains_uncertain' => ['Human review is still needed for names that depend on case context.'],
|
||||
'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.',
|
||||
'trace' => $trace,
|
||||
'trace_metadata' => [
|
||||
'chunk_count' => 1,
|
||||
'entity_counts' => $allCounts,
|
||||
'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
|
||||
'what_remains_uncertain' => ['Human review is still recommended for contextual identification.'],
|
||||
'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.',
|
||||
'trace' => $trace,
|
||||
'trace_metadata' => [
|
||||
'chunk_count' => 1,
|
||||
'source_count' => 1,
|
||||
'deployment' => null,
|
||||
'deployment' => $llmDeployment,
|
||||
],
|
||||
'disclaimer' => 'Privacy support tool. Review before disclosure.',
|
||||
];
|
||||
@@ -564,41 +620,36 @@ PROMPT;
|
||||
return $text;
|
||||
}
|
||||
|
||||
private function deterministicRedaction(string $text, string $mode): array
|
||||
private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array
|
||||
{
|
||||
$counts = [
|
||||
'email' => 0,
|
||||
'phone' => 0,
|
||||
'fødselsnummer' => 0,
|
||||
'address' => 0,
|
||||
'person_or_child_name' => 0,
|
||||
];
|
||||
$counts = [];
|
||||
|
||||
$replace = function (string $pattern, string $category, string $token) use (&$text, &$counts): void {
|
||||
$text = preg_replace_callback($pattern, function () use (&$counts, $category, $token): string {
|
||||
$counts[$category]++;
|
||||
$replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void {
|
||||
$text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string {
|
||||
$counts[$type] = ($counts[$type] ?? 0) + 1;
|
||||
return $token;
|
||||
}, $text) ?? $text;
|
||||
};
|
||||
|
||||
$replace('/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'email', '[EMAIL]');
|
||||
$replace('/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'fødselsnummer', '[FNR]');
|
||||
$replace('/(?<!\d)(?:\+47[\s.\-]?)?(?:\d[\s.\-]?){8}(?!\d)/u', 'phone', '[PHONE]');
|
||||
$replace('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave)\s+\d+[A-Z]?\b/iu', 'address', '[ADDRESS]');
|
||||
foreach ($this->getPatternPack($region) as $entry) {
|
||||
$replace($entry['pattern'], $entry['type'], $entry['replacement']);
|
||||
}
|
||||
|
||||
// Structured role-label names (Barn: X, Mother: X, etc.) — universal
|
||||
$text = preg_replace_callback(
|
||||
'/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu',
|
||||
function (array $m) use (&$counts): string {
|
||||
$counts['person_or_child_name']++;
|
||||
$counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
|
||||
return $m[1] . ': [PERSON]';
|
||||
},
|
||||
$text
|
||||
) ?? $text;
|
||||
|
||||
// Child-identifier phrases ("barnet heter X", "child named X") — universal
|
||||
$text = preg_replace_callback(
|
||||
'/\b(?:barnet|child|sønn|son|datter|daughter)\s+(?:heter|named|called)?\s*([A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,})\b/iu',
|
||||
function () use (&$counts): string {
|
||||
$counts['person_or_child_name']++;
|
||||
$counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
|
||||
return '[CHILD_IDENTIFIER]';
|
||||
},
|
||||
$text
|
||||
@@ -611,6 +662,124 @@ PROMPT;
|
||||
return [$text, $counts];
|
||||
}
|
||||
|
||||
/**
 * Build the ordered list of deterministic redaction patterns for a region.
 *
 * Packs are cumulative: "european" extends "nordic", "echr" extends
 * "european", and any other value (including "global") returns everything.
 *
 * Order matters because the caller applies the entries sequentially to the
 * same text. Long, region-specific numeric identifiers therefore run BEFORE
 * the generic Norwegian fødselsnummer/phone patterns; otherwise the generic
 * 8-digit phone pattern consumes the first part of a longer identifier
 * (e.g. "19850101-1234" became "[PHONE]-1234" and "+44 20 7946 0958" became
 * "+[PHONE] 0958"), leaving trailing digits unredacted.
 *
 * @param string $region One of 'nordic', 'european', 'echr', 'global'.
 *
 * @return array<int, array{pattern: string, replacement: string, type: string}>
 */
private function getPatternPack(string $region): array
{
    $email = [
        ['pattern' => '/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'replacement' => '[EMAIL]', 'type' => 'email'],
    ];

    // Generic Norwegian patterns. These are the broadest numeric matchers, so
    // in the larger packs they run after the more specific identifiers.
    $norwegian = [
        // Fødselsnummer-like value: 11 digits, optionally split 6 + 5
        ['pattern' => '/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'replacement' => '[FNR]', 'type' => 'fødselsnummer'],
        // 8-digit phone number with optional +47 prefix and optional separators
        ['pattern' => '/(?<!\d)(?:\+47[\s.\-]?)?(?:\d[\s.\-]?){8}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
        // Street address using Norwegian street-type keywords
        ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
    ];

    if ($region === 'nordic') {
        return array_merge($email, $norwegian);
    }

    // Region-specific numeric identifiers; applied before the generic
    // Norwegian patterns so they are matched as a whole.
    $europeanIdentifiers = [
        // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX). A Danish CPR
        // number written DDMMYY-XXXX has the same shape and is caught here too.
        // NOTE(review): a Finnish henkilötunnus ending in a letter check
        // character does not match this pattern — confirm whether it needs
        // its own entry.
        ['pattern' => '/(?<!\d)\d{6}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
        // Swedish personnummer full (YYYYMMDD-XXXX)
        ['pattern' => '/(?<!\d)\d{8}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
        // UK National Insurance number (two prefix letters, six digits, suffix A-D)
        ['pattern' => '/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', 'replacement' => '[UK_NI]', 'type' => 'uk_ni'],
        // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds)
        ['pattern' => '/(?<!\d)\d{15}(?!\d)/u', 'replacement' => '[FR_INSEE]', 'type' => 'fr_insee'],
        // IBAN written without spaces (2-letter country code + 2 check digits + 11 to 27 further characters)
        ['pattern' => '/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', 'replacement' => '[IBAN]', 'type' => 'iban'],
        // European phone (international prefix for major EU/EEA country codes)
        ['pattern' => '/(?<!\d)\+(?:44|46|45|358|33|49|34|39|31|32|41|43|30|351|353|48|36|420|421|372|371|370|386|385|356|357|40|359|352)[\s.\-]?(?:\d[\s.\-]?){7,12}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
    ];

    $europeanAddress = [
        // Street address expanded to European street-type keywords
        ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
    ];

    $european = array_merge($email, $europeanIdentifiers, $norwegian, $europeanAddress);

    if ($region === 'european') {
        return $european;
    }

    $echr = array_merge($european, [
        // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages)
        ['pattern' => '/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', 'replacement' => '[ECHR_APP_NO]', 'type' => 'echr_app_no'],
        // Date of birth stated in judgment context ("born on 12 March 1985")
        ['pattern' => '/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'],
        // Abbreviated birth year ("f. 1985")
        ['pattern' => '/\bf\.\s*\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'],
        // National ID label patterns in multiple languages
        ['pattern' => '/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*[\w\s\-]+/iu', 'replacement' => '[NAT_ID]', 'type' => 'nat_id'],
    ]);

    if ($region === 'echr') {
        return $echr;
    }

    // global (also the fallback for any unrecognised region value)
    return array_merge($echr, [
        // US Social Security Number
        ['pattern' => '/(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/u', 'replacement' => '[SSN]', 'type' => 'ssn'],
        // Document number in context (passport no., ID No., document no.)
        ['pattern' => '/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', 'replacement' => '[DOC_NO]', 'type' => 'doc_no'],
    ]);
}
|
||||
|
||||
/**
 * Second redaction pass: ask the Azure chat deployment to list identifying
 * text that the deterministic pass left in place.
 *
 * The input is the already pre-redacted text. This method only reports the
 * candidate redactions; it does not modify the text itself.
 *
 * @param string $preRedacted Text after the deterministic pass.
 * @param string $language    'no' appends a Norwegian/mixed-language hint to the prompt.
 *
 * @return array One of three shapes:
 *               - ['skipped' => true, 'reason' => string] when chat config is missing;
 *               - ['skipped' => false, 'entities' => [], 'error' => string] on failure;
 *               - ['skipped' => false, 'entities' => array, 'deployment' => mixed] on success.
 */
private function llmRedactionPass(string $preRedacted, string $language = 'en'): array
{
    $missingConfig = $this->azure->missingChatConfig();
    if ($missingConfig) {
        $reason = 'Azure chat not configured (' . implode(', ', $missingConfig) . ')';

        return ['skipped' => true, 'reason' => $reason];
    }

    // Optional hint appended to the last rule line of the prompt.
    $languageHint = '';
    if ($language === 'no') {
        $languageHint = "\nThe document may contain Norwegian or mixed-language content.";
    }

    $systemPrompt = <<<PROMPT
You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS].

Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates of birth, and identifying descriptions.

Return ONLY a valid JSON object:
{"redactions":[{"original":"exact text as it appears","type":"person_name","tag":"[PERSON]"}]}

Allowed type values and their tags:
- person_name → [PERSON]
- org → [ORG]
- place → [PLACE]
- date_of_birth → [DOB]
- other → [IDENTIFIER]

Rules:
- Include only text that appears verbatim in the input. Do not invent or paraphrase.
- If nothing needs redacting, return {"redactions":[]}.
- Do not redact text already inside [BRACKETS].
- Legal citations, statute names, article numbers, and institution names (e.g. "the European Court of Human Rights", "Barnevernloven § 4-12") are NOT PII.
- Short common words, conjunctions, and prepositions are NOT PII.{$languageHint}
PROMPT;

    $messages = [
        ['role' => 'system', 'content' => $systemPrompt],
        ['role' => 'user', 'content' => $preRedacted],
    ];
    $options = [
        'temperature' => 0.1,
        'max_tokens' => 2000,
        'json' => true,
        'timeout' => 60,
    ];

    try {
        $response = $this->azure->chat($messages, $options);

        $rawContent = (string)($response['choices'][0]['message']['content'] ?? '');
        $decoded = $this->azure->decodeJsonObject($rawContent);

        $hasRedactionsKey = is_array($decoded) && array_key_exists('redactions', $decoded);
        if (!$hasRedactionsKey) {
            return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure'];
        }

        // A non-array "redactions" value is treated as "nothing to redact".
        $redactions = $decoded['redactions'];
        if (!is_array($redactions)) {
            $redactions = [];
        }

        return [
            'skipped' => false,
            'entities' => $redactions,
            'deployment' => $this->azure->chatDeployment(),
        ];
    } catch (Throwable $e) {
        // Log the failure and report it to the caller instead of propagating.
        error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage());

        return ['skipped' => false, 'entities' => [], 'error' => $e->getMessage()];
    }
}
|
||||
|
||||
private function uncertaintySummary(mixed $uncertainty): string
|
||||
{
|
||||
if (is_array($uncertainty)) {
|
||||
|
||||
Reference in New Issue
Block a user