Two-pass PII redaction with multi-country pattern packs

Pass 1: deterministic regex with Nordic/European/ECHR/Global packs
covering fødselsnummer, Swedish personnummer, Danish CPR, Finnish
henkilötunnus, UK NI, French INSEE, IBAN, EU phones, ECHR application
numbers, DOB, and national ID label patterns.

Pass 2: LLM semantic scan (Azure OpenAI) finds names, orgs, places
and identifying descriptions missed by regex. Runs on pre-redacted
text so no raw PII reaches the LLM.

Adds region selector (Nordic/European/ECHR/Global) to the Redact UI.
Falls back gracefully when Azure is not yet configured.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-07 01:27:52 +02:00
parent 2d8d1c7409
commit 9b22947eb2
5 changed files with 229 additions and 42 deletions
+3 -1
View File
@@ -10,5 +10,7 @@ $input = dbnToolsJsonInput(70000);
dbnToolsWithTelemetry('redact', '', function () use ($input): array {
$text = dbnToolsString($input, 'text', 32000);
$mode = (string)($input['mode'] ?? 'standard');
return (new DbnLegalToolsService())->redact($text, $mode);
$region = dbnToolsNormalizeRegion($input['region'] ?? 'nordic');
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
return (new DbnLegalToolsService())->redact($text, $mode, $region, $language);
});
+5
View File
@@ -160,6 +160,7 @@ async function runTool(event) {
}
if (state.activeTool === 'redact') {
payload.mode = currentRedactionMode();
payload.region = currentRedactionRegion();
}
setBusy(true);
@@ -236,6 +237,10 @@ function currentRedactionMode() {
return document.querySelector('input[name="redactionMode"]:checked')?.value || 'standard';
}
/**
 * Read the region pack selected in the Redact UI.
 *
 * Looks up the checked radio input named "redactionRegion" and returns its
 * value; falls back to 'nordic' when nothing is checked or the value is empty.
 */
function currentRedactionRegion() {
    const checkedInput = document.querySelector('input[name="redactionRegion"]:checked');
    const selected = checkedInput == null ? undefined : checkedInput.value;
    if (selected) {
        return selected;
    }
    return 'nordic';
}
function renderResults(data) {
const sections = [];
sections.push(sectionHtml('What We Found', renderMainFinding(data)));
+200 -31
View File
@@ -298,37 +298,93 @@ PROMPT;
];
}
public function redact(string $text, string $mode = 'standard'): array
public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en'): array
{
$text = $this->requirePasteText($text);
$mode = $mode === 'strict' ? 'strict' : 'standard';
[$redacted, $entities] = $this->deterministicRedaction($text, $mode);
$region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';
// Pass 1 — deterministic regex
[$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region);
$pass1Total = array_sum($pass1Counts);
$pass1Detail = $pass1Total
? implode(', ', array_map(
fn($k, $v) => "{$k}: {$v}",
array_keys(array_filter($pass1Counts, fn($v): bool => $v > 0)),
array_filter($pass1Counts, fn($v): bool => $v > 0)
))
: 'none detected';
$categories = array_keys(array_filter($entities, fn(int $count): bool => $count > 0));
$trace = [
$this->trace('Query interpretation', 'Detect and redact sensitive identifiers from pasted text.', 'complete'),
$this->trace('Search tools used', 'Deterministic Norwegian privacy patterns first; no text was stored.', 'complete'),
$this->trace('Evidence found', count($categories) ? 'Detected categories: ' . implode(', ', $categories) . '.' : 'No deterministic sensitive categories were detected.', count($categories) ? 'complete' : 'warning'),
$this->trace('Citation confidence', 'High for emails and fødselsnummer-like values; medium for addresses and names.', 'complete'),
$this->trace('Uncertainty / missing evidence', 'Contextual names may need human review, especially in standard mode.', 'warning'),
$this->trace('Next practical step', 'Review the redacted output before sharing it outside the case team.', 'complete'),
$this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}.", 'complete'),
$this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'),
];
// Pass 2 — LLM semantic scan
$finalRedacted = $preRedacted;
$pass2Counts = [];
$llmDeployment = null;
$llmResult = $this->llmRedactionPass($preRedacted, $language);
if (!empty($llmResult['skipped'])) {
$trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning');
} elseif (!empty($llmResult['error'])) {
$trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning');
} else {
$entities = $llmResult['entities'] ?? [];
$llmDeployment = $llmResult['deployment'] ?? null;
$applied = 0;
foreach ($entities as $entity) {
if (!is_array($entity)) {
continue;
}
$original = (string)($entity['original'] ?? '');
$type = (string)($entity['type'] ?? 'other');
$tag = (string)($entity['tag'] ?? '[IDENTIFIER]');
if ($original === '' || str_starts_with($original, '[')) {
continue;
}
if (!in_array($tag, ['[PERSON]', '[ORG]', '[PLACE]', '[DOB]', '[IDENTIFIER]'], true)) {
$tag = '[IDENTIFIER]';
}
if (str_contains($finalRedacted, $original)) {
$finalRedacted = str_replace($original, $tag, $finalRedacted);
$pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1;
$applied++;
}
}
$pass2Detail = $applied > 0
? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts))
: 'no additional entities found';
$trace[] = $this->trace('Pass 2 — LLM semantic scan', "Azure reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete');
}
$allCounts = array_merge($pass1Counts, $pass2Counts);
$categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0));
$trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning');
$trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete');
return [
'tool' => 'redact',
'mode' => $mode,
'what_we_found' => 'Redacted deterministic privacy patterns from the pasted text.',
'redacted_text' => $redacted,
'region' => $region,
'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment ? ' and LLM semantic scan' : '') . '.',
'redacted_text' => $finalRedacted,
'detected_entity_categories' => $categories,
'entity_counts' => $entities,
'entity_counts' => $allCounts,
'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
'what_remains_uncertain' => ['Human review is still needed for names that depend on case context.'],
'what_remains_uncertain' => ['Human review is still recommended for contextual identification.'],
'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.',
'trace' => $trace,
'trace_metadata' => [
'chunk_count' => 1,
'source_count' => 1,
'deployment' => null,
'deployment' => $llmDeployment,
],
'disclaimer' => 'Privacy support tool. Review before disclosure.',
];
@@ -564,41 +620,36 @@ PROMPT;
return $text;
}
private function deterministicRedaction(string $text, string $mode): array
private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array
{
$counts = [
'email' => 0,
'phone' => 0,
'fødselsnummer' => 0,
'address' => 0,
'person_or_child_name' => 0,
];
$counts = [];
$replace = function (string $pattern, string $category, string $token) use (&$text, &$counts): void {
$text = preg_replace_callback($pattern, function () use (&$counts, $category, $token): string {
$counts[$category]++;
$replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void {
$text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string {
$counts[$type] = ($counts[$type] ?? 0) + 1;
return $token;
}, $text) ?? $text;
};
$replace('/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'email', '[EMAIL]');
$replace('/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'fødselsnummer', '[FNR]');
$replace('/(?<!\d)(?:\+47[\s.\-]?)?(?:\d[\s.\-]?){8}(?!\d)/u', 'phone', '[PHONE]');
$replace('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave)\s+\d+[A-Z]?\b/iu', 'address', '[ADDRESS]');
foreach ($this->getPatternPack($region) as $entry) {
$replace($entry['pattern'], $entry['type'], $entry['replacement']);
}
// Structured role-label names (Barn: X, Mother: X, etc.) — universal
$text = preg_replace_callback(
'/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu',
function (array $m) use (&$counts): string {
$counts['person_or_child_name']++;
$counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
return $m[1] . ': [PERSON]';
},
$text
) ?? $text;
// Child-identifier phrases ("barnet heter X", "child named X") — universal
$text = preg_replace_callback(
'/\b(?:barnet|child|sønn|son|datter|daughter)\s+(?:heter|named|called)?\s*([A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,})\b/iu',
function () use (&$counts): string {
$counts['person_or_child_name']++;
$counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
return '[CHILD_IDENTIFIER]';
},
$text
@@ -611,6 +662,124 @@ PROMPT;
return [$text, $counts];
}
/**
 * Build the ordered list of deterministic redaction patterns for a region.
 *
 * Packs are cumulative (nordic, then european, then echr, then global); any
 * region value other than 'nordic', 'european' or 'echr' receives the global pack.
 *
 * Order is significant because the entries are applied one after another to
 * the same text. Region-specific identifiers are therefore listed BEFORE the
 * generic Norwegian 11-digit / 8-digit patterns: otherwise the generic phone
 * pattern consumes part of a longer identifier and leaks the remainder
 * (e.g. "+44 20 7946 0958" became "+[PHONE] 0958" and "19850101-1234" became
 * "[PHONE]-1234"). Label-based national-ID matching runs last so that values
 * already tagged by a specific pattern keep their label.
 *
 * @param string $region One of 'nordic', 'european', 'echr', 'global'.
 * @return array<int, array{pattern: string, replacement: string, type: string}>
 */
private function getPatternPack(string $region): array
{
    $email = ['pattern' => '/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'replacement' => '[EMAIL]', 'type' => 'email'];

    // Generic Norwegian patterns. These match bare digit runs, so they must run after anything more specific.
    $generic = [
        ['pattern' => '/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'replacement' => '[FNR]', 'type' => 'fødselsnummer'],
        // Separators are only allowed BETWEEN digits so the match never swallows the whitespace/newline that follows the number.
        ['pattern' => '/(?<!\d)(?:\+47[\s.\-]?)?\d(?:[\s.\-]?\d){7}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
        ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
    ];

    if ($region === 'nordic') {
        return array_merge([$email], $generic);
    }

    // European identifiers (applied before the generic Norwegian patterns).
    $specific = [
        // IBAN (2-letter country code + 2 check digits + up to 30 alphanumeric)
        ['pattern' => '/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', 'replacement' => '[IBAN]', 'type' => 'iban'],
        // European phone (international prefix for major EU/EEA country codes), 7-12 national digits
        ['pattern' => '/(?<!\d)\+(?:44|46|45|358|33|49|34|39|31|32|41|43|30|351|353|48|36|420|421|372|371|370|386|385|356|357|40|359|352)[\s.\-]?\d(?:[\s.\-]?\d){6,11}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
        // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX); a hyphenated Danish CPR (DDMMYY-XXXX) has the same shape
        ['pattern' => '/(?<!\d)\d{6}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
        // Swedish personnummer full (YYYYMMDD-XXXX)
        ['pattern' => '/(?<!\d)\d{8}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
        // Finnish henkilötunnus (DDMMYY + century sign + 3 digits + control character)
        ['pattern' => '/(?<!\d)\d{6}[-+A-FU-Y]\d{3}[0-9A-FHJ-NPR-Y](?![0-9A-Za-z])/', 'replacement' => '[FI_HETU]', 'type' => 'fi_henkilotunnus'],
        // UK National Insurance number (two prefix letters, six digits, suffix A-D)
        ['pattern' => '/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', 'replacement' => '[UK_NI]', 'type' => 'uk_ni'],
        // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds)
        ['pattern' => '/(?<!\d)\d{15}(?!\d)/u', 'replacement' => '[FR_INSEE]', 'type' => 'fr_insee'],
    ];

    // Patterns applied after the generic ones.
    $trailing = [
        // Street address expanded to European street-type keywords
        ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
    ];

    if ($region !== 'european') {
        // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages)
        $specific[] = ['pattern' => '/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', 'replacement' => '[ECHR_APP_NO]', 'type' => 'echr_app_no'];
        // Date of birth stated in judgment context
        $specific[] = ['pattern' => '/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'];
        $specific[] = ['pattern' => '/\bf\.\s*\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'];

        // National ID label patterns in multiple languages. The value is limited to
        // same-line tokens that each contain a digit, so the match cannot run on into
        // the following words/lines, and a value that was already replaced by a
        // bracketed tag no longer causes the label itself to be swallowed.
        $trailing[] = ['pattern' => '/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*(?=[\w+\-]*\d)[\w+\-]+(?:[ \t]+(?=[\w+\-]*\d)[\w+\-]+)*/iu', 'replacement' => '[NAT_ID]', 'type' => 'nat_id'];

        if ($region !== 'echr') {
            // global
            // US Social Security Number
            $specific[] = ['pattern' => '/(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/u', 'replacement' => '[SSN]', 'type' => 'ssn'];
            // Document number in context (passport no., ID No., document no.)
            $specific[] = ['pattern' => '/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', 'replacement' => '[DOC_NO]', 'type' => 'doc_no'];
        }
    }

    return array_merge([$email], $specific, $generic, $trailing);
}
/**
 * Pass 2 of redaction: ask the Azure chat deployment for remaining identifiers.
 *
 * The text handed in is sent as the user message; the system prompt instructs
 * the model to return {"redactions":[{original, type, tag}, ...]}.
 *
 * @param string $preRedacted Text to scan (sent verbatim as the user message).
 * @param string $language    'no' appends a mixed-language hint to the prompt.
 * @return array One of:
 *               - ['skipped' => true, 'reason' => string] when chat config is missing;
 *               - ['skipped' => false, 'entities' => array, 'deployment' => ...] on success;
 *               - ['skipped' => false, 'entities' => [], 'error' => string] on a bad
 *                 response shape or any thrown error (which is also written to error_log).
 */
private function llmRedactionPass(string $preRedacted, string $language = 'en'): array
{
    $unconfigured = $this->azure->missingChatConfig();
    if ($unconfigured) {
        $reason = 'Azure chat not configured (' . implode(', ', $unconfigured) . ')';

        return ['skipped' => true, 'reason' => $reason];
    }

    $languageNote = '';
    if ($language === 'no') {
        $languageNote = "\nThe document may contain Norwegian or mixed-language content.";
    }

    $system = <<<PROMPT
You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS].
Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates of birth, and identifying descriptions.
Return ONLY a valid JSON object:
{"redactions":[{"original":"exact text as it appears","type":"person_name","tag":"[PERSON]"}]}
Allowed type values and their tags:
- person_name → [PERSON]
- org → [ORG]
- place → [PLACE]
- date_of_birth → [DOB]
- other → [IDENTIFIER]
Rules:
- Include only text that appears verbatim in the input. Do not invent or paraphrase.
- If nothing needs redacting, return {"redactions":[]}.
- Do not redact text already inside [BRACKETS].
- Legal citations, statute names, article numbers, and institution names (e.g. "the European Court of Human Rights", "Barnevernloven § 4-12") are NOT PII.
- Short common words, conjunctions, and prepositions are NOT PII.{$languageNote}
PROMPT;

    $messages = [
        ['role' => 'system', 'content' => $system],
        ['role' => 'user', 'content' => $preRedacted],
    ];
    $options = [
        'temperature' => 0.1,
        'max_tokens' => 2000,
        'json' => true,
        'timeout' => 60,
    ];

    try {
        $response = $this->azure->chat($messages, $options);
        $content = (string)($response['choices'][0]['message']['content'] ?? '');
        $json = $this->azure->decodeJsonObject($content);

        if (is_array($json) && array_key_exists('redactions', $json)) {
            $redactions = $json['redactions'];

            return [
                'skipped' => false,
                // A non-array "redactions" value is treated as "nothing found".
                'entities' => is_array($redactions) ? $redactions : [],
                'deployment' => $this->azure->chatDeployment(),
            ];
        }

        return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure'];
    } catch (Throwable $e) {
        $message = $e->getMessage();
        error_log('DBN tools LLM redaction pass failed: ' . $message);

        return ['skipped' => false, 'entities' => [], 'error' => $message];
    }
}
private function uncertaintySummary(mixed $uncertainty): string
{
if (is_array($uncertainty)) {
+6
View File
@@ -195,6 +195,12 @@ function dbnToolsNormalizeLanguage(mixed $value): string
return in_array($language, ['no', 'en'], true) ? $language : 'en';
}
/**
 * Normalise a client-supplied redaction region to a supported pack name.
 *
 * The value is trimmed and lower-cased; anything that is not one of
 * 'nordic', 'european', 'echr' or 'global' yields the default 'nordic'.
 * Arrays and other non-stringable values (possible in a JSON body) are
 * treated as empty instead of being cast, which would raise an
 * "Array to string conversion" warning or an Error.
 *
 * @param mixed $value Raw request value.
 * @return string One of 'nordic', 'european', 'echr', 'global'.
 */
function dbnToolsNormalizeRegion(mixed $value): string
{
    $raw = (is_scalar($value) || $value instanceof Stringable) ? (string)$value : '';
    $region = strtolower(trim($raw));
    return in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';
}
function dbnToolsString(array $input, string $key, int $maxChars, bool $required = true): string
{
$value = trim((string)($input[$key] ?? ''));
+5
View File
@@ -99,6 +99,11 @@ $authenticated = dbnToolsIsAuthenticated();
<span class="control-label">Mode</span>
<label><input type="radio" name="redactionMode" value="standard" checked> Standard</label>
<label><input type="radio" name="redactionMode" value="strict"> Strict</label>
<span class="control-label" style="margin-left:1.25rem">Region</span>
<label><input type="radio" name="redactionRegion" value="nordic" checked> Nordic</label>
<label><input type="radio" name="redactionRegion" value="european"> European</label>
<label><input type="radio" name="redactionRegion" value="echr"> ECHR</label>
<label><input type="radio" name="redactionRegion" value="global"> Global</label>
</div>
<label class="input-label" for="toolInput" id="inputLabel">Question</label>