From 9b22947eb2426655ba34af7dd59b83e8d9d60c98 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Thu, 7 May 2026 01:27:52 +0200 Subject: [PATCH] Two-pass PII redaction with multi-country pattern packs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass 1: deterministic regex with Nordic/European/ECHR/Global packs covering fødselsnummer, Swedish personnummer, Danish/Finnish CPR, UK NI, French INSEE, IBAN, EU phones, ECHR application numbers, DOB, and national ID label patterns. Pass 2: LLM semantic scan (Azure OpenAI) finds names, orgs, places and identifying descriptions missed by regex. Runs on pre-redacted text so no raw PII reaches the LLM. Adds region selector (Nordic/European/ECHR/Global) to the Redact UI. Falls back gracefully when Azure is not yet configured. Co-Authored-By: Claude Sonnet 4.6 --- api/redact.php | 8 +- assets/js/tools.js | 5 + includes/LegalTools.php | 247 +++++++++++++++++++++++++++++++++------- includes/bootstrap.php | 6 + index.php | 5 + 5 files changed, 229 insertions(+), 42 deletions(-) diff --git a/api/redact.php b/api/redact.php index 1709d03..7467ca1 100644 --- a/api/redact.php +++ b/api/redact.php @@ -8,7 +8,9 @@ dbnToolsRequireAuth(); $input = dbnToolsJsonInput(70000); dbnToolsWithTelemetry('redact', '', function () use ($input): array { - $text = dbnToolsString($input, 'text', 32000); - $mode = (string)($input['mode'] ?? 'standard'); - return (new DbnLegalToolsService())->redact($text, $mode); + $text = dbnToolsString($input, 'text', 32000); + $mode = (string)($input['mode'] ?? 'standard'); + $region = dbnToolsNormalizeRegion($input['region'] ?? 'nordic'); + $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en'); + return (new DbnLegalToolsService())->redact($text, $mode, $region, $language); }); diff --git a/assets/js/tools.js b/assets/js/tools.js index 8f0912b..1f37c33 100644 --- a/assets/js/tools.js +++ b/assets/js/tools.js @@ -160,6 +160,7 @@ async function runTool(event) { } if (state.activeTool === 'redact') { payload.mode = currentRedactionMode(); + payload.region = currentRedactionRegion(); } setBusy(true); @@ -236,6 +237,10 @@ function currentRedactionMode() { return document.querySelector('input[name="redactionMode"]:checked')?.value || 'standard'; } +function currentRedactionRegion() { + return document.querySelector('input[name="redactionRegion"]:checked')?.value || 'nordic'; +} + function renderResults(data) { const sections = []; sections.push(sectionHtml('What We Found', renderMainFinding(data))); diff --git a/includes/LegalTools.php b/includes/LegalTools.php index 23c24c9..6745795 100644 --- a/includes/LegalTools.php +++ b/includes/LegalTools.php @@ -298,37 +298,93 @@ PROMPT; ]; } - public function redact(string $text, string $mode = 'standard'): array + public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en'): array { $text = $this->requirePasteText($text); - $mode = $mode === 'strict' ? 'strict' : 'standard'; - [$redacted, $entities] = $this->deterministicRedaction($text, $mode); + $mode = $mode === 'strict' ? 'strict' : 'standard'; + $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic'; + + // Pass 1 — deterministic regex + [$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region); + $pass1Total = array_sum($pass1Counts); + $pass1Detail = $pass1Total + ? implode(', ', array_map( + fn($k, $v) => "{$k}: {$v}", + array_keys(array_filter($pass1Counts, fn($v): bool => $v > 0)), + array_filter($pass1Counts, fn($v): bool => $v > 0) + )) + : 'none detected'; - $categories = array_keys(array_filter($entities, fn(int $count): bool => $count > 0)); $trace = [ - $this->trace('Query interpretation', 'Detect and redact sensitive identifiers from pasted text.', 'complete'), - $this->trace('Search tools used', 'Deterministic Norwegian privacy patterns first; no text was stored.', 'complete'), - $this->trace('Evidence found', count($categories) ? 'Detected categories: ' . implode(', ', $categories) . '.' : 'No deterministic sensitive categories were detected.', count($categories) ? 'complete' : 'warning'), - $this->trace('Citation confidence', 'High for emails and fødselsnummer-like values; medium for addresses and names.', 'complete'), - $this->trace('Uncertainty / missing evidence', 'Contextual names may need human review, especially in standard mode.', 'warning'), - $this->trace('Next practical step', 'Review the redacted output before sharing it outside the case team.', 'complete'), + $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}.", 'complete'), + $this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'), ]; + // Pass 2 — LLM semantic scan + $finalRedacted = $preRedacted; + $pass2Counts = []; + $llmDeployment = null; + + $llmResult = $this->llmRedactionPass($preRedacted, $language); + + if (!empty($llmResult['skipped'])) { + $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning'); + } elseif (!empty($llmResult['error'])) { + $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning'); + } else { + $entities = $llmResult['entities'] ?? []; + $llmDeployment = $llmResult['deployment'] ?? null; + $applied = 0; + + foreach ($entities as $entity) { + if (!is_array($entity)) { + continue; + } + $original = (string)($entity['original'] ?? ''); + $type = (string)($entity['type'] ?? 'other'); + $tag = (string)($entity['tag'] ?? '[IDENTIFIER]'); + if ($original === '' || str_starts_with($original, '[')) { + continue; + } + if (!in_array($tag, ['[PERSON]', '[ORG]', '[PLACE]', '[DOB]', '[IDENTIFIER]'], true)) { + $tag = '[IDENTIFIER]'; + } + if (str_contains($finalRedacted, $original)) { + $finalRedacted = str_replace($original, $tag, $finalRedacted); + $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1; + $applied++; + } + } + + $pass2Detail = $applied > 0 + ? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts)) + : 'no additional entities found'; + + $trace[] = $this->trace('Pass 2 — LLM semantic scan', "Azure reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete'); + } + + $allCounts = array_merge($pass1Counts, $pass2Counts); + $categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0)); + + $trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning'); + $trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete'); + return [ - 'tool' => 'redact', - 'mode' => $mode, - 'what_we_found' => 'Redacted deterministic privacy patterns from the pasted text.', - 'redacted_text' => $redacted, + 'tool' => 'redact', + 'mode' => $mode, + 'region' => $region, + 'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment ? ' and LLM semantic scan' : '') . '.', + 'redacted_text' => $finalRedacted, 'detected_entity_categories' => $categories, - 'entity_counts' => $entities, - 'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']], - 'what_remains_uncertain' => ['Human review is still needed for names that depend on case context.'], - 'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.', - 'trace' => $trace, - 'trace_metadata' => [ - 'chunk_count' => 1, + 'entity_counts' => $allCounts, + 'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']], + 'what_remains_uncertain' => ['Human review is still recommended for contextual identification.'], + 'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.', + 'trace' => $trace, + 'trace_metadata' => [ + 'chunk_count' => 1, 'source_count' => 1, - 'deployment' => null, + 'deployment' => $llmDeployment, ], 'disclaimer' => 'Privacy support tool. Review before disclosure.', ]; @@ -564,41 +620,36 @@ PROMPT; return $text; } - private function deterministicRedaction(string $text, string $mode): array + private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array { - $counts = [ - 'email' => 0, - 'phone' => 0, - 'fødselsnummer' => 0, - 'address' => 0, - 'person_or_child_name' => 0, - ]; + $counts = []; - $replace = function (string $pattern, string $category, string $token) use (&$text, &$counts): void { - $text = preg_replace_callback($pattern, function () use (&$counts, $category, $token): string { - $counts[$category]++; + $replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void { + $text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string { + $counts[$type] = ($counts[$type] ?? 0) + 1; return $token; }, $text) ?? $text; }; - $replace('/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'email', '[EMAIL]'); - $replace('/(?getPatternPack($region) as $entry) { + $replace($entry['pattern'], $entry['type'], $entry['replacement']); + } + // Structured role-label names (Barn: X, Mother: X, etc.) — universal $text = preg_replace_callback( '/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu', function (array $m) use (&$counts): string { - $counts['person_or_child_name']++; + $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1; return $m[1] . ': [PERSON]'; }, $text ) ?? $text; + // Child-identifier phrases ("barnet heter X", "child named X") — universal $text = preg_replace_callback( '/\b(?:barnet|child|sønn|son|datter|daughter)\s+(?:heter|named|called)?\s*([A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,})\b/iu', function () use (&$counts): string { - $counts['person_or_child_name']++; + $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1; return '[CHILD_IDENTIFIER]'; }, $text @@ -611,6 +662,124 @@ PROMPT; return [$text, $counts]; } + private function getPatternPack(string $region): array + { + $nordic = [ + ['pattern' => '/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'replacement' => '[EMAIL]', 'type' => 'email'], + ['pattern' => '/(? '[FNR]', 'type' => 'fødselsnummer'], + ['pattern' => '/(? '[PHONE]', 'type' => 'phone'], + ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'], + ]; + + if ($region === 'nordic') { + return $nordic; + } + + $european = array_merge($nordic, [ + // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX) + ['pattern' => '/(? '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'], + // Swedish personnummer full (YYYYMMDD-XXXX) + ['pattern' => '/(? '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'], + // Danish/Finnish CPR / henkilötunnus — same format as short SE personnummer but included for clarity + ['pattern' => '/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', 'replacement' => '[UK_NI]', 'type' => 'uk_ni'], + // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds) + ['pattern' => '/(? '[FR_INSEE]', 'type' => 'fr_insee'], + // IBAN (2-letter country code + 2 check digits + up to 30 alphanumeric) + ['pattern' => '/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', 'replacement' => '[IBAN]', 'type' => 'iban'], + // European phone (international prefix for major EU/EEA country codes) + ['pattern' => '/(? '[PHONE]', 'type' => 'phone'], + // Street address expanded to European street-type keywords + ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'], + ]); + + if ($region === 'european') { + return $european; + } + + $echr = array_merge($european, [ + // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages) + ['pattern' => '/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', 'replacement' => '[ECHR_APP_NO]', 'type' => 'echr_app_no'], + // Date of birth stated in judgment context + ['pattern' => '/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'], + ['pattern' => '/\bf\.\s*\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'], + // National ID label patterns in multiple languages + ['pattern' => '/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*[\w\s\-]+/iu', 'replacement' => '[NAT_ID]', 'type' => 'nat_id'], + ]); + + if ($region === 'echr') { + return $echr; + } + + // global + return array_merge($echr, [ + // US Social Security Number + ['pattern' => '/(? '[SSN]', 'type' => 'ssn'], + // Document number in context (passport no., ID No., document no.) + ['pattern' => '/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', 'replacement' => '[DOC_NO]', 'type' => 'doc_no'], + ]); + } + + private function llmRedactionPass(string $preRedacted, string $language = 'en'): array + { + $missing = $this->azure->missingChatConfig(); + if ($missing) { + return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')']; + } + + $languageNote = $language === 'no' ? "\nThe document may contain Norwegian or mixed-language content." : ''; + + $system = <<azure->chat([ + ['role' => 'system', 'content' => $system], + ['role' => 'user', 'content' => $preRedacted], + ], [ + 'temperature' => 0.1, + 'max_tokens' => 2000, + 'json' => true, + 'timeout' => 60, + ]); + + $content = (string)($response['choices'][0]['message']['content'] ?? ''); + $json = $this->azure->decodeJsonObject($content); + + if (!is_array($json) || !array_key_exists('redactions', $json)) { + return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure']; + } + + return [ + 'skipped' => false, + 'entities' => is_array($json['redactions']) ? $json['redactions'] : [], + 'deployment' => $this->azure->chatDeployment(), + ]; + } catch (Throwable $e) { + error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage()); + return ['skipped' => false, 'entities' => [], 'error' => $e->getMessage()]; + } + } + private function uncertaintySummary(mixed $uncertainty): string { if (is_array($uncertainty)) { diff --git a/includes/bootstrap.php b/includes/bootstrap.php index ec3db9e..324a41a 100644 --- a/includes/bootstrap.php +++ b/includes/bootstrap.php @@ -195,6 +195,12 @@ function dbnToolsNormalizeLanguage(mixed $value): string return in_array($language, ['no', 'en'], true) ? $language : 'en'; } +function dbnToolsNormalizeRegion(mixed $value): string +{ + $region = strtolower(trim((string)$value)); + return in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic'; +} + function dbnToolsString(array $input, string $key, int $maxChars, bool $required = true): string { $value = trim((string)($input[$key] ?? '')); diff --git a/index.php b/index.php index 5fc1705..93b63ac 100644 --- a/index.php +++ b/index.php @@ -99,6 +99,11 @@ $authenticated = dbnToolsIsAuthenticated(); Mode + Region + + + +