Two-pass PII redaction with multi-country pattern packs

Pass 1: deterministic regex with Nordic/European/ECHR/Global packs covering fødselsnummer, Swedish personnummer, Danish/Finnish CPR, UK NI, French INSEE, IBAN, EU phones, ECHR application numbers, DOB, and national ID label patterns. Pass 2: LLM semantic scan (Azure OpenAI) finds names, orgs, places and identifying descriptions missed by regex. Runs on pre-redacted text so no raw PII reaches the LLM. Adds region selector (Nordic/European/ECHR/Global) to the Redact UI. Falls back gracefully when Azure is not yet configured. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 01:27:52 +02:00
parent 2d8d1c7409
commit 9b22947eb2
5 changed files with 229 additions and 42 deletions
@@ -298,37 +298,93 @@ PROMPT;
        ];
    }

-    public function redact(string $text, string $mode = 'standard'): array
+    public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en'): array
    {
        $text = $this->requirePasteText($text);
-        $mode = $mode === 'strict' ? 'strict' : 'standard';
-        [$redacted, $entities] = $this->deterministicRedaction($text, $mode);
+        $mode   = $mode   === 'strict'    ? 'strict'    : 'standard';
+        $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';
+
+        // Pass 1 — deterministic regex
+        [$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region);
+        $pass1Total = array_sum($pass1Counts);
+        $pass1Detail = $pass1Total
+            ? implode(', ', array_map(
+                fn($k, $v) => "{$k}: {$v}",
+                array_keys(array_filter($pass1Counts, fn($v): bool => $v > 0)),
+                array_filter($pass1Counts, fn($v): bool => $v > 0)
+              ))
+            : 'none detected';

-        $categories = array_keys(array_filter($entities, fn(int $count): bool => $count > 0));
        $trace = [
-            $this->trace('Query interpretation', 'Detect and redact sensitive identifiers from pasted text.', 'complete'),
-            $this->trace('Search tools used', 'Deterministic Norwegian privacy patterns first; no text was stored.', 'complete'),
-            $this->trace('Evidence found', count($categories) ? 'Detected categories: ' . implode(', ', $categories) . '.' : 'No deterministic sensitive categories were detected.', count($categories) ? 'complete' : 'warning'),
-            $this->trace('Citation confidence', 'High for emails and fødselsnummer-like values; medium for addresses and names.', 'complete'),
-            $this->trace('Uncertainty / missing evidence', 'Contextual names may need human review, especially in standard mode.', 'warning'),
-            $this->trace('Next practical step', 'Review the redacted output before sharing it outside the case team.', 'complete'),
+            $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}.", 'complete'),
+            $this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'),
        ];

+        // Pass 2 — LLM semantic scan
+        $finalRedacted = $preRedacted;
+        $pass2Counts   = [];
+        $llmDeployment = null;
+
+        $llmResult = $this->llmRedactionPass($preRedacted, $language);
+
+        if (!empty($llmResult['skipped'])) {
+            $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning');
+        } elseif (!empty($llmResult['error'])) {
+            $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning');
+        } else {
+            $entities      = $llmResult['entities'] ?? [];
+            $llmDeployment = $llmResult['deployment'] ?? null;
+            $applied       = 0;
+
+            foreach ($entities as $entity) {
+                if (!is_array($entity)) {
+                    continue;
+                }
+                $original = (string)($entity['original'] ?? '');
+                $type     = (string)($entity['type']     ?? 'other');
+                $tag      = (string)($entity['tag']      ?? '[IDENTIFIER]');
+                if ($original === '' || str_starts_with($original, '[')) {
+                    continue;
+                }
+                if (!in_array($tag, ['[PERSON]', '[ORG]', '[PLACE]', '[DOB]', '[IDENTIFIER]'], true)) {
+                    $tag = '[IDENTIFIER]';
+                }
+                if (str_contains($finalRedacted, $original)) {
+                    $finalRedacted = str_replace($original, $tag, $finalRedacted);
+                    $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1;
+                    $applied++;
+                }
+            }
+
+            $pass2Detail = $applied > 0
+                ? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts))
+                : 'no additional entities found';
+
+            $trace[] = $this->trace('Pass 2 — LLM semantic scan', "Azure reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete');
+        }
+
+        $allCounts  = array_merge($pass1Counts, $pass2Counts);
+        $categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0));
+
+        $trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning');
+        $trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete');
+
        return [
-            'tool' => 'redact',
-            'mode' => $mode,
-            'what_we_found' => 'Redacted deterministic privacy patterns from the pasted text.',
-            'redacted_text' => $redacted,
+            'tool'                       => 'redact',
+            'mode'                       => $mode,
+            'region'                     => $region,
+            'what_we_found'              => "Applied {$region} pattern pack" . ($llmDeployment ? ' and LLM semantic scan' : '') . '.',
+            'redacted_text'              => $finalRedacted,
            'detected_entity_categories' => $categories,
-            'entity_counts' => $entities,
-            'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
-            'what_remains_uncertain' => ['Human review is still needed for names that depend on case context.'],
-            'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.',
-            'trace' => $trace,
-            'trace_metadata' => [
-                'chunk_count' => 1,
+            'entity_counts'              => $allCounts,
+            'evidence_trail'             => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
+            'what_remains_uncertain'     => ['Human review is still recommended for contextual identification.'],
+            'next_practical_step'        => 'Review the output and rerun in strict mode if the text will be shared broadly.',
+            'trace'                      => $trace,
+            'trace_metadata'             => [
+                'chunk_count'  => 1,
                'source_count' => 1,
-                'deployment' => null,
+                'deployment'   => $llmDeployment,
            ],
            'disclaimer' => 'Privacy support tool. Review before disclosure.',
        ];
@@ -564,41 +620,36 @@ PROMPT;
        return $text;
    }

-    private function deterministicRedaction(string $text, string $mode): array
+    private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array
    {
-        $counts = [
-            'email' => 0,
-            'phone' => 0,
-            'fødselsnummer' => 0,
-            'address' => 0,
-            'person_or_child_name' => 0,
-        ];
+        $counts = [];

-        $replace = function (string $pattern, string $category, string $token) use (&$text, &$counts): void {
-            $text = preg_replace_callback($pattern, function () use (&$counts, $category, $token): string {
-                $counts[$category]++;
+        $replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void {
+            $text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string {
+                $counts[$type] = ($counts[$type] ?? 0) + 1;
                return $token;
            }, $text) ?? $text;
        };

-        $replace('/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'email', '[EMAIL]');
-        $replace('/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'fødselsnummer', '[FNR]');
-        $replace('/(?<!\d)(?:\+47[\s.\-]?)?(?:\d[\s.\-]?){8}(?!\d)/u', 'phone', '[PHONE]');
-        $replace('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave)\s+\d+[A-Z]?\b/iu', 'address', '[ADDRESS]');
+        foreach ($this->getPatternPack($region) as $entry) {
+            $replace($entry['pattern'], $entry['type'], $entry['replacement']);
+        }

+        // Structured role-label names (Barn: X, Mother: X, etc.) — universal
        $text = preg_replace_callback(
            '/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu',
            function (array $m) use (&$counts): string {
-                $counts['person_or_child_name']++;
+                $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
                return $m[1] . ': [PERSON]';
            },
            $text
        ) ?? $text;

+        // Child-identifier phrases ("barnet heter X", "child named X") — universal
        $text = preg_replace_callback(
            '/\b(?:barnet|child|sønn|son|datter|daughter)\s+(?:heter|named|called)?\s*([A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,})\b/iu',
            function () use (&$counts): string {
-                $counts['person_or_child_name']++;
+                $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
                return '[CHILD_IDENTIFIER]';
            },
            $text
@@ -611,6 +662,124 @@ PROMPT;
        return [$text, $counts];
    }

+    /**
+     * Build the ordered list of deterministic redaction patterns for a region.
+     *
+     * Packs are cumulative: "european" contains everything in "nordic", "echr"
+     * contains everything in "european", and any other value (i.e. "global")
+     * contains everything in "echr".
+     *
+     * The entries are meant to be applied one after another to the same text,
+     * so list order matters: an entry only sees what earlier entries left
+     * behind. Patterns anchored on a distinctive prefix therefore have to come
+     * before generic digit-run patterns that could consume part of their match.
+     *
+     * @param string $region 'nordic', 'european' or 'echr'; anything else yields the global pack.
+     *
+     * @return list<array{pattern: string, replacement: string, type: string}>
+     */
+    private function getPatternPack(string $region): array
+    {
+        $email = ['pattern' => '/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'replacement' => '[EMAIL]', 'type' => 'email'];
+        // Norwegian fødselsnummer: 11 digits, optionally written 6 + 5 with a space or hyphen
+        $fnr = ['pattern' => '/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'replacement' => '[FNR]', 'type' => 'fødselsnummer'];
+        // Generic 8-digit phone number with optional +47 prefix and optional separators
+        $localPhone = ['pattern' => '/(?<!\d)(?:\+47[\s.\-]?)?(?:\d[\s.\-]?){8}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'];
+        // Street address ending in a Scandinavian street-type keyword plus house number
+        $nordicAddress = ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'];
+
+        if ($region === 'nordic') {
+            return [$email, $fnr, $localPhone, $nordicAddress];
+        }
+
+        // European phone (international prefix for major EU/EEA country codes)
+        $internationalPhone = ['pattern' => '/(?<!\d)\+(?:44|46|45|358|33|49|34|39|31|32|41|43|30|351|353|48|36|420|421|372|371|370|386|385|356|357|40|359|352)[\s.\-]?(?:\d[\s.\-]?){7,12}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'];
+
+        $european = [
+            $email,
+            // Must run before the generic digit patterns below: the 8-digit phone
+            // pattern would otherwise consume part of an international number
+            // (e.g. "+44 20 7946 0958" became "+[PHONE] 0958"), leaving digits
+            // in the output and preventing this entry from ever matching.
+            $internationalPhone,
+            $fnr,
+            $localPhone,
+            $nordicAddress,
+            // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX). Danish CPR numbers
+            // written DDMMYY-XXXX have the same shape and are caught (and tagged) by this entry.
+            // NOTE(review): Finnish henkilötunnus values whose final check character is a
+            // letter are presumably not matched here (four digits are required) — confirm.
+            ['pattern' => '/(?<!\d)\d{6}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
+            // Swedish personnummer full (YYYYMMDD-XXXX)
+            ['pattern' => '/(?<!\d)\d{8}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
+            // UK National Insurance number (two prefix letters, six digits, suffix letter A-D)
+            ['pattern' => '/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', 'replacement' => '[UK_NI]', 'type' => 'uk_ni'],
+            // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds)
+            ['pattern' => '/(?<!\d)\d{15}(?!\d)/u', 'replacement' => '[FR_INSEE]', 'type' => 'fr_insee'],
+            // IBAN written without spaces: 2-letter country code, 2 check digits,
+            // 4 alphanumerics, 7 digits, then up to 16 further alphanumerics
+            ['pattern' => '/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', 'replacement' => '[IBAN]', 'type' => 'iban'],
+            // Street address expanded to European street-type keywords
+            ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
+        ];
+
+        if ($region === 'european') {
+            return $european;
+        }
+
+        $echr = array_merge($european, [
+            // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages)
+            ['pattern' => '/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', 'replacement' => '[ECHR_APP_NO]', 'type' => 'echr_app_no'],
+            // Date of birth stated in judgment context ("born on 12 March 1985"), English or Norwegian month names
+            ['pattern' => '/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'],
+            // Abbreviated birth year such as "f. 1985"
+            ['pattern' => '/\bf\.\s*\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'],
+            // National ID label patterns in multiple languages; the label itself is replaced too.
+            // NOTE(review): the value part is greedy over word characters and whitespace, so it
+            // can also swallow the words that follow the number up to the next punctuation mark.
+            ['pattern' => '/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*[\w\s\-]+/iu', 'replacement' => '[NAT_ID]', 'type' => 'nat_id'],
+        ]);
+
+        if ($region === 'echr') {
+            return $echr;
+        }
+
+        // global
+        return array_merge($echr, [
+            // US Social Security Number
+            ['pattern' => '/(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/u', 'replacement' => '[SSN]', 'type' => 'ssn'],
+            // Document number in context (passport no., ID No., document no.); the label is replaced together with the number
+            ['pattern' => '/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', 'replacement' => '[DOC_NO]', 'type' => 'doc_no'],
+        ]);
+    }
+
+    /**
+     * Ask the Azure chat deployment to list identifying text that is still present.
+     *
+     * The method never throws: every outcome is reported through the returned array.
+     *  - Chat configuration missing: ['skipped' => true, 'reason' => string].
+     *  - Call failed or the reply was not the expected JSON shape:
+     *    ['skipped' => false, 'entities' => [], 'error' => string].
+     *  - Success: ['skipped' => false, 'entities' => array, 'deployment' => mixed],
+     *    where 'entities' is the decoded "redactions" list exactly as the model
+     *    returned it (items are not validated here).
+     *
+     * @param string $preRedacted Text sent as the user message; the system prompt tells the
+     *                            model that mechanical identifiers were already replaced by tags.
+     * @param string $language    'no' adds a hint about Norwegian/mixed-language content to the prompt.
+     *
+     * @return array{skipped: bool, reason?: string, entities?: array, deployment?: mixed, error?: string}
+     */
+    private function llmRedactionPass(string $preRedacted, string $language = 'en'): array
+    {
+        $missing = $this->azure->missingChatConfig();
+        if ($missing) {
+            return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')'];
+        }
+
+        $languageNote = $language === 'no' ? "\nThe document may contain Norwegian or mixed-language content." : '';
+
+        $system = <<<PROMPT
+You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS].
+
+Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates of birth, and identifying descriptions.
+
+Return ONLY a valid JSON object:
+{"redactions":[{"original":"exact text as it appears","type":"person_name","tag":"[PERSON]"}]}
+
+Allowed type values and their tags:
+- person_name → [PERSON]
+- org → [ORG]
+- place → [PLACE]
+- date_of_birth → [DOB]
+- other → [IDENTIFIER]
+
+Rules:
+- Include only text that appears verbatim in the input. Do not invent or paraphrase.
+- If nothing needs redacting, return {"redactions":[]}.
+- Do not redact text already inside [BRACKETS].
+- Legal citations, statute names, article numbers, and institution names (e.g. "the European Court of Human Rights", "Barnevernloven § 4-12") are NOT PII.
+- Short common words, conjunctions, and prepositions are NOT PII.{$languageNote}
+PROMPT;
+
+        try {
+            $response = $this->azure->chat([
+                ['role' => 'system', 'content' => $system],
+                ['role' => 'user',   'content' => $preRedacted],
+            ], [
+                'temperature' => 0.1,
+                'max_tokens'  => 2000,
+                'json'        => true,
+                'timeout'     => 60,
+            ]);
+
+            $content = (string)($response['choices'][0]['message']['content'] ?? '');
+            $json    = $this->azure->decodeJsonObject($content);
+
+            // A "redactions" value that is missing or not a list is a malformed reply.
+            // Report it as an error instead of presenting it as a clean scan that
+            // found nothing, which would overstate how thoroughly the text was checked.
+            $redactions = is_array($json) ? ($json['redactions'] ?? null) : null;
+            if (!is_array($redactions)) {
+                return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure'];
+            }
+
+            return [
+                'skipped'    => false,
+                'entities'   => $redactions,
+                'deployment' => $this->azure->chatDeployment(),
+            ];
+        } catch (Throwable $e) {
+            // Best-effort pass: log the failure and let the caller fall back to pass-1 output.
+            error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage());
+            return ['skipped' => false, 'entities' => [], 'error' => $e->getMessage()];
+        }
+    }
+
    private function uncertaintySummary(mixed $uncertainty): string
    {
        if (is_array($uncertainty)) {