Redact tool: rich UI, multilingual, engine choice, output formats
- Custom inline form (EN/NO/UK/PL language switcher) replacing the generic stub
- Engine selector: Azure gpt-4o-mini (default), gpt-4o, GPU cuttlefish, regex-only
- Entity type toggles: names, organisations, places, dates of birth
- Output formats: contextual role tags, generic [PERSON], Norwegian pseudonyms
- Keep-officials mode: judges/experts kept in [JUDGE: Andersen] format
- Exempt names list: specific names excluded from redaction
- Hint paragraphs explaining each option in all four languages
- Backend: engine routing, callGpuLlm(), applyGenericTags(), applyPseudonymization()
- AzureOpenAiGateway: withDeployment() clone pattern for per-call model override

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+268
-30
@@ -343,15 +343,33 @@ PROMPT;
|
||||
];
|
||||
}
|
||||
|
||||
public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en', array $aliases = []): array
|
||||
{
|
||||
$text = $this->requirePasteText($text);
|
||||
$mode = $mode === 'strict' ? 'strict' : 'standard';
|
||||
$region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';
|
||||
public function redact(
|
||||
string $text,
|
||||
string $mode = 'standard',
|
||||
string $region = 'nordic',
|
||||
string $language = 'en',
|
||||
array $aliases = [],
|
||||
string $engine = 'azure_mini',
|
||||
string $outputFormat = 'contextual',
|
||||
bool $keepOfficials = false,
|
||||
array $exemptNames = [],
|
||||
array $redactTypes = []
|
||||
): array {
|
||||
$text = $this->requirePasteText($text);
|
||||
$mode = $mode === 'strict' ? 'strict' : 'standard';
|
||||
$region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';
|
||||
$engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'regex'], true) ? $engine : 'azure_mini';
|
||||
$outputFormat = in_array($outputFormat, ['contextual', 'generic', 'pseudonym'], true) ? $outputFormat : 'contextual';
|
||||
|
||||
// Normalise entity-type flags (all on by default)
|
||||
$doNames = ($redactTypes['names'] ?? true) !== false;
|
||||
$doOrgs = ($redactTypes['orgs'] ?? true) !== false;
|
||||
$doPlaces = ($redactTypes['places'] ?? true) !== false;
|
||||
$doDob = ($redactTypes['dob'] ?? true) !== false;
|
||||
|
||||
// Pass 1 — deterministic regex
|
||||
[$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region);
|
||||
$pass1Total = array_sum($pass1Counts);
|
||||
$pass1Total = array_sum($pass1Counts);
|
||||
$pass1Detail = $pass1Total
|
||||
? implode(', ', array_map(
|
||||
fn($k, $v) => "{$k}: {$v}",
|
||||
@@ -360,8 +378,15 @@ PROMPT;
|
||||
))
|
||||
: 'none detected';
|
||||
|
||||
$engineLabel = match ($engine) {
|
||||
'azure_full' => 'Azure gpt-4o',
|
||||
'gpu' => 'GPU (cuttlefish)',
|
||||
'regex' => 'Regex only',
|
||||
default => 'Azure gpt-4o-mini',
|
||||
};
|
||||
|
||||
$trace = [
|
||||
$this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}.", 'complete'),
|
||||
$this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}. Engine: {$engineLabel}.", 'complete'),
|
||||
$this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'),
|
||||
];
|
||||
|
||||
@@ -370,10 +395,14 @@ PROMPT;
|
||||
$pass2Counts = [];
|
||||
$llmDeployment = null;
|
||||
|
||||
$llmResult = $this->llmRedactionPass($preRedacted, $language, $aliases);
|
||||
$llmResult = $this->llmRedactionPass(
|
||||
$preRedacted, $language, $aliases, $engine,
|
||||
$keepOfficials, $exemptNames,
|
||||
$doNames, $doOrgs, $doPlaces, $doDob
|
||||
);
|
||||
|
||||
if (!empty($llmResult['skipped'])) {
|
||||
$trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning');
|
||||
$trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'not configured') . '.', 'warning');
|
||||
} elseif (!empty($llmResult['error'])) {
|
||||
$trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning');
|
||||
} else {
|
||||
@@ -391,7 +420,8 @@ PROMPT;
|
||||
if ($original === '' || str_starts_with($original, '[')) {
|
||||
continue;
|
||||
}
|
||||
if (!preg_match('/^\[[A-Za-z0-9_\- ]+\]$/', $tag)) {
|
||||
// Allow [ROLE: Name] format when keepOfficials is on, else require plain bracket tag
|
||||
if (!preg_match('/^\[[A-Za-z0-9_\- ]+(?::\s*[^\]]+)?\]$/', $tag)) {
|
||||
$tag = '[IDENTIFIER]';
|
||||
}
|
||||
if (str_contains($finalRedacted, $original)) {
|
||||
@@ -405,12 +435,24 @@ PROMPT;
|
||||
? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts))
|
||||
: 'no additional entities found';
|
||||
|
||||
$trace[] = $this->trace('Pass 2 — LLM semantic scan', "Azure reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete');
|
||||
$trace[] = $this->trace('Pass 2 — LLM semantic scan', "{$engineLabel} reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete');
|
||||
}
|
||||
|
||||
// Apply output format post-processing
|
||||
$allCounts = array_merge($pass1Counts, $pass2Counts);
|
||||
if ($outputFormat === 'generic') {
|
||||
$finalRedacted = $this->applyGenericTags($finalRedacted);
|
||||
} elseif ($outputFormat === 'pseudonym') {
|
||||
$finalRedacted = $this->applyPseudonymization($finalRedacted, $allCounts);
|
||||
}
|
||||
|
||||
$allCounts = array_merge($pass1Counts, $pass2Counts);
|
||||
$categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0));
|
||||
|
||||
$trace[] = $this->trace('Output format', match ($outputFormat) {
|
||||
'generic' => 'All identifiers normalised to generic tags ([PERSON], [ORG], etc.).',
|
||||
'pseudonym' => 'Identifiers replaced with plausible pseudonymous values.',
|
||||
default => 'Contextual role tags used (e.g. [FATHER], [JUDGE: Name]).',
|
||||
}, 'complete');
|
||||
$trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning');
|
||||
$trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete');
|
||||
|
||||
@@ -418,7 +460,9 @@ PROMPT;
|
||||
'tool' => 'redact',
|
||||
'mode' => $mode,
|
||||
'region' => $region,
|
||||
'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment ? ' and LLM semantic scan' : '') . '.',
|
||||
'engine_used' => $engineLabel,
|
||||
'output_format' => $outputFormat,
|
||||
'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment || $engine === 'gpu' ? " and {$engineLabel} semantic scan" : '') . '.',
|
||||
'redacted_text' => $finalRedacted,
|
||||
'detected_entity_categories' => $categories,
|
||||
'entity_counts' => $allCounts,
|
||||
@@ -429,7 +473,7 @@ PROMPT;
|
||||
'trace_metadata' => [
|
||||
'chunk_count' => 1,
|
||||
'source_count' => 1,
|
||||
'deployment' => $llmDeployment,
|
||||
'deployment' => $llmDeployment ?? $engineLabel,
|
||||
],
|
||||
'disclaimer' => 'Privacy support tool. Review before disclosure.',
|
||||
];
|
||||
@@ -793,15 +837,32 @@ PROMPT;
|
||||
]);
|
||||
}
|
||||
|
||||
private function llmRedactionPass(string $preRedacted, string $language = 'en', array $aliases = []): array
|
||||
{
|
||||
$missing = $this->azure->missingChatConfig();
|
||||
if ($missing) {
|
||||
return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')'];
|
||||
private function llmRedactionPass(
|
||||
string $preRedacted,
|
||||
string $language = 'en',
|
||||
array $aliases = [],
|
||||
string $engine = 'azure_mini',
|
||||
bool $keepOfficials = false,
|
||||
array $exemptNames = [],
|
||||
bool $doNames = true,
|
||||
bool $doOrgs = true,
|
||||
bool $doPlaces = true,
|
||||
bool $doDob = true
|
||||
): array {
|
||||
if ($engine === 'regex') {
|
||||
return ['skipped' => true, 'reason' => 'Regex-only mode selected'];
|
||||
}
|
||||
|
||||
if ($engine !== 'gpu') {
|
||||
$missing = $this->azure->missingChatConfig();
|
||||
if ($missing) {
|
||||
return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')'];
|
||||
}
|
||||
}
|
||||
|
||||
$languageNote = $language === 'no' ? "\n • The document may contain Norwegian or mixed-language content." : '';
|
||||
|
||||
// Build alias block
|
||||
$aliasBlock = '';
|
||||
if (!empty($aliases)) {
|
||||
$lines = [];
|
||||
@@ -817,6 +878,32 @@ PROMPT;
|
||||
}
|
||||
}
|
||||
|
||||
// Build exempt names block
|
||||
$exemptBlock = '';
|
||||
if (!empty($exemptNames)) {
|
||||
$quoted = array_map(fn($n) => '"' . str_replace(['"', "\n"], ['\\"', ' '], $n) . '"', array_slice($exemptNames, 0, 20));
|
||||
$exemptBlock = "\n\nEXEMPT NAMES — these names must NOT be redacted under any circumstances:\n " . implode(', ', $quoted);
|
||||
}
|
||||
|
||||
// Build entity-type restriction note
|
||||
$skipTypes = [];
|
||||
if (!$doOrgs) $skipTypes[] = 'organisation names';
|
||||
if (!$doPlaces) $skipTypes[] = 'place names';
|
||||
if (!$doDob) $skipTypes[] = 'dates of birth';
|
||||
if (!$doNames) $skipTypes[] = 'person names';
|
||||
$skipNote = $skipTypes ? "\n\nSKIP these entity types — do NOT redact them: " . implode(', ', $skipTypes) . '.' : '';
|
||||
|
||||
// Build officials note
|
||||
$officialsNote = '';
|
||||
if ($keepOfficials) {
|
||||
$officialsNote = "\n\nOFFICIALS — for persons identified as JUDGE, EXPERT_WITNESS, or CASEWORKER in an official capacity: do NOT replace their name with a plain bracket tag. Instead use the format [ROLE: Name], e.g. [JUDGE: Andersen] or [EXPERT_WITNESS: Dr. Larsen]. Their name must remain visible inside the tag.";
|
||||
}
|
||||
|
||||
$allowedTypesNote = '';
|
||||
if (!$doNames) {
|
||||
$allowedTypesNote = "\n\nDo NOT include person_name entries in your output.";
|
||||
}
|
||||
|
||||
$system = <<<PROMPT
|
||||
You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS].
|
||||
|
||||
@@ -827,7 +914,7 @@ Assign each person a consistent contextual tag used for every occurrence of thei
|
||||
• Family roles: FATHER, MOTHER, CHILD, CHILD_1, CHILD_2, GRANDPARENT, SIBLING
|
||||
• Professional roles: ATTORNEY, JUDGE, CASEWORKER, EXPERT_WITNESS
|
||||
• Generic fallback: PERSON_1, PERSON_2 (use only when role cannot be determined)
|
||||
The same individual MUST receive the same tag every time they appear.{$aliasBlock}
|
||||
The same individual MUST receive the same tag every time they appear.{$aliasBlock}{$exemptBlock}{$officialsNote}{$skipNote}{$allowedTypesNote}
|
||||
|
||||
Return ONLY a valid JSON object:
|
||||
{"redactions":[{"original":"exact text as it appears","type":"person_name","tag":"[FATHER]"}]}
|
||||
@@ -848,16 +935,23 @@ Rules:
|
||||
• Short common words, conjunctions, and prepositions are NOT PII.{$languageNote}
|
||||
PROMPT;
|
||||
|
||||
$messages = [
|
||||
['role' => 'system', 'content' => $system],
|
||||
['role' => 'user', 'content' => $preRedacted],
|
||||
];
|
||||
$chatOptions = ['temperature' => 0.1, 'max_tokens' => 8000, 'json' => true, 'timeout' => 90];
|
||||
|
||||
try {
|
||||
$response = $this->azure->chat([
|
||||
['role' => 'system', 'content' => $system],
|
||||
['role' => 'user', 'content' => $preRedacted],
|
||||
], [
|
||||
'temperature' => 0.1,
|
||||
'max_tokens' => 8000,
|
||||
'json' => true,
|
||||
'timeout' => 90,
|
||||
]);
|
||||
if ($engine === 'gpu') {
|
||||
$response = $this->callGpuLlm($messages, $chatOptions);
|
||||
$deployLabel = 'GPU (cuttlefish)';
|
||||
} elseif ($engine === 'azure_full') {
|
||||
$response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOptions);
|
||||
$deployLabel = 'gpt-4o';
|
||||
} else {
|
||||
$response = $this->azure->chat($messages, $chatOptions);
|
||||
$deployLabel = $this->azure->chatDeployment();
|
||||
}
|
||||
|
||||
$content = (string)($response['choices'][0]['message']['content'] ?? '');
|
||||
$json = $this->azure->decodeJsonObject($content);
|
||||
@@ -869,7 +963,7 @@ PROMPT;
|
||||
return [
|
||||
'skipped' => false,
|
||||
'entities' => is_array($json['redactions']) ? $json['redactions'] : [],
|
||||
'deployment' => $this->azure->chatDeployment(),
|
||||
'deployment' => $deployLabel,
|
||||
];
|
||||
} catch (Throwable $e) {
|
||||
error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage());
|
||||
@@ -877,6 +971,150 @@ PROMPT;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Send a chat-completion request to the GPU-hosted LiteLLM endpoint and return the decoded JSON body.
 *
 * The bearer token is read from the DBN_GPU_LLM_API_KEY environment variable instead of being
 * committed to source control. DBN_GPU_LLM_URL and DBN_GPU_LLM_MODEL may override the endpoint
 * and model; when unset, the previous built-in values are used.
 * NOTE(review): these environment variable names are introduced by this change — align them with
 * the project's configuration mechanism, and rotate the key that was previously committed.
 *
 * @param array $messages Chat messages, forwarded unchanged as the "messages" field of the payload.
 * @param array $options  Recognised keys: temperature (default 0.1), max_tokens (default 8000),
 *                        json (truthy requests a json_object response format), timeout in seconds (default 90).
 *
 * @return array The decoded response body.
 *
 * @throws RuntimeException When the key is missing, the payload cannot be encoded, the transport
 *                          fails, the endpoint answers with a non-2xx status, or the body is not JSON.
 */
private function callGpuLlm(array $messages, array $options = []): array
{
    $url   = getenv('DBN_GPU_LLM_URL') ?: 'http://10.0.1.10:4000/v1/chat/completions';
    $model = getenv('DBN_GPU_LLM_MODEL') ?: 'qwen2.5:14b';

    // Secrets must not live in the repository; fail loudly so the caller can report the engine as unavailable.
    $apiKey = getenv('DBN_GPU_LLM_API_KEY');
    if (!is_string($apiKey) || $apiKey === '') {
        throw new RuntimeException('GPU LiteLLM API key is not configured (DBN_GPU_LLM_API_KEY).');
    }

    $timeout = (int)($options['timeout'] ?? 90);

    $payload = [
        'model'       => $model,
        'messages'    => $messages,
        'temperature' => $options['temperature'] ?? 0.1,
        'max_tokens'  => $options['max_tokens'] ?? 8000,
    ];
    if (!empty($options['json'])) {
        $payload['response_format'] = ['type' => 'json_object'];
    }

    // json_encode() returns false on e.g. malformed UTF-8; never send that as a request body.
    $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
    if ($body === false) {
        throw new RuntimeException('GPU LiteLLM request could not be encoded: ' . json_last_error_msg());
    }

    $headers = [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ];

    if (function_exists('curl_init')) {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => $body,
            CURLOPT_HTTPHEADER     => $headers,
            CURLOPT_TIMEOUT        => $timeout,
        ]);
        $response = curl_exec($ch);
        $code     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $err      = curl_error($ch);
        curl_close($ch);

        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed: ' . $err);
        }
    } else {
        // Fallback transport when the curl extension is unavailable.
        $ctx = stream_context_create(['http' => [
            'method'        => 'POST',
            'header'        => implode("\r\n", $headers),
            'content'       => $body,
            'timeout'       => $timeout,
            'ignore_errors' => true, // keep the body of 4xx/5xx answers so the error message can be read
        ]]);
        $response = @file_get_contents($url, false, $ctx);
        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed.');
        }

        $code = 0;
        if (isset($http_response_header[0]) && preg_match('/\s(\d{3})\s/', $http_response_header[0], $m)) {
            $code = (int)$m[1];
        }
    }

    $decoded = json_decode((string)$response, true);

    // Check the status first so that a gateway error with an HTML body still reports its HTTP code.
    if ($code < 200 || $code >= 300) {
        $msg = is_array($decoded) ? ($decoded['error']['message'] ?? null) : null;
        if (!is_string($msg) || $msg === '') {
            $msg = 'HTTP ' . $code;
        }
        throw new RuntimeException('GPU LiteLLM error: ' . $msg);
    }

    if (!is_array($decoded)) {
        throw new RuntimeException('GPU LiteLLM returned non-JSON response.');
    }

    return $decoded;
}
|
||||
|
||||
/**
 * Normalise contextual person tags to the generic [PERSON] tag.
 *
 * Matches the family, professional and PERSON_n role tags, including the
 * "[ROLE: Name]" variants of JUDGE, CASEWORKER and EXPERT_WITNESS. Any other
 * bracket tag is left untouched.
 *
 * @param string $text Text containing bracketed redaction tags.
 *
 * @return string The text with matching tags collapsed; the input unchanged if the regex engine fails.
 */
private function applyGenericTags(string $text): string
{
    $rolePattern = '/\[(?:FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u';

    $collapsed = preg_replace($rolePattern, '[PERSON]', $text);

    // preg_replace() yields null on a PCRE error (e.g. malformed UTF-8 under /u); fall back to the input.
    if ($collapsed === null) {
        return $text;
    }

    return $collapsed;
}
|
||||
|
||||
/**
 * Replace bracketed redaction tags with plausible placeholder values.
 *
 * Person-role tags are mapped to names from a fixed pool; the same tag always yields the same
 * name within one call. Phone, e-mail, address and organisation tags receive a running number
 * (or letter) per occurrence. The remaining known tags are replaced with a fixed placeholder.
 *
 * Fixes over the previous version:
 *  - address and organisation counters are now advanced, so successive occurrences no longer all
 *    render as "Eksempelveien 1" / "Eksempel AS (1)";
 *  - once the name pool is exhausted, reused names get a " (2)", " (3)", … suffix so two different
 *    tags never share a pseudonym.
 *
 * @param string $text      Text containing bracketed redaction tags.
 * @param array  $allCounts Not used by this method; kept so existing call sites remain valid.
 *
 * @return string The text with known tags substituted. If a regex step fails, that step is skipped.
 */
private function applyPseudonymization(string $text, array $allCounts): string
{
    $norwegianNames = [
        'Ola Nordmann', 'Per Hansen', 'Kari Larsen', 'Anne Berg', 'Erik Dahl',
        'Ingrid Holm', 'Lars Moen', 'Silje Bakke', 'Tor Haugen', 'Eva Strand',
    ];
    $poolSize  = count($norwegianNames);
    $personMap = [];

    // Person-role tags: one stable pseudonym per distinct tag (the "[ROLE: Name]" text is part of the key).
    $text = preg_replace_callback(
        '/\[(FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u',
        static function (array $m) use (&$personMap, $norwegianNames, $poolSize): string {
            $key = $m[1];
            if (!isset($personMap[$key])) {
                $index = count($personMap);
                $name  = $norwegianNames[$index % $poolSize];
                $round = intdiv($index, $poolSize);
                if ($round > 0) {
                    // Pool exhausted: disambiguate so distinct people stay distinct.
                    $name .= ' (' . ($round + 1) . ')';
                }
                $personMap[$key] = $name;
            }

            return $personMap[$key];
        },
        $text
    ) ?? $text;

    $phoneNo = 0;
    $text = preg_replace_callback('/\[PHONE\]/', static function () use (&$phoneNo): string {
        return sprintf('+47 400 00 %03d', ++$phoneNo);
    }, $text) ?? $text;

    $emailNo = 0;
    $text = preg_replace_callback('/\[EMAIL\]/', static function () use (&$emailNo): string {
        // Letters cycle a–z; after 26 addresses they repeat.
        $letter = chr(ord('a') + ($emailNo % 26));
        $emailNo++;

        return "person.{$letter}@example.no";
    }, $text) ?? $text;

    $addressNo = 0;
    $text = preg_replace_callback('/\[ADDRESS\]/', static function () use (&$addressNo): string {
        $addressNo++;

        return "Eksempelveien {$addressNo}, 0001 Oslo";
    }, $text) ?? $text;

    $orgNo = 0;
    $text = preg_replace_callback('/\[ORG\]/', static function () use (&$orgNo): string {
        $orgNo++;

        return "Eksempel AS ({$orgNo})";
    }, $text) ?? $text;

    // Tags with a constant replacement need no callback; applied in the same order as before.
    $text = str_replace('[FNR]', '010100XXXXX', $text);
    $text = str_replace(
        ['[SE_PERSONNUMMER]', '[FR_INSEE]', '[UK_NI]', '[SSN]', '[NAT_ID]', '[DOC_NO]', '[ECHR_APP_NO]'],
        '[ID-REDACTED]',
        $text
    );
    $text = str_replace('[PLACE]', 'Eksempelby', $text);
    $text = str_replace('[DOB]', '01.01.0000', $text);
    $text = str_replace('[IBAN]', 'NO00 0000 00 00000', $text);

    return $text;
}
|
||||
|
||||
private function uncertaintySummary(mixed $uncertainty): string
|
||||
{
|
||||
if (is_array($uncertainty)) {
|
||||
|
||||
Reference in New Issue
Block a user