Redact: multi-doc upload, contextual person naming, aliases

- Extract limit raised from 32K to 128K chars per file (long legal docs now fit)
- Redact API body/text limits raised (400KB / 128K chars) to match
- Upload zone accepts multiple files (up to 5); the extracted texts are joined
  with a per-document separator before redaction, and per-file char counts are shown
- LLM redact pass now infers contextual person roles (FATHER, MOTHER, CHILD,
  ATTORNEY, JUDGE, etc.) instead of generic [PERSON] for all names; same
  individual gets consistent tag throughout the document
- Tag validation widened to accept any bracketed tag whose body matches
  [A-Za-z0-9_- ]+ (not just the five hardcoded tags), supporting contextual and alias tags
- Alias UI added to Redact mode: user maps real names to bracketed aliases
  (e.g. "David Jr" -> [Junior]); aliases injected into LLM system prompt as
  override instructions; max 20 aliases, 100 chars each
- max_tokens raised from 2000 to 4000; timeout from 60s to 90s for larger docs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 07:17:02 +02:00
parent bbe5307c03
commit 95685862ab
6 changed files with 276 additions and 55 deletions
+1 -1
View File
@@ -7,7 +7,7 @@ dbnToolsRequireMethod('POST');
dbnToolsRequireAuth(); dbnToolsRequireAuth();
const EXTRACT_MAX_BYTES = 4 * 1024 * 1024; const EXTRACT_MAX_BYTES = 4 * 1024 * 1024;
const EXTRACT_TEXT_LIMIT = 32000; const EXTRACT_TEXT_LIMIT = 128000;
const EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx']; const EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
try { try {
+19 -3
View File
@@ -5,12 +5,28 @@ require_once __DIR__ . '/../includes/LegalTools.php';
dbnToolsRequireMethod('POST'); dbnToolsRequireMethod('POST');
dbnToolsRequireAuth(); dbnToolsRequireAuth();
$input = dbnToolsJsonInput(70000); $input = dbnToolsJsonInput(400000);
dbnToolsWithTelemetry('redact', '', function () use ($input): array { dbnToolsWithTelemetry('redact', '', function () use ($input): array {
$text = dbnToolsString($input, 'text', 32000); $text = dbnToolsString($input, 'text', 128000);
$mode = (string)($input['mode'] ?? 'standard'); $mode = (string)($input['mode'] ?? 'standard');
$region = dbnToolsNormalizeRegion($input['region'] ?? 'nordic'); $region = dbnToolsNormalizeRegion($input['region'] ?? 'nordic');
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en'); $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
return (new DbnLegalToolsService())->redact($text, $mode, $region, $language);
// Normalise the optional `aliases` request field into a list of
// ['original' => string, 'alias' => string] pairs for the redact service.
// Only the first 20 entries are considered and each field is capped at
// 100 characters; entries with an empty side are dropped.
$aliases = [];
$rawAliases = $input['aliases'] ?? [];
if (is_array($rawAliases)) {
    // Trim and cap a single alias field.
    // - Non-scalar values (nested arrays/objects, null) become '' instead of
    //   being cast to the literal string "Array" with a PHP warning.
    // - Truncation is character-based so a multi-byte UTF-8 character
    //   (e.g. "ø" in a Norwegian name) is never cut in half, which would
    //   leave invalid UTF-8 in the string passed on to the service.
    $cleanAliasField = static function (mixed $value): string {
        if (!is_scalar($value)) {
            return '';
        }
        $value = trim((string)$value);

        // Fall back to byte-wise substr only when mbstring is unavailable.
        return function_exists('mb_substr')
            ? mb_substr($value, 0, 100, 'UTF-8')
            : substr($value, 0, 100);
    };

    foreach (array_slice($rawAliases, 0, 20) as $item) {
        if (!is_array($item)) {
            continue;
        }
        $original = $cleanAliasField($item['original'] ?? null);
        $alias = $cleanAliasField($item['alias'] ?? null);
        if ($original !== '' && $alias !== '') {
            $aliases[] = ['original' => $original, 'alias' => $alias];
        }
    }
}
return (new DbnLegalToolsService())->redact($text, $mode, $region, $language, $aliases);
}); });
+110
View File
@@ -963,3 +963,113 @@ p {
color: var(--coral); color: var(--coral);
border-color: #f5c6aa; border-color: #f5c6aa;
} }
/* ─── Multi-file list ─────────────────────────────────────────────────────── */
/* Container for the per-file rows rendered after extraction. Default list
   bullets, padding and margin are removed so rows read as plain lines. */
.upload-file-list {
list-style: none;
padding: 0;
margin: 0;
text-align: left;
width: 100%;
}
/* One row per uploaded file: children are laid out side by side and aligned
   on their text baseline. */
.upload-file-list li {
display: flex;
align-items: baseline;
gap: 0.5rem;
padding: 2px 0;
font-size: 0.85rem;
}
/* Secondary text next to the filename (character count / limit note).
   flex-shrink: 0 keeps it at full width when the row is narrow. */
.upload-chars {
color: var(--muted);
font-size: 0.78rem;
flex-shrink: 0;
}
/* ─── Name aliases (Redact tool) ──────────────────────────────────────────── */
/* Wrapper for the alias editor; the top border and padding separate it from
   the control above it. */
.alias-section {
margin-top: 0.75rem;
padding-top: 0.75rem;
border-top: 1px solid var(--line);
}
/* Header row: section label and the "+ Add" button on one line. */
.alias-header {
display: flex;
align-items: center;
gap: 1rem;
margin-bottom: 0.5rem;
}
/* "+ Add" button. The transparent border reserves the border box. */
.alias-add-btn {
background: var(--soft-teal);
color: var(--teal-dark);
border: 1px solid transparent;
border-radius: 6px;
padding: 3px 10px;
font-size: 0.8rem;
font-weight: 600;
cursor: pointer;
transition: background 0.12s;
}
.alias-add-btn:hover {
background: #c4ece5;
}
/* One alias mapping: [real name input] → [alias input] [remove button]. */
.alias-row {
display: flex;
align-items: center;
gap: 0.5rem;
margin-bottom: 0.4rem;
}
/* Both text inputs share the row equally (flex: 1). min-width: 0 lets them
   shrink below their intrinsic width instead of overflowing the row. */
.alias-original,
.alias-label {
flex: 1;
padding: 0.3rem 0.55rem;
border: 1px solid var(--line);
border-radius: 6px;
font-size: 0.875rem;
background: var(--panel);
color: var(--ink);
min-width: 0;
}
/* Focus ring for the alias inputs. */
.alias-original:focus,
.alias-label:focus {
outline: 3px solid rgba(15, 118, 110, 0.28);
outline-offset: 1px;
border-color: var(--teal);
}
/* Decorative arrow between the two inputs; never shrinks. */
.alias-arrow {
color: var(--muted);
font-size: 1rem;
flex-shrink: 0;
}
/* Remove ("×") button at the end of a row. The transparent border keeps the
   button size stable when the hover border colour is applied. */
.alias-remove {
flex-shrink: 0;
background: transparent;
color: var(--muted);
font-size: 1.1rem;
line-height: 1;
padding: 2px 6px;
border-radius: 4px;
border: 1px solid transparent;
}
.alias-remove:hover {
background: var(--soft-coral);
color: var(--coral);
border-color: #f5c6aa;
}
/* Small explanatory text under the alias rows. */
.alias-hint {
font-size: 0.76rem;
color: var(--muted);
margin: 0.35rem 0 0;
}
+89 -26
View File
@@ -84,8 +84,11 @@ document.addEventListener('DOMContentLoaded', () => {
uploadInput: document.querySelector('#uploadInput'), uploadInput: document.querySelector('#uploadInput'),
uploadPrompt: document.querySelector('#uploadPrompt'), uploadPrompt: document.querySelector('#uploadPrompt'),
uploadFileInfo: document.querySelector('#uploadFileInfo'), uploadFileInfo: document.querySelector('#uploadFileInfo'),
uploadFileName: document.querySelector('#uploadFileName'), uploadFileList: document.querySelector('#uploadFileList'),
uploadClear: document.querySelector('#uploadClear'), uploadClear: document.querySelector('#uploadClear'),
aliasSection: document.querySelector('#aliasSection'),
addAliasRow: document.querySelector('#addAliasRow'),
aliasRows: document.querySelector('#aliasRows'),
}); });
els.tabs.forEach((button) => { els.tabs.forEach((button) => {
@@ -95,6 +98,7 @@ document.addEventListener('DOMContentLoaded', () => {
els.passcodeForm.addEventListener('submit', submitPasscode); els.passcodeForm.addEventListener('submit', submitPasscode);
els.healthButton.addEventListener('click', checkHealth); els.healthButton.addEventListener('click', checkHealth);
setupUpload(); setupUpload();
setupAliases();
setTool(state.activeTool); setTool(state.activeTool);
if (state.authenticated) { if (state.authenticated) {
@@ -122,7 +126,9 @@ function setTool(toolName) {
els.languageControl.classList.toggle('is-hidden', !tool.usesLanguage); els.languageControl.classList.toggle('is-hidden', !tool.usesLanguage);
els.redactionControl.classList.toggle('is-hidden', toolName !== 'redact'); els.redactionControl.classList.toggle('is-hidden', toolName !== 'redact');
els.uploadZone.classList.toggle('is-hidden', toolName !== 'redact'); els.uploadZone.classList.toggle('is-hidden', toolName !== 'redact');
els.aliasSection.classList.toggle('is-hidden', toolName !== 'redact');
resetUpload(); resetUpload();
resetAliases();
els.status.textContent = ''; els.status.textContent = '';
renderTrace([]); renderTrace([]);
} }
@@ -170,6 +176,7 @@ async function runTool(event) {
if (state.activeTool === 'redact') { if (state.activeTool === 'redact') {
payload.mode = currentRedactionMode(); payload.mode = currentRedactionMode();
payload.region = currentRedactionRegion(); payload.region = currentRedactionRegion();
payload.aliases = getAliases();
} }
setBusy(true); setBusy(true);
@@ -200,7 +207,7 @@ function resetUpload() {
els.uploadInput.value = ''; els.uploadInput.value = '';
els.uploadPrompt.classList.remove('is-hidden'); els.uploadPrompt.classList.remove('is-hidden');
els.uploadFileInfo.classList.add('is-hidden'); els.uploadFileInfo.classList.add('is-hidden');
els.uploadFileName.textContent = ''; els.uploadFileList.innerHTML = '';
els.uploadZone.classList.remove('is-drag-over'); els.uploadZone.classList.remove('is-drag-over');
} }
@@ -219,8 +226,7 @@ function setupUpload() {
els.uploadZone.addEventListener('drop', (e) => { els.uploadZone.addEventListener('drop', (e) => {
e.preventDefault(); e.preventDefault();
els.uploadZone.classList.remove('is-drag-over'); els.uploadZone.classList.remove('is-drag-over');
const file = e.dataTransfer?.files?.[0]; if (e.dataTransfer?.files?.length) handleFiles(e.dataTransfer.files);
if (file) handleFileUpload(file);
}); });
els.uploadZone.addEventListener('click', (e) => { els.uploadZone.addEventListener('click', (e) => {
@@ -230,49 +236,74 @@ function setupUpload() {
}); });
els.uploadInput.addEventListener('change', () => { els.uploadInput.addEventListener('change', () => {
const file = els.uploadInput.files?.[0]; if (els.uploadInput.files?.length) handleFiles(els.uploadInput.files);
if (file) handleFileUpload(file);
}); });
els.uploadClear.addEventListener('click', () => { els.uploadClear.addEventListener('click', () => {
resetUpload(); resetUpload();
els.input.value = '';
els.status.textContent = ''; els.status.textContent = '';
}); });
} }
async function handleFileUpload(file) { async function handleFiles(fileList) {
const allowed = ['pdf', 'docx', 'txt']; const allowed = ['pdf', 'docx', 'txt'];
const ext = file.name.split('.').pop().toLowerCase(); const files = Array.from(fileList).slice(0, 5);
if (!allowed.includes(ext)) {
els.status.textContent = 'Unsupported file type. Use .pdf, .docx, or .txt.'; for (const file of files) {
return; const ext = file.name.split('.').pop().toLowerCase();
if (!allowed.includes(ext)) {
els.status.textContent = `Skipped ${file.name}: unsupported type. Use .pdf, .docx, or .txt.`;
return;
}
} }
els.status.textContent = `Extracting ${file.name}`; els.status.textContent = files.length === 1 ? `Extracting ${files[0].name}` : `Extracting ${files.length} files`;
setBusy(true); setBusy(true);
const parts = [];
let totalChars = 0;
let anyTruncated = false;
try { try {
const formData = new FormData(); for (const file of files) {
formData.append('file', file); const formData = new FormData();
formData.append('file', file);
const resp = await fetch('api/extract.php', { const resp = await fetch('api/extract.php', {
method: 'POST', method: 'POST',
credentials: 'same-origin', credentials: 'same-origin',
body: formData, body: formData,
}); });
const data = await resp.json().catch(() => ({})); const data = await resp.json().catch(() => ({}));
if (!resp.ok || !data.ok) { if (!resp.ok || !data.ok) {
throw new Error(data.error?.message || `Extraction failed (HTTP ${resp.status}).`); throw new Error(data.error?.message || `Extraction failed for ${file.name} (HTTP ${resp.status}).`);
}
parts.push({ filename: file.name, chars: data.chars, truncated: data.truncated, text: data.text });
totalChars += data.chars;
if (data.truncated) anyTruncated = true;
} }
els.input.value = data.text; const combined = parts.length === 1
els.uploadFileName.textContent = file.name; ? parts[0].text
: parts.map((p) => `--- Document: ${p.filename} ---\n\n${p.text}`).join('\n\n');
const MAX_COMBINED = 128000;
const combinedTruncated = combined.length > MAX_COMBINED;
els.input.value = combinedTruncated ? combined.slice(0, MAX_COMBINED) : combined;
els.uploadFileList.innerHTML = parts
.map((p) => `<li><span class="upload-filename">${escapeHtml(p.filename)}</span><span class="upload-chars">${p.chars.toLocaleString()} chars${p.truncated ? ' • per-file limit reached' : ''}</span></li>`)
.join('');
els.uploadPrompt.classList.add('is-hidden'); els.uploadPrompt.classList.add('is-hidden');
els.uploadFileInfo.classList.remove('is-hidden'); els.uploadFileInfo.classList.remove('is-hidden');
const note = data.truncated ? ' (truncated to 32000 chars)' : ''; const truncNote = (anyTruncated || combinedTruncated) ? ' truncated to 128000 char limit' : '';
els.status.textContent = `Extracted ${data.chars.toLocaleString()} chars from ${file.name}${note}.`; els.status.textContent = parts.length === 1
? `Extracted ${totalChars.toLocaleString()} chars from ${parts[0].filename}${truncNote}.`
: `Extracted ${totalChars.toLocaleString()} chars total from ${parts.length} files${truncNote}.`;
} catch (err) { } catch (err) {
els.status.textContent = err.message; els.status.textContent = err.message;
resetUpload(); resetUpload();
@@ -281,6 +312,38 @@ async function handleFileUpload(file) {
} }
} }
/**
 * Wires up the alias editor in Redact mode.
 *
 * - Clicking the "+ Add" button appends a new row with a "real name" input,
 *   an "alias" input and a remove button, then focuses the first input.
 * - Clicking a row's remove button deletes that row (handled by one delegated
 *   listener on the rows container, so dynamically added rows are covered).
 *
 * The number of rows is capped at 20 because the redact endpoint only reads
 * the first 20 aliases; any further rows would be silently ignored.
 */
function setupAliases() {
  const MAX_ALIASES = 20;

  // Nothing to wire up if the alias markup is not on the page.
  if (!els.addAliasRow || !els.aliasRows) return;

  els.addAliasRow.addEventListener('click', () => {
    const rowCount = els.aliasRows.querySelectorAll('.alias-row').length;
    if (rowCount >= MAX_ALIASES) {
      if (els.status) {
        els.status.textContent = `You can add up to ${MAX_ALIASES} aliases.`;
      }
      return;
    }

    const row = document.createElement('div');
    row.className = 'alias-row';
    // Static markup only; no user-supplied text is interpolated here.
    row.innerHTML = [
      '<input type="text" class="alias-original" placeholder="Real name" maxlength="100">',
      '<span class="alias-arrow" aria-hidden="true">→</span>',
      '<input type="text" class="alias-label" placeholder="Alias (without brackets)" maxlength="100">',
      '<button type="button" class="alias-remove" aria-label="Remove alias">×</button>',
    ].join('');
    els.aliasRows.appendChild(row);
    row.querySelector('.alias-original').focus();
  });

  els.aliasRows.addEventListener('click', (e) => {
    // Ignore clicks that did not originate from a remove button.
    e.target.closest('.alias-remove')?.closest('.alias-row')?.remove();
  });
}
/**
 * Collects the alias mappings currently entered in the alias editor.
 *
 * Rows where either side is empty after trimming are skipped. Brackets typed
 * around the alias (e.g. "[Junior]") are removed, because the server wraps
 * the alias in brackets itself; sending them would produce "[[Junior]]",
 * which the tag validation rejects.
 *
 * @returns {{original: string, alias: string}[]} One entry per complete row.
 */
function getAliases() {
  if (!els.aliasRows) return [];

  // "[Junior]" -> "Junior"; also tolerates inner padding like "[ Junior ]".
  const stripBrackets = (value) => value.replace(/^\[+|\]+$/g, '').trim();

  return Array.from(els.aliasRows.querySelectorAll('.alias-row')).flatMap((row) => {
    const original = row.querySelector('.alias-original')?.value.trim() ?? '';
    const alias = stripBrackets(row.querySelector('.alias-label')?.value.trim() ?? '');
    return original && alias ? [{ original, alias }] : [];
  });
}
/**
 * Removes every alias row from the editor.
 * Does nothing when the rows container is not present.
 */
function resetAliases() {
  const container = els.aliasRows;
  if (!container) {
    return;
  }
  container.innerHTML = '';
}
async function checkHealth() { async function checkHealth() {
els.healthPill.textContent = 'Checking...'; els.healthPill.textContent = 'Checking...';
try { try {
+43 -20
View File
@@ -330,7 +330,7 @@ PROMPT;
]; ];
} }
public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en'): array public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en', array $aliases = []): array
{ {
$text = $this->requirePasteText($text); $text = $this->requirePasteText($text);
$mode = $mode === 'strict' ? 'strict' : 'standard'; $mode = $mode === 'strict' ? 'strict' : 'standard';
@@ -357,7 +357,7 @@ PROMPT;
$pass2Counts = []; $pass2Counts = [];
$llmDeployment = null; $llmDeployment = null;
$llmResult = $this->llmRedactionPass($preRedacted, $language); $llmResult = $this->llmRedactionPass($preRedacted, $language, $aliases);
if (!empty($llmResult['skipped'])) { if (!empty($llmResult['skipped'])) {
$trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning'); $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning');
@@ -378,7 +378,7 @@ PROMPT;
if ($original === '' || str_starts_with($original, '[')) { if ($original === '' || str_starts_with($original, '[')) {
continue; continue;
} }
if (!in_array($tag, ['[PERSON]', '[ORG]', '[PLACE]', '[DOB]', '[IDENTIFIER]'], true)) { if (!preg_match('/^\[[A-Za-z0-9_\- ]+\]$/', $tag)) {
$tag = '[IDENTIFIER]'; $tag = '[IDENTIFIER]';
} }
if (str_contains($finalRedacted, $original)) { if (str_contains($finalRedacted, $original)) {
@@ -780,36 +780,59 @@ PROMPT;
]); ]);
} }
private function llmRedactionPass(string $preRedacted, string $language = 'en'): array private function llmRedactionPass(string $preRedacted, string $language = 'en', array $aliases = []): array
{ {
$missing = $this->azure->missingChatConfig(); $missing = $this->azure->missingChatConfig();
if ($missing) { if ($missing) {
return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')']; return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')'];
} }
$languageNote = $language === 'no' ? "\nThe document may contain Norwegian or mixed-language content." : ''; $languageNote = $language === 'no' ? "\nThe document may contain Norwegian or mixed-language content." : '';
// Build the optional "ALIAS OVERRIDES" section that is appended to the
// system prompt. Each usable alias becomes one line of the form
//   "<original name>" → [<alias>]
// and the section stays empty when no usable alias is supplied.
$aliasBlock = '';
if (!empty($aliases)) {
    // Prepare one alias field for embedding in the prompt:
    // - non-scalar values become '' (no "Array to string" warning);
    // - truncation to 100 is character-based so multi-byte UTF-8 characters
    //   are never split (byte-wise substr could leave invalid UTF-8);
    // - newlines, backticks, double quotes and braces are replaced with
    //   spaces so the value cannot break out of its quoted prompt line.
    $promptSafe = static function (mixed $value): string {
        $value = is_scalar($value) ? trim((string)$value) : '';
        $value = function_exists('mb_substr')
            ? mb_substr($value, 0, 100, 'UTF-8')
            : substr($value, 0, 100);

        return trim(str_replace(["\n", "\r", '`', '"', '{', '}'], ' ', $value));
    };

    $lines = [];
    foreach ($aliases as $a) {
        if (!is_array($a)) {
            continue;
        }
        $orig = $promptSafe($a['original'] ?? '');
        // The label is wrapped in brackets below, so drop any brackets the
        // user typed; otherwise "[Junior]" would become "[[Junior]]".
        $lbl = trim(str_replace(['[', ']'], '', $promptSafe($a['alias'] ?? '')));
        if ($orig !== '' && $lbl !== '') {
            $lines[] = "  \"{$orig}\" → [{$lbl}]";
        }
    }
    if ($lines) {
        $aliasBlock = "\n\nALIAS OVERRIDES — use these exact replacement tags for these specific names instead of inferring a role:\n" . implode("\n", $lines);
    }
}
$system = <<<PROMPT $system = <<<PROMPT
You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS]. You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS].
Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates of birth, and identifying descriptions. Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates of birth, and identifying descriptions.
Return ONLY a valid JSON object: STEP 1 — For person names: identify each individual and infer their role or relationship from context.
{"redactions":[{"original":"exact text as it appears","type":"person_name","tag":"[PERSON]"}]} Assign each person a consistent contextual tag used for every occurrence of their name:
• Family roles: FATHER, MOTHER, CHILD, CHILD_1, CHILD_2, GRANDPARENT, SIBLING
• Professional roles: ATTORNEY, JUDGE, CASEWORKER, EXPERT_WITNESS
• Generic fallback: PERSON_1, PERSON_2 (use only when role cannot be determined)
The same individual MUST receive the same tag every time they appear.{$aliasBlock}
Allowed type values and their tags: Return ONLY a valid JSON object:
- person_name → [PERSON] {"redactions":[{"original":"exact text as it appears","type":"person_name","tag":"[FATHER]"}]}
- org → [ORG]
- place → [PLACE] Allowed types and their tag format:
- date_of_birth → [DOB] person_name → contextual role tag e.g. [FATHER], [CHILD_1], [ATTORNEY] (or alias tag if provided above)
- other → [IDENTIFIER] org [ORG]
place → [PLACE]
date_of_birth → [DOB]
other → [IDENTIFIER]
Rules: Rules:
- Include only text that appears verbatim in the input. Do not invent or paraphrase. Include only text that appears verbatim in the input. Do not invent or paraphrase.
- If nothing needs redacting, return {"redactions":[]}. • The same person MUST get the same tag every time they appear.
- Do not redact text already inside [BRACKETS]. • If nothing needs redacting, return {"redactions":[]}.
- Legal citations, statute names, article numbers, and institution names (e.g. "the European Court of Human Rights", "Barnevernloven § 4-12") are NOT PII. • Do not redact text already inside [BRACKETS].
- Short common words, conjunctions, and prepositions are NOT PII.{$languageNote} • Legal citations, statute names, article numbers, and institution names (e.g. "the European Court of Human Rights", "Barnevernloven § 4-12") are NOT PII.
• Short common words, conjunctions, and prepositions are NOT PII.{$languageNote}
PROMPT; PROMPT;
try { try {
@@ -818,9 +841,9 @@ PROMPT;
['role' => 'user', 'content' => $preRedacted], ['role' => 'user', 'content' => $preRedacted],
], [ ], [
'temperature' => 0.1, 'temperature' => 0.1,
'max_tokens' => 2000, 'max_tokens' => 4000,
'json' => true, 'json' => true,
'timeout' => 60, 'timeout' => 90,
]); ]);
$content = (string)($response['choices'][0]['message']['content'] ?? ''); $content = (string)($response['choices'][0]['message']['content'] ?? '');
+14 -5
View File
@@ -219,18 +219,27 @@ $authenticated = dbnToolsIsAuthenticated();
</div> </div>
<div class="upload-zone is-hidden" id="uploadZone" role="region" aria-label="File upload"> <div class="upload-zone is-hidden" id="uploadZone" role="region" aria-label="File upload">
<input type="file" id="uploadInput" accept=".pdf,.docx,.txt" aria-label="Choose a file"> <input type="file" id="uploadInput" multiple accept=".pdf,.docx,.txt" aria-label="Choose files">
<div id="uploadPrompt" class="upload-prompt"> <div id="uploadPrompt" class="upload-prompt">
<span class="upload-icon" aria-hidden="true">&#8679;</span> <span class="upload-icon" aria-hidden="true">&#8679;</span>
<p>Drop a <strong>.pdf</strong>, <strong>.docx</strong>, or <strong>.txt</strong>, or <label for="uploadInput" class="upload-browse">browse</label></p> <p>Drop up to 5 files (<strong>.pdf</strong>, <strong>.docx</strong>, <strong>.txt</strong>), or <label for="uploadInput" class="upload-browse">browse</label></p>
<p class="upload-hint">Text is extracted and never stored.</p> <p class="upload-hint">Text is extracted in memory and never stored.</p>
</div> </div>
<div id="uploadFileInfo" class="upload-file is-hidden"> <div id="uploadFileInfo" class="upload-file is-hidden">
<span id="uploadFileName" class="upload-filename"></span> <ul id="uploadFileList" class="upload-file-list"></ul>
<button type="button" id="uploadClear" class="upload-clear" aria-label="Clear uploaded file">&times;</button> <button type="button" id="uploadClear" class="upload-clear" aria-label="Clear uploaded files">&times;</button>
</div> </div>
</div> </div>
<div class="alias-section is-hidden" id="aliasSection">
<div class="alias-header">
<span class="control-label">Name aliases</span>
<button type="button" id="addAliasRow" class="alias-add-btn">+ Add</button>
</div>
<div id="aliasRows"></div>
<p class="alias-hint">Replace a name with a bracketed alias, e.g. &ldquo;David Jr&rdquo; &rarr; [Junior]</p>
</div>
<label class="input-label" for="toolInput" id="inputLabel">Question</label> <label class="input-label" for="toolInput" id="inputLabel">Question</label>
<textarea id="toolInput" name="toolInput" rows="10" required></textarea> <textarea id="toolInput" name="toolInput" rows="10" required></textarea>