From 95685862ab0744d852456bc5a77aa7ca75844287 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Wed, 13 May 2026 07:17:02 +0200 Subject: [PATCH] Redact: multi-doc upload, contextual person naming, aliases - Extract limit raised from 32K to 128K chars per file (long legal docs now fit) - Redact API body/text limits raised (400KB / 128K chars) to match - Upload zone accepts multiple files (up to 5); extracted text concatenated with doc separator and combined before redaction; shows per-file char counts - LLM redact pass now infers contextual person roles (FATHER, MOTHER, CHILD, ATTORNEY, JUDGE, etc.) instead of generic [PERSON] for all names; same individual gets consistent tag throughout the document - Tag validation widened to allow any [A-Za-z0-9_- ] pattern (not just the five hardcoded tags), supporting contextual and alias tags - Alias UI added to Redact mode: user maps real names to bracketed aliases (e.g. "David Jr" -> [Junior]); aliases injected into LLM system prompt as override instructions; max 20 aliases, 100 chars each - max_tokens raised from 2000 to 4000; timeout from 60s to 90s for larger docs Co-Authored-By: Claude Sonnet 4.6 --- api/extract.php | 2 +- api/redact.php | 22 ++++++-- assets/css/tools.css | 110 ++++++++++++++++++++++++++++++++++++++ assets/js/tools.js | 115 +++++++++++++++++++++++++++++++--------- includes/LegalTools.php | 63 +++++++++++++++------- index.php | 19 +++++-- 6 files changed, 276 insertions(+), 55 deletions(-) diff --git a/api/extract.php b/api/extract.php index c3e8c38..b3cd7e7 100644 --- a/api/extract.php +++ b/api/extract.php @@ -7,7 +7,7 @@ dbnToolsRequireMethod('POST'); dbnToolsRequireAuth(); const EXTRACT_MAX_BYTES = 4 * 1024 * 1024; -const EXTRACT_TEXT_LIMIT = 32000; +const EXTRACT_TEXT_LIMIT = 128000; const EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx']; try { diff --git a/api/redact.php b/api/redact.php index 7467ca1..223dbb8 100644 --- a/api/redact.php +++ b/api/redact.php @@ -5,12 +5,28 @@ require_once __DIR__ . 
'/../includes/LegalTools.php'; dbnToolsRequireMethod('POST'); dbnToolsRequireAuth(); -$input = dbnToolsJsonInput(70000); +$input = dbnToolsJsonInput(400000); dbnToolsWithTelemetry('redact', '', function () use ($input): array { - $text = dbnToolsString($input, 'text', 32000); + $text = dbnToolsString($input, 'text', 128000); $mode = (string)($input['mode'] ?? 'standard'); $region = dbnToolsNormalizeRegion($input['region'] ?? 'nordic'); $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en'); - return (new DbnLegalToolsService())->redact($text, $mode, $region, $language); + + $aliases = []; + $rawAliases = $input['aliases'] ?? []; + if (is_array($rawAliases)) { + foreach (array_slice($rawAliases, 0, 20) as $item) { + if (!is_array($item)) { + continue; + } + $original = mb_strcut(trim((string)($item['original'] ?? '')), 0, 100, 'UTF-8'); /* 100-byte cap cut on a character boundary; byte-wise substr() could split a multibyte name into invalid UTF-8 */ + $alias = mb_strcut(trim((string)($item['alias'] ?? '')), 0, 100, 'UTF-8'); /* same boundary-safe cap for the replacement label */ + if ($original !== '' && $alias !== '') { + $aliases[] = ['original' => $original, 'alias' => $alias]; + } + } + } + + return (new DbnLegalToolsService())->redact($text, $mode, $region, $language, $aliases); }); diff --git a/assets/css/tools.css b/assets/css/tools.css index 24c5223..fd71490 100644 --- a/assets/css/tools.css +++ b/assets/css/tools.css @@ -963,3 +963,113 @@ p { color: var(--coral); border-color: #f5c6aa; } + +/* ─── Multi-file list ─────────────────────────────────────────────────────── */ + +.upload-file-list { + list-style: none; + padding: 0; + margin: 0; + text-align: left; + width: 100%; +} + +.upload-file-list li { + display: flex; + align-items: baseline; + gap: 0.5rem; + padding: 2px 0; + font-size: 0.85rem; +} + +.upload-chars { + color: var(--muted); + font-size: 0.78rem; + flex-shrink: 0; +} + +/* ─── Name aliases (Redact tool) ──────────────────────────────────────────── */ + +.alias-section { + margin-top: 0.75rem; + padding-top: 0.75rem; + border-top: 1px solid var(--line); +} + +.alias-header { + display: flex; + align-items: center; + gap: 
1rem; + margin-bottom: 0.5rem; +} + +.alias-add-btn { + background: var(--soft-teal); + color: var(--teal-dark); + border: 1px solid transparent; + border-radius: 6px; + padding: 3px 10px; + font-size: 0.8rem; + font-weight: 600; + cursor: pointer; + transition: background 0.12s; +} + +.alias-add-btn:hover { + background: #c4ece5; +} + +.alias-row { + display: flex; + align-items: center; + gap: 0.5rem; + margin-bottom: 0.4rem; +} + +.alias-original, +.alias-label { + flex: 1; + padding: 0.3rem 0.55rem; + border: 1px solid var(--line); + border-radius: 6px; + font-size: 0.875rem; + background: var(--panel); + color: var(--ink); + min-width: 0; +} + +.alias-original:focus, +.alias-label:focus { + outline: 3px solid rgba(15, 118, 110, 0.28); + outline-offset: 1px; + border-color: var(--teal); +} + +.alias-arrow { + color: var(--muted); + font-size: 1rem; + flex-shrink: 0; +} + +.alias-remove { + flex-shrink: 0; + background: transparent; + color: var(--muted); + font-size: 1.1rem; + line-height: 1; + padding: 2px 6px; + border-radius: 4px; + border: 1px solid transparent; +} + +.alias-remove:hover { + background: var(--soft-coral); + color: var(--coral); + border-color: #f5c6aa; +} + +.alias-hint { + font-size: 0.76rem; + color: var(--muted); + margin: 0.35rem 0 0; +} diff --git a/assets/js/tools.js b/assets/js/tools.js index 2be7308..6e863dc 100644 --- a/assets/js/tools.js +++ b/assets/js/tools.js @@ -84,8 +84,11 @@ document.addEventListener('DOMContentLoaded', () => { uploadInput: document.querySelector('#uploadInput'), uploadPrompt: document.querySelector('#uploadPrompt'), uploadFileInfo: document.querySelector('#uploadFileInfo'), - uploadFileName: document.querySelector('#uploadFileName'), + uploadFileList: document.querySelector('#uploadFileList'), uploadClear: document.querySelector('#uploadClear'), + aliasSection: document.querySelector('#aliasSection'), + addAliasRow: document.querySelector('#addAliasRow'), + aliasRows: document.querySelector('#aliasRows'), 
}); els.tabs.forEach((button) => { @@ -95,6 +98,7 @@ document.addEventListener('DOMContentLoaded', () => { els.passcodeForm.addEventListener('submit', submitPasscode); els.healthButton.addEventListener('click', checkHealth); setupUpload(); + setupAliases(); setTool(state.activeTool); if (state.authenticated) { @@ -122,7 +126,9 @@ function setTool(toolName) { els.languageControl.classList.toggle('is-hidden', !tool.usesLanguage); els.redactionControl.classList.toggle('is-hidden', toolName !== 'redact'); els.uploadZone.classList.toggle('is-hidden', toolName !== 'redact'); + els.aliasSection.classList.toggle('is-hidden', toolName !== 'redact'); resetUpload(); + resetAliases(); els.status.textContent = ''; renderTrace([]); } @@ -170,6 +176,7 @@ async function runTool(event) { if (state.activeTool === 'redact') { payload.mode = currentRedactionMode(); payload.region = currentRedactionRegion(); + payload.aliases = getAliases(); } setBusy(true); @@ -200,7 +207,7 @@ function resetUpload() { els.uploadInput.value = ''; els.uploadPrompt.classList.remove('is-hidden'); els.uploadFileInfo.classList.add('is-hidden'); - els.uploadFileName.textContent = ''; + els.uploadFileList.innerHTML = ''; els.uploadZone.classList.remove('is-drag-over'); } @@ -219,8 +226,7 @@ function setupUpload() { els.uploadZone.addEventListener('drop', (e) => { e.preventDefault(); els.uploadZone.classList.remove('is-drag-over'); - const file = e.dataTransfer?.files?.[0]; - if (file) handleFileUpload(file); + if (e.dataTransfer?.files?.length) handleFiles(e.dataTransfer.files); }); els.uploadZone.addEventListener('click', (e) => { @@ -230,49 +236,74 @@ function setupUpload() { }); els.uploadInput.addEventListener('change', () => { - const file = els.uploadInput.files?.[0]; - if (file) handleFileUpload(file); + if (els.uploadInput.files?.length) handleFiles(els.uploadInput.files); }); els.uploadClear.addEventListener('click', () => { resetUpload(); + els.input.value = ''; els.status.textContent = ''; }); } 
-async function handleFileUpload(file) { +async function handleFiles(fileList) { const allowed = ['pdf', 'docx', 'txt']; - const ext = file.name.split('.').pop().toLowerCase(); - if (!allowed.includes(ext)) { - els.status.textContent = 'Unsupported file type. Use .pdf, .docx, or .txt.'; - return; + const files = Array.from(fileList).slice(0, 5); + + for (const file of files) { + const ext = file.name.split('.').pop().toLowerCase(); + if (!allowed.includes(ext)) { + els.status.textContent = `Skipped ${file.name}: unsupported type. Use .pdf, .docx, or .txt.`; + return; + } } - els.status.textContent = `Extracting ${file.name}…`; + els.status.textContent = files.length === 1 ? `Extracting ${files[0].name}…` : `Extracting ${files.length} files…`; setBusy(true); + const parts = []; + let totalChars = 0; + let anyTruncated = false; + try { - const formData = new FormData(); - formData.append('file', file); + for (const file of files) { + const formData = new FormData(); + formData.append('file', file); - const resp = await fetch('api/extract.php', { - method: 'POST', - credentials: 'same-origin', - body: formData, - }); - const data = await resp.json().catch(() => ({})); + const resp = await fetch('api/extract.php', { + method: 'POST', + credentials: 'same-origin', + body: formData, + }); + const data = await resp.json().catch(() => ({})); - if (!resp.ok || !data.ok) { - throw new Error(data.error?.message || `Extraction failed (HTTP ${resp.status}).`); + if (!resp.ok || !data.ok) { + throw new Error(data.error?.message || `Extraction failed for ${file.name} (HTTP ${resp.status}).`); + } + + parts.push({ filename: file.name, chars: data.chars, truncated: data.truncated, text: data.text }); + totalChars += data.chars; + if (data.truncated) anyTruncated = true; } - els.input.value = data.text; - els.uploadFileName.textContent = file.name; + const combined = parts.length === 1 + ? 
parts[0].text + : parts.map((p) => `--- Document: ${p.filename} ---\n\n${p.text}`).join('\n\n'); + + const MAX_COMBINED = 128000; + const combinedTruncated = combined.length > MAX_COMBINED; + els.input.value = combinedTruncated ? combined.slice(0, MAX_COMBINED) : combined; + + els.uploadFileList.innerHTML = parts + .map((p) => `
  • ${escapeHtml(p.filename)}${p.chars.toLocaleString()} chars${p.truncated ? ' • per-file limit reached' : ''}
  • `) + .join(''); els.uploadPrompt.classList.add('is-hidden'); els.uploadFileInfo.classList.remove('is-hidden'); - const note = data.truncated ? ' (truncated to 32 000 chars)' : ''; - els.status.textContent = `Extracted ${data.chars.toLocaleString()} chars from ${file.name}${note}.`; + const truncNote = (anyTruncated || combinedTruncated) ? ' — truncated to 128 000 char limit' : ''; + els.status.textContent = parts.length === 1 + ? `Extracted ${totalChars.toLocaleString()} chars from ${parts[0].filename}${truncNote}.` + : `Extracted ${totalChars.toLocaleString()} chars total from ${parts.length} files${truncNote}.`; } catch (err) { els.status.textContent = err.message; resetUpload(); @@ -281,6 +312,38 @@ async function handleFileUpload(file) { } } +function setupAliases() { + els.addAliasRow.addEventListener('click', () => { + const row = document.createElement('div'); + row.className = 'alias-row'; + row.innerHTML = [ + '', + '', + '', + '', + ].join(''); + els.aliasRows.appendChild(row); + row.querySelector('.alias-original').focus(); + }); + + els.aliasRows.addEventListener('click', (e) => { + const btn = e.target.closest('.alias-remove'); + if (btn) btn.closest('.alias-row').remove(); + }); +} + +function getAliases() { + return Array.from(els.aliasRows.querySelectorAll('.alias-row')).flatMap((row) => { + const original = row.querySelector('.alias-original')?.value.trim() ?? ''; + const alias = row.querySelector('.alias-label')?.value.trim() ?? ''; + return original && alias ? 
[{ original, alias }] : []; + }); +} + +function resetAliases() { + if (els.aliasRows) els.aliasRows.innerHTML = ''; +} + async function checkHealth() { els.healthPill.textContent = 'Checking...'; try { diff --git a/includes/LegalTools.php b/includes/LegalTools.php index bbdc7ee..62b237c 100644 --- a/includes/LegalTools.php +++ b/includes/LegalTools.php @@ -330,7 +330,7 @@ PROMPT; ]; } - public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en'): array + public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en', array $aliases = []): array { $text = $this->requirePasteText($text); $mode = $mode === 'strict' ? 'strict' : 'standard'; @@ -357,7 +357,7 @@ PROMPT; $pass2Counts = []; $llmDeployment = null; - $llmResult = $this->llmRedactionPass($preRedacted, $language); + $llmResult = $this->llmRedactionPass($preRedacted, $language, $aliases); if (!empty($llmResult['skipped'])) { $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning'); @@ -378,7 +378,7 @@ PROMPT; if ($original === '' || str_starts_with($original, '[')) { continue; } - if (!in_array($tag, ['[PERSON]', '[ORG]', '[PLACE]', '[DOB]', '[IDENTIFIER]'], true)) { + if (preg_match('/^\[[\p{L}\p{N}_\- ]+\]\z/u', $tag) !== 1) { /* Unicode-aware so alias tags such as [Sønn] are kept; \z rejects a trailing newline; a PCRE error also falls back */ $tag = '[IDENTIFIER]'; } if (str_contains($finalRedacted, $original)) { @@ -780,36 +780,59 @@ PROMPT; ]); } - private function llmRedactionPass(string $preRedacted, string $language = 'en'): array + private function llmRedactionPass(string $preRedacted, string $language = 'en', array $aliases = []): array { $missing = $this->azure->missingChatConfig(); if ($missing) { return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')']; } - $languageNote = $language === 'no' ? "\nThe document may contain Norwegian or mixed-language content." 
: ''; + $languageNote = $language === 'no' ? "\n • The document may contain Norwegian or mixed-language content." : ''; + + $aliasBlock = ''; + if (!empty($aliases)) { + $lines = []; + foreach ($aliases as $a) { + $orig = trim(str_replace(["\n", "\r", '`', '"', '{', '}'], ' ', mb_strcut(trim((string)($a['original'] ?? '')), 0, 100, 'UTF-8'))); /* mb_strcut caps at 100 bytes on a character boundary; substr() could emit invalid UTF-8 into the prompt */ + $lbl = trim(str_replace(["\n", "\r", '`', '"', '{', '}', '[', ']'], ' ', mb_strcut(trim((string)($a['alias'] ?? '')), 0, 100, 'UTF-8'))); /* brackets stripped so an alias typed as "[Junior]" is not sent as [[Junior]]; outer trim drops values that were only stripped characters */ + if ($orig !== '' && $lbl !== '') { + $lines[] = " \"{$orig}\" → [{$lbl}]"; + } + } + if ($lines) { + $aliasBlock = "\n\nALIAS OVERRIDES — use these exact replacement tags for these specific names instead of inferring a role:\n" . implode("\n", $lines); + } + } $system = << 'user', 'content' => $preRedacted], ], [ 'temperature' => 0.1, - 'max_tokens' => 2000, + 'max_tokens' => 4000, 'json' => true, - 'timeout' => 60, + 'timeout' => 90, ]); $content = (string)($response['choices'][0]['message']['content'] ?? ''); diff --git a/index.php b/index.php index e28cf25..b0ca1a0 100644 --- a/index.php +++ b/index.php @@ -219,18 +219,27 @@ $authenticated = dbnToolsIsAuthenticated(); + +