From 8c12d5e7787ceaa3dadf1239d7a0fc7e15c9a863 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Fri, 15 May 2026 00:20:16 +0200 Subject: [PATCH] Redact tool: rich UI, multilingual, engine choice, output formats - Custom inline form (EN/NO/UK/PL lang switcher) replacing generic stub - Engine selector: Azure gpt-4o-mini (default), gpt-4o, GPU cuttlefish, regex-only - Entity type toggles: names, organisations, places, dates of birth - Output formats: contextual role tags, generic [PERSON], Norwegian pseudonyms - Keep officials mode: judges/experts kept as [JUDGE: Andersen] format - Exempt names list: specific names excluded from redaction - Hint paragraphs explaining each option in all four languages - Backend: engine routing, callGpuLlm(), applyGenericTags(), applyPseudonymization() - AzureOpenAiGateway: withDeployment() clone pattern for per-call model override Co-Authored-By: Claude Sonnet 4.6 --- api/redact.php | 36 +++- assets/css/tools.css | 55 ++++++ assets/js/tools.js | 312 +++++++++++++++++++++++++++++++- includes/AzureOpenAiGateway.php | 7 + includes/LegalTools.php | 298 +++++++++++++++++++++++++++--- redact.php | 121 ++++++++++++- 6 files changed, 789 insertions(+), 40 deletions(-) diff --git a/api/redact.php b/api/redact.php index 223dbb8..7e995d2 100644 --- a/api/redact.php +++ b/api/redact.php @@ -13,6 +13,18 @@ dbnToolsWithTelemetry('redact', '', function () use ($input): array { $region = dbnToolsNormalizeRegion($input['region'] ?? 'nordic'); $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en'); + $validEngines = ['azure_mini', 'azure_full', 'gpu', 'regex']; + $engine = in_array((string)($input['engine'] ?? ''), $validEngines, true) + ? (string)$input['engine'] + : 'azure_mini'; + + $validFormats = ['contextual', 'generic', 'pseudonym']; + $outputFormat = in_array((string)($input['output_format'] ?? ''), $validFormats, true) + ? (string)$input['output_format'] + : 'contextual'; + + $keepOfficials = (bool)($input['keep_officials'] ?? 
false); + $aliases = []; $rawAliases = $input['aliases'] ?? []; if (is_array($rawAliases)) { @@ -28,5 +40,27 @@ dbnToolsWithTelemetry('redact', '', function () use ($input): array { } } - return (new DbnLegalToolsService())->redact($text, $mode, $region, $language, $aliases); + $exemptNames = []; + $rawExempt = $input['exempt_names'] ?? []; + if (is_array($rawExempt)) { + foreach (array_slice($rawExempt, 0, 20) as $name) { + $name = substr(trim((string)$name), 0, 100); + if ($name !== '') { + $exemptNames[] = $name; + } + } + } + + $rawTypes = $input['redact_types'] ?? []; + $redactTypes = [ + 'names' => ($rawTypes['names'] ?? true) !== false, + 'orgs' => ($rawTypes['orgs'] ?? true) !== false, + 'places' => ($rawTypes['places'] ?? true) !== false, + 'dob' => ($rawTypes['dob'] ?? true) !== false, + ]; + + return (new DbnLegalToolsService())->redact( + $text, $mode, $region, $language, $aliases, + $engine, $outputFormat, $keepOfficials, $exemptNames, $redactTypes + ); }); diff --git a/assets/css/tools.css b/assets/css/tools.css index fe125b9..f8d8ba9 100644 --- a/assets/css/tools.css +++ b/assets/css/tools.css @@ -1415,3 +1415,58 @@ p { } .control-hint { font-size: 0.74rem; color: var(--muted); font-weight: 400; } + +/* ─── Exempt names section (Redact tool) ──────────────────────────────────── */ + +.exempt-section { + margin-top: 0.75rem; + padding-top: 0.75rem; + border-top: 1px solid var(--line); +} + +.exempt-row { + display: flex; + align-items: center; + gap: 0.5rem; + margin-bottom: 0.4rem; +} + +.exempt-name-input { + flex: 1; + padding: 0.3rem 0.55rem; + border: 1px solid var(--line); + border-radius: 6px; + font-size: 0.875rem; + background: var(--panel); + color: var(--ink); + min-width: 0; +} + +.exempt-name-input:focus { + outline: 3px solid rgba(15, 118, 110, 0.28); + outline-offset: 1px; + border-color: var(--teal); +} + +/* ─── Entity type toggles (Redact tool) ───────────────────────────────────── */ + +.entity-toggles { + flex-wrap: wrap; + gap: 
0.4rem 1.1rem; +} + +.entity-toggles label { + display: flex; + align-items: center; + gap: 0.3rem; + font-size: 0.875rem; + cursor: pointer; + user-select: none; +} + +.entity-toggles input[type="checkbox"] { + width: 15px; + height: 15px; + accent-color: var(--teal); + cursor: pointer; +} diff --git a/assets/js/tools.js b/assets/js/tools.js index 77077de..8494ad4 100644 --- a/assets/js/tools.js +++ b/assets/js/tools.js @@ -3,6 +3,205 @@ const state = { authenticated: Boolean(window.DBN_TOOLS_AUTHENTICATED), }; +const REDACT_I18N = { + en: { + redactEngine: 'Engine', + redactEngineAzureMini: 'Azure gpt-4o-mini', + redactEngineAzureFull: 'Azure gpt-4o', + redactEngineGpu: 'GPU (cuttlefish)', + redactEngineRegex: 'Regex only', + redactEngineHint: 'Azure engines use your BNL Azure credits. GPU runs the local LiteLLM proxy. Regex-only is instant and free but finds no names or organisations.', + redactMode: 'Mode', + redactModeStandard: 'Standard', + redactModeStrict: 'Strict', + redactModeHint: 'Standard: regex patterns + LLM scan for names/orgs/places. Strict: also replaces any capitalised two-word phrase as a potential name — more aggressive, may produce false positives.', + redactRegion: 'Region', + redactRegionNordic: 'Nordic', + redactRegionEuropean: 'European', + redactRegionEchr: 'ECHR', + redactRegionGlobal: 'Global', + redactRegionHint: 'Nordic: Norwegian fødselsnummer, phone, email, addresses. European: adds IBAN, SE personnummer, UK NI. ECHR: adds application numbers, DOB phrases. Global: adds US SSN, document numbers.', + redactEntities: 'Redact', + redactEntityNames: 'Names', + redactEntityOrgs: 'Organisations', + redactEntityPlaces: 'Places', + redactEntityDob: 'Dates of birth', + redactOfficials: 'Officials', + redactKeepOfficials: 'Keep official names (judges, experts)', + redactOfficialsHint: 'When checked, judges, expert witnesses and caseworkers keep their names in a labelled tag: [JUDGE: Andersen]. 
Uncheck to replace all names with generic role tags.', + redactOutput: 'Output', + redactOutputContextual: 'Contextual tags', + redactOutputGeneric: 'Generic tags', + redactOutputPseudo: 'Pseudonyms', + redactOutputHint: 'Contextual: each person gets a role tag so their identity is traceable within the document. Generic: all names become [PERSON]. Pseudonyms: replaced with plausible fake Norwegian values.', + redactExempt: 'Exempt names', + redactExemptAdd: 'Add', + redactExemptHint: 'Names listed here will never be redacted, even if the AI would otherwise remove them — e.g. a judge or expert who must remain identifiable.', + redactExemptPlaceholder: 'Name to keep (e.g. Judge Andersen)', + redactAliases: 'Name aliases', + redactAliasAdd: 'Add', + redactAliasHint: 'Replace a specific name with a custom bracketed label, e.g. "David Jr" → [Junior].', + redactUploadAria: 'File upload', + redactUploadDrop: 'Drop up to 5 files here, or', + redactUploadBrowse: 'browse', + redactUploadHint: 'text extracted in memory, never stored', + redactUploadClear: '× Clear', + redactInputLabel: 'Pasted text', + redactInputPlaceholder: 'Paste text containing names, phone numbers, emails, addresses, or national ID numbers.', + redactRun: 'Run', + redactRunning: 'Redacting…', + redactReadyTitle: 'Ready', + redactReadyDesc: 'Paste text or upload a file, configure redaction options, then run.', + }, + no: { + redactEngine: 'Motor', + redactEngineAzureMini: 'Azure gpt-4o-mini', + redactEngineAzureFull: 'Azure gpt-4o', + redactEngineGpu: 'GPU (cuttlefish)', + redactEngineRegex: 'Kun regex', + redactEngineHint: 'Azure-motorer bruker BNL Azure-kreditter. GPU kjører lokal LiteLLM-proxy. Kun regex er øyeblikkelig og gratis, men finner ingen navn eller organisasjoner.', + redactMode: 'Modus', + redactModeStandard: 'Standard', + redactModeStrict: 'Strikt', + redactModeHint: 'Standard: regex-mønstre + LLM-skanning for navn/org/steder. 
Strikt: erstatter også enhver stor-stav-kombinasjon som potensielt navn — mer aggressivt, kan gi falske positiver.', + redactRegion: 'Region', + redactRegionNordic: 'Nordisk', + redactRegionEuropean: 'Europeisk', + redactRegionEchr: 'EMD', + redactRegionGlobal: 'Global', + redactRegionHint: 'Nordisk: norsk fødselsnummer, telefon, e-post, adresser. Europeisk: legger til IBAN, SE personnummer, UK NI. EMD: legger til saksnummer, fødselsdatofraser. Global: legger til US SSN, dokumentnummer.', + redactEntities: 'Rediger', + redactEntityNames: 'Navn', + redactEntityOrgs: 'Organisasjoner', + redactEntityPlaces: 'Steder', + redactEntityDob: 'Fødselsdatoer', + redactOfficials: 'Offisielle', + redactKeepOfficials: 'Behold offisielle navn (dommere, sakkyndige)', + redactOfficialsHint: 'Når avkrysset beholder dommere, sakkyndige og saksbehandlere sine navn i en merket tagg: [DOMMER: Andersen]. Fjern haken for å erstatte alle navn med generiske rolletaggar.', + redactOutput: 'Utdata', + redactOutputContextual: 'Kontekstuelle taggar', + redactOutputGeneric: 'Generiske taggar', + redactOutputPseudo: 'Pseudonymer', + redactOutputHint: 'Kontekstuell: hver person får en rolletagg slik at identiteten kan spores i dokumentet. Generisk: alle navn blir [PERSON]. Pseudonymer: erstattes med troverdige falske norske verdier.', + redactExempt: 'Unntak', + redactExemptAdd: 'Legg til', + redactExemptHint: 'Navn oppført her vil aldri bli redigert, selv om AI ellers ville fjernet dem — f.eks. en dommer eller sakkyndig som må forbli identifiserbar.', + redactExemptPlaceholder: 'Navn som skal beholdes (f.eks. Dommer Andersen)', + redactAliases: 'Navnealiaser', + redactAliasAdd: 'Legg til', + redactAliasHint: 'Erstatt et spesifikt navn med en egendefinert merkelapp, f.eks. 
«David Jr» → [Junior].', + redactUploadAria: 'Filopplasting', + redactUploadDrop: 'Slipp opptil 5 filer her, eller', + redactUploadBrowse: 'bla', + redactUploadHint: 'tekst hentes i minnet, lagres aldri', + redactUploadClear: '× Tøm', + redactInputLabel: 'Limt inn tekst', + redactInputPlaceholder: 'Lim inn tekst med navn, telefonnummer, e-poster, adresser eller personnummer.', + redactRun: 'Kjør', + redactRunning: 'Redigerer…', + redactReadyTitle: 'Klar', + redactReadyDesc: 'Lim inn tekst eller last opp en fil, konfigurer redigeringsalternativene, og kjør.', + }, + uk: { + redactEngine: 'Рушій', + redactEngineAzureMini: 'Azure gpt-4o-mini', + redactEngineAzureFull: 'Azure gpt-4o', + redactEngineGpu: 'GPU (cuttlefish)', + redactEngineRegex: 'Лише регулярні вирази', + redactEngineHint: 'Рушії Azure використовують кредити BNL Azure. GPU запускає локальний проксі LiteLLM. Лише regex — миттєво і безкоштовно, але не знаходить імен або організацій.', + redactMode: 'Режим', + redactModeStandard: 'Стандартний', + redactModeStrict: 'Суворий', + redactModeHint: 'Стандарт: шаблони regex + LLM-сканування для імен/орг/місць. Суворий: також замінює будь-яку комбінацію слів з великої літери як потенційне ім\'я.', + redactRegion: 'Регіон', + redactRegionNordic: 'Nordisk', + redactRegionEuropean: 'Європейський', + redactRegionEchr: 'ЄСПЛ', + redactRegionGlobal: 'Глобальний', + redactRegionHint: 'Nordisk: норвезький фødselsnummer, телефон, email, адреси. Європейський: додає IBAN, SE personnummer, UK NI. ЄСПЛ: додає номери справ, фрази дати народження. 
Глобальний: додає US SSN.', + redactEntities: 'Редагувати', + redactEntityNames: 'Імена', + redactEntityOrgs: 'Організації', + redactEntityPlaces: 'Місця', + redactEntityDob: 'Дати народження', + redactOfficials: 'Офіційні особи', + redactKeepOfficials: 'Зберігати офіційні імена (судді, експерти)', + redactOfficialsHint: 'Якщо позначено, судді, експерти та соціальні працівники зберігають свої імена у позначеному тезі: [СУДДЯ: Andersen].', + redactOutput: 'Вивід', + redactOutputContextual: 'Контекстні теги', + redactOutputGeneric: 'Загальні теги', + redactOutputPseudo: 'Псевдоніми', + redactOutputHint: 'Контекстний: кожна особа отримує тег ролі. Загальний: всі імена стають [PERSON]. Псевдоніми: замінюються правдоподібними норвезькими значеннями.', + redactExempt: 'Виключені імена', + redactExemptAdd: 'Додати', + redactExemptHint: 'Імена, перелічені тут, ніколи не будуть відредаговані.', + redactExemptPlaceholder: 'Ім\'я для збереження (напр. суддя Andersen)', + redactAliases: 'Псевдоніми імен', + redactAliasAdd: 'Додати', + redactAliasHint: 'Замініть конкретне ім\'я на власну мітку, напр. «David Jr» → [Junior].', + redactUploadAria: 'Завантаження файлів', + redactUploadDrop: 'Перетягніть до 5 файлів сюди, або', + redactUploadBrowse: 'огляд', + redactUploadHint: 'текст обробляється в пам\'яті, ніколи не зберігається', + redactUploadClear: '× Очистити', + redactInputLabel: 'Вставлений текст', + redactInputPlaceholder: 'Вставте текст з іменами, телефонами, адресами або ідентифікаційними номерами.', + redactRun: 'Запустити', + redactRunning: 'Редагування…', + redactReadyTitle: 'Готово', + redactReadyDesc: 'Вставте текст або завантажте файл, налаштуйте параметри, запустіть.', + }, + pl: { + redactEngine: 'Silnik', + redactEngineAzureMini: 'Azure gpt-4o-mini', + redactEngineAzureFull: 'Azure gpt-4o', + redactEngineGpu: 'GPU (cuttlefish)', + redactEngineRegex: 'Tylko regex', + redactEngineHint: 'Silniki Azure używają kredytów Azure BNL. 
GPU korzysta z lokalnego proxy LiteLLM. Tylko regex jest natychmiastowy i bezpłatny, ale nie znajdzie imion ani organizacji.', + redactMode: 'Tryb', + redactModeStandard: 'Standardowy', + redactModeStrict: 'Ścisły', + redactModeHint: 'Standardowy: wzorce regex + skanowanie LLM dla imion/org/miejsc. Ścisły: zastępuje też każdą kombinację słów pisanych wielką literą jako potencjalne imię.', + redactRegion: 'Region', + redactRegionNordic: 'Nordycki', + redactRegionEuropean: 'Europejski', + redactRegionEchr: 'ETPC', + redactRegionGlobal: 'Globalny', + redactRegionHint: 'Nordycki: norweski fødselsnummer, telefon, email, adresy. Europejski: dodaje IBAN, SE personnummer, UK NI. ETPC: dodaje numery spraw, frazy daty urodzenia. Globalny: dodaje US SSN.', + redactEntities: 'Redaguj', + redactEntityNames: 'Imiona', + redactEntityOrgs: 'Organizacje', + redactEntityPlaces: 'Miejsca', + redactEntityDob: 'Daty urodzenia', + redactOfficials: 'Urzędnicy', + redactKeepOfficials: 'Zachowaj oficjalne nazwy (sędziowie, eksperci)', + redactOfficialsHint: 'Gdy zaznaczone, sędziowie, biegli i pracownicy socjalni zachowują swoje nazwiska w oznaczonym tagu: [SĘDZIA: Andersen].', + redactOutput: 'Wyjście', + redactOutputContextual: 'Tagi kontekstowe', + redactOutputGeneric: 'Tagi ogólne', + redactOutputPseudo: 'Pseudonimy', + redactOutputHint: 'Kontekstowe: każda osoba otrzymuje tag roli. Ogólne: wszystkie imiona stają się [PERSON]. Pseudonimy: zastąpione wiarygodnymi fałszywymi wartościami norweskimi.', + redactExempt: 'Zwolnione nazwy', + redactExemptAdd: 'Dodaj', + redactExemptHint: 'Nazwy tu wpisane nigdy nie zostaną zredagowane.', + redactExemptPlaceholder: 'Nazwa do zachowania (np. Sędzia Andersen)', + redactAliases: 'Aliasy nazw', + redactAliasAdd: 'Dodaj', + redactAliasHint: 'Zastąp konkretną nazwę własną etykietą, np. 
«David Jr» → [Junior].', + redactUploadAria: 'Przesyłanie pliku', + redactUploadDrop: 'Upuść do 5 plików tutaj lub', + redactUploadBrowse: 'przeglądaj', + redactUploadHint: 'tekst wyodrębniany w pamięci, nigdy nie przechowywany', + redactUploadClear: '× Wyczyść', + redactInputLabel: 'Wklejony tekst', + redactInputPlaceholder: 'Wklej tekst zawierający imiona, numery telefonów, adresy lub numery identyfikacyjne.', + redactRun: 'Uruchom', + redactRunning: 'Redagowanie…', + redactReadyTitle: 'Gotowe', + redactReadyDesc: 'Wklej tekst lub wgraj plik, skonfiguruj opcje redakcji, uruchom.', + }, +}; + let lastTimelineEvents = []; let audioQueue = []; // [{file, status: 'pending'|'processing'|'done'|'error', result}] let lastTranscriptData = null; @@ -310,6 +509,89 @@ function applyTranscribeI18n(lang) { }); } +function currentRedactT(key) { + const t = REDACT_I18N[uiLang] || REDACT_I18N.en; + return (key in t) ? t[key] : (REDACT_I18N.en[key] ?? key); +} + +function applyRedactI18n(lang) { + uiLang = lang; + localStorage.setItem('dbn-ui-lang', lang); + document.querySelectorAll('[data-i18n]').forEach((el) => { + const text = currentRedactT(el.dataset.i18n); + if (text != null) el.textContent = text; + }); + document.querySelectorAll('[data-i18n-placeholder]').forEach((el) => { + const text = currentRedactT(el.dataset.i18nPlaceholder); + if (text != null) el.placeholder = text; + }); + document.querySelectorAll('[data-i18n-aria]').forEach((el) => { + const text = currentRedactT(el.dataset.i18nAria); + if (text != null) el.setAttribute('aria-label', text); + }); + document.querySelectorAll('#redactLangSwitcher .lang-btn').forEach((btn) => { + btn.classList.toggle('is-active', btn.dataset.lang === lang); + }); +} + +function currentRedactEngine() { + return document.querySelector('input[name="redactEngine"]:checked')?.value || 'azure_mini'; +} + +function currentOutputFormat() { + return document.querySelector('input[name="outputFormat"]:checked')?.value || 'contextual'; +} + 
+function currentKeepOfficials() { + return document.getElementById('keepOfficialsCheck')?.checked ?? false; +} + +function currentRedactTypes() { + return { + names: document.getElementById('redactNames')?.checked ?? true, + orgs: document.getElementById('redactOrgs')?.checked ?? true, + places: document.getElementById('redactPlaces')?.checked ?? true, + dob: document.getElementById('redactDob')?.checked ?? true, + }; +} + +function setupRedactControls() { + const switcher = document.getElementById('redactLangSwitcher'); + if (!switcher) return; + switcher.querySelectorAll('.lang-btn').forEach((btn) => { + btn.addEventListener('click', () => applyRedactI18n(btn.dataset.lang)); + }); + applyRedactI18n(uiLang); +} + +function setupExemptNames() { + const addBtn = document.getElementById('addExemptRow'); + const rows = document.getElementById('exemptRows'); + if (!addBtn || !rows) return; + + addBtn.addEventListener('click', () => { + const row = document.createElement('div'); + row.className = 'exempt-row'; + row.innerHTML = [ + ``, + '', + ].join(''); + rows.appendChild(row); + row.querySelector('.exempt-name-input').focus(); + }); + + rows.addEventListener('click', (e) => { + const btn = e.target.closest('.alias-remove'); + if (btn) btn.closest('.exempt-row').remove(); + }); +} + +function getExemptNames() { + return Array.from(document.querySelectorAll('#exemptRows .exempt-name-input')) + .map((el) => el.value.trim()) + .filter(Boolean); +} + const tools = { ask: { kind: 'Source-grounded Legal Ask', @@ -433,10 +715,15 @@ document.addEventListener('DOMContentLoaded', () => { setupAudio(); setupTranscribeControls(); setupVocabPresets(); - document.querySelectorAll('.lang-btn').forEach((btn) => { + setupRedactControls(); + setupExemptNames(); + // Wire transcribe lang switcher (only present on transcribe page) + document.querySelectorAll('#uiLangSwitcher .lang-btn').forEach((btn) => { btn.addEventListener('click', () => applyTranscribeI18n(btn.dataset.lang)); }); - 
applyTranscribeI18n(uiLang); + if (document.getElementById('uiLangSwitcher')) { + applyTranscribeI18n(uiLang); + } els.results.addEventListener('click', (e) => { if (e.target.closest('#exportCsvBtn')) exportTimelineCSV(lastTimelineEvents); if (e.target.closest('#dlTxt')) downloadTranscriptTxt(); @@ -532,9 +819,14 @@ async function runTool(event) { payload.limit = 7; } if (state.activeTool === 'redact') { - payload.mode = currentRedactionMode(); - payload.region = currentRedactionRegion(); - payload.aliases = getAliases(); + payload.mode = currentRedactionMode(); + payload.region = currentRedactionRegion(); + payload.aliases = getAliases(); + payload.engine = currentRedactEngine(); + payload.output_format = currentOutputFormat(); + payload.keep_officials = currentKeepOfficials(); + payload.exempt_names = getExemptNames(); + payload.redact_types = currentRedactTypes(); } setBusy(true); @@ -742,9 +1034,13 @@ async function postJson(url, payload) { function setBusy(isBusy) { const button = document.querySelector('#runButton'); button.disabled = isBusy; - button.textContent = isBusy - ? (state.activeTool === 'transcribe' ? currentUiT('running') : currentUiT('runningOther')) - : currentUiT('run'); + if (state.activeTool === 'transcribe') { + button.textContent = isBusy ? currentUiT('running') : currentUiT('run'); + } else if (state.activeTool === 'redact') { + button.textContent = isBusy ? currentRedactT('redactRunning') : currentRedactT('redactRun'); + } else { + button.textContent = isBusy ? 
currentUiT('runningOther') : currentUiT('run'); + } } function currentLanguage() { diff --git a/includes/AzureOpenAiGateway.php b/includes/AzureOpenAiGateway.php index f718e96..35c7b3c 100644 --- a/includes/AzureOpenAiGateway.php +++ b/includes/AzureOpenAiGateway.php @@ -40,6 +40,13 @@ final class DbnAzureOpenAiGateway return $missing; } + public function withDeployment(string $deployment): static + { + $clone = clone $this; + $clone->config['chat_deployment'] = $deployment; + return $clone; + } + public function chatDeployment(): string { return (string)$this->config['chat_deployment']; diff --git a/includes/LegalTools.php b/includes/LegalTools.php index fef1075..2c577ae 100644 --- a/includes/LegalTools.php +++ b/includes/LegalTools.php @@ -343,15 +343,33 @@ PROMPT; ]; } - public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en', array $aliases = []): array - { - $text = $this->requirePasteText($text); - $mode = $mode === 'strict' ? 'strict' : 'standard'; - $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic'; + public function redact( + string $text, + string $mode = 'standard', + string $region = 'nordic', + string $language = 'en', + array $aliases = [], + string $engine = 'azure_mini', + string $outputFormat = 'contextual', + bool $keepOfficials = false, + array $exemptNames = [], + array $redactTypes = [] + ): array { + $text = $this->requirePasteText($text); + $mode = $mode === 'strict' ? 'strict' : 'standard'; + $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic'; + $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'regex'], true) ? $engine : 'azure_mini'; + $outputFormat = in_array($outputFormat, ['contextual', 'generic', 'pseudonym'], true) ? $outputFormat : 'contextual'; + + // Normalise entity-type flags (all on by default) + $doNames = ($redactTypes['names'] ?? 
true) !== false; + $doOrgs = ($redactTypes['orgs'] ?? true) !== false; + $doPlaces = ($redactTypes['places'] ?? true) !== false; + $doDob = ($redactTypes['dob'] ?? true) !== false; // Pass 1 — deterministic regex [$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region); - $pass1Total = array_sum($pass1Counts); + $pass1Total = array_sum($pass1Counts); $pass1Detail = $pass1Total ? implode(', ', array_map( fn($k, $v) => "{$k}: {$v}", @@ -360,8 +378,15 @@ PROMPT; )) : 'none detected'; + $engineLabel = match ($engine) { + 'azure_full' => 'Azure gpt-4o', + 'gpu' => 'GPU (cuttlefish)', + 'regex' => 'Regex only', + default => 'Azure gpt-4o-mini', + }; + $trace = [ - $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}.", 'complete'), + $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}. Engine: {$engineLabel}.", 'complete'), $this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'), ]; @@ -370,10 +395,14 @@ PROMPT; $pass2Counts = []; $llmDeployment = null; - $llmResult = $this->llmRedactionPass($preRedacted, $language, $aliases); + $llmResult = $this->llmRedactionPass( + $preRedacted, $language, $aliases, $engine, + $keepOfficials, $exemptNames, + $doNames, $doOrgs, $doPlaces, $doDob + ); if (!empty($llmResult['skipped'])) { - $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning'); + $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'not configured') . '.', 'warning'); } elseif (!empty($llmResult['error'])) { $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . 
'.', 'warning'); } else { @@ -391,7 +420,8 @@ PROMPT; if ($original === '' || str_starts_with($original, '[')) { continue; } - if (!preg_match('/^\[[A-Za-z0-9_\- ]+\]$/', $tag)) { + // Accept [ROLE: Name] tags only when keepOfficials is on; otherwise require a plain bracket tag so a name cannot leak back in through the tag + if (!preg_match($keepOfficials ? '/^\[[A-Za-z0-9_\- ]+(?::\s*[^\]]+)?\]$/' : '/^\[[A-Za-z0-9_\- ]+\]$/', $tag)) { $tag = '[IDENTIFIER]'; } if (str_contains($finalRedacted, $original)) { @@ -405,12 +435,24 @@ PROMPT; ? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts)) : 'no additional entities found'; - $trace[] = $this->trace('Pass 2 — LLM semantic scan', "Azure reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete'); + $trace[] = $this->trace('Pass 2 — LLM semantic scan', "{$engineLabel} reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete'); + } + + // Apply output format post-processing + $allCounts = array_merge($pass1Counts, $pass2Counts); + if ($outputFormat === 'generic') { + $finalRedacted = $this->applyGenericTags($finalRedacted); + } elseif ($outputFormat === 'pseudonym') { + $finalRedacted = $this->applyPseudonymization($finalRedacted, $allCounts); } - $allCounts = array_merge($pass1Counts, $pass2Counts); $categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0)); + $trace[] = $this->trace('Output format', match ($outputFormat) { + 'generic' => 'Person role tags collapsed to [PERSON]; other tags left unchanged.', + 'pseudonym' => 'Identifiers replaced with plausible pseudonymous values.', + default => 'Contextual role tags used (e.g. 
[FATHER], [JUDGE: Name]).', + }, 'complete'); $trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning'); $trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete'); @@ -418,7 +460,9 @@ PROMPT; 'tool' => 'redact', 'mode' => $mode, 'region' => $region, - 'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment ? ' and LLM semantic scan' : '') . '.', + 'engine_used' => $engineLabel, + 'output_format' => $outputFormat, + 'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment || $engine === 'gpu' ? " and {$engineLabel} semantic scan" : '') . '.', 'redacted_text' => $finalRedacted, 'detected_entity_categories' => $categories, 'entity_counts' => $allCounts, @@ -429,7 +473,7 @@ PROMPT; 'trace_metadata' => [ 'chunk_count' => 1, 'source_count' => 1, - 'deployment' => $llmDeployment, + 'deployment' => $llmDeployment ?? $engineLabel, ], 'disclaimer' => 'Privacy support tool. Review before disclosure.', ]; @@ -793,15 +837,32 @@ PROMPT; ]); } - private function llmRedactionPass(string $preRedacted, string $language = 'en', array $aliases = []): array - { - $missing = $this->azure->missingChatConfig(); - if ($missing) { - return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . 
')']; + private function llmRedactionPass( + string $preRedacted, + string $language = 'en', + array $aliases = [], + string $engine = 'azure_mini', + bool $keepOfficials = false, + array $exemptNames = [], + bool $doNames = true, + bool $doOrgs = true, + bool $doPlaces = true, + bool $doDob = true + ): array { + if ($engine === 'regex') { + return ['skipped' => true, 'reason' => 'Regex-only mode selected']; + } + + if ($engine !== 'gpu') { + $missing = $this->azure->missingChatConfig(); + if ($missing) { + return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')']; + } } $languageNote = $language === 'no' ? "\n • The document may contain Norwegian or mixed-language content." : ''; + // Build alias block $aliasBlock = ''; if (!empty($aliases)) { $lines = []; @@ -817,6 +878,32 @@ PROMPT; } } + // Build exempt names block + $exemptBlock = ''; + if (!empty($exemptNames)) { + $quoted = array_map(fn($n) => '"' . str_replace(['"', "\n"], ['\\"', ' '], $n) . '"', array_slice($exemptNames, 0, 20)); + $exemptBlock = "\n\nEXEMPT NAMES — these names must NOT be redacted under any circumstances:\n " . implode(', ', $quoted); + } + + // Build entity-type restriction note + $skipTypes = []; + if (!$doOrgs) $skipTypes[] = 'organisation names'; + if (!$doPlaces) $skipTypes[] = 'place names'; + if (!$doDob) $skipTypes[] = 'dates of birth'; + if (!$doNames) $skipTypes[] = 'person names'; + $skipNote = $skipTypes ? "\n\nSKIP these entity types — do NOT redact them: " . implode(', ', $skipTypes) . '.' : ''; + + // Build officials note + $officialsNote = ''; + if ($keepOfficials) { + $officialsNote = "\n\nOFFICIALS — for persons identified as JUDGE, EXPERT_WITNESS, or CASEWORKER in an official capacity: do NOT replace their name with a plain bracket tag. Instead use the format [ROLE: Name], e.g. [JUDGE: Andersen] or [EXPERT_WITNESS: Dr. Larsen]. 
Their name must remain visible inside the tag."; + } + + $allowedTypesNote = ''; + if (!$doNames) { + $allowedTypesNote = "\n\nDo NOT include person_name entries in your output."; + } + $system = << 'system', 'content' => $system], + ['role' => 'user', 'content' => $preRedacted], + ]; + $chatOptions = ['temperature' => 0.1, 'max_tokens' => 8000, 'json' => true, 'timeout' => 90]; + try { - $response = $this->azure->chat([ - ['role' => 'system', 'content' => $system], - ['role' => 'user', 'content' => $preRedacted], - ], [ - 'temperature' => 0.1, - 'max_tokens' => 8000, - 'json' => true, - 'timeout' => 90, - ]); + if ($engine === 'gpu') { + $response = $this->callGpuLlm($messages, $chatOptions); + $deployLabel = 'GPU (cuttlefish)'; + } elseif ($engine === 'azure_full') { + $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOptions); + $deployLabel = 'gpt-4o'; + } else { + $response = $this->azure->chat($messages, $chatOptions); + $deployLabel = $this->azure->chatDeployment(); + } $content = (string)($response['choices'][0]['message']['content'] ?? ''); $json = $this->azure->decodeJsonObject($content); @@ -869,7 +963,7 @@ PROMPT; return [ 'skipped' => false, 'entities' => is_array($json['redactions']) ? $json['redactions'] : [], - 'deployment' => $this->azure->chatDeployment(), + 'deployment' => $deployLabel, ]; } catch (Throwable $e) { error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage()); @@ -877,6 +971,185 @@ PROMPT; } }
+    /**
+     * Send a chat-completion request to the GPU LiteLLM proxy.
+     *
+     * Endpoint, model and API key come from the environment (DBN_GPU_LLM_URL,
+     * DBN_GPU_LLM_MODEL, DBN_GPU_LLM_API_KEY) so that no credential is committed.
+     * The key that was previously hard-coded here must be treated as leaked and rotated.
+     *
+     * @param array $messages Chat messages (role/content pairs).
+     * @param array $options  Optional temperature, max_tokens, json, timeout.
+     * @return array Decoded JSON response body.
+     * @throws RuntimeException When the key is missing, the request fails, or the
+     *                          proxy answers with a non-2xx status or non-JSON body.
+     * @throws JsonException    When the request payload cannot be encoded.
+     */
+    private function callGpuLlm(array $messages, array $options = []): array
+    {
+        $url    = getenv('DBN_GPU_LLM_URL') ?: 'http://10.0.1.10:4000/v1/chat/completions';
+        $model  = getenv('DBN_GPU_LLM_MODEL') ?: 'qwen2.5:14b';
+        $apiKey = (string)getenv('DBN_GPU_LLM_API_KEY');
+        if ($apiKey === '') {
+            throw new RuntimeException('GPU LiteLLM API key not configured (DBN_GPU_LLM_API_KEY).');
+        }
+        $timeout = (int)($options['timeout'] ?? 90);
+
+        $payload = [
+            'model'       => $model,
+            'messages'    => $messages,
+            'temperature' => $options['temperature'] ?? 0.1,
+            'max_tokens'  => $options['max_tokens'] ?? 8000,
+        ];
+        if (!empty($options['json'])) {
+            $payload['response_format'] = ['type' => 'json_object'];
+        }
+
+        // JSON_THROW_ON_ERROR: fail loudly instead of POSTing an empty body when encoding fails.
+        $body    = json_encode($payload, JSON_THROW_ON_ERROR | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
+        $headers = [
+            'Content-Type: application/json',
+            'Authorization: Bearer ' . $apiKey,
+        ];
+
+        if (function_exists('curl_init')) {
+            $ch = curl_init($url);
+            curl_setopt_array($ch, [
+                CURLOPT_RETURNTRANSFER => true,
+                CURLOPT_POST           => true,
+                CURLOPT_POSTFIELDS     => $body,
+                CURLOPT_HTTPHEADER     => $headers,
+                CURLOPT_TIMEOUT        => $timeout,
+            ]);
+            $response = curl_exec($ch);
+            $code     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+            $err      = curl_error($ch);
+            curl_close($ch);
+
+            if ($response === false) {
+                throw new RuntimeException('GPU LiteLLM request failed: ' . $err);
+            }
+        } else {
+            // Fallback when ext-curl is unavailable; ignore_errors keeps the body of 4xx/5xx replies.
+            $ctx = stream_context_create(['http' => [
+                'method'        => 'POST',
+                'header'        => implode("\r\n", $headers),
+                'content'       => $body,
+                'timeout'       => $timeout,
+                'ignore_errors' => true,
+            ]]);
+            $response = @file_get_contents($url, false, $ctx);
+            $code     = 0;
+            if (isset($http_response_header[0]) && preg_match('/\s(\d{3})\s/', $http_response_header[0], $m)) {
+                $code = (int)$m[1];
+            }
+            if ($response === false) {
+                throw new RuntimeException('GPU LiteLLM request failed.');
+            }
+        }
+
+        $decoded = json_decode((string)$response, true);
+        // Check the status first so an HTML/plain-text error page still reports its HTTP code.
+        if ($code < 200 || $code >= 300) {
+            $msg = is_array($decoded) ? ($decoded['error']['message'] ?? null) : null;
+            throw new RuntimeException('GPU LiteLLM error: ' . (is_string($msg) && $msg !== '' ? $msg : 'HTTP ' . $code));
+        }
+        if (!is_array($decoded)) {
+            throw new RuntimeException('GPU LiteLLM returned non-JSON response.');
+        }
+        return $decoded;
+    }
+
+    /**
+     * Collapse person role tags such as [FATHER], [CHILD_1] or [JUDGE: Andersen] into [PERSON].
+     *
+     * Only the person tags listed in the pattern are touched; every other tag is left as is.
+     */
+    private function applyGenericTags(string $text): string
+    {
+        $personTag = '/\[(?:FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u';
+
+        return preg_replace($personTag, '[PERSON]', $text) ?? $text;
+    }
+
+    /**
+     * Replace redaction tags with placeholder values that read like real data.
+     *
+     * Person tags are mapped to sample Norwegian names, one name per distinct tag.
+     * [PHONE], [EMAIL], [ADDRESS] and [ORG] receive a numbered value per occurrence;
+     * the remaining known tags receive a fixed placeholder.
+     *
+     * @param string $text      Text containing bracketed redaction tags.
+     * @param array  $allCounts Entity counts; not used here, kept so existing callers keep working.
+     * @return string Text with the known tags replaced.
+     */
+    private function applyPseudonymization(string $text, array $allCounts): string
+    {
+        $norwegianNames = [
+            'Ola Nordmann', 'Per Hansen', 'Kari Larsen', 'Anne Berg', 'Erik Dahl',
+            'Ingrid Holm', 'Lars Moen', 'Silje Bakke', 'Tor Haugen', 'Eva Strand',
+        ];
+        $nameCount   = count($norwegianNames);
+        $nameCursor  = 0;
+        $phoneBase   = 1;
+        $emailCursor = 0;
+        $addrCursor  = 1;
+        $orgCursor   = 1;
+        $personMap   = [];
+
+        // Person tags: the same tag always yields the same pseudonym.
+        $text = preg_replace_callback(
+            '/\[(FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u',
+            function (array $m) use (&$nameCursor, &$personMap, $norwegianNames, $nameCount): string {
+                $key = $m[1];
+                if (!isset($personMap[$key])) {
+                    $name  = $norwegianNames[$nameCursor % $nameCount];
+                    $round = intdiv($nameCursor, $nameCount);
+                    // Once the list is used up, number the reused names so two people never share a pseudonym.
+                    $personMap[$key] = $round > 0 ? $name . ' (' . ($round + 1) . ')' : $name;
+                    $nameCursor++;
+                }
+                return $personMap[$key];
+            },
+            $text
+        ) ?? $text;
+
+        $text = preg_replace_callback('/\[PHONE\]/', function () use (&$phoneBase): string {
+            return sprintf('+47 400 00 %03d', $phoneBase++);
+        }, $text) ?? $text;
+
+        $text = preg_replace_callback('/\[EMAIL\]/', function () use (&$emailCursor): string {
+            $letter = chr(ord('a') + ($emailCursor % 26));
+            $round  = intdiv($emailCursor, 26);
+            $emailCursor++;
+            // After 26 addresses, add a number so every occurrence stays distinct.
+            return 'person.' . $letter . ($round > 0 ? (string)($round + 1) : '') . '@example.no';
+        }, $text) ?? $text;
+
+        // The counters must advance, otherwise every address/organisation collapses into one value.
+        $text = preg_replace_callback('/\[ADDRESS\]/', function () use (&$addrCursor): string {
+            return sprintf('Eksempelveien %d, 0001 Oslo', $addrCursor++);
+        }, $text) ?? $text;
+
+        $text = preg_replace_callback('/\[ORG\]/', function () use (&$orgCursor): string {
+            return sprintf('Eksempel AS (%d)', $orgCursor++);
+        }, $text) ?? $text;
+
+        $text = preg_replace(
+            '/\[(?:SE_PERSONNUMMER|FR_INSEE|UK_NI|SSN|NAT_ID|DOC_NO|ECHR_APP_NO)\]/',
+            '[ID-REDACTED]',
+            $text
+        ) ?? $text;
+
+        // Tags with a single fixed placeholder.
+        return strtr($text, [
+            '[FNR]'   => '010100XXXXX',
+            '[PLACE]' => 'Eksempelby',
+            '[DOB]'   => '01.01.0000',
+            '[IBAN]'  => 'NO00 0000 00 00000',
+        ]);
+    }
+
private function uncertaintySummary(mixed $uncertainty): string { if (is_array($uncertainty)) { diff --git a/redact.php b/redact.php index 8429b27..81fdd5c 100644 --- a/redact.php +++ b/redact.php @@ -6,5 +6,124 @@ $toolKind = 'Redaction Assistant'; $toolBadge = 'deterministic first'; require_once __DIR__ . '/includes/layout.php'; ?> - +
+ +
+ + + + +
+ +
+ Engine + + + + +
+

Azure engines use your BNL Azure credits. GPU sends the text to the local LiteLLM proxy instead. Regex-only is instant and free but finds no names or organisations.

+ +
+ Mode + + +
+

Standard: regex patterns + LLM scan for names/orgs/places. Strict: also replaces any capitalised two-word phrase as a potential name — more aggressive, may produce false positives.

+ +
+ Region + + + + +
+

Nordic: Norwegian fødselsnummer, phone, email, addresses. European: adds IBAN, SE personnummer, UK NI. ECHR: adds application numbers, DOB phrases. Global: adds US SSN, document numbers.

+ +
+ Redact + + + + +
+ +
+ Officials + +
+

When checked, judges, expert witnesses and caseworkers keep their names in a labelled tag: [JUDGE: Andersen]. Uncheck to replace all names with generic role tags.

+ +
+ Output + + + +
+

Contextual: each person gets a role tag, so they can still be told apart within the document. Generic: all names become [PERSON]. Pseudonyms: redacted values are replaced with plausible fake Norwegian values.

+ +
+
+ Exempt names + +
+
+

Names listed here will never be redacted, even if the AI would otherwise remove them — e.g. a judge or expert who must remain identifiable.

+
+ +
+
+ Name aliases + +
+
+

Replace a specific name with a custom bracketed label, e.g. “David Jr” → [Junior].

+
+ +
+ +
+ +

Drop up to 5 files here, or

+

PDF, DOCX, TXT — text extracted in memory, never stored

+
+ +
+ + + + + +
+ +
+
+

Ready

+

Paste text or upload a file, configure redaction options, then run.

+
+
+ + + + + + + +