Files
dobetternorge-tools/includes/LegalTools.php
T
daveadmin c84ed2ed78 fix(tools): parse-harden Do Better Legal ask against leaky fine-tune output
The dbn-legal-agent-v3 fine-tune (Track 1 / family) emits a labelled-prose
template — duplicate `answer:` prefixes, markdown-escaped underscores (`\_`),
and a trailing raw JSON blob — rather than the strict JSON the Azure/gpt-4o
path produces via response_format. decodeJsonObject() returned null on that
invalid JSON, so ask() dumped the entire raw blob into `answer`.

Fix at the parse layer (no upstream response_format change, to avoid fighting
the fine-tune's training):
- dbnToolsRepairJsonText(): strip fences, drop only invalid `\_`/`\*` escapes,
  then balanced-brace scan collecting every top-level {...} (longest first) to
  recover an appended JSON object. Shared by both gateways' decodeJsonObject(),
  so all JSON tools benefit.
- dbnToolsParseLabeledFields(): parse labelled-prose into real fields when no
  JSON decodes, tolerating escaped key names and collapsing duplicate prefixes.
- ask() null-fallback now builds clean structured fields from the parsed prose
  instead of dumping raw; what_remains_uncertain becomes a proper list.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-02 17:36:35 +02:00

2033 lines
100 KiB
PHP

<?php
declare(strict_types=1);
require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';
require_once __DIR__ . '/DbnGatewayFactory.php';
final class DbnLegalToolsService
{
// Size cap for pasted text. Not referenced by the methods visible here —
// presumably the default limit applied by requirePasteText() (TODO confirm).
private const MAX_PASTE_CHARS = 128000;
// Character limit that timeline() passes to requirePasteText().
private const MAX_TIMELINE_CHARS = 600000;
// Chat gateway used for model calls. Despite the property name it can hold
// either the Azure OpenAI gateway or the Bedrock gateway.
private DbnAzureOpenAiGateway|DbnBedrockGateway $azure;
/**
 * Keep the injected gateway, or obtain one from DbnGatewayFactory when the
 * caller did not supply any.
 */
public function __construct(DbnAzureOpenAiGateway|DbnBedrockGateway|null $azure = null)
{
    if (!$azure) {
        $azure = DbnGatewayFactory::make();
    }
    $this->azure = $azure;
}
/**
 * Search the legal corpus and return source hits together with a reasoning trace.
 *
 * Retrieval goes through ClientRagPipeline::searchAll(). If the pipeline throws,
 * or yields nothing, a direct SQL keyword fallback is tried instead.
 *
 * @param string      $query        Search text; must be at least 3 characters after trimming.
 * @param string      $language     Echoed in the payload and used to pick the disclaimer.
 * @param int         $limit        Maximum number of hits; clamped to 1..10.
 * @param string      $temporalMode 'legal_conservative' enables temporal reranking; any other value means 'disabled'.
 * @param string|null $asOfDate     Forwarded to the temporal reranker.
 * @param string      $scope        'shared', 'private' or 'both'; unknown values become 'both'.
 * @param string|null $persona      Persona identifier handed to dbnToolsResolvePersona().
 * @return array Payload with hits, evidence trail, trace and trace metadata.
 */
public function search(
    string $query,
    string $language = 'en',
    int $limit = 6,
    string $temporalMode = 'disabled',
    ?string $asOfDate = null,
    string $scope = 'both',
    ?string $persona = null
): array {
    $query = trim($query);
    if (mb_strlen($query, 'UTF-8') < 3) {
        dbnToolsAbort('Search query must be at least 3 characters.', 422, 'query_too_short');
    }
    $limit = max(1, min(10, $limit));
    $temporalMode = in_array($temporalMode, ['legal_conservative', 'disabled'], true) ? $temporalMode : 'disabled';
    $scope = in_array($scope, ['shared', 'private', 'both'], true) ? $scope : 'both';
    $scopeLabel = match ($scope) {
        'private' => 'personal corpus only',
        'shared' => 'Legal Library only',
        default => 'Legal Library + personal corpus',
    };
    $product = dbnToolsProductName();
    $trace = [
        $this->trace('Query interpretation', "Searching {$product} {$scopeLabel}.", 'complete'),
        // Placeholder entry; overwritten at index 1 once retrieval has finished.
        $this->trace('Search tools used', 'ClientRagPipeline::searchAll with keyword mode.', 'running'),
    ];
    $client = dbnToolsRequireClient();
    $personaResolved = dbnToolsResolvePersona((int)$client['id'], $persona);
    $package = $personaResolved['package'] ?? $this->requireFamilyPackage((int)$client['id']);
    // Guard the lookup like the neighbouring persona keys so a missing
    // 'package_ids' entry falls back to the package instead of raising a warning.
    $packageIds = ($personaResolved['package_ids'] ?? []) ?: [(int)$package['id']];
    $personaRagOpts = is_array($personaResolved['rag_opts'] ?? null) ? $personaResolved['rag_opts'] : [];
    $searchMethod = (string)($personaResolved['search_method'] ?? 'keyword') ?: 'keyword';
    // Personal corpus client_id from session (may be 0 if user has no linked workspace)
    $personalClientId = (int)($_SESSION['dbn_tools_client_id'] ?? 0);

    // Option sets shared by the scope branches below (explicit keys override persona options).
    $personalOpts = array_merge($personaRagOpts, [
        'search_private' => true,
        'search_shared' => false,
        'chunk_limit' => $limit,
        'search_method' => $searchMethod,
        'min_private' => 0,
    ]);
    $libraryOpts = array_merge($personaRagOpts, [
        'search_private' => true,
        'search_shared' => true,
        'package_ids' => $packageIds,
        'chunk_limit' => $limit,
        'search_method' => $searchMethod,
        'min_private' => 0,
        'include_beta_website' => true,
    ]);

    $chunks = [];
    $retrievalNote = 'ClientRagPipeline keyword retrieval';
    try {
        dbnToolsBootCaveau();
        $gatewayUrl = 'http://10.0.1.10:4000';
        try {
            $config = getConfig();
            $configured = trim((string)($config['ai_gateway']['url'] ?? ''));
            if ($configured !== '') {
                $gatewayUrl = $configured;
            }
        } catch (Throwable $e) {
            // Retrieval still works in keyword mode without gateway config.
        }
        if ($scope === 'private') {
            // Search only the user's personal corpus
            if ($personalClientId > 0) {
                $rag = new ClientRagPipeline($personalClientId, $gatewayUrl, 30);
                $chunks = $rag->searchAll($query, $limit, null, $personalOpts);
            }
        } else {
            // 'shared' and 'both' start with the persona-scoped library search.
            $rag = new ClientRagPipeline((int)$client['id'], $gatewayUrl, 30);
            $libraryChunks = $rag->searchAll($query, $limit, null, $libraryOpts);
            if ($scope === 'shared') {
                $chunks = $libraryChunks;
            } else {
                // 'both': shared library + personal corpus merged and re-ranked by score
                $privateChunks = [];
                if ($personalClientId > 0) {
                    try {
                        $ragPrivate = new ClientRagPipeline($personalClientId, $gatewayUrl, 30);
                        $privateChunks = $ragPrivate->searchAll($query, $limit, null, $personalOpts);
                    } catch (Throwable $e) {
                        // A failing personal corpus must not sink the library results.
                        error_log('[search] personal corpus query failed for client ' . $personalClientId . ': ' . $e->getMessage());
                    }
                }
                // Merge by score descending, cap at $limit
                $merged = array_merge($libraryChunks, $privateChunks);
                usort($merged, fn($a, $b) => ($b['score'] ?? 0) <=> ($a['score'] ?? 0));
                $chunks = array_slice($merged, 0, $limit);
            }
        }
        // Apply temporal reranking after retrieval (optional)
        if ($temporalMode === 'legal_conservative' && !empty($chunks)) {
            $temporalLayerPath = __DIR__ . '/../../ai-portal/platform/includes/LegalTemporalLayer.php';
            if (file_exists($temporalLayerPath)) {
                require_once $temporalLayerPath;
                $layer = new LegalTemporalLayer(['temporal_mode' => $temporalMode]);
                $chunks = $layer->rerank($chunks, $query, $asOfDate);
            }
        }
    } catch (Throwable $e) {
        $retrievalNote = 'SQL keyword fallback after ClientRagPipeline error';
        $trace[] = $this->trace('Search fallback', 'Pipeline retrieval failed; using direct SQL keyword fallback without storing the query.', 'warning');
        $chunks = $this->fallbackKeywordSearch((int)$client['id'], $package, $query, $limit);
    }
    // NOTE(review): this fallback queries the tools client/package even when
    // $scope is 'private' — confirm that is intended for private-only searches.
    if (!$chunks) {
        $fallback = $this->fallbackKeywordSearch((int)$client['id'], $package, $query, $limit);
        if ($fallback) {
            $chunks = $fallback;
            $retrievalNote = 'SQL keyword fallback';
        }
    }

    $topChunks = array_slice($chunks, 0, $limit);
    // Collect ids of non-private documents so their summaries can be fetched in one call.
    $sharedDocIds = [];
    foreach ($topChunks as $chunk) {
        if (($chunk['source_type'] ?? '') !== 'private' && isset($chunk['document_id'])) {
            $sharedDocIds[(int)$chunk['document_id']] = true;
        }
    }
    $docSummaries = $sharedDocIds ? $this->fetchDocSummaries(array_keys($sharedDocIds)) : [];
    $hits = array_map(
        fn(array $chunk): array => $this->sourceFromChunk(
            $chunk,
            ($chunk['source_type'] ?? '') !== 'private'
                ? ($docSummaries[(int)($chunk['document_id'] ?? 0)] ?? null)
                : null
        ),
        $topChunks
    );
    $hitCount = count($hits);
    $confidence = $this->citationConfidence($hits);
    $trace[1] = $this->trace('Search tools used', $retrievalNote . '; returned ' . $hitCount . ' source hit(s).', 'complete');
    $trace[] = $this->trace('Evidence found', $hitCount ? 'Retrieved source excerpts for review.' : 'No matching source excerpts were found.', $hitCount ? 'complete' : 'warning');
    $trace[] = $this->trace('Citation confidence', ucfirst($confidence) . ' confidence based on source count and retrieval scores.', $confidence === 'low' ? 'warning' : 'complete');
    return [
        'tool' => 'search',
        'language' => $language,
        'what_we_found' => $hitCount ? 'Found source excerpts from the legal corpus.' : 'No matching source excerpts were found.',
        'hits' => $hits,
        'evidence_trail' => $hits,
        'what_remains_uncertain' => $hitCount ? 'Search results still need human review for legal relevance and currentness.' : 'The corpus may not contain enough evidence for this query.',
        'next_practical_step' => $hitCount ? 'Open the strongest sources and confirm the cited sections before relying on them.' : 'Try a narrower query with statutory terms, party names, or dates.',
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($chunks),
            'source_count' => $hitCount,
            'deployment' => null,
            'citation_confidence' => $confidence,
            'persona' => $personaResolved['slug'] ?? null,
            'persona_source' => $personaResolved['source'] ?? null,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Answer a question from retrieved evidence only.
 *
 * Runs search() for up to 7 hits, then asks the persona's model for a JSON
 * answer. When the reply does not decode as JSON, the labelled-prose fallback
 * (dbnToolsParseLabeledFields) is used to recover the individual fields.
 *
 * @param string      $question Question text; also used as the search query.
 * @param string      $language Language for the answer and the disclaimer.
 * @param string      $engine   'azure_mini' or 'azure_full'; other values become 'azure_mini'.
 * @param string|null $persona  Persona identifier handed to dbnToolsResolvePersona().
 * @return array Payload with answer, sources, citation notes, trace and metadata.
 */
public function ask(string $question, string $language = 'en', string $engine = 'azure_mini', ?string $persona = null): array
{
    $engine = in_array($engine, ['azure_mini', 'azure_full'], true) ? $engine : 'azure_mini';
    $client = dbnToolsRequireClient();
    $personaResolved = dbnToolsResolvePersona((int)$client['id'], $persona);
    // 'slug' is read defensively, matching how search() reads it.
    $search = $this->search($question, $language, 7, 'disabled', null, 'both', $personaResolved['slug'] ?? null);
    $hits = $search['hits'];
    $trace = $search['trace'];
    if (!$hits) {
        // No evidence: return a fixed localized refusal without calling the model.
        $trace[] = $this->trace('Synthesis', 'Skipped answer synthesis because no evidence was found.', 'warning');
        return [
            'tool' => 'ask',
            'language' => $language,
            'answer' => match (dbnToolsNormalizeUiLanguage($language)) {
                'no' => 'Jeg fant ikke nok kildestøtte i familierettskorpuset til å svare sikkert.',
                'uk' => 'Я не знайшов достатньої підтримки в корпусі сімейного права, щоб відповісти безпечно.',
                'pl' => 'Nie znalazłem wystarczającego wsparcia źródłowego w korpusie prawa rodzinnego, aby odpowiedzieć bezpiecznie.',
                default => 'I did not find enough source support in the family-law corpus to answer safely.',
            },
            'what_we_found' => $search['what_we_found'],
            'evidence_trail' => [],
            'what_remains_uncertain' => $search['what_remains_uncertain'],
            'next_practical_step' => $search['next_practical_step'],
            'trace' => $trace,
            'trace_metadata' => [
                'chunk_count' => 0,
                'source_count' => 0,
                'deployment' => null,
                'citation_confidence' => 'low',
            ],
            'disclaimer' => dbnToolsDisclaimer($language),
        ];
    }
    [$gateway, $personaModel] = $this->personaGateway($personaResolved, $engine);
    $gateway->requireChat();
    $context = $this->buildEvidenceContext($hits);
    $locale = dbnToolsLanguageName($language);
    $prompt = <<<PROMPT
Question:
{$question}
Evidence excerpts:
{$context}
Return JSON only with these keys:
{
"answer": "short direct answer in {$locale}",
"what_we_found": "plain-language summary of the supported finding",
"evidence_trail": [{"title":"source title","why_it_matters":"one sentence","citation":"visible source title or section"}],
"what_remains_uncertain": ["specific gaps or caveats"],
"next_practical_step": "one concrete next action"
}
PROMPT;
    // Persona voice/domain folded into the JSON-enforcing scaffold (keeps the
    // structured-output contract while applying the persona's legal framing).
    $system = $this->legalJsonSystemPrompt($language, $personaResolved['system_prompt'] ?? null);
    $askDeployment = $personaModel;
    $raw = $gateway->withDeployment($askDeployment)->chatText([
        ['role' => 'system', 'content' => $system],
        ['role' => 'user', 'content' => $prompt],
    ], [
        'json' => true,
        'temperature' => 0.15,
        'max_tokens' => 1300,
    ]);
    $json = $gateway->decodeJsonObject($raw);
    if (!$json) {
        // Some fine-tuned models emit a labelled-prose template instead of JSON.
        // Parse those labels into the real fields rather than dumping the raw blob.
        $fields = dbnToolsParseLabeledFields($raw, [
            'answer', 'what_we_found', 'evidence_trail', 'what_remains_uncertain', 'next_practical_step',
        ]);
        $uncertain = trim((string)($fields['what_remains_uncertain'] ?? ''));
        // One list item per line, with leading bullet markers removed. The strip
        // is a UTF-8 aware regex: a byte-wise ltrim() with the multi-byte bullet in
        // its character list would also eat the lead bytes of other characters
        // (curly quotes, dashes, the euro sign) and leave invalid UTF-8 behind.
        // If the line itself is not valid UTF-8 the regex yields null and the
        // line is kept unchanged.
        $uncertainList = $uncertain !== ''
            ? array_values(array_filter(array_map(
                static fn(string $l): string => trim(preg_replace('/^[-*• \t]+/u', '', $l) ?? $l),
                preg_split('/\r?\n/', $uncertain) ?: []
            ), static fn(string $l): bool => $l !== ''))
            : ['The response format could not be validated as structured JSON.'];
        $cleanAnswer = trim((string)($fields['answer'] ?? ''));
        if ($cleanAnswer === '') {
            // No usable label — strip the trailing appended JSON blob from raw.
            $cleanAnswer = trim((string)preg_replace('/\s*\{[\s\S]*$/', '', (string)preg_replace('/\\\\([_*])/', '$1', $raw)));
        }
        $json = [
            'answer' => $cleanAnswer !== '' ? $cleanAnswer : $raw,
            'what_we_found' => trim((string)($fields['what_we_found'] ?? ''))
                ?: 'The model returned a plain-text answer based on the retrieved excerpts.',
            'evidence_trail' => [],
            'what_remains_uncertain' => $uncertainList,
            'next_practical_step' => trim((string)($fields['next_practical_step'] ?? ''))
                ?: 'Review the source excerpts manually before relying on the answer.',
        ];
    }
    $trace[] = $this->trace('Synthesis', 'Azure OpenAI generated an answer using only the retrieved source excerpts.', 'complete');
    $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete');
    $trace[] = $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the evidence trail.'), 'complete');
    return [
        'tool' => 'ask',
        'language' => $language,
        'answer' => (string)($json['answer'] ?? ''),
        'what_we_found' => (string)($json['what_we_found'] ?? ''),
        // The evidence trail is always the retrieved hits; the model's own
        // evidence_trail only feeds the citation notes.
        'evidence_trail' => $hits,
        'citation_notes' => $this->normalizeEvidenceTrail($json['evidence_trail'] ?? [], $hits),
        'sources' => $hits,
        'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [],
        'next_practical_step' => (string)($json['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($hits),
            'source_count' => count($hits),
            'deployment' => $askDeployment,
            'citation_confidence' => $search['trace_metadata']['citation_confidence'] ?? 'medium',
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Summarize user-pasted text through the JSON tool runner.
 *
 * No corpus search is performed; the only evidence entry is a fixed
 * "Pasted text" record.
 *
 * @param string $text     Pasted text, validated by requirePasteText().
 * @param string $language Language for the summary and the disclaimer.
 * @return array Payload with summary fields, trace and trace metadata.
 */
public function summarize(string $text, string $language = 'en'): array
{
    $text = $this->requirePasteText($text);
    $this->azure->requireChat();
    $locale = dbnToolsLanguageName($language);
    $prompt = <<<PROMPT
Summarize this pasted case-preparation text in {$locale}. Do not invent missing facts.
Pasted text:
{$text}
Return JSON only:
{
"what_we_found": "plain-language summary",
"key_facts": ["fact"],
"dates": ["date or unknown"],
"parties": ["party or role"],
"legal_references_detected": ["reference"],
"what_remains_uncertain": ["uncertainty"],
"next_practical_step": "one concrete next action"
}
PROMPT;
    $result = $this->runJsonTool($prompt, $language, 1300);
    $openQuestions = $result['what_remains_uncertain'] ?? [];

    // Fixed six-step trace; only the last two entries depend on the model output.
    $steps = [];
    $steps[] = $this->trace('Query interpretation', 'Summarize pasted text without saving the text or output.', 'complete');
    $steps[] = $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete');
    $steps[] = $this->trace('Evidence found', 'Evidence trail is limited to the pasted text supplied in this request.', 'complete');
    $steps[] = $this->trace('Citation confidence', 'Medium confidence for factual extraction; no external legal source verification was performed.', 'warning');
    $steps[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($openQuestions), 'complete');
    $steps[] = $this->trace('Next practical step', (string)($result['next_practical_step'] ?? 'Review the summary against the original text.'), 'complete');

    $payload = [
        'tool' => 'summarize',
        'language' => $language,
        'what_we_found' => (string)($result['what_we_found'] ?? ''),
    ];
    // List-valued fields are passed through as returned, defaulting to an empty list.
    foreach (['key_facts', 'dates', 'parties', 'legal_references_detected'] as $listKey) {
        $payload[$listKey] = $result[$listKey] ?? [];
    }
    $payload['evidence_trail'] = [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']];
    $payload['what_remains_uncertain'] = $openQuestions;
    $payload['next_practical_step'] = (string)($result['next_practical_step'] ?? '');
    $payload['trace'] = $steps;
    $payload['trace_metadata'] = [
        'chunk_count' => 1,
        'source_count' => 1,
        'deployment' => $this->azure->chatDeployment(),
    ];
    $payload['disclaimer'] = dbnToolsDisclaimer($language);
    return $payload;
}
/**
 * Extract a chronological timeline from pasted text with an LLM.
 *
 * Text longer than the engine's single-pass limit is delegated to
 * timelineChunked(). Otherwise one prompt is sent, the JSON reply is decoded,
 * and a deterministic date-line fallback is used when the model returns no
 * events although the input contains date hints.
 *
 * @param string        $text             Pasted text, validated against MAX_TIMELINE_CHARS.
 * @param string        $language         Language for the output and the disclaimer.
 * @param string        $engine           'nova_lite', 'azure_mini' or 'azure_full'; other values become 'azure_mini'.
 * @param string        $focus            'all', 'deadlines', 'hearings' or 'cps'; other values become 'all'.
 * @param string        $confidenceFilter 'high_medium' drops low-confidence events; other values keep all.
 * @param bool          $includeRelative  When false, only events with date_type 'absolute' are kept.
 * @param bool          $includeBackground Whether the prompt asks for background/narrative dates.
 * @param string        $userNotes        Optional context notes appended to the prompt.
 * @param callable|null $onProgress       Optional callback receiving progress messages.
 * @return array Payload with events, trace and trace metadata.
 */
public function timeline(
string $text,
string $language = 'en',
string $engine = 'azure_mini',
string $focus = 'all',
string $confidenceFilter = 'all',
bool $includeRelative = true,
bool $includeBackground = true,
string $userNotes = '',
?callable $onProgress = null
): array {
$text = $this->requirePasteText($text, self::MAX_TIMELINE_CHARS);
// NOTE(review): 'claude_sonnet' is not in this allow-list, so the
// 'claude_sonnet' arms further down can never match in this method.
$engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini';
$focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all';
$this->azure->requireChat();
$onProgress && $onProgress("Preparing document\u{2026}");
$locale = dbnToolsLanguageName($language);
// Counted up front; decides later whether the deterministic fallback may run.
$inputDateHintCount = $this->timelineDateHintCount($text);
// Optional prompt fragments, each either empty or starting with a newline.
$focusInstruction = match ($focus) {
'deadlines' => "\nFocus specifically on: legal deadlines, filing dates, response windows, appeal periods, and statutory time limits. Deprioritise narrative events with no legal deadline significance.",
'hearings' => "\nFocus specifically on: court hearings, tribunal sessions, mediation sessions, formal meetings, and hearing-related procedural dates.",
'cps' => "\nFocus specifically on: CPS (Barnevernet) interventions, home visits, case reviews, acute measures (akuttvedtak), and Fylkesnemnda proceedings.",
default => '',
};
$backgroundInstruction = $includeBackground
? "\nAlso extract BACKGROUND and NARRATIVE events: dates embedded in contextual paragraphs, historical facts, year-only references, and approximate years (e.g. \"rundt 2011/2012\", \"David ble født den 30.07.2015\", \"familien i 2015\"). These are valid timeline events even when they appear in introductory or background text — do NOT skip them."
: "\nDo NOT include purely historical background or narrative context dates. Focus only on operational events, deadlines, and milestones that are directly actionable in the case.";
$relativeInstruction = $includeRelative
? ''
: "\nDo NOT extract relative, recurring, or conditional date references — extract only events with determinable absolute dates (date_type=absolute).";
$userNotesBlock = $userNotes !== ''
? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---"
: '';
$charCount = mb_strlen($text, 'UTF-8');
$singlePassLimit = $this->timelineSinglePassLimit($engine);
// Too long for one call: hand over to the chunked variant, which calls back
// into this method once per chunk.
if ($charCount > $singlePassLimit) {
return $this->timelineChunked(
$text,
$language,
$engine,
$focus,
$confidenceFilter,
$includeRelative,
$includeBackground,
$userNotes,
$onProgress,
$inputDateHintCount
);
}
// Default (detailed) prompt; replaced below by a shorter one for nova_lite.
$prompt = <<<PROMPT
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
Extract ALL dates, deadlines, milestones, and temporal references.{$focusInstruction}{$backgroundInstruction}{$relativeInstruction}
IMPORTANT — Norwegian date and time formats to recognise:
- DD.MM.YYYY (e.g. 18.09.2025 → 2025-09-18)
- DD.MM.YY (e.g. 18.09.25 = 2025-09-18, 09.04.25 = 2025-04-09)
- D.M.YY (e.g. 6.1.25 = 2025-01-06)
- Two-digit years: always interpret as 20YY (25 → 2025, 24 → 2024).
- "den DD. Month YYYY" (e.g. "den 18. september 2025" → 2025-09-18)
- "DD. Month YYYY" (e.g. "18. september 2025" → 2025-09-18)
- "DD. Month" (e.g. "18. september" → infer year per the rule below)
- Norwegian month names: januar=01 februar=02 mars=03 april=04 mai=05 juni=06
juli=07 august=08 september=09 oktober=10 november=11 desember=12
- DD.MM. (e.g. 18.09.) and D.M. (e.g. 6.1.) — day and month WITHOUT year:
Step 1: scan BACKWARD in the same document section for the nearest absolute year.
Step 2: if none found before, scan FORWARD for the nearest absolute year.
Step 3: use that year and set confidence=medium.
Step 4: if the resulting date would be in the future relative to the document's apparent writing date, subtract one year.
Only use "year unknown" when no year anchor exists within 300 words.
- Times: "kl. 14:30", "kl 09.00", "14:30", "14.30" → extract as "14:30" (HH:MM 24-hour).
- Diary / log format: lines that begin with a date followed by a colon or space are ALWAYS events.
Example: "18.09.25: Samtale med Davids lærer" → date 2025-09-18, event "Samtale med Davids lærer".
Example: "6.1. Samtaler med David" → infer year from context, event "Samtaler med David".
Example: "18.09.25 kl. 09.00: Møte på skolen" → date 2025-09-18, time "09:00", event "Møte på skolen".
- Do NOT skip a line just because the year is ambiguous — infer from context, record it, and set confidence accordingly.
For each temporal reference provide:
- "date": ISO 8601 date (YYYY-MM-DD) if determinable, otherwise a human-readable description such as "06 Jan (year unknown)"
- "end_date": end date (YYYY-MM-DD) for date_type=period; null for all other types
- "time": time of day in HH:MM (24-hour) if present in the source text, otherwise null
- "date_type": one of absolute | relative | recurring | conditional | period
- "actor": person, institution, or party involved — or "unknown".
Normalize Norwegian institutional actors: Barnevernstjenesten/BV → "Barnevernstjenesten",
Fylkesnemnda → "Fylkesnemnda", Statsforvalteren/Statsforvalter → "Statsforvalteren",
Tingrett → "Tingrett", Lagmannsrett → "Lagmannsrett", Høyesterett → "Høyesterett",
NAV → "NAV", BUP → "BUP", PPT → "PPT".
- "event": concise description of what happened or is due
- "source_excerpt": the most diagnostic verbatim phrase (≤ 30 words) that directly establishes both
the date and the event — prefer the phrase that would be least ambiguous out of context
- "confidence": high | medium | low
high = explicit date + event, no ambiguity, verbatim in the text
medium = year derived from context, date approximate, or event description is paraphrased
low = no explicit date, year is unknown, or event is implied rather than stated
Sort events chronologically (absolute dates first, then relative, then recurring).
Keep uncertain dates explicit — do not invent dates not in the text.
If multiple documents are separated by "--- Document: … ---" markers, note the source document in the event description where helpful.
If the same event appears in multiple documents, create ONE entry — use the most specific date and note both sources in the event description.
Pasted text:
{$text}
Return JSON only:
{
"what_we_found": "total events found; earliest and latest dates; main actors; any notable gaps",
"events": [{"date":"...","end_date":"YYYY-MM-DD or null","time":"HH:MM or null","date_type":"absolute","actor":"...","event":"...","source_excerpt":"...","confidence":"high|medium|low"}],
"evidence_trail": [{"title":"...","excerpt":"..."}],
"what_remains_uncertain": ["..."],
"next_practical_step": "..."
}
PROMPT;
if ($engine === 'nova_lite') {
$prompt = <<<PROMPT
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
Extract dated lines and temporal references from uploaded or pasted case text. Focus on recall: if a line starts with or contains a date, include it as an event.{$focusInstruction}{$backgroundInstruction}{$relativeInstruction}
Recognise Norwegian formats:
- DD.MM.YYYY, DD.MM.YY, D.M.YY
- DD.MM. or D.M. without a year; infer the nearest year from nearby text when possible
- "den DD. month YYYY" and Norwegian month names
- optional times such as "kl. 09.00" or "14:30"
For every event return:
- date as YYYY-MM-DD when determinable, otherwise a short human-readable date
- end_date as null unless the source states a period
- time as HH:MM or null
- date_type: absolute, relative, recurring, conditional, or period
- actor: the named person/institution or "unknown"
- event: concise description
- source_excerpt: the exact source words that show the date and event
- confidence: high, medium, or low
Pasted text:
{$text}
Return JSON only:
{
"what_we_found": "event count, date range, main actors, notable gaps",
"events": [{"date":"YYYY-MM-DD","end_date":null,"time":null,"date_type":"absolute","actor":"unknown","event":"...","source_excerpt":"...","confidence":"high"}],
"evidence_trail": [{"title":"Pasted text","excerpt":"Processed in-memory only; not stored."}],
"what_remains_uncertain": [],
"next_practical_step": "..."
}
PROMPT;
}
$system = $this->legalJsonSystemPrompt($language);
$messages = [
['role' => 'system', 'content' => $system],
['role' => 'user', 'content' => $prompt],
];
$isBedrock = $this->azure instanceof DbnBedrockGateway;
// Token budget per engine.
$maxTokens = match ($engine) { 'azure_full', 'claude_sonnet' => 8000, 'nova_lite' => 2000, default => 4000 };
$chatOptions = ['json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTokens, 'timeout' => 120];
// Human-readable model label used in progress messages, trace and metadata.
$deployLabel = match (true) {
$engine === 'nova_lite' => 'nova-lite',
$engine === 'azure_full' || $engine === 'claude_sonnet' => $isBedrock ? 'claude-sonnet-bedrock' : 'gpt-4o',
default => $isBedrock ? 'claude-haiku-bedrock' : 'gpt-4o-mini',
};
$onProgress && $onProgress("Calling {$deployLabel}\u{2026}");
try {
if ($engine === 'nova_lite') {
// nova_lite bypasses the gateway object and goes through dbnToolsCallGpuLlm().
$response = dbnToolsCallGpuLlm($messages, ['model' => 'nova-lite', 'max_tokens' => $maxTokens, 'temperature' => 0.1, 'timeout' => 120]);
} elseif ($engine === 'azure_full' || $engine === 'claude_sonnet') {
$deploy = $isBedrock ? DbnBedrockModelRouter::LITELLM_SONNET : 'gpt-4o';
$response = $this->azure->withDeployment($deploy)->chat($messages, $chatOptions);
} else {
$deploy = $isBedrock ? DbnBedrockModelRouter::LITELLM_HAIKU : 'gpt-4o-mini';
$response = $this->azure->withDeployment($deploy)->chat($messages, $chatOptions);
}
} catch (Throwable $e) {
// Map transport failures to HTTP-style aborts; timeouts get their own code.
$msg = $e->getMessage();
if (preg_match('/timed?\s*out|timeout|operation timed out/i', $msg)) {
dbnToolsAbort('The model timed out. Try Quick mode, a smaller file, or fewer selected documents.', 504, 'llm_timeout');
}
dbnToolsAbort('LLM request failed: ' . $msg, 502, 'llm_error');
}
$onProgress && $onProgress("Parsing events\u{2026}");
$raw = (string)($response['choices'][0]['message']['content'] ?? '');
$json = $this->azure->decodeJsonObject($raw);
if (!$json) {
dbnToolsAbort('The selected engine did not return valid structured JSON.', 502, 'llm_invalid_json');
}
$events = is_array($json['events'] ?? null) ? $json['events'] : [];
$usedFallbackExtractor = false;
// Model found nothing although the input has date hints: try the
// deterministic extractor and rewrite the summary fields to say so.
if (!$events && $inputDateHintCount > 0) {
$fallbackEvents = $this->fallbackTimelineEvents($text);
if ($fallbackEvents) {
$events = $fallbackEvents;
$usedFallbackExtractor = true;
$uncertain = is_array($json['what_remains_uncertain'] ?? null) ? $json['what_remains_uncertain'] : [];
array_unshift($uncertain, 'The selected engine returned no events, so a deterministic date-line fallback extracted visible dated lines. Review these medium-confidence entries against the original file.');
$json['what_remains_uncertain'] = $uncertain;
$json['what_we_found'] = count($events) . ' date-like event(s) extracted by fallback after the selected engine returned no events.';
$json['next_practical_step'] = 'Review each fallback event against the original uploaded document and rerun with Standard or Deep if you need fuller actor/event interpretation.';
}
}
// No events and no date hints: make sure the summary fields are non-empty.
if (!$events && $inputDateHintCount === 0) {
$json['what_we_found'] = (string)($json['what_we_found'] ?? 'No recognizable dates were found in the extracted text from this upload.');
if (trim((string)$json['what_we_found']) === '') {
$json['what_we_found'] = 'No recognizable dates were found in the extracted text from this upload.';
}
$json['next_practical_step'] = (string)($json['next_practical_step'] ?? 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.');
if (trim((string)$json['next_practical_step']) === '') {
$json['next_practical_step'] = 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.';
}
}
// Post-filter: confidence (events without a confidence value count as 'low')
if ($confidenceFilter === 'high_medium') {
$events = array_values(array_filter($events, fn($ev) => ($ev['confidence'] ?? 'low') !== 'low'));
}
// Post-filter: relative/recurring date types (events without a date_type count as 'absolute')
if (!$includeRelative) {
$events = array_values(array_filter($events, fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute'));
}
$engineLabel = $deployLabel;
$focusLabel = match ($focus) {
'deadlines' => 'legal deadlines',
'hearings' => 'court hearings',
'cps' => 'CPS milestones',
default => 'all events',
};
$trace = [
$this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Without saving the text or output.", 'complete'),
$this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete'),
$this->trace('Evidence found', count($events) . ' event(s) identified' . ($confidenceFilter === 'high_medium' ? ' (low-confidence filtered out)' : '') . '.', count($events) ? 'complete' : 'warning'),
$this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text.', 'complete'),
$this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'),
$this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Verify dates against original documents.'), 'complete'),
];
return [
'tool' => 'timeline',
'language' => $language,
'what_we_found' => (string)($json['what_we_found'] ?? ''),
'events' => $events,
'evidence_trail' => $json['evidence_trail'] ?? [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [],
'next_practical_step' => (string)($json['next_practical_step'] ?? ''),
'trace' => $trace,
'trace_metadata' => [
// Note: 'chunk_count' carries the number of events here.
'chunk_count' => count($events),
'source_count' => 1,
'deployment' => $engineLabel,
'input_date_hint_count' => $inputDateHintCount,
'used_fallback_extractor' => $usedFallbackExtractor,
],
'disclaimer' => dbnToolsDisclaimer($language),
];
}
/**
 * Timeline extraction for text that exceeds the single-pass limit.
 *
 * The text is cut into overlapping chunks, timeline() runs on each chunk, and
 * the per-chunk events are merged. A chunk whose extraction throws is retried
 * with the deterministic fallback extractor and counted as a failure.
 *
 * @param string        $text               Full pasted text.
 * @param string        $language           Language for output and disclaimer.
 * @param string        $engine             Engine key; selects chunk size and label.
 * @param string        $focus              Focus key forwarded to timeline().
 * @param string        $confidenceFilter   'high_medium' drops low-confidence events.
 * @param bool          $includeRelative    When false, only 'absolute' events are kept.
 * @param bool          $includeBackground  Forwarded to timeline().
 * @param string        $userNotes          Forwarded to timeline().
 * @param callable|null $onProgress         Optional progress callback.
 * @param int           $inputDateHintCount Date-hint count of the full text, reported in metadata.
 * @return array Payload in the same shape as timeline(), plus chunking metadata.
 */
private function timelineChunked(
    string $text,
    string $language,
    string $engine,
    string $focus,
    string $confidenceFilter,
    bool $includeRelative,
    bool $includeBackground,
    string $userNotes,
    ?callable $onProgress,
    int $inputDateHintCount
): array {
    $onBedrock = $this->azure instanceof DbnBedrockGateway;
    if ($engine === 'nova_lite') {
        $engineLabel = 'nova-lite';
    } elseif ($engine === 'azure_full' || $engine === 'claude_sonnet') {
        $engineLabel = $onBedrock ? 'claude-sonnet-bedrock' : 'gpt-4o';
    } else {
        $engineLabel = $onBedrock ? 'claude-haiku-bedrock' : 'gpt-4o-mini';
    }

    $pieces = $this->timelineTextChunks($text, $this->timelineChunkSize($engine), 900);
    $chunkCount = count($pieces);
    $collected = [];
    $chunkFailures = 0;
    $usedFallbackExtractor = false;
    if ($onProgress) {
        $onProgress('Splitting timeline into ' . $chunkCount . " chunk(s)\u{2026}");
    }

    foreach ($pieces as $position => $piece) {
        $chunkNo = $position + 1;
        $body = trim((string)$piece['text']);
        // Fragments under 20 characters are not worth a model call.
        if (mb_strlen($body, 'UTF-8') < 20) {
            continue;
        }
        if ($onProgress) {
            $onProgress("Extracting timeline chunk {$chunkNo}/{$chunkCount}\u{2026}");
        }
        try {
            // No progress callback for the inner call; progress is reported per chunk here.
            $partial = $this->timeline(
                $body,
                $language,
                $engine,
                $focus,
                $confidenceFilter,
                $includeRelative,
                $includeBackground,
                $userNotes,
                null
            );
            $found = is_array($partial['events'] ?? null) ? $partial['events'] : [];
            if (!empty($partial['trace_metadata']['used_fallback_extractor'])) {
                $usedFallbackExtractor = true;
            }
        } catch (DbnToolsHttpException $e) {
            $chunkFailures++;
            // Only attempt the deterministic extractor when the chunk has date hints.
            $found = $this->timelineDateHintCount($body) > 0
                ? $this->fallbackTimelineEvents($body)
                : [];
            if ($found) {
                $usedFallbackExtractor = true;
            } elseif ($e->status >= 500) {
                error_log('timeline chunk failed: ' . $e->errorCode . ' ' . $e->getMessage());
            }
        } catch (Throwable $e) {
            $chunkFailures++;
            $found = $this->fallbackTimelineEvents($body);
            if ($found) {
                $usedFallbackExtractor = true;
            }
            error_log('timeline chunk throwable: ' . $e->getMessage());
        }
        // Tag every event with the chunk it came from before pooling.
        foreach ($found as $item) {
            if (is_array($item)) {
                $item['chunk_index'] = $chunkNo;
                $item['source_position'] = (int)$piece['start'];
                $collected[] = $item;
            }
        }
    }

    $events = $this->mergeTimelineEvents($collected);
    if ($confidenceFilter === 'high_medium') {
        $events = array_values(array_filter(
            $events,
            static fn($ev) => ($ev['confidence'] ?? 'low') !== 'low'
        ));
    }
    if (!$includeRelative) {
        $events = array_values(array_filter(
            $events,
            static fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute'
        ));
    }

    $focusLabels = [
        'deadlines' => 'legal deadlines',
        'hearings' => 'court hearings',
        'cps' => 'CPS milestones',
    ];
    $focusLabel = $focusLabels[$focus] ?? 'all events';

    // One pass over the merged events gathers ISO dates (for the range) and named actors.
    $isoDates = [];
    $actorNames = [];
    foreach ($events as $ev) {
        $date = (string)($ev['date'] ?? '');
        if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $date)) {
            $isoDates[] = $date;
        }
        $actor = (string)($ev['actor'] ?? '');
        if ($actor !== '' && $actor !== 'unknown') {
            $actorNames[] = $actor;
        }
    }
    sort($isoDates);
    $actors = array_values(array_unique($actorNames));
    $range = '';
    if ($isoDates) {
        $range = ' from ' . $isoDates[0] . ' to ' . $isoDates[count($isoDates) - 1];
    }
    $eventCount = count($events);
    $summary = $eventCount . " event(s) extracted from {$chunkCount} chunk(s){$range}.";
    if ($actors) {
        $summary .= ' Main actors: ' . implode(', ', array_slice($actors, 0, 8)) . '.';
    }

    $uncertain = [];
    if ($chunkFailures > 0) {
        $uncertain[] = "{$chunkFailures} chunk(s) needed fallback extraction or could not be fully parsed; review medium-confidence entries against the source.";
    }
    if ($usedFallbackExtractor) {
        $uncertain[] = 'Deterministic fallback extraction was used for at least one chunk.';
    }

    $nextStep = 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.';
    $trace = [
        $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Chunked timeline mode; without saving the text or output.", 'complete'),
        $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text split into local chunks.', 'complete'),
        $this->trace('Evidence found', $eventCount . " event(s) identified across {$chunkCount} chunk(s).", $eventCount ? 'complete' : 'warning'),
        $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text. Overlapping chunks were de-duplicated.', 'complete'),
        $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), $uncertain ? 'warning' : 'complete'),
        $this->trace('Next practical step', $nextStep, 'complete'),
    ];

    return [
        'tool' => 'timeline',
        'language' => $language,
        'what_we_found' => $summary,
        'events' => $events,
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; split into local timeline chunks; not stored.']],
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => $nextStep,
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => $eventCount,
            'source_count' => $chunkCount,
            'deployment' => $engineLabel,
            'input_date_hint_count' => $inputDateHintCount,
            'used_fallback_extractor' => $usedFallbackExtractor,
            'chunked_timeline' => true,
            'timeline_chunk_count' => $chunkCount,
            'chunk_failures' => $chunkFailures,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Look up the single-pass limit configured for a timeline engine.
 *
 * @param string $engine Engine key; 'nova_lite' and 'azure_mini' have dedicated limits.
 * @return int The limit for that engine, or 128000 for any other key
 *             (presumably a character count like MAX_PASTE_CHARS — confirm with the caller).
 */
private function timelineSinglePassLimit(string $engine): int
{
    $limitsByEngine = [
        'nova_lite' => 25000,
        'azure_mini' => 55000,
    ];

    return $limitsByEngine[$engine] ?? 128000;
}
/**
 * Look up the chunk size configured for a timeline engine.
 *
 * @param string $engine Engine key; 'nova_lite' and 'azure_mini' have dedicated sizes.
 * @return int The chunk size for that engine, or 30000 for any other key
 *             (presumably characters, as consumed by timelineTextChunks — confirm with the caller).
 */
private function timelineChunkSize(string $engine): int
{
    $sizesByEngine = [
        'nova_lite' => 10000,
        'azure_mini' => 16000,
    ];

    return $sizesByEngine[$engine] ?? 30000;
}
/**
 * Split text into overlapping chunks, preferring paragraph or line breaks as cut points.
 *
 * Each chunk spans at most $chunkSize characters. When a chunk would end before the end
 * of the text, the cut is moved back to the last blank line inside the window; if there
 * is none, or it falls before 55% of the chunk size, the last single newline is tried
 * instead. The moved cut is only used when it lies beyond 45% of the chunk size,
 * otherwise the chunk is cut at the hard limit.
 *
 * @param string $text      UTF-8 text to split.
 * @param int    $chunkSize Maximum chunk length in characters; values below 1 are treated as 1.
 * @param int    $overlap   Characters re-read at the start of the next chunk; negative values are treated as 0.
 * @return array<int, array{start: int, text: string}> Trimmed, non-empty chunks with their start offsets.
 */
private function timelineTextChunks(string $text, int $chunkSize, int $overlap): array
{
    // A chunk size below 1 would never advance $start (endless loop), and a negative
    // overlap would move the next start past $end and silently drop text.
    $chunkSize = max(1, $chunkSize);
    $overlap = max(0, $overlap);

    $len = mb_strlen($text, 'UTF-8');
    $chunks = [];
    $start = 0;
    while ($start < $len) {
        $targetEnd = min($len, $start + $chunkSize);
        $window = mb_substr($text, $start, $targetEnd - $start, 'UTF-8');
        $end = $targetEnd;
        if ($targetEnd < $len) {
            // Offsets returned here are relative to the window, not to the full text.
            $breakAt = mb_strrpos($window, "\n\n", 0, 'UTF-8');
            if ($breakAt === false || $breakAt < (int)($chunkSize * 0.55)) {
                $breakAt = mb_strrpos($window, "\n", 0, 'UTF-8');
            }
            if ($breakAt !== false && $breakAt > (int)($chunkSize * 0.45)) {
                $end = $start + $breakAt;
            }
        }
        $chunkText = trim(mb_substr($text, $start, max(1, $end - $start), 'UTF-8'));
        if ($chunkText !== '') {
            $chunks[] = ['start' => $start, 'text' => $chunkText];
        }
        if ($end >= $len) {
            break;
        }
        // Step back by the overlap, but always make forward progress.
        $nextStart = max(0, $end - $overlap);
        if ($nextStart <= $start) {
            $nextStart = $end;
        }
        $start = $nextStart;
    }
    return $chunks;
}
/**
 * De-duplicate timeline events and return them in chronological order.
 *
 * Events sharing a signature (see timelineEventSignature) are collapsed into one.
 * The event with the strictly higher confidence rank is kept; on a tie the earlier one
 * stays. The other event's source excerpt is appended after " / " when both excerpts are
 * non-empty, differ, and the kept excerpt is shorter than 260 characters.
 *
 * Sorting is by ISO date (YYYY-MM-DD); events with any other date form sort last.
 * Ties are broken by the time string.
 *
 * @param array $events Raw events; non-array entries are ignored.
 * @return array Merged events as a list.
 */
private function mergeTimelineEvents(array $events): array
{
    $bySignature = [];
    foreach ($events as $candidate) {
        if (!is_array($candidate)) {
            continue;
        }
        $signature = $this->timelineEventSignature($candidate);
        if (!isset($bySignature[$signature])) {
            $bySignature[$signature] = $candidate;
            continue;
        }

        $kept = $bySignature[$signature];
        $candidateRank = $this->timelineConfidenceRank((string)($candidate['confidence'] ?? 'medium'));
        $keptRank = $this->timelineConfidenceRank((string)($kept['confidence'] ?? 'medium'));
        $candidateWins = $candidateRank > $keptRank;

        $winner = $candidateWins ? $candidate : $kept;
        $loser = $candidateWins ? $kept : $candidate;
        $bySignature[$signature] = $winner;

        $primaryExcerpt = (string)($winner['source_excerpt'] ?? '');
        $extraExcerpt = (string)($loser['source_excerpt'] ?? '');
        $shouldAppend = $extraExcerpt !== ''
            && $primaryExcerpt !== ''
            && $extraExcerpt !== $primaryExcerpt
            && mb_strlen($primaryExcerpt, 'UTF-8') < 260;
        if ($shouldAppend) {
            $bySignature[$signature]['source_excerpt'] = $primaryExcerpt . ' / ' . $extraExcerpt;
        }
    }

    $ordered = array_values($bySignature);
    // Non-ISO dates get a sentinel that sorts after every real date.
    $sortableDate = static function (array $event): string {
        $date = (string)($event['date'] ?? '');
        return preg_match('/^\d{4}-\d{2}-\d{2}$/', $date) ? $date : '9999-99-99';
    };
    usort($ordered, static function (array $a, array $b) use ($sortableDate): int {
        $byDate = strcmp($sortableDate($a), $sortableDate($b));
        return $byDate !== 0
            ? $byDate
            : strcmp((string)($a['time'] ?? ''), (string)($b['time'] ?? ''));
    });

    return $ordered;
}
/**
 * Build the de-duplication key for a timeline event.
 *
 * The key joins the lower-cased, trimmed date, time and actor (actor defaults to
 * "unknown") with the first 96 characters of the event text. The event text is
 * lower-cased and every run of non-letter, non-digit characters is collapsed to a
 * single space.
 *
 * @param array $event Event record; the keys date, time, actor and event are read.
 * @return string Signature in the form "date|time|actor|body".
 */
private function timelineEventSignature(array $event): string
{
    $normalise = static fn (mixed $value): string => mb_strtolower(trim((string)$value), 'UTF-8');

    $body = $normalise($event['event'] ?? '');
    $body = (string)preg_replace('/[^\p{L}\p{N}]+/u', ' ', $body);
    $body = trim((string)preg_replace('/\s+/u', ' ', $body));

    return implode('|', [
        $normalise($event['date'] ?? ''),
        $normalise($event['time'] ?? ''),
        $normalise($event['actor'] ?? 'unknown'),
        mb_substr($body, 0, 96, 'UTF-8'),
    ]);
}
/**
 * Map a confidence label to a comparable rank.
 *
 * @param string $confidence Confidence label (compared case-sensitively).
 * @return int 3 for 'high', 2 for 'medium', 1 for anything else.
 */
private function timelineConfidenceRank(string $confidence): int
{
    if ($confidence === 'high') {
        return 3;
    }
    if ($confidence === 'medium') {
        return 2;
    }

    return 1;
}
/**
 * Count date-like fragments in the text.
 *
 * Two kinds are counted: numeric dates such as "12.03." or "12.03.2020", and a day
 * number followed by a Norwegian month name such as "5. mai".
 *
 * @param string $text Text to scan.
 * @return int Total number of matches across both patterns.
 */
private function timelineDateHintCount(string $text): int
{
    $hintPatterns = [
        '/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/u',
        '/\b\d{1,2}\.\s*(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)\b/iu',
    ];

    $total = 0;
    foreach ($hintPatterns as $pattern) {
        // preg_match_all returns the number of full matches (false on error counts as 0).
        $total += (int)preg_match_all($pattern, $text);
    }

    return $total;
}
/**
 * Extract dated events from text with regular expressions only.
 *
 * Each line is scanned for numeric dates of the form D.M. or D.M.YY(YY). When a date
 * carries no year, the most recently seen year (from an earlier date or any four-digit
 * 19xx/20xx number on the line) is used; with no year known, the date is reported as
 * "DD.MM. (year unknown)". A time is taken from the first "kl. HH:MM" / "HH:MM" on the
 * line. At most 80 events are returned, sorted by ISO date with undated entries last.
 *
 * Two-digit years are read as 20xx unless that would land more than one year ahead of
 * the current year, in which case they are read as 19xx (so "12.03.98" is 1998, not 2098).
 *
 * @param string $text Source text.
 * @return array<int, array<string, mixed>> Events with keys date, end_date, time, date_type,
 *                                          actor, event, source_excerpt and confidence.
 */
private function fallbackTimelineEvents(string $text): array
{
    $lines = preg_split('/\R/u', $text) ?: [];
    $events = [];
    $lastYear = null;
    // Latest year a two-digit year may expand to before it is pushed back a century.
    $latestPlausibleYear = (int)date('Y') + 1;
    foreach ($lines as $line) {
        if (count($events) >= 80) {
            break;
        }
        $line = trim((string)preg_replace('/\s+/u', ' ', $line));
        if ($line === '') {
            continue;
        }
        // Remember any explicit year on the line so year-less dates can inherit it.
        if (preg_match('/\b(20\d{2}|19\d{2})\b/u', $line, $ym)) {
            $lastYear = (int)$ym[1];
        }
        if (!preg_match_all('/(?<!\d)(\d{1,2})\.(\d{1,2})\.(?:(\d{2,4}))?(?!\d)/u', $line, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
            continue;
        }
        foreach ($matches as $m) {
            if (count($events) >= 80) {
                break 2;
            }
            $day = (int)$m[1][0];
            $month = (int)$m[2][0];
            if ($day < 1 || $day > 31 || $month < 1 || $month > 12) {
                continue;
            }
            // The year group is absent from the match array when the date has no year.
            $yearRaw = $m[3][0] ?? '';
            $year = null;
            if ($yearRaw !== '') {
                if (strlen($yearRaw) === 2) {
                    $year = 2000 + (int)$yearRaw;
                    if ($year > $latestPlausibleYear) {
                        $year -= 100;
                    }
                } else {
                    $year = (int)$yearRaw;
                }
                $lastYear = $year;
            } elseif ($lastYear !== null) {
                $year = $lastYear;
            }
            $date = $year !== null
                ? sprintf('%04d-%02d-%02d', $year, $month, $day)
                : sprintf('%02d.%02d. (year unknown)', $day, $month);
            $time = null;
            if (preg_match('/\bkl\.?\s*(\d{1,2})[:.](\d{2})\b|\b(\d{1,2}):(\d{2})\b/u', $line, $tm)) {
                // Groups 1/2 are empty strings when the second alternative matched.
                $hour = (int)($tm[1] !== '' ? $tm[1] : $tm[3]);
                $min = (int)($tm[2] !== '' ? $tm[2] : $tm[4]);
                if ($hour >= 0 && $hour <= 23 && $min >= 0 && $min <= 59) {
                    $time = sprintf('%02d:%02d', $hour, $min);
                }
            }
            // Strip list markers, then a leading copy of the date (and optional time) from the event text.
            // The casts keep a failed preg_replace (null) from reaching trim() under strict types.
            $eventText = trim((string)preg_replace('/^\s*[-*#\s]*/u', '', $line));
            $eventText = trim((string)preg_replace('/^' . preg_quote($m[0][0], '/') . '\s*(?:kl\.?\s*\d{1,2}[:.]\d{2})?\s*[:\-–—]?\s*/u', '', $eventText));
            if ($eventText === '') {
                $eventText = 'Dated event found in uploaded text.';
            }
            $events[] = [
                'date' => $date,
                'end_date' => null,
                'time' => $time,
                'date_type' => $year !== null ? 'absolute' : 'relative',
                'actor' => $this->fallbackTimelineActor($line),
                'event' => mb_substr($eventText, 0, 240, 'UTF-8'),
                'source_excerpt' => mb_substr($line, 0, 300, 'UTF-8'),
                'confidence' => 'medium',
            ];
        }
    }
    usort($events, static function (array $a, array $b): int {
        $ad = (string)($a['date'] ?? '');
        $bd = (string)($b['date'] ?? '');
        // Non-ISO dates get a sentinel so they sort after every real date.
        $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99';
        $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99';
        return strcmp($ai, $bi);
    });
    return $events;
}
/**
 * Guess the acting institution mentioned on a line.
 *
 * Patterns are tried in declaration order and the first match wins. Abbreviations
 * (BV, NAV, BUP, PPT) must stand alone as whole words so that they do not match
 * inside longer words.
 *
 * @param string $line A single line of source text.
 * @return string Canonical actor name, or 'unknown' when nothing matches.
 */
private function fallbackTimelineActor(string $line): string
{
    $actors = [
        // "bv" is anchored on both sides; previously it matched the tail of any word ending in "bv".
        '/barnevern(?:s?tjenesten)?|\bbv\b/iu' => 'Barnevernstjenesten',
        '/fylkesnemnda/iu' => 'Fylkesnemnda',
        '/statsforvalter(?:en)?/iu' => 'Statsforvalteren',
        '/tingrett/iu' => 'Tingrett',
        '/lagmannsrett/iu' => 'Lagmannsrett',
        // NOTE(review): both alternatives look identical here; kept as-is in case they differ at byte level.
        '/høyesterett|høyesterett/iu' => 'Høyesterett',
        '/\bnav\b/iu' => 'NAV',
        '/\bbup\b/iu' => 'BUP',
        '/\bppt\b/iu' => 'PPT',
    ];
    foreach ($actors as $pattern => $actor) {
        if (preg_match($pattern, $line)) {
            return $actor;
        }
    }
    return 'unknown';
}
/**
 * Redact personal identifiers from pasted text and report what was replaced.
 *
 * Pass 1 applies the deterministic pattern pack for the region. Pass 2 asks
 * llmRedactionPass() for further entities and substitutes the tags it proposes,
 * first on word boundaries and, failing that, as a plain substring. Finally the
 * requested output format is applied.
 *
 * @param string $text          Pasted text; validated by requirePasteText().
 * @param string $mode          'strict', or anything else for 'standard'.
 * @param string $region        'nordic', 'european', 'echr' or 'global'; other values fall back to 'nordic'.
 * @param string $language      Language code forwarded to the LLM pass.
 * @param array  $aliases       Alias overrides forwarded to the LLM pass.
 * @param string $engine        'azure_mini', 'azure_full', 'gpu' or 'regex'; other values fall back to 'azure_mini'.
 * @param string $outputFormat  'contextual', 'generic' or 'pseudonym'; other values fall back to 'contextual'.
 * @param bool   $keepOfficials Forwarded to the LLM pass; permits "[ROLE: Name]" tags.
 * @param array  $exemptNames   Names forwarded to the LLM pass as exempt from redaction.
 * @param array  $redactTypes   Flags keyed names/orgs/places/dob; each is on unless set to false.
 * @return array Result payload including redacted_text, entity_counts, redaction_map and trace.
 */
public function redact(
    string $text,
    string $mode = 'standard',
    string $region = 'nordic',
    string $language = 'en',
    array $aliases = [],
    string $engine = 'azure_mini',
    string $outputFormat = 'contextual',
    bool $keepOfficials = false,
    array $exemptNames = [],
    array $redactTypes = []
): array {
    $text = $this->requirePasteText($text);
    $mode = $mode === 'strict' ? 'strict' : 'standard';
    $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'regex'], true) ? $engine : 'azure_mini';
    $outputFormat = in_array($outputFormat, ['contextual', 'generic', 'pseudonym'], true) ? $outputFormat : 'contextual';

    // Normalise entity-type flags (all on by default)
    $doNames = ($redactTypes['names'] ?? true) !== false;
    $doOrgs = ($redactTypes['orgs'] ?? true) !== false;
    $doPlaces = ($redactTypes['places'] ?? true) !== false;
    $doDob = ($redactTypes['dob'] ?? true) !== false;

    // Pass 1 — deterministic regex
    [$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region);
    $pass1Total = array_sum($pass1Counts);
    $pass1Hits = array_filter($pass1Counts, fn($v): bool => $v > 0);
    $pass1Detail = $pass1Total
        ? implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass1Hits), $pass1Hits))
        : 'none detected';

    $engineLabel = match ($engine) {
        'azure_full' => 'Azure gpt-4o',
        'gpu' => 'GPU (cuttlefish)',
        'regex' => 'Regex only',
        default => 'Azure gpt-4o-mini',
    };

    $trace = [
        $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}. Engine: {$engineLabel}.", 'complete'),
        $this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'),
    ];

    // Pass 2 — LLM semantic scan
    $finalRedacted = $preRedacted;
    $pass2Counts = [];
    $llmDeployment = null;
    $redactionMap = [];

    $llmResult = $this->llmRedactionPass(
        $preRedacted, $language, $aliases, $engine,
        $keepOfficials, $exemptNames,
        $doNames, $doOrgs, $doPlaces, $doDob
    );

    if (!empty($llmResult['skipped'])) {
        $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'not configured') . '.', 'warning');
    } elseif (!empty($llmResult['error'])) {
        $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning');
    } else {
        $entities = $llmResult['entities'] ?? [];
        $llmDeployment = $llmResult['deployment'] ?? null;
        $applied = 0;
        foreach ($entities as $entity) {
            if (!is_array($entity)) {
                continue;
            }
            $original = (string)($entity['original'] ?? '');
            $type = (string)($entity['type'] ?? 'other');
            $tag = (string)($entity['tag'] ?? '[IDENTIFIER]');
            // Skip empty originals and anything that is already a placeholder tag.
            if ($original === '' || str_starts_with($original, '[')) {
                continue;
            }
            // Accept "[TAG]" or "[ROLE: Name]"; anything else is replaced by a generic tag.
            if (!preg_match('/^\[[A-Za-z0-9_\- ]+(?::\s*[^\]]+)?\]$/', $tag)) {
                $tag = '[IDENTIFIER]';
            }
            // Try word-boundary match first to avoid partial-word substitutions (e.g. "Per" inside "Persson")
            $escaped = preg_quote($original, '/');
            $replaced = preg_replace('/\b' . $escaped . '\b/u', $tag, $finalRedacted);
            if ($replaced === null || $replaced === $finalRedacted) {
                // Fallback for names adjacent to punctuation or non-word characters
                if (!str_contains($finalRedacted, $original)) {
                    continue;
                }
                $replaced = str_replace($original, $tag, $finalRedacted);
            }
            $finalRedacted = $replaced;
            // Counts are per applied entity, not per occurrence; occurrences are added below.
            $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1;
            $applied++;
            if (!isset($redactionMap[$tag])) {
                $redactionMap[$tag] = ['originals' => [], 'type' => $type];
            }
            if (!in_array($original, $redactionMap[$tag]['originals'], true)) {
                $redactionMap[$tag]['originals'][] = $original;
            }
        }
        // Add occurrence counts by scanning the final text
        foreach ($redactionMap as $tag => &$entry) {
            $entry['occurrences'] = substr_count($finalRedacted, $tag);
        }
        unset($entry);

        $pass2Detail = $applied > 0
            ? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts))
            : 'no additional entities found';
        $trace[] = $this->trace('Pass 2 — LLM semantic scan', "{$engineLabel} reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete');
    }

    // Combine the counts of both passes. They are summed per type: a plain array_merge
    // would let a pass-2 count overwrite the pass-1 count of the same type.
    $allCounts = $pass1Counts;
    foreach ($pass2Counts as $type => $count) {
        $allCounts[$type] = ($allCounts[$type] ?? 0) + $count;
    }

    // Apply output format post-processing
    if ($outputFormat === 'generic') {
        $finalRedacted = $this->applyGenericTags($finalRedacted);
    } elseif ($outputFormat === 'pseudonym') {
        $finalRedacted = $this->applyPseudonymization($finalRedacted, $allCounts);
    }
    $categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0));

    $trace[] = $this->trace('Output format', match ($outputFormat) {
        'generic' => 'All identifiers normalised to generic tags ([PERSON], [ORG], etc.).',
        'pseudonym' => 'Identifiers replaced with plausible pseudonymous values.',
        default => 'Contextual role tags used (e.g. [FATHER], [JUDGE: Name]).',
    }, 'complete');
    $trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning');
    $trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete');

    return [
        'tool' => 'redact',
        'mode' => $mode,
        'region' => $region,
        'engine_used' => $engineLabel,
        'output_format' => $outputFormat,
        'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment || $engine === 'gpu' ? " and {$engineLabel} semantic scan" : '') . '.',
        'redacted_text' => $finalRedacted,
        'detected_entity_categories' => $categories,
        'entity_counts' => $allCounts,
        'redaction_map' => $redactionMap,
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
        'what_remains_uncertain' => ['Human review is still recommended for contextual identification.'],
        'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.',
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => 1,
            'source_count' => 1,
            'deployment' => $llmDeployment ?? $engineLabel,
        ],
        'disclaimer' => 'Privacy support tool. Review before disclosure.',
    ];
}
/**
 * Fetch the 'family-legal' package and verify the client may use it.
 *
 * Calls dbnToolsAbort() with HTTP status 503 when the package is missing or inactive
 * ('package_unavailable'), or when the client has no active subscription to it
 * ('subscription_missing').
 *
 * @param int $clientId Client whose subscription is checked.
 * @return array The package record.
 */
private function requireFamilyPackage(int $clientId): array
{
    $package = dbnToolsFetchPackage('family-legal');

    $isUsable = $package && !empty($package['is_active']);
    if (!$isUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $isSubscribed = dbnToolsHasActiveSubscription($clientId, (int)$package['id']);
    if (!$isSubscribed) {
        dbnToolsAbort(dbnToolsProductName() . ' does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $package;
}
/**
 * Choose the synthesis gateway and model name for a persona.
 *
 * - When the persona pins a model (non-empty 'model'), a DbnBedrockGateway is created
 *   with that name as its chat_model_name and the pinned name is returned with it.
 *   (Per the original note this route goes via LiteLLM, so any model registered on
 *   that gateway is reachable — not verifiable from this method alone.)
 * - When no model is pinned, or creating that gateway throws (the error is logged),
 *   the service's default gateway is returned with 'gpt-4o' for the 'azure_full'
 *   engine and 'gpt-4o-mini' otherwise.
 *
 * @param array  $persona Persona record; only 'model' is read.
 * @param string $engine  Engine key used to pick the default model.
 * @return array{0: DbnAzureOpenAiGateway|DbnBedrockGateway, 1: string}
 */
private function personaGateway(array $persona, string $engine): array
{
    $pinnedModel = trim((string)($persona['model'] ?? ''));

    if ($pinnedModel !== '') {
        try {
            $pinnedGateway = new DbnBedrockGateway(['chat_model_name' => $pinnedModel]);
            return [$pinnedGateway, $pinnedModel];
        } catch (Throwable $e) {
            error_log('[dbn-persona] gateway init failed for model ' . $pinnedModel . ': ' . $e->getMessage());
        }
    }

    $defaultModel = $engine === 'azure_full' ? 'gpt-4o' : 'gpt-4o-mini';

    return [$this->azure, $defaultModel];
}
/**
 * Send a prompt to the chat model in JSON mode and return the decoded object.
 *
 * With a persona, the gateway and model come from personaGateway() (using the
 * 'azure_mini' engine for the default model) and the persona's system_prompt is folded
 * into the system prompt. Without one (e.g. pasted-text tools), the service's default
 * gateway is used with the neutral base prompt.
 *
 * Calls dbnToolsAbort() with status 502 ('invalid_json') when the reply does not decode
 * to a non-empty JSON object.
 *
 * @param string     $prompt    User message content.
 * @param string     $language  Language code for the system prompt.
 * @param int        $maxTokens Token limit passed to the gateway.
 * @param array|null $persona   Optional persona record.
 * @return array Decoded JSON object.
 */
private function runJsonTool(string $prompt, string $language, int $maxTokens, ?array $persona = null): array
{
    $personaPrompt = $persona['system_prompt'] ?? null;

    $gateway = $this->azure;
    if ($persona !== null) {
        [$personaGateway, $model] = $this->personaGateway($persona, 'azure_mini');
        $gateway = $personaGateway->withDeployment($model);
    }

    $messages = [
        ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language, $personaPrompt)],
        ['role' => 'user', 'content' => $prompt],
    ];
    $options = [
        'json' => true,
        'temperature' => 0.1,
        'max_tokens' => $maxTokens,
    ];

    $decoded = $gateway->decodeJsonObject($gateway->chatText($messages, $options));
    if (!$decoded) {
        dbnToolsAbort('The model did not return valid structured JSON.', 502, 'invalid_json');
    }

    return $decoded;
}
/**
 * Build the system prompt used for JSON-mode legal tools.
 *
 * The base prompt is domain-neutral. A non-empty persona prompt (family, immigration,
 * labour, …) is inserted on its own line directly before the guardrails, so that
 * non-family tracks are not framed as child-welfare.
 *
 * @param string      $language      Language code; resolved to a name via dbnToolsLanguageName().
 * @param string|null $personaPrompt Optional persona framing; surrounding whitespace is trimmed.
 * @return string The complete system prompt.
 */
private function legalJsonSystemPrompt(string $language, ?string $personaPrompt = null): string
{
    $locale = dbnToolsLanguageName($language);
    $product = dbnToolsProductName();

    $framing = is_string($personaPrompt) ? trim($personaPrompt) : '';
    $framingBlock = $framing === '' ? '' : $framing . "\n";

    return <<<PROMPT
    You are {$product} Tools — a source-grounded Norwegian legal preparation assistant covering all areas of Norwegian law.
    {$framingBlock}Legal guardrails:
    - Answer only from provided source excerpts or pasted text.
    - Treat your role as legal information and issue-spotting, not final legal advice.
    - Never invent statutes, paragraph numbers, case names, citations, parties, dates, or sources.
    - If evidence is insufficient, say so plainly.
    - Respond in {$locale}.
    - Return valid JSON only. No markdown fences.
    PROMPT;
}
/**
 * Render search hits as a numbered plain-text evidence block for the prompt.
 *
 * Each hit contributes a title line, an optional section line, a corpus/package line
 * and an excerpt line. Numbering is the hit's array key plus one.
 *
 * @param array $hits Hit records (title, section, package_or_corpus, excerpt).
 * @return string Newline-joined lines; empty string when there are no hits.
 */
private function buildEvidenceContext(array $hits): string
{
    $rendered = [];
    foreach ($hits as $position => $hit) {
        $number = $position + 1;
        $entry = ["[{$number}] Title: " . ($hit['title'] ?? 'Untitled')];
        if (!empty($hit['section'])) {
            $entry[] = 'Section: ' . $hit['section'];
        }
        $entry[] = 'Corpus/package: ' . ($hit['package_or_corpus'] ?? 'unknown');
        $entry[] = 'Excerpt: ' . ($hit['excerpt'] ?? '');
        array_push($rendered, ...$entry);
    }

    return implode("\n", $rendered);
}
/**
 * Normalise a model-supplied evidence trail, falling back to the search hits.
 *
 * A non-empty array trail is returned with every non-array entry removed and keys
 * re-indexed. Otherwise a trail is synthesised from the first four hits, using each
 * hit's title as both title and citation and a 180-character excerpt as the rationale.
 *
 * @param mixed $trail Trail as returned by the model.
 * @param array $hits  Search hits; each must provide 'title' and 'excerpt' for the fallback.
 * @return array Evidence trail entries.
 */
private function normalizeEvidenceTrail(mixed $trail, array $hits): array
{
    if (is_array($trail) && $trail) {
        return array_values(array_filter($trail, 'is_array'));
    }

    $toTrailEntry = fn(array $hit): array => [
        'title' => $hit['title'],
        'citation' => $hit['title'],
        'why_it_matters' => dbnToolsExcerpt($hit['excerpt'], 180),
    ];

    return array_map($toTrailEntry, array_slice($hits, 0, 4));
}
/**
 * Convert a retrieved chunk row into the source record returned to clients.
 *
 * The excerpt is the document summary when one is given, otherwise the first 620
 * characters of the chunk content (via dbnToolsExcerpt); chunk_text always carries the
 * latter. Optional metadata and temporal annotations are copied through as null when
 * absent from the chunk.
 *
 * @param array       $chunk      Chunk row.
 * @param string|null $docSummary Optional document-level summary to use as the excerpt.
 * @return array Source record.
 */
private function sourceFromChunk(array $chunk, ?string $docSummary = null): array
{
    $rawExcerpt = dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620);
    $intOrNull = static fn (string $key): ?int => isset($chunk[$key]) ? (int)$chunk[$key] : null;

    $source = [
        'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
        'excerpt' => $docSummary ?? $rawExcerpt,
        'chunk_text' => $rawExcerpt,
        'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? dbnToolsProductName()),
        'score' => isset($chunk['similarity']) ? round((float)$chunk['similarity'], 4) : null,
        'document_id' => $intOrNull('document_id'),
        'chunk_id' => $intOrNull('id'),
        'section' => $chunk['section_title'] ?? null,
    ];

    // Fields copied under the same name. The temporal ones are present when
    // temporal_mode = 'legal_conservative'.
    $passthrough = [
        'authority_type',
        'jurisdiction',
        'temporal_state',
        'temporal_kind',
        'temporal_reason',
        'currentness_warning',
        'valid_from',
        'valid_until',
        'is_current_version',
    ];
    foreach ($passthrough as $field) {
        $source[$field] = $chunk[$field] ?? null;
    }

    return $source;
}
/**
 * Load stored summaries for a set of documents.
 *
 * Best-effort: any failure (connection, query, missing table) is logged and yields an
 * empty result instead of propagating.
 *
 * @param array $docIds Document ids to look up.
 * @return array Map of document_id => summary, limited to rows with a non-empty summary.
 */
private function fetchDocSummaries(array $docIds): array
{
    if (!$docIds) {
        return [];
    }
    try {
        $db = dbnToolsRagDb();
        // One positional placeholder per id keeps the IN list fully parameterised.
        $placeholders = implode(',', array_fill(0, count($docIds), '?'));
        $stmt = $db->prepare(
            "SELECT document_id, summary FROM doc_summaries
             WHERE document_id IN ({$placeholders}) AND summary != ''"
        );
        $stmt->execute(array_values($docIds));
        return array_column($stmt->fetchAll(PDO::FETCH_ASSOC), 'summary', 'document_id');
    } catch (Throwable $e) {
        // Summaries are optional, so degrade to "none" — but leave a trace for operators.
        error_log('DBN tools doc summary lookup failed: ' . $e->getMessage());
        return [];
    }
}
/**
 * Grade how well the hits support a citation.
 *
 * @param array $hits Search hits; each may carry a numeric 'score'.
 * @return string 'low' when there are no hits, 'high' when there are at least three hits
 *                and the best numeric score is at least 0.35, otherwise 'medium'.
 */
private function citationConfidence(array $hits): string
{
    if (!$hits) {
        return 'low';
    }

    $numericScores = [];
    foreach ($hits as $hit) {
        $score = $hit['score'] ?? null;
        if (is_numeric($score)) {
            $numericScores[] = $score;
        }
    }
    $topScore = $numericScores ? max($numericScores) : 0;

    // With at least one hit the only remaining outcomes are 'high' and 'medium'.
    return (count($hits) >= 3 && $topScore >= 0.35) ? 'high' : 'medium';
}
/**
 * Keyword search across the private and the shared corpus.
 *
 * Each source is queried independently; a failure in one is logged and does not
 * prevent the other from contributing. Private results come first. The shared search
 * is asked for whatever is left of the limit, but always for at least one row.
 *
 * @param int    $clientId Client whose private corpus is searched.
 * @param array  $package  Package record scoping the shared search.
 * @param string $query    User query.
 * @param int    $limit    Maximum number of rows to return.
 * @return array Combined rows, at most $limit.
 */
private function fallbackKeywordSearch(int $clientId, array $package, string $query, int $limit): array
{
    $privateRows = [];
    try {
        $privateRows = array_merge([], $this->fallbackPrivateSearch($clientId, $query, $limit));
    } catch (Throwable $e) {
        error_log('DBN tools private fallback failed: ' . $e->getMessage());
    }

    $combined = $privateRows;
    try {
        $stillNeeded = max(1, $limit - count($privateRows));
        $combined = array_merge($privateRows, $this->fallbackSharedSearch($package, $query, $stillNeeded));
    } catch (Throwable $e) {
        error_log('DBN tools shared fallback failed: ' . $e->getMessage());
    }

    return array_slice($combined, 0, $limit);
}
/**
 * LIKE-based keyword search over the client's private documents.
 *
 * A row matches when any search term occurs in the chunk content or in the document
 * title. Only documents with status "ready" that belong to the client are considered.
 * Every returned row is tagged with a fixed similarity of 0.25 and marked as coming
 * from the private corpus.
 *
 * @param int    $clientId Owner of the private corpus.
 * @param string $query    User query; tokenised by searchTerms().
 * @param int    $limit    Maximum number of rows.
 * @return array Matching chunk rows; empty when the query yields no search terms.
 */
private function fallbackPrivateSearch(int $clientId, string $query, int $limit): array
{
    $db = dbnToolsDb();
    $terms = $this->searchTerms($query);
    if (!$terms) {
        return [];
    }
    $clauses = [];
    $params = [':client_id' => $clientId];
    foreach ($terms as $i => $term) {
        // Each use gets its own placeholder: PDO only allows a named marker to appear
        // more than once in a statement when emulated prepares are enabled.
        $contentKey = ':content_term' . $i;
        $titleKey = ':title_term' . $i;
        $clauses[] = "(cc.content LIKE {$contentKey} OR cd.title LIKE {$titleKey})";
        $like = '%' . $term . '%';
        $params[$contentKey] = $like;
        $params[$titleKey] = $like;
    }
    $sql = 'SELECT cc.id, cc.document_id, cc.content, cd.title AS document_title, cd.category
            FROM client_chunks cc
            JOIN client_documents cd ON cc.document_id = cd.id
            WHERE cc.client_id = :client_id AND cd.status = "ready" AND (' . implode(' OR ', $clauses) . ')
            LIMIT ' . (int)$limit;
    $stmt = $db->prepare($sql);
    $stmt->execute($params);
    $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
    foreach ($rows as &$row) {
        $row['similarity'] = 0.25;
        $row['source_name'] = dbnToolsProductName() . ' private corpus';
        $row['source_type'] = 'private';
    }
    unset($row); // drop the dangling reference to the last row
    return $rows;
}
/**
 * LIKE-based keyword search over the shared corpus, scoped by the package.
 *
 * Only documents with status "ready" are considered. The package can narrow the search
 * by corpus_id and by JSON-encoded category_filter / language_filter lists. A row
 * matches when any search term occurs in the chunk content or the document title.
 * Every returned row is tagged with a fixed similarity of 0.2, the package name
 * ('family-legal' when unnamed) and source type 'package'.
 *
 * @param array  $package Package record.
 * @param string $query   User query; tokenised by searchTerms().
 * @param int    $limit   Maximum number of rows.
 * @return array Matching chunk rows; empty when the query yields no search terms.
 */
private function fallbackSharedSearch(array $package, string $query, int $limit): array
{
    $ragDb = dbnToolsRagDb();
    $terms = $this->searchTerms($query);
    if (!$terms) {
        return [];
    }

    // "?,?,…" with one marker per value, for IN (...) lists.
    $markers = static fn (array $values): string => implode(',', array_fill(0, count($values), '?'));

    $conditions = ['d.status = "ready"'];
    $bindings = [];

    if (!empty($package['corpus_id'])) {
        $conditions[] = 'd.corpus_id = ?';
        $bindings[] = (int)$package['corpus_id'];
    }

    $categories = json_decode((string)($package['category_filter'] ?? '[]'), true) ?: [];
    if ($categories) {
        $conditions[] = 'd.category IN (' . $markers($categories) . ')';
        $bindings = array_merge($bindings, $categories);
    }

    $languages = json_decode((string)($package['language_filter'] ?? '[]'), true) ?: [];
    if ($languages) {
        $conditions[] = 'd.language IN (' . $markers($languages) . ')';
        $bindings = array_merge($bindings, $languages);
    }

    $likeClauses = [];
    foreach ($terms as $term) {
        $likeClauses[] = '(c.content LIKE ? OR d.title LIKE ?)';
        $needle = '%' . $term . '%';
        $bindings[] = $needle;
        $bindings[] = $needle;
    }
    $conditions[] = '(' . implode(' OR ', $likeClauses) . ')';

    $sql = 'SELECT c.id, c.document_id, c.content, c.section_title, d.title AS document_title,
            d.category, d.language
            FROM chunks c
            JOIN documents d ON c.document_id = d.id
            WHERE ' . implode(' AND ', $conditions) . '
            LIMIT ' . (int)$limit;
    $stmt = $ragDb->prepare($sql);
    $stmt->execute($bindings);

    $sourceName = (string)($package['name'] ?? 'family-legal');
    $tagRow = static function ($row) use ($sourceName) {
        $row['similarity'] = 0.2;
        $row['source_name'] = $sourceName;
        $row['source_type'] = 'package';
        return $row;
    };

    return array_map($tagRow, $stmt->fetchAll(PDO::FETCH_ASSOC));
}
/**
 * Turn a free-text query into at most eight LIKE search terms.
 *
 * Citation atoms come first: identifiers such as "§ 4-12", "Art. 8(2)" or
 * "Rt. 2020 s. 1234" tokenise into fragments shorter than the three-character floor
 * and would otherwise be dropped, leaving a citation query without its only meaningful
 * term (EDI Vol.1 #2, §2.1). They are therefore extracted verbatim and placed ahead of
 * the word tokens.
 *
 * Word tokens are the lower-cased query split on anything that is not a letter or
 * digit, minus tokens shorter than three characters and a small English/Norwegian
 * stop-word list.
 *
 * @param string $query User query.
 * @return array Unique terms, citations first, capped at eight.
 */
private function searchTerms(string $query): array
{
    $citations = $this->extractCitationAtoms($query);

    $stopWords = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og'];
    $tokens = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: [];
    $words = array_filter(
        $tokens,
        static fn ($token): bool => mb_strlen($token, 'UTF-8') >= 3 && !in_array($token, $stopWords, true)
    );

    // Citation atoms are authoritative — prepend, keep verbatim, dedupe.
    $unique = array_values(array_unique(array_merge($citations, $words)));

    return array_slice($unique, 0, 8);
}
/**
 * Instance-level entry point for citation extraction; delegates to citationAtoms().
 *
 * Returns the exact legal-identifier substrings of the query that must survive
 * tokenisation, each kept as a whole LIKE term. For § sections the spaced and unspaced
 * variants are included so that "§4-12" matches stored "§ 4-12" and vice versa.
 *
 * @return string[]
 */
private function extractCitationAtoms(string $query): array
{
    $atoms = self::citationAtoms($query);

    return $atoms;
}
/**
 * Extract legal citation identifiers from a query, verbatim.
 *
 * Static and reusable; per the original note it is also used by api/corpus-search.php
 * to route identifier queries around the FULLTEXT tokenizer. Results are unique and
 * keep first-seen order. Every § citation additionally yields a single-space and a
 * no-space variant.
 *
 * @return string[]
 */
public static function citationAtoms(string $query): array
{
    $citationPatterns = [
        '/§\s*\d+(?:-\d+)?[a-z]?/u',                       // § 4-12, § 1a
        '/\bArt(?:ikkel|icle|\.)?\s*\d+(?:\(\d+\))?/iu',   // Art. 8, Article 3, Art. 8(2)
        '/\b3\d{4}[A-Z]\d{4}\b/',                          // EU CELEX: 32016R0679
        '/\bRt[\.\s]*\d{4}[\.\s]*s[\.\s]*\d+/u',           // Rt. 2020 s. 1234
        '/\bHR-\d{4}-\d+(?:-[A-Z])?/u',                    // HR-2020-1789-A
    ];

    // Used as an ordered set: keys are the atoms.
    $seen = [];
    foreach ($citationPatterns as $regex) {
        $found = [];
        if (!preg_match_all($regex, $query, $found)) {
            continue;
        }
        foreach ($found[0] as $raw) {
            $atom = trim((string)$raw);
            if ($atom === '') {
                continue;
            }
            $variants = [$atom];
            if (str_contains($atom, '§')) {
                $variants[] = preg_replace('/§\s*/u', '§ ', $atom); // force single space
                $variants[] = preg_replace('/§\s*/u', '§', $atom);  // no space
            }
            foreach ($variants as $variant) {
                $seen[$variant] = true;
            }
        }
    }

    return array_keys($seen);
}
/**
 * Validate pasted text and return it trimmed.
 *
 * Calls dbnToolsAbort() with status 422 when the trimmed text is shorter than 20
 * characters ('text_too_short') or longer than the limit ('text_too_long').
 *
 * @param string   $text     Raw pasted text.
 * @param int|null $maxChars Maximum length in characters; defaults to MAX_PASTE_CHARS.
 * @return string The trimmed text.
 */
private function requirePasteText(string $text, ?int $maxChars = null): string
{
    $trimmed = trim($text);

    if (mb_strlen($trimmed, 'UTF-8') < 20) {
        dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short');
    }

    $limit = $maxChars ?? self::MAX_PASTE_CHARS;
    if (mb_strlen($trimmed, 'UTF-8') > $limit) {
        dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long');
    }

    return $trimmed;
}
/**
 * Pass-1 redaction: replace mechanically recognisable identifiers with placeholder tags.
 *
 * Steps, in order:
 *  1. every pattern of the region's pattern pack;
 *  2. names following a role label ("Barn: X", "Mother: X", …) become "<label>: [PERSON]";
 *  3. child-identifier phrases ("barnet heter X", "child named X", "datter X") become
 *     [CHILD_IDENTIFIER];
 *  4. in strict mode only, any two consecutive capitalised words become [PERSON].
 *
 * A pattern that fails to run leaves the text unchanged.
 *
 * @param string $text   Text to redact.
 * @param string $mode   'strict' enables step 4.
 * @param string $region Pattern pack selector passed to getPatternPack().
 * @return array{0: string, 1: array<string, int>} Redacted text and replacement counts per type.
 */
private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array
{
    $counts = [];
    // Replaces every match of $pattern in $text with $token and counts it under $type.
    $replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void {
        $text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string {
            $counts[$type] = ($counts[$type] ?? 0) + 1;
            return $token;
        }, $text) ?? $text;
    };

    foreach ($this->getPatternPack($region) as $entry) {
        $replace($entry['pattern'], $entry['type'], $entry['replacement']);
    }

    // Structured role-label names (Barn: X, Mother: X, etc.) — universal
    $text = preg_replace_callback(
        '/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu',
        function (array $m) use (&$counts): string {
            $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
            return $m[1] . ': [PERSON]';
        },
        $text
    ) ?? $text;

    // Child-identifier phrases ("barnet heter X", "child named X") — universal.
    // Case-insensitivity is scoped to the keywords. A global "i" flag would also relax
    // the capital-letter requirement on the name, redacting any ordinary word after
    // "child"/"barnet"/"son" ("the child was removed"). After an explicit naming verb
    // the name may be in any case; without one it must start with an uppercase letter.
    $text = preg_replace_callback(
        '/\b(?i:barnet|child|sønn|son|datter|daughter)\s+(?:(?i:heter|named|called)\s+\p{L}[\p{L}\-]{2,}|\p{Lu}[\p{L}\-]{2,})\b/u',
        function () use (&$counts): string {
            $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
            return '[CHILD_IDENTIFIER]';
        },
        $text
    ) ?? $text;

    if ($mode === 'strict') {
        $replace('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\s+[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\b/u', 'person_or_child_name', '[PERSON]');
    }

    return [$text, $counts];
}
/**
 * Return the ordered list of deterministic redaction patterns for a region.
 *
 * Packs are cumulative: 'nordic' is the base, 'european' adds to it, 'echr' adds to
 * 'european', and any other value (i.e. 'global') adds to 'echr'. Patterns are meant
 * to be applied in list order, so more specific patterns come before broader ones; in
 * particular all date patterns run before the generic 8-digit phone pattern, which
 * would otherwise claim dates such as "01.02.2015" or "2015-07-30".
 *
 * @param string $region 'nordic', 'european', 'echr' or anything else for the global pack.
 * @return array<int, array{pattern: string, replacement: string, type: string}>
 */
private function getPatternPack(string $region): array
{
    $nordic = [
        ['pattern' => '/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'replacement' => '[EMAIL]', 'type' => 'email'],
        ['pattern' => '/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'replacement' => '[FNR]', 'type' => 'fødselsnummer'],
        // Dates — must precede generic numeric patterns (the phone pattern below)
        // ISO YYYY-MM-DD — before the year range, which would otherwise consume "YYYY-MM"
        ['pattern' => '/(?<!\d)(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
        // Norwegian DD.MM.YYYY and DD/MM/YYYY
        ['pattern' => '/(?<!\d)(?:0?[1-9]|[12]\d|3[01])[.\/](?:0?[1-9]|1[0-2])[.\/](?:19|20)\d{2}(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
        // Year range (e.g. 2011/2012, 2018-2019)
        ['pattern' => '/(?<!\d)(?:19|20)\d{2}\s*[\/\-–—]\s*(?:19|20)?\d{2}(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
        // DD. Month YYYY (e.g. "30. juli 2015") and Month YYYY (Norwegian + English)
        ['pattern' => '/\b(?:\d{1,2}\.?\s+)?(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember|January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+(?:19|20)\d{2}\b/iu', 'replacement' => '[DATE]', 'type' => 'date'],
        // Year after Norwegian/English temporal preposition. \K keeps the preposition out of the
        // replaced span; a lookbehind is avoided because PCRE2 builds before 10.43 reject
        // lookbehinds whose single branch can match different lengths.
        ['pattern' => '/\b(?:i|fra|siden|innen|før|etter|rundt|omkring|cirka|in|from|since|until|before|after|around|circa)\s\K(?:19|20)\d{2}(?![\d\/\-])/iu', 'replacement' => '[DATE]', 'type' => 'date'],
        // Eight digits with optional separators between them, optionally prefixed by +47.
        // Separators are only allowed between digits so trailing punctuation/space is kept.
        ['pattern' => '/(?<!\d)(?:\+47[\s.\-]?)?\d(?:[\s.\-]?\d){7}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
        ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
    ];
    if ($region === 'nordic') {
        return $nordic;
    }

    $european = array_merge($nordic, [
        // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX)
        ['pattern' => '/(?<!\d)\d{6}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
        // Swedish personnummer full (YYYYMMDD-XXXX)
        ['pattern' => '/(?<!\d)\d{8}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
        // UK National Insurance number: two prefix letters, six digits, suffix letter A–D.
        // NOTE(review): the previous comment here described Danish/Finnish CPR / henkilötunnus, but
        // no dedicated pattern exists for those — confirm whether the short personnummer pattern
        // above is meant to cover them.
        ['pattern' => '/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', 'replacement' => '[UK_NI]', 'type' => 'uk_ni'],
        // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds)
        ['pattern' => '/(?<!\d)\d{15}(?!\d)/u', 'replacement' => '[FR_INSEE]', 'type' => 'fr_insee'],
        // IBAN (2-letter country code, 2 check digits, 4 alphanumerics, 7 digits, up to 16 more alphanumerics)
        ['pattern' => '/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', 'replacement' => '[IBAN]', 'type' => 'iban'],
        // European phone (international prefix for major EU/EEA country codes), 7–12 digits with
        // optional separators between them
        ['pattern' => '/(?<!\d)\+(?:44|46|45|358|33|49|34|39|31|32|41|43|30|351|353|48|36|420|421|372|371|370|386|385|356|357|40|359|352)[\s.\-]?\d(?:[\s.\-]?\d){6,11}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
        // Street address expanded to European street-type keywords
        ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
    ]);
    if ($region === 'european') {
        return $european;
    }

    $echr = array_merge($european, [
        // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages)
        ['pattern' => '/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', 'replacement' => '[ECHR_APP_NO]', 'type' => 'echr_app_no'],
        // Date of birth stated in judgment context
        ['pattern' => '/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'],
        ['pattern' => '/\bf\.\s*\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'],
        // National ID label patterns in multiple languages
        ['pattern' => '/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*[\w\s\-]+/iu', 'replacement' => '[NAT_ID]', 'type' => 'nat_id'],
    ]);
    if ($region === 'echr') {
        return $echr;
    }

    // global
    return array_merge($echr, [
        // US Social Security Number
        ['pattern' => '/(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/u', 'replacement' => '[SSN]', 'type' => 'ssn'],
        // Document number in context (passport no., ID No., document no.)
        ['pattern' => '/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', 'replacement' => '[DOC_NO]', 'type' => 'doc_no'],
    ]);
}
/**
 * Second redaction pass: ask an LLM to list identifiers that remain after the
 * mechanical (regex) pass, together with the tag each should be replaced by.
 *
 * @param string $preRedacted  Text sent verbatim as the user message.
 * @param string $language     'no' adds a note that the text may be Norwegian or mixed-language.
 * @param array  $aliases      Rows with 'original' and 'alias' keys; rendered as forced tag overrides.
 * @param string $engine       'regex' skips the pass, 'gpu' uses the GPU helper, 'azure_full' uses
 *                             the gpt-4o deployment, anything else uses gpt-4o-mini.
 * @param bool   $keepOfficials When true, officials keep their name inside the tag ([ROLE: Name]).
 * @param array  $exemptNames  Names that must never be redacted (only the first 20 are used).
 * @param bool   $doNames      False asks the model to skip person names.
 * @param bool   $doOrgs       False asks the model to skip organisation names.
 * @param bool   $doPlaces     False asks the model to skip place names.
 * @param bool   $doDob        False asks the model to skip dates of birth.
 *
 * @return array Either ['skipped' => true, 'reason' => string], or
 *               ['skipped' => false, 'entities' => array, 'deployment' => string] on success, or
 *               ['skipped' => false, 'entities' => [], 'error' => string] on failure.
 */
private function llmRedactionPass(
    string $preRedacted,
    string $language = 'en',
    array $aliases = [],
    string $engine = 'azure_mini',
    bool $keepOfficials = false,
    array $exemptNames = [],
    bool $doNames = true,
    bool $doOrgs = true,
    bool $doPlaces = true,
    bool $doDob = true
): array {
    if ($engine === 'regex') {
        return ['skipped' => true, 'reason' => 'Regex-only mode selected'];
    }
    // The GPU engine does not go through the gateway, so its configuration is not required.
    if ($engine !== 'gpu') {
        $missing = $this->azure->missingChatConfig();
        if ($missing) {
            return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')'];
        }
    }
    $languageNote = $language === 'no' ? "\n • The document may contain Norwegian or mixed-language content." : '';

    // Turns caller-supplied alias text into a short, prompt-safe fragment.
    // Truncation is character-based: a byte-wise cut could split a multibyte
    // character (æ, ø, å …) and leave invalid UTF-8 in the prompt.
    $promptSafe = static function (mixed $value): string {
        if (!is_scalar($value)) {
            return '';
        }
        $clean = trim((string)$value);
        $clean = function_exists('mb_substr')
            ? mb_substr($clean, 0, 100, 'UTF-8')
            : substr($clean, 0, 100);
        return str_replace(["\n", "\r", '`', '"', '{', '}'], ' ', $clean);
    };

    // Build alias block
    $aliasBlock = '';
    if (!empty($aliases)) {
        $lines = [];
        foreach ($aliases as $a) {
            if (!is_array($a)) {
                continue;
            }
            $orig = $promptSafe($a['original'] ?? '');
            $lbl = $promptSafe($a['alias'] ?? '');
            if ($orig !== '' && $lbl !== '') {
                $lines[] = " \"{$orig}\" → [{$lbl}]";
            }
        }
        if ($lines) {
            $aliasBlock = "\n\nALIAS OVERRIDES — use these exact replacement tags for these specific names instead of inferring a role:\n" . implode("\n", $lines);
        }
    }

    // Build exempt names block. Non-scalar and blank entries are dropped so a
    // malformed request cannot raise a TypeError here (this runs outside the try below).
    $exemptBlock = '';
    if (!empty($exemptNames)) {
        $quoted = [];
        foreach (array_slice($exemptNames, 0, 20) as $name) {
            if (!is_scalar($name)) {
                continue;
            }
            $name = (string)$name;
            if (trim($name) === '') {
                continue;
            }
            $quoted[] = '"' . str_replace(['"', "\n", "\r"], ['\\"', ' ', ' '], $name) . '"';
        }
        if ($quoted) {
            $exemptBlock = "\n\nEXEMPT NAMES — these names must NOT be redacted under any circumstances:\n " . implode(', ', $quoted);
        }
    }

    // Build entity-type restriction note
    $skipTypes = [];
    if (!$doOrgs) {
        $skipTypes[] = 'organisation names';
    }
    if (!$doPlaces) {
        $skipTypes[] = 'place names';
    }
    if (!$doDob) {
        $skipTypes[] = 'dates of birth';
    }
    if (!$doNames) {
        $skipTypes[] = 'person names';
    }
    $skipNote = $skipTypes ? "\n\nSKIP these entity types — do NOT redact them: " . implode(', ', $skipTypes) . '.' : '';

    // Build officials note
    $officialsNote = '';
    if ($keepOfficials) {
        $officialsNote = "\n\nOFFICIALS — for persons identified as JUDGE, ATTORNEY, EXPERT_WITNESS, or CASEWORKER in an official capacity: do NOT replace their name with a plain bracket tag. Instead use the format [ROLE: Name], e.g. [JUDGE: Andersen], [ATTORNEY: Skretting] or [EXPERT_WITNESS: Dr. Larsen]. Their name must remain visible inside the tag.";
    }
    $allowedTypesNote = '';
    if (!$doNames) {
        $allowedTypesNote = "\n\nDo NOT include person_name entries in your output.";
    }

    $system = <<<PROMPT
You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text has already had mechanical identifiers (phone numbers, emails, national ID numbers, addresses) replaced with placeholder tags in [BRACKETS].
Your task: find ALL remaining identifiable information — person names, organisation names, specific places at city level or below, and dates/years that could identify when events occurred.
STEP 1 — Identify persons and assign consistent role tags.
Infer each person's role from context and assign a tag used for EVERY occurrence of their name:
• Family roles: FATHER, MOTHER, CHILD, CHILD_1, CHILD_2, GRANDPARENT, SIBLING
• Professional roles: ATTORNEY, JUDGE, CASEWORKER, EXPERT_WITNESS
• Generic fallback: PERSON_1, PERSON_2 (only when role is unclear from context)
The same individual MUST receive the same tag every time they appear.{$aliasBlock}{$exemptBlock}{$officialsNote}
STEP 2 — Name variants: for each person, add a SEPARATE entry for every distinct textual form their name takes in the document. All variants of the same person receive the SAME tag.
Example: if "Per Hansen" also appears as "Per" alone and "Hansen" alone, return three entries: "Per Hansen", "Per", "Hansen" — all tagged [FATHER] (or whichever role applies).
Skip a short form only if it is also a common Norwegian or English word used in a clearly different sense elsewhere in the text.{$skipNote}{$allowedTypesNote}
Return ONLY a valid JSON object:
{"redactions":[{"original":"exact text as it appears in input","type":"person_name","tag":"[FATHER]"}]}
Allowed types and their tag format:
person_name → contextual role tag e.g. [FATHER], [CHILD_1], [ATTORNEY] (or alias tag if overridden above)
org → [ORG]
place → [PLACE] (city, town, neighbourhood, named location — NOT country names)
date_of_birth → [DOB]
date → [DATE] (standalone years, year ranges, month+year references that could identify events)
CRITICAL: "original" must be the date token ONLY — never include surrounding prepositions.
✓ text "i 2015" → original:"2015", tag:"[DATE]"
✓ text "rundt 2011/2012" → original:"2011/2012", tag:"[DATE]"
✓ text "august 2018" → original:"august 2018", tag:"[DATE]"
✓ text "spring of 2019" → original:"spring of 2019", tag:"[DATE]"
✗ WRONG: original:"i 2015" — preposition included, do NOT do this
other → [IDENTIFIER]
Rules:
• "original" must be verbatim text from the input — exact case, no paraphrasing or alterations.
• Do not return entries for text already inside [BRACKETS].
• The same person MUST get the same tag in every entry.
• If nothing remains to redact, return {"redactions":[]}.
• NOT PII: legal citations, statute names, article numbers (e.g. "Barnevernloven § 4-12", "Article 8 ECHR").
• NOT PII: national institution names ("Barnevernet", "Fylkesnemnda", "Oslo tingrett", "the Court").
• NOT PII: country names. City districts and named locations ARE PII.
• NOT PII: short common words, conjunctions, prepositions.{$languageNote}
PROMPT;
    $messages = [
        ['role' => 'system', 'content' => $system],
        ['role' => 'user', 'content' => $preRedacted],
    ];
    $chatOptions = ['temperature' => 0.1, 'max_tokens' => 8000, 'json' => true, 'timeout' => 90];
    try {
        if ($engine === 'gpu') {
            $response = $this->callGpuLlm($messages, $chatOptions);
            $deployLabel = 'GPU (cuttlefish)';
        } elseif ($engine === 'azure_full') {
            $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOptions);
            $deployLabel = 'gpt-4o';
        } else {
            $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOptions);
            $deployLabel = 'gpt-4o-mini';
        }
        $content = (string)($response['choices'][0]['message']['content'] ?? '');
        // The gateway's JSON decoder is used for every engine, including the GPU path.
        $json = $this->azure->decodeJsonObject($content);
        if (!is_array($json) || !array_key_exists('redactions', $json)) {
            return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure'];
        }
        return [
            'skipped' => false,
            'entities' => is_array($json['redactions']) ? $json['redactions'] : [],
            'deployment' => $deployLabel,
        ];
    } catch (Throwable $e) {
        // Best effort: a failed LLM pass is reported to the caller rather than thrown.
        error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage());
        return ['skipped' => false, 'entities' => [], 'error' => $e->getMessage()];
    }
}
/**
 * Thin seam around the GPU chat helper: forwards the messages and options
 * unchanged to dbnToolsCallGpuLlm() and hands back whatever it returns.
 */
private function callGpuLlm(array $messages, array $options = []): array
{
    $gpuResponse = dbnToolsCallGpuLlm($messages, $options);

    return $gpuResponse;
}
// ── Summarize: corpus context + engine-aware summary ─────────────────────
/**
 * Search the shared legal corpus and return top-N passages as a formatted
 * context string. Returns '' on failure so the caller can degrade gracefully.
 *
 * Each passage is rendered as "=== {title} ===\n{content}" and passages are
 * separated by a blank line.
 *
 * @param string      $query   Search text handed to the RAG pipeline.
 * @param int         $limit   Passed to the pipeline both as the result limit and as 'chunk_limit'.
 * @param string|null $persona Optional persona identifier resolved via dbnToolsResolvePersona().
 *
 * @return string Formatted passages, or '' when nothing was found or the search failed.
 */
public function corpusContextForSummarize(string $query, int $limit = 8, ?string $persona = null): string
{
    try {
        $client = dbnToolsRequireClient();
        $clientId = (int)$client['id'];
        $personaResolved = dbnToolsResolvePersona($clientId, $persona);
        $package = $personaResolved['package'] ?? $this->requireFamilyPackage($clientId);
        // 'package_ids' is treated as optional like the other persona keys; a
        // missing or empty value falls back to the single resolved package.
        $packageIds = ($personaResolved['package_ids'] ?? []) ?: [(int)$package['id']];
        $searchMethod = (string)($personaResolved['search_method'] ?? 'keyword') ?: 'keyword';
        $personaRagOpts = is_array($personaResolved['rag_opts'] ?? null) ? $personaResolved['rag_opts'] : [];
        dbnToolsBootCaveau();

        // Default gateway address, overridden by config when ai_gateway.url is set.
        $gatewayUrl = 'http://10.0.1.10:4000';
        try {
            $config = getConfig();
            $u = trim((string)($config['ai_gateway']['url'] ?? ''));
            if ($u !== '') {
                $gatewayUrl = $u;
            }
        } catch (Throwable) {
            // Deliberate best effort: keep the default URL when config cannot be read.
        }

        // NOTE(review): the third constructor argument (20) is not explained here — confirm its meaning.
        $rag = new ClientRagPipeline($clientId, $gatewayUrl, 20);
        // Fixed options come last so they take precedence over persona-supplied ones.
        $chunks = $rag->searchAll($query, $limit, null, array_merge($personaRagOpts, [
            'search_private' => true,
            'search_shared' => true,
            'package_ids' => $packageIds,
            'chunk_limit' => $limit,
            'search_method' => $searchMethod,
            'min_private' => 0,
            'include_beta_website' => true,
        ]));

        $parts = [];
        foreach ($chunks as $c) {
            $title = (string)($c['title'] ?? ($c['source'] ?? 'Legal source'));
            $content = (string)($c['content'] ?? ($c['text'] ?? ''));
            // Skip passages with no visible text so they do not produce empty sections.
            if (trim($content) !== '') {
                $parts[] = "=== {$title} ===\n{$content}";
            }
        }
        return implode("\n\n", $parts);
    } catch (Throwable $e) {
        error_log('summarize corpus search failed: ' . $e->getMessage());
        return '';
    }
}
/**
 * Engine-aware structured summarization, optionally enriched with corpus context.
 *
 * @param string $text          Document text; validated through requirePasteText().
 * @param string $language      Language code used for the output locale and the disclaimer.
 * @param string $engine        'azure_mini', 'azure_full' or 'gpu'; any other value falls back to 'azure_mini'.
 * @param string $corpusContext Pre-formatted corpus passages; '' means no enrichment.
 *
 * @return array Summary payload: tool, language, what_we_found, key_facts, dates, parties,
 *               legal_references_detected, what_remains_uncertain, next_practical_step,
 *               corpus_used, trace, trace_metadata and disclaimer. Text fields are always
 *               strings and list fields are always arrays, whatever the model returned.
 */
public function summarizeWithContext(
    string $text,
    string $language = 'en',
    string $engine = 'azure_mini',
    string $corpusContext = ''
): array {
    $text = $this->requirePasteText($text);
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
    $locale = dbnToolsLanguageName($language);
    $enriched = $text;
    $corpusUsed = $corpusContext !== '';
    if ($corpusUsed) {
        $enriched = '[Relevant legal context from ' . dbnToolsProductName() . " corpus]\n"
            . $corpusContext
            . "\n\n---\n\nDocument to summarise:\n"
            . $text;
    }
    $prompt = <<<PROMPT
Summarise the following document in {$locale}. Do not invent facts not present in the text.
Return JSON only — no extra text before or after the JSON object.
{$enriched}
Return this JSON structure:
{
"what_we_found": "plain-language summary (2-4 sentences)",
"key_facts": ["fact 1", "fact 2"],
"dates": ["date or event phrase"],
"parties": ["party or role"],
"legal_references_detected": ["statute, article, or case name"],
"what_remains_uncertain": ["uncertainty or gap"],
"next_practical_step": "one concrete next action"
}
PROMPT;
    $system = $this->legalJsonSystemPrompt($language);
    $messages = [
        ['role' => 'system', 'content' => $system],
        ['role' => 'user', 'content' => $prompt],
    ];
    $maxTok = ($engine === 'azure_full') ? 8000 : 4000;
    $chatOpts = ['json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTok, 'timeout' => 120];
    $deployLabel = $this->azure->chatDeployment();
    try {
        if ($engine === 'gpu') {
            $response = $this->callGpuLlm($messages, $chatOpts);
            $deployLabel = 'GPU (local)';
        } elseif ($engine === 'azure_full') {
            $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOpts);
            $deployLabel = 'gpt-4o';
        } else {
            $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOpts);
            $deployLabel = 'gpt-4o-mini';
        }
    } catch (Throwable $e) {
        // NOTE(review): the code below relies on dbnToolsAbort() not returning — confirm.
        dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }
    $raw = (string)($response['choices'][0]['message']['content'] ?? '');
    $json = $this->azure->decodeJsonObject($raw);
    if (!$json) {
        dbnToolsAbort('LLM returned unparseable JSON.', 502, 'llm_parse_error');
    }

    // Model output is untrusted: coerce every field to the type the response
    // contract promises instead of casting blindly (an array cast to string
    // yields "Array" plus a warning; a bare string would break list consumers).
    $asText = static function (mixed $value, string $fallback = ''): string {
        if (is_array($value)) {
            $value = implode(' ', array_filter($value, 'is_scalar'));
        }
        return is_scalar($value) ? (string)$value : $fallback;
    };
    $asList = static fn (mixed $value): array => match (true) {
        is_array($value) => $value,
        is_scalar($value) && trim((string)$value) !== '' => [(string)$value],
        default => [],
    };

    // Passage count is derived from the "=== title ===" headers in the context string.
    $corpusNote = $corpusUsed
        ? 'Summary enriched with ' . count(array_filter(explode('=== ', $corpusContext))) . ' passage(s) from the ' . dbnToolsProductName() . ' legal corpus.'
        : 'No corpus search performed; summarised from document text only.';
    $nextStep = $asText($json['next_practical_step'] ?? null);
    $trace = [
        $this->trace('Document preparation', 'Text validated and prepared for summarisation.', 'complete'),
        $this->trace('Corpus enrichment', $corpusNote, 'complete'),
        $this->trace('Summary generation', 'Structured summary generated via ' . $deployLabel . '.', 'complete'),
        $this->trace('Uncertainty', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'),
        $this->trace(
            'Next practical step',
            $asText($json['next_practical_step'] ?? null, 'Review the summary against the original document.'),
            'complete'
        ),
    ];
    return [
        'tool' => 'summarize',
        'language' => $language,
        'what_we_found' => $asText($json['what_we_found'] ?? null),
        'key_facts' => $asList($json['key_facts'] ?? null),
        'dates' => $asList($json['dates'] ?? null),
        'parties' => $asList($json['parties'] ?? null),
        'legal_references_detected' => $asList($json['legal_references_detected'] ?? null),
        'what_remains_uncertain' => $asList($json['what_remains_uncertain'] ?? null),
        'next_practical_step' => $nextStep,
        'corpus_used' => $corpusUsed,
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => 1,
            'source_count' => 1,
            'deployment' => $deployLabel,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Replace every contextual role tag — plain ([FATHER]), numbered ([CHILD_1])
 * or name-carrying ([JUDGE: Andersen]) — with the generic [PERSON] tag.
 * Any other bracketed tag is left untouched.
 */
private function applyGenericTags(string $text): string
{
    $numbered = '(?:_\d+)?';        // optional "_<n>" suffix
    $named = '(?::\s*[^\]]+)?';     // optional ": Name" kept inside the tag
    $roles = [
        'FATHER',
        'MOTHER',
        'CHILD' . $numbered,
        'GRANDPARENT',
        'SIBLING',
        'ATTORNEY' . $named,
        'JUDGE' . $named,
        'CASEWORKER' . $named,
        'EXPERT_WITNESS' . $named,
        'PERSON' . $numbered,
    ];
    $pattern = '/\[(?:' . implode('|', $roles) . ')\]/u';

    $collapsed = preg_replace($pattern, '[PERSON]', $text);

    // preg_replace() yields null on a PCRE error; fall back to the input in that case.
    return $collapsed === null ? $text : $collapsed;
}
/**
 * Replace redaction tags with readable placeholder values.
 *
 * Role tags map to a fixed pool of Norwegian example names; the same tag text
 * always maps to the same name within one call. [PHONE], [EMAIL], [ADDRESS]
 * and [ORG] receive a fresh numbered placeholder per occurrence. The remaining
 * tags map to one constant placeholder each.
 *
 * @param string $text      Text containing redaction tags.
 * @param array  $allCounts Not read by this method; kept so existing callers keep working.
 *
 * @return string Text with the tags substituted.
 */
private function applyPseudonymization(string $text, array $allCounts): string
{
    $norwegianNames = [
        'Ola Nordmann', 'Per Hansen', 'Kari Larsen', 'Anne Berg', 'Erik Dahl',
        'Ingrid Holm', 'Lars Moen', 'Silje Bakke', 'Tor Haugen', 'Eva Strand',
    ];
    $poolSize = count($norwegianNames);
    $nameCursor = 0;
    $phoneBase = 1;
    $emailCursor = 0;
    $addrCursor = 1;
    $orgCursor = 1;
    $personMap = [];

    // Replace named role tags (keeping consistent mapping per unique tag)
    $text = preg_replace_callback(
        '/\[(FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY(?::\s*[^\]]+)?|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u',
        function (array $m) use (&$nameCursor, &$personMap, $norwegianNames, $poolSize): string {
            $key = $m[1];
            if (!isset($personMap[$key])) {
                $name = $norwegianNames[$nameCursor % $poolSize];
                // Once the pool is exhausted, suffix the name so two different
                // persons never share one pseudonym.
                $round = intdiv($nameCursor, $poolSize);
                $personMap[$key] = $round === 0 ? $name : $name . ' (' . ($round + 1) . ')';
                $nameCursor++;
            }
            return $personMap[$key];
        },
        $text
    ) ?? $text;

    $text = preg_replace_callback('/\[PHONE\]/', function () use (&$phoneBase): string {
        return sprintf('+47 400 00 %03d', $phoneBase++);
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[EMAIL\]/', function () use (&$emailCursor): string {
        $letter = chr(ord('a') + ($emailCursor % 26));
        // After 26 addresses the letters repeat; append the cycle number to stay unique.
        $cycle = intdiv($emailCursor, 26);
        $emailCursor++;
        $suffix = $cycle > 0 ? (string)$cycle : '';
        return "person.{$letter}{$suffix}@example.no";
    }, $text) ?? $text;

    // The address and organisation counters advance per occurrence, like the
    // phone counter, so separate entries do not collapse into one placeholder.
    $text = preg_replace_callback('/\[ADDRESS\]/', function () use (&$addrCursor): string {
        $number = $addrCursor++;
        return "Eksempelveien {$number}, 0001 Oslo";
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[ORG\]/', function () use (&$orgCursor): string {
        $number = $orgCursor++;
        return "Eksempel AS ({$number})";
    }, $text) ?? $text;

    // Tags with one constant placeholder each; strtr() replaces them in a
    // single pass and never rescans its own output.
    return strtr($text, [
        '[FNR]' => '010100XXXXX',
        '[SE_PERSONNUMMER]' => '[ID-REDACTED]',
        '[FR_INSEE]' => '[ID-REDACTED]',
        '[UK_NI]' => '[ID-REDACTED]',
        '[SSN]' => '[ID-REDACTED]',
        '[NAT_ID]' => '[ID-REDACTED]',
        '[DOC_NO]' => '[ID-REDACTED]',
        '[ECHR_APP_NO]' => '[ID-REDACTED]',
        '[PLACE]' => 'Eksempelby',
        '[DOB]' => '01.01.0000',
        '[IBAN]' => 'NO00 0000 00 00000',
    ]);
}
/**
 * Condense the model's "what remains uncertain" value into one short trace line.
 *
 * Accepts a string, a list, or arbitrarily nested arrays (LLM output is not
 * guaranteed to be a flat list). Only scalar and Stringable leaves contribute;
 * anything else is ignored rather than rendered as "Array".
 *
 * @param mixed $uncertainty Raw value taken from the decoded LLM response.
 *
 * @return string An excerpt of at most 220 characters (via dbnToolsExcerpt()),
 *                or a fixed fallback sentence when there is nothing to report.
 */
private function uncertaintySummary(mixed $uncertainty): string
{
    $fragments = [];
    $collect = static function (mixed $leaf) use (&$fragments): void {
        if (!is_scalar($leaf) && !($leaf instanceof Stringable)) {
            return;
        }
        $piece = trim((string)$leaf);
        if ($piece !== '') {
            $fragments[] = $piece;
        }
    };

    if (is_array($uncertainty)) {
        // Visits every non-array leaf, however deeply the list is nested.
        array_walk_recursive($uncertainty, $collect);
    } else {
        $collect($uncertainty);
    }

    $summary = implode(' ', $fragments);

    return $summary !== ''
        ? dbnToolsExcerpt($summary, 220)
        : 'No additional uncertainty was supplied by the tool.';
}
/**
 * Build one trace entry: an array holding the step's label, detail text and
 * status, in that key order. Status defaults to 'complete'.
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    // compact() keys the array by variable name, in the order listed.
    return compact('label', 'detail', 'status');
}
}