Files
dobetternorge-tools/includes/LegalTools.php
T
daveadmin f183678f35 Redact: catch soft dates (years, month+year, ranges, prepositions)
Adds Nordic-pack regex patterns for:
- DD.MM.YYYY / DD/MM/YYYY / YYYY-MM-DD
- Year ranges (2011/2012, 2018-2019)
- Month + year (Norwegian + English, with optional day)
- Year preceded by temporal preposition (i 2015, fra 2019, rundt 2018)

Also renames the entity toggle from "Dates of birth" to "Dates" (broader
scope) in all four languages, and expands the LLM prompt so soft date
references in free text are caught even when regex misses them.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 01:58:35 +02:00

1237 lines
59 KiB
PHP

<?php
declare(strict_types=1);
require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';
final class DbnLegalToolsService
{
private const MAX_PASTE_CHARS = 128000;
private DbnAzureOpenAiGateway $azure;
/**
 * Wires the service to an Azure OpenAI gateway.
 *
 * @param DbnAzureOpenAiGateway|null $azure Gateway to use; when omitted a default-constructed gateway is created.
 */
public function __construct(?DbnAzureOpenAiGateway $azure = null)
{
    if ($azure === null) {
        $azure = new DbnAzureOpenAiGateway();
    }
    $this->azure = $azure;
}
/**
 * Runs a keyword search over the client's private corpus and the subscribed family-legal package.
 *
 * Retrieval is attempted through ClientRagPipeline::searchAll() in keyword mode. When the pipeline
 * throws, or returns no chunks, a direct SQL keyword fallback is used instead. The fallback is
 * attempted at most once per call.
 *
 * @param string      $query        Search text; the request is aborted (422) when shorter than 3 characters after trimming.
 * @param string      $language     Echoed in the response and passed to the disclaimer helper.
 * @param int         $limit        Maximum number of hits; clamped to the range 1..10.
 * @param string      $temporalMode 'legal_conservative' enables temporal reranking; any other value is treated as 'disabled'.
 * @param string|null $asOfDate     Forwarded to LegalTemporalLayer::rerank() when reranking is enabled.
 *
 * @return array Response payload containing hits, evidence trail, trace steps and trace metadata.
 */
public function search(
    string $query,
    string $language = 'en',
    int $limit = 6,
    string $temporalMode = 'disabled',
    ?string $asOfDate = null
): array {
    $query = trim($query);
    if (mb_strlen($query, 'UTF-8') < 3) {
        dbnToolsAbort('Search query must be at least 3 characters.', 422, 'query_too_short');
    }
    $limit = max(1, min(10, $limit));
    $temporalMode = in_array($temporalMode, ['legal_conservative', 'disabled'], true) ? $temporalMode : 'disabled';
    $trace = [
        $this->trace('Query interpretation', 'Searching Do Better Norge private corpus plus the subscribed family-legal package.', 'complete'),
        // Index 1 is a placeholder; it is overwritten once the retrieval outcome is known.
        $this->trace('Search tools used', 'ClientRagPipeline::searchAll with keyword mode, private corpus enabled, shared package filter set to family-legal.', 'running'),
    ];
    $client = dbnToolsRequireClient();
    $clientId = (int)$client['id'];
    $package = $this->requireFamilyPackage($clientId);
    $chunks = [];
    $retrievalNote = 'ClientRagPipeline keyword retrieval';
    // Tracks whether the SQL fallback already ran, so an empty fallback result is not re-queried below.
    $fallbackTried = false;
    try {
        dbnToolsBootCaveau();
        $gatewayUrl = 'http://10.0.1.10:4000';
        try {
            $config = getConfig();
            $configured = trim((string)($config['ai_gateway']['url'] ?? ''));
            if ($configured !== '') {
                $gatewayUrl = $configured;
            }
        } catch (Throwable $e) {
            // Retrieval still works in keyword mode without gateway config.
        }
        $rag = new ClientRagPipeline($clientId, $gatewayUrl, 30);
        $chunks = $rag->searchAll($query, $limit, null, [
            'search_private' => true,
            'search_shared' => true,
            'package_ids' => [(int)$package['id']],
            'chunk_limit' => $limit,
            'search_method' => 'keyword',
            'min_private' => 0,
            'include_beta_website' => true,
        ]);
        // Apply temporal reranking after retrieval (optional)
        if ($temporalMode === 'legal_conservative' && !empty($chunks)) {
            $temporalLayerPath = __DIR__ . '/../../ai-portal/platform/includes/LegalTemporalLayer.php';
            if (file_exists($temporalLayerPath)) {
                require_once $temporalLayerPath;
                $layer = new LegalTemporalLayer(['temporal_mode' => $temporalMode]);
                $chunks = $layer->rerank($chunks, $query, $asOfDate);
            }
        }
    } catch (Throwable $e) {
        $retrievalNote = 'SQL keyword fallback after ClientRagPipeline error';
        $trace[] = $this->trace('Search fallback', 'Pipeline retrieval failed; using direct SQL keyword fallback without storing the query.', 'warning');
        $chunks = $this->fallbackKeywordSearch($clientId, $package, $query, $limit);
        $fallbackTried = true;
    }
    if (!$chunks && !$fallbackTried) {
        $fallback = $this->fallbackKeywordSearch($clientId, $package, $query, $limit);
        if ($fallback) {
            $chunks = $fallback;
            $retrievalNote = 'SQL keyword fallback';
        }
    }
    $topChunks = array_slice($chunks, 0, $limit);
    // Only non-private (shared package) documents have summaries to look up.
    $sharedDocIds = [];
    foreach ($topChunks as $chunk) {
        if (($chunk['source_type'] ?? '') !== 'private' && isset($chunk['document_id'])) {
            $sharedDocIds[(int)$chunk['document_id']] = true;
        }
    }
    $docSummaries = $sharedDocIds ? $this->fetchDocSummaries(array_keys($sharedDocIds)) : [];
    $hits = array_map(
        fn(array $chunk): array => $this->sourceFromChunk(
            $chunk,
            ($chunk['source_type'] ?? '') !== 'private'
                ? ($docSummaries[(int)($chunk['document_id'] ?? 0)] ?? null)
                : null
        ),
        $topChunks
    );
    $hitCount = count($hits);
    $confidence = $this->citationConfidence($hits);
    $trace[1] = $this->trace('Search tools used', $retrievalNote . '; returned ' . $hitCount . ' source hit(s).', 'complete');
    $trace[] = $this->trace('Evidence found', $hitCount ? 'Retrieved source excerpts for review.' : 'No matching source excerpts were found.', $hitCount ? 'complete' : 'warning');
    $trace[] = $this->trace('Citation confidence', ucfirst($confidence) . ' confidence based on source count and retrieval scores.', $confidence === 'low' ? 'warning' : 'complete');
    return [
        'tool' => 'search',
        'language' => $language,
        'what_we_found' => $hitCount ? 'Found source excerpts from the legal corpus.' : 'No matching source excerpts were found.',
        'hits' => $hits,
        'evidence_trail' => $hits,
        'what_remains_uncertain' => $hitCount ? 'Search results still need human review for legal relevance and currentness.' : 'The corpus may not contain enough evidence for this query.',
        'next_practical_step' => $hitCount ? 'Open the strongest sources and confirm the cited sections before relying on them.' : 'Try a narrower query with statutory terms, party names, or dates.',
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($chunks),
            'source_count' => $hitCount,
            'deployment' => null,
            'citation_confidence' => $confidence,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Answers a question using only excerpts retrieved by search().
 *
 * When the search yields no hits the LLM is not called and a fixed "not enough source support"
 * answer is returned. Otherwise the excerpts are sent to the chat deployment and the JSON reply is
 * mapped onto the response; a reply that cannot be decoded is passed through as a plain-text answer.
 *
 * @param string $question User question; also used as the search query (limit 7).
 * @param string $language 'no' selects Norwegian wording, anything else English.
 *
 * @return array Response payload with answer, evidence trail, trace steps and trace metadata.
 */
public function ask(string $question, string $language = 'en'): array
{
    $search = $this->search($question, $language, 7);
    $hits = $search['hits'];
    $trace = $search['trace'];
    $disclaimerLanguage = $language;

    // Guard: without evidence there is nothing to synthesise from.
    if (!$hits) {
        $trace[] = $this->trace('Synthesis', 'Skipped answer synthesis because no evidence was found.', 'warning');
        if ($language === 'no') {
            $noEvidenceAnswer = 'Jeg fant ikke nok kildestøtte i familie-rettskorpuset til å svare sikkert.';
        } else {
            $noEvidenceAnswer = 'I did not find enough source support in the family-law corpus to answer safely.';
        }
        return [
            'tool' => 'ask',
            'language' => $language,
            'answer' => $noEvidenceAnswer,
            'what_we_found' => $search['what_we_found'],
            'evidence_trail' => [],
            'what_remains_uncertain' => $search['what_remains_uncertain'],
            'next_practical_step' => $search['next_practical_step'],
            'trace' => $trace,
            'trace_metadata' => [
                'chunk_count' => 0,
                'source_count' => 0,
                'deployment' => null,
                'citation_confidence' => 'low',
            ],
            'disclaimer' => dbnToolsDisclaimer($disclaimerLanguage),
        ];
    }

    $this->azure->requireChat();
    $context = $this->buildEvidenceContext($hits);
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
Question:
{$question}
Evidence excerpts:
{$context}
Return JSON only with these keys:
{
"answer": "short direct answer in {$locale}",
"what_we_found": "plain-language summary of the supported finding",
"evidence_trail": [{"title":"source title","why_it_matters":"one sentence","citation":"visible source title or section"}],
"what_remains_uncertain": ["specific gaps or caveats"],
"next_practical_step": "one concrete next action"
}
PROMPT;
    $messages = [
        ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language)],
        ['role' => 'user', 'content' => $prompt],
    ];
    $chatOptions = [
        'json' => true,
        'temperature' => 0.15,
        'max_tokens' => 1300,
    ];
    $reply = $this->azure->chatText($messages, $chatOptions);
    $parsed = $this->azure->decodeJsonObject($reply);
    if (!$parsed) {
        // The model ignored the JSON contract: surface its text as the answer and flag the gap.
        $parsed = [
            'answer' => $reply,
            'what_we_found' => 'Azure returned a plain-text answer based on the retrieved excerpts.',
            'evidence_trail' => [],
            'what_remains_uncertain' => ['The response format could not be validated as structured JSON.'],
            'next_practical_step' => 'Review the source excerpts manually before relying on the answer.',
        ];
    }

    $uncertain = $parsed['what_remains_uncertain'] ?? [];
    $trace[] = $this->trace('Synthesis', 'Azure OpenAI generated an answer using only the retrieved source excerpts.', 'complete');
    $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), 'complete');
    $trace[] = $this->trace('Next practical step', (string)($parsed['next_practical_step'] ?? 'Review the evidence trail.'), 'complete');

    $sourceTotal = count($hits);
    return [
        'tool' => 'ask',
        'language' => $language,
        'answer' => (string)($parsed['answer'] ?? ''),
        'what_we_found' => (string)($parsed['what_we_found'] ?? ''),
        'evidence_trail' => $hits,
        'citation_notes' => $this->normalizeEvidenceTrail($parsed['evidence_trail'] ?? [], $hits),
        'sources' => $hits,
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => (string)($parsed['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => $sourceTotal,
            'source_count' => $sourceTotal,
            'deployment' => $this->azure->chatDeployment(),
            'citation_confidence' => $search['trace_metadata']['citation_confidence'] ?? 'medium',
        ],
        'disclaimer' => dbnToolsDisclaimer($disclaimerLanguage),
    ];
}
/**
 * Summarises user-pasted text into facts, dates, parties and detected legal references.
 *
 * No corpus search is performed; the only source is the pasted text.
 *
 * @param string $text     Pasted text; validated by requirePasteText().
 * @param string $language 'no' selects Norwegian output, anything else English.
 *
 * @return array Response payload with the structured summary, trace steps and trace metadata.
 */
public function summarize(string $text, string $language = 'en'): array
{
    $text = $this->requirePasteText($text);
    $this->azure->requireChat();
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
Summarize this pasted case-preparation text in {$locale}. Do not invent missing facts.
Pasted text:
{$text}
Return JSON only:
{
"what_we_found": "plain-language summary",
"key_facts": ["fact"],
"dates": ["date or unknown"],
"parties": ["party or role"],
"legal_references_detected": ["reference"],
"what_remains_uncertain": ["uncertainty"],
"next_practical_step": "one concrete next action"
}
PROMPT;
    $result = $this->runJsonTool($prompt, $language, 1300);
    $uncertain = $result['what_remains_uncertain'] ?? [];

    $trace = [];
    $trace[] = $this->trace('Query interpretation', 'Summarize pasted text without saving the text or output.', 'complete');
    $trace[] = $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete');
    $trace[] = $this->trace('Evidence found', 'Evidence trail is limited to the pasted text supplied in this request.', 'complete');
    $trace[] = $this->trace('Citation confidence', 'Medium confidence for factual extraction; no external legal source verification was performed.', 'warning');
    $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), 'complete');
    $trace[] = $this->trace('Next practical step', (string)($result['next_practical_step'] ?? 'Review the summary against the original text.'), 'complete');

    return [
        'tool' => 'summarize',
        'language' => $language,
        'what_we_found' => (string)($result['what_we_found'] ?? ''),
        'key_facts' => $result['key_facts'] ?? [],
        'dates' => $result['dates'] ?? [],
        'parties' => $result['parties'] ?? [],
        'legal_references_detected' => $result['legal_references_detected'] ?? [],
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => (string)($result['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => 1,
            'source_count' => 1,
            'deployment' => $this->azure->chatDeployment(),
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Extracts a chronological timeline of dated events from user-pasted text via an LLM.
 *
 * @param string $text              Pasted text; validated by requirePasteText().
 * @param string $language          'no' selects Norwegian output, anything else English.
 * @param string $engine            'azure_mini', 'azure_full' or 'gpu'; unknown values fall back to 'azure_mini'.
 * @param string $focus             'all', 'deadlines', 'hearings' or 'cps'; unknown values fall back to 'all'.
 * @param string $confidenceFilter  'high_medium' drops events whose confidence is 'low' (or missing).
 * @param bool   $includeRelative   When false, only events with date_type 'absolute' (or no date_type) are kept.
 * @param bool   $includeBackground Chooses between the "include background" and "exclude background" prompt instruction.
 *
 * @return array Response payload with events, evidence trail, trace steps and trace metadata.
 */
public function timeline(
    string $text,
    string $language = 'en',
    string $engine = 'azure_mini',
    string $focus = 'all',
    string $confidenceFilter = 'all',
    bool $includeRelative = true,
    bool $includeBackground = true
): array {
    $text = $this->requirePasteText($text);
    if (!in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true)) {
        $engine = 'azure_mini';
    }
    if (!in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true)) {
        $focus = 'all';
    }
    // The GPU engine does not go through Azure, so the Azure chat check only applies to the other engines.
    if ($engine !== 'gpu') {
        $this->azure->requireChat();
    }
    $locale = $language === 'no' ? 'Norwegian' : 'English';

    // Optional extra prompt paragraph per focus; 'all' adds nothing.
    $focusInstructions = [
        'deadlines' => "\nFocus specifically on: legal deadlines, filing dates, response windows, appeal periods, and statutory time limits. Deprioritise narrative events with no legal deadline significance.",
        'hearings' => "\nFocus specifically on: court hearings, tribunal sessions, mediation sessions, formal meetings, and hearing-related procedural dates.",
        'cps' => "\nFocus specifically on: CPS (Barnevernet) interventions, home visits, case reviews, acute measures (akuttvedtak), and Fylkesnemnda proceedings.",
    ];
    $focusInstruction = $focusInstructions[$focus] ?? '';

    if ($includeBackground) {
        $backgroundInstruction = "\nAlso extract BACKGROUND and NARRATIVE events: dates embedded in contextual paragraphs, historical facts, year-only references, and approximate years (e.g. \"rundt 2011/2012\", \"David ble født den 30.07.2015\", \"familien i 2015\"). These are valid timeline events even when they appear in introductory or background text — do NOT skip them.";
    } else {
        $backgroundInstruction = "\nDo NOT include purely historical background or narrative context dates. Focus only on operational events, deadlines, and milestones that are directly actionable in the case.";
    }

    $prompt = <<<PROMPT
Build a chronological timeline from the pasted text in {$locale}.
Extract ALL dates, deadlines, milestones, and temporal references.{$focusInstruction}{$backgroundInstruction}
IMPORTANT — Norwegian date formats to recognise:
- DD.MM.YY (e.g. 18.09.25 = 2025-09-18, 09.04.25 = 2025-04-09)
- D.M.YY (e.g. 6.1.25 = 2025-01-06)
- DD.MM. (e.g. 18.09. — day and month without year; infer year from surrounding context)
- D.M. (e.g. 6.1. — day and month only)
- DD.MM.YYYY (e.g. 18.09.2025)
- Two-digit years: always interpret as 20YY (25 → 2025, 24 → 2024).
- Diary / log format: lines that begin with a date followed by a colon or space are ALWAYS events.
Example: "18.09.25: Samtale med Davids lærer" → date 2025-09-18, event "Samtale med Davids lærer".
Example: "6.1. Samtaler med David" → date unknown-year-01-06, event "Samtaler med David".
- Do NOT skip a line just because the year is ambiguous — record what you can and set confidence accordingly.
For each temporal reference provide:
- "date": ISO 8601 date (YYYY-MM-DD) if determinable, otherwise a human-readable description such as "06 Jan (year unknown)"
- "date_type": one of absolute | relative | recurring | conditional | period
- "actor": person, institution, or party involved — or "unknown"
- "event": concise description of what happened or is due
- "source_excerpt": the verbatim phrase from the text that grounds this event (≤ 30 words)
- "confidence": high | medium | low
Sort events chronologically (absolute dates first, then relative, then recurring).
Keep uncertain dates explicit — do not invent dates not in the text.
If multiple documents are separated by "--- Document: … ---" markers, note the source document in the event description where helpful.
Pasted text:
{$text}
Return JSON only:
{
"what_we_found": "short overview",
"events": [{"date":"...","date_type":"absolute","actor":"...","event":"...","source_excerpt":"...","confidence":"high|medium|low"}],
"evidence_trail": [{"title":"...","excerpt":"..."}],
"what_remains_uncertain": ["..."],
"next_practical_step": "..."
}
PROMPT;

    $messages = [
        ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language)],
        ['role' => 'user', 'content' => $prompt],
    ];
    $chatOptions = ['json' => true, 'temperature' => 0.1, 'max_tokens' => 4000];
    $deployLabel = $this->azure->chatDeployment();
    try {
        switch ($engine) {
            case 'gpu':
                $response = $this->callGpuLlm($messages, $chatOptions);
                $deployLabel = 'GPU (cuttlefish)';
                break;
            case 'azure_full':
                $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOptions);
                $deployLabel = 'gpt-4o';
                break;
            default:
                $response = $this->azure->chat($messages, $chatOptions);
                $deployLabel = $this->azure->chatDeployment();
                break;
        }
    } catch (Throwable $e) {
        dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $content = (string)($response['choices'][0]['message']['content'] ?? '');
    $json = $this->azure->decodeJsonObject($content);
    if (!$json) {
        dbnToolsAbort('The selected engine did not return valid structured JSON.', 502, 'llm_invalid_json');
    }

    $events = [];
    if (is_array($json['events'] ?? null)) {
        $events = $json['events'];
    }

    // Post-filter: drop low-confidence events (a missing confidence counts as low).
    $dropLowConfidence = $confidenceFilter === 'high_medium';
    if ($dropLowConfidence) {
        $kept = [];
        foreach ($events as $ev) {
            if (($ev['confidence'] ?? 'low') !== 'low') {
                $kept[] = $ev;
            }
        }
        $events = $kept;
    }
    // Post-filter: keep only absolute dates (a missing date_type counts as absolute).
    if (!$includeRelative) {
        $kept = [];
        foreach ($events as $ev) {
            if (($ev['date_type'] ?? 'absolute') === 'absolute') {
                $kept[] = $ev;
            }
        }
        $events = $kept;
    }

    $engineLabel = match ($engine) {
        'gpu' => 'GPU (cuttlefish)',
        'azure_full' => 'gpt-4o',
        default => $deployLabel ?? $this->azure->chatDeployment(),
    };
    $focusLabels = [
        'deadlines' => 'legal deadlines',
        'hearings' => 'court hearings',
        'cps' => 'CPS milestones',
    ];
    $focusLabel = $focusLabels[$focus] ?? 'all events';

    $eventTotal = count($events);
    $uncertain = $json['what_remains_uncertain'] ?? [];
    $trace = [
        $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Without saving the text or output.", 'complete'),
        $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete'),
        $this->trace('Evidence found', $eventTotal . ' event(s) identified' . ($dropLowConfidence ? ' (low-confidence filtered out)' : '') . '.', $eventTotal ? 'complete' : 'warning'),
        $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text.', 'complete'),
        $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), 'complete'),
        $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Verify dates against original documents.'), 'complete'),
    ];

    return [
        'tool' => 'timeline',
        'language' => $language,
        'what_we_found' => (string)($json['what_we_found'] ?? ''),
        'events' => $events,
        'evidence_trail' => $json['evidence_trail'] ?? [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => (string)($json['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => $eventTotal,
            'source_count' => 1,
            'deployment' => $engineLabel,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Redacts personal identifiers from user-pasted text in two passes.
 *
 * Pass 1 applies the deterministic regex pack for the region. Pass 2 asks llmRedactionPass() for
 * additional entities and replaces every occurrence of each returned original string with its tag.
 * The output is then optionally normalised to generic tags or pseudonyms.
 *
 * @param string $text          Pasted text; validated by requirePasteText().
 * @param string $mode          'strict' or 'standard' (any other value is treated as 'standard').
 * @param string $region        'nordic', 'european', 'echr' or 'global'; unknown values fall back to 'nordic'.
 * @param string $language      Forwarded to the LLM pass.
 * @param array  $aliases       Forwarded to the LLM pass.
 * @param string $engine        'azure_mini', 'azure_full', 'gpu' or 'regex'; unknown values fall back to 'azure_mini'.
 * @param string $outputFormat  'contextual', 'generic' or 'pseudonym'; unknown values fall back to 'contextual'.
 * @param bool   $keepOfficials Forwarded to the LLM pass.
 * @param array  $exemptNames   Forwarded to the LLM pass.
 * @param array  $redactTypes   Flags 'names', 'orgs', 'places', 'dob'; each is on unless explicitly false.
 *
 * @return array Response payload with the redacted text, per-type entity counts, trace steps and metadata.
 */
public function redact(
    string $text,
    string $mode = 'standard',
    string $region = 'nordic',
    string $language = 'en',
    array $aliases = [],
    string $engine = 'azure_mini',
    string $outputFormat = 'contextual',
    bool $keepOfficials = false,
    array $exemptNames = [],
    array $redactTypes = []
): array {
    $text = $this->requirePasteText($text);
    $mode = $mode === 'strict' ? 'strict' : 'standard';
    $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'regex'], true) ? $engine : 'azure_mini';
    $outputFormat = in_array($outputFormat, ['contextual', 'generic', 'pseudonym'], true) ? $outputFormat : 'contextual';
    // Normalise entity-type flags (all on by default)
    $doNames = ($redactTypes['names'] ?? true) !== false;
    $doOrgs = ($redactTypes['orgs'] ?? true) !== false;
    $doPlaces = ($redactTypes['places'] ?? true) !== false;
    $doDob = ($redactTypes['dob'] ?? true) !== false;
    // Pass 1 — deterministic regex
    [$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region);
    $pass1Total = array_sum($pass1Counts);
    $pass1NonZero = array_filter($pass1Counts, static fn($v): bool => $v > 0);
    $pass1Detail = $pass1Total
        ? implode(', ', array_map(
            static fn($k, $v) => "{$k}: {$v}",
            array_keys($pass1NonZero),
            $pass1NonZero
        ))
        : 'none detected';
    $engineLabel = match ($engine) {
        'azure_full' => 'Azure gpt-4o',
        'gpu' => 'GPU (cuttlefish)',
        'regex' => 'Regex only',
        default => 'Azure gpt-4o-mini',
    };
    $trace = [
        $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}. Engine: {$engineLabel}.", 'complete'),
        $this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'),
    ];
    // Pass 2 — LLM semantic scan
    $finalRedacted = $preRedacted;
    $pass2Counts = [];
    $llmDeployment = null;
    // True only when pass 2 actually produced a result (not skipped, no error).
    $llmApplied = false;
    $llmResult = $this->llmRedactionPass(
        $preRedacted, $language, $aliases, $engine,
        $keepOfficials, $exemptNames,
        $doNames, $doOrgs, $doPlaces, $doDob
    );
    if (!empty($llmResult['skipped'])) {
        $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'not configured') . '.', 'warning');
    } elseif (!empty($llmResult['error'])) {
        $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning');
    } else {
        $llmApplied = true;
        $entities = $llmResult['entities'] ?? [];
        $llmDeployment = $llmResult['deployment'] ?? null;
        $applied = 0;
        foreach ($entities as $entity) {
            if (!is_array($entity)) {
                continue;
            }
            $original = (string)($entity['original'] ?? '');
            $type = (string)($entity['type'] ?? 'other');
            $tag = (string)($entity['tag'] ?? '[IDENTIFIER]');
            // Skip empty strings and anything that is already a bracket tag from pass 1.
            if ($original === '' || str_starts_with($original, '[')) {
                continue;
            }
            // Accept "[TAG]" or "[ROLE: Name]"; anything else is replaced by a safe generic tag.
            if (!preg_match('/^\[[A-Za-z0-9_\- ]+(?::\s*[^\]]+)?\]$/', $tag)) {
                $tag = '[IDENTIFIER]';
            }
            if (str_contains($finalRedacted, $original)) {
                $finalRedacted = str_replace($original, $tag, $finalRedacted);
                $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1;
                $applied++;
            }
        }
        $pass2Detail = $applied > 0
            ? "{$applied} additional: " . implode(', ', array_map(static fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts))
            : 'no additional entities found';
        $trace[] = $this->trace('Pass 2 — LLM semantic scan', "{$engineLabel} reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete');
    }
    // Combine per-type counts by summing. array_merge() would let a pass-2 count overwrite the
    // pass-1 count whenever both passes report the same type.
    $allCounts = $pass1Counts;
    foreach ($pass2Counts as $type => $count) {
        $allCounts[$type] = ($allCounts[$type] ?? 0) + $count;
    }
    // Apply output format post-processing
    if ($outputFormat === 'generic') {
        $finalRedacted = $this->applyGenericTags($finalRedacted);
    } elseif ($outputFormat === 'pseudonym') {
        $finalRedacted = $this->applyPseudonymization($finalRedacted, $allCounts);
    }
    $categories = array_keys(array_filter($allCounts, static fn($v): bool => $v > 0));
    $trace[] = $this->trace('Output format', match ($outputFormat) {
        'generic' => 'All identifiers normalised to generic tags ([PERSON], [ORG], etc.).',
        'pseudonym' => 'Identifiers replaced with plausible pseudonymous values.',
        default => 'Contextual role tags used (e.g. [FATHER], [JUDGE: Name]).',
    }, 'complete');
    $trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning');
    $trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete');
    return [
        'tool' => 'redact',
        'mode' => $mode,
        'region' => $region,
        'engine_used' => $engineLabel,
        'output_format' => $outputFormat,
        // Only mention the semantic scan when pass 2 really ran.
        'what_we_found' => "Applied {$region} pattern pack" . ($llmApplied ? " and {$engineLabel} semantic scan" : '') . '.',
        'redacted_text' => $finalRedacted,
        'detected_entity_categories' => $categories,
        'entity_counts' => $allCounts,
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
        'what_remains_uncertain' => ['Human review is still recommended for contextual identification.'],
        'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.',
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => 1,
            'source_count' => 1,
            'deployment' => $llmDeployment ?? $engineLabel,
        ],
        'disclaimer' => 'Privacy support tool. Review before disclosure.',
    ];
}
/**
 * Loads the 'family-legal' package and verifies the client may use it.
 *
 * Aborts with 503 when the package is missing/inactive or the client has no active subscription.
 *
 * @param int $clientId Client whose subscription is checked.
 *
 * @return array The package row returned by dbnToolsFetchPackage().
 */
private function requireFamilyPackage(int $clientId): array
{
    $package = dbnToolsFetchPackage('family-legal');
    $isUsable = $package && !empty($package['is_active']);
    if (!$isUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }
    $packageId = (int)$package['id'];
    $subscribed = dbnToolsHasActiveSubscription($clientId, $packageId);
    if (!$subscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }
    return $package;
}
/**
 * Sends a prompt with the legal JSON system prompt and returns the decoded JSON object.
 *
 * Aborts with 502 when the reply cannot be decoded into a non-empty JSON object.
 *
 * @param string $prompt    User-role prompt text.
 * @param string $language  Language passed to legalJsonSystemPrompt().
 * @param int    $maxTokens Completion token cap.
 *
 * @return array Decoded JSON object.
 */
private function runJsonTool(string $prompt, string $language, int $maxTokens): array
{
    $messages = [
        ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language)],
        ['role' => 'user', 'content' => $prompt],
    ];
    $options = [
        'json' => true,
        'temperature' => 0.1,
        'max_tokens' => $maxTokens,
    ];
    $decoded = $this->azure->decodeJsonObject($this->azure->chatText($messages, $options));
    if ($decoded) {
        return $decoded;
    }
    dbnToolsAbort('Azure OpenAI did not return valid structured JSON.', 502, 'azure_invalid_json');
    return $decoded;
}
/**
 * Returns the shared system prompt that enforces source-grounded, JSON-only answers.
 *
 * @param string $language 'no' makes the prompt request Norwegian; any other value requests English.
 *
 * @return string System prompt text.
 */
private function legalJsonSystemPrompt(string $language): string
{
    $locale = 'English';
    if ($language === 'no') {
        $locale = 'Norwegian';
    }
    $systemPrompt = <<<PROMPT
You are Do Better Norge Legal Tools in a source-grounded legal preparation workflow.
Use the DBN legal guardrails:
- Answer only from provided source excerpts or pasted text.
- Treat your role as legal information and issue-spotting, not final legal advice.
- Never invent statutes, paragraph numbers, case names, citations, parties, dates, or sources.
- If evidence is insufficient, say so plainly.
- Respond in {$locale}.
- Return valid JSON only. No markdown fences.
PROMPT;
    return $systemPrompt;
}
/**
 * Renders search hits as a numbered plain-text evidence block for an LLM prompt.
 *
 * Each hit contributes a title line, an optional section line, a corpus/package line and an
 * excerpt line; all lines are joined with "\n".
 *
 * @param array $hits Hits as produced by sourceFromChunk().
 *
 * @return string Newline-joined evidence text (empty string for no hits).
 */
private function buildEvidenceContext(array $hits): string
{
    $rendered = [];
    foreach ($hits as $position => $hit) {
        $number = $position + 1;
        $block = ["[{$number}] Title: " . ($hit['title'] ?? 'Untitled')];
        if (!empty($hit['section'])) {
            $block[] = "Section: " . $hit['section'];
        }
        $block[] = "Corpus/package: " . ($hit['package_or_corpus'] ?? 'unknown');
        $block[] = "Excerpt: " . ($hit['excerpt'] ?? '');
        array_push($rendered, ...$block);
    }
    return implode("\n", $rendered);
}
/**
 * Normalises the model-supplied evidence trail into a list of arrays.
 *
 * A non-empty array trail is kept, minus any non-array entries. Otherwise a trail is synthesised
 * from the first four hits (title, citation = title, shortened excerpt).
 *
 * @param mixed $trail Value of the model's "evidence_trail" key.
 * @param array $hits  Hits as produced by sourceFromChunk().
 *
 * @return array Evidence trail entries.
 */
private function normalizeEvidenceTrail(mixed $trail, array $hits): array
{
    $hasModelTrail = is_array($trail) && $trail;
    if ($hasModelTrail) {
        $arrayEntries = array_filter($trail, 'is_array');
        return array_values($arrayEntries);
    }
    $leadingHits = array_slice($hits, 0, 4);
    return array_map(
        static function (array $hit): array {
            return [
                'title' => $hit['title'],
                'citation' => $hit['title'],
                'why_it_matters' => dbnToolsExcerpt($hit['excerpt'], 180),
            ];
        },
        $leadingHits
    );
}
/**
 * Maps a raw retrieval chunk onto the public "source hit" shape.
 *
 * @param array       $chunk      Chunk row from the pipeline or the SQL fallback.
 * @param string|null $docSummary Document summary; when given it is used as the excerpt instead of the chunk text.
 *
 * @return array Source hit; keys missing from the chunk are null.
 */
private function sourceFromChunk(array $chunk, ?string $docSummary = null): array
{
    $rawExcerpt = dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620);
    $score = null;
    if (isset($chunk['similarity'])) {
        $score = round((float)$chunk['similarity'], 4);
    }
    $source = [
        'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
        'excerpt' => $docSummary ?? $rawExcerpt,
        'chunk_text' => $rawExcerpt,
        'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Norge'),
        'score' => $score,
        'document_id' => isset($chunk['document_id']) ? (int)$chunk['document_id'] : null,
        'chunk_id' => isset($chunk['id']) ? (int)$chunk['id'] : null,
        'section' => $chunk['section_title'] ?? null,
        'authority_type' => $chunk['authority_type'] ?? null,
        'jurisdiction' => $chunk['jurisdiction'] ?? null,
    ];
    // Temporal annotations (present when temporal_mode = 'legal_conservative')
    $temporalKeys = [
        'temporal_state',
        'temporal_kind',
        'temporal_reason',
        'currentness_warning',
        'valid_from',
        'valid_until',
        'is_current_version',
    ];
    foreach ($temporalKeys as $key) {
        $source[$key] = $chunk[$key] ?? null;
    }
    return $source;
}
/**
 * Looks up stored summaries for the given document ids.
 *
 * This is a best-effort lookup: any failure is logged and an empty map is returned so the caller
 * can fall back to raw chunk excerpts.
 *
 * @param array $docIds Document ids to look up.
 *
 * @return array Map of document_id => summary for documents that have a non-empty summary.
 */
private function fetchDocSummaries(array $docIds): array
{
    if (!$docIds) {
        return [];
    }
    try {
        $db = dbnToolsRagDb();
        $ids = array_values($docIds);
        $placeholders = implode(',', array_fill(0, count($ids), '?'));
        $stmt = $db->prepare(
            'SELECT document_id, summary FROM doc_summaries'
            . " WHERE document_id IN ({$placeholders}) AND summary != ''"
        );
        $stmt->execute($ids);
        return array_column($stmt->fetchAll(PDO::FETCH_ASSOC), 'summary', 'document_id');
    } catch (Throwable $e) {
        // Log like the sibling fallback searches do instead of failing silently.
        error_log('DBN tools doc summary lookup failed: ' . $e->getMessage());
        return [];
    }
}
/**
 * Grades citation confidence from the number of hits and their best numeric score.
 *
 * 'low' for no hits; 'high' for at least three hits whose best score is >= 0.35; 'medium' otherwise.
 *
 * @param array $hits Hits as produced by sourceFromChunk().
 *
 * @return string 'low', 'medium' or 'high'.
 */
private function citationConfidence(array $hits): string
{
    $hitCount = count($hits);
    if ($hitCount === 0) {
        return 'low';
    }
    $numericScores = [];
    foreach (array_map(static fn(array $hit) => $hit['score'] ?? null, $hits) as $score) {
        if (is_numeric($score)) {
            $numericScores[] = $score;
        }
    }
    $best = $numericScores === [] ? 0 : max($numericScores);
    $isHigh = $hitCount >= 3 && $best >= 0.35;
    return $isHigh ? 'high' : 'medium';
}
/**
 * SQL keyword fallback: private corpus first, then the shared package, capped at $limit rows.
 *
 * Each source is queried independently; a failure in one is logged and does not prevent the other
 * from contributing results.
 *
 * @param int    $clientId Client whose private corpus is searched.
 * @param array  $package  Package row used to scope the shared search.
 * @param string $query    Raw search text.
 * @param int    $limit    Maximum number of rows returned.
 *
 * @return array Chunk rows, private results first.
 */
private function fallbackKeywordSearch(int $clientId, array $package, string $query, int $limit): array
{
    $collected = [];
    try {
        $privateRows = $this->fallbackPrivateSearch($clientId, $query, $limit);
        $collected = array_merge($collected, $privateRows);
    } catch (Throwable $privateError) {
        error_log('DBN tools private fallback failed: ' . $privateError->getMessage());
    }
    try {
        // Always request at least one shared row, even when private results already fill the limit.
        $stillNeeded = max(1, $limit - count($collected));
        $sharedRows = $this->fallbackSharedSearch($package, $query, $stillNeeded);
        $collected = array_merge($collected, $sharedRows);
    } catch (Throwable $sharedError) {
        error_log('DBN tools shared fallback failed: ' . $sharedError->getMessage());
    }
    return array_slice($collected, 0, $limit);
}
/**
 * LIKE-based keyword search over the client's private chunks and document titles.
 *
 * @param int    $clientId Client whose chunks are searched.
 * @param string $query    Raw search text; reduced to terms by searchTerms().
 * @param int    $limit    SQL LIMIT.
 *
 * @return array Chunk rows tagged with a fixed similarity of 0.25 and source_type 'private';
 *               empty when the query yields no usable terms.
 */
private function fallbackPrivateSearch(int $clientId, string $query, int $limit): array
{
    $db = dbnToolsDb();
    $terms = $this->searchTerms($query);
    if (!$terms) {
        return [];
    }
    $clauses = [];
    $params = [':client_id' => $clientId];
    foreach (array_values($terms) as $i => $term) {
        // PDO only allows a named placeholder to appear once per statement unless emulated
        // prepares are on, so the content and title comparisons get separate placeholders.
        $contentKey = ':content_term' . $i;
        $titleKey = ':title_term' . $i;
        $clauses[] = "(cc.content LIKE {$contentKey} OR cd.title LIKE {$titleKey})";
        $like = '%' . $term . '%';
        $params[$contentKey] = $like;
        $params[$titleKey] = $like;
    }
    $sql = 'SELECT cc.id, cc.document_id, cc.content, cd.title AS document_title, cd.category'
        . ' FROM client_chunks cc'
        . ' JOIN client_documents cd ON cc.document_id = cd.id'
        . ' WHERE cc.client_id = :client_id AND cd.status = "ready" AND (' . implode(' OR ', $clauses) . ')'
        . ' LIMIT ' . (int)$limit;
    $stmt = $db->prepare($sql);
    $stmt->execute($params);
    // Tag rows without a by-reference loop, so no dangling reference is left behind.
    return array_map(
        static function (array $row): array {
            $row['similarity'] = 0.25;
            $row['source_name'] = 'Do Better Norge private corpus';
            $row['source_type'] = 'private';
            return $row;
        },
        $stmt->fetchAll(PDO::FETCH_ASSOC)
    );
}
/**
 * LIKE-based keyword search over the shared corpus, scoped by the package's filters.
 *
 * The package's corpus_id, category_filter and language_filter (JSON lists) are applied when set.
 *
 * @param array  $package Package row.
 * @param string $query   Raw search text; reduced to terms by searchTerms().
 * @param int    $limit   SQL LIMIT.
 *
 * @return array Chunk rows tagged with a fixed similarity of 0.2 and source_type 'package';
 *               empty when the query yields no usable terms.
 */
private function fallbackSharedSearch(array $package, string $query, int $limit): array
{
    $ragDb = dbnToolsRagDb();
    $terms = $this->searchTerms($query);
    if (!$terms) {
        return [];
    }
    // Decode a JSON filter column into a plain list. A value that is not a JSON array yields no
    // filter, and keyed arrays are flattened so they cannot disturb the positional parameters.
    $decodeList = static function (mixed $raw): array {
        $decoded = json_decode((string)($raw ?? '[]'), true);
        return is_array($decoded) ? array_values($decoded) : [];
    };
    $where = ['d.status = "ready"'];
    $params = [];
    if (!empty($package['corpus_id'])) {
        $where[] = 'd.corpus_id = ?';
        $params[] = (int)$package['corpus_id'];
    }
    $cats = $decodeList($package['category_filter'] ?? '[]');
    if ($cats) {
        $where[] = 'd.category IN (' . implode(',', array_fill(0, count($cats), '?')) . ')';
        $params = array_merge($params, $cats);
    }
    $langs = $decodeList($package['language_filter'] ?? '[]');
    if ($langs) {
        $where[] = 'd.language IN (' . implode(',', array_fill(0, count($langs), '?')) . ')';
        $params = array_merge($params, $langs);
    }
    $termClauses = [];
    foreach ($terms as $term) {
        $termClauses[] = '(c.content LIKE ? OR d.title LIKE ?)';
        $like = '%' . $term . '%';
        $params[] = $like;
        $params[] = $like;
    }
    $where[] = '(' . implode(' OR ', $termClauses) . ')';
    $sql = 'SELECT c.id, c.document_id, c.content, c.section_title, d.title AS document_title,'
        . ' d.category, d.language'
        . ' FROM chunks c'
        . ' JOIN documents d ON c.document_id = d.id'
        . ' WHERE ' . implode(' AND ', $where)
        . ' LIMIT ' . (int)$limit;
    $stmt = $ragDb->prepare($sql);
    $stmt->execute($params);
    $sourceName = (string)($package['name'] ?? 'family-legal');
    // Tag rows without a by-reference loop, so no dangling reference is left behind.
    return array_map(
        static function (array $row) use ($sourceName): array {
            $row['similarity'] = 0.2;
            $row['source_name'] = $sourceName;
            $row['source_type'] = 'package';
            return $row;
        },
        $stmt->fetchAll(PDO::FETCH_ASSOC)
    );
}
/**
 * Splits a query into at most six distinct lowercase search terms.
 *
 * The query is lowercased and split on anything that is not a letter or digit; tokens shorter
 * than three characters and a small English/Norwegian stop-word list are discarded.
 *
 * @param string $query Raw search text.
 *
 * @return array List of terms in first-seen order.
 */
private function searchTerms(string $query): array
{
    $stopWords = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og'];
    $tokens = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: [];
    $usable = array_filter(
        $tokens,
        static fn(string $token): bool => mb_strlen($token, 'UTF-8') >= 3 && !in_array($token, $stopWords, true)
    );
    $distinct = array_values(array_unique($usable));
    return array_slice($distinct, 0, 6);
}
/**
 * Trims pasted text and enforces the length bounds for paste-based tools.
 *
 * Aborts with 422 when the trimmed text is shorter than 20 characters or longer than
 * self::MAX_PASTE_CHARS.
 *
 * @param string $text Raw pasted text.
 *
 * @return string Trimmed text.
 */
private function requirePasteText(string $text): string
{
    $trimmed = trim($text);
    $length = mb_strlen($trimmed, 'UTF-8');
    if ($length < 20) {
        dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short');
    }
    if ($length > self::MAX_PASTE_CHARS) {
        dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long');
    }
    return $trimmed;
}
/**
 * Pass-1 redaction: applies deterministic regex replacements and counts matches per type.
 *
 * Order: the region's pattern pack, role-label names ("Barn: X"), child-identifier phrases
 * ("barnet heter X"), and in strict mode any pair of capitalised words.
 *
 * @param string $text   Text to redact.
 * @param string $mode   'strict' enables the capitalised-word-pair rule.
 * @param string $region Pattern pack to load via getPatternPack().
 *
 * @return array Two-element list: [redacted text, map of type => match count].
 */
private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array
{
    $counts = [];
    // Replaces every match of $pattern with $token and counts it under $type.
    // If preg_replace_callback() fails (returns null) the text is left unchanged.
    $replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void {
        $text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string {
            $counts[$type] = ($counts[$type] ?? 0) + 1;
            return $token;
        }, $text) ?? $text;
    };
    foreach ($this->getPatternPack($region) as $entry) {
        $replace($entry['pattern'], $entry['type'], $entry['replacement']);
    }
    // Structured role-label names (Barn: X, Mother: X, etc.) — universal
    $text = preg_replace_callback(
        '/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu',
        function (array $m) use (&$counts): string {
            $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
            return $m[1] . ': [PERSON]';
        },
        $text
    ) ?? $text;
    // Child-identifier phrases ("barnet heter X", "child named X") — universal.
    // Case-insensitivity is limited to the keyword and the naming verb. With a global /i flag the
    // name class [A-ZÆØÅ] also matched lowercase, so ordinary prose such as "child was" or
    // "barnet har" was replaced. A bare word after the keyword must now start with a capital;
    // after an explicit naming verb any casing is still accepted.
    $replace(
        '/\b(?i:barnet|child|sønn|son|datter|daughter)\s+(?:(?i:heter|named|called)\s+(?i:[A-ZÆØÅ])|[A-ZÆØÅ])[\p{L}æøåÆØÅ\-]{2,}\b/u',
        'person_or_child_name',
        '[CHILD_IDENTIFIER]'
    );
    if ($mode === 'strict') {
        // Two adjacent capitalised words are treated as a personal name.
        $replace('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\s+[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\b/u', 'person_or_child_name', '[PERSON]');
    }
    return [$text, $counts];
}
/**
 * Returns the ordered regex pattern pack for a redaction region.
 *
 * Entries are applied sequentially by the caller, so ORDER MATTERS: a
 * pattern only sees text that earlier patterns left untouched. The pack is
 * therefore ordered from most specific to most generic:
 *
 *   email -> dates -> region-specific IDs / international phones
 *         -> Norwegian FNR / 8-digit phone / address
 *         -> expanded street address -> ECHR extras -> global extras
 *
 * Running the generic 8-digit Norwegian phone pattern earlier would swallow
 * "30.07.2015", "2015-07-30", "2018-2019", the first eight digits of a
 * full Swedish personnummer ("19850101-1234") and the tail of "+44 ..."
 * numbers before their dedicated patterns get a chance to match.
 *
 * Regions are cumulative: 'nordic' < 'european' < 'echr' < anything else
 * (treated as global).
 *
 * @param string $region 'nordic', 'european', 'echr'; any other value selects the global pack.
 *
 * @return list<array{pattern: string, replacement: string, type: string}>
 */
private function getPatternPack(string $region): array
{
    // Specific patterns that must run before any generic digit-run pattern.
    $specific = [
        ['pattern' => '/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'replacement' => '[EMAIL]', 'type' => 'email'],
        // Dates — must precede generic numeric patterns.
        // Full dates come before the year-range rule, otherwise "2015-07-30"
        // would be cut into "[DATE]-30" by the range pattern.
        // Norwegian DD.MM.YYYY and DD/MM/YYYY
        ['pattern' => '/(?<!\d)(?:0?[1-9]|[12]\d|3[01])[.\/](?:0?[1-9]|1[0-2])[.\/](?:19|20)\d{2}(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
        // ISO YYYY-MM-DD
        ['pattern' => '/(?<!\d)(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
        // Year range (e.g. 2011/2012, 2018-2019)
        ['pattern' => '/(?<!\d)(?:19|20)\d{2}\s*[\/\-–—]\s*(?:19|20)?\d{2}(?!\d)/u', 'replacement' => '[DATE]', 'type' => 'date'],
        // DD. Month YYYY (e.g. "30. juli 2015") and Month YYYY (Norwegian + English)
        ['pattern' => '/\b(?:\d{1,2}\.?\s+)?(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember|January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+(?:19|20)\d{2}\b/iu', 'replacement' => '[DATE]', 'type' => 'date'],
        // Year after a Norwegian/English temporal preposition. \K drops the
        // preposition from the reported match so only the year is replaced.
        // A lookbehind is not used here: its branches have different lengths,
        // which PCRE2 before 10.43 rejects at compile time (preg_* then
        // returns null and the rule silently does nothing).
        ['pattern' => '/\b(?:i|fra|siden|innen|før|etter|rundt|omkring|cirka|in|from|since|until|before|after|around|circa)\s\K(?:19|20)\d{2}(?![\d\/\-])/iu', 'replacement' => '[DATE]', 'type' => 'date'],
    ];

    // Generic Norwegian identifiers; these match bare digit runs and so go last
    // among the numeric rules.
    $nordicGeneric = [
        ['pattern' => '/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', 'replacement' => '[FNR]', 'type' => 'fødselsnummer'],
        ['pattern' => '/(?<!\d)(?:\+47[\s.\-]?)?(?:\d[\s.\-]?){8}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
        ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
    ];

    if ($region === 'nordic') {
        return array_merge($specific, $nordicGeneric);
    }

    // European identifiers and international phone numbers. Placed before the
    // generic Norwegian rules so the 8-digit phone pattern cannot bite into them.
    $europeanIds = [
        // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX)
        ['pattern' => '/(?<!\d)\d{6}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
        // Swedish personnummer full (YYYYMMDD-XXXX)
        ['pattern' => '/(?<!\d)\d{8}[-+]\d{4}(?!\d)/u', 'replacement' => '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'],
        // UK National Insurance number (two prefix letters, six digits, suffix letter A–D)
        ['pattern' => '/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', 'replacement' => '[UK_NI]', 'type' => 'uk_ni'],
        // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds)
        ['pattern' => '/(?<!\d)\d{15}(?!\d)/u', 'replacement' => '[FR_INSEE]', 'type' => 'fr_insee'],
        // IBAN, unspaced (2-letter country code + 2 check digits + 4 alphanumerics + 7 digits + up to 16 more alphanumerics)
        ['pattern' => '/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', 'replacement' => '[IBAN]', 'type' => 'iban'],
        // European phone (international prefix for major EU/EEA country codes)
        ['pattern' => '/(?<!\d)\+(?:44|46|45|358|33|49|34|39|31|32|41|43|30|351|353|48|36|420|421|372|371|370|386|385|356|357|40|359|352)[\s.\-]?(?:\d[\s.\-]?){7,12}(?!\d)/u', 'replacement' => '[PHONE]', 'type' => 'phone'],
    ];
    // Street address expanded to European street-type keywords. Kept after the
    // phone rules, as before, so a digit run following e.g. "via" is still
    // classified as a phone number first.
    $europeanAddress = [
        ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'],
    ];

    $european = array_merge($specific, $europeanIds, $nordicGeneric, $europeanAddress);
    if ($region === 'european') {
        return $european;
    }

    $echr = array_merge($european, [
        // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages)
        ['pattern' => '/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', 'replacement' => '[ECHR_APP_NO]', 'type' => 'echr_app_no'],
        // Date of birth stated in judgment context
        ['pattern' => '/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'],
        // Norwegian "f. YYYY" (born YYYY) shorthand
        ['pattern' => '/\bf\.\s*\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'],
        // National ID label patterns in multiple languages
        ['pattern' => '/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*[\w\s\-]+/iu', 'replacement' => '[NAT_ID]', 'type' => 'nat_id'],
    ]);
    if ($region === 'echr') {
        return $echr;
    }

    // global
    return array_merge($echr, [
        // US Social Security Number
        ['pattern' => '/(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/u', 'replacement' => '[SSN]', 'type' => 'ssn'],
        // Document number in context (passport no., ID No., document no.)
        ['pattern' => '/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', 'replacement' => '[DOC_NO]', 'type' => 'doc_no'],
    ]);
}
/**
 * Asks an LLM to find identifying information left over after the regex pass.
 *
 * The model receives the pre-redacted text as the user message and a system
 * prompt that describes the tagging scheme, any alias overrides, exempt
 * names, entity types to skip, and the officials format. It is asked to
 * return a JSON object {"redactions": [...]}; this method does not apply
 * the redactions, it only returns them.
 *
 * @param string $preRedacted   Text that already went through the regex pass.
 * @param string $language      'no' adds a note that the text may be Norwegian or mixed-language.
 * @param array  $aliases       Rows with 'original' and 'alias' keys; each forces a specific tag for a name.
 * @param string $engine        'regex' (skip the LLM), 'gpu' (callGpuLlm()), 'azure_full' (gpt-4o deployment),
 *                              anything else uses the gateway's default chat deployment.
 * @param bool   $keepOfficials Keep officials' names visible inside a [ROLE: Name] tag.
 * @param array  $exemptNames   Names the model must not redact (first 20 are used).
 * @param bool   $doNames       When false, person names are listed as a type to skip.
 * @param bool   $doOrgs        When false, organisation names are listed as a type to skip.
 * @param bool   $doPlaces      When false, place names are listed as a type to skip.
 * @param bool   $doDob         When false, dates (including dates of birth) are listed as a type to skip.
 *
 * @return array Either ['skipped' => true, 'reason' => string], or
 *               ['skipped' => false, 'entities' => array, 'deployment' => string] on success, or
 *               ['skipped' => false, 'entities' => [], 'error' => string] on failure.
 */
private function llmRedactionPass(
    string $preRedacted,
    string $language = 'en',
    array $aliases = [],
    string $engine = 'azure_mini',
    bool $keepOfficials = false,
    array $exemptNames = [],
    bool $doNames = true,
    bool $doOrgs = true,
    bool $doPlaces = true,
    bool $doDob = true
): array {
    if ($engine === 'regex') {
        return ['skipped' => true, 'reason' => 'Regex-only mode selected'];
    }
    if ($engine !== 'gpu') {
        $missing = $this->azure->missingChatConfig();
        if ($missing) {
            return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')'];
        }
    }

    $languageNote = $language === 'no' ? "\n • The document may contain Norwegian or mixed-language content." : '';

    // Build alias block
    $aliasBlock = '';
    if (!empty($aliases)) {
        // Characters that could break out of the quoted/bracketed prompt line.
        $unsafe = ["\n", "\r", '`', '"', '{', '}'];
        $lines = [];
        foreach ($aliases as $a) {
            // mb_substr: truncate by characters, not bytes, so a name containing
            // æ/ø/å is never cut in the middle of a multibyte sequence (which
            // would put invalid UTF-8 into the prompt).
            $orig = str_replace($unsafe, ' ', mb_substr(trim((string)($a['original'] ?? '')), 0, 100, 'UTF-8'));
            $lbl = str_replace($unsafe, ' ', mb_substr(trim((string)($a['alias'] ?? '')), 0, 100, 'UTF-8'));
            if ($orig !== '' && $lbl !== '') {
                $lines[] = " \"{$orig}\" → [{$lbl}]";
            }
        }
        if ($lines) {
            $aliasBlock = "\n\nALIAS OVERRIDES — use these exact replacement tags for these specific names instead of inferring a role:\n" . implode("\n", $lines);
        }
    }

    // Build exempt names block
    $exemptBlock = '';
    if (!empty($exemptNames)) {
        // Cast each entry: under strict_types a non-string element (e.g. an int
        // from decoded JSON) would otherwise raise a TypeError in str_replace().
        $quoted = array_map(
            fn ($n) => '"' . str_replace(['"', "\n"], ['\\"', ' '], (string)$n) . '"',
            array_slice($exemptNames, 0, 20)
        );
        $exemptBlock = "\n\nEXEMPT NAMES — these names must NOT be redacted under any circumstances:\n " . implode(', ', $quoted);
    }

    // Build entity-type restriction note
    $skipTypes = [];
    if (!$doOrgs) {
        $skipTypes[] = 'organisation names';
    }
    if (!$doPlaces) {
        $skipTypes[] = 'place names';
    }
    if (!$doDob) {
        // The prompt below asks for general dates as well as dates of birth, so
        // the skip instruction has to name both; "dates of birth" alone left the
        // model free to keep redacting every other date.
        // NOTE(review): assumes $doDob is the single date-related toggle — confirm against the caller.
        $skipTypes[] = 'dates and dates of birth';
    }
    if (!$doNames) {
        $skipTypes[] = 'person names';
    }
    $skipNote = $skipTypes ? "\n\nSKIP these entity types — do NOT redact them: " . implode(', ', $skipTypes) . '.' : '';

    // Build officials note
    $officialsNote = '';
    if ($keepOfficials) {
        $officialsNote = "\n\nOFFICIALS — for persons identified as JUDGE, EXPERT_WITNESS, or CASEWORKER in an official capacity: do NOT replace their name with a plain bracket tag. Instead use the format [ROLE: Name], e.g. [JUDGE: Andersen] or [EXPERT_WITNESS: Dr. Larsen]. Their name must remain visible inside the tag.";
    }

    $allowedTypesNote = '';
    if (!$doNames) {
        $allowedTypesNote = "\n\nDo NOT include person_name entries in your output.";
    }

    // The heredoc body and its closing marker stay at column 0 so the prompt
    // text is emitted exactly as written.
    $system = <<<PROMPT
You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS].
Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates and dates of birth (including soft references like "i 2015", "august 2018", "rundt 2011/2012", "spring of 2019"), and identifying descriptions.
STEP 1 — For person names: identify each individual and infer their role or relationship from context.
Assign each person a consistent contextual tag used for every occurrence of their name:
• Family roles: FATHER, MOTHER, CHILD, CHILD_1, CHILD_2, GRANDPARENT, SIBLING
• Professional roles: ATTORNEY, JUDGE, CASEWORKER, EXPERT_WITNESS
• Generic fallback: PERSON_1, PERSON_2 (use only when role cannot be determined)
The same individual MUST receive the same tag every time they appear.{$aliasBlock}{$exemptBlock}{$officialsNote}{$skipNote}{$allowedTypesNote}
Return ONLY a valid JSON object:
{"redactions":[{"original":"exact text as it appears","type":"person_name","tag":"[FATHER]"}]}
Allowed types and their tag format:
person_name → contextual role tag e.g. [FATHER], [CHILD_1], [ATTORNEY] (or alias tag if provided above)
org → [ORG]
place → [PLACE]
date_of_birth → [DOB]
date → [DATE] (years, year ranges, month+year, soft temporal references — e.g. "i 2015" → "i [DATE]", "rundt 2011/2012" → "rundt [DATE]")
other → [IDENTIFIER]
Rules:
• Include only text that appears verbatim in the input. Do not invent or paraphrase.
• The same person MUST get the same tag every time they appear.
• If nothing needs redacting, return {"redactions":[]}.
• Do not redact text already inside [BRACKETS].
• Legal citations, statute names, article numbers, and institution names (e.g. "the European Court of Human Rights", "Barnevernloven § 4-12") are NOT PII.
• Short common words, conjunctions, and prepositions are NOT PII.{$languageNote}
PROMPT;

    $messages = [
        ['role' => 'system', 'content' => $system],
        ['role' => 'user', 'content' => $preRedacted],
    ];
    $chatOptions = ['temperature' => 0.1, 'max_tokens' => 8000, 'json' => true, 'timeout' => 90];

    try {
        if ($engine === 'gpu') {
            $response = $this->callGpuLlm($messages, $chatOptions);
            $deployLabel = 'GPU (cuttlefish)';
        } elseif ($engine === 'azure_full') {
            $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOptions);
            $deployLabel = 'gpt-4o';
        } else {
            $response = $this->azure->chat($messages, $chatOptions);
            $deployLabel = $this->azure->chatDeployment();
        }

        $content = (string)($response['choices'][0]['message']['content'] ?? '');
        $json = $this->azure->decodeJsonObject($content);
        if (!is_array($json) || !array_key_exists('redactions', $json)) {
            return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure'];
        }

        return [
            'skipped' => false,
            'entities' => is_array($json['redactions']) ? $json['redactions'] : [],
            'deployment' => $deployLabel,
        ];
    } catch (Throwable $e) {
        // Best-effort pass: a failed LLM call is logged and reported, never fatal.
        error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage());
        return ['skipped' => false, 'entities' => [], 'error' => $e->getMessage()];
    }
}
/**
 * Sends a chat-completion request to the GPU-hosted LiteLLM endpoint.
 *
 * Endpoint, API key and model can be overridden with the environment
 * variables DBN_GPU_LLM_URL, DBN_GPU_LLM_API_KEY and DBN_GPU_LLM_MODEL;
 * when a variable is unset or empty the previous built-in value is used.
 *
 * Uses cURL when available, otherwise a stream-context POST.
 *
 * @param array $messages Chat messages (role/content rows) sent as the 'messages' field.
 * @param array $options  Recognised keys: 'timeout' (seconds, default 90), 'temperature' (default 0.1),
 *                        'max_tokens' (default 8000), 'json' (truthy requests a JSON-object response).
 *
 * @return array The decoded JSON response body.
 *
 * @throws RuntimeException When the payload cannot be encoded, the transport fails, the body is
 *                          not JSON, or the HTTP status is outside 200–299.
 */
private function callGpuLlm(array $messages, array $options = []): array
{
    // Reads an environment override, falling back to the built-in default.
    $envOr = static function (string $name, string $default): string {
        $value = getenv($name);
        return is_string($value) && $value !== '' ? $value : $default;
    };

    $url = $envOr('DBN_GPU_LLM_URL', 'http://10.0.1.10:4000/v1/chat/completions');
    // SECURITY(review): this fallback is a credential committed to source control.
    // It is kept only so existing deployments keep working; set DBN_GPU_LLM_API_KEY,
    // rotate the key, and then remove the literal.
    $apiKey = $envOr('DBN_GPU_LLM_API_KEY', 'sk-bnl-litellm-26xR9mK4qvN3wL8sTj7pB2d');
    $model = $envOr('DBN_GPU_LLM_MODEL', 'qwen2.5:14b');
    $timeout = (int)($options['timeout'] ?? 90);

    $payload = [
        'model' => $model,
        'messages' => $messages,
        'temperature' => $options['temperature'] ?? 0.1,
        'max_tokens' => $options['max_tokens'] ?? 8000,
    ];
    if (!empty($options['json'])) {
        $payload['response_format'] = ['type' => 'json_object'];
    }

    try {
        // JSON_THROW_ON_ERROR: without it an encoding failure (e.g. invalid UTF-8
        // in the document) returns false, which was then posted as the body.
        $body = json_encode($payload, JSON_THROW_ON_ERROR | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
    } catch (JsonException $e) {
        throw new RuntimeException('GPU LiteLLM request could not be encoded: ' . $e->getMessage(), 0, $e);
    }

    $headers = [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ];

    if (function_exists('curl_init')) {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_TIMEOUT => $timeout,
        ]);
        $response = curl_exec($ch);
        // Read status and error text before the handle is released.
        $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $err = curl_error($ch);
        curl_close($ch);
        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed: ' . $err);
        }
    } else {
        $ctx = stream_context_create(['http' => [
            'method' => 'POST',
            'header' => implode("\r\n", $headers),
            'content' => $body,
            'timeout' => $timeout,
            // Return the body on 4xx/5xx too, so the error message can be decoded below.
            'ignore_errors' => true,
        ]]);
        $response = @file_get_contents($url, false, $ctx);
        $code = 0;
        // $http_response_header is populated by the HTTP stream wrapper; its
        // first entry is the status line.
        if (isset($http_response_header[0]) && preg_match('/\s(\d{3})\s/', $http_response_header[0], $m)) {
            $code = (int)$m[1];
        }
        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed.');
        }
    }

    $decoded = json_decode($response, true);
    if (!is_array($decoded)) {
        throw new RuntimeException('GPU LiteLLM returned non-JSON response.');
    }
    if ($code < 200 || $code >= 300) {
        $msg = $decoded['error']['message'] ?? ('HTTP ' . $code);
        throw new RuntimeException('GPU LiteLLM error: ' . $msg);
    }

    return $decoded;
}
/**
 * Collapses every contextual role tag into the generic "[PERSON]" tag.
 *
 * Covers [FATHER], [MOTHER], [CHILD] / [CHILD_n], [GRANDPARENT], [SIBLING],
 * [ATTORNEY], [PERSON] / [PERSON_n], and [JUDGE], [CASEWORKER] and
 * [EXPERT_WITNESS] with or without a ": Name" suffix. All other tags are
 * left untouched. If the regex engine reports an error the input is
 * returned unchanged.
 *
 * @param string $text Text containing role tags.
 *
 * @return string Text with role tags replaced by "[PERSON]".
 */
private function applyGenericTags(string $text): string
{
    $roleTagPattern = '/\[(?:FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u';

    $collapsed = preg_replace($roleTagPattern, '[PERSON]', $text);

    return $collapsed ?? $text;
}
/**
 * Replaces redaction tags with realistic-looking placeholder values.
 *
 * - Role/person tags get a name from a fixed list of ten; the same tag always
 *   maps to the same name within one call. With more than ten distinct tags
 *   the list wraps around, so two tags can end up sharing a name.
 * - [PHONE], [EMAIL], [ADDRESS] and [ORG] get a numbered/lettered placeholder
 *   that advances with every occurrence.
 * - [FNR], [PLACE], [DOB], [IBAN] and the foreign-ID tags get one fixed value each.
 * - Tags not listed here (e.g. [DATE]) are left as they are.
 *
 * @param string $text      Redacted text containing bracket tags.
 * @param array  $allCounts Not read by this method; kept for interface compatibility.
 *
 * @return string Text with tags replaced by pseudonyms.
 */
private function applyPseudonymization(string $text, array $allCounts): string
{
    $norwegianNames = [
        'Ola Nordmann', 'Per Hansen', 'Kari Larsen', 'Anne Berg', 'Erik Dahl',
        'Ingrid Holm', 'Lars Moen', 'Silje Bakke', 'Tor Haugen', 'Eva Strand',
    ];
    $nameCursor = 0;
    $phoneBase = 1;
    $emailCursor = 0;
    $addrCursor = 1;
    $orgCursor = 1;
    $personMap = [];

    // Replace named role tags (keeping consistent mapping per unique tag)
    $text = preg_replace_callback(
        '/\[(FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u',
        function (array $m) use (&$nameCursor, &$personMap, $norwegianNames): string {
            $key = $m[1];
            if (!isset($personMap[$key])) {
                $personMap[$key] = $norwegianNames[$nameCursor % count($norwegianNames)];
                $nameCursor++;
            }
            return $personMap[$key];
        },
        $text
    ) ?? $text;

    $text = preg_replace_callback('/\[PHONE\]/', function () use (&$phoneBase): string {
        return sprintf('+47 400 00 %03d', $phoneBase++);
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[EMAIL\]/', function () use (&$emailCursor): string {
        $letter = chr(ord('a') + ($emailCursor % 26));
        $emailCursor++;
        return "person.{$letter}@example.no";
    }, $text) ?? $text;

    // The address and organisation counters are advanced per occurrence, like
    // the phone and email counters. They used to be captured by reference but
    // never incremented, so every address and every organisation in a document
    // collapsed into the same placeholder.
    $text = preg_replace_callback('/\[ADDRESS\]/', function () use (&$addrCursor): string {
        $number = $addrCursor++;
        return "Eksempelveien {$number}, 0001 Oslo";
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[ORG\]/', function () use (&$orgCursor): string {
        $number = $orgCursor++;
        return "Eksempel AS ({$number})";
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[FNR\]/', function (): string {
        return '010100XXXXX';
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[(?:SE_PERSONNUMMER|FR_INSEE|UK_NI|SSN|NAT_ID|DOC_NO|ECHR_APP_NO)\]/', function (): string {
        return '[ID-REDACTED]';
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[PLACE\]/', function (): string {
        return 'Eksempelby';
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[DOB\]/', function (): string {
        return '01.01.0000';
    }, $text) ?? $text;

    $text = preg_replace_callback('/\[IBAN\]/', function (): string {
        return 'NO00 0000 00 00000';
    }, $text) ?? $text;

    return $text;
}
/**
 * Turns a tool-supplied uncertainty value into a short display string.
 *
 * An array is flattened by string-casting each element and joining with
 * single spaces; anything else is cast to string. The result is trimmed.
 * When nothing remains a fixed fallback sentence is returned; otherwise the
 * text is passed through dbnToolsExcerpt() with a limit argument of 220.
 *
 * @param mixed $uncertainty String, array of string-castable values, or null.
 *
 * @return string The excerpt, or the fallback sentence when the input is empty.
 */
private function uncertaintySummary(mixed $uncertainty): string
{
    $flattened = is_array($uncertainty)
        ? implode(' ', array_map('strval', $uncertainty))
        : $uncertainty;

    $summary = trim((string)$flattened);
    if ($summary === '') {
        return 'No additional uncertainty was supplied by the tool.';
    }

    return dbnToolsExcerpt($summary, 220);
}
/**
 * Builds one trace entry.
 *
 * @param string $label  Short step name.
 * @param string $detail Description of what the step did.
 * @param string $status Step status; defaults to 'complete'.
 *
 * @return array{label: string, detail: string, status: string}
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    // compact() emits the keys in the order listed: label, detail, status.
    return compact('label', 'detail', 'status');
}
}