azure = $azure ?: new DbnAzureOpenAiGateway(); } public function search( string $query, string $language = 'en', int $limit = 6, string $temporalMode = 'disabled', ?string $asOfDate = null, string $scope = 'both' ): array { $query = trim($query); if (mb_strlen($query, 'UTF-8') < 3) { dbnToolsAbort('Search query must be at least 3 characters.', 422, 'query_too_short'); } $limit = max(1, min(10, $limit)); $temporalMode = in_array($temporalMode, ['legal_conservative', 'disabled'], true) ? $temporalMode : 'disabled'; $scope = in_array($scope, ['shared', 'private', 'both'], true) ? $scope : 'both'; $scopeLabel = match ($scope) { 'private' => 'personal corpus only', 'shared' => 'Legal Library only', default => 'Legal Library + personal corpus', }; $trace = [ $this->trace('Query interpretation', "Searching Do Better Norge {$scopeLabel}.", 'complete'), $this->trace('Search tools used', 'ClientRagPipeline::searchAll with keyword mode.', 'running'), ]; $client = dbnToolsRequireClient(); $package = $this->requireFamilyPackage((int)$client['id']); // Personal corpus client_id from session (may be 0 if user has no linked workspace) $personalClientId = (int)($_SESSION['dbn_tools_client_id'] ?? 0); $chunks = []; $retrievalNote = 'ClientRagPipeline keyword retrieval'; try { dbnToolsBootCaveau(); $gatewayUrl = 'http://10.0.1.10:4000'; try { $config = getConfig(); $configured = trim((string)($config['ai_gateway']['url'] ?? '')); if ($configured !== '') { $gatewayUrl = $configured; } } catch (Throwable $e) { // Retrieval still works in keyword mode without gateway config. } if ($scope === 'private') { // Search only the user's personal corpus if ($personalClientId > 0) { $rag = new ClientRagPipeline($personalClientId, $gatewayUrl, 30); $chunks = $rag->searchAll($query, $limit, null, [ 'search_private' => true, 'search_shared' => false, 'chunk_limit' => $limit, 'search_method' => 'keyword', 'min_private' => 0, ]); } } elseif ($scope === 'shared') { // Search only the shared legal library $rag = new ClientRagPipeline((int)$client['id'], $gatewayUrl, 30); $chunks = $rag->searchAll($query, $limit, null, [ 'search_private' => true, 'search_shared' => true, 'package_ids' => [(int)$package['id']], 'chunk_limit' => $limit, 'search_method' => 'keyword', 'min_private' => 0, 'include_beta_website' => true, ]); } else { // 'both': shared library + personal corpus merged and re-ranked by score $rag = new ClientRagPipeline((int)$client['id'], $gatewayUrl, 30); $sharedChunks = $rag->searchAll($query, $limit, null, [ 'search_private' => true, 'search_shared' => true, 'package_ids' => [(int)$package['id']], 'chunk_limit' => $limit, 'search_method' => 'keyword', 'min_private' => 0, 'include_beta_website' => true, ]); $privateChunks = []; if ($personalClientId > 0) { try { $ragPrivate = new ClientRagPipeline($personalClientId, $gatewayUrl, 30); $privateChunks = $ragPrivate->searchAll($query, $limit, null, [ 'search_private' => true, 'search_shared' => false, 'chunk_limit' => $limit, 'search_method' => 'keyword', 'min_private' => 0, ]); } catch (Throwable $e) { error_log('[search] personal corpus query failed for client ' . $personalClientId . ': ' . $e->getMessage()); } } // Merge by score descending, cap at $limit $merged = array_merge($sharedChunks, $privateChunks); usort($merged, fn($a, $b) => ($b['score'] ?? 0) <=> ($a['score'] ?? 0)); $chunks = array_slice($merged, 0, $limit); } // Apply temporal reranking after retrieval (optional) if ($temporalMode === 'legal_conservative' && !empty($chunks)) { $temporalLayerPath = __DIR__ . '/../../ai-portal/platform/includes/LegalTemporalLayer.php'; if (file_exists($temporalLayerPath)) { require_once $temporalLayerPath; $layer = new LegalTemporalLayer(['temporal_mode' => $temporalMode]); $chunks = $layer->rerank($chunks, $query, $asOfDate); } } } catch (Throwable $e) { $retrievalNote = 'SQL keyword fallback after ClientRagPipeline error'; $trace[] = $this->trace('Search fallback', 'Pipeline retrieval failed; using direct SQL keyword fallback without storing the query.', 'warning'); $chunks = $this->fallbackKeywordSearch((int)$client['id'], $package, $query, $limit); } if (!$chunks) { $fallback = $this->fallbackKeywordSearch((int)$client['id'], $package, $query, $limit); if ($fallback) { $chunks = $fallback; $retrievalNote = 'SQL keyword fallback'; } } $sharedDocIds = []; foreach (array_slice($chunks, 0, $limit) as $chunk) { if (($chunk['source_type'] ?? '') !== 'private' && isset($chunk['document_id'])) { $sharedDocIds[(int)$chunk['document_id']] = true; } } $docSummaries = $sharedDocIds ? $this->fetchDocSummaries(array_keys($sharedDocIds)) : []; $hits = array_map( fn(array $chunk): array => $this->sourceFromChunk( $chunk, ($chunk['source_type'] ?? '') !== 'private' ? ($docSummaries[(int)($chunk['document_id'] ?? 0)] ?? null) : null ), array_slice($chunks, 0, $limit) ); $confidence = $this->citationConfidence($hits); $trace[1] = $this->trace('Search tools used', $retrievalNote . '; returned ' . count($hits) . ' source hit(s).', 'complete'); $trace[] = $this->trace('Evidence found', count($hits) ? 'Retrieved source excerpts for review.' : 'No matching source excerpts were found.', count($hits) ? 'complete' : 'warning'); $trace[] = $this->trace('Citation confidence', ucfirst($confidence) . ' confidence based on source count and retrieval scores.', $confidence === 'low' ? 'warning' : 'complete'); return [ 'tool' => 'search', 'language' => $language, 'what_we_found' => count($hits) ? 'Found source excerpts from the legal corpus.' : 'No matching source excerpts were found.', 'hits' => $hits, 'evidence_trail' => $hits, 'what_remains_uncertain' => count($hits) ? 'Search results still need human review for legal relevance and currentness.' : 'The corpus may not contain enough evidence for this query.', 'next_practical_step' => count($hits) ? 'Open the strongest sources and confirm the cited sections before relying on them.' : 'Try a narrower query with statutory terms, party names, or dates.', 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => count($chunks), 'source_count' => count($hits), 'deployment' => null, 'citation_confidence' => $confidence, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } public function ask(string $question, string $language = 'en'): array { $search = $this->search($question, $language, 7); $hits = $search['hits']; $trace = $search['trace']; if (!$hits) { $trace[] = $this->trace('Synthesis', 'Skipped answer synthesis because no evidence was found.', 'warning'); return [ 'tool' => 'ask', 'language' => $language, 'answer' => match (dbnToolsNormalizeUiLanguage($language)) { 'no' => 'Jeg fant ikke nok kildestøtte i familierettskorpuset til å svare sikkert.', 'uk' => 'Я не знайшов достатньої підтримки в корпусі сімейного права, щоб відповісти безпечно.', 'pl' => 'Nie znalazłem wystarczającego wsparcia źródłowego w korpusie prawa rodzinnego, aby odpowiedzieć bezpiecznie.', default => 'I did not find enough source support in the family-law corpus to answer safely.', }, 'what_we_found' => $search['what_we_found'], 'evidence_trail' => [], 'what_remains_uncertain' => $search['what_remains_uncertain'], 'next_practical_step' => $search['next_practical_step'], 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => 0, 'source_count' => 0, 'deployment' => null, 'citation_confidence' => 'low', ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } $this->azure->requireChat(); $context = $this->buildEvidenceContext($hits); $locale = dbnToolsLanguageName($language); $prompt = <<legalJsonSystemPrompt($language); $raw = $this->azure->chatText([ ['role' => 'system', 'content' => $system], ['role' => 'user', 'content' => $prompt], ], [ 'json' => true, 'temperature' => 0.15, 'max_tokens' => 1300, ]); $json = $this->azure->decodeJsonObject($raw); if (!$json) { $json = [ 'answer' => $raw, 'what_we_found' => 'Azure returned a plain-text answer based on the retrieved excerpts.', 'evidence_trail' => [], 'what_remains_uncertain' => ['The response format could not be validated as structured JSON.'], 'next_practical_step' => 'Review the source excerpts manually before relying on the answer.', ]; } $trace[] = $this->trace('Synthesis', 'Azure OpenAI generated an answer using only the retrieved source excerpts.', 'complete'); $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'); $trace[] = $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the evidence trail.'), 'complete'); return [ 'tool' => 'ask', 'language' => $language, 'answer' => (string)($json['answer'] ?? ''), 'what_we_found' => (string)($json['what_we_found'] ?? ''), 'evidence_trail' => $hits, 'citation_notes' => $this->normalizeEvidenceTrail($json['evidence_trail'] ?? [], $hits), 'sources' => $hits, 'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($json['next_practical_step'] ?? ''), 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => count($hits), 'source_count' => count($hits), 'deployment' => $this->azure->chatDeployment(), 'citation_confidence' => $search['trace_metadata']['citation_confidence'] ?? 'medium', ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } public function summarize(string $text, string $language = 'en'): array { $text = $this->requirePasteText($text); $this->azure->requireChat(); $locale = dbnToolsLanguageName($language); $prompt = <<runJsonTool($prompt, $language, 1300); $trace = [ $this->trace('Query interpretation', 'Summarize pasted text without saving the text or output.', 'complete'), $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete'), $this->trace('Evidence found', 'Evidence trail is limited to the pasted text supplied in this request.', 'complete'), $this->trace('Citation confidence', 'Medium confidence for factual extraction; no external legal source verification was performed.', 'warning'), $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'), $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the summary against the original text.'), 'complete'), ]; return [ 'tool' => 'summarize', 'language' => $language, 'what_we_found' => (string)($json['what_we_found'] ?? ''), 'key_facts' => $json['key_facts'] ?? [], 'dates' => $json['dates'] ?? [], 'parties' => $json['parties'] ?? [], 'legal_references_detected' => $json['legal_references_detected'] ?? [], 'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']], 'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($json['next_practical_step'] ?? ''), 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => 1, 'source_count' => 1, 'deployment' => $this->azure->chatDeployment(), ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } public function timeline( string $text, string $language = 'en', string $engine = 'azure_mini', string $focus = 'all', string $confidenceFilter = 'all', bool $includeRelative = true, bool $includeBackground = true, string $userNotes = '' ): array { $text = $this->requirePasteText($text); $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini'; $focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all'; if ($engine !== 'gpu') { $this->azure->requireChat(); } $locale = dbnToolsLanguageName($language); $focusInstruction = match ($focus) { 'deadlines' => "\nFocus specifically on: legal deadlines, filing dates, response windows, appeal periods, and statutory time limits. Deprioritise narrative events with no legal deadline significance.", 'hearings' => "\nFocus specifically on: court hearings, tribunal sessions, mediation sessions, formal meetings, and hearing-related procedural dates.", 'cps' => "\nFocus specifically on: CPS (Barnevernet) interventions, home visits, case reviews, acute measures (akuttvedtak), and Fylkesnemnda proceedings.", default => '', }; $backgroundInstruction = $includeBackground ? "\nAlso extract BACKGROUND and NARRATIVE events: dates embedded in contextual paragraphs, historical facts, year-only references, and approximate years (e.g. \"rundt 2011/2012\", \"David ble født den 30.07.2015\", \"familien i 2015\"). These are valid timeline events even when they appear in introductory or background text — do NOT skip them." : "\nDo NOT include purely historical background or narrative context dates. Focus only on operational events, deadlines, and milestones that are directly actionable in the case."; $relativeInstruction = $includeRelative ? '' : "\nDo NOT extract relative, recurring, or conditional date references — extract only events with determinable absolute dates (date_type=absolute)."; $userNotesBlock = $userNotes !== '' ? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---" : ''; $prompt = <<legalJsonSystemPrompt($language); $messages = [ ['role' => 'system', 'content' => $system], ['role' => 'user', 'content' => $prompt], ]; $chatOptions = ['json' => true, 'temperature' => 0.1, 'max_tokens' => ($engine === 'azure_full' ? 8000 : 4000), 'timeout' => 120]; $deployLabel = $this->azure->chatDeployment(); try { if ($engine === 'gpu') { $response = $this->callGpuLlm($messages, $chatOptions); $deployLabel = 'GPU (cuttlefish)'; } elseif ($engine === 'azure_full') { $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOptions); $deployLabel = 'gpt-4o'; } else { $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOptions); $deployLabel = 'gpt-4o-mini'; } } catch (Throwable $e) { dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error'); } $raw = (string)($response['choices'][0]['message']['content'] ?? ''); $json = $this->azure->decodeJsonObject($raw); if (!$json) { dbnToolsAbort('The selected engine did not return valid structured JSON.', 502, 'llm_invalid_json'); } $events = is_array($json['events'] ?? null) ? $json['events'] : []; // Post-filter: confidence if ($confidenceFilter === 'high_medium') { $events = array_values(array_filter($events, fn($ev) => ($ev['confidence'] ?? 'low') !== 'low')); } // Post-filter: relative/recurring date types if (!$includeRelative) { $events = array_values(array_filter($events, fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute')); } $engineLabel = match ($engine) { 'gpu' => 'GPU (cuttlefish)', 'azure_full' => 'gpt-4o', default => $deployLabel ?? $this->azure->chatDeployment(), }; $focusLabel = match ($focus) { 'deadlines' => 'legal deadlines', 'hearings' => 'court hearings', 'cps' => 'CPS milestones', default => 'all events', }; $trace = [ $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Without saving the text or output.", 'complete'), $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete'), $this->trace('Evidence found', count($events) . ' event(s) identified' . ($confidenceFilter === 'high_medium' ? ' (low-confidence filtered out)' : '') . '.', count($events) ? 'complete' : 'warning'), $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text.', 'complete'), $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'), $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Verify dates against original documents.'), 'complete'), ]; return [ 'tool' => 'timeline', 'language' => $language, 'what_we_found' => (string)($json['what_we_found'] ?? ''), 'events' => $events, 'evidence_trail' => $json['evidence_trail'] ?? [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']], 'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($json['next_practical_step'] ?? ''), 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => count($events), 'source_count' => 1, 'deployment' => $engineLabel, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } public function redact( string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en', array $aliases = [], string $engine = 'azure_mini', string $outputFormat = 'contextual', bool $keepOfficials = false, array $exemptNames = [], array $redactTypes = [] ): array { $text = $this->requirePasteText($text); $mode = $mode === 'strict' ? 'strict' : 'standard'; $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic'; $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'regex'], true) ? $engine : 'azure_mini'; $outputFormat = in_array($outputFormat, ['contextual', 'generic', 'pseudonym'], true) ? $outputFormat : 'contextual'; // Normalise entity-type flags (all on by default) $doNames = ($redactTypes['names'] ?? true) !== false; $doOrgs = ($redactTypes['orgs'] ?? true) !== false; $doPlaces = ($redactTypes['places'] ?? true) !== false; $doDob = ($redactTypes['dob'] ?? true) !== false; // Pass 1 — deterministic regex [$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region); $pass1Total = array_sum($pass1Counts); $pass1Detail = $pass1Total ? implode(', ', array_map( fn($k, $v) => "{$k}: {$v}", array_keys(array_filter($pass1Counts, fn($v): bool => $v > 0)), array_filter($pass1Counts, fn($v): bool => $v > 0) )) : 'none detected'; $engineLabel = match ($engine) { 'azure_full' => 'Azure gpt-4o', 'gpu' => 'GPU (cuttlefish)', 'regex' => 'Regex only', default => 'Azure gpt-4o-mini', }; $trace = [ $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}. Engine: {$engineLabel}.", 'complete'), $this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'), ]; // Pass 2 — LLM semantic scan $finalRedacted = $preRedacted; $pass2Counts = []; $llmDeployment = null; $redactionMap = []; $llmResult = $this->llmRedactionPass( $preRedacted, $language, $aliases, $engine, $keepOfficials, $exemptNames, $doNames, $doOrgs, $doPlaces, $doDob ); if (!empty($llmResult['skipped'])) { $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'not configured') . '.', 'warning'); } elseif (!empty($llmResult['error'])) { $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning'); } else { $entities = $llmResult['entities'] ?? []; $llmDeployment = $llmResult['deployment'] ?? null; $applied = 0; $redactionMap = []; foreach ($entities as $entity) { if (!is_array($entity)) { continue; } $original = (string)($entity['original'] ?? ''); $type = (string)($entity['type'] ?? 'other'); $tag = (string)($entity['tag'] ?? '[IDENTIFIER]'); if ($original === '' || str_starts_with($original, '[')) { continue; } // Allow [ROLE: Name] format when keepOfficials is on, else require plain bracket tag if (!preg_match('/^\[[A-Za-z0-9_\- ]+(?::\s*[^\]]+)?\]$/', $tag)) { $tag = '[IDENTIFIER]'; } // Try word-boundary match first to avoid partial-word substitutions (e.g. "Per" inside "Persson") $escaped = preg_quote($original, '/'); $replaced = preg_replace('/\b' . $escaped . '\b/u', $tag, $finalRedacted); if ($replaced !== null && $replaced !== $finalRedacted) { $finalRedacted = $replaced; $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1; $applied++; if (!isset($redactionMap[$tag])) { $redactionMap[$tag] = ['originals' => [], 'type' => $type]; } if (!in_array($original, $redactionMap[$tag]['originals'], true)) { $redactionMap[$tag]['originals'][] = $original; } } elseif (str_contains($finalRedacted, $original)) { // Fallback for names adjacent to punctuation or non-word characters $finalRedacted = str_replace($original, $tag, $finalRedacted); $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1; $applied++; if (!isset($redactionMap[$tag])) { $redactionMap[$tag] = ['originals' => [], 'type' => $type]; } if (!in_array($original, $redactionMap[$tag]['originals'], true)) { $redactionMap[$tag]['originals'][] = $original; } } } // Add occurrence counts by scanning the final text foreach ($redactionMap as $tag => &$entry) { $entry['occurrences'] = substr_count($finalRedacted, $tag); } unset($entry); $pass2Detail = $applied > 0 ? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts)) : 'no additional entities found'; $trace[] = $this->trace('Pass 2 — LLM semantic scan', "{$engineLabel} reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete'); } // Apply output format post-processing $allCounts = array_merge($pass1Counts, $pass2Counts); if ($outputFormat === 'generic') { $finalRedacted = $this->applyGenericTags($finalRedacted); } elseif ($outputFormat === 'pseudonym') { $finalRedacted = $this->applyPseudonymization($finalRedacted, $allCounts); } $categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0)); $trace[] = $this->trace('Output format', match ($outputFormat) { 'generic' => 'All identifiers normalised to generic tags ([PERSON], [ORG], etc.).', 'pseudonym' => 'Identifiers replaced with plausible pseudonymous values.', default => 'Contextual role tags used (e.g. [FATHER], [JUDGE: Name]).', }, 'complete'); $trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning'); $trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete'); return [ 'tool' => 'redact', 'mode' => $mode, 'region' => $region, 'engine_used' => $engineLabel, 'output_format' => $outputFormat, 'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment || $engine === 'gpu' ? " and {$engineLabel} semantic scan" : '') . '.', 'redacted_text' => $finalRedacted, 'detected_entity_categories' => $categories, 'entity_counts' => $allCounts, 'redaction_map' => $redactionMap, 'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']], 'what_remains_uncertain' => ['Human review is still recommended for contextual identification.'], 'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.', 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => 1, 'source_count' => 1, 'deployment' => $llmDeployment ?? $engineLabel, ], 'disclaimer' => 'Privacy support tool. Review before disclosure.', ]; } private function requireFamilyPackage(int $clientId): array { $package = dbnToolsFetchPackage('family-legal'); if (!$package || empty($package['is_active'])) { dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable'); } if (!dbnToolsHasActiveSubscription($clientId, (int)$package['id'])) { dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing'); } return $package; } private function runJsonTool(string $prompt, string $language, int $maxTokens): array { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language)], ['role' => 'user', 'content' => $prompt], ], [ 'json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTokens, ]); $json = $this->azure->decodeJsonObject($raw); if (!$json) { dbnToolsAbort('Azure OpenAI did not return valid structured JSON.', 502, 'azure_invalid_json'); } return $json; } private function legalJsonSystemPrompt(string $language): string { $locale = dbnToolsLanguageName($language); return << $hit) { $n = $idx + 1; $lines[] = "[{$n}] Title: " . ($hit['title'] ?? 'Untitled'); if (!empty($hit['section'])) { $lines[] = "Section: " . $hit['section']; } $lines[] = "Corpus/package: " . ($hit['package_or_corpus'] ?? 'unknown'); $lines[] = "Excerpt: " . ($hit['excerpt'] ?? ''); } return implode("\n", $lines); } private function normalizeEvidenceTrail(mixed $trail, array $hits): array { if (!is_array($trail) || !$trail) { return array_map(fn(array $hit): array => [ 'title' => $hit['title'], 'citation' => $hit['title'], 'why_it_matters' => dbnToolsExcerpt($hit['excerpt'], 180), ], array_slice($hits, 0, 4)); } return array_values(array_filter($trail, 'is_array')); } private function sourceFromChunk(array $chunk, ?string $docSummary = null): array { $title = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'); $score = isset($chunk['similarity']) ? round((float)$chunk['similarity'], 4) : null; $rawExcerpt = dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620); return [ 'title' => $title, 'excerpt' => $docSummary ?? $rawExcerpt, 'chunk_text' => $rawExcerpt, 'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Norge'), 'score' => $score, 'document_id' => isset($chunk['document_id']) ? (int)$chunk['document_id'] : null, 'chunk_id' => isset($chunk['id']) ? (int)$chunk['id'] : null, 'section' => $chunk['section_title'] ?? null, 'authority_type' => $chunk['authority_type'] ?? null, 'jurisdiction' => $chunk['jurisdiction'] ?? null, // Temporal annotations (present when temporal_mode = 'legal_conservative') 'temporal_state' => $chunk['temporal_state'] ?? null, 'temporal_kind' => $chunk['temporal_kind'] ?? null, 'temporal_reason' => $chunk['temporal_reason'] ?? null, 'currentness_warning' => $chunk['currentness_warning'] ?? null, 'valid_from' => $chunk['valid_from'] ?? null, 'valid_until' => $chunk['valid_until'] ?? null, 'is_current_version' => $chunk['is_current_version'] ?? null, ]; } private function fetchDocSummaries(array $docIds): array { if (!$docIds) { return []; } try { $db = dbnToolsRagDb(); $placeholders = implode(',', array_fill(0, count($docIds), '?')); $stmt = $db->prepare( "SELECT document_id, summary FROM doc_summaries WHERE document_id IN ({$placeholders}) AND summary != ''" ); $stmt->execute(array_values($docIds)); return array_column($stmt->fetchAll(PDO::FETCH_ASSOC), 'summary', 'document_id'); } catch (Throwable) { return []; } } private function citationConfidence(array $hits): string { if (!$hits) { return 'low'; } $scores = array_values(array_filter(array_map(fn(array $h) => $h['score'] ?? null, $hits), 'is_numeric')); $best = $scores ? max($scores) : 0; if (count($hits) >= 3 && $best >= 0.35) { return 'high'; } if (count($hits) >= 1) { return 'medium'; } return 'low'; } private function fallbackKeywordSearch(int $clientId, array $package, string $query, int $limit): array { $results = []; try { $results = array_merge($results, $this->fallbackPrivateSearch($clientId, $query, $limit)); } catch (Throwable $e) { error_log('DBN tools private fallback failed: ' . $e->getMessage()); } try { $remaining = max(1, $limit - count($results)); $results = array_merge($results, $this->fallbackSharedSearch($package, $query, $remaining)); } catch (Throwable $e) { error_log('DBN tools shared fallback failed: ' . $e->getMessage()); } return array_slice($results, 0, $limit); } private function fallbackPrivateSearch(int $clientId, string $query, int $limit): array { $db = dbnToolsDb(); $terms = $this->searchTerms($query); if (!$terms) { return []; } $clauses = []; $params = [':client_id' => $clientId]; foreach ($terms as $i => $term) { $key = ':term' . $i; $clauses[] = "(cc.content LIKE {$key} OR cd.title LIKE {$key})"; $params[$key] = '%' . $term . '%'; } $sql = 'SELECT cc.id, cc.document_id, cc.content, cd.title AS document_title, cd.category FROM client_chunks cc JOIN client_documents cd ON cc.document_id = cd.id WHERE cc.client_id = :client_id AND cd.status = "ready" AND (' . implode(' OR ', $clauses) . ') LIMIT ' . (int)$limit; $stmt = $db->prepare($sql); $stmt->execute($params); $rows = $stmt->fetchAll(PDO::FETCH_ASSOC); foreach ($rows as &$row) { $row['similarity'] = 0.25; $row['source_name'] = 'Do Better Norge private corpus'; $row['source_type'] = 'private'; } return $rows; } private function fallbackSharedSearch(array $package, string $query, int $limit): array { $ragDb = dbnToolsRagDb(); $terms = $this->searchTerms($query); if (!$terms) { return []; } $where = ['d.status = "ready"']; $params = []; if (!empty($package['corpus_id'])) { $where[] = 'd.corpus_id = ?'; $params[] = (int)$package['corpus_id']; } $cats = json_decode((string)($package['category_filter'] ?? '[]'), true) ?: []; if ($cats) { $where[] = 'd.category IN (' . implode(',', array_fill(0, count($cats), '?')) . ')'; $params = array_merge($params, $cats); } $langs = json_decode((string)($package['language_filter'] ?? '[]'), true) ?: []; if ($langs) { $where[] = 'd.language IN (' . implode(',', array_fill(0, count($langs), '?')) . ')'; $params = array_merge($params, $langs); } $termClauses = []; foreach ($terms as $term) { $termClauses[] = '(c.content LIKE ? OR d.title LIKE ?)'; $params[] = '%' . $term . '%'; $params[] = '%' . $term . '%'; } $where[] = '(' . implode(' OR ', $termClauses) . ')'; $sql = 'SELECT c.id, c.document_id, c.content, c.section_title, d.title AS document_title, d.category, d.language FROM chunks c JOIN documents d ON c.document_id = d.id WHERE ' . implode(' AND ', $where) . ' LIMIT ' . (int)$limit; $stmt = $ragDb->prepare($sql); $stmt->execute($params); $rows = $stmt->fetchAll(PDO::FETCH_ASSOC); foreach ($rows as &$row) { $row['similarity'] = 0.2; $row['source_name'] = (string)($package['name'] ?? 'family-legal'); $row['source_type'] = 'package'; } return $rows; } private function searchTerms(string $query): array { $parts = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: []; $stop = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og']; $terms = []; foreach ($parts as $part) { if (mb_strlen($part, 'UTF-8') < 3 || in_array($part, $stop, true)) { continue; } $terms[] = $part; } return array_slice(array_values(array_unique($terms)), 0, 6); } private function requirePasteText(string $text): string { $text = trim($text); if (mb_strlen($text, 'UTF-8') < 20) { dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short'); } if (mb_strlen($text, 'UTF-8') > self::MAX_PASTE_CHARS) { dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long'); } return $text; } private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array { $counts = []; $replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void { $text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string { $counts[$type] = ($counts[$type] ?? 0) + 1; return $token; }, $text) ?? $text; }; foreach ($this->getPatternPack($region) as $entry) { $replace($entry['pattern'], $entry['type'], $entry['replacement']); } // Structured role-label names (Barn: X, Mother: X, etc.) — universal $text = preg_replace_callback( '/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu', function (array $m) use (&$counts): string { $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1; return $m[1] . ': [PERSON]'; }, $text ) ?? $text; // Child-identifier phrases ("barnet heter X", "child named X") — universal $text = preg_replace_callback( '/\b(?:barnet|child|sønn|son|datter|daughter)\s+(?:heter|named|called)?\s*([A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,})\b/iu', function () use (&$counts): string { $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1; return '[CHILD_IDENTIFIER]'; }, $text ) ?? $text; if ($mode === 'strict') { $replace('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\s+[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\b/u', 'person_or_child_name', '[PERSON]'); } return [$text, $counts]; } private function getPatternPack(string $region): array { $nordic = [ ['pattern' => '/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'replacement' => '[EMAIL]', 'type' => 'email'], ['pattern' => '/(? '[FNR]', 'type' => 'fødselsnummer'], ['pattern' => '/(? '[PHONE]', 'type' => 'phone'], ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'], // Dates — must precede generic numeric patterns // Year range (e.g. 2011/2012, 2018-2019) ['pattern' => '/(? '[DATE]', 'type' => 'date'], // Norwegian DD.MM.YYYY and DD/MM/YYYY ['pattern' => '/(? '[DATE]', 'type' => 'date'], // ISO YYYY-MM-DD ['pattern' => '/(? '[DATE]', 'type' => 'date'], // DD. Month YYYY (e.g. "30. juli 2015") and Month YYYY (Norwegian + English) ['pattern' => '/\b(?:\d{1,2}\.?\s+)?(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember|January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+(?:19|20)\d{2}\b/iu', 'replacement' => '[DATE]', 'type' => 'date'], // Year after Norwegian/English temporal preposition (lookbehind keeps preposition) ['pattern' => '/(?<=\b(?:i|fra|siden|innen|før|etter|rundt|omkring|cirka|in|from|since|until|before|after|around|circa)\s)(?:19|20)\d{2}(?![\d\/\-])/iu', 'replacement' => '[DATE]', 'type' => 'date'], ]; if ($region === 'nordic') { return $nordic; } $european = array_merge($nordic, [ // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX) ['pattern' => '/(? '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'], // Swedish personnummer full (YYYYMMDD-XXXX) ['pattern' => '/(? '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'], // Danish/Finnish CPR / henkilötunnus — same format as short SE personnummer but included for clarity ['pattern' => '/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', 'replacement' => '[UK_NI]', 'type' => 'uk_ni'], // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds) ['pattern' => '/(? '[FR_INSEE]', 'type' => 'fr_insee'], // IBAN (2-letter country code + 2 check digits + up to 30 alphanumeric) ['pattern' => '/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', 'replacement' => '[IBAN]', 'type' => 'iban'], // European phone (international prefix for major EU/EEA country codes) ['pattern' => '/(? '[PHONE]', 'type' => 'phone'], // Street address expanded to European street-type keywords ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'], ]); if ($region === 'european') { return $european; } $echr = array_merge($european, [ // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages) ['pattern' => '/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', 'replacement' => '[ECHR_APP_NO]', 'type' => 'echr_app_no'], // Date of birth stated in judgment context ['pattern' => '/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'], ['pattern' => '/\bf\.\s*\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'], // National ID label patterns in multiple languages ['pattern' => '/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*[\w\s\-]+/iu', 'replacement' => '[NAT_ID]', 'type' => 'nat_id'], ]); if ($region === 'echr') { return $echr; } // global return array_merge($echr, [ // US Social Security Number ['pattern' => '/(? '[SSN]', 'type' => 'ssn'], // Document number in context (passport no., ID No., document no.) ['pattern' => '/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', 'replacement' => '[DOC_NO]', 'type' => 'doc_no'], ]); } private function llmRedactionPass( string $preRedacted, string $language = 'en', array $aliases = [], string $engine = 'azure_mini', bool $keepOfficials = false, array $exemptNames = [], bool $doNames = true, bool $doOrgs = true, bool $doPlaces = true, bool $doDob = true ): array { if ($engine === 'regex') { return ['skipped' => true, 'reason' => 'Regex-only mode selected']; } if ($engine !== 'gpu') { $missing = $this->azure->missingChatConfig(); if ($missing) { return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')']; } } $languageNote = $language === 'no' ? "\n • The document may contain Norwegian or mixed-language content." : ''; // Build alias block $aliasBlock = ''; if (!empty($aliases)) { $lines = []; foreach ($aliases as $a) { $orig = str_replace(["\n", "\r", '`', '"', '{', '}'], ' ', substr(trim((string)($a['original'] ?? '')), 0, 100)); $lbl = str_replace(["\n", "\r", '`', '"', '{', '}'], ' ', substr(trim((string)($a['alias'] ?? '')), 0, 100)); if ($orig !== '' && $lbl !== '') { $lines[] = " \"{$orig}\" → [{$lbl}]"; } } if ($lines) { $aliasBlock = "\n\nALIAS OVERRIDES — use these exact replacement tags for these specific names instead of inferring a role:\n" . implode("\n", $lines); } } // Build exempt names block $exemptBlock = ''; if (!empty($exemptNames)) { $quoted = array_map(fn($n) => '"' . str_replace(['"', "\n"], ['\\"', ' '], $n) . '"', array_slice($exemptNames, 0, 20)); $exemptBlock = "\n\nEXEMPT NAMES — these names must NOT be redacted under any circumstances:\n " . implode(', ', $quoted); } // Build entity-type restriction note $skipTypes = []; if (!$doOrgs) $skipTypes[] = 'organisation names'; if (!$doPlaces) $skipTypes[] = 'place names'; if (!$doDob) $skipTypes[] = 'dates of birth'; if (!$doNames) $skipTypes[] = 'person names'; $skipNote = $skipTypes ? "\n\nSKIP these entity types — do NOT redact them: " . implode(', ', $skipTypes) . '.' : ''; // Build officials note $officialsNote = ''; if ($keepOfficials) { $officialsNote = "\n\nOFFICIALS — for persons identified as JUDGE, EXPERT_WITNESS, or CASEWORKER in an official capacity: do NOT replace their name with a plain bracket tag. Instead use the format [ROLE: Name], e.g. [JUDGE: Andersen] or [EXPERT_WITNESS: Dr. Larsen]. Their name must remain visible inside the tag."; } $allowedTypesNote = ''; if (!$doNames) { $allowedTypesNote = "\n\nDo NOT include person_name entries in your output."; } $system = << 'system', 'content' => $system], ['role' => 'user', 'content' => $preRedacted], ]; $chatOptions = ['temperature' => 0.1, 'max_tokens' => 8000, 'json' => true, 'timeout' => 90]; try { if ($engine === 'gpu') { $response = $this->callGpuLlm($messages, $chatOptions); $deployLabel = 'GPU (cuttlefish)'; } elseif ($engine === 'azure_full') { $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOptions); $deployLabel = 'gpt-4o'; } else { $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOptions); $deployLabel = 'gpt-4o-mini'; } $content = (string)($response['choices'][0]['message']['content'] ?? ''); $json = $this->azure->decodeJsonObject($content); if (!is_array($json) || !array_key_exists('redactions', $json)) { return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure']; } return [ 'skipped' => false, 'entities' => is_array($json['redactions']) ? $json['redactions'] : [], 'deployment' => $deployLabel, ]; } catch (Throwable $e) { error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage()); return ['skipped' => false, 'entities' => [], 'error' => $e->getMessage()]; } } private function callGpuLlm(array $messages, array $options = []): array { return dbnToolsCallGpuLlm($messages, $options); } // ── Summarize: corpus context + engine-aware summary ───────────────────── /** * Search the shared legal corpus and return top-N passages as a formatted * context string. Returns '' on failure so the caller can degrade gracefully. */ public function corpusContextForSummarize(string $query, int $limit = 8): string { try { $client = dbnToolsRequireClient(); $package = $this->requireFamilyPackage((int)$client['id']); dbnToolsBootCaveau(); $gatewayUrl = 'http://10.0.1.10:4000'; try { $config = getConfig(); $u = trim((string)($config['ai_gateway']['url'] ?? '')); if ($u !== '') $gatewayUrl = $u; } catch (Throwable) {} $rag = new ClientRagPipeline((int)$client['id'], $gatewayUrl, 20); $chunks = $rag->searchAll($query, $limit, null, [ 'search_private' => true, 'search_shared' => true, 'package_ids' => [(int)$package['id']], 'chunk_limit' => $limit, 'search_method' => 'keyword', 'min_private' => 0, 'include_beta_website' => true, ]); $parts = []; foreach ($chunks as $c) { $title = (string)($c['title'] ?? ($c['source'] ?? 'Legal source')); $content = (string)($c['content'] ?? ($c['text'] ?? '')); if ($content !== '') { $parts[] = "=== {$title} ===\n{$content}"; } } return implode("\n\n", $parts); } catch (Throwable $e) { error_log('summarize corpus search failed: ' . $e->getMessage()); return ''; } } /** * Engine-aware structured summarization, optionally enriched with corpus context. */ public function summarizeWithContext( string $text, string $language = 'en', string $engine = 'azure_mini', string $corpusContext = '' ): array { $text = $this->requirePasteText($text); $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini'; $locale = dbnToolsLanguageName($language); $enriched = $text; $corpusUsed = $corpusContext !== ''; if ($corpusUsed) { $enriched = "[Relevant legal context from Do Better Norge corpus]\n" . $corpusContext . "\n\n---\n\nDocument to summarise:\n" . $text; } $prompt = <<legalJsonSystemPrompt($language); $messages = [ ['role' => 'system', 'content' => $system], ['role' => 'user', 'content' => $prompt], ]; $maxTok = ($engine === 'azure_full') ? 8000 : 4000; $chatOpts = ['json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTok, 'timeout' => 120]; $deployLabel = $this->azure->chatDeployment(); try { if ($engine === 'gpu') { $response = $this->callGpuLlm($messages, $chatOpts); $deployLabel = 'GPU (local)'; } elseif ($engine === 'azure_full') { $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOpts); $deployLabel = 'gpt-4o'; } else { $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOpts); $deployLabel = 'gpt-4o-mini'; } } catch (Throwable $e) { dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error'); } $raw = (string)($response['choices'][0]['message']['content'] ?? ''); $json = $this->azure->decodeJsonObject($raw); if (!$json) { dbnToolsAbort('LLM returned unparseable JSON.', 502, 'llm_parse_error'); } $corpusNote = $corpusUsed ? 'Summary enriched with ' . count(array_filter(explode('=== ', $corpusContext))) . ' passage(s) from the Do Better Norge legal corpus.' : 'No corpus search performed; summarised from document text only.'; $trace = [ $this->trace('Document preparation', 'Text validated and prepared for summarisation.', 'complete'), $this->trace('Corpus enrichment', $corpusNote, $corpusUsed ? 'complete' : 'complete'), $this->trace('Summary generation', 'Structured summary generated via ' . $deployLabel . '.', 'complete'), $this->trace('Uncertainty', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'), $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the summary against the original document.'), 'complete'), ]; return [ 'tool' => 'summarize', 'language' => $language, 'what_we_found' => (string)($json['what_we_found'] ?? ''), 'key_facts' => $json['key_facts'] ?? [], 'dates' => $json['dates'] ?? [], 'parties' => $json['parties'] ?? [], 'legal_references_detected' => $json['legal_references_detected'] ?? [], 'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($json['next_practical_step'] ?? ''), 'corpus_used' => $corpusUsed, 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => 1, 'source_count' => 1, 'deployment' => $deployLabel, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } private function applyGenericTags(string $text): string { // Collapse contextual role tags (e.g. [FATHER], [JUDGE: Andersen], [CHILD_1]) → [PERSON] $text = preg_replace('/\[(?:FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u', '[PERSON]', $text) ?? $text; return $text; } private function applyPseudonymization(string $text, array $allCounts): string { $norwegianNames = [ 'Ola Nordmann', 'Per Hansen', 'Kari Larsen', 'Anne Berg', 'Erik Dahl', 'Ingrid Holm', 'Lars Moen', 'Silje Bakke', 'Tor Haugen', 'Eva Strand', ]; $nameCursor = 0; $phoneBase = 1; $emailCursor = 0; $addrCursor = 1; $orgCursor = 1; $personMap = []; // Replace named role tags (keeping consistent mapping per unique tag) $text = preg_replace_callback( '/\[(FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u', function (array $m) use (&$nameCursor, &$personMap, $norwegianNames): string { $key = $m[1]; if (!isset($personMap[$key])) { $personMap[$key] = $norwegianNames[$nameCursor % count($norwegianNames)]; $nameCursor++; } return $personMap[$key]; }, $text ) ?? $text; $text = preg_replace_callback('/\[PHONE\]/', function () use (&$phoneBase): string { return sprintf('+47 400 00 %03d', $phoneBase++); }, $text) ?? $text; $text = preg_replace_callback('/\[EMAIL\]/', function () use (&$emailCursor): string { $letter = chr(ord('a') + ($emailCursor % 26)); $emailCursor++; return "person.{$letter}@example.no"; }, $text) ?? $text; $text = preg_replace_callback('/\[ADDRESS\]/', function () use (&$addrCursor): string { return "Eksempelveien {$addrCursor}, 0001 Oslo"; }, $text) ?? $text; $text = preg_replace_callback('/\[ORG\]/', function () use (&$orgCursor): string { return "Eksempel AS ({$orgCursor})"; }, $text) ?? $text; $text = preg_replace_callback('/\[FNR\]/', function (): string { return '010100XXXXX'; }, $text) ?? $text; $text = preg_replace_callback('/\[(?:SE_PERSONNUMMER|FR_INSEE|UK_NI|SSN|NAT_ID|DOC_NO|ECHR_APP_NO)\]/', function (): string { return '[ID-REDACTED]'; }, $text) ?? $text; $text = preg_replace_callback('/\[PLACE\]/', function (): string { return 'Eksempelby'; }, $text) ?? $text; $text = preg_replace_callback('/\[DOB\]/', function (): string { return '01.01.0000'; }, $text) ?? $text; $text = preg_replace_callback('/\[IBAN\]/', function (): string { return 'NO00 0000 00 00000'; }, $text) ?? $text; return $text; } private function uncertaintySummary(mixed $uncertainty): string { if (is_array($uncertainty)) { $uncertainty = implode(' ', array_map('strval', $uncertainty)); } $uncertainty = trim((string)$uncertainty); return $uncertainty !== '' ? dbnToolsExcerpt($uncertainty, 220) : 'No additional uncertainty was supplied by the tool.'; } private function trace(string $label, string $detail, string $status = 'complete'): array { return [ 'label' => $label, 'detail' => $detail, 'status' => $status, ]; } }