azure = $azure ?: DbnGatewayFactory::make(); } public function search( string $query, string $language = 'en', int $limit = 6, string $temporalMode = 'disabled', ?string $asOfDate = null, string $scope = 'both', ?string $persona = null ): array { $query = trim($query); if (mb_strlen($query, 'UTF-8') < 3) { dbnToolsAbort('Search query must be at least 3 characters.', 422, 'query_too_short'); } $limit = max(1, min(10, $limit)); $temporalMode = in_array($temporalMode, ['legal_conservative', 'disabled'], true) ? $temporalMode : 'disabled'; $scope = in_array($scope, ['shared', 'private', 'both'], true) ? $scope : 'both'; $scopeLabel = match ($scope) { 'private' => 'personal corpus only', 'shared' => 'Legal Library only', default => 'Legal Library + personal corpus', }; $product = dbnToolsProductName(); $trace = [ $this->trace('Query interpretation', "Searching {$product} {$scopeLabel}.", 'complete'), $this->trace('Search tools used', 'ClientRagPipeline::searchAll with keyword mode.', 'running'), ]; $client = dbnToolsRequireClient(); $personaResolved = dbnToolsResolvePersona((int)$client['id'], $persona); $package = $personaResolved['package'] ?? $this->requireFamilyPackage((int)$client['id']); $packageIds = $personaResolved['package_ids'] ?: [(int)$package['id']]; $personaRagOpts = is_array($personaResolved['rag_opts'] ?? null) ? $personaResolved['rag_opts'] : []; $searchMethod = (string)($personaResolved['search_method'] ?? 'keyword') ?: 'keyword'; // Personal corpus client_id from session (may be 0 if user has no linked workspace) $personalClientId = (int)($_SESSION['dbn_tools_client_id'] ?? 0); $chunks = []; $retrievalNote = 'ClientRagPipeline keyword retrieval'; try { dbnToolsBootCaveau(); $gatewayUrl = 'http://10.0.1.10:4000'; try { $config = getConfig(); $configured = trim((string)($config['ai_gateway']['url'] ?? '')); if ($configured !== '') { $gatewayUrl = $configured; } } catch (Throwable $e) { // Retrieval still works in keyword mode without gateway config. } if ($scope === 'private') { // Search only the user's personal corpus if ($personalClientId > 0) { $rag = new ClientRagPipeline($personalClientId, $gatewayUrl, 30); $chunks = $rag->searchAll($query, $limit, null, array_merge($personaRagOpts, [ 'search_private' => true, 'search_shared' => false, 'chunk_limit' => $limit, 'search_method' => $searchMethod, 'min_private' => 0, ])); } } elseif ($scope === 'shared') { // Search only the shared legal library (persona-scoped packages) $rag = new ClientRagPipeline((int)$client['id'], $gatewayUrl, 30); $chunks = $rag->searchAll($query, $limit, null, array_merge($personaRagOpts, [ 'search_private' => true, 'search_shared' => true, 'package_ids' => $packageIds, 'chunk_limit' => $limit, 'search_method' => $searchMethod, 'min_private' => 0, 'include_beta_website' => true, ])); } else { // 'both': shared library + personal corpus merged and re-ranked by score $rag = new ClientRagPipeline((int)$client['id'], $gatewayUrl, 30); $sharedChunks = $rag->searchAll($query, $limit, null, array_merge($personaRagOpts, [ 'search_private' => true, 'search_shared' => true, 'package_ids' => $packageIds, 'chunk_limit' => $limit, 'search_method' => $searchMethod, 'min_private' => 0, 'include_beta_website' => true, ])); $privateChunks = []; if ($personalClientId > 0) { try { $ragPrivate = new ClientRagPipeline($personalClientId, $gatewayUrl, 30); $privateChunks = $ragPrivate->searchAll($query, $limit, null, array_merge($personaRagOpts, [ 'search_private' => true, 'search_shared' => false, 'chunk_limit' => $limit, 'search_method' => $searchMethod, 'min_private' => 0, ])); } catch (Throwable $e) { error_log('[search] personal corpus query failed for client ' . $personalClientId . ': ' . $e->getMessage()); } } // Merge by score descending, cap at $limit $merged = array_merge($sharedChunks, $privateChunks); usort($merged, fn($a, $b) => ($b['score'] ?? 0) <=> ($a['score'] ?? 0)); $chunks = array_slice($merged, 0, $limit); } // Apply temporal reranking after retrieval (optional) if ($temporalMode === 'legal_conservative' && !empty($chunks)) { $temporalLayerPath = __DIR__ . '/../../ai-portal/platform/includes/LegalTemporalLayer.php'; if (file_exists($temporalLayerPath)) { require_once $temporalLayerPath; $layer = new LegalTemporalLayer(['temporal_mode' => $temporalMode]); $chunks = $layer->rerank($chunks, $query, $asOfDate); } } } catch (Throwable $e) { $retrievalNote = 'SQL keyword fallback after ClientRagPipeline error'; $trace[] = $this->trace('Search fallback', 'Pipeline retrieval failed; using direct SQL keyword fallback without storing the query.', 'warning'); $chunks = $this->fallbackKeywordSearch((int)$client['id'], $package, $query, $limit); } if (!$chunks) { $fallback = $this->fallbackKeywordSearch((int)$client['id'], $package, $query, $limit); if ($fallback) { $chunks = $fallback; $retrievalNote = 'SQL keyword fallback'; } } $sharedDocIds = []; foreach (array_slice($chunks, 0, $limit) as $chunk) { if (($chunk['source_type'] ?? '') !== 'private' && isset($chunk['document_id'])) { $sharedDocIds[(int)$chunk['document_id']] = true; } } $docSummaries = $sharedDocIds ? $this->fetchDocSummaries(array_keys($sharedDocIds)) : []; $hits = array_map( fn(array $chunk): array => $this->sourceFromChunk( $chunk, ($chunk['source_type'] ?? '') !== 'private' ? ($docSummaries[(int)($chunk['document_id'] ?? 0)] ?? null) : null ), array_slice($chunks, 0, $limit) ); $confidence = $this->citationConfidence($hits); $trace[1] = $this->trace('Search tools used', $retrievalNote . '; returned ' . count($hits) . ' source hit(s).', 'complete'); $trace[] = $this->trace('Evidence found', count($hits) ? 'Retrieved source excerpts for review.' : 'No matching source excerpts were found.', count($hits) ? 'complete' : 'warning'); $trace[] = $this->trace('Citation confidence', ucfirst($confidence) . ' confidence based on source count and retrieval scores.', $confidence === 'low' ? 'warning' : 'complete'); return [ 'tool' => 'search', 'language' => $language, 'what_we_found' => count($hits) ? 'Found source excerpts from the legal corpus.' : 'No matching source excerpts were found.', 'hits' => $hits, 'evidence_trail' => $hits, 'what_remains_uncertain' => count($hits) ? 'Search results still need human review for legal relevance and currentness.' : 'The corpus may not contain enough evidence for this query.', 'next_practical_step' => count($hits) ? 'Open the strongest sources and confirm the cited sections before relying on them.' : 'Try a narrower query with statutory terms, party names, or dates.', 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => count($chunks), 'source_count' => count($hits), 'deployment' => null, 'citation_confidence' => $confidence, 'persona' => $personaResolved['slug'] ?? null, 'persona_source' => $personaResolved['source'] ?? null, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } public function ask(string $question, string $language = 'en', string $engine = 'azure_mini', ?string $persona = null): array { $engine = in_array($engine, ['azure_mini', 'azure_full'], true) ? $engine : 'azure_mini'; $client = dbnToolsRequireClient(); $personaResolved = dbnToolsResolvePersona((int)$client['id'], $persona); $search = $this->search($question, $language, 7, 'disabled', null, 'both', $personaResolved['slug']); $hits = $search['hits']; $trace = $search['trace']; if (!$hits) { $trace[] = $this->trace('Synthesis', 'Skipped answer synthesis because no evidence was found.', 'warning'); return [ 'tool' => 'ask', 'language' => $language, 'answer' => match (dbnToolsNormalizeUiLanguage($language)) { 'no' => 'Jeg fant ikke nok kildestøtte i familierettskorpuset til å svare sikkert.', 'uk' => 'Я не знайшов достатньої підтримки в корпусі сімейного права, щоб відповісти безпечно.', 'pl' => 'Nie znalazłem wystarczającego wsparcia źródłowego w korpusie prawa rodzinnego, aby odpowiedzieć bezpiecznie.', default => 'I did not find enough source support in the family-law corpus to answer safely.', }, 'what_we_found' => $search['what_we_found'], 'evidence_trail' => [], 'what_remains_uncertain' => $search['what_remains_uncertain'], 'next_practical_step' => $search['next_practical_step'], 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => 0, 'source_count' => 0, 'deployment' => null, 'citation_confidence' => 'low', ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } [$gateway, $personaModel] = $this->personaGateway($personaResolved, $engine); $gateway->requireChat(); $context = $this->buildEvidenceContext($hits); $locale = dbnToolsLanguageName($language); $prompt = <<legalJsonSystemPrompt($language, $personaResolved['system_prompt'] ?? null); $askDeployment = $personaModel; $raw = $gateway->withDeployment($askDeployment)->chatText([ ['role' => 'system', 'content' => $system], ['role' => 'user', 'content' => $prompt], ], [ 'json' => true, 'temperature' => 0.15, 'max_tokens' => 1300, ]); $json = $gateway->decodeJsonObject($raw); if (!$json) { $json = [ 'answer' => $raw, 'what_we_found' => 'Azure returned a plain-text answer based on the retrieved excerpts.', 'evidence_trail' => [], 'what_remains_uncertain' => ['The response format could not be validated as structured JSON.'], 'next_practical_step' => 'Review the source excerpts manually before relying on the answer.', ]; } $trace[] = $this->trace('Synthesis', 'Azure OpenAI generated an answer using only the retrieved source excerpts.', 'complete'); $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'); $trace[] = $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the evidence trail.'), 'complete'); return [ 'tool' => 'ask', 'language' => $language, 'answer' => (string)($json['answer'] ?? ''), 'what_we_found' => (string)($json['what_we_found'] ?? ''), 'evidence_trail' => $hits, 'citation_notes' => $this->normalizeEvidenceTrail($json['evidence_trail'] ?? [], $hits), 'sources' => $hits, 'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($json['next_practical_step'] ?? ''), 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => count($hits), 'source_count' => count($hits), 'deployment' => $askDeployment, 'citation_confidence' => $search['trace_metadata']['citation_confidence'] ?? 'medium', ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } public function summarize(string $text, string $language = 'en'): array { $text = $this->requirePasteText($text); $this->azure->requireChat(); $locale = dbnToolsLanguageName($language); $prompt = <<runJsonTool($prompt, $language, 1300); $trace = [ $this->trace('Query interpretation', 'Summarize pasted text without saving the text or output.', 'complete'), $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete'), $this->trace('Evidence found', 'Evidence trail is limited to the pasted text supplied in this request.', 'complete'), $this->trace('Citation confidence', 'Medium confidence for factual extraction; no external legal source verification was performed.', 'warning'), $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'), $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the summary against the original text.'), 'complete'), ]; return [ 'tool' => 'summarize', 'language' => $language, 'what_we_found' => (string)($json['what_we_found'] ?? ''), 'key_facts' => $json['key_facts'] ?? [], 'dates' => $json['dates'] ?? [], 'parties' => $json['parties'] ?? [], 'legal_references_detected' => $json['legal_references_detected'] ?? [], 'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']], 'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($json['next_practical_step'] ?? ''), 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => 1, 'source_count' => 1, 'deployment' => $this->azure->chatDeployment(), ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } public function timeline( string $text, string $language = 'en', string $engine = 'azure_mini', string $focus = 'all', string $confidenceFilter = 'all', bool $includeRelative = true, bool $includeBackground = true, string $userNotes = '', ?callable $onProgress = null ): array { $text = $this->requirePasteText($text, self::MAX_TIMELINE_CHARS); $engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini'; $focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all'; $this->azure->requireChat(); $onProgress && $onProgress("Preparing document\u{2026}"); $locale = dbnToolsLanguageName($language); $inputDateHintCount = $this->timelineDateHintCount($text); $focusInstruction = match ($focus) { 'deadlines' => "\nFocus specifically on: legal deadlines, filing dates, response windows, appeal periods, and statutory time limits. Deprioritise narrative events with no legal deadline significance.", 'hearings' => "\nFocus specifically on: court hearings, tribunal sessions, mediation sessions, formal meetings, and hearing-related procedural dates.", 'cps' => "\nFocus specifically on: CPS (Barnevernet) interventions, home visits, case reviews, acute measures (akuttvedtak), and Fylkesnemnda proceedings.", default => '', }; $backgroundInstruction = $includeBackground ? "\nAlso extract BACKGROUND and NARRATIVE events: dates embedded in contextual paragraphs, historical facts, year-only references, and approximate years (e.g. \"rundt 2011/2012\", \"David ble født den 30.07.2015\", \"familien i 2015\"). These are valid timeline events even when they appear in introductory or background text — do NOT skip them." : "\nDo NOT include purely historical background or narrative context dates. Focus only on operational events, deadlines, and milestones that are directly actionable in the case."; $relativeInstruction = $includeRelative ? '' : "\nDo NOT extract relative, recurring, or conditional date references — extract only events with determinable absolute dates (date_type=absolute)."; $userNotesBlock = $userNotes !== '' ? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---" : ''; $charCount = mb_strlen($text, 'UTF-8'); $singlePassLimit = $this->timelineSinglePassLimit($engine); if ($charCount > $singlePassLimit) { return $this->timelineChunked( $text, $language, $engine, $focus, $confidenceFilter, $includeRelative, $includeBackground, $userNotes, $onProgress, $inputDateHintCount ); } $prompt = <<legalJsonSystemPrompt($language); $messages = [ ['role' => 'system', 'content' => $system], ['role' => 'user', 'content' => $prompt], ]; $isBedrock = $this->azure instanceof DbnBedrockGateway; $maxTokens = match ($engine) { 'azure_full', 'claude_sonnet' => 8000, 'nova_lite' => 2000, default => 4000 }; $chatOptions = ['json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTokens, 'timeout' => 120]; $deployLabel = match (true) { $engine === 'nova_lite' => 'nova-lite', $engine === 'azure_full' || $engine === 'claude_sonnet' => $isBedrock ? 'claude-sonnet-bedrock' : 'gpt-4o', default => $isBedrock ? 'claude-haiku-bedrock' : 'gpt-4o-mini', }; $onProgress && $onProgress("Calling {$deployLabel}\u{2026}"); try { if ($engine === 'nova_lite') { $response = dbnToolsCallGpuLlm($messages, ['model' => 'nova-lite', 'max_tokens' => $maxTokens, 'temperature' => 0.1, 'timeout' => 120]); } elseif ($engine === 'azure_full' || $engine === 'claude_sonnet') { $deploy = $isBedrock ? DbnBedrockModelRouter::LITELLM_SONNET : 'gpt-4o'; $response = $this->azure->withDeployment($deploy)->chat($messages, $chatOptions); } else { $deploy = $isBedrock ? DbnBedrockModelRouter::LITELLM_HAIKU : 'gpt-4o-mini'; $response = $this->azure->withDeployment($deploy)->chat($messages, $chatOptions); } } catch (Throwable $e) { $msg = $e->getMessage(); if (preg_match('/timed?\s*out|timeout|operation timed out/i', $msg)) { dbnToolsAbort('The model timed out. Try Quick mode, a smaller file, or fewer selected documents.', 504, 'llm_timeout'); } dbnToolsAbort('LLM request failed: ' . $msg, 502, 'llm_error'); } $onProgress && $onProgress("Parsing events\u{2026}"); $raw = (string)($response['choices'][0]['message']['content'] ?? ''); $json = $this->azure->decodeJsonObject($raw); if (!$json) { dbnToolsAbort('The selected engine did not return valid structured JSON.', 502, 'llm_invalid_json'); } $events = is_array($json['events'] ?? null) ? $json['events'] : []; $usedFallbackExtractor = false; if (!$events && $inputDateHintCount > 0) { $fallbackEvents = $this->fallbackTimelineEvents($text); if ($fallbackEvents) { $events = $fallbackEvents; $usedFallbackExtractor = true; $uncertain = is_array($json['what_remains_uncertain'] ?? null) ? $json['what_remains_uncertain'] : []; array_unshift($uncertain, 'The selected engine returned no events, so a deterministic date-line fallback extracted visible dated lines. Review these medium-confidence entries against the original file.'); $json['what_remains_uncertain'] = $uncertain; $json['what_we_found'] = count($events) . ' date-like event(s) extracted by fallback after the selected engine returned no events.'; $json['next_practical_step'] = 'Review each fallback event against the original uploaded document and rerun with Standard or Deep if you need fuller actor/event interpretation.'; } } if (!$events && $inputDateHintCount === 0) { $json['what_we_found'] = (string)($json['what_we_found'] ?? 'No recognizable dates were found in the extracted text from this upload.'); if (trim((string)$json['what_we_found']) === '') { $json['what_we_found'] = 'No recognizable dates were found in the extracted text from this upload.'; } $json['next_practical_step'] = (string)($json['next_practical_step'] ?? 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.'); if (trim((string)$json['next_practical_step']) === '') { $json['next_practical_step'] = 'Check that the file text was extracted correctly, or upload a text-searchable PDF/DOCX.'; } } // Post-filter: confidence if ($confidenceFilter === 'high_medium') { $events = array_values(array_filter($events, fn($ev) => ($ev['confidence'] ?? 'low') !== 'low')); } // Post-filter: relative/recurring date types if (!$includeRelative) { $events = array_values(array_filter($events, fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute')); } $engineLabel = $deployLabel; $focusLabel = match ($focus) { 'deadlines' => 'legal deadlines', 'hearings' => 'court hearings', 'cps' => 'CPS milestones', default => 'all events', }; $trace = [ $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Without saving the text or output.", 'complete'), $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete'), $this->trace('Evidence found', count($events) . ' event(s) identified' . ($confidenceFilter === 'high_medium' ? ' (low-confidence filtered out)' : '') . '.', count($events) ? 'complete' : 'warning'), $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text.', 'complete'), $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'), $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Verify dates against original documents.'), 'complete'), ]; return [ 'tool' => 'timeline', 'language' => $language, 'what_we_found' => (string)($json['what_we_found'] ?? ''), 'events' => $events, 'evidence_trail' => $json['evidence_trail'] ?? [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']], 'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($json['next_practical_step'] ?? ''), 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => count($events), 'source_count' => 1, 'deployment' => $engineLabel, 'input_date_hint_count' => $inputDateHintCount, 'used_fallback_extractor' => $usedFallbackExtractor, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } private function timelineChunked( string $text, string $language, string $engine, string $focus, string $confidenceFilter, bool $includeRelative, bool $includeBackground, string $userNotes, ?callable $onProgress, int $inputDateHintCount ): array { $isBedrock = $this->azure instanceof DbnBedrockGateway; $engineLabel = match (true) { $engine === 'nova_lite' => 'nova-lite', $engine === 'azure_full' || $engine === 'claude_sonnet' => $isBedrock ? 'claude-sonnet-bedrock' : 'gpt-4o', default => $isBedrock ? 'claude-haiku-bedrock' : 'gpt-4o-mini', }; $chunkSize = $this->timelineChunkSize($engine); $chunks = $this->timelineTextChunks($text, $chunkSize, 900); $chunkCount = count($chunks); $events = []; $chunkFailures = 0; $usedFallbackExtractor = false; $onProgress && $onProgress('Splitting timeline into ' . $chunkCount . " chunk(s)\u{2026}"); foreach ($chunks as $idx => $chunk) { $chunkNo = $idx + 1; $chunkText = trim((string)$chunk['text']); if (mb_strlen($chunkText, 'UTF-8') < 20) { continue; } $onProgress && $onProgress("Extracting timeline chunk {$chunkNo}/{$chunkCount}\u{2026}"); try { $result = $this->timeline( $chunkText, $language, $engine, $focus, $confidenceFilter, $includeRelative, $includeBackground, $userNotes, null ); $chunkEvents = is_array($result['events'] ?? null) ? $result['events'] : []; if (!empty($result['trace_metadata']['used_fallback_extractor'])) { $usedFallbackExtractor = true; } } catch (DbnToolsHttpException $e) { $chunkFailures++; $chunkEvents = []; if ($this->timelineDateHintCount($chunkText) > 0) { $chunkEvents = $this->fallbackTimelineEvents($chunkText); if ($chunkEvents) { $usedFallbackExtractor = true; } } if (!$chunkEvents && $e->status >= 500) { error_log('timeline chunk failed: ' . $e->errorCode . ' ' . $e->getMessage()); } } catch (Throwable $e) { $chunkFailures++; $chunkEvents = $this->fallbackTimelineEvents($chunkText); if ($chunkEvents) { $usedFallbackExtractor = true; } error_log('timeline chunk throwable: ' . $e->getMessage()); } foreach ($chunkEvents as $event) { if (!is_array($event)) { continue; } $event['chunk_index'] = $chunkNo; $event['source_position'] = (int)$chunk['start']; $events[] = $event; } } $events = $this->mergeTimelineEvents($events); if ($confidenceFilter === 'high_medium') { $events = array_values(array_filter($events, fn($ev) => ($ev['confidence'] ?? 'low') !== 'low')); } if (!$includeRelative) { $events = array_values(array_filter($events, fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute')); } $focusLabel = match ($focus) { 'deadlines' => 'legal deadlines', 'hearings' => 'court hearings', 'cps' => 'CPS milestones', default => 'all events', }; $isoDates = array_values(array_filter(array_map(fn($ev) => (string)($ev['date'] ?? ''), $events), fn($d) => preg_match('/^\d{4}-\d{2}-\d{2}$/', $d))); sort($isoDates); $range = $isoDates ? (' from ' . $isoDates[0] . ' to ' . $isoDates[count($isoDates) - 1]) : ''; $actors = array_values(array_unique(array_filter(array_map(fn($ev) => (string)($ev['actor'] ?? ''), $events), fn($a) => $a !== '' && $a !== 'unknown'))); $summary = count($events) . " event(s) extracted from {$chunkCount} chunk(s){$range}."; if ($actors) { $summary .= ' Main actors: ' . implode(', ', array_slice($actors, 0, 8)) . '.'; } $uncertain = []; if ($chunkFailures > 0) { $uncertain[] = "{$chunkFailures} chunk(s) needed fallback extraction or could not be fully parsed; review medium-confidence entries against the source."; } if ($usedFallbackExtractor) { $uncertain[] = 'Deterministic fallback extraction was used for at least one chunk.'; } $trace = [ $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Chunked timeline mode; without saving the text or output.", 'complete'), $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text split into local chunks.', 'complete'), $this->trace('Evidence found', count($events) . " event(s) identified across {$chunkCount} chunk(s).", count($events) ? 'complete' : 'warning'), $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text. Overlapping chunks were de-duplicated.', 'complete'), $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), $uncertain ? 'warning' : 'complete'), $this->trace('Next practical step', 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.', 'complete'), ]; return [ 'tool' => 'timeline', 'language' => $language, 'what_we_found' => $summary, 'events' => $events, 'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; split into local timeline chunks; not stored.']], 'what_remains_uncertain' => $uncertain, 'next_practical_step' => 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.', 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => count($events), 'source_count' => $chunkCount, 'deployment' => $engineLabel, 'input_date_hint_count' => $inputDateHintCount, 'used_fallback_extractor' => $usedFallbackExtractor, 'chunked_timeline' => true, 'timeline_chunk_count' => $chunkCount, 'chunk_failures' => $chunkFailures, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } private function timelineSinglePassLimit(string $engine): int { return match ($engine) { 'nova_lite' => 25000, 'azure_mini' => 55000, default => 128000, }; } private function timelineChunkSize(string $engine): int { return match ($engine) { 'nova_lite' => 10000, 'azure_mini' => 16000, default => 30000, }; } private function timelineTextChunks(string $text, int $chunkSize, int $overlap): array { $len = mb_strlen($text, 'UTF-8'); $chunks = []; $start = 0; while ($start < $len) { $targetEnd = min($len, $start + $chunkSize); $window = mb_substr($text, $start, $targetEnd - $start, 'UTF-8'); $end = $targetEnd; if ($targetEnd < $len) { $breakAt = mb_strrpos($window, "\n\n", 0, 'UTF-8'); if ($breakAt === false || $breakAt < (int)($chunkSize * 0.55)) { $breakAt = mb_strrpos($window, "\n", 0, 'UTF-8'); } if ($breakAt !== false && $breakAt > (int)($chunkSize * 0.45)) { $end = $start + $breakAt; } } $chunkText = trim(mb_substr($text, $start, max(1, $end - $start), 'UTF-8')); if ($chunkText !== '') { $chunks[] = ['start' => $start, 'text' => $chunkText]; } if ($end >= $len) { break; } $nextStart = max(0, $end - $overlap); if ($nextStart <= $start) { $nextStart = $end; } $start = $nextStart; } return $chunks; } private function mergeTimelineEvents(array $events): array { $merged = []; foreach ($events as $event) { if (!is_array($event)) { continue; } $key = $this->timelineEventSignature($event); if (!isset($merged[$key])) { $merged[$key] = $event; continue; } $existing = $merged[$key]; $candidateExcerpt = (string)($event['source_excerpt'] ?? ''); $existingExcerpt = (string)($existing['source_excerpt'] ?? ''); $additionalExcerpt = $candidateExcerpt; if ($this->timelineConfidenceRank((string)($event['confidence'] ?? 'medium')) > $this->timelineConfidenceRank((string)($existing['confidence'] ?? 'medium'))) { $merged[$key] = $event; $additionalExcerpt = $existingExcerpt; } $oldExcerpt = (string)($merged[$key]['source_excerpt'] ?? ''); $newExcerpt = $additionalExcerpt; if ($newExcerpt !== '' && $oldExcerpt !== '' && $newExcerpt !== $oldExcerpt && mb_strlen($oldExcerpt, 'UTF-8') < 260) { $merged[$key]['source_excerpt'] = $oldExcerpt . ' / ' . $newExcerpt; } } $events = array_values($merged); usort($events, static function (array $a, array $b): int { $ad = (string)($a['date'] ?? ''); $bd = (string)($b['date'] ?? ''); $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99'; $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99'; $cmp = strcmp($ai, $bi); if ($cmp !== 0) { return $cmp; } return strcmp((string)($a['time'] ?? ''), (string)($b['time'] ?? '')); }); return $events; } private function timelineEventSignature(array $event): string { $date = mb_strtolower(trim((string)($event['date'] ?? '')), 'UTF-8'); $time = mb_strtolower(trim((string)($event['time'] ?? '')), 'UTF-8'); $actor = mb_strtolower(trim((string)($event['actor'] ?? 'unknown')), 'UTF-8'); $body = mb_strtolower(trim((string)($event['event'] ?? '')), 'UTF-8'); $body = (string)preg_replace('/[^\p{L}\p{N}]+/u', ' ', $body); $body = trim((string)preg_replace('/\s+/u', ' ', $body)); return $date . '|' . $time . '|' . $actor . '|' . mb_substr($body, 0, 96, 'UTF-8'); } private function timelineConfidenceRank(string $confidence): int { return match ($confidence) { 'high' => 3, 'medium' => 2, default => 1, }; } private function timelineDateHintCount(string $text): int { preg_match_all('/(?= 80) { break; } $line = trim((string)preg_replace('/\s+/u', ' ', $line)); if ($line === '') { continue; } if (preg_match('/\b(20\d{2}|19\d{2})\b/u', $line, $ym)) { $lastYear = (int)$ym[1]; } if (!preg_match_all('/(?= 80) { break 2; } $day = (int)$m[1][0]; $month = (int)$m[2][0]; if ($day < 1 || $day > 31 || $month < 1 || $month > 12) { continue; } $yearRaw = $m[3][0] ?? ''; $year = null; if ($yearRaw !== '') { $year = strlen($yearRaw) === 2 ? 2000 + (int)$yearRaw : (int)$yearRaw; $lastYear = $year; } elseif ($lastYear !== null) { $year = $lastYear; } $date = $year !== null ? sprintf('%04d-%02d-%02d', $year, $month, $day) : sprintf('%02d.%02d. (year unknown)', $day, $month); $time = null; if (preg_match('/\bkl\.?\s*(\d{1,2})[:.](\d{2})\b|\b(\d{1,2}):(\d{2})\b/u', $line, $tm)) { $hour = (int)($tm[1] !== '' ? $tm[1] : $tm[3]); $min = (int)($tm[2] !== '' ? $tm[2] : $tm[4]); if ($hour >= 0 && $hour <= 23 && $min >= 0 && $min <= 59) { $time = sprintf('%02d:%02d', $hour, $min); } } $eventText = trim(preg_replace('/^\s*[-*#\s]*/u', '', $line)); $eventText = trim(preg_replace('/^' . preg_quote($m[0][0], '/') . '\s*(?:kl\.?\s*\d{1,2}[:.]\d{2})?\s*[:\-–—]?\s*/u', '', $eventText)); if ($eventText === '') { $eventText = 'Dated event found in uploaded text.'; } $events[] = [ 'date' => $date, 'end_date' => null, 'time' => $time, 'date_type' => $year !== null ? 'absolute' : 'relative', 'actor' => $this->fallbackTimelineActor($line), 'event' => mb_substr($eventText, 0, 240, 'UTF-8'), 'source_excerpt' => mb_substr($line, 0, 300, 'UTF-8'), 'confidence' => 'medium', ]; } } usort($events, static function (array $a, array $b): int { $ad = (string)($a['date'] ?? ''); $bd = (string)($b['date'] ?? ''); $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99'; $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99'; return strcmp($ai, $bi); }); return $events; } private function fallbackTimelineActor(string $line): string { $actors = [ '/barnevern(?:s?tjenesten)?|bv\b/iu' => 'Barnevernstjenesten', '/fylkesnemnda/iu' => 'Fylkesnemnda', '/statsforvalter(?:en)?/iu' => 'Statsforvalteren', '/tingrett/iu' => 'Tingrett', '/lagmannsrett/iu' => 'Lagmannsrett', '/høyesterett|høyesterett/iu' => 'Høyesterett', '/\bnav\b/iu' => 'NAV', '/\bbup\b/iu' => 'BUP', '/\bppt\b/iu' => 'PPT', ]; foreach ($actors as $pattern => $actor) { if (preg_match($pattern, $line)) { return $actor; } } return 'unknown'; } public function redact( string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en', array $aliases = [], string $engine = 'azure_mini', string $outputFormat = 'contextual', bool $keepOfficials = false, array $exemptNames = [], array $redactTypes = [] ): array { $text = $this->requirePasteText($text); $mode = $mode === 'strict' ? 'strict' : 'standard'; $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic'; $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'regex'], true) ? $engine : 'azure_mini'; $outputFormat = in_array($outputFormat, ['contextual', 'generic', 'pseudonym'], true) ? $outputFormat : 'contextual'; // Normalise entity-type flags (all on by default) $doNames = ($redactTypes['names'] ?? true) !== false; $doOrgs = ($redactTypes['orgs'] ?? true) !== false; $doPlaces = ($redactTypes['places'] ?? true) !== false; $doDob = ($redactTypes['dob'] ?? true) !== false; // Pass 1 — deterministic regex [$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region); $pass1Total = array_sum($pass1Counts); $pass1Detail = $pass1Total ? implode(', ', array_map( fn($k, $v) => "{$k}: {$v}", array_keys(array_filter($pass1Counts, fn($v): bool => $v > 0)), array_filter($pass1Counts, fn($v): bool => $v > 0) )) : 'none detected'; $engineLabel = match ($engine) { 'azure_full' => 'Azure gpt-4o', 'gpu' => 'GPU (cuttlefish)', 'regex' => 'Regex only', default => 'Azure gpt-4o-mini', }; $trace = [ $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}. Engine: {$engineLabel}.", 'complete'), $this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'), ]; // Pass 2 — LLM semantic scan $finalRedacted = $preRedacted; $pass2Counts = []; $llmDeployment = null; $redactionMap = []; $llmResult = $this->llmRedactionPass( $preRedacted, $language, $aliases, $engine, $keepOfficials, $exemptNames, $doNames, $doOrgs, $doPlaces, $doDob ); if (!empty($llmResult['skipped'])) { $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'not configured') . '.', 'warning'); } elseif (!empty($llmResult['error'])) { $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning'); } else { $entities = $llmResult['entities'] ?? []; $llmDeployment = $llmResult['deployment'] ?? null; $applied = 0; $redactionMap = []; foreach ($entities as $entity) { if (!is_array($entity)) { continue; } $original = (string)($entity['original'] ?? ''); $type = (string)($entity['type'] ?? 'other'); $tag = (string)($entity['tag'] ?? '[IDENTIFIER]'); if ($original === '' || str_starts_with($original, '[')) { continue; } // Allow [ROLE: Name] format when keepOfficials is on, else require plain bracket tag if (!preg_match('/^\[[A-Za-z0-9_\- ]+(?::\s*[^\]]+)?\]$/', $tag)) { $tag = '[IDENTIFIER]'; } // Try word-boundary match first to avoid partial-word substitutions (e.g. "Per" inside "Persson") $escaped = preg_quote($original, '/'); $replaced = preg_replace('/\b' . $escaped . '\b/u', $tag, $finalRedacted); if ($replaced !== null && $replaced !== $finalRedacted) { $finalRedacted = $replaced; $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1; $applied++; if (!isset($redactionMap[$tag])) { $redactionMap[$tag] = ['originals' => [], 'type' => $type]; } if (!in_array($original, $redactionMap[$tag]['originals'], true)) { $redactionMap[$tag]['originals'][] = $original; } } elseif (str_contains($finalRedacted, $original)) { // Fallback for names adjacent to punctuation or non-word characters $finalRedacted = str_replace($original, $tag, $finalRedacted); $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1; $applied++; if (!isset($redactionMap[$tag])) { $redactionMap[$tag] = ['originals' => [], 'type' => $type]; } if (!in_array($original, $redactionMap[$tag]['originals'], true)) { $redactionMap[$tag]['originals'][] = $original; } } } // Add occurrence counts by scanning the final text foreach ($redactionMap as $tag => &$entry) { $entry['occurrences'] = substr_count($finalRedacted, $tag); } unset($entry); $pass2Detail = $applied > 0 ? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts)) : 'no additional entities found'; $trace[] = $this->trace('Pass 2 — LLM semantic scan', "{$engineLabel} reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete'); } // Apply output format post-processing $allCounts = array_merge($pass1Counts, $pass2Counts); if ($outputFormat === 'generic') { $finalRedacted = $this->applyGenericTags($finalRedacted); } elseif ($outputFormat === 'pseudonym') { $finalRedacted = $this->applyPseudonymization($finalRedacted, $allCounts); } $categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0)); $trace[] = $this->trace('Output format', match ($outputFormat) { 'generic' => 'All identifiers normalised to generic tags ([PERSON], [ORG], etc.).', 'pseudonym' => 'Identifiers replaced with plausible pseudonymous values.', default => 'Contextual role tags used (e.g. [FATHER], [JUDGE: Name]).', }, 'complete'); $trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning'); $trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete'); return [ 'tool' => 'redact', 'mode' => $mode, 'region' => $region, 'engine_used' => $engineLabel, 'output_format' => $outputFormat, 'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment || $engine === 'gpu' ? " and {$engineLabel} semantic scan" : '') . '.', 'redacted_text' => $finalRedacted, 'detected_entity_categories' => $categories, 'entity_counts' => $allCounts, 'redaction_map' => $redactionMap, 'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']], 'what_remains_uncertain' => ['Human review is still recommended for contextual identification.'], 'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.', 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => 1, 'source_count' => 1, 'deployment' => $llmDeployment ?? $engineLabel, ], 'disclaimer' => 'Privacy support tool. Review before disclosure.', ]; } private function requireFamilyPackage(int $clientId): array { $package = dbnToolsFetchPackage('family-legal'); if (!$package || empty($package['is_active'])) { dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable'); } if (!dbnToolsHasActiveSubscription($clientId, (int)$package['id'])) { dbnToolsAbort(dbnToolsProductName() . ' does not have an active family-legal subscription.', 503, 'subscription_missing'); } return $package; } /** * Pick the synthesis gateway + model for a persona. * - Persona pins a model (e.g. dbn-legal-agent-v3, gpt-4o) → route via LiteLLM * so any model registered on the gateway is reachable. * - No pinned model → existing Azure routing (gpt-4o / gpt-4o-mini by engine). * @return array{0: DbnAzureOpenAiGateway|DbnBedrockGateway, 1: string} */ private function personaGateway(array $persona, string $engine): array { $model = trim((string)($persona['model'] ?? '')); if ($model !== '') { try { return [new DbnBedrockGateway(['chat_model_name' => $model]), $model]; } catch (Throwable $e) { error_log('[dbn-persona] gateway init failed for model ' . $model . ': ' . $e->getMessage()); } } return [$this->azure, ($engine === 'azure_full') ? 'gpt-4o' : 'gpt-4o-mini']; } private function runJsonTool(string $prompt, string $language, int $maxTokens, ?array $persona = null): array { // With a persona, route to its pinned engine (Track-1 → tuned Qwen, Track-2 → gpt-4o) // and fold its domain framing into the system prompt. Without one (e.g. pasted-text // tools), keep the default Azure routing with the neutral base prompt. $personaPrompt = $persona['system_prompt'] ?? null; if ($persona !== null) { [$gateway, $model] = $this->personaGateway($persona, 'azure_mini'); $gateway = $gateway->withDeployment($model); } else { $gateway = $this->azure; } $raw = $gateway->chatText([ ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language, $personaPrompt)], ['role' => 'user', 'content' => $prompt], ], [ 'json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTokens, ]); $json = $gateway->decodeJsonObject($raw); if (!$json) { dbnToolsAbort('The model did not return valid structured JSON.', 502, 'invalid_json'); } return $json; } private function legalJsonSystemPrompt(string $language, ?string $personaPrompt = null): string { $locale = dbnToolsLanguageName($language); $product = dbnToolsProductName(); $personaPrompt = is_string($personaPrompt) ? trim($personaPrompt) : ''; // The persona (family, immigration, labour, …) supplies the domain framing; the // base prompt stays domain-neutral so non-family tracks are not cast as child-welfare. $personaBlock = $personaPrompt !== '' ? ($personaPrompt . "\n") : ''; return << $hit) { $n = $idx + 1; $lines[] = "[{$n}] Title: " . ($hit['title'] ?? 'Untitled'); if (!empty($hit['section'])) { $lines[] = "Section: " . $hit['section']; } $lines[] = "Corpus/package: " . ($hit['package_or_corpus'] ?? 'unknown'); $lines[] = "Excerpt: " . ($hit['excerpt'] ?? ''); } return implode("\n", $lines); } private function normalizeEvidenceTrail(mixed $trail, array $hits): array { if (!is_array($trail) || !$trail) { return array_map(fn(array $hit): array => [ 'title' => $hit['title'], 'citation' => $hit['title'], 'why_it_matters' => dbnToolsExcerpt($hit['excerpt'], 180), ], array_slice($hits, 0, 4)); } return array_values(array_filter($trail, 'is_array')); } private function sourceFromChunk(array $chunk, ?string $docSummary = null): array { $title = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'); $score = isset($chunk['similarity']) ? round((float)$chunk['similarity'], 4) : null; $rawExcerpt = dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620); return [ 'title' => $title, 'excerpt' => $docSummary ?? $rawExcerpt, 'chunk_text' => $rawExcerpt, 'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? dbnToolsProductName()), 'score' => $score, 'document_id' => isset($chunk['document_id']) ? (int)$chunk['document_id'] : null, 'chunk_id' => isset($chunk['id']) ? (int)$chunk['id'] : null, 'section' => $chunk['section_title'] ?? null, 'authority_type' => $chunk['authority_type'] ?? null, 'jurisdiction' => $chunk['jurisdiction'] ?? null, // Temporal annotations (present when temporal_mode = 'legal_conservative') 'temporal_state' => $chunk['temporal_state'] ?? null, 'temporal_kind' => $chunk['temporal_kind'] ?? null, 'temporal_reason' => $chunk['temporal_reason'] ?? null, 'currentness_warning' => $chunk['currentness_warning'] ?? null, 'valid_from' => $chunk['valid_from'] ?? null, 'valid_until' => $chunk['valid_until'] ?? null, 'is_current_version' => $chunk['is_current_version'] ?? null, ]; } private function fetchDocSummaries(array $docIds): array { if (!$docIds) { return []; } try { $db = dbnToolsRagDb(); $placeholders = implode(',', array_fill(0, count($docIds), '?')); $stmt = $db->prepare( "SELECT document_id, summary FROM doc_summaries WHERE document_id IN ({$placeholders}) AND summary != ''" ); $stmt->execute(array_values($docIds)); return array_column($stmt->fetchAll(PDO::FETCH_ASSOC), 'summary', 'document_id'); } catch (Throwable) { return []; } } private function citationConfidence(array $hits): string { if (!$hits) { return 'low'; } $scores = array_values(array_filter(array_map(fn(array $h) => $h['score'] ?? null, $hits), 'is_numeric')); $best = $scores ? max($scores) : 0; if (count($hits) >= 3 && $best >= 0.35) { return 'high'; } if (count($hits) >= 1) { return 'medium'; } return 'low'; } private function fallbackKeywordSearch(int $clientId, array $package, string $query, int $limit): array { $results = []; try { $results = array_merge($results, $this->fallbackPrivateSearch($clientId, $query, $limit)); } catch (Throwable $e) { error_log('DBN tools private fallback failed: ' . $e->getMessage()); } try { $remaining = max(1, $limit - count($results)); $results = array_merge($results, $this->fallbackSharedSearch($package, $query, $remaining)); } catch (Throwable $e) { error_log('DBN tools shared fallback failed: ' . $e->getMessage()); } return array_slice($results, 0, $limit); } private function fallbackPrivateSearch(int $clientId, string $query, int $limit): array { $db = dbnToolsDb(); $terms = $this->searchTerms($query); if (!$terms) { return []; } $clauses = []; $params = [':client_id' => $clientId]; foreach ($terms as $i => $term) { $key = ':term' . $i; $clauses[] = "(cc.content LIKE {$key} OR cd.title LIKE {$key})"; $params[$key] = '%' . $term . '%'; } $sql = 'SELECT cc.id, cc.document_id, cc.content, cd.title AS document_title, cd.category FROM client_chunks cc JOIN client_documents cd ON cc.document_id = cd.id WHERE cc.client_id = :client_id AND cd.status = "ready" AND (' . implode(' OR ', $clauses) . ') LIMIT ' . (int)$limit; $stmt = $db->prepare($sql); $stmt->execute($params); $rows = $stmt->fetchAll(PDO::FETCH_ASSOC); foreach ($rows as &$row) { $row['similarity'] = 0.25; $row['source_name'] = dbnToolsProductName() . ' private corpus'; $row['source_type'] = 'private'; } return $rows; } private function fallbackSharedSearch(array $package, string $query, int $limit): array { $ragDb = dbnToolsRagDb(); $terms = $this->searchTerms($query); if (!$terms) { return []; } $where = ['d.status = "ready"']; $params = []; if (!empty($package['corpus_id'])) { $where[] = 'd.corpus_id = ?'; $params[] = (int)$package['corpus_id']; } $cats = json_decode((string)($package['category_filter'] ?? '[]'), true) ?: []; if ($cats) { $where[] = 'd.category IN (' . implode(',', array_fill(0, count($cats), '?')) . ')'; $params = array_merge($params, $cats); } $langs = json_decode((string)($package['language_filter'] ?? '[]'), true) ?: []; if ($langs) { $where[] = 'd.language IN (' . implode(',', array_fill(0, count($langs), '?')) . ')'; $params = array_merge($params, $langs); } $termClauses = []; foreach ($terms as $term) { $termClauses[] = '(c.content LIKE ? OR d.title LIKE ?)'; $params[] = '%' . $term . '%'; $params[] = '%' . $term . '%'; } $where[] = '(' . implode(' OR ', $termClauses) . ')'; $sql = 'SELECT c.id, c.document_id, c.content, c.section_title, d.title AS document_title, d.category, d.language FROM chunks c JOIN documents d ON c.document_id = d.id WHERE ' . implode(' AND ', $where) . ' LIMIT ' . (int)$limit; $stmt = $ragDb->prepare($sql); $stmt->execute($params); $rows = $stmt->fetchAll(PDO::FETCH_ASSOC); foreach ($rows as &$row) { $row['similarity'] = 0.2; $row['source_name'] = (string)($package['name'] ?? 'family-legal'); $row['source_type'] = 'package'; } return $rows; } private function searchTerms(string $query): array { // Citation atoms first: "§ 4-12", "Art. 8(2)", "Rt. 2020 s. 1234" tokenize // to fragments shorter than the 3-char floor and get dropped, so a citation // query loses its only meaningful term (EDI Vol.1 #2, §2.1). Extract them // verbatim and route them ahead of the word tokens. $citations = $this->extractCitationAtoms($query); $parts = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: []; $stop = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og']; $terms = []; foreach ($parts as $part) { if (mb_strlen($part, 'UTF-8') < 3 || in_array($part, $stop, true)) { continue; } $terms[] = $part; } // Citation atoms are authoritative — prepend, keep verbatim, dedupe. $terms = array_merge($citations, $terms); return array_slice(array_values(array_unique($terms)), 0, 8); } /** * Extract exact legal-identifier substrings that must survive tokenization. * Each is kept as a whole LIKE term. For § sections we also emit spaced / * unspaced variants so "§4-12" matches stored "§ 4-12" and vice versa. * * @return string[] */ private function extractCitationAtoms(string $query): array { return self::citationAtoms($query); } /** * Static, reusable citation extractor (also used by api/corpus-search.php to * route identifier queries around the FULLTEXT tokenizer). * * @return string[] */ public static function citationAtoms(string $query): array { $patterns = [ '/§\s*\d+(?:-\d+)?[a-z]?/u', // § 4-12, § 1a '/\bArt(?:ikkel|icle|\.)?\s*\d+(?:\(\d+\))?/iu', // Art. 8, Article 3, Art. 8(2) '/\b3\d{4}[A-Z]\d{4}\b/', // EU CELEX: 32016R0679 '/\bRt[\.\s]*\d{4}[\.\s]*s[\.\s]*\d+/u', // Rt. 2020 s. 1234 '/\bHR-\d{4}-\d+(?:-[A-Z])?/u', // HR-2020-1789-A ]; $out = []; foreach ($patterns as $rx) { if (!preg_match_all($rx, $query, $m)) continue; foreach ($m[0] as $hit) { $hit = trim((string)$hit); if ($hit === '') continue; $out[$hit] = true; if (mb_strpos($hit, '§') !== false) { $out[preg_replace('/§\s*/u', '§ ', $hit)] = true; // force single space $out[preg_replace('/§\s*/u', '§', $hit)] = true; // no space } } } return array_keys($out); } private function requirePasteText(string $text, ?int $maxChars = null): string { $text = trim($text); if (mb_strlen($text, 'UTF-8') < 20) { dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short'); } $maxChars ??= self::MAX_PASTE_CHARS; if (mb_strlen($text, 'UTF-8') > $maxChars) { dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long'); } return $text; } private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array { $counts = []; $replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void { $text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string { $counts[$type] = ($counts[$type] ?? 0) + 1; return $token; }, $text) ?? $text; }; foreach ($this->getPatternPack($region) as $entry) { $replace($entry['pattern'], $entry['type'], $entry['replacement']); } // Structured role-label names (Barn: X, Mother: X, etc.) — universal $text = preg_replace_callback( '/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu', function (array $m) use (&$counts): string { $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1; return $m[1] . ': [PERSON]'; }, $text ) ?? $text; // Child-identifier phrases ("barnet heter X", "child named X") — universal $text = preg_replace_callback( '/\b(?:barnet|child|sønn|son|datter|daughter)\s+(?:heter|named|called)?\s*([A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,})\b/iu', function () use (&$counts): string { $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1; return '[CHILD_IDENTIFIER]'; }, $text ) ?? $text; if ($mode === 'strict') { $replace('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\s+[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\b/u', 'person_or_child_name', '[PERSON]'); } return [$text, $counts]; } private function getPatternPack(string $region): array { $nordic = [ ['pattern' => '/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', 'replacement' => '[EMAIL]', 'type' => 'email'], ['pattern' => '/(? '[FNR]', 'type' => 'fødselsnummer'], ['pattern' => '/(? '[PHONE]', 'type' => 'phone'], ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'], // Dates — must precede generic numeric patterns // Year range (e.g. 2011/2012, 2018-2019) ['pattern' => '/(? '[DATE]', 'type' => 'date'], // Norwegian DD.MM.YYYY and DD/MM/YYYY ['pattern' => '/(? '[DATE]', 'type' => 'date'], // ISO YYYY-MM-DD ['pattern' => '/(? '[DATE]', 'type' => 'date'], // DD. Month YYYY (e.g. "30. juli 2015") and Month YYYY (Norwegian + English) ['pattern' => '/\b(?:\d{1,2}\.?\s+)?(?:januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember|January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+(?:19|20)\d{2}\b/iu', 'replacement' => '[DATE]', 'type' => 'date'], // Year after Norwegian/English temporal preposition (lookbehind keeps preposition) ['pattern' => '/(?<=\b(?:i|fra|siden|innen|før|etter|rundt|omkring|cirka|in|from|since|until|before|after|around|circa)\s)(?:19|20)\d{2}(?![\d\/\-])/iu', 'replacement' => '[DATE]', 'type' => 'date'], ]; if ($region === 'nordic') { return $nordic; } $european = array_merge($nordic, [ // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX) ['pattern' => '/(? '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'], // Swedish personnummer full (YYYYMMDD-XXXX) ['pattern' => '/(? '[SE_PERSONNUMMER]', 'type' => 'se_personnummer'], // Danish/Finnish CPR / henkilötunnus — same format as short SE personnummer but included for clarity ['pattern' => '/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', 'replacement' => '[UK_NI]', 'type' => 'uk_ni'], // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds) ['pattern' => '/(? '[FR_INSEE]', 'type' => 'fr_insee'], // IBAN (2-letter country code + 2 check digits + up to 30 alphanumeric) ['pattern' => '/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', 'replacement' => '[IBAN]', 'type' => 'iban'], // European phone (international prefix for major EU/EEA country codes) ['pattern' => '/(? '[PHONE]', 'type' => 'phone'], // Street address expanded to European street-type keywords ['pattern' => '/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', 'replacement' => '[ADDRESS]', 'type' => 'address'], ]); if ($region === 'european') { return $european; } $echr = array_merge($european, [ // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages) ['pattern' => '/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', 'replacement' => '[ECHR_APP_NO]', 'type' => 'echr_app_no'], // Date of birth stated in judgment context ['pattern' => '/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'], ['pattern' => '/\bf\.\s*\d{4}\b/iu', 'replacement' => '[DOB]', 'type' => 'date_of_birth'], // National ID label patterns in multiple languages ['pattern' => '/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*[\w\s\-]+/iu', 'replacement' => '[NAT_ID]', 'type' => 'nat_id'], ]); if ($region === 'echr') { return $echr; } // global return array_merge($echr, [ // US Social Security Number ['pattern' => '/(? '[SSN]', 'type' => 'ssn'], // Document number in context (passport no., ID No., document no.) ['pattern' => '/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', 'replacement' => '[DOC_NO]', 'type' => 'doc_no'], ]); } private function llmRedactionPass( string $preRedacted, string $language = 'en', array $aliases = [], string $engine = 'azure_mini', bool $keepOfficials = false, array $exemptNames = [], bool $doNames = true, bool $doOrgs = true, bool $doPlaces = true, bool $doDob = true ): array { if ($engine === 'regex') { return ['skipped' => true, 'reason' => 'Regex-only mode selected']; } if ($engine !== 'gpu') { $missing = $this->azure->missingChatConfig(); if ($missing) { return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')']; } } $languageNote = $language === 'no' ? "\n • The document may contain Norwegian or mixed-language content." : ''; // Build alias block $aliasBlock = ''; if (!empty($aliases)) { $lines = []; foreach ($aliases as $a) { $orig = str_replace(["\n", "\r", '`', '"', '{', '}'], ' ', substr(trim((string)($a['original'] ?? '')), 0, 100)); $lbl = str_replace(["\n", "\r", '`', '"', '{', '}'], ' ', substr(trim((string)($a['alias'] ?? '')), 0, 100)); if ($orig !== '' && $lbl !== '') { $lines[] = " \"{$orig}\" → [{$lbl}]"; } } if ($lines) { $aliasBlock = "\n\nALIAS OVERRIDES — use these exact replacement tags for these specific names instead of inferring a role:\n" . implode("\n", $lines); } } // Build exempt names block $exemptBlock = ''; if (!empty($exemptNames)) { $quoted = array_map(fn($n) => '"' . str_replace(['"', "\n"], ['\\"', ' '], $n) . '"', array_slice($exemptNames, 0, 20)); $exemptBlock = "\n\nEXEMPT NAMES — these names must NOT be redacted under any circumstances:\n " . implode(', ', $quoted); } // Build entity-type restriction note $skipTypes = []; if (!$doOrgs) $skipTypes[] = 'organisation names'; if (!$doPlaces) $skipTypes[] = 'place names'; if (!$doDob) $skipTypes[] = 'dates of birth'; if (!$doNames) $skipTypes[] = 'person names'; $skipNote = $skipTypes ? "\n\nSKIP these entity types — do NOT redact them: " . implode(', ', $skipTypes) . '.' : ''; // Build officials note $officialsNote = ''; if ($keepOfficials) { $officialsNote = "\n\nOFFICIALS — for persons identified as JUDGE, ATTORNEY, EXPERT_WITNESS, or CASEWORKER in an official capacity: do NOT replace their name with a plain bracket tag. Instead use the format [ROLE: Name], e.g. [JUDGE: Andersen], [ATTORNEY: Skretting] or [EXPERT_WITNESS: Dr. Larsen]. Their name must remain visible inside the tag."; } $allowedTypesNote = ''; if (!$doNames) { $allowedTypesNote = "\n\nDo NOT include person_name entries in your output."; } $system = << 'system', 'content' => $system], ['role' => 'user', 'content' => $preRedacted], ]; $chatOptions = ['temperature' => 0.1, 'max_tokens' => 8000, 'json' => true, 'timeout' => 90]; try { if ($engine === 'gpu') { $response = $this->callGpuLlm($messages, $chatOptions); $deployLabel = 'GPU (cuttlefish)'; } elseif ($engine === 'azure_full') { $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOptions); $deployLabel = 'gpt-4o'; } else { $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOptions); $deployLabel = 'gpt-4o-mini'; } $content = (string)($response['choices'][0]['message']['content'] ?? ''); $json = $this->azure->decodeJsonObject($content); if (!is_array($json) || !array_key_exists('redactions', $json)) { return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure']; } return [ 'skipped' => false, 'entities' => is_array($json['redactions']) ? $json['redactions'] : [], 'deployment' => $deployLabel, ]; } catch (Throwable $e) { error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage()); return ['skipped' => false, 'entities' => [], 'error' => $e->getMessage()]; } } private function callGpuLlm(array $messages, array $options = []): array { return dbnToolsCallGpuLlm($messages, $options); } // ── Summarize: corpus context + engine-aware summary ───────────────────── /** * Search the shared legal corpus and return top-N passages as a formatted * context string. Returns '' on failure so the caller can degrade gracefully. */ public function corpusContextForSummarize(string $query, int $limit = 8, ?string $persona = null): string { try { $client = dbnToolsRequireClient(); $personaResolved = dbnToolsResolvePersona((int)$client['id'], $persona); $package = $personaResolved['package'] ?? $this->requireFamilyPackage((int)$client['id']); $packageIds = $personaResolved['package_ids'] ?: [(int)$package['id']]; $searchMethod = (string)($personaResolved['search_method'] ?? 'keyword') ?: 'keyword'; $personaRagOpts = is_array($personaResolved['rag_opts'] ?? null) ? $personaResolved['rag_opts'] : []; dbnToolsBootCaveau(); $gatewayUrl = 'http://10.0.1.10:4000'; try { $config = getConfig(); $u = trim((string)($config['ai_gateway']['url'] ?? '')); if ($u !== '') $gatewayUrl = $u; } catch (Throwable) {} $rag = new ClientRagPipeline((int)$client['id'], $gatewayUrl, 20); $chunks = $rag->searchAll($query, $limit, null, array_merge($personaRagOpts, [ 'search_private' => true, 'search_shared' => true, 'package_ids' => $packageIds, 'chunk_limit' => $limit, 'search_method' => $searchMethod, 'min_private' => 0, 'include_beta_website' => true, ])); $parts = []; foreach ($chunks as $c) { $title = (string)($c['title'] ?? ($c['source'] ?? 'Legal source')); $content = (string)($c['content'] ?? ($c['text'] ?? '')); if ($content !== '') { $parts[] = "=== {$title} ===\n{$content}"; } } return implode("\n\n", $parts); } catch (Throwable $e) { error_log('summarize corpus search failed: ' . $e->getMessage()); return ''; } } /** * Engine-aware structured summarization, optionally enriched with corpus context. */ public function summarizeWithContext( string $text, string $language = 'en', string $engine = 'azure_mini', string $corpusContext = '' ): array { $text = $this->requirePasteText($text); $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini'; $locale = dbnToolsLanguageName($language); $enriched = $text; $corpusUsed = $corpusContext !== ''; if ($corpusUsed) { $enriched = '[Relevant legal context from ' . dbnToolsProductName() . " corpus]\n" . $corpusContext . "\n\n---\n\nDocument to summarise:\n" . $text; } $prompt = <<legalJsonSystemPrompt($language); $messages = [ ['role' => 'system', 'content' => $system], ['role' => 'user', 'content' => $prompt], ]; $maxTok = ($engine === 'azure_full') ? 8000 : 4000; $chatOpts = ['json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTok, 'timeout' => 120]; $deployLabel = $this->azure->chatDeployment(); try { if ($engine === 'gpu') { $response = $this->callGpuLlm($messages, $chatOpts); $deployLabel = 'GPU (local)'; } elseif ($engine === 'azure_full') { $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOpts); $deployLabel = 'gpt-4o'; } else { $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOpts); $deployLabel = 'gpt-4o-mini'; } } catch (Throwable $e) { dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error'); } $raw = (string)($response['choices'][0]['message']['content'] ?? ''); $json = $this->azure->decodeJsonObject($raw); if (!$json) { dbnToolsAbort('LLM returned unparseable JSON.', 502, 'llm_parse_error'); } $corpusNote = $corpusUsed ? 'Summary enriched with ' . count(array_filter(explode('=== ', $corpusContext))) . ' passage(s) from the ' . dbnToolsProductName() . ' legal corpus.' : 'No corpus search performed; summarised from document text only.'; $trace = [ $this->trace('Document preparation', 'Text validated and prepared for summarisation.', 'complete'), $this->trace('Corpus enrichment', $corpusNote, $corpusUsed ? 'complete' : 'complete'), $this->trace('Summary generation', 'Structured summary generated via ' . $deployLabel . '.', 'complete'), $this->trace('Uncertainty', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete'), $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the summary against the original document.'), 'complete'), ]; return [ 'tool' => 'summarize', 'language' => $language, 'what_we_found' => (string)($json['what_we_found'] ?? ''), 'key_facts' => $json['key_facts'] ?? [], 'dates' => $json['dates'] ?? [], 'parties' => $json['parties'] ?? [], 'legal_references_detected' => $json['legal_references_detected'] ?? [], 'what_remains_uncertain' => $json['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($json['next_practical_step'] ?? ''), 'corpus_used' => $corpusUsed, 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => 1, 'source_count' => 1, 'deployment' => $deployLabel, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } private function applyGenericTags(string $text): string { // Collapse contextual role tags (e.g. [FATHER], [JUDGE: Andersen], [CHILD_1]) → [PERSON] $text = preg_replace('/\[(?:FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY(?::\s*[^\]]+)?|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u', '[PERSON]', $text) ?? $text; return $text; } private function applyPseudonymization(string $text, array $allCounts): string { $norwegianNames = [ 'Ola Nordmann', 'Per Hansen', 'Kari Larsen', 'Anne Berg', 'Erik Dahl', 'Ingrid Holm', 'Lars Moen', 'Silje Bakke', 'Tor Haugen', 'Eva Strand', ]; $nameCursor = 0; $phoneBase = 1; $emailCursor = 0; $addrCursor = 1; $orgCursor = 1; $personMap = []; // Replace named role tags (keeping consistent mapping per unique tag) $text = preg_replace_callback( '/\[(FATHER|MOTHER|CHILD(?:_\d+)?|GRANDPARENT|SIBLING|ATTORNEY(?::\s*[^\]]+)?|JUDGE(?::\s*[^\]]+)?|CASEWORKER(?::\s*[^\]]+)?|EXPERT_WITNESS(?::\s*[^\]]+)?|PERSON(?:_\d+)?)\]/u', function (array $m) use (&$nameCursor, &$personMap, $norwegianNames): string { $key = $m[1]; if (!isset($personMap[$key])) { $personMap[$key] = $norwegianNames[$nameCursor % count($norwegianNames)]; $nameCursor++; } return $personMap[$key]; }, $text ) ?? $text; $text = preg_replace_callback('/\[PHONE\]/', function () use (&$phoneBase): string { return sprintf('+47 400 00 %03d', $phoneBase++); }, $text) ?? $text; $text = preg_replace_callback('/\[EMAIL\]/', function () use (&$emailCursor): string { $letter = chr(ord('a') + ($emailCursor % 26)); $emailCursor++; return "person.{$letter}@example.no"; }, $text) ?? $text; $text = preg_replace_callback('/\[ADDRESS\]/', function () use (&$addrCursor): string { return "Eksempelveien {$addrCursor}, 0001 Oslo"; }, $text) ?? $text; $text = preg_replace_callback('/\[ORG\]/', function () use (&$orgCursor): string { return "Eksempel AS ({$orgCursor})"; }, $text) ?? $text; $text = preg_replace_callback('/\[FNR\]/', function (): string { return '010100XXXXX'; }, $text) ?? $text; $text = preg_replace_callback('/\[(?:SE_PERSONNUMMER|FR_INSEE|UK_NI|SSN|NAT_ID|DOC_NO|ECHR_APP_NO)\]/', function (): string { return '[ID-REDACTED]'; }, $text) ?? $text; $text = preg_replace_callback('/\[PLACE\]/', function (): string { return 'Eksempelby'; }, $text) ?? $text; $text = preg_replace_callback('/\[DOB\]/', function (): string { return '01.01.0000'; }, $text) ?? $text; $text = preg_replace_callback('/\[IBAN\]/', function (): string { return 'NO00 0000 00 00000'; }, $text) ?? $text; return $text; } private function uncertaintySummary(mixed $uncertainty): string { if (is_array($uncertainty)) { $uncertainty = implode(' ', array_map('strval', $uncertainty)); } $uncertainty = trim((string)$uncertainty); return $uncertainty !== '' ? dbnToolsExcerpt($uncertainty, 220) : 'No additional uncertainty was supplied by the tool.'; } private function trace(string $label, string $detail, string $status = 'complete'): array { return [ 'label' => $label, 'detail' => $detail, 'status' => $status, ]; } }