azure = $azure ?: DbnGatewayFactory::makeForTool('deep-research'); } public function run( string $seedQuery, string $pastedText, array $uploadedFiles, array $sliceSelection, string $engine, string $language, array $controls, ?callable $emit = null, string $advocateRole = '', ?array $priorContext = null, string $branchNotes = '', array $subQuestionsOverride = [], ?string $persona = null ): array { $seedQuery = trim($seedQuery); $pastedText = trim($pastedText); $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'dbn_legal', 'dbn_legal_v3', 'claude_sonnet', 'claude_haiku'], true) ? $engine : 'azure_mini'; $language = dbnToolsNormalizeUiLanguage($language); $controls = $this->normalizeControls($controls); if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) { dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed'); } $client = dbnToolsRequireClient(); $personaResolved = dbnToolsResolvePersona((int)$client['id'], $persona); $packageIds = array_values(array_filter( array_map('intval', $personaResolved['package_ids'] ?? []), static fn(int $id): bool => $id > 0 )); if (!$packageIds) { // Persona resolved without a package → fall back to the legacy family package. $packageIds = [(int)$this->requireFamilyPackage((int)$client['id'])['id']]; } dbnToolsBootCaveau(); $aiPortalRoot = dbnToolsAiPortalRoot(); require_once $aiPortalRoot . 
'/platform/includes/dbn_v6.php'; $this->uploadVecs = []; $this->stepTimings = []; $trace = []; $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles); $emitStep = function (string $stepId, string $label, string $detail, string $status) use (&$trace, $emit): void { $trace[] = $this->trace($label, $detail, $status); if ($emit) { $emit('step', [ 'step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => $status, ]); } }; $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void { if ($emit) { $emit('step', [ 'step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => 'running', ]); } }; // STEP 1: Query interpretation $emitRunning('interpretation', 'Query interpretation', 'Summarising the seed input…'); $stepStart = microtime(true); $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole, $priorContext, $branchNotes); $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart); $emitStep('interpretation', 'Query interpretation', $interpretation['detail'], 'complete'); // STEP 2: Query expansion (or use caller-supplied override) $stepStart = microtime(true); if (!empty($subQuestionsOverride)) { $subQuestions = array_values(array_filter($subQuestionsOverride, fn($sq) => is_array($sq) && !empty(trim((string)($sq['question'] ?? ''))) )); $this->stepTimings['expansion'] = $this->elapsedMs($stepStart); $emitStep('expansion', 'Query expansion', sprintf('Using %d custom sub-question(s) supplied by the user.', count($subQuestions)), 'complete'); } else { $emitRunning('expansion', 'Query expansion', 'Generating sub-questions…'); $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $interpretation['key_signals'], $controls['sub_q_count'], $language, $advocateRole); $this->stepTimings['expansion'] = $this->elapsedMs($stepStart); $subQuestions = $expansion['questions']; $expansionStatus = $expansion['fallback'] ? 
'warning' : 'complete'; $expansionDetail = $expansion['fallback'] ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.' : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions)); $emitStep('expansion', 'Query expansion', $expansionDetail, $expansionStatus); } // STEP 3: Slice resolution $emitRunning('slice_resolution', 'Slice resolution', 'Resolving slice toggles to document IDs…'); $stepStart = microtime(true); $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection); if (!array_filter($sliceSelectionNormalized)) { dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices'); } $ragDb = dbnToolsRagDb(); try { $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized); $sliceStatus = 'complete'; $sliceDetail = sprintf( '%d slice(s) active → %d candidate documents constrain the corpus search.', count(array_filter($sliceSelectionNormalized)), count($sharedDocIds) ); } catch (Throwable $e) { error_log('DBN deep research slice resolve failed: ' . $e->getMessage()); $sharedDocIds = []; $sliceStatus = 'warning'; $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.'; } $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart); $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus); // STEP 4: Upload indexing (in-memory, ephemeral) $emitRunning('upload_indexing', 'Upload indexing', empty($uploadedFiles) ? 'No uploads; skipping…' : sprintf('Chunking + embedding %d file(s) in memory…', count($uploadedFiles))); $stepStart = microtime(true); $uploadChunks = []; foreach ($uploadedFiles as $idx => $file) { $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1)); // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size $text = mb_substr((string)($file['text'] ?? 
''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8'); $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx)); } $uploadStatus = 'complete'; $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks)); if ($uploadChunks) { try { // Embed in small batches of 5, emitting progress between each so the stream // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe). $texts = array_map(fn(array $c) => $c['text'], $uploadChunks); $allVecs = []; $batchSize = 5; for ($b = 0; $b < count($texts); $b += $batchSize) { $batch = array_slice($texts, $b, $batchSize); if ($emit) { $emit('progress', ['detail' => sprintf( 'Embedding chunks %d–%d of %d…', $b + 1, $b + count($batch), count($texts) )]); } $batchVecs = dbnToolsLiteLLMEmbedBatch($batch); $allVecs = array_merge($allVecs, $batchVecs); } $vecs = $allVecs; if (count($vecs) === count($uploadChunks)) { foreach ($uploadChunks as $i => $chunk) { $this->uploadVecs[] = [ 'meta' => $chunk, 'vec' => $vecs[$i], ]; } } else { $uploadStatus = 'warning'; $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.'; } } catch (Throwable $e) { error_log('DBN deep research upload embed failed: ' . $e->getMessage()); $uploadStatus = 'warning'; $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.'; $this->uploadVecs = []; } } elseif (empty($uploadedFiles)) { $uploadDetail = 'No files uploaded; agent will research the corpus only.'; } $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart); $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus); // STEP 5: Retrieval (per sub-question) $retrievalQueries = $subQuestions ?: [[ 'id' => 'q1', 'question' => $seedQuery !== '' ? 
$seedQuery : ($interpretation['brief'] ?: 'legal research'), 'rationale' => 'Seed query (no sub-question expansion).', ]]; $emitRunning('retrieval', 'Retrieval', sprintf('Hybrid vector + keyword + rerank across %d sub-question(s)…', count($retrievalQueries))); $stepStart = microtime(true); try { $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60); } catch (Throwable $e) { dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed'); } $rawPool = []; $retrievalWarnings = 0; $rawCorpusCount = 0; $rawUploadCount = 0; $filteredOutCount = 0; foreach ($retrievalQueries as $idx => $sq) { if ($emit) { $emit('subq', [ 'index' => $idx + 1, 'total' => count($retrievalQueries), 'id' => $sq['id'], 'question' => $sq['question'], ]); } try { $corpusChunks = $rag->searchAll( $sq['question'], $controls['chunk_limit'], null, [ 'search_private' => false, 'search_shared' => true, 'package_ids' => $packageIds, 'shared_doc_ids' => $sharedDocIds, 'chunk_limit' => $controls['chunk_limit'], 'search_method' => 'hybrid', 'reranker_enabled' => true, 'include_beta_website' => false, 'include_primary_website'=> false, ] ); } catch (Throwable $e) { error_log('DBN deep research sub-Q retrieval failed: ' . 
$e->getMessage()); $corpusChunks = []; $retrievalWarnings++; } $rawCorpusCount += count($corpusChunks); foreach ($corpusChunks as $chunk) { if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) { $filteredOutCount++; continue; } $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']); } // Upload chunk retrieval via cosine sim if (!empty($this->uploadVecs)) { $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']); $rawUploadCount += count($uploadHits); foreach ($uploadHits as $hit) { $hit['matched_sub_questions'] = [$sq['id']]; $rawPool[] = $hit; } } } $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP); $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart); $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete'; $retrievalDetail = sprintf( '%d sub-question(s) × hybrid + RRF + rerank → %d corpus chunks (%d filtered) + %d upload hits → %d unique after dedupe.', count($retrievalQueries), $rawCorpusCount, $filteredOutCount, $rawUploadCount, count($merged) ); $emitStep('retrieval', 'Retrieval', $retrievalDetail, $retrievalStatus); // Cap pool to reranker top-K for synthesis $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']); // Hydrate corpus sources with source_url + authority_label via batched dbn_v6 query $this->hydrateSourceUrls($synthesisPool); $numberedSources = $this->numberSources($synthesisPool); $retrievalCounts = [ 'raw_corpus' => $rawCorpusCount, 'filtered_website' => $filteredOutCount, 'post_filter_corpus' => $rawCorpusCount - $filteredOutCount, 'raw_upload' => $rawUploadCount, 'after_dedupe' => count($merged), 'after_topk' => count($numberedSources), ]; // STEP 6: Synthesis $synthesisEngineLabel = $engine === 'azure_full' ? 'Azure gpt-4o' : ($engine === 'gpu' ? 
'GPU qwen2.5:14b' : 'Azure gpt-4o-mini'); $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising cited brief with %s — this is the slowest step…', $synthesisEngineLabel)); $stepStart = microtime(true); // Attach upload summaries (generated lazily) to numbered sources if (!empty($uploadedFiles) && !empty($numberedSources)) { $uploadSummaries = []; foreach ($uploadedFiles as $idx => $file) { $text = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8'); $filename = (string)($file['filename'] ?? "file-{$idx}"); if ($text === '') continue; try { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'], ['role' => 'user', 'content' => "Summarise this document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"], ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 20]); $uploadSummaries[$idx] = trim($raw); } catch (Throwable $e) { error_log('DBN upload summary gen failed for file ' . $idx . ': ' . $e->getMessage()); $uploadSummaries[$idx] = null; } } foreach ($numberedSources as &$src) { if (($src['source_origin'] ?? '') !== 'upload') continue; if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) { $src['summary'] = $uploadSummaries[(int)$m[1]] ?? null; } } unset($src); } $synthesis = $this->synthesise( $seedDescription, $interpretation['brief'], $retrievalQueries, $numberedSources, $engine, $language, $controls['temperature'], $advocateRole, $priorContext, $branchNotes, $interpretation['key_signals'] ?? 
[] ); $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart); $emitStep( 'synthesis', 'Synthesis', sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)), 'complete' ); // STEP 7: Confidence $confidence = $this->citationConfidence($numberedSources); $emitStep( 'confidence', 'Citation confidence', sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)), $confidence === 'low' ? 'warning' : 'complete' ); // Stitch sub-question chunk_ids + top_sources (top 3 sources matched by each sub-Q) $subQOut = []; foreach ($retrievalQueries as $sq) { $matchedChunks = array_values(array_filter( $numberedSources, fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true) )); $topSources = array_slice($matchedChunks, 0, 3); $subQOut[] = [ 'id' => $sq['id'], 'question' => $sq['question'], 'rationale' => $sq['rationale'] ?? '', 'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)), 'top_sources' => array_map(fn(array $s) => [ 'n' => $s['n'] ?? null, 'title' => $s['title'] ?? '', 'section' => $s['section'] ?? null, 'deep_link' => $s['deep_link'] ?? $s['source_url'] ?? null, 'source_url' => $s['source_url'] ?? null, 'source_origin' => $s['source_origin'] ?? 'corpus', 'authority_label'=> $s['authority_label'] ?? null, 'graph_expanded' => $s['graph_expanded'] ?? false, 'excerpt' => $s['excerpt'] ?? '', ], $topSources), ]; } $isAdvocate = $advocateRole !== ''; return [ 'tool' => $isAdvocate ? 'advocate' : 'deep_research', 'language' => $language, 'advocate_role' => $isAdvocate ? $advocateRole : null, 'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''), 'client_strengths' => $isAdvocate ? ($synthesis['json']['client_strengths'] ?? []) : null, 'opposing_weaknesses' => $isAdvocate ? ($synthesis['json']['opposing_weaknesses'] ?? 
[]) : null,
            'sub_questions' => $subQOut,
            'sources' => $numberedSources,
            'what_we_found' => (string)($synthesis['json']['what_we_found'] ?? ''),
            'evidence_trail' => $numberedSources,
            'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
            'next_practical_step' => (string)($synthesis['json']['next_practical_step'] ?? ''),
            'trace' => $trace,
            'trace_metadata' => [
                'chunk_count' => count($merged),
                'source_count' => count($numberedSources),
                'sub_question_count' => count($retrievalQueries),
                'upload_chunk_count' => count($this->uploadVecs),
                'deployment' => $synthesis['deploy_label'],
                'engine_used' => $engine,
                'citation_confidence' => $confidence,
                'elapsed_ms_per_step' => $this->stepTimings,
                'retrieval_counts' => $retrievalCounts,
                'slices_active' => array_keys(array_filter($sliceSelectionNormalized)),
            ],
            'disclaimer' => dbnToolsDisclaimer($language),
        ];
    }

    /**
     * Coerce caller-supplied tuning controls to their expected scalar types and
     * clamp each one to its supported range. Missing keys fall back to a default.
     *
     *   sub_q_count           int    3..5       default 4
     *   chunk_limit           int    4..10      default 6
     *   similarity_threshold  float  0.2..0.6   default 0.30
     *   reranker_top_k        int    8..14      default 12
     *   temperature           float  0.05..0.4  default 0.10
     *
     * @param array $controls Raw controls; unknown keys are ignored.
     * @return array Exactly the five keys listed above, in that order.
     */
    private function normalizeControls(array $controls): array
    {
        // Step 1: apply defaults and cast.
        $subQCount   = (int)($controls['sub_q_count'] ?? 4);
        $chunkLimit  = (int)($controls['chunk_limit'] ?? 6);
        $threshold   = (float)($controls['similarity_threshold'] ?? 0.30);
        $topK        = (int)($controls['reranker_top_k'] ?? 12);
        $temperature = (float)($controls['temperature'] ?? 0.10);

        // Step 2: clamp into the supported window.
        $normalized = [];
        $normalized['sub_q_count'] = max(3, min(5, $subQCount));
        $normalized['chunk_limit'] = max(4, min(10, $chunkLimit));
        $normalized['similarity_threshold'] = max(0.2, min(0.6, $threshold));
        $normalized['reranker_top_k'] = max(8, min(14, $topK));
        $normalized['temperature'] = max(0.05, min(0.4, $temperature));

        return $normalized;
    }

    /**
     * Fetch the 'family-legal' package and abort with a 503 when the package is
     * missing/inactive or the client has no active subscription to it.
     */
    private function requireFamilyPackage(int $clientId): array
    {
        $package = dbnToolsFetchPackage('family-legal');
        if (!$package || empty($package['is_active'])) {
            dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
        }
        if (!dbnToolsHasActiveSubscription($clientId, (int)$package['id'])) {
            dbnToolsAbort(dbnToolsProductName() .
' does not have an active family-legal subscription.', 503, 'subscription_missing'); } return $package; } private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string { $parts = []; if ($seedQuery !== '') { $parts[] = "Question:\n" . mb_substr($seedQuery, 0, self::MAX_SEED_CHARS, 'UTF-8'); } if ($pastedText !== '') { $parts[] = "Pasted text:\n" . mb_substr($pastedText, 0, self::MAX_SEED_CHARS, 'UTF-8'); } foreach ($uploadedFiles as $idx => $file) { $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1)); $text = (string)($file['text'] ?? ''); if ($text === '') { continue; } $parts[] = sprintf("Uploaded file [%s]:\n%s", $filename, mb_substr($text, 0, self::MAX_UPLOAD_CHARS, 'UTF-8')); } return implode("\n\n", $parts); } private function interpretSeed(string $seedDescription, string $language, string $advocateRole = '', ?array $priorContext = null, string $branchNotes = ''): array { $locale = dbnToolsLanguageName($language); $rolePrefix = $advocateRole !== '' ? "You are preparing a case-research brief for: {$advocateRole}. Frame your interpretation to identify the strongest legal angles for this party.\n\n" : ''; $priorContextBlock = ''; if (!empty($priorContext)) { $parts = ['Prior research context:']; if (!empty($priorContext['original_query'])) { $parts[] = 'Original question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8'); } if (!empty($priorContext['what_we_found'])) { $parts[] = 'Key findings: ' . mb_substr((string)$priorContext['what_we_found'], 0, 400, 'UTF-8'); } if ($branchNotes !== '') { $parts[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8'); } $priorContextBlock = implode("\n", $parts) . "\n\nNow investigate this branch:\n"; } $product = dbnToolsProductName(); $prompt = << 'system', 'content' => 'You return valid JSON only. 
No markdown fences.']; $userMsg = ['role' => 'user', 'content' => $prompt]; if ($language === 'no' || $advocateRole !== '') { $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [ 'model' => 'dbn-legal-agent-v2', 'json' => true, 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 40, ]); $raw = (string)($resp['choices'][0]['message']['content'] ?? ''); } else { $interpGateway = ($this->azure instanceof DbnBedrockGateway) ? $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU) : $this->azure; $raw = $interpGateway->chatText([$sysMsg, $userMsg], ['json' => true, 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 60]); } $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && !empty($json['brief'])) { $signals = is_array($json['key_signals'] ?? null) ? array_slice($json['key_signals'], 0, 8) : []; $signalText = $signals ? implode(', ', $signals) : ''; return [ 'brief' => (string)$json['brief'], 'key_signals' => $signals, 'detail' => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''), ]; } } catch (Throwable $e) { error_log('DBN deep research interpretation failed: ' . $e->getMessage()); } return [ 'brief' => '', 'key_signals' => [], 'detail' => 'Interpretation step skipped — proceeding with raw seed input.', ]; } private function expandQueries(string $seedDescription, string $brief, array $keySignals, int $targetCount, string $language, string $advocateRole = ''): array { $locale = dbnToolsLanguageName($language); $anchorsLine = !empty($keySignals) ? "\nKey retrieval anchors (incorporate these terms into your sub-questions where relevant):\n" . implode(', ', $keySignals) . "\n" : ''; // Truncate seed to 2000 chars — $brief already captures the key context; // the full upload text (up to 192K chars) would push past the 60s timeout. 
$seedExcerpt = mb_strimwidth($seedDescription, 0, 2000, '…', 'UTF-8'); if ($advocateRole !== '') { $prompt = << 'system', 'content' => 'You return valid JSON only. No markdown fences.']; $userMsg = ['role' => 'user', 'content' => $prompt]; if ($language === 'no') { $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [ 'model' => 'dbn-legal-agent-v2', 'json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 50, ]); $raw = (string)($resp['choices'][0]['message']['content'] ?? ''); } else { $expGateway = ($this->azure instanceof DbnBedrockGateway) ? $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU) : $this->azure; $raw = $expGateway->chatText([$sysMsg, $userMsg], ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 60]); } $json = $this->azure->decodeJsonObject($raw); $items = is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : []; $normalized = []; foreach ($items as $i => $item) { if (!is_array($item) || empty($item['question'])) { continue; } $normalized[] = [ 'id' => 'q' . ($i + 1), 'question' => trim((string)$item['question']), 'rationale' => trim((string)($item['rationale'] ?? '')), ]; if (count($normalized) >= $targetCount) break; } if (count($normalized) >= 2) { return ['questions' => $normalized, 'fallback' => false]; } } catch (Throwable $e) { error_log('DBN deep research expansion failed: ' . $e->getMessage()); } return ['questions' => [], 'fallback' => true]; } private function splitIntoChunks(string $text, string $filename, int $fileIdx): array { $text = preg_replace('/\s+/u', ' ', trim($text)) ?? 
''; if ($text === '') { return []; } $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: []; if (!$words) { return []; } $chunks = []; $i = 0; $chunkIdx = 0; $total = count($words); while ($i < $total) { $slice = array_slice($words, $i, self::CHUNK_WORDS); if (count($slice) >= self::MIN_CHUNK_WORDS || $i === 0) { $chunks[] = [ 'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $chunkIdx), 'file_index' => $fileIdx, 'chunk_index'=> $chunkIdx, 'filename' => $filename, 'text' => implode(' ', $slice), ]; $chunkIdx++; } $advance = self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS; if ($advance < 1) $advance = 1; $i += $advance; if (count($slice) < self::CHUNK_WORDS) { break; } } return $chunks; } private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array { if (empty($this->uploadVecs)) { return []; } try { $qVec = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? []; } catch (Throwable $e) { error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage()); return []; } if (empty($qVec)) { return []; } $scored = []; foreach ($this->uploadVecs as $entry) { $sim = $this->cosineSim($qVec, $entry['vec']); if ($sim < $threshold) { continue; } $scored[] = [ 'chunk_id' => $entry['meta']['chunk_id'], 'title' => 'uploaded: ' . 
$entry['meta']['filename'],
                'section' => null,
                'package_or_corpus' => 'Your upload',
                'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 950),
                'chunk_text' => $entry['meta']['text'],
                'similarity' => round($sim, 4),
                'reranker_score' => null,
                'document_id' => null,
                'source_origin' => 'upload',
                'authority_type' => null,
                'jurisdiction' => null,
            ];
        }
        usort($scored, fn(array $a, array $b) => ($b['similarity'] <=> $a['similarity']));
        $keep = (int)ceil($limitPerSubQ / 2);
        return array_slice($scored, 0, max(1, $keep));
    }

    /**
     * Cosine similarity between two numeric vectors.
     *
     * Only the first min(count($a), count($b)) components are compared, so a
     * length mismatch truncates the longer vector instead of failing. Returns
     * 0.0 when either vector is empty or has zero magnitude over that range.
     *
     * @param array $a First vector; read by integer offsets 0..n-1.
     * @param array $b Second vector; read by integer offsets 0..n-1.
     * @return float dot(a, b) / (|a| * |b|), or 0.0 for the degenerate cases.
     */
    private function cosineSim(array $a, array $b): float
    {
        $dims = min(count($a), count($b));
        if ($dims === 0) {
            return 0.0;
        }

        $dotProduct = 0.0;
        $sumSqA = 0.0;
        $sumSqB = 0.0;
        $idx = 0;
        while ($idx < $dims) {
            $left = (float)$a[$idx];
            $right = (float)$b[$idx];
            $dotProduct += $left * $right;
            $sumSqA += $left * $left;
            $sumSqB += $right * $right;
            $idx++;
        }

        // A zero-magnitude vector has no direction; report "no similarity".
        if ($sumSqA === 0.0 || $sumSqB === 0.0) {
            return 0.0;
        }

        return $dotProduct / (sqrt($sumSqA) * sqrt($sumSqB));
    }

    /**
     * Map a raw corpus retrieval row onto the source shape used by the rest of
     * the pipeline, tagging it with the sub-question that retrieved it. The
     * URL/label fields are initialised to null here.
     */
    private function normalizeCorpusChunk(array $chunk, string $subQId): array
    {
        $similarity = isset($chunk['similarity']) ? round((float)$chunk['similarity'], 4) : null;
        $rerankerScore = isset($chunk['reranker_score']) ? round((float)$chunk['reranker_score'], 4) : null;
        return [
            'chunk_id' => isset($chunk['id']) ? (int)$chunk['id'] : null,
            'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
            'section' => $chunk['section_title'] ?? null,
            'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
            'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 950),
            'chunk_text' => (string)($chunk['content'] ?? ''),
            'similarity' => $similarity,
            'reranker_score' => $rerankerScore,
            'document_id' => isset($chunk['document_id']) ? (int)$chunk['document_id'] : null,
            'graph_expanded' => !empty($chunk['_graph_expanded']),
            'source_origin' => 'corpus',
            'authority_type' => $chunk['authority_type'] ?? null,
            'jurisdiction' => $chunk['jurisdiction'] ?? null,
            'publication_year' => $chunk['publication_year'] ??
null,
            // Filled in later by hydrateSourceUrls()
            'source_url' => null,
            'deep_link' => null,
            'authority_label' => null,
            'corpus_source_name'=> null,
            'publication_date' => null,
            'matched_sub_questions' => [$subQId],
        ];
    }

    /**
     * Post-retrieval filter: drop chunks that don't belong in a family-law research pass.
     *
     * EU AI Act (source_id=7, EUR-Lex) leaks through when the Qdrant search runs
     * unconstrained (e.g. empty shared_doc_ids on slice-resolution failure). It is
     * never relevant to Norwegian family law and is always excluded.
     *
     * DBN website pages (Resource Directory, Flashcards, etc.) are indexed with
     * NULL source_id and score artificially high on broad queries. They are excluded
     * unless the dbn_resources slice is explicitly ON.
     *
     * @param array $chunk        Raw retrieval row; reads `source_name`,
     *                            `document_title` (falling back to `title`) and `source_url`.
     * @param array $activeSlices Slice toggles keyed by slice name; only
     *                            `dbn_resources` is consulted here.
     * @return bool True when the chunk must be dropped from the pool.
     */
    private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
    {
        $name  = strtolower((string)($chunk['source_name'] ?? ''));
        $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
        $url   = strtolower((string)($chunk['source_url'] ?? ''));

        // EU AI Act — never relevant to family law research
        if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) {
            return true;
        }
        if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) {
            return true;
        }

        // DBN website pages — allow through only when dbn_resources slice is ON.
        //
        // The two patterns containing an en dash carry the `u` modifier. Without
        // it PCRE reads the character class byte-wise, so `[-–|]` also matches the
        // lone bytes 0xE2 / 0x80 / 0x93 and flags unrelated titles (curly quotes,
        // em dashes, Cyrillic "Г", ...). With `u`, preg_match() returns false on a
        // malformed UTF-8 subject, which this chain treats as "no match".
        $isDbnPage = (
            str_contains($name, 'website')
            || str_contains($title, 'dobetternorge.no')
            || preg_match('/^(homepage|landing|about |contact )/i', $title)
            || str_contains($title, 'resource directory')
            || preg_match('/^flashcards?\s*[-–|]/iu', $title)
            || preg_match('/\|\s*do better norge\s*$/i', $title)
            || preg_match('/[-–]\s*do better norge\s*$/iu', $title)
        );
        if ($isDbnPage) {
            return !($activeSlices['dbn_resources'] ?? false);
        }

        return false;
    }

    /**
     * Hydrate the synthesisPool in place with source_url/deep_link/authority_label/etc.
* Uses a direct query against bnl_corpus.documents (only columns that exist there — * the temporal columns added in migration 136 are absent on this instance). */ private function hydrateSourceUrls(array &$pool): void { $docIds = []; foreach ($pool as $chunk) { if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue; $docId = (int)($chunk['document_id'] ?? 0); if ($docId > 0) $docIds[$docId] = true; } if (empty($docIds)) return; try { $ragDb = dbnToolsRagDb(); $ids = array_keys($docIds); $ph = implode(',', array_fill(0, count($ids), '?')); $stmt = $ragDb->prepare(" SELECT d.id, d.title, d.source_url, d.authority_type, d.publication_date, d.source_id, d.jurisdiction, d.summary, LEFT(d.content, 4000) AS content_excerpt FROM documents d WHERE d.id IN ({$ph}) "); $stmt->execute($ids); $docMeta = []; $sourceIds = []; foreach ($stmt as $row) { $dId = (int)$row['id']; $sid = isset($row['source_id']) ? (int)$row['source_id'] : null; if ($sid) $sourceIds[] = $sid; $docMeta[$dId] = [ 'source_url' => $row['source_url'] ?? null, 'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null), 'publication_date' => $row['publication_date'] ?? null, 'corpus_source_name' => 'Do Better Legal', 'source_id' => $sid, 'summary' => $row['summary'] ?? null, 'content_excerpt' => (string)($row['content_excerpt'] ?? ''), 'title' => (string)($row['title'] ?? ''), ]; } // Lazily generate summaries for documents that don't have one yet $unsummarized = array_filter($docMeta, fn($m) => $m['summary'] === null && $m['content_excerpt'] !== ''); foreach ($unsummarized as $dId => $m) { try { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. 
No preamble.'], ['role' => 'user', 'content' => "Summarise this Norwegian family law document for a legal researcher.\nFocus on: which legal provisions it covers, its authority type, and what questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"], ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]); $summary = trim($raw); if ($summary !== '') { $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$summary, $dId]); $docMeta[$dId]['summary'] = $summary; } } catch (Throwable $e) { error_log('DBN hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage()); } } // Enrich with corpus source name from bnl_admin.corpus_sources if (!empty($sourceIds)) { $uSids = array_values(array_unique($sourceIds)); $sPh = implode(',', array_fill(0, count($uSids), '?')); $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})"); $sStmt->execute($uSids); $srcNames = []; foreach ($sStmt as $row) { $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal')); } foreach ($docMeta as &$m) { if ($m['source_id'] && isset($srcNames[$m['source_id']])) { $m['corpus_source_name'] = $srcNames[$m['source_id']]; } } unset($m); } } catch (Throwable $e) { error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage()); return; } foreach ($pool as &$chunk) { if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue; $docId = (int)($chunk['document_id'] ?? 0); if (!$docId || !isset($docMeta[$docId])) continue; $m = $docMeta[$docId]; $sourceUrl = $m['source_url'] ?? null; $chunk['source_url'] = $sourceUrl; $chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null); $chunk['authority_label'] = $m['authority_label'] ?? $chunk['authority_label']; $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null; $chunk['publication_date'] = $m['publication_date'] ?? null; $chunk['summary'] = $m['summary'] ?? 
null;
        }
        unset($chunk);
    }

    /**
     * Construct a clickable URL into the original article.
     *
     * For lovdata.no URLs, when the section title contains a paragraph reference
     * (e.g. "§ 43", "§43a", "§ 4-12"), the reference is appended as a path-style
     * anchor: ".../§43", ".../§4-12". For every other host, or when no reference
     * is found, the trimmed document URL is returned unchanged.
     *
     * Hyphenated section numbers are captured in full. The previous pattern
     * stopped after the hyphen and produced broken anchors such as "/§4-".
     *
     * @param string|null $sourceUrl    Document URL; null, empty or whitespace-only yields null.
     * @param string|null $sectionTitle Section heading that may contain a "§ n" reference.
     * @return string|null Deep link, plain document URL, or null when there is no URL.
     */
    private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
    {
        if (!$sourceUrl) {
            return null;
        }
        $sourceUrl = trim($sourceUrl);
        if ($sourceUrl === '') {
            return null;
        }

        // digits, optional "-digits" groups (4-12), optional single letter suffix (43a)
        if (preg_match('~^https?://lovdata\.no/~i', $sourceUrl)
            && $sectionTitle
            && preg_match('/§\s?(\d+(?:-\d+)*[A-Za-z]?)/u', $sectionTitle, $m)) {
            return rtrim($sourceUrl, '/') . '/§' . $m[1];
        }

        return $sourceUrl;
    }

    /**
     * Collapse the raw retrieval pool to unique chunks. Duplicates (same
     * source_origin + chunk_id) have their matched_sub_questions unioned and keep
     * the higher similarity / reranker_score. The result is sorted by
     * reranker_score (falling back to similarity) descending and cut to $cap.
     */
    private function mergeAndDedupe(array $rawPool, int $cap): array
    {
        $byKey = [];
        foreach ($rawPool as $chunk) {
            $key = ($chunk['source_origin'] ?? 'corpus') . ':' . ($chunk['chunk_id'] ?? bin2hex(random_bytes(4)));
            if (!isset($byKey[$key])) {
                $byKey[$key] = $chunk;
                continue;
            }
            $existing = $byKey[$key];
            $existing['matched_sub_questions'] = array_values(array_unique(array_merge(
                $existing['matched_sub_questions'] ?? [],
                $chunk['matched_sub_questions'] ?? []
            )));
            // Keep the higher similarity score
            if (($chunk['similarity'] ?? 0) > ($existing['similarity'] ?? 0)) {
                $existing['similarity'] = $chunk['similarity'];
            }
            if (($chunk['reranker_score'] ?? 0) > ($existing['reranker_score'] ?? 0)) {
                $existing['reranker_score'] = $chunk['reranker_score'];
            }
            $byKey[$key] = $existing;
        }
        $merged = array_values($byKey);
        usort($merged, function (array $a, array $b): int {
            $aScore = $a['reranker_score'] ?? $a['similarity'] ?? 0;
            $bScore = $b['reranker_score'] ?? $b['similarity'] ??
0;
            return $bScore <=> $aScore;
        });
        return array_slice($merged, 0, $cap);
    }

    /**
     * Assign each source its 1-based citation number in key `n`.
     *
     * Numbering follows iteration order rather than the input's array keys, so
     * the result is always a list numbered 1..count($chunks) with no gaps, even
     * when the input is sparse or string-keyed. For list inputs the output is
     * unchanged from the previous key-based numbering.
     *
     * @param array $chunks Source rows (each an array) in ranking order.
     * @return array List of the same rows, each with `n` set.
     */
    private function numberSources(array $chunks): array
    {
        $out = [];
        $n = 0;
        foreach ($chunks as $c) {
            $c['n'] = ++$n;
            $out[] = $c;
        }
        return $out;
    }

    private function synthesise(
        string $seedDescription,
        string $brief,
        array $subQuestions,
        array $numberedSources,
        string $engine,
        string $language,
        float $temperature,
        string $advocateRole = '',
        ?array $priorContext = null,
        string $branchNotes = '',
        array $keySignals = []
    ): array {
        $locale = dbnToolsLanguageName($language);
        if (empty($numberedSources)) {
            return [
                'json' => [
                    'brief_markdown' => match (dbnToolsNormalizeUiLanguage($language)) {
                        'no' => 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.',
                        'uk' => 'Я не знайшов достатньої підтримки джерел у корпусі, щоб дати обґрунтовану відповідь.',
                        'pl' => 'Nie znalazłem wystarczającego wsparcia źródłowego w korpusie, aby udzielić ugruntowanej odpowiedzi.',
                        default => 'I did not find enough source support in the corpus to give a grounded answer.',
                    },
                    'what_we_found' => 'No retrieved sources passed the similarity threshold.',
                    'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
                    'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
                ],
                'deploy_label' => match($engine) {
                    'gpu' => 'GPU (cuttlefish)',
                    'dbn_legal' => 'dbn-legal-agent-v2',
                    'dbn_legal_v3' => 'dbn-legal-agent-v3',
                    'azure_full' => 'gpt-4o',
                    'claude_sonnet'=> 'Claude 3.5 Sonnet',
                    default => $this->azure->chatDeployment(),
                },
                'thinking_trace'=> null,
            ];
        }
        $priorContextSection = '';
        if (!empty($priorContext)) {
            $prior = [];
            if (!empty($priorContext['original_query'])) {
                $prior[] = 'Original research question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
            }
            if (!empty($priorContext['brief_summary'])) {
                $prior[] = "Key findings from prior research:\n" .
mb_substr((string)$priorContext['brief_summary'], 0, 600, 'UTF-8'); } if ($branchNotes !== '') { $prior[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8'); } if ($prior) { $priorContextSection = "\nBackground from prior research:\n" . implode("\n", $prior) . "\n"; } } $sourcesContext = []; foreach ($numberedSources as $s) { $sourcesContext[] = sprintf( "[%d] (%s) %s%s\n Corpus: %s\n Authority: %s | Jurisdiction: %s\n Excerpt: %s", $s['n'], $s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus', $s['title'], !empty($s['section']) ? ' — ' . $s['section'] : '', $s['package_or_corpus'], $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'), $s['jurisdiction'] ?? 'n/a', $s['excerpt'] ); } $sourcesText = implode("\n\n", $sourcesContext); $subQText = ''; if ($subQuestions) { $lines = array_map( fn(array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'], $sq['question']), $subQuestions, array_keys($subQuestions) ); $subQText = "\nSub-questions explored:\n" . implode("\n", $lines); } $sourceCount = count($numberedSources); $lengthGuidance = $sourceCount >= 3 ? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.' : '250-450 words, 2-3 short paragraphs. Note when evidence is thin.'; $keySignalsLine = !empty($keySignals) ? "\nKey retrieval signals (statutory/factual terms that drove corpus search — ground your brief in these where sources permit):\n" . implode(', ', $keySignals) . "\n" : ''; $product = dbnToolsProductName(); if ($advocateRole !== '') { $prompt = << Høyesterett decision > ECHR Grand Chamber > ECHR regular chamber > lower courts > Bufdir guidance. - Citation self-check: before writing each [n] marker, confirm that source [n] exists in the list and its excerpt actually supports the specific claim being made. - When multiple sources support the same point, cite all of them (e.g. `[2,4]`). - `opposing_weaknesses`: OMIT this field by default. 
Populate it only when ≥2 retrieved sources explicitly support the identified weakness. Do not speculate or infer weaknesses from thin evidence. - `brief_markdown` must be {$lengthGuidance} Structure it as: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Procedural rights and obligations {$advocateRole} should assert, (4) Opposing weaknesses — only if `opposing_weaknesses` is non-empty. End with a one-line caveat that this is legal preparation support, not final legal advice. - `client_strengths`: 3-6 items, each must include at least one [n] citation. - `what_remains_uncertain`: 3-5 honest gaps where evidence is insufficient or law is unclear. - Respond in {$locale}. - Output valid JSON only — no markdown fences around the JSON object itself. Return JSON: { "brief_markdown": "", "client_strengths": [""], "opposing_weaknesses": [""], "what_we_found": "<2-sentence summary of the most relevant retrieved authority for {$advocateRole}>", "what_remains_uncertain": [""], "next_practical_step": "" } PROMPT; } else { $prompt = <<= 3)"], "next_practical_step": "one concrete next action the user can take to strengthen the case or close a gap" } Rules: - Every factual claim in `brief_markdown` must end with one or more `[n]` markers. - A `[n]` citation is only valid when the excerpt for source [n] explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic. - If no source supports a point, omit the point — DO NOT speculate. - Copy statute section numbers (e.g. §43, §4-12) and ECHR case citations verbatim from the excerpt text — never rephrase or infer a section number that does not appear in an excerpt. - When multiple sources support the same point, cite all of them (e.g. `[2,4]`). - Respond in {$locale}. - Output valid JSON only — no markdown fences around the JSON object itself. PROMPT; } $messages = [ ['role' => 'system', 'content' => 'You return valid JSON only. 
No markdown fences. Every legal claim must be supported by a source from the numbered list. Do not invent statute sections, case names, paragraph numbers, or dates. If no source supports a point, omit it entirely.'], ['role' => 'user', 'content' => $prompt], ]; $synthTemp = ($advocateRole !== '') ? min($temperature, 0.20) : $temperature; // Advocate briefs require ~4-6K tokens (brief + strengths + weaknesses + uncertainty). // Non-advocate deep-research responses are shorter (~2-3K). Use separate limits. $synthMaxTokens = ($advocateRole !== '') ? 6000 : 4000; $opts = ['json' => true, 'temperature' => $synthTemp, 'max_tokens' => $synthMaxTokens, 'timeout' => 180]; $thinkingTrace = null; try { if ($engine === 'dbn_legal_v3') { $response = dbnToolsCallGpuLlm($messages, array_merge($opts, ['model' => 'dbn-legal-agent-v3', 'timeout' => 180])); $deployLabel = 'dbn-legal-agent-v3'; $raw = (string)($response['choices'][0]['message']['content'] ?? ''); } elseif ($engine === 'dbn_legal') { $response = dbnToolsCallGpuLlm($messages, array_merge($opts, ['model' => 'dbn-legal-agent-v2', 'timeout' => 180])); $deployLabel = 'dbn-legal-agent-v2'; $raw = (string)($response['choices'][0]['message']['content'] ?? ''); } elseif ($engine === 'gpu') { $response = dbnToolsCallGpuLlm($messages, $opts); $deployLabel = 'GPU (cuttlefish)'; $raw = (string)($response['choices'][0]['message']['content'] ?? 
''); } elseif ($engine === 'azure_full') { $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts); $deployLabel = 'gpt-4o'; } elseif ($engine === 'azure_mini' && $this->azure instanceof DbnBedrockGateway) { // When Bedrock enabled, azure_mini → Haiku (fast, ~20-50s synthesis) $haiku = $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU); $raw = $haiku->chatText($messages, array_merge($opts, ['timeout' => 90])); $deployLabel = 'Claude Haiku 4.5 (AWS Bedrock)'; $thinkingTrace = null; } elseif ($engine === 'claude_sonnet' || ($this->azure instanceof DbnBedrockGateway)) { if ( $this->azure instanceof DbnBedrockGateway && dbnToolsEnv('DBN_BEDROCK_THINKING_ENABLED', 'false') === 'true' && DbnBedrockModelRouter::supportsThinking($this->azure->chatDeployment()) ) { // Extended thinking — Pro showcase $thinkResult = $this->azure->chatWithThinking($messages, [ 'max_tokens' => 16000, 'thinking_budget'=> (int)dbnToolsEnv('DBN_BEDROCK_THINKING_BUDGET', '8000'), 'timeout' => 300, ]); $raw = $thinkResult['text']; $thinkingTrace = $thinkResult['thinking'] ?? null; $deployLabel = 'Claude 3.5 Sonnet (extended thinking)'; } else { $raw = $this->azure->chatText($messages, $opts); $thinkingTrace = null; $deployLabel = $this->azure instanceof DbnBedrockGateway ? 'Claude 3.5 Sonnet' : $this->azure->chatDeployment(); } } else { $raw = $this->azure->chatText($messages, $opts); $deployLabel = $this->azure->chatDeployment(); } } catch (Throwable $e) { dbnToolsAbort('Synthesis LLM request failed: ' . 
$e->getMessage(), 502, 'llm_error'); } $json = $this->azure->decodeJsonObject($raw); if (!is_array($json) || empty($json['brief_markdown'])) { // Salvage as plain markdown $json = [ 'brief_markdown' => $raw, 'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.', 'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'], 'next_practical_step' => 'Review the brief manually before relying on it.', ]; } return [ 'json' => $json, 'deploy_label' => $deployLabel, 'thinking_trace'=> $thinkingTrace, ]; } private function citationConfidence(array $sources): string { if (!$sources) { return 'low'; } $scores = array_values(array_filter(array_map( fn(array $s) => $s['reranker_score'] ?? $s['similarity'] ?? null, $sources ), 'is_numeric')); $best = $scores ? max($scores) : 0; if (count($sources) >= 6 && $best >= 0.5) { return 'high'; } if (count($sources) >= 3 && $best >= 0.35) { return 'medium'; } return 'low'; } public function generateSubQPreview( string $seedQuery, string $pastedText, string $engine, string $language, array $controls, string $advocateRole = '', ?array $priorContext = null, string $branchNotes = '' ): array { $seedQuery = trim($seedQuery); $pastedText = trim($pastedText); $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'dbn_legal', 'dbn_legal_v3', 'claude_sonnet', 'claude_haiku'], true) ? $engine : 'azure_mini'; $language = dbnToolsNormalizeUiLanguage($language); $controls = $this->normalizeControls($controls); if ($seedQuery === '' && $pastedText === '') { dbnToolsAbort('Provide a question or pasted text.', 422, 'missing_seed'); } dbnToolsRequireClient(); dbnToolsBootCaveau(); $aiPortalRoot = dbnToolsAiPortalRoot(); require_once $aiPortalRoot . 
'/platform/includes/dbn_v6.php'; $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, []); $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole, $priorContext, $branchNotes); $expansion = $this->expandQueries( $seedDescription, $interpretation['brief'], $interpretation['key_signals'], $controls['sub_q_count'], $language, $advocateRole ); return [ 'ok' => true, 'interpretation' => $interpretation, 'sub_questions' => $expansion['questions'], 'fallback' => $expansion['fallback'] ?? false, ]; } private function trace(string $label, string $detail, string $status = 'complete'): array { return [ 'label' => $label, 'detail' => $detail, 'status' => $status, ]; } private function elapsedMs(float $start): int { return (int)round((microtime(true) - $start) * 1000); } }