Files
dobetternorge-tools/includes/DeepResearchAgent.php
T
daveadmin 343b19d0b4 Add sub-question branching + document summary modals
- Source modal now shows LLM-generated document summary (lazy-gen + cached
  in documents.summary) instead of raw chunk text; toggle reveals matched
  chunk; "View all chunks" button fetches every chunk of the document via
  new api/document-chunks.php endpoint
- Each sub-question card gets a "Branch ↓" button that pre-fills the query
  with that sub-question and shows a context panel with the prior brief
  summary; prior_context + branch_notes are injected into interpretSeed()
  and synthesise() so the LLM knows where the research is coming from
- Upload document summaries generated at synthesis time and attached to
  upload sources alongside corpus summaries
- DB: documents.summary TEXT column added to bnl_corpus on chloe

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 19:44:27 +02:00

1158 lines
52 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';
final class DbnDeepResearchAgent
{
/** Maximum characters of the question and of the pasted text copied into the seed description. */
private const MAX_SEED_CHARS = 16000;
/** Maximum characters of each uploaded file's text that is chunked or copied into the seed description. */
private const MAX_UPLOAD_CHARS = 64000;
/** Size, in words, of the sliding window used when splitting upload text into chunks. */
private const CHUNK_WORDS = 600;
/** Number of words shared between consecutive upload chunks (window advance = CHUNK_WORDS - this). */
private const CHUNK_OVERLAP_WORDS = 75;
/** Minimum words a non-first upload chunk must contain to be emitted. */
private const MIN_CHUNK_WORDS = 50;
/** Maximum number of chunks kept after merging and de-duplicating the retrieval pool. */
private const POOL_CAP = 30;
/** Gateway used for chat completions and for decoding JSON replies. */
private DbnAzureOpenAiGateway $azure;
/** In-memory upload index: list of ['meta' => chunk array, 'vec' => embedding]; reset at the start of each run(). */
private array $uploadVecs = [];
/** Elapsed milliseconds per pipeline step, keyed by step id; reset at the start of each run(). */
private array $stepTimings = [];
/**
 * @param DbnAzureOpenAiGateway|null $azure Gateway to use; a default instance is created when none is given.
 */
public function __construct(?DbnAzureOpenAiGateway $azure = null)
{
    if ($azure === null) {
        $azure = new DbnAzureOpenAiGateway();
    }
    $this->azure = $azure;
}
/**
 * Run one deep-research pass and return the assembled result payload.
 *
 * Pipeline: query interpretation → sub-question expansion → slice resolution →
 * in-memory upload indexing → per-sub-question retrieval (corpus + uploads) →
 * synthesis → citation confidence.
 *
 * @param string        $seedQuery      User question (trimmed here).
 * @param string        $pastedText     Free text pasted by the user (trimmed here).
 * @param array         $uploadedFiles  List of uploads; each entry is read for 'filename' and 'text'.
 * @param array         $sliceSelection Raw slice toggles; normalised via dbnV6NormalizeSliceSelection().
 * @param string        $engine         'azure_mini', 'azure_full' or 'gpu'; anything else falls back to 'azure_mini'.
 * @param string        $language       'en' or 'no'; anything else falls back to 'en'.
 * @param array         $controls       Tuning knobs; clamped by normalizeControls().
 * @param callable|null $emit           Optional progress sink called as $emit(string $event, array $payload)
 *                                      with events 'step', 'progress' and 'subq'.
 * @param string        $advocateRole   When non-empty the result is tagged as the 'advocate' tool.
 * @param array|null    $priorContext   Context of a previous run, forwarded to interpretSeed() and synthesise().
 * @param string        $branchNotes    Researcher notes, forwarded to interpretSeed() and synthesise().
 *
 * @return array Result payload (brief, sub-questions, numbered sources, trace and trace metadata).
 */
public function run(
    string $seedQuery,
    string $pastedText,
    array $uploadedFiles,
    array $sliceSelection,
    string $engine,
    string $language,
    array $controls,
    ?callable $emit = null,
    string $advocateRole = '',
    ?array $priorContext = null,
    string $branchNotes = ''
): array {
    $seedQuery = trim($seedQuery);
    $pastedText = trim($pastedText);
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
    $language = in_array($language, ['en', 'no'], true) ? $language : 'en';
    $controls = $this->normalizeControls($controls);
    if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) {
        dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed');
    }
    $client = dbnToolsRequireClient();
    $package = $this->requireFamilyPackage((int)$client['id']);
    dbnToolsBootCaveau();
    $aiPortalRoot = dbnToolsAiPortalRoot();
    require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

    // Per-run state: never carry upload vectors or timings over from a previous call.
    $this->uploadVecs = [];
    $this->stepTimings = [];
    $trace = [];
    $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles);

    // Records a finished step in the trace and forwards it to the optional emitter.
    $emitStep = function (string $stepId, string $label, string $detail, string $status) use (&$trace, $emit): void {
        $trace[] = $this->trace($label, $detail, $status);
        if ($emit) {
            $emit('step', [
                'step' => $stepId,
                'label' => $label,
                'detail' => $detail,
                'status' => $status,
            ]);
        }
    };
    // Announces that a step has started; not recorded in the trace.
    $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
        if ($emit) {
            $emit('step', [
                'step' => $stepId,
                'label' => $label,
                'detail' => $detail,
                'status' => 'running',
            ]);
        }
    };

    // STEP 1: Query interpretation
    $emitRunning('interpretation', 'Query interpretation', 'Summarising the seed input…');
    $stepStart = microtime(true);
    $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole, $priorContext, $branchNotes);
    $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
    $emitStep('interpretation', 'Query interpretation', $interpretation['detail'], 'complete');

    // STEP 2: Query expansion
    $emitRunning('expansion', 'Query expansion', 'Generating sub-questions…');
    $stepStart = microtime(true);
    $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language, $advocateRole);
    $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
    $subQuestions = $expansion['questions'];
    $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
    $expansionDetail = $expansion['fallback']
        ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.'
        : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions));
    $emitStep('expansion', 'Query expansion', $expansionDetail, $expansionStatus);

    // STEP 3: Slice resolution
    $emitRunning('slice_resolution', 'Slice resolution', 'Resolving slice toggles to document IDs…');
    $stepStart = microtime(true);
    $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
    if (!array_filter($sliceSelectionNormalized)) {
        dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices');
    }
    $ragDb = dbnToolsRagDb();
    try {
        $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
        $sliceStatus = 'complete';
        $sliceDetail = sprintf(
            '%d slice(s) active → %d candidate documents constrain the corpus search.',
            count(array_filter($sliceSelectionNormalized)),
            count($sharedDocIds)
        );
    } catch (Throwable $e) {
        // Best effort: an unresolved slice set degrades to an unconstrained search.
        error_log('DBN deep research slice resolve failed: ' . $e->getMessage());
        $sharedDocIds = [];
        $sliceStatus = 'warning';
        $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
    }
    $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
    $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

    // STEP 4: Upload indexing (in-memory, ephemeral)
    $emitRunning('upload_indexing', 'Upload indexing', empty($uploadedFiles)
        ? 'No uploads; skipping…'
        : sprintf('Chunking + embedding %d file(s) in memory…', count($uploadedFiles)));
    $stepStart = microtime(true);
    $uploadChunks = [];
    foreach ($uploadedFiles as $idx => $file) {
        $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
        // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
        $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
        $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
    }
    $uploadStatus = 'complete';
    $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
    if ($uploadChunks) {
        try {
            // Embed in small batches of 5, emitting progress between each so the stream
            // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
            $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
            $textCount = count($texts);
            $allVecs = [];
            $batchSize = 5;
            for ($b = 0; $b < $textCount; $b += $batchSize) {
                $batch = array_slice($texts, $b, $batchSize);
                if ($emit) {
                    // Range is "first-last of total"; the separator keeps the two numbers apart.
                    $emit('progress', ['detail' => sprintf(
                        'Embedding chunks %d-%d of %d…',
                        $b + 1, $b + count($batch), $textCount
                    )]);
                }
                $batchVecs = dbnToolsLiteLLMEmbedBatch($batch);
                $allVecs = array_merge($allVecs, $batchVecs);
            }
            if (count($allVecs) === count($uploadChunks)) {
                foreach ($uploadChunks as $i => $chunk) {
                    $this->uploadVecs[] = [
                        'meta' => $chunk,
                        'vec' => $allVecs[$i],
                    ];
                }
            } else {
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.';
            }
        } catch (Throwable $e) {
            error_log('DBN deep research upload embed failed: ' . $e->getMessage());
            $uploadStatus = 'warning';
            $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
            $this->uploadVecs = [];
        }
    } elseif (empty($uploadedFiles)) {
        $uploadDetail = 'No files uploaded; agent will research the corpus only.';
    }
    $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
    $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

    // STEP 5: Retrieval (per sub-question)
    // When expansion produced nothing, retrieve once on the seed (or the interpreted brief).
    $retrievalQueries = $subQuestions ?: [[
        'id' => 'q1',
        'question' => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'),
        'rationale' => 'Seed query (no sub-question expansion).',
    ]];
    $emitRunning('retrieval', 'Retrieval', sprintf('Hybrid vector + keyword + rerank across %d sub-question(s)…', count($retrievalQueries)));
    $stepStart = microtime(true);
    try {
        $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
    } catch (Throwable $e) {
        // Log the cause before aborting so the failure is diagnosable server-side.
        error_log('DBN deep research RAG init failed: ' . $e->getMessage());
        dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
    }
    $rawPool = [];
    $retrievalWarnings = 0;
    $rawCorpusCount = 0;
    $rawUploadCount = 0;
    $filteredOutCount = 0;
    foreach ($retrievalQueries as $idx => $sq) {
        if ($emit) {
            $emit('subq', [
                'index' => $idx + 1,
                'total' => count($retrievalQueries),
                'id' => $sq['id'],
                'question' => $sq['question'],
            ]);
        }
        try {
            $corpusChunks = $rag->searchAll(
                $sq['question'],
                $controls['chunk_limit'],
                null,
                [
                    'search_private' => false,
                    'search_shared' => true,
                    'package_ids' => [(int)$package['id']],
                    'shared_doc_ids' => $sharedDocIds,
                    'chunk_limit' => $controls['chunk_limit'],
                    'search_method' => 'hybrid',
                    'reranker_enabled' => true,
                    'include_beta_website' => false,
                    'include_primary_website'=> false,
                ]
            );
        } catch (Throwable $e) {
            // One failing sub-question must not sink the whole run; surface it as a warning.
            error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage());
            $corpusChunks = [];
            $retrievalWarnings++;
        }
        $rawCorpusCount += count($corpusChunks);
        foreach ($corpusChunks as $chunk) {
            if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
                $filteredOutCount++;
                continue;
            }
            $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
        }
        // Upload chunk retrieval via cosine sim
        if (!empty($this->uploadVecs)) {
            $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
            $rawUploadCount += count($uploadHits);
            foreach ($uploadHits as $hit) {
                $hit['matched_sub_questions'] = [$sq['id']];
                $rawPool[] = $hit;
            }
        }
    }
    $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
    $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
    $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
    $retrievalDetail = sprintf(
        '%d sub-question(s) × hybrid + RRF + rerank → %d corpus chunks (%d filtered) + %d upload hits → %d unique after dedupe.',
        count($retrievalQueries),
        $rawCorpusCount,
        $filteredOutCount,
        $rawUploadCount,
        count($merged)
    );
    $emitStep('retrieval', 'Retrieval', $retrievalDetail, $retrievalStatus);

    // Cap pool to reranker top-K for synthesis
    $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
    // Hydrate corpus sources with source_url + authority_label via batched dbn_v6 query
    $this->hydrateSourceUrls($synthesisPool);
    $numberedSources = $this->numberSources($synthesisPool);
    $retrievalCounts = [
        'raw_corpus' => $rawCorpusCount,
        'filtered_website' => $filteredOutCount,
        'post_filter_corpus' => $rawCorpusCount - $filteredOutCount,
        'raw_upload' => $rawUploadCount,
        'after_dedupe' => count($merged),
        'after_topk' => count($numberedSources),
    ];

    // STEP 6: Synthesis
    $synthesisEngineLabel = $engine === 'azure_full' ? 'Azure gpt-4o' : ($engine === 'gpu' ? 'GPU qwen2.5:14b' : 'Azure gpt-4o-mini');
    $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising cited brief with %s — this is the slowest step…', $synthesisEngineLabel));
    $stepStart = microtime(true);

    // Attach upload summaries (generated lazily) to numbered sources.
    // Only files that actually contribute a numbered source are summarised, so no
    // LLM call is spent on an upload whose summary would never be attached.
    $citedUploadIdx = [];
    foreach ($numberedSources as $src) {
        if (($src['source_origin'] ?? '') !== 'upload') continue;
        if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
            $citedUploadIdx[(int)$m[1]] = true;
        }
    }
    if ($citedUploadIdx) {
        $uploadSummaries = [];
        foreach ($uploadedFiles as $idx => $file) {
            if (!isset($citedUploadIdx[$idx])) continue;
            $text = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8');
            // Same fallback name as used when chunking and when building the seed description.
            $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
            if ($text === '') continue;
            try {
                $raw = $this->azure->chatText([
                    ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                    ['role' => 'user', 'content' => "Summarise this document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"],
                ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 20]);
                $uploadSummaries[$idx] = trim($raw);
            } catch (Throwable $e) {
                error_log('DBN upload summary gen failed for file ' . $idx . ': ' . $e->getMessage());
                $uploadSummaries[$idx] = null;
            }
        }
        foreach ($numberedSources as &$src) {
            if (($src['source_origin'] ?? '') !== 'upload') continue;
            if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
                $src['summary'] = $uploadSummaries[(int)$m[1]] ?? null;
            }
        }
        unset($src);
    }

    $synthesis = $this->synthesise(
        $seedDescription,
        $interpretation['brief'],
        $retrievalQueries,
        $numberedSources,
        $engine,
        $language,
        $controls['temperature'],
        $advocateRole,
        $priorContext,
        $branchNotes
    );
    $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
    $emitStep(
        'synthesis',
        'Synthesis',
        sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)),
        'complete'
    );

    // STEP 7: Confidence
    $confidence = $this->citationConfidence($numberedSources);
    $emitStep(
        'confidence',
        'Citation confidence',
        sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)),
        $confidence === 'low' ? 'warning' : 'complete'
    );

    // Stitch sub-question chunk_ids + top_sources (top 3 sources matched by each sub-Q)
    $subQOut = [];
    foreach ($retrievalQueries as $sq) {
        $matchedChunks = array_values(array_filter(
            $numberedSources,
            fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
        ));
        $topSources = array_slice($matchedChunks, 0, 3);
        $subQOut[] = [
            'id' => $sq['id'],
            'question' => $sq['question'],
            'rationale' => $sq['rationale'] ?? '',
            'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
            'top_sources' => array_map(fn(array $s) => [
                'n' => $s['n'] ?? null,
                'title' => $s['title'] ?? '',
                'section' => $s['section'] ?? null,
                'deep_link' => $s['deep_link'] ?? $s['source_url'] ?? null,
                'source_url' => $s['source_url'] ?? null,
                'source_origin' => $s['source_origin'] ?? 'corpus',
                'authority_label'=> $s['authority_label'] ?? null,
                'excerpt' => $s['excerpt'] ?? '',
            ], $topSources),
        ];
    }

    $isAdvocate = $advocateRole !== '';
    return [
        'tool' => $isAdvocate ? 'advocate' : 'deep_research',
        'language' => $language,
        'advocate_role' => $isAdvocate ? $advocateRole : null,
        'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''),
        'client_strengths' => $isAdvocate ? ($synthesis['json']['client_strengths'] ?? []) : null,
        'opposing_weaknesses' => $isAdvocate ? ($synthesis['json']['opposing_weaknesses'] ?? []) : null,
        'sub_questions' => $subQOut,
        'sources' => $numberedSources,
        'what_we_found' => (string)($synthesis['json']['what_we_found'] ?? ''),
        'evidence_trail' => $numberedSources,
        'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
        'next_practical_step' => (string)($synthesis['json']['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($merged),
            'source_count' => count($numberedSources),
            'sub_question_count' => count($retrievalQueries),
            'upload_chunk_count' => count($this->uploadVecs),
            'deployment' => $synthesis['deploy_label'],
            'engine_used' => $engine,
            'citation_confidence' => $confidence,
            'elapsed_ms_per_step' => $this->stepTimings,
            'retrieval_counts' => $retrievalCounts,
            'slices_active' => array_keys(array_filter($sliceSelectionNormalized)),
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Clamp the caller-supplied tuning controls into their supported ranges,
 * substituting a default for every missing key.
 *
 * @param array $controls Raw control values keyed by control name.
 * @return array Controls with keys sub_q_count, chunk_limit, similarity_threshold,
 *               reranker_top_k and temperature.
 */
private function normalizeControls(array $controls): array
{
    // Bounds a value to [$floor, $ceiling].
    $bounded = static fn($floor, $value, $ceiling) => max($floor, min($ceiling, $value));

    $subQuestions = (int)($controls['sub_q_count'] ?? 4);
    $chunkLimit = (int)($controls['chunk_limit'] ?? 6);
    $threshold = (float)($controls['similarity_threshold'] ?? 0.30);
    $topK = (int)($controls['reranker_top_k'] ?? 12);
    $temperature = (float)($controls['temperature'] ?? 0.15);

    return [
        'sub_q_count' => $bounded(3, $subQuestions, 5),
        'chunk_limit' => $bounded(4, $chunkLimit, 10),
        'similarity_threshold' => $bounded(0.2, $threshold, 0.6),
        'reranker_top_k' => $bounded(8, $topK, 14),
        'temperature' => $bounded(0.05, $temperature, 0.4),
    ];
}
/**
 * Fetch the 'family-legal' package and abort the request unless it is active
 * and the given client holds an active subscription to it.
 *
 * @param int $clientId Client whose subscription is checked.
 * @return array The package record.
 */
private function requireFamilyPackage(int $clientId): array
{
    $familyPackage = dbnToolsFetchPackage('family-legal');

    $isUsable = $familyPackage && !empty($familyPackage['is_active']);
    if (!$isUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $packageId = (int)$familyPackage['id'];
    if (!dbnToolsHasActiveSubscription($clientId, $packageId)) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $familyPackage;
}
/**
 * Assemble the labelled seed text handed to the LLM prompts.
 *
 * Sections appear in the order question, pasted text, uploaded files, and are
 * joined by a blank line. Empty inputs are omitted; the question and pasted text
 * are capped at MAX_SEED_CHARS and each upload at MAX_UPLOAD_CHARS.
 *
 * @param string $seedQuery     The user's question.
 * @param string $pastedText    Text pasted by the user.
 * @param array  $uploadedFiles Uploads, each read for 'filename' and 'text'.
 * @return string The combined description ('' when every input is empty).
 */
private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string
{
    $sections = [];

    $labelled = [
        'Question' => $seedQuery,
        'Pasted text' => $pastedText,
    ];
    foreach ($labelled as $heading => $body) {
        if ($body !== '') {
            $sections[] = $heading . ":\n" . mb_substr($body, 0, self::MAX_SEED_CHARS, 'UTF-8');
        }
    }

    foreach ($uploadedFiles as $position => $upload) {
        $name = (string)($upload['filename'] ?? sprintf('upload-%d', $position + 1));
        $body = (string)($upload['text'] ?? '');
        if ($body !== '') {
            $capped = mb_substr($body, 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
            $sections[] = sprintf("Uploaded file [%s]:\n%s", $name, $capped);
        }
    }

    return implode("\n\n", $sections);
}
/**
 * Ask an LLM for a short research brief and retrieval signals describing the seed input.
 *
 * Norwegian runs and advocate runs are sent to the GPU model; everything else goes
 * through the Azure gateway. Any failure (transport, invalid JSON, missing brief)
 * degrades to an empty brief so the pipeline can continue on the raw seed.
 *
 * @param string     $seedDescription Combined seed text.
 * @param string     $language        'no' for Norwegian output, anything else for English.
 * @param string     $advocateRole    Party being represented; '' for neutral research.
 * @param array|null $priorContext    Previous run context; read for 'original_query' and 'what_we_found'.
 * @param string     $branchNotes     Researcher notes; only included when $priorContext is non-empty.
 * @return array{brief: string, detail: string} Brief ('' on failure) and a human-readable step detail.
 */
private function interpretSeed(string $seedDescription, string $language, string $advocateRole = '', ?array $priorContext = null, string $branchNotes = ''): array
{
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $rolePrefix = $advocateRole !== ''
        ? "You are preparing a case-research brief for: {$advocateRole}. Frame your interpretation to identify the strongest legal angles for this party.\n\n"
        : '';

    $priorContextBlock = '';
    if (!empty($priorContext)) {
        $parts = ['Prior research context:'];
        if (!empty($priorContext['original_query'])) {
            $parts[] = 'Original question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
        }
        if (!empty($priorContext['what_we_found'])) {
            $parts[] = 'Key findings: ' . mb_substr((string)$priorContext['what_we_found'], 0, 400, 'UTF-8');
        }
        if ($branchNotes !== '') {
            $parts[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8');
        }
        $priorContextBlock = implode("\n", $parts) . "\n\nNow investigate this branch:\n";
    }

    $prompt = <<<PROMPT
        {$rolePrefix}{$priorContextBlock}You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.
        Input:
        {$seedDescription}
        In {$locale}, produce JSON with:
        {
        "brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
        "key_signals": ["short keywords or terms that should drive retrieval"]
        }
        PROMPT;

    try {
        $sysMsg = ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'];
        $userMsg = ['role' => 'user', 'content' => $prompt];
        if ($language === 'no' || $advocateRole !== '') {
            $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
                'model' => 'dbn-legal-agent', 'json' => true,
                'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 40,
            ]);
            $raw = (string)($resp['choices'][0]['message']['content'] ?? '');
        } else {
            $raw = $this->azure->chatText([$sysMsg, $userMsg],
                ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]);
        }
        $json = $this->azure->decodeJsonObject($raw);

        // The model's JSON is untrusted: accept the brief only when it is a non-blank string.
        $brief = is_array($json) && is_string($json['brief'] ?? null) ? trim($json['brief']) : '';
        if ($brief !== '') {
            // Keep only scalar, non-blank signals so a nested value cannot render as "Array".
            $rawSignals = is_array($json['key_signals'] ?? null) ? $json['key_signals'] : [];
            $signals = [];
            foreach ($rawSignals as $signal) {
                $signal = is_scalar($signal) ? trim((string)$signal) : '';
                if ($signal !== '') {
                    $signals[] = $signal;
                }
            }
            $signalText = implode(', ', array_slice($signals, 0, 6));
            return [
                'brief' => $brief,
                'detail' => sprintf('Research focus: %s%s', $brief, $signalText !== '' ? ' — signals: ' . $signalText : ''),
            ];
        }
    } catch (Throwable $e) {
        // Deliberate best effort: interpretation is optional, so log and fall through.
        error_log('DBN deep research interpretation failed: ' . $e->getMessage());
    }

    return [
        'brief' => '',
        'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
    ];
}
/**
 * Ask an LLM to decompose the research request into focused sub-questions.
 *
 * Norwegian runs use the GPU model, others the Azure gateway. The reply is treated
 * as untrusted: entries without a non-blank string question are dropped, ids are
 * re-issued sequentially (q1, q2, …) over the accepted entries, and at most
 * $targetCount entries are kept. Fewer than two usable entries, or any failure,
 * yields the fallback result.
 *
 * @param string $seedDescription Combined seed text.
 * @param string $brief           Research brief from interpretSeed() (may be '').
 * @param int    $targetCount     Number of sub-questions requested.
 * @param string $language        'no' for Norwegian, anything else for English.
 * @param string $advocateRole    Party being represented; '' selects the neutral prompt.
 * @return array{questions: array, fallback: bool} Each question has 'id', 'question', 'rationale'.
 */
private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language, string $advocateRole = ''): array
{
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    if ($advocateRole !== '') {
        $prompt = <<<PROMPT
            You are a Norwegian family-law research assistant building a case for: {$advocateRole}.
            Generate exactly {$targetCount} targeted sub-questions designed to find:
            1. Lovdata statutes and ECHR/Hague precedents that support {$advocateRole}'s position.
            2. Procedural rights and obligations the opposing party must satisfy — failures here help {$advocateRole}.
            3. Case law that exposes weaknesses in the opposing party's likely arguments.
            4. Specific articles, paragraphs, or judgments {$advocateRole}'s representative should cite.
            Research brief:
            {$brief}
            Raw input:
            {$seedDescription}
            Return JSON only in {$locale}:
            {
            "sub_questions": [
            {"id":"q1","question":"...","rationale":"how finding this strengthens {$advocateRole}'s case (≤ 140 chars)"}
            ]
            }
            Rules:
            - Exactly {$targetCount} sub-questions, no more, no fewer.
            - Every question must be answerable from Norwegian family-law, child-welfare, or ECHR/Hague sources.
            - Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame).
            - Sub-questions must be self-contained — readable without the raw input.
            - Write the questions in {$locale}.
            PROMPT;
    } else {
        $prompt = <<<PROMPT
            You are decomposing a Do Better Norge legal-research request into {$targetCount} focused sub-questions that should each be answered by the legal corpus (Norwegian family law, child welfare, ECHR/Hague).
            Research brief:
            {$brief}
            Raw input:
            {$seedDescription}
            Return JSON only:
            {
            "sub_questions": [
            {"id":"q1","question":"... ({$locale})","rationale":"why this angle matters (≤ 140 chars)"}
            ]
            }
            Rules:
            - Exactly {$targetCount} sub-questions, no more, no fewer.
            - Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
            - Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
            - Sub-questions must be self-contained — readable without seeing the seed text.
            - Write the questions in {$locale}.
            PROMPT;
    }

    try {
        $sysMsg = ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'];
        $userMsg = ['role' => 'user', 'content' => $prompt];
        if ($language === 'no') {
            $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
                'model' => 'dbn-legal-agent', 'json' => true,
                'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 50,
            ]);
            $raw = (string)($resp['choices'][0]['message']['content'] ?? '');
        } else {
            $raw = $this->azure->chatText([$sysMsg, $userMsg],
                ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 35]);
        }
        $json = $this->azure->decodeJsonObject($raw);
        $items = is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : [];

        $normalized = [];
        foreach ($items as $item) {
            if (!is_array($item)) {
                continue;
            }
            // Reject non-string and whitespace-only questions: they would become blank retrieval queries.
            $question = $item['question'] ?? null;
            $question = is_string($question) ? trim($question) : '';
            if ($question === '') {
                continue;
            }
            $rationale = $item['rationale'] ?? '';
            $normalized[] = [
                // Number by accepted position so ids stay contiguous even when entries were skipped.
                'id' => 'q' . (count($normalized) + 1),
                'question' => $question,
                'rationale' => is_scalar($rationale) ? trim((string)$rationale) : '',
            ];
            if (count($normalized) >= $targetCount) {
                break;
            }
        }
        if (count($normalized) >= 2) {
            return ['questions' => $normalized, 'fallback' => false];
        }
    } catch (Throwable $e) {
        // Deliberate best effort: the caller falls back to the seed query.
        error_log('DBN deep research expansion failed: ' . $e->getMessage());
    }

    return ['questions' => [], 'fallback' => true];
}
/**
 * Split upload text into overlapping word windows for in-memory embedding.
 *
 * Windows are CHUNK_WORDS long and start every (CHUNK_WORDS - CHUNK_OVERLAP_WORDS)
 * words. The first window is always emitted; later ones only when they hold at
 * least MIN_CHUNK_WORDS words. Splitting stops as soon as a window reaches the
 * last word, so no trailing chunk made purely of overlap is produced.
 *
 * @param string $text     Upload text (whitespace is collapsed to single spaces in the chunks).
 * @param string $filename Display name stored on every chunk.
 * @param int    $fileIdx  Index of the upload; part of each chunk id.
 * @return array List of chunks with keys chunk_id ("upload:{file}:{chunk}"), file_index,
 *               chunk_index, filename and text. Empty when the text holds no words.
 */
private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
{
    // preg_split returns false on invalid UTF-8; treat that the same as "no words".
    $words = preg_split('/\s+/u', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if (!$words) {
        return [];
    }

    $total = count($words);
    // Guard against a non-positive step should the constants ever be misconfigured.
    $advance = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);

    $chunks = [];
    for ($start = 0; $start < $total; $start += $advance) {
        $slice = array_slice($words, $start, self::CHUNK_WORDS);
        if ($start === 0 || count($slice) >= self::MIN_CHUNK_WORDS) {
            $chunkIdx = count($chunks);
            $chunks[] = [
                'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $chunkIdx),
                'file_index' => $fileIdx,
                'chunk_index' => $chunkIdx,
                'filename' => $filename,
                'text' => implode(' ', $slice),
            ];
        }
        // This window already covers the final word: a further window would only repeat overlap.
        if ($start + count($slice) >= $total) {
            break;
        }
    }

    return $chunks;
}
/**
 * Rank the in-memory upload chunks against one question by cosine similarity.
 *
 * The question is embedded once; chunks scoring below $threshold are dropped and
 * the best ceil($limitPerSubQ / 2) (at least one) of the rest are returned,
 * highest similarity first. Embedding failures yield an empty list.
 *
 * @param string $question     Sub-question text to embed.
 * @param int    $limitPerSubQ Per-sub-question chunk limit; half of it is kept for uploads.
 * @param float  $threshold    Minimum cosine similarity.
 * @return array List of source records with source_origin 'upload'.
 */
private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
{
    if (empty($this->uploadVecs)) {
        return [];
    }

    try {
        $queryVec = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? [];
    } catch (Throwable $e) {
        error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage());
        return [];
    }
    if (empty($queryVec)) {
        return [];
    }

    $hits = [];
    foreach ($this->uploadVecs as $indexed) {
        $score = $this->cosineSim($queryVec, $indexed['vec']);
        if (!($score < $threshold)) {
            $meta = $indexed['meta'];
            $hits[] = [
                'chunk_id' => $meta['chunk_id'],
                'title' => 'uploaded: ' . $meta['filename'],
                'section' => null,
                'package_or_corpus' => 'Your upload',
                'excerpt' => dbnToolsExcerpt($meta['text'], 620),
                'chunk_text' => $meta['text'],
                'similarity' => round($score, 4),
                'reranker_score' => null,
                'document_id' => null,
                'source_origin' => 'upload',
                'authority_type' => null,
                'jurisdiction' => null,
            ];
        }
    }

    // Best match first.
    usort($hits, function (array $left, array $right): int {
        return $right['similarity'] <=> $left['similarity'];
    });

    $quota = max(1, (int)ceil($limitPerSubQ / 2));
    return array_slice($hits, 0, $quota);
}
/**
 * Cosine similarity of two numeric vectors, compared over their common prefix length.
 *
 * @param array $a First vector (integer-indexed from 0).
 * @param array $b Second vector (integer-indexed from 0).
 * @return float Similarity, or 0.0 when either vector is empty or has zero magnitude.
 */
private function cosineSim(array $a, array $b): float
{
    $dims = min(count($a), count($b));
    if ($dims === 0) {
        return 0.0;
    }

    $dot = 0.0;
    $sqA = 0.0;
    $sqB = 0.0;
    $k = 0;
    while ($k < $dims) {
        $left = (float)$a[$k];
        $right = (float)$b[$k];
        $dot += $left * $right;
        $sqA += $left * $left;
        $sqB += $right * $right;
        $k++;
    }

    if ($sqA !== 0.0 && $sqB !== 0.0) {
        return $dot / (sqrt($sqA) * sqrt($sqB));
    }
    return 0.0;
}
/**
 * Convert a raw corpus search hit into the source record shape used by the pipeline.
 *
 * URL, deep link, authority label, corpus source name and publication date are left
 * null here and filled in later by hydrateSourceUrls().
 *
 * @param array  $chunk  Raw hit from the retrieval pipeline.
 * @param string $subQId Id of the sub-question that retrieved the hit.
 * @return array Normalised source record with source_origin 'corpus'.
 */
private function normalizeCorpusChunk(array $chunk, string $subQId): array
{
    // Rounds a score to four decimals, keeping "absent" as null.
    $score = static function (array $row, string $field): ?float {
        return isset($row[$field]) ? round((float)$row[$field], 4) : null;
    };
    // Casts an id field to int, keeping "absent" as null.
    $intOrNull = static function (array $row, string $field): ?int {
        return isset($row[$field]) ? (int)$row[$field] : null;
    };

    $body = (string)($chunk['content'] ?? '');
    $similarity = $score($chunk, 'similarity');
    $rerankerScore = $score($chunk, 'reranker_score');

    $record = [];
    $record['chunk_id'] = $intOrNull($chunk, 'id');
    $record['title'] = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source');
    $record['section'] = $chunk['section_title'] ?? null;
    $record['package_or_corpus'] = (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal');
    $record['excerpt'] = dbnToolsExcerpt($body, 620);
    $record['chunk_text'] = $body;
    $record['similarity'] = $similarity;
    $record['reranker_score'] = $rerankerScore;
    $record['document_id'] = $intOrNull($chunk, 'document_id');
    $record['source_origin'] = 'corpus';
    $record['authority_type'] = $chunk['authority_type'] ?? null;
    $record['jurisdiction'] = $chunk['jurisdiction'] ?? null;
    $record['publication_year'] = $chunk['publication_year'] ?? null;
    // Placeholders populated by hydrateSourceUrls().
    $record['source_url'] = null;
    $record['deep_link'] = null;
    $record['authority_label'] = null;
    $record['corpus_source_name'] = null;
    $record['publication_date'] = null;
    $record['matched_sub_questions'] = [$subQId];

    return $record;
}
/**
 * Post-retrieval filter: drop chunks that don't belong in a family-law research pass.
 *
 * EU AI Act (source_id=7, EUR-Lex) leaks through when the Qdrant search runs
 * unconstrained (e.g. empty shared_doc_ids on slice-resolution failure). It is
 * never relevant to Norwegian family law and is always excluded.
 *
 * DBN website pages (Resource Directory, Flashcards, etc.) are indexed with
 * NULL source_id and score artificially high on broad queries. They are excluded
 * unless the dbn_resources slice is explicitly ON.
 *
 * Website titles are recognised with a hyphen, pipe, en dash or em dash as the
 * separator (e.g. "Flashcards – Do Better Norge").
 *
 * @param array $chunk        Raw corpus hit; read for source_name, document_title/title and source_url.
 * @param array $activeSlices Normalised slice toggles; only 'dbn_resources' is consulted.
 * @return bool True when the chunk must be dropped from the pool.
 */
private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
{
    $name = strtolower((string)($chunk['source_name'] ?? ''));
    $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
    $url = strtolower((string)($chunk['source_url'] ?? ''));

    // EU AI Act — never relevant to family law research
    if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) {
        return true;
    }
    if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) {
        return true;
    }

    // Title separators: "-", "|", and the UTF-8 byte sequences of the en dash (E2 80 93)
    // and em dash (E2 80 94). Matched byte-wise so no `u` modifier is needed and
    // titles that are not valid UTF-8 are still tested against the ASCII forms.
    $sep = '(?:[-|]|\xE2\x80[\x93\x94])';

    // DBN website pages — allow through only when dbn_resources slice is ON
    $isDbnPage = (
        str_contains($name, 'website')
        || str_contains($title, 'dobetternorge.no')
        || preg_match('/^(homepage|landing|about |contact )/i', $title)
        || str_contains($title, 'resource directory')
        || preg_match('/^flashcards?\s*' . $sep . '/i', $title)
        || preg_match('/' . $sep . '\s*do better norge\s*$/i', $title)
    );
    if ($isDbnPage) {
        return !($activeSlices['dbn_resources'] ?? false);
    }

    return false;
}
/**
 * Hydrate the synthesisPool in place with source_url/deep_link/authority_label/etc.
 * Uses a direct query against bnl_corpus.documents (only columns that exist there —
 * the temporal columns added in migration 136 are absent on this instance).
 *
 * Three stages, each failing independently:
 *  1. Load document metadata. A failure here leaves the pool untouched.
 *  2. Lazily generate and persist a summary for documents that have none. A failed
 *     generation skips that document; a failed write keeps the summary for this run.
 *  3. Look up corpus source names. A failure keeps the default 'Do Better Legal'.
 *
 * @param array $pool Source records (by reference); only entries with source_origin 'corpus'
 *                    and a positive document_id are updated.
 */
private function hydrateSourceUrls(array &$pool): void
{
    $docIds = [];
    foreach ($pool as $chunk) {
        if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
        $docId = (int)($chunk['document_id'] ?? 0);
        if ($docId > 0) $docIds[$docId] = true;
    }
    if (empty($docIds)) return;

    // Stage 1: document metadata.
    $docMeta = [];
    $sourceIds = [];
    try {
        $ragDb = dbnToolsRagDb();
        $ids = array_keys($docIds);
        $ph = implode(',', array_fill(0, count($ids), '?'));
        $stmt = $ragDb->prepare("
            SELECT d.id, d.title, d.source_url, d.authority_type,
                   d.publication_date, d.source_id, d.jurisdiction,
                   d.summary, LEFT(d.content, 4000) AS content_excerpt
            FROM documents d
            WHERE d.id IN ({$ph})
        ");
        $stmt->execute($ids);
        foreach ($stmt as $row) {
            $dId = (int)$row['id'];
            $sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
            if ($sid) $sourceIds[] = $sid;
            $docMeta[$dId] = [
                'source_url' => $row['source_url'] ?? null,
                'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null),
                'publication_date' => $row['publication_date'] ?? null,
                'corpus_source_name' => 'Do Better Legal',
                'source_id' => $sid,
                'summary' => $row['summary'] ?? null,
                'content_excerpt' => (string)($row['content_excerpt'] ?? ''),
                'title' => (string)($row['title'] ?? ''),
            ];
        }
    } catch (Throwable $e) {
        error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
        return;
    }

    // Stage 2: lazily generate summaries for documents that don't have one yet.
    foreach ($docMeta as $dId => $m) {
        if ($m['summary'] !== null || $m['content_excerpt'] === '') continue;
        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                ['role' => 'user', 'content' => "Summarise this Norwegian family law document for a legal researcher.\nFocus on: which legal provisions it covers, its authority type, and what questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"],
            ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
            $summary = trim($raw);
        } catch (Throwable $e) {
            error_log('DBN hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage());
            continue;
        }
        if ($summary === '') continue;

        // Use the summary for this run first; caching it is a separate, optional step.
        $docMeta[$dId]['summary'] = $summary;
        try {
            $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$summary, $dId]);
        } catch (Throwable $e) {
            error_log('DBN hydrateSourceUrls summary persist failed for doc ' . $dId . ': ' . $e->getMessage());
        }
    }

    // Stage 3: enrich with corpus source name from bnl_admin.corpus_sources.
    if (!empty($sourceIds)) {
        try {
            $uSids = array_values(array_unique($sourceIds));
            $sPh = implode(',', array_fill(0, count($uSids), '?'));
            $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
            $sStmt->execute($uSids);
            $srcNames = [];
            foreach ($sStmt as $row) {
                $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
            }
            foreach ($docMeta as $dId => $m) {
                if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
                    $docMeta[$dId]['corpus_source_name'] = $srcNames[$m['source_id']];
                }
            }
        } catch (Throwable $e) {
            // Source names are cosmetic; keep the URLs and labels already loaded.
            error_log('DBN deep research hydrateSourceUrls source-name lookup failed: ' . $e->getMessage());
        }
    }

    // Copy the collected metadata onto every corpus chunk of the pool.
    foreach ($pool as &$chunk) {
        if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
        $docId = (int)($chunk['document_id'] ?? 0);
        if (!$docId || !isset($docMeta[$docId])) continue;
        $m = $docMeta[$docId];
        $sourceUrl = $m['source_url'] ?? null;
        $chunk['source_url'] = $sourceUrl;
        $chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
        $chunk['authority_label'] = $m['authority_label'] ?? $chunk['authority_label'];
        $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null;
        $chunk['publication_date'] = $m['publication_date'] ?? null;
        $chunk['summary'] = $m['summary'] ?? null;
    }
    unset($chunk);
}
/**
 * Construct a clickable URL into the original article. Lovdata supports
 * path-style section anchors (e.g. /§43). For other hosts we return the
 * document root URL.
 *
 * The section number is taken from the first "§ <number>" in the section title and
 * may be plain ("43"), lettered ("30a") or chapter-style ("4-25", "4-4a"); the
 * whole number is kept so chapter-style sections are not cut off at the hyphen.
 *
 * @param string|null $sourceUrl    Document URL; null or blank yields null.
 * @param string|null $sectionTitle Section heading that may contain a "§" reference.
 * @return string|null The section deep link for Lovdata URLs, otherwise the trimmed URL.
 */
private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
{
    if (!$sourceUrl) return null;
    $sourceUrl = trim($sourceUrl);
    if ($sourceUrl === '') return null;

    if (preg_match('~^https?://lovdata\.no/~i', $sourceUrl)
        && $sectionTitle
        && preg_match('/§\s?(\d+(?:-\d+)?[A-Za-z]?)/u', $sectionTitle, $m)) {
        return rtrim($sourceUrl, '/') . '/§' . $m[1];
    }

    return $sourceUrl;
}
/**
 * Collapse duplicate chunks, rank the survivors and keep the top $cap.
 *
 * Chunks are identified by "<source_origin>:<chunk_id>" (origin defaults to
 * 'corpus'). A chunk without a chunk_id receives a random identifier, so it
 * is never merged with another chunk. When a duplicate is seen, the first
 * occurrence is kept and enriched: matched_sub_questions are unioned and the
 * higher similarity / reranker_score wins.
 *
 * @param array $rawPool Retrieved chunks, possibly containing duplicates.
 * @param int   $cap     Maximum number of chunks to return.
 * @return array List of chunks sorted by reranker_score (falling back to
 *               similarity, then 0) in descending order.
 */
private function mergeAndDedupe(array $rawPool, int $cap): array
{
    $pooled = [];

    foreach ($rawPool as $candidate) {
        $origin = $candidate['source_origin'] ?? 'corpus';
        $identity = $candidate['chunk_id'] ?? bin2hex(random_bytes(4));
        $slot = $origin . ':' . $identity;

        if (isset($pooled[$slot])) {
            $kept = $pooled[$slot];

            // Union of the sub-questions that matched either occurrence.
            $combined = array_merge(
                $kept['matched_sub_questions'] ?? [],
                $candidate['matched_sub_questions'] ?? []
            );
            $kept['matched_sub_questions'] = array_values(array_unique($combined));

            // Keep the higher value of each score across occurrences.
            foreach (['similarity', 'reranker_score'] as $metric) {
                if (($candidate[$metric] ?? 0) > ($kept[$metric] ?? 0)) {
                    $kept[$metric] = $candidate[$metric];
                }
            }

            $pooled[$slot] = $kept;
        } else {
            $pooled[$slot] = $candidate;
        }
    }

    $ranked = array_values($pooled);
    $rank = static fn (array $entry) => $entry['reranker_score'] ?? $entry['similarity'] ?? 0;
    usort($ranked, static fn (array $left, array $right): int => $rank($right) <=> $rank($left));

    return array_slice($ranked, 0, $cap);
}
/**
 * Attach a 1-based citation number `n` to every chunk.
 *
 * Numbers follow iteration order and are independent of the input array's
 * keys, so a filtered or associative input still yields 1, 2, 3, ...
 *
 * @param array $chunks Chunks to number; keys are ignored.
 * @return array List of the same chunks, each with an added `n` entry.
 */
private function numberSources(array $chunks): array
{
    $numbered = [];
    $position = 0;
    foreach ($chunks as $chunk) {
        $chunk['n'] = ++$position;
        $numbered[] = $chunk;
    }
    return $numbered;
}
/**
 * Ask the selected LLM engine for a citation-grounded brief built from the
 * numbered sources and return the decoded JSON plus the engine label.
 *
 * With no sources, a canned "insufficient support" result is returned and no
 * LLM call is made. A non-empty $advocateRole switches to the partisan
 * advocate prompt; otherwise the neutral deep-research prompt is used.
 * If the model output cannot be decoded into an object with a non-empty
 * `brief_markdown`, the raw text is returned as the brief.
 *
 * @param string     $seedDescription Text placed under "User input:" in the prompt.
 * @param string     $brief           Text placed under "Research brief:" in the prompt.
 * @param array      $subQuestions    Sub-questions; each entry's `id` and `question` are listed.
 * @param array      $numberedSources Sources carrying `n`, `title`, `excerpt`, etc.
 * @param string     $engine          'dbn_legal', 'gpu', 'azure_full', or anything else for the
 *                                    gateway's default chat deployment.
 * @param string     $language        'no' selects Norwegian; any other value selects English.
 * @param float      $temperature     Sampling temperature forwarded to the LLM call.
 * @param string     $advocateRole    Client role for the advocate prompt; '' for neutral mode.
 * @param array|null $priorContext    Optional `original_query` / `brief_summary` from a prior run.
 * @param string     $branchNotes     Researcher notes; only used when $priorContext is non-empty.
 * @return array{json: array, deploy_label: string}
 */
private function synthesise(
    string $seedDescription,
    string $brief,
    array $subQuestions,
    array $numberedSources,
    string $engine,
    string $language,
    float $temperature,
    string $advocateRole = '',
    ?array $priorContext = null,
    string $branchNotes = ''
): array {
    $locale = $language === 'no' ? 'Norwegian' : 'English';

    if (empty($numberedSources)) {
        return [
            'json' => [
                'brief_markdown' => $language === 'no'
                    ? 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.'
                    : 'I did not find enough source support in the corpus to give a grounded answer.',
                'what_we_found' => 'No retrieved sources passed the similarity threshold.',
                'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
                'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
            ],
            'deploy_label' => match ($engine) {
                'gpu' => 'GPU (cuttlefish)',
                'dbn_legal' => 'dbn-legal-agent',
                'azure_full' => 'gpt-4o',
                default => $this->azure->chatDeployment(),
            },
        ];
    }

    // Optional background block for branched research; each part is length-capped.
    $priorContextSection = '';
    if (!empty($priorContext)) {
        $prior = [];
        if (!empty($priorContext['original_query'])) {
            $prior[] = 'Original research question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
        }
        if (!empty($priorContext['brief_summary'])) {
            $prior[] = "Key findings from prior research:\n" . mb_substr((string)$priorContext['brief_summary'], 0, 600, 'UTF-8');
        }
        if ($branchNotes !== '') {
            $prior[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8');
        }
        if ($prior) {
            $priorContextSection = "\nBackground from prior research:\n" . implode("\n", $prior) . "\n";
        }
    }

    // One text entry per source. Fields are read defensively so a source that
    // lacks an optional key does not raise an "undefined array key" warning.
    $sourcesContext = [];
    foreach ($numberedSources as $s) {
        $sourcesContext[] = sprintf(
            "[%d] (%s) %s%s\n Corpus: %s\n Authority: %s | Jurisdiction: %s\n Excerpt: %s",
            $s['n'] ?? 0,
            ($s['source_origin'] ?? 'corpus') === 'upload' ? 'uploaded doc' : 'corpus',
            $s['title'] ?? '',
            !empty($s['section']) ? ' — ' . $s['section'] : '',
            $s['package_or_corpus'] ?? '',
            $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
            $s['jurisdiction'] ?? 'n/a',
            $s['excerpt'] ?? ''
        );
    }
    $sourcesText = implode("\n\n", $sourcesContext);

    $subQText = '';
    if ($subQuestions) {
        $lines = [];
        // array_values() keeps the numbering 1..N regardless of the input keys.
        foreach (array_values($subQuestions) as $i => $sq) {
            $lines[] = sprintf('%d. (%s) %s', $i + 1, $sq['id'] ?? '', $sq['question'] ?? '');
        }
        $subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
    }

    $sourceCount = count($numberedSources);
    $lengthGuidance = $sourceCount >= 3
        ? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
        : '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';

    // Heredoc bodies are indented to match the closing marker; PHP strips that
    // indentation, so the prompt text itself starts at column 0.
    if ($advocateRole !== '') {
        $prompt = <<<PROMPT
        You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
        Your client: {$advocateRole}
        {$priorContextSection}
        You MUST ground every claim in the numbered sources below using inline `[n]` citation markers. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
        User input:
        {$seedDescription}
        Research brief:
        {$brief}
        {$subQText}
        Sources ({$sourceCount} numbered):
        {$sourcesText}
        Return JSON only in {$locale}:
        {
        "brief_markdown": "Partisan but factually grounded advocate brief. {$lengthGuidance} Structure: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Identified weaknesses in the opposing party's position with [n] citations, (4) Procedural rights and obligations {$advocateRole} should assert. End with a one-line caveat that this is legal preparation support, not final legal advice.",
        "client_strengths": ["3-6 strings — the strongest factual/legal points for {$advocateRole}, each anchored to at least one [n] source"],
        "opposing_weaknesses": ["2-5 strings — vulnerabilities in the opposing position supported by retrieved sources. Omit this array entirely if evidence is thin — do NOT invent weaknesses."],
        "what_we_found": "2-sentence summary of the most relevant retrieved authority for {$advocateRole}",
        "what_remains_uncertain": ["3-5 gaps where evidence is insufficient or law is unclear — be honest"],
        "next_practical_step": "one concrete action for {$advocateRole} to take next (legal filing, evidence gathering, consultation type, etc.)"
        }
        Rules:
        - Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
        - If no source supports a point, omit the point — DO NOT speculate.
        - Prefer citing statute sections (e.g. "Barneloven §43") and case names verbatim from source excerpts.
        - When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
        - `opposing_weaknesses` must be omitted or empty when no retrieved source actually supports the identified weakness.
        - Respond in {$locale}.
        - Output valid JSON only — no markdown fences around the JSON object itself.
        PROMPT;
    } else {
        $prompt = <<<PROMPT
        You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
        {$priorContextSection}
        User input:
        {$seedDescription}
        Research brief:
        {$brief}
        {$subQText}
        Sources ({$sourceCount} numbered):
        {$sourcesText}
        Return JSON only in {$locale}:
        {
        "brief_markdown": "Markdown legal brief. {$lengthGuidance} Every factual claim ends with one or more inline [n] markers keyed to the sources above. Use level-3 headings (###) sparingly to separate paragraphs by theme when helpful. End with a one-line caveat that this is research support, not legal advice.",
        "what_we_found": "2-4 sentence plain-language summary of the grounded finding",
        "what_remains_uncertain": ["specific gaps — what the corpus did not cover, conflicting authority, or where confidence is limited (3-6 items when sources >= 3)"],
        "next_practical_step": "one concrete next action the user can take to strengthen the case or close a gap"
        }
        Rules:
        - Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
        - If no source supports a point, omit the point — DO NOT speculate.
        - Prefer pinpointing statute sections (e.g. "Barneloven §43") and case names verbatim from the source excerpts.
        - When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
        - Respond in {$locale}.
        - Output valid JSON only — no markdown fences around the JSON object itself.
        PROMPT;
    }

    $messages = [
        ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
        ['role' => 'user', 'content' => $prompt],
    ];
    $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3200, 'timeout' => 180];

    // Defined up front so both are always set after the try/catch below.
    $raw = '';
    $deployLabel = '';
    try {
        if ($engine === 'dbn_legal') {
            // $opts already carries the 180s timeout; only the model is added.
            $response = dbnToolsCallGpuLlm($messages, array_merge($opts, ['model' => 'dbn-legal-agent']));
            $deployLabel = 'dbn-legal-agent';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'gpu') {
            $response = dbnToolsCallGpuLlm($messages, $opts);
            $deployLabel = 'GPU (cuttlefish)';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'azure_full') {
            $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
            $deployLabel = 'gpt-4o';
        } else {
            $raw = $this->azure->chatText($messages, $opts);
            $deployLabel = $this->azure->chatDeployment();
        }
    } catch (Throwable $e) {
        // NOTE(review): dbnToolsAbort() presumably terminates the request — confirm.
        dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $json = $this->azure->decodeJsonObject($raw);
    if (!is_array($json) || empty($json['brief_markdown'])) {
        // Salvage as plain markdown
        $json = [
            'brief_markdown' => $raw,
            'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
            'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
            'next_practical_step' => 'Review the brief manually before relying on it.',
        ];
    }

    return [
        'json' => $json,
        'deploy_label' => $deployLabel,
    ];
}
/**
 * Grade how well the answer is supported, from source count and best score.
 *
 * Each source contributes its reranker_score, or its similarity when no
 * reranker_score is set; non-numeric values are ignored. The best score is
 * 0 when no source has a numeric score.
 *
 * @param array $sources Sources backing the brief.
 * @return string 'high' (>= 6 sources and best >= 0.5), 'medium'
 *                (>= 3 sources and best >= 0.35), otherwise 'low'.
 */
private function citationConfidence(array $sources): string
{
    $total = count($sources);
    if ($total === 0) {
        return 'low';
    }

    $numericScores = [];
    foreach ($sources as $source) {
        $candidate = $source['reranker_score'] ?? $source['similarity'] ?? null;
        if (is_numeric($candidate)) {
            $numericScores[] = $candidate;
        }
    }
    $top = $numericScores === [] ? 0 : max($numericScores);

    return match (true) {
        $total >= 6 && $top >= 0.5 => 'high',
        $total >= 3 && $top >= 0.35 => 'medium',
        default => 'low',
    };
}
/**
 * Build one trace entry.
 *
 * @param string $label  Short name of the step.
 * @param string $detail Description of what happened in the step.
 * @param string $status Step status; defaults to 'complete'.
 * @return array{label: string, detail: string, status: string}
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    return compact('label', 'detail', 'status');
}
/**
 * Milliseconds elapsed since $start, rounded to the nearest integer.
 *
 * @param float $start A timestamp previously taken with microtime(true).
 * @return int Elapsed time in whole milliseconds.
 */
private function elapsedMs(float $start): int
{
    $elapsedSeconds = microtime(true) - $start;
    $elapsedMillis = round($elapsedSeconds * 1000);

    return (int)$elapsedMillis;
}
}