Files
dobetternorge-tools/includes/DeepResearchAgent.php
T
daveadmin d156f8cf6b feat(tools): persona selector across standalone tools + dashboard chat
Wire the legal-domain persona picker into corpus, deep-research, korrespond and
the dashboard chat. Each endpoint reads the chosen profile, resolves its packages
against client 57, and scopes retrieval via package_ids (falling back to family
when omitted). New dashboard tenants now subscribe to all DBN domain packages so
persona switching survives the subscription intersection.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-01 23:03:31 +02:00

1298 lines
61 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';
require_once __DIR__ . '/DbnGatewayFactory.php';
require_once __DIR__ . '/DbnBedrockModelRouter.php';
final class DbnDeepResearchAgent
{
// Character cap applied to the question and to the pasted text when the seed description is built.
private const MAX_SEED_CHARS = 16000;
// Character cap applied to each uploaded file's text (before chunking and in the seed description).
private const MAX_UPLOAD_CHARS = 64000;
// Window size, in words, of one in-memory upload chunk.
private const CHUNK_WORDS = 600;
// Words shared between consecutive upload chunks (window advance = CHUNK_WORDS - CHUNK_OVERLAP_WORDS).
private const CHUNK_OVERLAP_WORDS = 75;
// Trailing windows shorter than this are dropped; a file's first window is always kept.
private const MIN_CHUNK_WORDS = 50;
// Cap handed to mergeAndDedupe() when the per-sub-question hits are merged into one pool.
private const POOL_CAP = 30;
// LLM gateway used for chat completions (chatText) and JSON decoding (decodeJsonObject).
private DbnAzureOpenAiGateway|DbnBedrockGateway $azure;
// Ephemeral upload index: list of ['meta' => chunk array, 'vec' => embedding]; reset at the start of run().
private array $uploadVecs = [];
// Step id => elapsedMs() reading for that step; reset per run and returned as trace_metadata.elapsed_ms_per_step.
private array $stepTimings = [];
/**
 * Build the agent around an LLM gateway.
 *
 * @param DbnAzureOpenAiGateway|DbnBedrockGateway|null $azure Gateway to use; when null,
 *        the factory's gateway for the 'deep-research' tool is created instead.
 */
public function __construct(DbnAzureOpenAiGateway|DbnBedrockGateway|null $azure = null)
{
    if ($azure === null) {
        $azure = DbnGatewayFactory::makeForTool('deep-research');
    }

    $this->azure = $azure;
}
/**
 * Execute one deep-research pass and return the cited brief plus its trace.
 *
 * Pipeline: seed interpretation → sub-question expansion (or caller override) →
 * slice resolution → in-memory upload indexing → per-sub-question retrieval →
 * synthesis → citation confidence.
 *
 * @param string        $seedQuery            The user's question (trimmed here).
 * @param string        $pastedText           Free text pasted by the user (trimmed here).
 * @param array         $uploadedFiles        Int-keyed list of uploads; each entry is read for
 *                                            'filename' and 'text'.
 * @param array         $sliceSelection       Raw slice toggles; normalised by dbnV6NormalizeSliceSelection().
 * @param string        $engine               Synthesis engine id; unknown values fall back to 'azure_mini'.
 * @param string        $language             UI language; normalised by dbnToolsNormalizeUiLanguage().
 * @param array         $controls             Tuning knobs; clamped by normalizeControls().
 * @param callable|null $emit                 Optional progress sink called as $emit(string $event, array $payload)
 *                                            with the events 'step', 'progress' and 'subq'.
 * @param string        $advocateRole         When non-empty the run is reported as tool 'advocate'.
 * @param array|null    $priorContext         Optional context from an earlier run (forwarded to
 *                                            interpretation and synthesis).
 * @param string        $branchNotes          Optional researcher notes (forwarded likewise).
 * @param array         $subQuestionsOverride Optional list of ['question' => ..., 'id' => ..., 'rationale' => ...]
 *                                            entries that replaces LLM query expansion.
 * @param string|null   $persona              Persona key handed to dbnToolsResolvePersona().
 *
 * @return array Result payload: brief, sub-questions, numbered sources, trace and trace metadata.
 */
public function run(
    string $seedQuery,
    string $pastedText,
    array $uploadedFiles,
    array $sliceSelection,
    string $engine,
    string $language,
    array $controls,
    ?callable $emit = null,
    string $advocateRole = '',
    ?array $priorContext = null,
    string $branchNotes = '',
    array $subQuestionsOverride = [],
    ?string $persona = null
): array {
    $seedQuery = trim($seedQuery);
    $pastedText = trim($pastedText);
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'dbn_legal', 'dbn_legal_v3', 'claude_sonnet', 'claude_haiku'], true) ? $engine : 'azure_mini';
    $language = dbnToolsNormalizeUiLanguage($language);
    $controls = $this->normalizeControls($controls);

    if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) {
        dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed');
    }

    $client = dbnToolsRequireClient();
    $personaResolved = dbnToolsResolvePersona((int)$client['id'], $persona);
    $packageIds = array_values(array_filter(
        array_map('intval', $personaResolved['package_ids'] ?? []),
        static fn(int $id): bool => $id > 0
    ));
    if (!$packageIds) {
        // Persona resolved without a package → fall back to the legacy family package.
        $packageIds = [(int)$this->requireFamilyPackage((int)$client['id'])['id']];
    }

    dbnToolsBootCaveau();
    $aiPortalRoot = dbnToolsAiPortalRoot();
    require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

    // Per-run state: the upload index and timings must never leak between runs.
    $this->uploadVecs = [];
    $this->stepTimings = [];
    $trace = [];
    $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles);

    // Records a finished step in the trace and, when streaming, forwards it to the caller.
    $emitStep = function (string $stepId, string $label, string $detail, string $status) use (&$trace, $emit): void {
        $trace[] = $this->trace($label, $detail, $status);
        if ($emit) {
            $emit('step', [
                'step' => $stepId,
                'label' => $label,
                'detail' => $detail,
                'status' => $status,
            ]);
        }
    };
    // Announces that a step has started; streamed only, never written to the trace.
    $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
        if ($emit) {
            $emit('step', [
                'step' => $stepId,
                'label' => $label,
                'detail' => $detail,
                'status' => 'running',
            ]);
        }
    };

    // STEP 1: Query interpretation
    $emitRunning('interpretation', 'Query interpretation', 'Summarising the seed input…');
    $stepStart = microtime(true);
    $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole, $priorContext, $branchNotes);
    $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
    $emitStep('interpretation', 'Query interpretation', $interpretation['detail'], 'complete');

    // STEP 2: Query expansion (or use caller-supplied override)
    $stepStart = microtime(true);
    if (!empty($subQuestionsOverride)) {
        // Override entries come from the caller: drop blank or malformed ones and make sure
        // every survivor carries a string id, because the retrieval loop below hands
        // $sq['id'] to a string-typed parameter under strict_types.
        $subQuestions = [];
        foreach ($subQuestionsOverride as $sq) {
            if (!is_array($sq)) {
                continue;
            }
            $rawQuestion = $sq['question'] ?? '';
            $question = is_scalar($rawQuestion) ? trim((string)$rawQuestion) : '';
            if ($question === '') {
                continue;
            }
            $rawId = $sq['id'] ?? '';
            $id = is_scalar($rawId) ? trim((string)$rawId) : '';
            if ($id === '') {
                $id = 'q' . (count($subQuestions) + 1);
            }
            $subQuestions[] = array_merge($sq, ['id' => $id, 'question' => $question]);
        }
        $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
        $emitStep('expansion', 'Query expansion',
            sprintf('Using %d custom sub-question(s) supplied by the user.', count($subQuestions)), 'complete');
    } else {
        $emitRunning('expansion', 'Query expansion', 'Generating sub-questions…');
        $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $interpretation['key_signals'], $controls['sub_q_count'], $language, $advocateRole);
        $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
        $subQuestions = $expansion['questions'];
        $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
        $expansionDetail = $expansion['fallback']
            ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.'
            : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions));
        $emitStep('expansion', 'Query expansion', $expansionDetail, $expansionStatus);
    }

    // STEP 3: Slice resolution
    $emitRunning('slice_resolution', 'Slice resolution', 'Resolving slice toggles to document IDs…');
    $stepStart = microtime(true);
    $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
    if (!array_filter($sliceSelectionNormalized)) {
        dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices');
    }
    $ragDb = dbnToolsRagDb();
    try {
        $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
        $sliceStatus = 'complete';
        $sliceDetail = sprintf(
            '%d slice(s) active → %d candidate documents constrain the corpus search.',
            count(array_filter($sliceSelectionNormalized)),
            count($sharedDocIds)
        );
    } catch (Throwable $e) {
        // Best effort: a failed resolution degrades to an unconstrained search, not an abort.
        error_log('DBN deep research slice resolve failed: ' . $e->getMessage());
        $sharedDocIds = [];
        $sliceStatus = 'warning';
        $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
    }
    $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
    $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

    // STEP 4: Upload indexing (in-memory, ephemeral)
    $emitRunning('upload_indexing', 'Upload indexing', empty($uploadedFiles)
        ? 'No uploads; skipping…'
        : sprintf('Chunking + embedding %d file(s) in memory…', count($uploadedFiles)));
    $stepStart = microtime(true);
    $uploadChunks = [];
    foreach ($uploadedFiles as $idx => $file) {
        $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
        // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
        $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
        $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
    }
    $uploadStatus = 'complete';
    $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
    if ($uploadChunks) {
        try {
            // Embed in small batches of 5, emitting progress between each so the stream
            // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
            $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
            $textCount = count($texts);
            $vecs = [];
            $batchSize = 5;
            for ($b = 0; $b < $textCount; $b += $batchSize) {
                $batch = array_slice($texts, $b, $batchSize);
                if ($emit) {
                    // The separator keeps the two range bounds apart (e.g. "1-5 of 12").
                    $emit('progress', ['detail' => sprintf(
                        'Embedding chunks %d-%d of %d…',
                        $b + 1, $b + count($batch), $textCount
                    )]);
                }
                $vecs = array_merge($vecs, dbnToolsLiteLLMEmbedBatch($batch));
            }
            if (count($vecs) === count($uploadChunks)) {
                foreach ($uploadChunks as $i => $chunk) {
                    $this->uploadVecs[] = [
                        'meta' => $chunk,
                        'vec' => $vecs[$i],
                    ];
                }
            } else {
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.';
            }
        } catch (Throwable $e) {
            error_log('DBN deep research upload embed failed: ' . $e->getMessage());
            $uploadStatus = 'warning';
            $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
            $this->uploadVecs = [];
        }
    } elseif (empty($uploadedFiles)) {
        $uploadDetail = 'No files uploaded; agent will research the corpus only.';
    }
    $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
    $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

    // STEP 5: Retrieval (per sub-question)
    // With no usable sub-questions, retrieve once on the seed query (or the interpreted brief).
    $retrievalQueries = $subQuestions ?: [[
        'id' => 'q1',
        'question' => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'),
        'rationale' => 'Seed query (no sub-question expansion).',
    ]];
    $emitRunning('retrieval', 'Retrieval', sprintf('Hybrid vector + keyword + rerank across %d sub-question(s)…', count($retrievalQueries)));
    $stepStart = microtime(true);
    try {
        $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
    } catch (Throwable $e) {
        dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
    }
    $rawPool = [];
    $retrievalWarnings = 0;
    $rawCorpusCount = 0;
    $rawUploadCount = 0;
    $filteredOutCount = 0;
    foreach ($retrievalQueries as $idx => $sq) {
        if ($emit) {
            $emit('subq', [
                'index' => $idx + 1,
                'total' => count($retrievalQueries),
                'id' => $sq['id'],
                'question' => $sq['question'],
            ]);
        }
        try {
            $corpusChunks = $rag->searchAll(
                $sq['question'],
                $controls['chunk_limit'],
                null,
                [
                    'search_private' => false,
                    'search_shared' => true,
                    'package_ids' => $packageIds,
                    'shared_doc_ids' => $sharedDocIds,
                    'chunk_limit' => $controls['chunk_limit'],
                    'search_method' => 'hybrid',
                    'reranker_enabled' => true,
                    'include_beta_website' => false,
                    'include_primary_website'=> false,
                ]
            );
        } catch (Throwable $e) {
            // One failing sub-question only downgrades the step to a warning.
            error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage());
            $corpusChunks = [];
            $retrievalWarnings++;
        }
        $rawCorpusCount += count($corpusChunks);
        foreach ($corpusChunks as $chunk) {
            if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
                $filteredOutCount++;
                continue;
            }
            $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
        }
        // Upload chunk retrieval via cosine sim
        if (!empty($this->uploadVecs)) {
            $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
            $rawUploadCount += count($uploadHits);
            foreach ($uploadHits as $hit) {
                $hit['matched_sub_questions'] = [$sq['id']];
                $rawPool[] = $hit;
            }
        }
    }
    $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
    $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
    $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
    $retrievalDetail = sprintf(
        '%d sub-question(s) × hybrid + RRF + rerank → %d corpus chunks (%d filtered) + %d upload hits → %d unique after dedupe.',
        count($retrievalQueries),
        $rawCorpusCount,
        $filteredOutCount,
        $rawUploadCount,
        count($merged)
    );
    $emitStep('retrieval', 'Retrieval', $retrievalDetail, $retrievalStatus);

    // Cap pool to reranker top-K for synthesis
    $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
    // Hydrate corpus sources with source_url + authority_label via batched dbn_v6 query
    $this->hydrateSourceUrls($synthesisPool);
    $numberedSources = $this->numberSources($synthesisPool);
    $retrievalCounts = [
        'raw_corpus' => $rawCorpusCount,
        'filtered_website' => $filteredOutCount,
        'post_filter_corpus' => $rawCorpusCount - $filteredOutCount,
        'raw_upload' => $rawUploadCount,
        'after_dedupe' => count($merged),
        'after_topk' => count($numberedSources),
    ];

    // STEP 6: Synthesis
    // Engines without a known friendly label are shown by id rather than mislabelled
    // as the Azure mini deployment.
    $synthesisEngineLabel = match ($engine) {
        'azure_full' => 'Azure gpt-4o',
        'gpu' => 'GPU qwen2.5:14b',
        'azure_mini' => 'Azure gpt-4o-mini',
        default => $engine,
    };
    $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising cited brief with %s — this is the slowest step…', $synthesisEngineLabel));
    $stepStart = microtime(true);
    // Attach upload summaries (generated lazily) to numbered sources
    if (!empty($uploadedFiles) && !empty($numberedSources)) {
        $uploadSummaries = [];
        foreach ($uploadedFiles as $idx => $file) {
            $text = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8');
            $filename = (string)($file['filename'] ?? "file-{$idx}");
            if ($text === '') continue;
            try {
                $raw = $this->azure->chatText([
                    ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                    ['role' => 'user', 'content' => "Summarise this document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"],
                ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 20]);
                $uploadSummaries[$idx] = trim($raw);
            } catch (Throwable $e) {
                error_log('DBN upload summary gen failed for file ' . $idx . ': ' . $e->getMessage());
                $uploadSummaries[$idx] = null;
            }
        }
        foreach ($numberedSources as &$src) {
            if (($src['source_origin'] ?? '') !== 'upload') continue;
            // Upload chunk ids look like "upload:<fileIdx>:<chunkIdx>"; map back to the file.
            if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
                $src['summary'] = $uploadSummaries[(int)$m[1]] ?? null;
            }
        }
        unset($src);
    }
    $synthesis = $this->synthesise(
        $seedDescription,
        $interpretation['brief'],
        $retrievalQueries,
        $numberedSources,
        $engine,
        $language,
        $controls['temperature'],
        $advocateRole,
        $priorContext,
        $branchNotes,
        $interpretation['key_signals'] ?? []
    );
    $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
    $emitStep(
        'synthesis',
        'Synthesis',
        sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)),
        'complete'
    );

    // STEP 7: Confidence
    $confidence = $this->citationConfidence($numberedSources);
    $emitStep(
        'confidence',
        'Citation confidence',
        sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)),
        $confidence === 'low' ? 'warning' : 'complete'
    );

    // Stitch sub-question chunk_ids + top_sources (top 3 sources matched by each sub-Q)
    $subQOut = [];
    foreach ($retrievalQueries as $sq) {
        $matchedChunks = array_values(array_filter(
            $numberedSources,
            fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
        ));
        $topSources = array_slice($matchedChunks, 0, 3);
        $subQOut[] = [
            'id' => $sq['id'],
            'question' => $sq['question'],
            'rationale' => $sq['rationale'] ?? '',
            'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
            'top_sources' => array_map(fn(array $s) => [
                'n' => $s['n'] ?? null,
                'title' => $s['title'] ?? '',
                'section' => $s['section'] ?? null,
                'deep_link' => $s['deep_link'] ?? $s['source_url'] ?? null,
                'source_url' => $s['source_url'] ?? null,
                'source_origin' => $s['source_origin'] ?? 'corpus',
                'authority_label'=> $s['authority_label'] ?? null,
                'graph_expanded' => $s['graph_expanded'] ?? false,
                'excerpt' => $s['excerpt'] ?? '',
            ], $topSources),
        ];
    }

    $isAdvocate = $advocateRole !== '';
    return [
        'tool' => $isAdvocate ? 'advocate' : 'deep_research',
        'language' => $language,
        'advocate_role' => $isAdvocate ? $advocateRole : null,
        'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''),
        'client_strengths' => $isAdvocate ? ($synthesis['json']['client_strengths'] ?? []) : null,
        'opposing_weaknesses' => $isAdvocate ? ($synthesis['json']['opposing_weaknesses'] ?? []) : null,
        'sub_questions' => $subQOut,
        'sources' => $numberedSources,
        'what_we_found' => (string)($synthesis['json']['what_we_found'] ?? ''),
        'evidence_trail' => $numberedSources,
        'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
        'next_practical_step' => (string)($synthesis['json']['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($merged),
            'source_count' => count($numberedSources),
            'sub_question_count' => count($retrievalQueries),
            'upload_chunk_count' => count($this->uploadVecs),
            'deployment' => $synthesis['deploy_label'],
            'engine_used' => $engine,
            'citation_confidence' => $confidence,
            'elapsed_ms_per_step' => $this->stepTimings,
            'retrieval_counts' => $retrievalCounts,
            'slices_active' => array_keys(array_filter($sliceSelectionNormalized)),
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Clamp the caller-supplied tuning knobs into their supported ranges.
 *
 * Missing keys take the default; every value is cast and then bounded:
 * sub_q_count 3..5 (default 4), chunk_limit 4..10 (6), similarity_threshold
 * 0.2..0.6 (0.30), reranker_top_k 8..14 (12), temperature 0.05..0.4 (0.10).
 *
 * @param array $controls Raw control values keyed by knob name.
 * @return array{sub_q_count:int, chunk_limit:int, similarity_threshold:float, reranker_top_k:int, temperature:float}
 */
private function normalizeControls(array $controls): array
{
    $boundedInt = static fn(string $knob, int $fallback, int $floor, int $ceiling): int
        => max($floor, min($ceiling, (int)($controls[$knob] ?? $fallback)));
    $boundedFloat = static fn(string $knob, float $fallback, float $floor, float $ceiling): float
        => max($floor, min($ceiling, (float)($controls[$knob] ?? $fallback)));

    $normalized = [];
    $normalized['sub_q_count'] = $boundedInt('sub_q_count', 4, 3, 5);
    $normalized['chunk_limit'] = $boundedInt('chunk_limit', 6, 4, 10);
    $normalized['similarity_threshold'] = $boundedFloat('similarity_threshold', 0.30, 0.2, 0.6);
    $normalized['reranker_top_k'] = $boundedInt('reranker_top_k', 12, 8, 14);
    $normalized['temperature'] = $boundedFloat('temperature', 0.10, 0.05, 0.4);

    return $normalized;
}
/**
 * Fetch the 'family-legal' package and verify it can be used for this client.
 *
 * Calls dbnToolsAbort() with a 503 when the package is missing/inactive or when
 * the client has no active subscription to it.
 *
 * @param int $clientId Client whose subscription is checked.
 * @return array The package row (its 'id' is read by the caller).
 */
private function requireFamilyPackage(int $clientId): array
{
    $familyPackage = dbnToolsFetchPackage('family-legal');

    $isUsable = $familyPackage && !empty($familyPackage['is_active']);
    if (!$isUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $subscribed = dbnToolsHasActiveSubscription($clientId, (int)$familyPackage['id']);
    if (!$subscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $familyPackage;
}
/**
 * Assemble the labelled seed text that is fed to the interpretation and expansion prompts.
 *
 * Sections appear in the order question, pasted text, uploads; empty sections are
 * omitted and sections are separated by a blank line. The question and pasted text
 * are capped at MAX_SEED_CHARS each, every upload at MAX_UPLOAD_CHARS.
 *
 * @param string $seedQuery     Trimmed user question.
 * @param string $pastedText    Trimmed pasted text.
 * @param array  $uploadedFiles Uploads, each read for 'filename' and 'text'.
 * @return string The combined description ('' when every section is empty).
 */
private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string
{
    $sections = [];

    $headed = [
        ["Question:\n", $seedQuery],
        ["Pasted text:\n", $pastedText],
    ];
    foreach ($headed as [$heading, $value]) {
        if ($value !== '') {
            $sections[] = $heading . mb_substr($value, 0, self::MAX_SEED_CHARS, 'UTF-8');
        }
    }

    foreach ($uploadedFiles as $position => $upload) {
        $label = (string)($upload['filename'] ?? sprintf('upload-%d', $position + 1));
        $body = (string)($upload['text'] ?? '');
        if ($body !== '') {
            $sections[] = sprintf(
                "Uploaded file [%s]:\n%s",
                $label,
                mb_substr($body, 0, self::MAX_UPLOAD_CHARS, 'UTF-8')
            );
        }
    }

    return implode("\n\n", $sections);
}
/**
 * Ask an LLM for a short research brief and retrieval keywords describing the seed input.
 *
 * Norwegian ('no') and advocate runs go to the GPU model via dbnToolsCallGpuLlm();
 * everything else uses the injected gateway (switched to the Haiku deployment when it
 * is a Bedrock gateway). Any failure or unusable response degrades to an empty
 * interpretation rather than aborting the run.
 *
 * The model's JSON is treated as untrusted: 'brief' must be a non-empty string/number
 * and 'key_signals' is reduced to at most 8 non-empty scalar strings, so later
 * implode()/prompt interpolation never sees nested arrays or objects.
 *
 * @param string     $seedDescription Output of buildSeedDescription().
 * @param string     $language        Normalised UI language code.
 * @param string     $advocateRole    Party the brief is prepared for; '' for neutral research.
 * @param array|null $priorContext    Optional earlier-run context; 'original_query' and
 *                                    'what_we_found' are read.
 * @param string     $branchNotes     Researcher notes; only used together with $priorContext.
 * @return array{brief:string, key_signals:list<string>, detail:string}
 */
private function interpretSeed(string $seedDescription, string $language, string $advocateRole = '', ?array $priorContext = null, string $branchNotes = ''): array
{
    $locale = dbnToolsLanguageName($language);
    $rolePrefix = $advocateRole !== ''
        ? "You are preparing a case-research brief for: {$advocateRole}. Frame your interpretation to identify the strongest legal angles for this party.\n\n"
        : '';

    $priorContextBlock = '';
    if (!empty($priorContext)) {
        $parts = ['Prior research context:'];
        if (!empty($priorContext['original_query'])) {
            $parts[] = 'Original question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
        }
        if (!empty($priorContext['what_we_found'])) {
            $parts[] = 'Key findings: ' . mb_substr((string)$priorContext['what_we_found'], 0, 400, 'UTF-8');
        }
        if ($branchNotes !== '') {
            $parts[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8');
        }
        $priorContextBlock = implode("\n", $parts) . "\n\nNow investigate this branch:\n";
    }

    $prompt = <<<PROMPT
    {$rolePrefix}{$priorContextBlock}You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.
    Input:
    {$seedDescription}
    In {$locale}, produce JSON with:
    {
    "brief": "1-3 sentence description of what the user is trying to research (≤ 300 chars)",
    "key_signals": ["short keywords or terms that should drive retrieval"]
    }
    PROMPT;

    try {
        $sysMsg = ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'];
        $userMsg = ['role' => 'user', 'content' => $prompt];
        if ($language === 'no' || $advocateRole !== '') {
            $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
                'model' => 'dbn-legal-agent-v2', 'json' => true,
                'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 40,
            ]);
            $raw = (string)($resp['choices'][0]['message']['content'] ?? '');
        } else {
            $interpGateway = ($this->azure instanceof DbnBedrockGateway)
                ? $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU)
                : $this->azure;
            $raw = $interpGateway->chatText([$sysMsg, $userMsg],
                ['json' => true, 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 60]);
        }

        $json = $this->azure->decodeJsonObject($raw);

        // Accept only a textual brief; an array/object here would stringify as "Array".
        $rawBrief = is_array($json) ? ($json['brief'] ?? null) : null;
        $brief = (is_string($rawBrief) || is_int($rawBrief) || is_float($rawBrief))
            ? trim((string)$rawBrief)
            : '';

        if ($brief !== '') {
            // Keep the first 8 usable keywords, skipping nested or blank entries.
            $signals = [];
            $rawSignals = is_array($json['key_signals'] ?? null) ? $json['key_signals'] : [];
            foreach ($rawSignals as $signal) {
                if (!is_string($signal) && !is_int($signal) && !is_float($signal)) {
                    continue;
                }
                $signal = trim((string)$signal);
                if ($signal === '') {
                    continue;
                }
                $signals[] = $signal;
                if (count($signals) >= 8) {
                    break;
                }
            }
            $signalText = implode(', ', $signals);

            return [
                'brief' => $brief,
                'key_signals' => $signals,
                'detail' => sprintf('Research focus: %s%s', $brief, $signalText !== '' ? ' — signals: ' . $signalText : ''),
            ];
        }
    } catch (Throwable $e) {
        error_log('DBN deep research interpretation failed: ' . $e->getMessage());
    }

    // Best-effort step: the run continues on the raw seed when no brief could be produced.
    return [
        'brief' => '',
        'key_signals' => [],
        'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
    ];
}
/**
 * Ask an LLM to decompose the research request into focused sub-questions.
 *
 * An advocate-specific prompt is used when $advocateRole is set. Norwegian ('no') runs
 * go to the GPU model via dbnToolsCallGpuLlm(); other languages use the injected
 * gateway (Haiku deployment when it is a Bedrock gateway).
 *
 * The model's JSON is treated as untrusted: entries without a non-blank string
 * question are skipped, ids are assigned sequentially (q1, q2, …) over the accepted
 * entries instead of being derived from raw array keys, and non-scalar key signals
 * are ignored when building the anchors line.
 *
 * @param string $seedDescription Output of buildSeedDescription(); only an excerpt is sent.
 * @param string $brief           Research brief from interpretSeed().
 * @param array  $keySignals      Retrieval keywords from interpretSeed().
 * @param int    $targetCount     Number of sub-questions requested (also the hard cap).
 * @param string $language        Normalised UI language code.
 * @param string $advocateRole    Party the case is built for; '' for neutral research.
 * @return array{questions: list<array{id:string, question:string, rationale:string}>, fallback: bool}
 *         'fallback' is true (with no questions) when fewer than 2 usable sub-questions came back.
 */
private function expandQueries(string $seedDescription, string $brief, array $keySignals, int $targetCount, string $language, string $advocateRole = ''): array
{
    $locale = dbnToolsLanguageName($language);

    // Only scalar anchors can be joined into the prompt; anything else is dropped.
    $anchorTerms = array_filter($keySignals, 'is_scalar');
    $anchorsLine = !empty($anchorTerms)
        ? "\nKey retrieval anchors (incorporate these terms into your sub-questions where relevant):\n" . implode(', ', $anchorTerms) . "\n"
        : '';

    // Truncate seed to 2000 chars — $brief already captures the key context;
    // the full upload text (up to 192K chars) would push past the 60s timeout.
    $seedExcerpt = mb_strimwidth($seedDescription, 0, 2000, '…', 'UTF-8');

    if ($advocateRole !== '') {
        $prompt = <<<PROMPT
        You are a Norwegian family-law research assistant building a case for: {$advocateRole}.
        Generate exactly {$targetCount} targeted sub-questions designed to find:
        1. Lovdata statutes and ECHR/Hague precedents that support {$advocateRole}'s position.
        2. Procedural rights and obligations the opposing party must satisfy — failures here help {$advocateRole}.
        3. Case law that exposes weaknesses in the opposing party's likely arguments.
        4. Specific articles, paragraphs, or judgments {$advocateRole}'s representative should cite.
        5. Specific documentation and procedural obligations Barnevernet or the opposing authority must fulfil — procedural or evidentiary failures that Norwegian courts have used to rule in favour of parents or children.
        Research brief:
        {$brief}
        {$anchorsLine}
        Raw input:
        {$seedExcerpt}
        Return JSON only in {$locale}:
        {
        "sub_questions": [
        {"id":"q1","question":"...","rationale":"how finding this strengthens {$advocateRole}'s case (≤ 140 chars)"}
        ]
        }
        Rules:
        - Exactly {$targetCount} sub-questions, no more, no fewer.
        - Every question must be answerable from Norwegian family-law, child-welfare, or ECHR/Hague sources.
        - Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame, Barnevernet procedural obligation).
        - Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
        - Sub-questions must be self-contained — readable without the raw input.
        - Write the questions in {$locale}.
        PROMPT;
    } else {
        $prompt = <<<PROMPT
        You are decomposing a Do Better Norge legal-research request into {$targetCount} focused sub-questions that should each be answered by the legal corpus (Norwegian family law, child welfare, ECHR/Hague).
        Research brief:
        {$brief}
        {$anchorsLine}
        Raw input:
        {$seedExcerpt}
        Return JSON only:
        {
        "sub_questions": [
        {"id":"q1","question":"... ({$locale})","rationale":"why this angle matters (≤ 140 chars)"}
        ]
        }
        Rules:
        - Exactly {$targetCount} sub-questions, no more, no fewer.
        - Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
        - Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
        - Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
        - Sub-questions must be self-contained — readable without seeing the seed text.
        - Write the questions in {$locale}.
        PROMPT;
    }

    try {
        $sysMsg = ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'];
        $userMsg = ['role' => 'user', 'content' => $prompt];
        if ($language === 'no') {
            $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
                'model' => 'dbn-legal-agent-v2', 'json' => true,
                'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 50,
            ]);
            $raw = (string)($resp['choices'][0]['message']['content'] ?? '');
        } else {
            $expGateway = ($this->azure instanceof DbnBedrockGateway)
                ? $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU)
                : $this->azure;
            $raw = $expGateway->chatText([$sysMsg, $userMsg],
                ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 60]);
        }

        $json = $this->azure->decodeJsonObject($raw);
        $items = is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : [];

        $normalized = [];
        foreach ($items as $item) {
            if (!is_array($item)) {
                continue;
            }
            // Trim before testing so whitespace-only questions never reach retrieval.
            $rawQuestion = $item['question'] ?? '';
            $question = is_string($rawQuestion) ? trim($rawQuestion) : '';
            if ($question === '') {
                continue;
            }
            $rawRationale = $item['rationale'] ?? '';
            $normalized[] = [
                // Sequential over accepted entries: no gaps, and safe for string-keyed input.
                'id' => 'q' . (count($normalized) + 1),
                'question' => $question,
                'rationale' => is_string($rawRationale) ? trim($rawRationale) : '',
            ];
            if (count($normalized) >= $targetCount) {
                break;
            }
        }

        if (count($normalized) >= 2) {
            return ['questions' => $normalized, 'fallback' => false];
        }
    } catch (Throwable $e) {
        error_log('DBN deep research expansion failed: ' . $e->getMessage());
    }

    // Caller falls back to retrieving on the seed query alone.
    return ['questions' => [], 'fallback' => true];
}
/**
 * Cut one upload's text into overlapping word windows for in-memory embedding.
 *
 * Whitespace is collapsed first. Windows are CHUNK_WORDS long and start every
 * (CHUNK_WORDS - CHUNK_OVERLAP_WORDS) words. The first window is always kept; later
 * windows are kept only when they hold at least MIN_CHUNK_WORDS words. Iteration
 * stops after the first window that comes back short.
 *
 * @param string $text     Upload text.
 * @param string $filename Name recorded on every chunk.
 * @param int    $fileIdx  Index of the upload; embedded in chunk ids as "upload:<fileIdx>:<n>".
 * @return array List of ['chunk_id', 'file_index', 'chunk_index', 'filename', 'text'] rows.
 */
private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
{
    $collapsed = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
    if ($collapsed === '') {
        return [];
    }

    $tokens = preg_split('/\s+/u', $collapsed, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if (!$tokens) {
        return [];
    }

    $tokenCount = count($tokens);
    $stride = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);
    $windows = [];

    for ($offset = 0; $offset < $tokenCount; $offset += $stride) {
        $window = array_slice($tokens, $offset, self::CHUNK_WORDS);
        $windowSize = count($window);

        if ($offset === 0 || $windowSize >= self::MIN_CHUNK_WORDS) {
            // Chunk numbers are consecutive over the windows actually kept.
            $sequence = count($windows);
            $windows[] = [
                'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $sequence),
                'file_index' => $fileIdx,
                'chunk_index'=> $sequence,
                'filename' => $filename,
                'text' => implode(' ', $window),
            ];
        }

        // A short window means the end of the text was reached.
        if ($windowSize < self::CHUNK_WORDS) {
            break;
        }
    }

    return $windows;
}
/**
 * Rank the in-memory upload chunks against one sub-question by cosine similarity.
 *
 * The question is embedded with dbnToolsLiteLLMEmbedBatch(); an embedding failure or
 * an empty vector yields no hits. Chunks scoring below $threshold are discarded, the
 * rest are sorted by descending similarity and cut to ceil($limitPerSubQ / 2) rows
 * (never fewer than one slot).
 *
 * @param string $question     Sub-question text.
 * @param int    $limitPerSubQ Per-sub-question chunk limit from the controls.
 * @param float  $threshold    Minimum cosine similarity for a hit.
 * @return array List of source rows with 'source_origin' => 'upload'.
 */
private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
{
    if (empty($this->uploadVecs)) {
        return [];
    }

    try {
        $queryVector = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? [];
    } catch (Throwable $e) {
        error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage());
        return [];
    }
    if (empty($queryVector)) {
        return [];
    }

    $hits = [];
    foreach ($this->uploadVecs as $indexed) {
        $score = $this->cosineSim($queryVector, $indexed['vec']);
        if ($score < $threshold) {
            continue;
        }

        $meta = $indexed['meta'];
        $hits[] = [
            'chunk_id' => $meta['chunk_id'],
            'title' => 'uploaded: ' . $meta['filename'],
            'section' => null,
            'package_or_corpus' => 'Your upload',
            'excerpt' => dbnToolsExcerpt($meta['text'], 950),
            'chunk_text' => $meta['text'],
            'similarity' => round($score, 4),
            'reranker_score' => null,
            'document_id' => null,
            'source_origin' => 'upload',
            'authority_type' => null,
            'jurisdiction' => null,
        ];
    }

    // Best match first.
    usort($hits, static fn(array $left, array $right) => ($right['similarity'] <=> $left['similarity']));

    $quota = max(1, (int)ceil($limitPerSubQ / 2));

    return array_slice($hits, 0, $quota);
}
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first min(count($a), count($b)) positions are compared, read by integer
 * index. Returns 0.0 when either vector is empty or has zero magnitude over that range.
 *
 * @param array $a First vector.
 * @param array $b Second vector.
 * @return float Similarity value.
 */
private function cosineSim(array $a, array $b): float
{
    $dims = min(count($a), count($b));
    if ($dims === 0) {
        return 0.0;
    }

    $dotProduct = 0.0;
    $sumSqA = 0.0;
    $sumSqB = 0.0;

    $pos = 0;
    while ($pos < $dims) {
        $left = (float)$a[$pos];
        $right = (float)$b[$pos];
        $dotProduct += $left * $right;
        $sumSqA += $left * $left;
        $sumSqB += $right * $right;
        $pos++;
    }

    // A zero-length vector has no direction; report "no similarity" instead of dividing by zero.
    if ($sumSqA === 0.0 || $sumSqB === 0.0) {
        return 0.0;
    }

    return $dotProduct / (sqrt($sumSqA) * sqrt($sumSqB));
}
/**
 * Map a raw retrieval chunk onto the source-row shape used by the rest of the pipeline.
 *
 * Scores are rounded to 4 decimals (null when absent). URL, deep link, authority
 * label, corpus source name and publication date start as null and are filled in
 * later by hydrateSourceUrls().
 *
 * @param array  $chunk   Chunk row as returned by the retrieval pipeline.
 * @param string $subQId  Id of the sub-question that surfaced this chunk.
 * @return array Normalised source row with 'source_origin' => 'corpus'.
 */
private function normalizeCorpusChunk(array $chunk, string $subQId): array
{
    $roundedOrNull = static fn(string $field): ?float
        => isset($chunk[$field]) ? round((float)$chunk[$field], 4) : null;
    $intOrNull = static fn(string $field): ?int
        => isset($chunk[$field]) ? (int)$chunk[$field] : null;

    $body = (string)($chunk['content'] ?? '');

    return [
        'chunk_id' => $intOrNull('id'),
        'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
        'section' => $chunk['section_title'] ?? null,
        'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
        'excerpt' => dbnToolsExcerpt($body, 950),
        'chunk_text' => $body,
        'similarity' => $roundedOrNull('similarity'),
        'reranker_score' => $roundedOrNull('reranker_score'),
        'document_id' => $intOrNull('document_id'),
        'graph_expanded' => !empty($chunk['_graph_expanded']),
        'source_origin' => 'corpus',
        'authority_type' => $chunk['authority_type'] ?? null,
        'jurisdiction' => $chunk['jurisdiction'] ?? null,
        'publication_year' => $chunk['publication_year'] ?? null,
        // Placeholders populated by hydrateSourceUrls()
        'source_url' => null,
        'deep_link' => null,
        'authority_label' => null,
        'corpus_source_name'=> null,
        'publication_date' => null,
        'matched_sub_questions' => [$subQId],
    ];
}
/**
 * Decide whether a retrieved chunk must be dropped before it enters the research pool.
 *
 * Two families of chunks are rejected:
 *
 * - EU AI Act material (source_id=7, EUR-Lex), which can leak in when the Qdrant
 *   search runs unconstrained (e.g. empty shared_doc_ids after a slice-resolution
 *   failure). It is matched on title or on an EUR-Lex URL and is always excluded.
 * - DBN website pages (Resource Directory, Flashcards, etc.), which are indexed with
 *   a NULL source_id and score artificially high on broad queries. They are matched
 *   on source name or title and excluded unless the dbn_resources slice is ON.
 *
 * @param array $chunk        Raw chunk; 'source_name', 'document_title'/'title' and 'source_url' are read.
 * @param array $activeSlices Normalised slice toggles; only 'dbn_resources' is consulted.
 * @return bool True when the chunk should be discarded.
 */
private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
{
    $sourceName = strtolower((string)($chunk['source_name'] ?? ''));
    $docTitle = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
    $sourceUrl = strtolower((string)($chunk['source_url'] ?? ''));

    // EU AI Act, recognised by title.
    if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $docTitle)) {
        return true;
    }
    // EU AI Act, recognised by its EUR-Lex URL.
    if (str_contains($sourceUrl, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $sourceUrl)) {
        return true;
    }

    $looksLikeDbnPage = str_contains($sourceName, 'website')
        || str_contains($docTitle, 'dobetternorge.no')
        || str_contains($docTitle, 'resource directory')
        || preg_match('/^(homepage|landing|about |contact )/i', $docTitle)
        || preg_match('/^flashcards?\s*[-|]/i', $docTitle)
        || preg_match('/\|\s*do better norge\s*$/i', $docTitle)
        || preg_match('/[-]\s*do better norge\s*$/i', $docTitle);

    if (!$looksLikeDbnPage) {
        return false;
    }

    // Website pages pass only when the dbn_resources slice is switched on.
    return !($activeSlices['dbn_resources'] ?? false);
}
/**
 * Hydrate the synthesis pool in place with document-level metadata.
 *
 * For every chunk whose `source_origin` is `corpus` (the default when the key is
 * absent) and whose `document_id` is a positive integer, this sets `source_url`,
 * `deep_link`, `authority_label`, `corpus_source_name`, `publication_date` and
 * `summary`. Chunks from other origins are left untouched.
 *
 * Uses a direct query against bnl_corpus.documents (only columns that exist there —
 * the temporal columns added in migration 136 are absent on this instance).
 *
 * Failure handling is best-effort:
 *  - if the documents query fails, the error is logged and the pool is left as-is;
 *  - if a lazy summary generation fails, that document simply keeps a null summary;
 *  - if the corpus_sources name lookup fails, the error is logged and the default
 *    source name 'Do Better Legal' is kept, so the metadata already fetched is
 *    still applied to the pool.
 *
 * @param array $pool Synthesis pool, modified by reference.
 */
private function hydrateSourceUrls(array &$pool): void
{
    // Collect the distinct document ids referenced by corpus chunks.
    $docIds = [];
    foreach ($pool as $chunk) {
        if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') {
            continue;
        }
        $docId = (int)($chunk['document_id'] ?? 0);
        if ($docId > 0) {
            $docIds[$docId] = true;
        }
    }
    if ($docIds === []) {
        return;
    }

    $docMeta = [];
    $sourceIds = [];
    try {
        $ragDb = dbnToolsRagDb();
        $ids = array_keys($docIds);
        $ph = implode(',', array_fill(0, count($ids), '?'));
        $stmt = $ragDb->prepare("
            SELECT d.id, d.title, d.source_url, d.authority_type,
            d.publication_date, d.source_id, d.jurisdiction,
            d.summary, LEFT(d.content, 4000) AS content_excerpt
            FROM documents d
            WHERE d.id IN ({$ph})
        ");
        $stmt->execute($ids);
        foreach ($stmt as $row) {
            $dId = (int)$row['id'];
            $sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
            if ($sid) {
                // Keyed by id so the later IN (...) list is already de-duplicated.
                $sourceIds[$sid] = true;
            }
            $docMeta[$dId] = [
                'source_url' => $row['source_url'] ?? null,
                'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null),
                'publication_date' => $row['publication_date'] ?? null,
                'corpus_source_name' => 'Do Better Legal',
                'source_id' => $sid,
                'summary' => $row['summary'] ?? null,
                'content_excerpt' => (string)($row['content_excerpt'] ?? ''),
                'title' => (string)($row['title'] ?? ''),
            ];
        }

        // Lazily generate summaries for documents that don't have one yet.
        // NOTE(review): this issues one LLM call (25s timeout) per unsummarized document
        // inside the request — confirm the pool cap keeps the worst case acceptable.
        $unsummarized = array_filter(
            $docMeta,
            static fn(array $m): bool => $m['summary'] === null && $m['content_excerpt'] !== ''
        );
        foreach ($unsummarized as $dId => $m) {
            try {
                $raw = $this->azure->chatText([
                    ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                    ['role' => 'user', 'content' => "Summarise this Norwegian family law document for a legal researcher.\nFocus on: which legal provisions it covers, its authority type, and what questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"],
                ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
                $summary = trim($raw);
                if ($summary !== '') {
                    // Persist so the next request finds the summary already stored.
                    $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$summary, $dId]);
                    $docMeta[$dId]['summary'] = $summary;
                }
            } catch (Throwable $e) {
                error_log('DBN hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage());
            }
        }
    } catch (Throwable $e) {
        error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
        return;
    }

    // Enrich with corpus source name from bnl_admin.corpus_sources. This is isolated in
    // its own try/catch: the name is an optional extra, and losing it must not discard
    // the URLs, labels and summaries gathered above.
    if ($sourceIds !== []) {
        try {
            $uSids = array_keys($sourceIds);
            $sPh = implode(',', array_fill(0, count($uSids), '?'));
            $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
            $sStmt->execute($uSids);
            $srcNames = [];
            foreach ($sStmt as $row) {
                $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
            }
            foreach ($docMeta as $dId => $meta) {
                $sid = $meta['source_id'];
                if ($sid && isset($srcNames[$sid])) {
                    $docMeta[$dId]['corpus_source_name'] = $srcNames[$sid];
                }
            }
        } catch (Throwable $e) {
            error_log('DBN hydrateSourceUrls source-name lookup failed: ' . $e->getMessage());
        }
    }

    foreach ($pool as &$chunk) {
        if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') {
            continue;
        }
        $docId = (int)($chunk['document_id'] ?? 0);
        if (!$docId || !isset($docMeta[$docId])) {
            continue;
        }
        $m = $docMeta[$docId];
        $sourceUrl = $m['source_url'] ?? null;
        $chunk['source_url'] = $sourceUrl;
        $chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
        // Fall back to whatever label the chunk already carried (if any) without
        // tripping an undefined-key warning when it carried none.
        $chunk['authority_label'] = $m['authority_label'] ?? ($chunk['authority_label'] ?? null);
        $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null;
        $chunk['publication_date'] = $m['publication_date'] ?? null;
        $chunk['summary'] = $m['summary'] ?? null;
    }
    unset($chunk);
}
/**
 * Construct a clickable URL into the original article. Lovdata supports
 * path-style section anchors (e.g. /§43 or /§4-12). For other hosts we return
 * the document root URL.
 *
 * The section number is taken from the first "§" reference in the section title.
 * It may be a plain number ("§ 43"), a chapter-style number ("§ 4-12"), and may
 * carry a single trailing letter ("§ 43a"). The whole "4-12" form is captured;
 * a dangling hyphen with no digits after it is not included in the anchor.
 *
 * @param string|null $sourceUrl    Document URL; null or blank yields null.
 * @param string|null $sectionTitle Section heading of the chunk, if any.
 * @return string|null Deep link, the trimmed source URL, or null when there is no URL.
 */
private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
{
    if ($sourceUrl === null) {
        return null;
    }
    $sourceUrl = trim($sourceUrl);
    if ($sourceUrl === '') {
        return null;
    }

    $isLovdata = preg_match('~^https?://lovdata\.no/~i', $sourceUrl) === 1;
    if ($isLovdata
        && $sectionTitle
        && preg_match('/§\s?(\d+(?:-\d+)?[A-Za-z]?)/u', $sectionTitle, $m) === 1) {
        return rtrim($sourceUrl, '/') . '/§' . $m[1];
    }

    return $sourceUrl;
}
/**
 * Collapse duplicate chunks, rank the survivors and cap the pool size.
 *
 * Two chunks are duplicates when they share the same `source_origin`
 * (default `corpus`) and the same `chunk_id`. On a duplicate the first
 * occurrence is kept, its `matched_sub_questions` becomes the de-duplicated
 * union of both lists, and `similarity` / `reranker_score` are raised to the
 * higher of the two values.
 *
 * Chunks without a `chunk_id` can never be duplicates of anything; each gets a
 * per-call counter key so they are always kept as distinct entries (no random
 * keys, hence no collision risk and fully deterministic output).
 *
 * The result is sorted descending by `reranker_score`, falling back to
 * `similarity`, then to 0.
 *
 * @param array $rawPool Retrieved chunks, possibly with repeats across sub-questions.
 * @param int   $cap     Maximum number of chunks to return.
 * @return array List of merged chunks, best score first, at most $cap long.
 */
private function mergeAndDedupe(array $rawPool, int $cap): array
{
    $byKey = [];
    $keylessCount = 0;
    foreach ($rawPool as $chunk) {
        $chunkId = $chunk['chunk_id'] ?? null;
        // The NUL prefix keeps counter keys out of the "origin:id" key space.
        $key = $chunkId !== null
            ? ($chunk['source_origin'] ?? 'corpus') . ':' . $chunkId
            : "\0keyless:" . $keylessCount++;

        if (!isset($byKey[$key])) {
            $byKey[$key] = $chunk;
            continue;
        }

        $existing = $byKey[$key];
        $existing['matched_sub_questions'] = array_values(array_unique(array_merge(
            $existing['matched_sub_questions'] ?? [],
            $chunk['matched_sub_questions'] ?? []
        )));
        // Keep the higher score of each kind.
        if (($chunk['similarity'] ?? 0) > ($existing['similarity'] ?? 0)) {
            $existing['similarity'] = $chunk['similarity'];
        }
        if (($chunk['reranker_score'] ?? 0) > ($existing['reranker_score'] ?? 0)) {
            $existing['reranker_score'] = $chunk['reranker_score'];
        }
        $byKey[$key] = $existing;
    }

    $merged = array_values($byKey);
    usort($merged, static function (array $a, array $b): int {
        $aScore = $a['reranker_score'] ?? $a['similarity'] ?? 0;
        $bScore = $b['reranker_score'] ?? $b['similarity'] ?? 0;
        return $bScore <=> $aScore;
    });

    return array_slice($merged, 0, $cap);
}
/**
 * Stamp each chunk with its citation number `n`.
 *
 * The number is the chunk's key in the input array plus one, so a plain list
 * yields 1, 2, 3, … The result is always re-indexed as a list.
 *
 * @param array $chunks Chunks in final ranking order.
 * @return array The same chunks, each carrying an `n` entry.
 */
private function numberSources(array $chunks): array
{
    return array_map(
        static function ($source, $position) {
            $source['n'] = $position + 1;
            return $source;
        },
        $chunks,
        array_keys($chunks)
    );
}
/**
 * Build the synthesis prompt from the numbered sources and run it on the selected engine.
 *
 * When no sources are supplied, no LLM call is made: a localized "not enough source
 * support" brief is returned together with a label for the requested engine.
 *
 * Otherwise the method assembles (a) an optional prior-research section, (b) a text
 * rendering of every numbered source, (c) the list of sub-questions and (d) length
 * guidance based on the number of sources, then sends either the advocate prompt
 * (when $advocateRole is non-empty) or the neutral research prompt. If the response
 * does not decode to an object with a non-empty `brief_markdown`, the raw text is
 * returned as the brief with fixed placeholder values for the other fields.
 *
 * NOTE(review): $keySignals is only interpolated into the advocate prompt; the
 * neutral prompt does not reference it. Confirm whether that is intentional.
 * NOTE(review): $branchNotes is only used when $priorContext is non-empty.
 *
 * @param string     $seedDescription User input as rendered for the prompt.
 * @param string     $brief           Research brief inserted into the prompt.
 * @param array      $subQuestions    Items with `id` and `question` entries.
 * @param array      $numberedSources Sources carrying `n`, `title`, `excerpt`, etc.
 * @param string     $engine          Engine key selecting the LLM branch.
 * @param string     $language        UI language code.
 * @param float      $temperature     Sampling temperature (capped at 0.20 in advocate mode).
 * @param string     $advocateRole    Client role; empty string selects the neutral prompt.
 * @param array|null $priorContext    Optional `original_query` / `brief_summary` of earlier research.
 * @param string     $branchNotes     Optional researcher notes.
 * @param array      $keySignals      Retrieval terms listed in the advocate prompt.
 * @return array `json` (decoded brief), `deploy_label` (string) and `thinking_trace` (string|null).
 */
private function synthesise(
    string $seedDescription,
    string $brief,
    array $subQuestions,
    array $numberedSources,
    string $engine,
    string $language,
    float $temperature,
    string $advocateRole = '',
    ?array $priorContext = null,
    string $branchNotes = '',
    array $keySignals = []
): array {
    $locale = dbnToolsLanguageName($language);

    // Nothing retrieved: answer without calling any model.
    if (empty($numberedSources)) {
        return [
            'json' => [
                'brief_markdown' => match (dbnToolsNormalizeUiLanguage($language)) {
                    'no' => 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.',
                    'uk' => 'Я не знайшов достатньої підтримки джерел у корпусі, щоб дати обґрунтовану відповідь.',
                    'pl' => 'Nie znalazłem wystarczającego wsparcia źródłowego w korpusie, aby udzielić ugruntowanej odpowiedzi.',
                    default => 'I did not find enough source support in the corpus to give a grounded answer.',
                },
                'what_we_found' => 'No retrieved sources passed the similarity threshold.',
                'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
                'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
            ],
            'deploy_label' => match ($engine) {
                'gpu' => 'GPU (cuttlefish)',
                'dbn_legal' => 'dbn-legal-agent-v2',
                'dbn_legal_v3' => 'dbn-legal-agent-v3',
                'azure_full' => 'gpt-4o',
                'claude_sonnet' => 'Claude 3.5 Sonnet',
                default => $this->azure->chatDeployment(),
            },
            'thinking_trace' => null,
        ];
    }

    // Optional background block, each part truncated to keep the prompt bounded.
    $priorContextSection = '';
    if (!empty($priorContext)) {
        $prior = [];
        if (!empty($priorContext['original_query'])) {
            $prior[] = 'Original research question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
        }
        if (!empty($priorContext['brief_summary'])) {
            $prior[] = "Key findings from prior research:\n" . mb_substr((string)$priorContext['brief_summary'], 0, 600, 'UTF-8');
        }
        if ($branchNotes !== '') {
            $prior[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8');
        }
        if ($prior) {
            $priorContextSection = "\nBackground from prior research:\n" . implode("\n", $prior) . "\n";
        }
    }

    // Render every source as a numbered entry. Optional fields are read with
    // defaults so a sparsely populated chunk cannot raise undefined-key warnings.
    $sourcesContext = [];
    foreach ($numberedSources as $s) {
        $sourcesContext[] = sprintf(
            "[%d] (%s) %s%s\n Corpus: %s\n Authority: %s | Jurisdiction: %s\n Excerpt: %s",
            $s['n'],
            ($s['source_origin'] ?? 'corpus') === 'upload' ? 'uploaded doc' : 'corpus',
            $s['title'] ?? '',
            !empty($s['section']) ? ' — ' . $s['section'] : '',
            $s['package_or_corpus'] ?? 'n/a',
            $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
            $s['jurisdiction'] ?? 'n/a',
            $s['excerpt'] ?? ''
        );
    }
    $sourcesText = implode("\n\n", $sourcesContext);

    $subQText = '';
    if ($subQuestions) {
        $lines = array_map(
            static fn(array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'] ?? '', $sq['question'] ?? ''),
            $subQuestions,
            array_keys($subQuestions)
        );
        $subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
    }

    $sourceCount = count($numberedSources);
    $lengthGuidance = $sourceCount >= 3
        ? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
        : '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';
    $keySignalsLine = !empty($keySignals)
        ? "\nKey retrieval signals (statutory/factual terms that drove corpus search — ground your brief in these where sources permit):\n" . implode(', ', $keySignals) . "\n"
        : '';

    if ($advocateRole !== '') {
        $prompt = <<<PROMPT
You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
Your client: {$advocateRole}
{$priorContextSection}
User input:
{$seedDescription}
Research brief:
{$brief}
{$keySignalsLine}
{$subQText}
Sources ({$sourceCount} numbered):
{$sourcesText}
Rules — read ALL of these before writing a single word of output:
- Every factual claim must end with one or more `[n]` markers. A citation is valid ONLY when that source's excerpt explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
- Do NOT invent statute sections, case names, paragraph numbers, dates, or parties. Copy statute references (e.g. §43, §4-12) and ECHR citations verbatim from the excerpt text — never infer a section number that does not appear in an excerpt.
- If no source supports a point, omit the point entirely — do NOT speculate.
- Legal hierarchy: when multiple sources support a claim, prefer the highest-authority source — statute (Barneloven/Barnevernsloven/etc.) > Høyesterett decision > ECHR Grand Chamber > ECHR regular chamber > lower courts > Bufdir guidance.
- Citation self-check: before writing each [n] marker, confirm that source [n] exists in the list and its excerpt actually supports the specific claim being made.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- `opposing_weaknesses`: OMIT this field by default. Populate it only when ≥2 retrieved sources explicitly support the identified weakness. Do not speculate or infer weaknesses from thin evidence.
- `brief_markdown` must be {$lengthGuidance} Structure it as: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Procedural rights and obligations {$advocateRole} should assert, (4) Opposing weaknesses — only if `opposing_weaknesses` is non-empty. End with a one-line caveat that this is legal preparation support, not final legal advice.
- `client_strengths`: 3-6 items, each must include at least one [n] citation.
- `what_remains_uncertain`: 3-5 honest gaps where evidence is insufficient or law is unclear.
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
Return JSON:
{
"brief_markdown": "<advocate brief>",
"client_strengths": ["<strength with [n]>"],
"opposing_weaknesses": ["<weakness with [n]>"],
"what_we_found": "<2-sentence summary of the most relevant retrieved authority for {$advocateRole}>",
"what_remains_uncertain": ["<gap>"],
"next_practical_step": "<one concrete action for {$advocateRole} to take next>"
}
PROMPT;
    } else {
        $prompt = <<<PROMPT
You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
{$priorContextSection}
User input:
{$seedDescription}
Research brief:
{$brief}
{$subQText}
Sources ({$sourceCount} numbered):
{$sourcesText}
Return JSON only in {$locale}:
{
"brief_markdown": "Markdown legal brief. {$lengthGuidance} Every factual claim ends with one or more inline [n] markers keyed to the sources above. Use level-3 headings (###) sparingly to separate paragraphs by theme when helpful. End with a one-line caveat that this is research support, not legal advice.",
"what_we_found": "2-4 sentence plain-language summary of the grounded finding",
"what_remains_uncertain": ["specific gaps — what the corpus did not cover, conflicting authority, or where confidence is limited (3-6 items when sources >= 3)"],
"next_practical_step": "one concrete next action the user can take to strengthen the case or close a gap"
}
Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- A `[n]` citation is only valid when the excerpt for source [n] explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
- If no source supports a point, omit the point — DO NOT speculate.
- Copy statute section numbers (e.g. §43, §4-12) and ECHR case citations verbatim from the excerpt text — never rephrase or infer a section number that does not appear in an excerpt.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
PROMPT;
    }

    $messages = [
        ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences. Every legal claim must be supported by a source from the numbered list. Do not invent statute sections, case names, paragraph numbers, or dates. If no source supports a point, omit it entirely.'],
        ['role' => 'user', 'content' => $prompt],
    ];

    // Advocate mode never runs hotter than 0.20.
    $synthTemp = ($advocateRole !== '') ? min($temperature, 0.20) : $temperature;
    // Advocate briefs require ~4-6K tokens (brief + strengths + weaknesses + uncertainty).
    // Non-advocate deep-research responses are shorter (~2-3K). Use separate limits.
    $synthMaxTokens = ($advocateRole !== '') ? 6000 : 4000;
    $opts = ['json' => true, 'temperature' => $synthTemp, 'max_tokens' => $synthMaxTokens, 'timeout' => 180];

    // Only the extended-thinking branch below ever sets a trace.
    $thinkingTrace = null;
    try {
        if ($engine === 'dbn_legal_v3') {
            $response = dbnToolsCallGpuLlm($messages, array_merge($opts, ['model' => 'dbn-legal-agent-v3', 'timeout' => 180]));
            $deployLabel = 'dbn-legal-agent-v3';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'dbn_legal') {
            $response = dbnToolsCallGpuLlm($messages, array_merge($opts, ['model' => 'dbn-legal-agent-v2', 'timeout' => 180]));
            $deployLabel = 'dbn-legal-agent-v2';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'gpu') {
            $response = dbnToolsCallGpuLlm($messages, $opts);
            $deployLabel = 'GPU (cuttlefish)';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'azure_full') {
            $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
            $deployLabel = 'gpt-4o';
        } elseif ($engine === 'azure_mini' && $this->azure instanceof DbnBedrockGateway) {
            // When Bedrock enabled, azure_mini → Haiku (fast, ~20-50s synthesis)
            $haiku = $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU);
            $raw = $haiku->chatText($messages, array_merge($opts, ['timeout' => 90]));
            $deployLabel = 'Claude Haiku 4.5 (AWS Bedrock)';
        } elseif ($engine === 'claude_sonnet' || ($this->azure instanceof DbnBedrockGateway)) {
            if (
                $this->azure instanceof DbnBedrockGateway
                && dbnToolsEnv('DBN_BEDROCK_THINKING_ENABLED', 'false') === 'true'
                && DbnBedrockModelRouter::supportsThinking($this->azure->chatDeployment())
            ) {
                // Extended thinking — Pro showcase. This call uses its own option set;
                // $opts (json flag, temperature, max_tokens) is not forwarded here.
                $thinkResult = $this->azure->chatWithThinking($messages, [
                    'max_tokens' => 16000,
                    'thinking_budget' => (int)dbnToolsEnv('DBN_BEDROCK_THINKING_BUDGET', '8000'),
                    'timeout' => 300,
                ]);
                $raw = $thinkResult['text'];
                $thinkingTrace = $thinkResult['thinking'] ?? null;
                $deployLabel = 'Claude 3.5 Sonnet (extended thinking)';
            } else {
                $raw = $this->azure->chatText($messages, $opts);
                $deployLabel = $this->azure instanceof DbnBedrockGateway
                    ? 'Claude 3.5 Sonnet'
                    : $this->azure->chatDeployment();
            }
        } else {
            $raw = $this->azure->chatText($messages, $opts);
            $deployLabel = $this->azure->chatDeployment();
        }
    } catch (Throwable $e) {
        dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $json = $this->azure->decodeJsonObject($raw);
    if (!is_array($json) || empty($json['brief_markdown'])) {
        // Salvage as plain markdown
        $json = [
            'brief_markdown' => $raw,
            'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
            'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
            'next_practical_step' => 'Review the brief manually before relying on it.',
        ];
    }

    return [
        'json' => $json,
        'deploy_label' => $deployLabel,
        'thinking_trace' => $thinkingTrace,
    ];
}
/**
 * Grade how well the final source list supports the brief.
 *
 * Each source contributes its `reranker_score`, or its `similarity` when no
 * reranker score is present; non-numeric values are ignored. The best of those
 * scores (0 when none is numeric) is combined with the number of sources:
 *  - `high`   — at least 6 sources and best score >= 0.5
 *  - `medium` — at least 3 sources and best score >= 0.35
 *  - `low`    — everything else, including an empty list
 *
 * @param array $sources Numbered sources used for synthesis.
 * @return string One of `high`, `medium`, `low`.
 */
private function citationConfidence(array $sources): string
{
    $total = count($sources);
    if ($total === 0) {
        return 'low';
    }

    $numericScores = array_filter(
        array_map(
            static fn(array $source) => $source['reranker_score'] ?? $source['similarity'] ?? null,
            $sources
        ),
        'is_numeric'
    );
    $top = $numericScores === [] ? 0 : max($numericScores);

    return match (true) {
        $total >= 6 && $top >= 0.5 => 'high',
        $total >= 3 && $top >= 0.35 => 'medium',
        default => 'low',
    };
}
/**
 * Preview the interpretation and sub-questions for a seed without running retrieval.
 *
 * Inputs are normalized first (trimmed seed/pasted text, engine restricted to the
 * known list with `azure_mini` as fallback, UI language, controls). The request is
 * aborted with `missing_seed` (422) when both the question and the pasted text are
 * empty. After the client check and bootstrap, the seed is interpreted and expanded
 * into `sub_q_count` sub-questions.
 *
 * @param string     $seedQuery    Research question typed by the user.
 * @param string     $pastedText   Free text pasted alongside the question.
 * @param string     $engine       Requested engine key.
 * @param string     $language     UI language code.
 * @param array      $controls     Raw control values; normalized internally.
 * @param string     $advocateRole Optional client role.
 * @param array|null $priorContext Optional context from earlier research.
 * @param string     $branchNotes  Optional researcher notes.
 * @return array `ok`, `interpretation`, `sub_questions` and `fallback`.
 */
public function generateSubQPreview(
    string $seedQuery,
    string $pastedText,
    string $engine,
    string $language,
    array $controls,
    string $advocateRole = '',
    ?array $priorContext = null,
    string $branchNotes = ''
): array {
    $seedQuery = trim($seedQuery);
    $pastedText = trim($pastedText);

    $knownEngines = ['azure_mini', 'azure_full', 'gpu', 'dbn_legal', 'dbn_legal_v3', 'claude_sonnet', 'claude_haiku'];
    if (!in_array($engine, $knownEngines, true)) {
        $engine = 'azure_mini';
    }
    $language = dbnToolsNormalizeUiLanguage($language);
    $controls = $this->normalizeControls($controls);

    $hasSeed = $seedQuery !== '' || $pastedText !== '';
    if (!$hasSeed) {
        dbnToolsAbort('Provide a question or pasted text.', 422, 'missing_seed');
    }

    // Client check and bootstrap run before the dbn_v6 helpers are loaded.
    dbnToolsRequireClient();
    dbnToolsBootCaveau();
    require_once dbnToolsAiPortalRoot() . '/platform/includes/dbn_v6.php';

    // The preview never includes uploaded files, hence the empty list.
    $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, []);
    $reading = $this->interpretSeed($seedDescription, $language, $advocateRole, $priorContext, $branchNotes);
    $plan = $this->expandQueries(
        $seedDescription,
        $reading['brief'],
        $reading['key_signals'],
        $controls['sub_q_count'],
        $language,
        $advocateRole
    );

    return [
        'ok' => true,
        'interpretation' => $reading,
        'sub_questions' => $plan['questions'],
        'fallback' => $plan['fallback'] ?? false,
    ];
}
/**
 * Build one trace entry for the progress/step log.
 *
 * @param string $label  Short name of the step.
 * @param string $detail Human-readable description of what happened.
 * @param string $status Step status; defaults to `complete`.
 * @return array Map with `label`, `detail` and `status`, in that order.
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    return compact('label', 'detail', 'status');
}
/**
 * Milliseconds elapsed since a `microtime(true)` timestamp.
 *
 * @param float $start Timestamp in seconds, as returned by `microtime(true)`.
 * @return int Elapsed time rounded to the nearest millisecond.
 */
private function elapsedMs(float $start): int
{
    $elapsedSeconds = microtime(true) - $start;
    $elapsedMillis = round($elapsedSeconds * 1000);

    return (int)$elapsedMillis;
}
}