Files
dobetternorge-tools/includes/DeepResearchAgent.php
T
daveadmin 640778454f Add Case Advocate tab — partisan brief grounded in Norwegian law
New /advocate.php tab: user selects who they represent (biological
father, mother, foster carer, CWS, etc.) and the agent takes their
side entirely. Adversarial sub-questions target supporting Lovdata
statutes + ECHR precedents; synthesis returns client_strengths[] and
opposing_weaknesses[] alongside the advocate brief.

- DeepResearchAgent: add advocateRole param to run(), interpretSeed(),
  expandQueries(), synthesise(). Neutral path unchanged (empty string).
- api/deep-research.php: extract + validate advocate_role from payload;
  telemetry logs tool='advocate' vs 'deep_research'.
- advocate.php: new page with role dropdown (presets + custom), same
  corpus slices/engine/controls/upload zone as deep research.
- assets/js/advocate.js: page-scoped JS; renders advocate banner,
  client strengths card (teal), advocate brief, opposing weaknesses
  card (amber), sub-Q cards, sources, uncertainty, next step.
- assets/css/tools.css: append .adv-* rules (~120 lines).
- includes/layout.php: add Advocate nav tab between Deep research and
  Summarize.
- index.php: add Advocate cap-card tile.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 12:26:05 +02:00

1024 lines
45 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';
final class DbnDeepResearchAgent
{
// Character cap applied separately to the question and to the pasted text
// when buildSeedDescription() assembles the seed.
private const MAX_SEED_CHARS = 16000;
// Character cap applied to each uploaded file's text, both before chunking
// in run() and when the file is quoted in the seed description.
private const MAX_UPLOAD_CHARS = 64000;
// Window size, in whitespace-separated words, of one upload chunk.
private const CHUNK_WORDS = 600;
// Words shared by consecutive windows; the window start advances by
// CHUNK_WORDS - CHUNK_OVERLAP_WORDS each step.
private const CHUNK_OVERLAP_WORDS = 75;
// A window after the first one is only kept when it holds at least this many words.
private const MIN_CHUNK_WORDS = 50;
// Maximum number of unique chunks mergeAndDedupe() returns to run().
private const POOL_CAP = 30;
// Gateway used for the chat calls (chatText / decodeJsonObject / chatDeployment).
private DbnAzureOpenAiGateway $azure;
// In-memory upload index, reset at the start of every run(): a list of
// ['meta' => chunk array from splitIntoChunks(), 'vec' => embedding vector].
private array $uploadVecs = [];
// Step id => value returned by elapsedMs(); surfaced as 'elapsed_ms_per_step'.
// NOTE(review): presumably milliseconds — elapsedMs() is not visible here, confirm.
private array $stepTimings = [];
/**
 * @param DbnAzureOpenAiGateway|null $azure Gateway to use; a fresh one is created when null.
 */
public function __construct(?DbnAzureOpenAiGateway $azure = null)
{
    if ($azure === null) {
        $azure = new DbnAzureOpenAiGateway();
    }
    $this->azure = $azure;
}
/**
 * Execute one research pass end to end: interpret the seed, expand it into
 * sub-questions, resolve corpus slices, index uploads in memory, retrieve per
 * sub-question, synthesise a cited brief and grade citation confidence.
 *
 * @param string        $seedQuery      The user's question (may be empty when text or files are supplied).
 * @param string        $pastedText     Free text pasted by the user.
 * @param array         $uploadedFiles  Entries are read as ['filename' => ..., 'text' => ...].
 * @param array         $sliceSelection Raw slice toggles; normalised by dbnV6NormalizeSliceSelection().
 * @param string        $engine         'azure_mini', 'azure_full' or 'gpu'; anything else becomes 'azure_mini'.
 * @param string        $language       'en' or 'no'; anything else becomes 'en'.
 * @param array         $controls       Tuning knobs; clamped by normalizeControls().
 * @param callable|null $emit           Optional progress sink, called as $emit(string $event, array $payload)
 *                                      with the events 'step', 'progress' and 'subq'.
 * @param string        $advocateRole   Party to argue for; '' selects the neutral deep-research path.
 *
 * @return array Result payload: brief, sub-questions, numbered sources, trace and trace metadata.
 *               'tool' is 'advocate' when a role is given, otherwise 'deep_research'.
 */
public function run(
    string $seedQuery,
    string $pastedText,
    array $uploadedFiles,
    array $sliceSelection,
    string $engine,
    string $language,
    array $controls,
    ?callable $emit = null,
    string $advocateRole = ''
): array {
    $seedQuery = trim($seedQuery);
    $pastedText = trim($pastedText);
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
    $language = in_array($language, ['en', 'no'], true) ? $language : 'en';
    $controls = $this->normalizeControls($controls);

    // NOTE(review): the code after each dbnToolsAbort() call assumes it never returns — confirm.
    if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) {
        dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed');
    }

    $client = dbnToolsRequireClient();
    $package = $this->requireFamilyPackage((int)$client['id']);
    dbnToolsBootCaveau();
    $aiPortalRoot = dbnToolsAiPortalRoot();
    require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

    // Per-run state: the upload index and timings never carry over between runs.
    $this->uploadVecs = [];
    $this->stepTimings = [];
    $trace = [];
    $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles);

    // Records a finished step in the trace and forwards it to the progress sink.
    $emitStep = function (string $stepId, string $label, string $detail, string $status) use (&$trace, $emit): void {
        $trace[] = $this->trace($label, $detail, $status);
        if ($emit) {
            $emit('step', [
                'step'   => $stepId,
                'label'  => $label,
                'detail' => $detail,
                'status' => $status,
            ]);
        }
    };
    // Announces a step as started; deliberately not written to the trace.
    $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
        if ($emit) {
            $emit('step', [
                'step'   => $stepId,
                'label'  => $label,
                'detail' => $detail,
                'status' => 'running',
            ]);
        }
    };

    // STEP 1: Query interpretation
    $emitRunning('interpretation', 'Query interpretation', 'Summarising the seed input…');
    $stepStart = microtime(true);
    $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole);
    $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
    $emitStep('interpretation', 'Query interpretation', $interpretation['detail'], 'complete');

    // STEP 2: Query expansion
    $emitRunning('expansion', 'Query expansion', 'Generating sub-questions…');
    $stepStart = microtime(true);
    $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language, $advocateRole);
    $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
    $subQuestions = $expansion['questions'];
    $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
    $expansionDetail = $expansion['fallback']
        ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.'
        : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions));
    $emitStep('expansion', 'Query expansion', $expansionDetail, $expansionStatus);

    // STEP 3: Slice resolution
    $emitRunning('slice_resolution', 'Slice resolution', 'Resolving slice toggles to document IDs…');
    $stepStart = microtime(true);
    $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
    if (!array_filter($sliceSelectionNormalized)) {
        dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices');
    }
    $ragDb = dbnToolsRagDb();
    try {
        $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
        $sliceStatus = 'complete';
        $sliceDetail = sprintf(
            '%d slice(s) active → %d candidate documents constrain the corpus search.',
            count(array_filter($sliceSelectionNormalized)),
            count($sharedDocIds)
        );
    } catch (Throwable $e) {
        // Best effort: an empty id list lets the search proceed without the slice constraint.
        error_log('DBN deep research slice resolve failed: ' . $e->getMessage());
        $sharedDocIds = [];
        $sliceStatus = 'warning';
        $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
    }
    $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
    $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

    // STEP 4: Upload indexing (in-memory, ephemeral)
    $emitRunning('upload_indexing', 'Upload indexing', empty($uploadedFiles)
        ? 'No uploads; skipping…'
        : sprintf('Chunking + embedding %d file(s) in memory…', count($uploadedFiles)));
    $stepStart = microtime(true);
    $uploadChunks = [];
    foreach ($uploadedFiles as $idx => $file) {
        $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
        // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
        $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
        $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
    }
    $uploadStatus = 'complete';
    $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
    if ($uploadChunks) {
        try {
            // Embed in small batches of 5, emitting progress between each so the stream
            // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
            $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
            $textCount = count($texts);
            $allVecs = [];
            $batchSize = 5;
            for ($b = 0; $b < $textCount; $b += $batchSize) {
                $batch = array_slice($texts, $b, $batchSize);
                if ($emit) {
                    // The en dash separates the first and last chunk number of the batch;
                    // without it "1" and "5" would be printed as "15".
                    $emit('progress', ['detail' => sprintf(
                        'Embedding chunks %d–%d of %d…',
                        $b + 1, $b + count($batch), $textCount
                    )]);
                }
                $batchVecs = dbnToolsLiteLLMEmbedBatch($batch);
                $allVecs = array_merge($allVecs, $batchVecs);
            }
            // Only trust the index when every chunk received exactly one vector.
            if (count($allVecs) === count($uploadChunks)) {
                foreach ($uploadChunks as $i => $chunk) {
                    $this->uploadVecs[] = [
                        'meta' => $chunk,
                        'vec'  => $allVecs[$i],
                    ];
                }
            } else {
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.';
            }
        } catch (Throwable $e) {
            error_log('DBN deep research upload embed failed: ' . $e->getMessage());
            $uploadStatus = 'warning';
            $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
            $this->uploadVecs = [];
        }
    } elseif (empty($uploadedFiles)) {
        $uploadDetail = 'No files uploaded; agent will research the corpus only.';
    }
    $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
    $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

    // STEP 5: Retrieval (per sub-question)
    // When expansion produced nothing, retrieve once on the seed query (or the brief).
    $retrievalQueries = $subQuestions ?: [[
        'id'        => 'q1',
        'question'  => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'),
        'rationale' => 'Seed query (no sub-question expansion).',
    ]];
    $emitRunning('retrieval', 'Retrieval', sprintf('Hybrid vector + keyword + rerank across %d sub-question(s)…', count($retrievalQueries)));
    $stepStart = microtime(true);
    try {
        $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
    } catch (Throwable $e) {
        // Log the cause like every other failure path before aborting the request.
        error_log('DBN deep research RAG init failed: ' . $e->getMessage());
        dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
    }
    $rawPool = [];
    $retrievalWarnings = 0;
    $rawCorpusCount = 0;
    $rawUploadCount = 0;
    $filteredOutCount = 0;
    foreach ($retrievalQueries as $idx => $sq) {
        if ($emit) {
            $emit('subq', [
                'index'    => $idx + 1,
                'total'    => count($retrievalQueries),
                'id'       => $sq['id'],
                'question' => $sq['question'],
            ]);
        }
        try {
            $corpusChunks = $rag->searchAll(
                $sq['question'],
                $controls['chunk_limit'],
                null,
                [
                    'search_private'          => false,
                    'search_shared'           => true,
                    'package_ids'             => [(int)$package['id']],
                    'shared_doc_ids'          => $sharedDocIds,
                    'chunk_limit'             => $controls['chunk_limit'],
                    'search_method'           => 'hybrid',
                    'reranker_enabled'        => true,
                    'include_beta_website'    => false,
                    'include_primary_website' => false,
                ]
            );
        } catch (Throwable $e) {
            // One failing sub-question downgrades the step to a warning instead of aborting.
            error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage());
            $corpusChunks = [];
            $retrievalWarnings++;
        }
        $rawCorpusCount += count($corpusChunks);
        foreach ($corpusChunks as $chunk) {
            if ($this->isWebsiteChunk($chunk)) {
                $filteredOutCount++;
                continue;
            }
            $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
        }
        // Upload chunk retrieval via cosine sim
        if (!empty($this->uploadVecs)) {
            $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
            $rawUploadCount += count($uploadHits);
            foreach ($uploadHits as $hit) {
                $hit['matched_sub_questions'] = [$sq['id']];
                $rawPool[] = $hit;
            }
        }
    }
    $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
    $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
    $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
    $retrievalDetail = sprintf(
        '%d sub-question(s) × hybrid + RRF + rerank → %d corpus chunks (%d filtered) + %d upload hits → %d unique after dedupe.',
        count($retrievalQueries),
        $rawCorpusCount,
        $filteredOutCount,
        $rawUploadCount,
        count($merged)
    );
    $emitStep('retrieval', 'Retrieval', $retrievalDetail, $retrievalStatus);

    // Cap pool to reranker top-K for synthesis
    $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
    // Hydrate corpus sources with source_url + authority_label via batched dbn_v6 query
    $this->hydrateSourceUrls($synthesisPool);
    $numberedSources = $this->numberSources($synthesisPool);
    $retrievalCounts = [
        'raw_corpus'         => $rawCorpusCount,
        'filtered_website'   => $filteredOutCount,
        'post_filter_corpus' => $rawCorpusCount - $filteredOutCount,
        'raw_upload'         => $rawUploadCount,
        'after_dedupe'       => count($merged),
        'after_topk'         => count($numberedSources),
    ];

    // STEP 6: Synthesis
    $synthesisEngineLabel = $engine === 'azure_full' ? 'Azure gpt-4o' : ($engine === 'gpu' ? 'GPU qwen2.5:14b' : 'Azure gpt-4o-mini');
    $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising cited brief with %s — this is the slowest step…', $synthesisEngineLabel));
    $stepStart = microtime(true);
    $synthesis = $this->synthesise(
        $seedDescription,
        $interpretation['brief'],
        $retrievalQueries,
        $numberedSources,
        $engine,
        $language,
        $controls['temperature'],
        $advocateRole
    );
    $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
    $emitStep(
        'synthesis',
        'Synthesis',
        sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)),
        'complete'
    );

    // STEP 7: Confidence
    $confidence = $this->citationConfidence($numberedSources);
    $emitStep(
        'confidence',
        'Citation confidence',
        sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)),
        $confidence === 'low' ? 'warning' : 'complete'
    );

    // Stitch sub-question chunk_ids + top_sources (top 3 sources matched by each sub-Q)
    $subQOut = [];
    foreach ($retrievalQueries as $sq) {
        $matchedChunks = array_values(array_filter(
            $numberedSources,
            fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
        ));
        $topSources = array_slice($matchedChunks, 0, 3);
        $subQOut[] = [
            'id'          => $sq['id'],
            'question'    => $sq['question'],
            'rationale'   => $sq['rationale'] ?? '',
            'chunk_ids'   => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
            'top_sources' => array_map(fn(array $s) => [
                'n'               => $s['n'] ?? null,
                'title'           => $s['title'] ?? '',
                'section'         => $s['section'] ?? null,
                'deep_link'       => $s['deep_link'] ?? $s['source_url'] ?? null,
                'source_url'      => $s['source_url'] ?? null,
                'source_origin'   => $s['source_origin'] ?? 'corpus',
                'authority_label' => $s['authority_label'] ?? null,
                'excerpt'         => $s['excerpt'] ?? '',
            ], $topSources),
        ];
    }

    // Advocate-only fields are null on the neutral path so the payload shape stays stable.
    $isAdvocate = $advocateRole !== '';
    return [
        'tool'                   => $isAdvocate ? 'advocate' : 'deep_research',
        'language'               => $language,
        'advocate_role'          => $isAdvocate ? $advocateRole : null,
        'brief_markdown'         => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''),
        'client_strengths'       => $isAdvocate ? ($synthesis['json']['client_strengths'] ?? []) : null,
        'opposing_weaknesses'    => $isAdvocate ? ($synthesis['json']['opposing_weaknesses'] ?? []) : null,
        'sub_questions'          => $subQOut,
        'sources'                => $numberedSources,
        'what_we_found'          => (string)($synthesis['json']['what_we_found'] ?? ''),
        'evidence_trail'         => $numberedSources,
        'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
        'next_practical_step'    => (string)($synthesis['json']['next_practical_step'] ?? ''),
        'trace'                  => $trace,
        'trace_metadata'         => [
            'chunk_count'         => count($merged),
            'source_count'        => count($numberedSources),
            'sub_question_count'  => count($retrievalQueries),
            'upload_chunk_count'  => count($this->uploadVecs),
            'deployment'          => $synthesis['deploy_label'],
            'engine_used'         => $engine,
            'citation_confidence' => $confidence,
            'elapsed_ms_per_step' => $this->stepTimings,
            'retrieval_counts'    => $retrievalCounts,
            'slices_active'       => array_keys(array_filter($sliceSelectionNormalized)),
        ],
        'disclaimer'             => dbnToolsDisclaimer($language),
    ];
}
/**
 * Clamp the caller-supplied tuning knobs into their supported ranges,
 * substituting the default for any knob that is missing.
 *
 * @param array $controls Raw control values keyed by knob name.
 * @return array{sub_q_count:int, chunk_limit:int, similarity_threshold:float, reranker_top_k:int, temperature:float}
 */
private function normalizeControls(array $controls): array
{
    // Integer knob: read (or default), cast, then clamp to [$floor, $ceiling].
    $boundedInt = static fn(string $knob, int $floor, int $ceiling, int $fallback): int
        => max($floor, min($ceiling, (int)($controls[$knob] ?? $fallback)));
    // Float knob: same clamp with a float cast.
    $boundedFloat = static fn(string $knob, float $floor, float $ceiling, float $fallback): float
        => max($floor, min($ceiling, (float)($controls[$knob] ?? $fallback)));

    $normalized = [];
    $normalized['sub_q_count'] = $boundedInt('sub_q_count', 3, 5, 4);
    $normalized['chunk_limit'] = $boundedInt('chunk_limit', 4, 10, 6);
    $normalized['similarity_threshold'] = $boundedFloat('similarity_threshold', 0.2, 0.6, 0.30);
    $normalized['reranker_top_k'] = $boundedInt('reranker_top_k', 8, 14, 12);
    $normalized['temperature'] = $boundedFloat('temperature', 0.05, 0.4, 0.15);

    return $normalized;
}
/**
 * Fetch the 'family-legal' package and verify that it is active and that the
 * given client holds an active subscription to it; otherwise abort with a 503.
 *
 * @param int $clientId Client whose subscription is checked.
 * @return array The package row as returned by dbnToolsFetchPackage().
 */
private function requireFamilyPackage(int $clientId): array
{
    $package = dbnToolsFetchPackage('family-legal');

    $packageUsable = $package && !empty($package['is_active']);
    if (!$packageUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $subscribed = dbnToolsHasActiveSubscription($clientId, (int)$package['id']);
    if (!$subscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $package;
}
/**
 * Assemble the text handed to the interpretation and expansion prompts from
 * the question, the pasted text and every non-empty upload. Each part is
 * truncated to its character cap and the parts are separated by a blank line.
 *
 * @param string $seedQuery     Trimmed user question; skipped when empty.
 * @param string $pastedText    Trimmed pasted text; skipped when empty.
 * @param array  $uploadedFiles Entries read as ['filename' => ..., 'text' => ...].
 * @return string The combined description, or '' when nothing was supplied.
 */
private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string
{
    $clip = static fn(string $value, int $limit): string => mb_substr($value, 0, $limit, 'UTF-8');

    $sections = [];
    if ($seedQuery !== '') {
        $sections[] = "Question:\n" . $clip($seedQuery, self::MAX_SEED_CHARS);
    }
    if ($pastedText !== '') {
        $sections[] = "Pasted text:\n" . $clip($pastedText, self::MAX_SEED_CHARS);
    }

    foreach ($uploadedFiles as $position => $upload) {
        // Files without a name are labelled by their 1-based position.
        $label = (string)($upload['filename'] ?? sprintf('upload-%d', $position + 1));
        $body = (string)($upload['text'] ?? '');
        if ($body !== '') {
            $sections[] = sprintf("Uploaded file [%s]:\n%s", $label, $clip($body, self::MAX_UPLOAD_CHARS));
        }
    }

    return implode("\n\n", $sections);
}
/**
 * Ask the chat model for a short research brief and retrieval signals that
 * describe the seed input. Any failure (exception, undecodable JSON, missing
 * brief) is tolerated: the step is then reported as skipped with an empty brief.
 *
 * @param string $seedDescription Combined seed text from buildSeedDescription().
 * @param string $language        'no' requests Norwegian output; anything else English.
 * @param string $advocateRole    When non-empty, the prompt is framed for that party.
 * @return array{brief:string, detail:string}
 */
private function interpretSeed(string $seedDescription, string $language, string $advocateRole = ''): array
{
    $locale = 'English';
    if ($language === 'no') {
        $locale = 'Norwegian';
    }

    $rolePrefix = '';
    if ($advocateRole !== '') {
        $rolePrefix = "You are preparing a case-research brief for: {$advocateRole}. Frame your interpretation to identify the strongest legal angles for this party.\n\n";
    }

    $prompt = <<<PROMPT
{$rolePrefix}You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.
Input:
{$seedDescription}
In {$locale}, produce JSON with:
{
"brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
"key_signals": ["short keywords or terms that should drive retrieval"]
}
PROMPT;

    // Returned whenever the model call does not yield a usable brief.
    $skipped = [
        'brief'  => '',
        'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
    ];

    try {
        $messages = [
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ];
        $options = ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30];

        $decoded = $this->azure->decodeJsonObject($this->azure->chatText($messages, $options));
        if (!is_array($decoded) || empty($decoded['brief'])) {
            return $skipped;
        }

        $focus = (string)$decoded['brief'];
        $signals = $decoded['key_signals'] ?? [];
        // At most six signals are echoed into the step detail.
        $signalText = is_array($signals) ? implode(', ', array_slice($signals, 0, 6)) : '';
        $signalSuffix = $signalText ? ' — signals: ' . $signalText : '';

        return [
            'brief'  => $focus,
            'detail' => sprintf('Research focus: %s%s', $focus, $signalSuffix),
        ];
    } catch (Throwable $e) {
        error_log('DBN deep research interpretation failed: ' . $e->getMessage());
    }

    return $skipped;
}
/**
 * Ask the chat model to decompose the request into focused sub-questions.
 * With an advocate role the prompt targets material supporting that party;
 * otherwise a neutral decomposition prompt is used.
 *
 * The model's own ids are ignored: accepted questions are renumbered q1, q2, …
 * in order of acceptance, so skipped items never leave gaps. Items whose
 * question is missing, non-scalar or blank after trimming are dropped.
 *
 * @param string $seedDescription Combined seed text from buildSeedDescription().
 * @param string $brief           Brief from interpretSeed() (may be empty).
 * @param int    $targetCount     Number of sub-questions requested; also the upper bound kept.
 * @param string $language        'no' requests Norwegian output; anything else English.
 * @param string $advocateRole    Party to build the case for; '' selects the neutral prompt.
 * @return array{questions:array, fallback:bool} 'fallback' is true (with no questions) when the
 *               call failed or fewer than two usable sub-questions were returned.
 */
private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language, string $advocateRole = ''): array
{
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    if ($advocateRole !== '') {
        $prompt = <<<PROMPT
You are a Norwegian family-law research assistant building a case for: {$advocateRole}.
Generate exactly {$targetCount} targeted sub-questions designed to find:
1. Lovdata statutes and ECHR/Hague precedents that support {$advocateRole}'s position.
2. Procedural rights and obligations the opposing party must satisfy — failures here help {$advocateRole}.
3. Case law that exposes weaknesses in the opposing party's likely arguments.
4. Specific articles, paragraphs, or judgments {$advocateRole}'s representative should cite.
Research brief:
{$brief}
Raw input:
{$seedDescription}
Return JSON only in {$locale}:
{
"sub_questions": [
{"id":"q1","question":"...","rationale":"how finding this strengthens {$advocateRole}'s case (≤ 140 chars)"}
]
}
Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Every question must be answerable from Norwegian family-law, child-welfare, or ECHR/Hague sources.
- Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame).
- Sub-questions must be self-contained — readable without the raw input.
- Write the questions in {$locale}.
PROMPT;
    } else {
        $prompt = <<<PROMPT
You are decomposing a Do Better Norge legal-research request into {$targetCount} focused sub-questions that should each be answered by the legal corpus (Norwegian family law, child welfare, ECHR/Hague).
Research brief:
{$brief}
Raw input:
{$seedDescription}
Return JSON only:
{
"sub_questions": [
{"id":"q1","question":"... ({$locale})","rationale":"why this angle matters (≤ 140 chars)"}
]
}
Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
- Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
- Sub-questions must be self-contained — readable without seeing the seed text.
- Write the questions in {$locale}.
PROMPT;
    }

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 35]);
        $json = $this->azure->decodeJsonObject($raw);
        $items = is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : [];

        $normalized = [];
        foreach ($items as $item) {
            if (!is_array($item)) {
                continue;
            }
            // Trim before the emptiness check so whitespace-only questions are rejected,
            // and refuse non-scalar values rather than casting them to the string "Array".
            $rawQuestion = $item['question'] ?? null;
            $question = is_scalar($rawQuestion) ? trim((string)$rawQuestion) : '';
            if ($question === '') {
                continue;
            }
            $rawRationale = $item['rationale'] ?? '';
            $normalized[] = [
                // Number by acceptance order, not by the raw array key: keys may be
                // non-numeric (a JSON object) and skipped items would otherwise leave gaps.
                'id'        => 'q' . (count($normalized) + 1),
                'question'  => $question,
                'rationale' => is_scalar($rawRationale) ? trim((string)$rawRationale) : '',
            ];
            if (count($normalized) >= $targetCount) {
                break;
            }
        }

        // A single sub-question adds nothing over the seed query; treat it as a failed expansion.
        if (count($normalized) >= 2) {
            return ['questions' => $normalized, 'fallback' => false];
        }
    } catch (Throwable $e) {
        error_log('DBN deep research expansion failed: ' . $e->getMessage());
    }

    return ['questions' => [], 'fallback' => true];
}
/**
 * Cut an uploaded document into overlapping word windows for embedding.
 * Whitespace is collapsed first; windows hold up to CHUNK_WORDS words and
 * start CHUNK_WORDS - CHUNK_OVERLAP_WORDS words apart. The first window is
 * always kept; later ones only when they reach MIN_CHUNK_WORDS words.
 *
 * @param string $text     Raw document text.
 * @param string $filename Name recorded on every chunk.
 * @param int    $fileIdx  Index of the file, used in the chunk id ("upload:<file>:<chunk>").
 * @return array List of chunk arrays (chunk_id, file_index, chunk_index, filename, text).
 */
private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
{
    $flattened = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
    if ($flattened === '') {
        return [];
    }

    $tokens = preg_split('/\s+/u', $flattened, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if ($tokens === []) {
        return [];
    }

    // Never step by less than one word, whatever the constants are set to.
    $stride = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);
    $tokenCount = count($tokens);
    $pieces = [];

    for ($offset = 0; $offset < $tokenCount; $offset += $stride) {
        $window = array_slice($tokens, $offset, self::CHUNK_WORDS);
        $windowSize = count($window);

        if ($offset === 0 || $windowSize >= self::MIN_CHUNK_WORDS) {
            // Chunk indices count kept windows only, so they equal the current list length.
            $ordinal = count($pieces);
            $pieces[] = [
                'chunk_id'    => sprintf('upload:%d:%d', $fileIdx, $ordinal),
                'file_index'  => $fileIdx,
                'chunk_index' => $ordinal,
                'filename'    => $filename,
                'text'        => implode(' ', $window),
            ];
        }

        // A short window means the end of the text has been reached.
        if ($windowSize < self::CHUNK_WORDS) {
            break;
        }
    }

    return $pieces;
}
/**
 * Rank the in-memory upload chunks against one sub-question by cosine
 * similarity of their embeddings and return the best matches.
 *
 * @param string $question     Sub-question text; embedded with dbnToolsLiteLLMEmbedBatch().
 * @param int    $limitPerSubQ Per-sub-question chunk limit; at most half of it (rounded up,
 *                             minimum 1) is returned from uploads.
 * @param float  $threshold    Hits scoring below this similarity are discarded.
 * @return array Source-shaped hit arrays, best first; empty when there is no upload index
 *               or the question could not be embedded.
 */
private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
{
    if (empty($this->uploadVecs)) {
        return [];
    }

    try {
        $queryVec = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? [];
    } catch (Throwable $e) {
        error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage());
        return [];
    }
    if (empty($queryVec)) {
        return [];
    }

    $hits = [];
    foreach ($this->uploadVecs as ['meta' => $meta, 'vec' => $vec]) {
        $score = $this->cosineSim($queryVec, $vec);
        if ($score < $threshold) {
            continue;
        }
        $hits[] = [
            'chunk_id'          => $meta['chunk_id'],
            'title'             => 'uploaded: ' . $meta['filename'],
            'section'           => null,
            'package_or_corpus' => 'Your upload',
            'excerpt'           => dbnToolsExcerpt($meta['text'], 620),
            'chunk_text'        => $meta['text'],
            'similarity'        => round($score, 4),
            'reranker_score'    => null,
            'document_id'       => null,
            'source_origin'     => 'upload',
            'authority_type'    => null,
            'jurisdiction'      => null,
        ];
    }

    // Highest similarity first.
    usort($hits, fn(array $left, array $right) => ($right['similarity'] <=> $left['similarity']));

    $quota = max(1, (int)ceil($limitPerSubQ / 2));

    return array_slice($hits, 0, $quota);
}
/**
 * Cosine similarity of two numeric vectors, compared over the length of the
 * shorter one. Elements are read by position 0..n-1.
 *
 * @param array $a First vector.
 * @param array $b Second vector.
 * @return float The similarity, or 0.0 when either vector is empty or has zero magnitude.
 */
private function cosineSim(array $a, array $b): float
{
    $dims = min(count($a), count($b));
    if ($dims === 0) {
        return 0.0;
    }

    $dotProduct = 0.0;
    $sumSqA = 0.0;
    $sumSqB = 0.0;
    $pos = 0;
    while ($pos < $dims) {
        $left = (float)$a[$pos];
        $right = (float)$b[$pos];
        $dotProduct += $left * $right;
        $sumSqA += $left * $left;
        $sumSqB += $right * $right;
        $pos++;
    }

    // A zero-magnitude vector has no direction; avoid dividing by zero.
    if ($sumSqA === 0.0 || $sumSqB === 0.0) {
        return 0.0;
    }

    return $dotProduct / (sqrt($sumSqA) * sqrt($sumSqB));
}
/**
 * Map a raw corpus search hit onto the source shape used by the rest of the
 * pipeline. URL and authority fields start as null and are filled in later
 * by hydrateSourceUrls().
 *
 * @param array  $chunk  Raw hit as returned by the retrieval pipeline.
 * @param string $subQId Id of the sub-question that produced the hit.
 * @return array Normalised source array with 'source_origin' set to 'corpus'.
 */
private function normalizeCorpusChunk(array $chunk, string $subQId): array
{
    // Scores are rounded to four decimals; a missing score stays null.
    $scoreOrNull = static fn(string $field): ?float
        => isset($chunk[$field]) ? round((float)$chunk[$field], 4) : null;
    $intOrNull = static fn(string $field): ?int
        => isset($chunk[$field]) ? (int)$chunk[$field] : null;

    $body = (string)($chunk['content'] ?? '');

    return [
        'chunk_id'              => $intOrNull('id'),
        'title'                 => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
        'section'               => $chunk['section_title'] ?? null,
        'package_or_corpus'     => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
        'excerpt'               => dbnToolsExcerpt($body, 620),
        'chunk_text'            => $body,
        'similarity'            => $scoreOrNull('similarity'),
        'reranker_score'        => $scoreOrNull('reranker_score'),
        'document_id'           => $intOrNull('document_id'),
        'source_origin'         => 'corpus',
        'authority_type'        => $chunk['authority_type'] ?? null,
        'jurisdiction'          => $chunk['jurisdiction'] ?? null,
        'publication_year'      => $chunk['publication_year'] ?? null,
        // Placeholders populated by hydrateSourceUrls()
        'source_url'            => null,
        'deep_link'             => null,
        'authority_label'       => null,
        'corpus_source_name'    => null,
        'publication_date'      => null,
        'matched_sub_questions' => [$subQId],
    ];
}
/**
 * Defensive post-filter that flags hits which look like marketing-website
 * pages rather than legal sources. The chunk payload only carries
 * `source_name` (not the upstream source_group), so the decision is made
 * from the source name and the document title.
 *
 * @param array $chunk Raw corpus hit.
 * @return bool True when the hit should be dropped; always false when no source name is set.
 */
private function isWebsiteChunk(array $chunk): bool
{
    $sourceName = strtolower((string)($chunk['source_name'] ?? ''));
    $heading = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));

    // Without a source name there is nothing to judge the hit by.
    if ($sourceName === '') {
        return false;
    }

    // Any one signal is enough: a source named "...website...", a title that
    // mentions the marketing domain, or a title starting like a site page.
    return str_contains($sourceName, 'website')
        || str_contains($heading, 'dobetternorge.no')
        || preg_match('/^(homepage|landing|about |contact )/i', $heading) === 1;
}
/**
 * Fill in source_url, deep_link, authority_label, corpus_source_name and
 * publication_date on every corpus entry of the pool, in place. Document
 * metadata comes from one batched query on `documents`; the source name is
 * then looked up in `corpus_sources`. Upload entries are left untouched.
 *
 * Best effort: if either query throws, the error is logged and the pool is
 * returned unchanged.
 *
 * @param array $pool Synthesis pool, modified by reference.
 */
private function hydrateSourceUrls(array &$pool): void
{
    $isCorpusEntry = static fn($entry): bool => ($entry['source_origin'] ?? 'corpus') === 'corpus';

    // Collect the distinct document ids referenced by corpus entries.
    $wantedDocs = [];
    foreach ($pool as $entry) {
        if (!$isCorpusEntry($entry)) {
            continue;
        }
        $candidateId = (int)($entry['document_id'] ?? 0);
        if ($candidateId > 0) {
            $wantedDocs[$candidateId] = true;
        }
    }
    if (empty($wantedDocs)) {
        return;
    }

    $metaByDoc = [];
    try {
        $docIdList = array_keys($wantedDocs);
        $docPlaceholders = implode(',', array_fill(0, count($docIdList), '?'));
        $docQuery = dbnToolsRagDb()->prepare("
SELECT d.id, d.title, d.source_url, d.authority_type,
d.publication_date, d.source_id, d.jurisdiction
FROM documents d
WHERE d.id IN ({$docPlaceholders})
");
        $docQuery->execute($docIdList);

        $seenSourceIds = [];
        foreach ($docQuery as $docRow) {
            $sourceId = isset($docRow['source_id']) ? (int)$docRow['source_id'] : null;
            if ($sourceId) {
                $seenSourceIds[$sourceId] = true;
            }
            $metaByDoc[(int)$docRow['id']] = [
                'source_url'         => $docRow['source_url'] ?? null,
                'authority_label'    => dbnV6AuthorityLabel($docRow['authority_type'] ?? null),
                'publication_date'   => $docRow['publication_date'] ?? null,
                // Default name, replaced below when the source row is found.
                'corpus_source_name' => 'Do Better Legal',
                'source_id'          => $sourceId,
            ];
        }

        // Enrich with corpus source name from corpus_sources
        if (!empty($seenSourceIds)) {
            $sourceIdList = array_keys($seenSourceIds);
            $sourcePlaceholders = implode(',', array_fill(0, count($sourceIdList), '?'));
            $sourceQuery = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sourcePlaceholders})");
            $sourceQuery->execute($sourceIdList);

            $nameBySource = [];
            foreach ($sourceQuery as $sourceRow) {
                $nameBySource[(int)$sourceRow['id']] = dbnV6RepairText((string)($sourceRow['name'] ?? 'Do Better Legal'));
            }

            foreach ($metaByDoc as $docKey => $docInfo) {
                $linkedSource = $docInfo['source_id'];
                if ($linkedSource && isset($nameBySource[$linkedSource])) {
                    $metaByDoc[$docKey]['corpus_source_name'] = $nameBySource[$linkedSource];
                }
            }
        }
    } catch (Throwable $e) {
        error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
        return;
    }

    // Write the metadata back onto the matching pool entries.
    foreach ($pool as $slot => $entry) {
        if (!$isCorpusEntry($entry)) {
            continue;
        }
        $entryDocId = (int)($entry['document_id'] ?? 0);
        if (!$entryDocId || !isset($metaByDoc[$entryDocId])) {
            continue;
        }
        $docInfo = $metaByDoc[$entryDocId];
        $resolvedUrl = $docInfo['source_url'] ?? null;

        $pool[$slot]['source_url'] = $resolvedUrl;
        $pool[$slot]['deep_link'] = $this->buildDeepLink($resolvedUrl, $entry['section'] ?? null);
        $pool[$slot]['authority_label'] = $docInfo['authority_label'] ?? $entry['authority_label'];
        $pool[$slot]['corpus_source_name'] = $docInfo['corpus_source_name'] ?? null;
        $pool[$slot]['publication_date'] = $docInfo['publication_date'] ?? null;
    }
}
/**
 * Build a clickable link into the original document. For lovdata.no URLs a
 * path-style section anchor (e.g. "/§43") is appended when the section title
 * contains a "§ <number>" reference; every other URL is returned as is.
 *
 * @param string|null $sourceUrl    Document URL; null, empty or blank yields null.
 * @param string|null $sectionTitle Section heading that may carry a § number.
 * @return string|null The deep link, the trimmed document URL, or null.
 */
private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
{
    $documentUrl = $sourceUrl ? trim($sourceUrl) : '';
    if ($documentUrl === '') {
        return null;
    }

    $hostedOnLovdata = preg_match('~^https?://lovdata\.no/~i', $documentUrl) === 1;
    if (!$hostedOnLovdata || !$sectionTitle) {
        return $documentUrl;
    }

    // Section number with an optional single letter or hyphen suffix, e.g. "§ 4-1".
    if (preg_match('/§\s?(\d+[A-Za-z\-]?)/u', $sectionTitle, $sectionHit) !== 1) {
        return $documentUrl;
    }

    return rtrim($documentUrl, '/') . '/§' . $sectionHit[1];
}
/**
 * Collapse hits that refer to the same chunk (same origin and chunk id),
 * merging their matched sub-question ids and keeping the higher similarity
 * and reranker score, then order by score and truncate.
 *
 * @param array $rawPool Hits from all sub-questions, corpus and upload alike.
 * @param int   $cap     Maximum number of entries to return.
 * @return array Unique hits, best first (reranker score, else similarity, else 0).
 */
private function mergeAndDedupe(array $rawPool, int $cap): array
{
    $unique = [];
    foreach ($rawPool as $candidate) {
        // Hits without a chunk id get a random key so they never merge with each other.
        $slot = ($candidate['source_origin'] ?? 'corpus') . ':' . ($candidate['chunk_id'] ?? bin2hex(random_bytes(4)));

        if (!isset($unique[$slot])) {
            $unique[$slot] = $candidate;
            continue;
        }

        $kept = &$unique[$slot];
        $kept['matched_sub_questions'] = array_values(array_unique(array_merge(
            $kept['matched_sub_questions'] ?? [],
            $candidate['matched_sub_questions'] ?? []
        )));
        // For each score, retain whichever duplicate scored higher.
        foreach (['similarity', 'reranker_score'] as $metric) {
            if (($candidate[$metric] ?? 0) > ($kept[$metric] ?? 0)) {
                $kept[$metric] = $candidate[$metric];
            }
        }
        unset($kept);
    }

    $ranked = array_values($unique);
    $rankingScore = static fn(array $hit) => $hit['reranker_score'] ?? $hit['similarity'] ?? 0;
    usort($ranked, static fn(array $first, array $second): int => $rankingScore($second) <=> $rankingScore($first));

    return array_slice($ranked, 0, $cap);
}
/**
 * Attach a citation number 'n' to every chunk (its array key plus one) and
 * return the chunks as a freshly indexed list.
 *
 * @param array $chunks Chunks to number, normally a 0-based list.
 * @return array The same chunks, each with 'n' set.
 */
private function numberSources(array $chunks): array
{
    return array_map(
        static function ($source, $position) {
            $source['n'] = $position + 1;
            return $source;
        },
        $chunks,
        array_keys($chunks)
    );
}
/**
 * Ask the selected LLM engine to write the final brief from the numbered sources.
 *
 * Builds one prompt out of the seed description, the research brief, the
 * sub-questions and a rendered list of the numbered sources, sends it with a
 * JSON-only system message, and decodes the reply.
 *
 * - When `$numberedSources` is empty, no model is called and a canned
 *   "not enough source support" result is returned.
 * - A non-empty `$advocateRole` selects the advocate prompt (which additionally
 *   asks for `client_strengths` and `opposing_weaknesses`); an empty string
 *   selects the neutral deep-research prompt.
 * - If the reply cannot be decoded into an array with a non-empty
 *   `brief_markdown`, the raw reply text is returned as the brief together
 *   with placeholder values for the other fields.
 *
 * @param string $seedDescription Text inserted under "User input:" in the prompt.
 * @param string $brief           Text inserted under "Research brief:" in the prompt.
 * @param array  $subQuestions    Items read via their `id` and `question` keys.
 * @param array  $numberedSources Sources carrying an `n` citation number.
 * @param string $engine          'gpu', 'azure_full', or anything else for the default Azure deployment.
 * @param string $language        'no' for Norwegian; any other value means English.
 * @param float  $temperature     Sampling temperature passed through to the model call.
 * @param string $advocateRole    Client the brief should take the side of; '' for the neutral path.
 * @return array `['json' => array, 'deploy_label' => string]`.
 */
private function synthesise(
string $seedDescription,
string $brief,
array $subQuestions,
array $numberedSources,
string $engine,
string $language,
float $temperature,
string $advocateRole = ''
): array {
// Anything other than 'no' is treated as English.
$locale = $language === 'no' ? 'Norwegian' : 'English';
// Nothing to ground an answer on: return a canned result without calling any model.
// NOTE(review): only brief_markdown is localised here; the other three fields are
// always English — confirm that is intended.
if (empty($numberedSources)) {
return [
'json' => [
'brief_markdown' => $language === 'no'
? 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.'
: 'I did not find enough source support in the corpus to give a grounded answer.',
'what_we_found' => 'No retrieved sources passed the similarity threshold.',
'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
],
'deploy_label' => $engine === 'gpu' ? 'GPU (cuttlefish)' : ($engine === 'azure_full' ? 'gpt-4o' : $this->azure->chatDeployment()),
];
}
// Render every source as a numbered block so the model can cite it as [n].
// NOTE(review): source_origin, title, package_or_corpus and excerpt are read without
// defaults — presumably guaranteed by the caller; verify.
$sourcesContext = [];
foreach ($numberedSources as $s) {
$sourcesContext[] = sprintf(
"[%d] (%s) %s%s\n Corpus: %s\n Authority: %s | Jurisdiction: %s\n Excerpt: %s",
$s['n'],
$s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus',
$s['title'],
!empty($s['section']) ? ' — ' . $s['section'] : '',
$s['package_or_corpus'],
$s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
$s['jurisdiction'] ?? 'n/a',
$s['excerpt']
);
}
$sourcesText = implode("\n\n", $sourcesContext);
// Optional numbered list of the sub-questions; stays empty when there are none.
$subQText = '';
if ($subQuestions) {
$lines = array_map(
fn(array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'], $sq['question']),
$subQuestions,
array_keys($subQuestions)
);
$subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
}
// Request the longer format only when at least three sources are available.
$sourceCount = count($numberedSources);
$lengthGuidance = $sourceCount >= 3
? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
: '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';
// Advocate path: partisan brief for the named client, with two extra JSON fields.
if ($advocateRole !== '') {
$prompt = <<<PROMPT
You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
Your client: {$advocateRole}
You MUST ground every claim in the numbered sources below using inline `[n]` citation markers. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
User input:
{$seedDescription}
Research brief:
{$brief}
{$subQText}
Sources ({$sourceCount} numbered):
{$sourcesText}
Return JSON only in {$locale}:
{
"brief_markdown": "Partisan but factually grounded advocate brief. {$lengthGuidance} Structure: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Identified weaknesses in the opposing party's position with [n] citations, (4) Procedural rights and obligations {$advocateRole} should assert. End with a one-line caveat that this is legal preparation support, not final legal advice.",
"client_strengths": ["3-6 strings — the strongest factual/legal points for {$advocateRole}, each anchored to at least one [n] source"],
"opposing_weaknesses": ["2-5 strings — vulnerabilities in the opposing position supported by retrieved sources. Omit this array entirely if evidence is thin — do NOT invent weaknesses."],
"what_we_found": "2-sentence summary of the most relevant retrieved authority for {$advocateRole}",
"what_remains_uncertain": ["3-5 gaps where evidence is insufficient or law is unclear — be honest"],
"next_practical_step": "one concrete action for {$advocateRole} to take next (legal filing, evidence gathering, consultation type, etc.)"
}
Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer citing statute sections (e.g. "Barneloven §43") and case names verbatim from source excerpts.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- `opposing_weaknesses` must be omitted or empty when no retrieved source actually supports the identified weakness.
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
PROMPT;
} else {
// Neutral path: the standard deep-research synthesis prompt.
$prompt = <<<PROMPT
You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
User input:
{$seedDescription}
Research brief:
{$brief}
{$subQText}
Sources ({$sourceCount} numbered):
{$sourcesText}
Return JSON only in {$locale}:
{
"brief_markdown": "Markdown legal brief. {$lengthGuidance} Every factual claim ends with one or more inline [n] markers keyed to the sources above. Use level-3 headings (###) sparingly to separate paragraphs by theme when helpful. End with a one-line caveat that this is research support, not legal advice.",
"what_we_found": "2-4 sentence plain-language summary of the grounded finding",
"what_remains_uncertain": ["specific gaps — what the corpus did not cover, conflicting authority, or where confidence is limited (3-6 items when sources >= 3)"],
"next_practical_step": "one concrete next action the user can take to strengthen the case or close a gap"
}
Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer pinpointing statute sections (e.g. "Barneloven §43") and case names verbatim from the source excerpts.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
PROMPT;
}
// Both prompt variants share the same system message and call options.
$messages = [
['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
['role' => 'user', 'content' => $prompt],
];
$opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3200, 'timeout' => 180];
// Dispatch to the requested engine; each branch yields the raw reply text and a display label.
try {
if ($engine === 'gpu') {
$response = dbnToolsCallGpuLlm($messages, $opts);
$deployLabel = 'GPU (cuttlefish)';
$raw = (string)($response['choices'][0]['message']['content'] ?? '');
} elseif ($engine === 'azure_full') {
$raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
$deployLabel = 'gpt-4o';
} else {
$raw = $this->azure->chatText($messages, $opts);
$deployLabel = $this->azure->chatDeployment();
}
} catch (Throwable $e) {
// NOTE(review): the code below reads $raw and $deployLabel, so dbnToolsAbort() is
// presumably expected not to return — confirm.
dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
}
$json = $this->azure->decodeJsonObject($raw);
if (!is_array($json) || empty($json['brief_markdown'])) {
// Reply was not usable structured JSON: surface the raw text as the brief instead.
$json = [
'brief_markdown' => $raw,
'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
'next_practical_step' => 'Review the brief manually before relying on it.',
];
}
return [
'json' => $json,
'deploy_label' => $deployLabel,
];
}
/**
 * Grade how well the answer is backed by its sources.
 *
 * The grade depends on the number of sources and on the best numeric score
 * among them (reranker score when present, otherwise similarity; 0 when no
 * source carries a numeric score):
 *   - "high":   at least 6 sources and best score >= 0.5
 *   - "medium": at least 3 sources and best score >= 0.35
 *   - "low":    everything else, including an empty source list
 *
 * @param array $sources Source chunks with optional `reranker_score` / `similarity`.
 * @return string One of "high", "medium" or "low".
 */
private function citationConfidence(array $sources): string
{
    $total = count($sources);
    if ($total === 0) {
        return 'low';
    }

    $pickScore = static fn(array $source) => $source['reranker_score'] ?? $source['similarity'] ?? null;
    $numeric = array_filter(array_map($pickScore, $sources), 'is_numeric');
    $top = $numeric === [] ? 0 : max($numeric);

    // Tiers are checked strongest first: label => [minimum sources, minimum best score].
    $tiers = [
        'high' => [6, 0.5],
        'medium' => [3, 0.35],
    ];
    foreach ($tiers as $label => [$minSources, $minScore]) {
        if ($total >= $minSources && $top >= $minScore) {
            return $label;
        }
    }

    return 'low';
}
/**
 * Build one trace entry.
 *
 * @param string $label  Short name of the step.
 * @param string $detail Description of what the step did.
 * @param string $status Step status; defaults to "complete".
 * @return array Map with the keys `label`, `detail` and `status`.
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    return compact('label', 'detail', 'status');
}
/**
 * Milliseconds elapsed since `$start`, rounded to the nearest integer.
 *
 * @param float $start Earlier timestamp in seconds, as returned by microtime(true).
 * @return int Elapsed time in whole milliseconds.
 */
private function elapsedMs(float $start): int
{
    $elapsedSeconds = microtime(true) - $start;
    $elapsedMillis = round($elapsedSeconds * 1000);

    return (int)$elapsedMillis;
}
}