d156f8cf6b
Wire the legal-domain persona picker into corpus, deep-research, korrespond and the dashboard chat. Each endpoint reads the chosen profile, resolves its packages against client 57, and scopes retrieval via package_ids (falling back to family when omitted). New dashboard tenants now subscribe to all DBN domain packages so persona switching survives the subscription intersection. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1298 lines
61 KiB
PHP
1298 lines
61 KiB
PHP
<?php
|
||
declare(strict_types=1);
|
||
|
||
require_once __DIR__ . '/bootstrap.php';
|
||
require_once __DIR__ . '/AzureOpenAiGateway.php';
|
||
require_once __DIR__ . '/DbnGatewayFactory.php';
|
||
require_once __DIR__ . '/DbnBedrockModelRouter.php';
|
||
|
||
final class DbnDeepResearchAgent
|
||
{
|
||
// Character cap applied separately to the question and to the pasted text when the seed description is built.
private const MAX_SEED_CHARS = 16000;
|
||
// Character cap applied to each uploaded file's text before it is chunked or added to the seed description.
private const MAX_UPLOAD_CHARS = 64000;
|
||
// Number of words per upload chunk window.
private const CHUNK_WORDS = 600;
|
||
// Number of words shared between two consecutive upload chunk windows.
private const CHUNK_OVERLAP_WORDS = 75;
|
||
// Minimum window size (in words) for a non-first upload chunk to be kept.
private const MIN_CHUNK_WORDS = 50;
|
||
// Cap handed to mergeAndDedupe() when the per-sub-question results are merged.
private const POOL_CAP = 30;
|
||
|
||
// LLM gateway used for chat calls and JSON decoding (Azure OpenAI or Bedrock, despite the property name).
private DbnAzureOpenAiGateway|DbnBedrockGateway $azure;
|
||
// Per-run in-memory index of uploaded chunks: list of ['meta' => chunk array, 'vec' => embedding]. Reset at the start of run().
private array $uploadVecs = [];
|
||
// Per-run map of step id => elapsed milliseconds. Reset at the start of run().
private array $stepTimings = [];
|
||
|
||
/**
 * Build the agent around an LLM gateway.
 *
 * @param DbnAzureOpenAiGateway|DbnBedrockGateway|null $azure Gateway to use; when null, one is
 *        obtained from DbnGatewayFactory::makeForTool('deep-research').
 */
public function __construct(DbnAzureOpenAiGateway|DbnBedrockGateway|null $azure = null)
{
    // The argument is either an object or null, so a null-coalesce selects the same gateway.
    $this->azure = $azure ?? DbnGatewayFactory::makeForTool('deep-research');
}
|
||
|
||
/**
 * Run one deep-research (or advocate) pass end to end and return the result payload.
 *
 * Steps, in order: query interpretation, sub-question expansion (or caller override),
 * slice resolution, in-memory upload indexing, per-sub-question retrieval, synthesis,
 * and citation confidence. Each step is appended to the returned trace and, when
 * $emit is given, streamed to the caller.
 *
 * @param string        $seedQuery            User question (trimmed here).
 * @param string        $pastedText           Pasted text (trimmed here).
 * @param array         $uploadedFiles        Uploaded files; each entry is read via its 'filename' and 'text' keys.
 * @param array         $sliceSelection       Raw slice toggles, normalised by dbnV6NormalizeSliceSelection().
 * @param string        $engine               Synthesis engine id; unknown values fall back to 'azure_mini'.
 * @param string        $language             UI language, normalised by dbnToolsNormalizeUiLanguage().
 * @param array         $controls             Tuning knobs, clamped by normalizeControls().
 * @param callable|null $emit                 Optional fn(string $event, array $payload); events used here are
 *                                            'step', 'progress' and 'subq'.
 * @param string        $advocateRole         When non-empty the result is tagged as tool 'advocate'.
 * @param array|null    $priorContext         Prior-run context forwarded to interpretation and synthesis.
 * @param string        $branchNotes          Researcher notes forwarded to interpretation and synthesis.
 * @param array         $subQuestionsOverride Caller-supplied sub-questions that replace LLM expansion.
 * @param string|null   $persona              Persona key resolved to package ids via dbnToolsResolvePersona().
 *
 * @return array<string, mixed> Result payload (brief, sources, sub-questions, trace, trace metadata, disclaimer).
 */
public function run(
    string $seedQuery,
    string $pastedText,
    array $uploadedFiles,
    array $sliceSelection,
    string $engine,
    string $language,
    array $controls,
    ?callable $emit = null,
    string $advocateRole = '',
    ?array $priorContext = null,
    string $branchNotes = '',
    array $subQuestionsOverride = [],
    ?string $persona = null
): array {
    $seedQuery = trim($seedQuery);
    $pastedText = trim($pastedText);
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'dbn_legal', 'dbn_legal_v3', 'claude_sonnet', 'claude_haiku'], true) ? $engine : 'azure_mini';
    $language = dbnToolsNormalizeUiLanguage($language);

    $controls = $this->normalizeControls($controls);

    if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) {
        dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed');
    }

    $client = dbnToolsRequireClient();
    $personaResolved = dbnToolsResolvePersona((int)$client['id'], $persona);
    $packageIds = array_values(array_filter(
        array_map('intval', $personaResolved['package_ids'] ?? []),
        static fn(int $id): bool => $id > 0
    ));
    if (!$packageIds) {
        // Persona resolved without a package → fall back to the legacy family package.
        $packageIds = [(int)$this->requireFamilyPackage((int)$client['id'])['id']];
    }

    dbnToolsBootCaveau();
    $aiPortalRoot = dbnToolsAiPortalRoot();
    require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

    // Per-run state: a reused agent instance must not leak vectors or timings between runs.
    $this->uploadVecs = [];
    $this->stepTimings = [];

    $trace = [];
    $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles);

    // Records a finished step in the trace and streams it to the caller.
    $emitStep = function (string $stepId, string $label, string $detail, string $status) use (&$trace, $emit): void {
        $trace[] = $this->trace($label, $detail, $status);
        if ($emit) {
            $emit('step', [
                'step' => $stepId,
                'label' => $label,
                'detail' => $detail,
                'status' => $status,
            ]);
        }
    };
    // Streams a "running" marker only; running markers are not added to the trace.
    $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
        if ($emit) {
            $emit('step', [
                'step' => $stepId,
                'label' => $label,
                'detail' => $detail,
                'status' => 'running',
            ]);
        }
    };

    // STEP 1: Query interpretation
    $emitRunning('interpretation', 'Query interpretation', 'Summarising the seed input…');
    $stepStart = microtime(true);
    $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole, $priorContext, $branchNotes);
    $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
    $emitStep('interpretation', 'Query interpretation', $interpretation['detail'], 'complete');

    // STEP 2: Query expansion (or use caller-supplied override)
    $stepStart = microtime(true);
    if (!empty($subQuestionsOverride)) {
        // Normalise caller input: later steps read $sq['id'] and pass it and $sq['question']
        // to string-typed parameters, so a missing id or non-string question would
        // otherwise abort the run with a TypeError.
        $subQuestions = [];
        foreach ($subQuestionsOverride as $sq) {
            if (!is_array($sq)) {
                continue;
            }
            $question = is_scalar($sq['question'] ?? null) ? trim((string)$sq['question']) : '';
            if ($question === '') {
                continue;
            }
            $id = is_scalar($sq['id'] ?? null) ? trim((string)$sq['id']) : '';
            $sq['id'] = $id !== '' ? $id : 'q' . (count($subQuestions) + 1);
            $sq['question'] = $question;
            $subQuestions[] = $sq;
        }
        $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
        $emitStep('expansion', 'Query expansion',
            sprintf('Using %d custom sub-question(s) supplied by the user.', count($subQuestions)), 'complete');
    } else {
        $emitRunning('expansion', 'Query expansion', 'Generating sub-questions…');
        $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $interpretation['key_signals'], $controls['sub_q_count'], $language, $advocateRole);
        $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
        $subQuestions = $expansion['questions'];
        $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
        $expansionDetail = $expansion['fallback']
            ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.'
            : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions));
        $emitStep('expansion', 'Query expansion', $expansionDetail, $expansionStatus);
    }

    // STEP 3: Slice resolution
    $emitRunning('slice_resolution', 'Slice resolution', 'Resolving slice toggles to document IDs…');
    $stepStart = microtime(true);
    $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
    if (!array_filter($sliceSelectionNormalized)) {
        dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices');
    }
    $ragDb = dbnToolsRagDb();
    try {
        $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
        $sliceStatus = 'complete';
        $sliceDetail = sprintf(
            '%d slice(s) active → %d candidate documents constrain the corpus search.',
            count(array_filter($sliceSelectionNormalized)),
            count($sharedDocIds)
        );
    } catch (Throwable $e) {
        // Best effort: a failed resolution degrades to an unconstrained search plus a warning.
        error_log('DBN deep research slice resolve failed: ' . $e->getMessage());
        $sharedDocIds = [];
        $sliceStatus = 'warning';
        $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
    }
    $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
    $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

    // STEP 4: Upload indexing (in-memory, ephemeral)
    $emitRunning('upload_indexing', 'Upload indexing', empty($uploadedFiles)
        ? 'No uploads; skipping…'
        : sprintf('Chunking + embedding %d file(s) in memory…', count($uploadedFiles)));
    $stepStart = microtime(true);
    $uploadChunks = [];
    foreach ($uploadedFiles as $idx => $file) {
        $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
        // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
        $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
        // Append in place instead of re-copying the accumulated list on every file.
        array_push($uploadChunks, ...$this->splitIntoChunks($text, $filename, $idx));
    }
    $uploadStatus = 'complete';
    $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
    if ($uploadChunks) {
        try {
            // Embed in small batches of 5, emitting progress between each so the stream
            // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
            $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
            $textCount = count($texts);
            $allVecs = [];
            $batchSize = 5;
            for ($b = 0; $b < $textCount; $b += $batchSize) {
                $batch = array_slice($texts, $b, $batchSize);
                if ($emit) {
                    $emit('progress', ['detail' => sprintf(
                        'Embedding chunks %d–%d of %d…',
                        $b + 1, $b + count($batch), $textCount
                    )]);
                }
                $batchVecs = dbnToolsLiteLLMEmbedBatch($batch);
                $allVecs = array_merge($allVecs, $batchVecs);
            }
            $vecs = $allVecs;
            // Only pair chunks with vectors when the counts line up; otherwise indices cannot be trusted.
            if (count($vecs) === count($uploadChunks)) {
                foreach ($uploadChunks as $i => $chunk) {
                    $this->uploadVecs[] = [
                        'meta' => $chunk,
                        'vec' => $vecs[$i],
                    ];
                }
            } else {
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.';
            }
        } catch (Throwable $e) {
            error_log('DBN deep research upload embed failed: ' . $e->getMessage());
            $uploadStatus = 'warning';
            $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
            $this->uploadVecs = [];
        }
    } elseif (empty($uploadedFiles)) {
        $uploadDetail = 'No files uploaded; agent will research the corpus only.';
    }
    $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
    $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

    // STEP 5: Retrieval (per sub-question)
    $retrievalQueries = $subQuestions ?: [[
        'id' => 'q1',
        'question' => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'),
        'rationale' => 'Seed query (no sub-question expansion).',
    ]];
    $emitRunning('retrieval', 'Retrieval', sprintf('Hybrid vector + keyword + rerank across %d sub-question(s)…', count($retrievalQueries)));
    $stepStart = microtime(true);

    try {
        $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
    } catch (Throwable $e) {
        // Log the cause before aborting; the abort message alone hides why init failed.
        error_log('DBN deep research RAG init failed: ' . $e->getMessage());
        dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
    }

    $rawPool = [];
    $retrievalWarnings = 0;
    $rawCorpusCount = 0;
    $rawUploadCount = 0;
    $filteredOutCount = 0;
    foreach ($retrievalQueries as $idx => $sq) {
        if ($emit) {
            $emit('subq', [
                'index' => $idx + 1,
                'total' => count($retrievalQueries),
                'id' => $sq['id'],
                'question' => $sq['question'],
            ]);
        }
        try {
            $corpusChunks = $rag->searchAll(
                $sq['question'],
                $controls['chunk_limit'],
                null,
                [
                    'search_private' => false,
                    'search_shared' => true,
                    'package_ids' => $packageIds,
                    'shared_doc_ids' => $sharedDocIds,
                    'chunk_limit' => $controls['chunk_limit'],
                    'search_method' => 'hybrid',
                    'reranker_enabled' => true,
                    'include_beta_website' => false,
                    'include_primary_website' => false,
                ]
            );
        } catch (Throwable $e) {
            // One failed sub-question must not sink the whole run; count it and carry on.
            error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage());
            $corpusChunks = [];
            $retrievalWarnings++;
        }
        $rawCorpusCount += count($corpusChunks);
        foreach ($corpusChunks as $chunk) {
            if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
                $filteredOutCount++;
                continue;
            }
            $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
        }

        // Upload chunk retrieval via cosine sim
        if (!empty($this->uploadVecs)) {
            $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
            $rawUploadCount += count($uploadHits);
            foreach ($uploadHits as $hit) {
                $hit['matched_sub_questions'] = [$sq['id']];
                $rawPool[] = $hit;
            }
        }
    }

    $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
    $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
    $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
    $retrievalDetail = sprintf(
        '%d sub-question(s) × hybrid + RRF + rerank → %d corpus chunks (%d filtered) + %d upload hits → %d unique after dedupe.',
        count($retrievalQueries),
        $rawCorpusCount,
        $filteredOutCount,
        $rawUploadCount,
        count($merged)
    );
    $emitStep('retrieval', 'Retrieval', $retrievalDetail, $retrievalStatus);

    // Cap pool to reranker top-K for synthesis
    $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);

    // Hydrate corpus sources with source_url + authority_label via batched dbn_v6 query
    $this->hydrateSourceUrls($synthesisPool);

    $numberedSources = $this->numberSources($synthesisPool);

    $retrievalCounts = [
        'raw_corpus' => $rawCorpusCount,
        'filtered_website' => $filteredOutCount,
        'post_filter_corpus' => $rawCorpusCount - $filteredOutCount,
        'raw_upload' => $rawUploadCount,
        'after_dedupe' => count($merged),
        'after_topk' => count($numberedSources),
    ];

    // STEP 6: Synthesis
    // Progress label only. The non-Azure labels are derived from the engine ids
    // (NOTE(review): confirm wording); the authoritative name is the 'deploy_label'
    // returned by synthesise() and reported once the step completes.
    $synthesisEngineLabel = match ($engine) {
        'azure_full' => 'Azure gpt-4o',
        'gpu' => 'GPU qwen2.5:14b',
        'claude_sonnet' => 'Claude Sonnet',
        'claude_haiku' => 'Claude Haiku',
        'dbn_legal' => 'DBN Legal',
        'dbn_legal_v3' => 'DBN Legal v3',
        default => 'Azure gpt-4o-mini',
    };
    $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising cited brief with %s — this is the slowest step…', $synthesisEngineLabel));
    $stepStart = microtime(true);
    // Attach upload summaries to numbered sources. Summaries are generated only for
    // files that actually contributed a numbered source, so no LLM call is spent on
    // an upload whose chunks did not survive retrieval.
    if (!empty($uploadedFiles) && !empty($numberedSources)) {
        // source position => index of the uploaded file the chunk came from
        $sourceFileIdx = [];
        foreach ($numberedSources as $pos => $src) {
            if (($src['source_origin'] ?? '') !== 'upload') {
                continue;
            }
            if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
                $sourceFileIdx[$pos] = (int)$m[1];
            }
        }
        $citedFiles = array_flip($sourceFileIdx);

        $uploadSummaries = [];
        foreach ($uploadedFiles as $idx => $file) {
            if (!isset($citedFiles[$idx])) {
                continue;
            }
            $text = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8');
            $filename = (string)($file['filename'] ?? "file-{$idx}");
            if ($text === '') {
                continue;
            }
            try {
                $raw = $this->azure->chatText([
                    ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                    ['role' => 'user', 'content' => "Summarise this document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"],
                ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 20]);
                $uploadSummaries[$idx] = trim($raw);
            } catch (Throwable $e) {
                error_log('DBN upload summary gen failed for file ' . $idx . ': ' . $e->getMessage());
                $uploadSummaries[$idx] = null;
            }
        }
        foreach ($sourceFileIdx as $pos => $fileIdx) {
            $numberedSources[$pos]['summary'] = $uploadSummaries[$fileIdx] ?? null;
        }
    }

    $synthesis = $this->synthesise(
        $seedDescription,
        $interpretation['brief'],
        $retrievalQueries,
        $numberedSources,
        $engine,
        $language,
        $controls['temperature'],
        $advocateRole,
        $priorContext,
        $branchNotes,
        $interpretation['key_signals'] ?? []
    );
    $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
    $emitStep(
        'synthesis',
        'Synthesis',
        sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)),
        'complete'
    );

    // STEP 7: Confidence
    $confidence = $this->citationConfidence($numberedSources);
    $emitStep(
        'confidence',
        'Citation confidence',
        sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)),
        $confidence === 'low' ? 'warning' : 'complete'
    );

    // Stitch sub-question chunk_ids + top_sources (top 3 sources matched by each sub-Q)
    $subQOut = [];
    foreach ($retrievalQueries as $sq) {
        $matchedChunks = array_values(array_filter(
            $numberedSources,
            fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
        ));
        $topSources = array_slice($matchedChunks, 0, 3);
        $subQOut[] = [
            'id' => $sq['id'],
            'question' => $sq['question'],
            'rationale' => $sq['rationale'] ?? '',
            'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
            'top_sources' => array_map(fn(array $s) => [
                'n' => $s['n'] ?? null,
                'title' => $s['title'] ?? '',
                'section' => $s['section'] ?? null,
                'deep_link' => $s['deep_link'] ?? $s['source_url'] ?? null,
                'source_url' => $s['source_url'] ?? null,
                'source_origin' => $s['source_origin'] ?? 'corpus',
                'authority_label' => $s['authority_label'] ?? null,
                'graph_expanded' => $s['graph_expanded'] ?? false,
                'excerpt' => $s['excerpt'] ?? '',
            ], $topSources),
        ];
    }

    $isAdvocate = $advocateRole !== '';
    return [
        'tool' => $isAdvocate ? 'advocate' : 'deep_research',
        'language' => $language,
        'advocate_role' => $isAdvocate ? $advocateRole : null,
        'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''),
        'client_strengths' => $isAdvocate ? ($synthesis['json']['client_strengths'] ?? []) : null,
        'opposing_weaknesses' => $isAdvocate ? ($synthesis['json']['opposing_weaknesses'] ?? []) : null,
        'sub_questions' => $subQOut,
        'sources' => $numberedSources,
        'what_we_found' => (string)($synthesis['json']['what_we_found'] ?? ''),
        'evidence_trail' => $numberedSources,
        'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
        'next_practical_step' => (string)($synthesis['json']['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($merged),
            'source_count' => count($numberedSources),
            'sub_question_count' => count($retrievalQueries),
            'upload_chunk_count' => count($this->uploadVecs),
            'deployment' => $synthesis['deploy_label'],
            'engine_used' => $engine,
            'citation_confidence' => $confidence,
            'elapsed_ms_per_step' => $this->stepTimings,
            'retrieval_counts' => $retrievalCounts,
            'slices_active' => array_keys(array_filter($sliceSelectionNormalized)),
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
|
||
|
||
/**
 * Clamp the caller-supplied tuning knobs to their supported ranges.
 *
 * Missing keys take a default; every value is cast and then clamped:
 * sub_q_count 3..5 (default 4), chunk_limit 4..10 (default 6),
 * similarity_threshold 0.2..0.6 (default 0.30), reranker_top_k 8..14 (default 12),
 * temperature 0.05..0.4 (default 0.10).
 *
 * @param array<string, mixed> $controls Raw controls from the caller.
 *
 * @return array{sub_q_count:int, chunk_limit:int, similarity_threshold:float, reranker_top_k:int, temperature:float}
 */
private function normalizeControls(array $controls): array
{
    // Cast first, clamp second, so each knob reads as "requested value → allowed window".
    $requestedSubQs = (int)($controls['sub_q_count'] ?? 4);
    $requestedChunks = (int)($controls['chunk_limit'] ?? 6);
    $requestedThreshold = (float)($controls['similarity_threshold'] ?? 0.30);
    $requestedTopK = (int)($controls['reranker_top_k'] ?? 12);
    $requestedTemperature = (float)($controls['temperature'] ?? 0.10);

    $clamped = [];
    $clamped['sub_q_count'] = max(3, min(5, $requestedSubQs));
    $clamped['chunk_limit'] = max(4, min(10, $requestedChunks));
    $clamped['similarity_threshold'] = max(0.2, min(0.6, $requestedThreshold));
    $clamped['reranker_top_k'] = max(8, min(14, $requestedTopK));
    $clamped['temperature'] = max(0.05, min(0.4, $requestedTemperature));

    return $clamped;
}
|
||
|
||
/**
 * Fetch the 'family-legal' package and verify it is usable for this client.
 *
 * Calls dbnToolsAbort() with HTTP 503 when the package is missing or inactive
 * ('package_unavailable') or when the client has no active subscription to it
 * ('subscription_missing'). dbnToolsAbort() presumably ends the request — confirm
 * against its definition.
 *
 * @param int $clientId Client whose subscription is checked.
 *
 * @return array The package row as returned by dbnToolsFetchPackage().
 */
private function requireFamilyPackage(int $clientId): array
{
    $familyPackage = dbnToolsFetchPackage('family-legal');

    $isActive = $familyPackage && !empty($familyPackage['is_active']);
    if (!$isActive) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $isSubscribed = dbnToolsHasActiveSubscription($clientId, (int)$familyPackage['id']);
    if (!$isSubscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $familyPackage;
}
|
||
|
||
/**
 * Assemble the textual seed handed to the interpretation and expansion prompts.
 *
 * Sections are emitted in a fixed order — question, pasted text, then one section
 * per uploaded file with non-empty text — and joined with a blank line. The question
 * and pasted text are each capped at MAX_SEED_CHARS; each upload at MAX_UPLOAD_CHARS.
 *
 * @param string $seedQuery     Question (already trimmed by the caller).
 * @param string $pastedText    Pasted text (already trimmed by the caller).
 * @param array  $uploadedFiles Uploads, read via their 'filename' and 'text' keys.
 *
 * @return string The combined description; empty when every input is empty.
 */
private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string
{
    $sections = [];

    if ($seedQuery !== '') {
        $questionExcerpt = mb_substr($seedQuery, 0, self::MAX_SEED_CHARS, 'UTF-8');
        $sections[] = "Question:\n" . $questionExcerpt;
    }

    if ($pastedText !== '') {
        $pastedExcerpt = mb_substr($pastedText, 0, self::MAX_SEED_CHARS, 'UTF-8');
        $sections[] = "Pasted text:\n" . $pastedExcerpt;
    }

    foreach ($uploadedFiles as $position => $upload) {
        // Files without a name are labelled by their 1-based position.
        $label = (string)($upload['filename'] ?? sprintf('upload-%d', $position + 1));
        $body = (string)($upload['text'] ?? '');
        if ($body !== '') {
            $bodyExcerpt = mb_substr($body, 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
            $sections[] = "Uploaded file [{$label}]:\n" . $bodyExcerpt;
        }
    }

    return implode("\n\n", $sections);
}
|
||
|
||
/**
 * Ask an LLM for a short research brief and retrieval key signals for the seed input.
 *
 * Norwegian ('no') requests and all advocate requests go to the GPU model via
 * dbnToolsCallGpuLlm(); everything else goes through the injected gateway (switched
 * to the Haiku deployment when that gateway is Bedrock). Any failure, or a response
 * without a usable brief, yields the "skipped" result instead of an exception.
 *
 * The model output is untrusted: the brief must be a non-empty scalar and only
 * non-empty scalar key signals are kept (at most 8), so nested values never reach
 * implode() or the downstream prompts.
 *
 * NOTE(review): the full seed description (uploads included) is sent here, while
 * expandQueries() truncates it to avoid timeouts — confirm that is intentional.
 *
 * @param string     $seedDescription Output of buildSeedDescription().
 * @param string     $language        Normalised UI language code.
 * @param string     $advocateRole    Party being represented; '' for neutral research.
 * @param array|null $priorContext    Optional prior run; 'original_query' and 'what_we_found' are read.
 * @param string     $branchNotes     Researcher notes; only used when $priorContext is non-empty.
 *
 * @return array{brief:string, key_signals:list<string>, detail:string}
 */
private function interpretSeed(string $seedDescription, string $language, string $advocateRole = '', ?array $priorContext = null, string $branchNotes = ''): array
{
    $locale = dbnToolsLanguageName($language);
    $rolePrefix = $advocateRole !== ''
        ? "You are preparing a case-research brief for: {$advocateRole}. Frame your interpretation to identify the strongest legal angles for this party.\n\n"
        : '';

    $priorContextBlock = '';
    if (!empty($priorContext)) {
        $parts = ['Prior research context:'];
        if (!empty($priorContext['original_query'])) {
            $parts[] = 'Original question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
        }
        if (!empty($priorContext['what_we_found'])) {
            $parts[] = 'Key findings: ' . mb_substr((string)$priorContext['what_we_found'], 0, 400, 'UTF-8');
        }
        if ($branchNotes !== '') {
            $parts[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8');
        }
        $priorContextBlock = implode("\n", $parts) . "\n\nNow investigate this branch:\n";
    }

    $prompt = <<<PROMPT
        {$rolePrefix}{$priorContextBlock}You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.

        Input:
        {$seedDescription}

        In {$locale}, produce JSON with:
        {
        "brief": "1-3 sentence description of what the user is trying to research (≤ 300 chars)",
        "key_signals": ["short keywords or terms that should drive retrieval"]
        }
        PROMPT;

    try {
        $sysMsg = ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'];
        $userMsg = ['role' => 'user', 'content' => $prompt];
        if ($language === 'no' || $advocateRole !== '') {
            $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
                'model' => 'dbn-legal-agent-v2', 'json' => true,
                'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 40,
            ]);
            $raw = (string)($resp['choices'][0]['message']['content'] ?? '');
        } else {
            $interpGateway = ($this->azure instanceof DbnBedrockGateway)
                ? $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU)
                : $this->azure;
            $raw = $interpGateway->chatText([$sysMsg, $userMsg],
                ['json' => true, 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 60]);
        }
        $json = $this->azure->decodeJsonObject($raw);

        $brief = (is_array($json) && is_scalar($json['brief'] ?? null))
            ? trim((string)$json['brief'])
            : '';
        if ($brief !== '') {
            // Keep only usable signals; the model may return nested or blank entries.
            $signals = [];
            $rawSignals = is_array($json['key_signals'] ?? null) ? $json['key_signals'] : [];
            foreach ($rawSignals as $signal) {
                if (!is_scalar($signal)) {
                    continue;
                }
                $signal = trim((string)$signal);
                if ($signal === '') {
                    continue;
                }
                $signals[] = $signal;
                if (count($signals) >= 8) {
                    break;
                }
            }
            $signalText = $signals ? implode(', ', $signals) : '';
            return [
                'brief' => $brief,
                'key_signals' => $signals,
                'detail' => sprintf('Research focus: %s%s', $brief, $signalText ? ' — signals: ' . $signalText : ''),
            ];
        }
    } catch (Throwable $e) {
        // Interpretation is best effort; the run continues on the raw seed.
        error_log('DBN deep research interpretation failed: ' . $e->getMessage());
    }

    return [
        'brief' => '',
        'key_signals' => [],
        'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
    ];
}
|
||
|
||
/**
 * Ask an LLM to decompose the research request into focused sub-questions.
 *
 * Uses an advocate-specific prompt when $advocateRole is set, otherwise the neutral
 * prompt. Norwegian ('no') requests go to the GPU model via dbnToolsCallGpuLlm();
 * others go through the injected gateway (Haiku deployment when it is Bedrock).
 *
 * The model output is untrusted: entries must be arrays with a non-blank scalar
 * question. Ids are assigned sequentially from the accepted entries ('q1', 'q2', …)
 * rather than from the response's array keys, so skipped entries leave no gaps and
 * a response keyed by strings does not throw and discard every question.
 *
 * @param string       $seedDescription Output of buildSeedDescription(); truncated before use.
 * @param string       $brief           Brief from interpretSeed() (may be '').
 * @param list<string> $keySignals      Retrieval anchors from interpretSeed().
 * @param int          $targetCount     Number of sub-questions requested; also the cap on results.
 * @param string       $language        Normalised UI language code.
 * @param string       $advocateRole    Party being represented; '' for neutral research.
 *
 * @return array{questions: list<array{id:string, question:string, rationale:string}>, fallback: bool}
 *         'fallback' is true (with no questions) when fewer than two usable
 *         sub-questions were parsed or the call failed.
 */
private function expandQueries(string $seedDescription, string $brief, array $keySignals, int $targetCount, string $language, string $advocateRole = ''): array
{
    $locale = dbnToolsLanguageName($language);
    $anchorsLine = !empty($keySignals)
        ? "\nKey retrieval anchors (incorporate these terms into your sub-questions where relevant):\n" . implode(', ', $keySignals) . "\n"
        : '';

    // Truncate seed to 2000 chars — $brief already captures the key context;
    // the full upload text (up to 192K chars) would push past the 60s timeout.
    $seedExcerpt = mb_strimwidth($seedDescription, 0, 2000, '…', 'UTF-8');

    if ($advocateRole !== '') {
        $prompt = <<<PROMPT
            You are a Norwegian family-law research assistant building a case for: {$advocateRole}.
            Generate exactly {$targetCount} targeted sub-questions designed to find:
            1. Lovdata statutes and ECHR/Hague precedents that support {$advocateRole}'s position.
            2. Procedural rights and obligations the opposing party must satisfy — failures here help {$advocateRole}.
            3. Case law that exposes weaknesses in the opposing party's likely arguments.
            4. Specific articles, paragraphs, or judgments {$advocateRole}'s representative should cite.
            5. Specific documentation and procedural obligations Barnevernet or the opposing authority must fulfil — procedural or evidentiary failures that Norwegian courts have used to rule in favour of parents or children.

            Research brief:
            {$brief}
            {$anchorsLine}
            Raw input:
            {$seedExcerpt}

            Return JSON only in {$locale}:
            {
            "sub_questions": [
            {"id":"q1","question":"...","rationale":"how finding this strengthens {$advocateRole}'s case (≤ 140 chars)"}
            ]
            }

            Rules:
            - Exactly {$targetCount} sub-questions, no more, no fewer.
            - Every question must be answerable from Norwegian family-law, child-welfare, or ECHR/Hague sources.
            - Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame, Barnevernet procedural obligation).
            - Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
            - Sub-questions must be self-contained — readable without the raw input.
            - Write the questions in {$locale}.
            PROMPT;
    } else {
        $prompt = <<<PROMPT
            You are decomposing a Do Better Norge legal-research request into {$targetCount} focused sub-questions that should each be answered by the legal corpus (Norwegian family law, child welfare, ECHR/Hague).

            Research brief:
            {$brief}
            {$anchorsLine}
            Raw input:
            {$seedExcerpt}

            Return JSON only:
            {
            "sub_questions": [
            {"id":"q1","question":"... ({$locale})","rationale":"why this angle matters (≤ 140 chars)"}
            ]
            }

            Rules:
            - Exactly {$targetCount} sub-questions, no more, no fewer.
            - Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
            - Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
            - Each sub-question must reference a DIFFERENT legal instrument, statute section, or ECHR article — do not repeat the same §-reference or case name across sub-questions.
            - Sub-questions must be self-contained — readable without seeing the seed text.
            - Write the questions in {$locale}.
            PROMPT;
    }

    try {
        $sysMsg = ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'];
        $userMsg = ['role' => 'user', 'content' => $prompt];
        if ($language === 'no') {
            $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [
                'model' => 'dbn-legal-agent-v2', 'json' => true,
                'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 50,
            ]);
            $raw = (string)($resp['choices'][0]['message']['content'] ?? '');
        } else {
            $expGateway = ($this->azure instanceof DbnBedrockGateway)
                ? $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU)
                : $this->azure;
            $raw = $expGateway->chatText([$sysMsg, $userMsg],
                ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 60]);
        }
        $json = $this->azure->decodeJsonObject($raw);
        $items = is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : [];
        $normalized = [];
        foreach ($items as $item) {
            if (!is_array($item) || !is_scalar($item['question'] ?? null)) {
                continue;
            }
            // Trim before testing so whitespace-only questions are rejected too.
            $question = trim((string)$item['question']);
            if ($question === '') {
                continue;
            }
            $rationale = is_scalar($item['rationale'] ?? null) ? trim((string)$item['rationale']) : '';
            $normalized[] = [
                'id' => 'q' . (count($normalized) + 1),
                'question' => $question,
                'rationale' => $rationale,
            ];
            if (count($normalized) >= $targetCount) {
                break;
            }
        }
        // A single sub-question adds nothing over the seed query; require at least two.
        if (count($normalized) >= 2) {
            return ['questions' => $normalized, 'fallback' => false];
        }
    } catch (Throwable $e) {
        // Expansion is best effort; the caller falls back to the seed query.
        error_log('DBN deep research expansion failed: ' . $e->getMessage());
    }

    return ['questions' => [], 'fallback' => true];
}
|
||
|
||
/**
 * Split an uploaded file's text into overlapping word windows for embedding.
 *
 * Whitespace is collapsed, the text is split into words, and windows of CHUNK_WORDS
 * words are taken every (CHUNK_WORDS - CHUNK_OVERLAP_WORDS) words. The first window
 * is always kept; later windows are kept only when they hold at least
 * MIN_CHUNK_WORDS words. Windowing stops as soon as a window reaches the last word,
 * so no trailing chunk made purely of overlap words is emitted when the text ends
 * exactly on a window boundary.
 *
 * @param string $text     File text (already length-capped by the caller).
 * @param string $filename Name recorded on every chunk.
 * @param int    $fileIdx  Index of the file among the uploads; part of the chunk id.
 *
 * @return list<array{chunk_id:string, file_index:int, chunk_index:int, filename:string, text:string}>
 *         Empty when the text has no words.
 */
private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
{
    // preg_replace() returns null on a PCRE error (e.g. invalid UTF-8 with the /u flag);
    // such text is treated as empty.
    $text = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
    if ($text === '') {
        return [];
    }
    $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if (!$words) {
        return [];
    }

    // Guard against a zero/negative stride should the constants ever be changed.
    $advance = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);
    $total = count($words);
    $chunks = [];
    $chunkIdx = 0;
    for ($i = 0; $i < $total; $i += $advance) {
        $slice = array_slice($words, $i, self::CHUNK_WORDS);
        $sliceLen = count($slice);
        if ($sliceLen >= self::MIN_CHUNK_WORDS || $i === 0) {
            $chunks[] = [
                'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $chunkIdx),
                'file_index' => $fileIdx,
                'chunk_index' => $chunkIdx,
                'filename' => $filename,
                'text' => implode(' ', $slice),
            ];
            $chunkIdx++;
        }
        // This window already covers the final word: any further window would only
        // repeat the overlap region of this one.
        if ($i + $sliceLen >= $total) {
            break;
        }
    }
    return $chunks;
}
|
||
|
||
/**
 * Rank the in-memory upload chunks against one sub-question by cosine similarity.
 *
 * The question is embedded with dbnToolsLiteLLMEmbedBatch(); chunks scoring below
 * $threshold are dropped and the rest are returned best-first, capped at
 * ceil($limitPerSubQ / 2) entries (never fewer than one slot). An embedding failure
 * or an empty question vector yields no hits rather than an exception.
 *
 * @param string $question     Sub-question text.
 * @param int    $limitPerSubQ Per-sub-question chunk limit; uploads get half of it.
 * @param float  $threshold    Minimum cosine similarity for a chunk to qualify.
 *
 * @return list<array<string, mixed>> Source-shaped hits with source_origin 'upload'.
 */
private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
{
    if (empty($this->uploadVecs)) {
        return [];
    }

    try {
        $questionVec = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? [];
    } catch (Throwable $e) {
        error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage());
        return [];
    }
    if (empty($questionVec)) {
        return [];
    }

    $hits = [];
    foreach ($this->uploadVecs as $indexed) {
        $score = $this->cosineSim($questionVec, $indexed['vec']);
        if ($score < $threshold) {
            continue;
        }
        $meta = $indexed['meta'];
        $hits[] = [
            'chunk_id' => $meta['chunk_id'],
            'title' => 'uploaded: ' . $meta['filename'],
            'section' => null,
            'package_or_corpus' => 'Your upload',
            'excerpt' => dbnToolsExcerpt($meta['text'], 950),
            'chunk_text' => $meta['text'],
            'similarity' => round($score, 4),
            'reranker_score' => null,
            'document_id' => null,
            'source_origin' => 'upload',
            'authority_type' => null,
            'jurisdiction' => null,
        ];
    }

    // Best match first.
    usort($hits, fn(array $left, array $right) => ($right['similarity'] <=> $left['similarity']));

    // Uploads get half of the per-sub-question budget, but always at least one slot.
    $quota = max(1, (int)ceil($limitPerSubQ / 2));

    return array_slice($hits, 0, $quota);
}
|
||
|
||
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first min(count($a), count($b)) components are compared, read by
 * integer offset 0..n-1. Returns 0.0 when either vector is empty or has zero
 * magnitude over the compared components.
 *
 * @param array $a First vector.
 * @param array $b Second vector.
 *
 * @return float Similarity of the compared components.
 */
private function cosineSim(array $a, array $b): float
{
    $dims = min(count($a), count($b));
    if ($dims === 0) {
        return 0.0;
    }

    $dotProduct = 0.0;
    $sumSquaresA = 0.0;
    $sumSquaresB = 0.0;
    $k = 0;
    while ($k < $dims) {
        $left = (float)$a[$k];
        $right = (float)$b[$k];
        $dotProduct += $left * $right;
        $sumSquaresA += $left * $left;
        $sumSquaresB += $right * $right;
        $k++;
    }

    // A zero-magnitude vector has no direction; avoid dividing by zero.
    if ($sumSquaresA === 0.0 || $sumSquaresB === 0.0) {
        return 0.0;
    }

    $magnitude = sqrt($sumSquaresA) * sqrt($sumSquaresB);

    return $dotProduct / $magnitude;
}
|
||
|
||
/**
 * Convert a raw corpus search hit into the source shape used by the rest of the run.
 *
 * Scores are rounded to four decimals when present. The URL, link, label and date
 * fields are initialised to null here; hydrateSourceUrls() fills them in later.
 *
 * @param array  $chunk  Raw hit as returned by the retrieval pipeline.
 * @param string $subQId Id of the sub-question that produced the hit.
 *
 * @return array<string, mixed> Normalised source with source_origin 'corpus'.
 */
private function normalizeCorpusChunk(array $chunk, string $subQId): array
{
    $content = (string)($chunk['content'] ?? '');

    $similarity = null;
    if (isset($chunk['similarity'])) {
        $similarity = round((float)$chunk['similarity'], 4);
    }

    $rerankerScore = null;
    if (isset($chunk['reranker_score'])) {
        $rerankerScore = round((float)$chunk['reranker_score'], 4);
    }

    $chunkId = isset($chunk['id']) ? (int)$chunk['id'] : null;
    $documentId = isset($chunk['document_id']) ? (int)$chunk['document_id'] : null;
    $title = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source');
    $origin = (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal');

    return [
        'chunk_id' => $chunkId,
        'title' => $title,
        'section' => $chunk['section_title'] ?? null,
        'package_or_corpus' => $origin,
        'excerpt' => dbnToolsExcerpt($content, 950),
        'chunk_text' => $content,
        'similarity' => $similarity,
        'reranker_score' => $rerankerScore,
        'document_id' => $documentId,
        'graph_expanded' => !empty($chunk['_graph_expanded']),
        'source_origin' => 'corpus',
        'authority_type' => $chunk['authority_type'] ?? null,
        'jurisdiction' => $chunk['jurisdiction'] ?? null,
        'publication_year' => $chunk['publication_year'] ?? null,
        // Filled in later by hydrateSourceUrls()
        'source_url' => null,
        'deep_link' => null,
        'authority_label' => null,
        'corpus_source_name' => null,
        'publication_date' => null,
        'matched_sub_questions' => [$subQId],
    ];
}
|
||
|
||
/**
 * Post-retrieval filter: drop chunks that don't belong in a family-law research pass.
 *
 * EU AI Act (source_id=7, EUR-Lex) leaks through when the Qdrant search runs
 * unconstrained (e.g. empty shared_doc_ids on slice-resolution failure). It is
 * never relevant to Norwegian family law and is always excluded.
 *
 * DBN website pages (Resource Directory, Flashcards, etc.) are indexed with
 * NULL source_id and score artificially high on broad queries. They are excluded
 * unless the dbn_resources slice is explicitly ON.
 *
 * Matching is done on ASCII-lowercased `source_name`, `document_title` (falling
 * back to `title`) and `source_url`; missing fields are treated as empty strings.
 *
 * @param array $chunk        Raw retrieval row.
 * @param array $activeSlices Slice flags; only the `dbn_resources` key is read.
 * @return bool True when the chunk must be dropped from the pool.
 */
private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
{
    $name  = strtolower((string)($chunk['source_name'] ?? ''));
    $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
    $url   = strtolower((string)($chunk['source_url'] ?? ''));

    // EU AI Act — never relevant to family law research
    if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) {
        return true;
    }
    if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) {
        return true;
    }

    // DBN website pages — allow through only when dbn_resources slice is ON.
    //
    // The dash separators are written as alternations of whole characters rather
    // than a character class: without the `u` modifier a class containing "–"
    // matches any ONE of its UTF-8 bytes, which wrongly flags titles such as
    // "Flashcards “…”" (the curly quote shares the 0xE2 lead byte). Alternation
    // needs no `u` modifier, so it also keeps working on malformed UTF-8 titles.
    $isDbnPage = (
        str_contains($name, 'website')
        || str_contains($title, 'dobetternorge.no')
        || preg_match('/^(homepage|landing|about |contact )/i', $title)
        || str_contains($title, 'resource directory')
        || preg_match('/^flashcards?\s*(?:-|–|—|\|)/i', $title)
        || preg_match('/\|\s*do better norge\s*$/i', $title)
        || preg_match('/(?:-|–|—)\s*do better norge\s*$/i', $title)
    );
    if ($isDbnPage) {
        return !($activeSlices['dbn_resources'] ?? false);
    }

    return false;
}
|
||
|
||
/**
 * Hydrate the synthesisPool in place with source_url/deep_link/authority_label/etc.
 * Uses a direct query against bnl_corpus.documents (only columns that exist there —
 * the temporal columns added in migration 136 are absent on this instance).
 *
 * Steps:
 *  1. Collect the distinct positive document ids of corpus-origin chunks.
 *  2. Load per-document metadata from `documents`.
 *  3. For documents with a NULL summary and non-empty content, ask the LLM for a
 *     short summary and persist it (one call per document; failures are logged
 *     and skipped).
 *  4. Replace the default corpus source name with the name from `corpus_sources`
 *     when the lookup succeeds.
 *  5. Copy the metadata onto every matching corpus chunk.
 *
 * Failure handling: if the document query fails the pool is left untouched.
 * A failing source-name lookup is logged but no longer aborts hydration —
 * chunks keep the default 'Do Better Legal' name and still receive their
 * URLs, deep links and summaries.
 *
 * @param array $pool Synthesis pool, modified in place. Chunks whose
 *                    `source_origin` is not 'corpus' are skipped.
 */
private function hydrateSourceUrls(array &$pool): void
{
    $docIds = [];
    foreach ($pool as $chunk) {
        if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') {
            continue;
        }
        $docId = (int)($chunk['document_id'] ?? 0);
        if ($docId > 0) {
            $docIds[$docId] = true;
        }
    }
    if (empty($docIds)) {
        return;
    }

    try {
        $ragDb = dbnToolsRagDb();
        $ids = array_keys($docIds);
        $ph = implode(',', array_fill(0, count($ids), '?'));

        $stmt = $ragDb->prepare("
            SELECT d.id, d.title, d.source_url, d.authority_type,
                   d.publication_date, d.source_id, d.jurisdiction,
                   d.summary, LEFT(d.content, 4000) AS content_excerpt
            FROM documents d
            WHERE d.id IN ({$ph})
        ");
        $stmt->execute($ids);

        $docMeta = [];
        $sourceIds = [];
        foreach ($stmt as $row) {
            $dId = (int)$row['id'];
            $sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
            if ($sid) {
                $sourceIds[] = $sid;
            }
            $docMeta[$dId] = [
                'source_url' => $row['source_url'] ?? null,
                'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null),
                'publication_date' => $row['publication_date'] ?? null,
                // Default; replaced below when corpus_sources knows the source.
                'corpus_source_name' => 'Do Better Legal',
                'source_id' => $sid,
                'summary' => $row['summary'] ?? null,
                'content_excerpt' => (string)($row['content_excerpt'] ?? ''),
                'title' => (string)($row['title'] ?? ''),
            ];
        }

        // Lazily generate summaries for documents that don't have one yet
        $unsummarized = array_filter(
            $docMeta,
            fn($m) => $m['summary'] === null && $m['content_excerpt'] !== ''
        );
        foreach ($unsummarized as $dId => $m) {
            try {
                $raw = $this->azure->chatText([
                    ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                    ['role' => 'user', 'content' => "Summarise this Norwegian family law document for a legal researcher.\nFocus on: which legal provisions it covers, its authority type, and what questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"],
                ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
                $summary = trim($raw);
                if ($summary !== '') {
                    $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$summary, $dId]);
                    $docMeta[$dId]['summary'] = $summary;
                }
            } catch (Throwable $e) {
                // Best effort: a missing summary must not block hydration.
                error_log('DBN hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage());
            }
        }

        // Enrich with corpus source name from bnl_admin.corpus_sources.
        // Isolated in its own try/catch: this lookup uses a different connection
        // (dbnToolsDb) and its failure should not throw away the document
        // metadata that was already fetched above.
        if (!empty($sourceIds)) {
            try {
                $uSids = array_values(array_unique($sourceIds));
                $sPh = implode(',', array_fill(0, count($uSids), '?'));
                $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
                $sStmt->execute($uSids);

                $srcNames = [];
                foreach ($sStmt as $row) {
                    $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
                }
                foreach ($docMeta as &$m) {
                    if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
                        $m['corpus_source_name'] = $srcNames[$m['source_id']];
                    }
                }
                unset($m);
            } catch (Throwable $e) {
                unset($m); // drop a possibly dangling reference into $docMeta
                error_log('DBN deep research hydrateSourceUrls source-name lookup failed: ' . $e->getMessage());
            }
        }
    } catch (Throwable $e) {
        error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
        return;
    }

    foreach ($pool as &$chunk) {
        if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') {
            continue;
        }
        $docId = (int)($chunk['document_id'] ?? 0);
        if (!$docId || !isset($docMeta[$docId])) {
            continue;
        }
        $meta = $docMeta[$docId];
        $sourceUrl = $meta['source_url'] ?? null;

        $chunk['source_url'] = $sourceUrl;
        $chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
        // Keep any label already on the chunk when the document has none;
        // the inner ?? avoids an undefined-key warning on chunks without the field.
        $chunk['authority_label'] = $meta['authority_label'] ?? ($chunk['authority_label'] ?? null);
        $chunk['corpus_source_name'] = $meta['corpus_source_name'] ?? null;
        $chunk['publication_date'] = $meta['publication_date'] ?? null;
        $chunk['summary'] = $meta['summary'] ?? null;
    }
    unset($chunk);
}
|
||
|
||
/**
 * Construct a clickable URL into the original article. Lovdata supports
 * path-style section anchors (e.g. /§43). For other hosts we return the
 * document root URL.
 *
 * The section number is taken from the first "§" reference in $sectionTitle.
 * Chapter-style numbers are kept whole ("§ 4-12" -> "/§4-12"), an optional
 * single letter directly after the number is kept ("§43a"), and a trailing
 * hyphen that is not followed by digits is ignored ("§43-" -> "/§43").
 *
 * @param string|null $sourceUrl    Document URL; null/empty yields null.
 * @param string|null $sectionTitle Section heading that may contain a "§" reference.
 * @return string|null Deep link for Lovdata URLs with a recognisable section,
 *                     otherwise the trimmed $sourceUrl, or null when there is no URL.
 */
private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
{
    if (!$sourceUrl) {
        return null;
    }
    $sourceUrl = trim($sourceUrl);
    if ($sourceUrl === '') {
        return null;
    }

    $isLovdata = (bool)preg_match('~^https?://lovdata\.no/~i', $sourceUrl);
    if (
        $isLovdata
        && $sectionTitle
        // \d+(?:-\d+)? keeps "4-12" intact; the previous pattern captured only
        // "4-" and produced a broken anchor. \s* tolerates any spacing after "§".
        && preg_match('/§\s*(\d+(?:-\d+)?[A-Za-z]?)/u', $sectionTitle, $m)
    ) {
        return rtrim($sourceUrl, '/') . '/§' . $m[1];
    }

    return $sourceUrl;
}
|
||
|
||
/**
 * Collapse duplicate chunks, rank the result and truncate it to $cap entries.
 *
 * Two chunks are duplicates when they share `source_origin` (default 'corpus')
 * and `chunk_id`. For duplicates the first-seen chunk is kept, its
 * `matched_sub_questions` becomes the de-duplicated union of both lists, and
 * `similarity` / `reranker_score` are raised to the higher of the two values.
 * Chunks without a `chunk_id` are never merged with anything.
 *
 * The merged list is sorted descending by `reranker_score`, falling back to
 * `similarity`, then 0.
 *
 * @param array $rawPool Chunks gathered across all sub-questions.
 * @param int   $cap     Maximum number of chunks to return.
 * @return array Re-indexed list of at most $cap chunks, best score first.
 */
private function mergeAndDedupe(array $rawPool, int $cap): array
{
    $byKey = [];
    $anonymousSeq = 0;

    foreach ($rawPool as $chunk) {
        $origin = $chunk['source_origin'] ?? 'corpus';
        $chunkId = $chunk['chunk_id'] ?? null;

        // Id-less chunks get a per-call sequence number instead of random bytes:
        // guaranteed unique (no accidental merge of unrelated chunks), deterministic,
        // and it cannot fail for lack of an entropy source.
        $key = $chunkId !== null
            ? $origin . ':' . $chunkId
            : $origin . ':#anon-' . $anonymousSeq++;

        if (!isset($byKey[$key])) {
            $byKey[$key] = $chunk;
            continue;
        }

        $existing = $byKey[$key];
        $existing['matched_sub_questions'] = array_values(array_unique(array_merge(
            $existing['matched_sub_questions'] ?? [],
            $chunk['matched_sub_questions'] ?? []
        )));

        // Keep the higher score of each kind
        if (($chunk['similarity'] ?? 0) > ($existing['similarity'] ?? 0)) {
            $existing['similarity'] = $chunk['similarity'];
        }
        if (($chunk['reranker_score'] ?? 0) > ($existing['reranker_score'] ?? 0)) {
            $existing['reranker_score'] = $chunk['reranker_score'];
        }
        $byKey[$key] = $existing;
    }

    $merged = array_values($byKey);
    usort($merged, static function (array $a, array $b): int {
        $aScore = $a['reranker_score'] ?? $a['similarity'] ?? 0;
        $bScore = $b['reranker_score'] ?? $b['similarity'] ?? 0;
        return $bScore <=> $aScore;
    });

    return array_slice($merged, 0, $cap);
}
|
||
|
||
/**
 * Assign each chunk its 1-based citation number under the key `n`.
 *
 * Numbers follow iteration order, not array keys, so a filtered or otherwise
 * non-sequential input still yields a gap-free 1..N sequence.
 *
 * @param array $chunks Chunks in final display order.
 * @return array Re-indexed list of the same chunks, each with `n` set.
 */
private function numberSources(array $chunks): array
{
    $numbered = [];
    $position = 0;
    foreach ($chunks as $chunk) {
        $chunk['n'] = ++$position;
        $numbered[] = $chunk;
    }
    return $numbered;
}
|
||
|
||
/**
 * Ask the selected engine for the final, citation-grounded brief.
 *
 * With no sources, a canned "not enough source support" result is returned
 * without calling any model. Otherwise a prompt is built from the seed, the
 * research brief, the sub-questions and the numbered sources — in advocate
 * form when $advocateRole is non-empty, neutral form otherwise — and sent to
 * the engine-specific backend. A reply that does not decode to an object with
 * a non-empty `brief_markdown` is salvaged as raw markdown.
 *
 * Engine routing:
 *  - dbn_legal_v3 / dbn_legal / gpu  -> dbnToolsCallGpuLlm()
 *  - azure_full                      -> gateway with the 'gpt-4o' deployment
 *  - azure_mini or claude_haiku on a Bedrock gateway -> Haiku deployment (90 s timeout)
 *  - claude_sonnet, or any other engine on a Bedrock gateway -> gateway default,
 *    with extended thinking when enabled via env and supported by the deployment
 *  - anything else                   -> gateway default deployment
 *
 * @param string     $seedDescription User input as presented to the model.
 * @param string     $brief           Research brief from the interpretation step.
 * @param array      $subQuestions    Items with `id` and `question`.
 * @param array      $numberedSources Sources carrying `n`, `title`, `excerpt`, etc.
 * @param string     $engine          Engine key (see routing above).
 * @param string     $language        UI language code.
 * @param float      $temperature     Sampling temperature; capped at 0.20 in advocate mode.
 * @param string     $advocateRole    Non-empty switches to the advocate prompt.
 * @param array|null $priorContext    Optional `original_query` / `brief_summary` from a prior run.
 * @param string     $branchNotes     Researcher notes; only used when $priorContext is non-empty.
 * @param array      $keySignals      Retrieval terms; only included in the advocate prompt.
 * @return array{json: array, deploy_label: string, thinking_trace: mixed}
 */
private function synthesise(
    string $seedDescription,
    string $brief,
    array $subQuestions,
    array $numberedSources,
    string $engine,
    string $language,
    float $temperature,
    string $advocateRole = '',
    ?array $priorContext = null,
    string $branchNotes = '',
    array $keySignals = []
): array {
    $locale = dbnToolsLanguageName($language);

    // Nothing to ground on: return a canned answer, no LLM call.
    if (empty($numberedSources)) {
        return [
            'json' => [
                'brief_markdown' => match (dbnToolsNormalizeUiLanguage($language)) {
                    'no' => 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.',
                    'uk' => 'Я не знайшов достатньої підтримки джерел у корпусі, щоб дати обґрунтовану відповідь.',
                    'pl' => 'Nie znalazłem wystarczającego wsparcia źródłowego w korpusie, aby udzielić ugruntowanej odpowiedzi.',
                    default => 'I did not find enough source support in the corpus to give a grounded answer.',
                },
                'what_we_found' => 'No retrieved sources passed the similarity threshold.',
                'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
                'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
            ],
            'deploy_label' => match ($engine) {
                'gpu' => 'GPU (cuttlefish)',
                'dbn_legal' => 'dbn-legal-agent-v2',
                'dbn_legal_v3' => 'dbn-legal-agent-v3',
                'azure_full' => 'gpt-4o',
                'claude_sonnet' => 'Claude 3.5 Sonnet',
                default => $this->azure->chatDeployment(),
            },
            'thinking_trace' => null,
        ];
    }

    // Optional background from a prior research run (length-capped per field).
    $priorContextSection = '';
    if (!empty($priorContext)) {
        $prior = [];
        if (!empty($priorContext['original_query'])) {
            $prior[] = 'Original research question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
        }
        if (!empty($priorContext['brief_summary'])) {
            $prior[] = "Key findings from prior research:\n" . mb_substr((string)$priorContext['brief_summary'], 0, 600, 'UTF-8');
        }
        if ($branchNotes !== '') {
            $prior[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8');
        }
        if ($prior) {
            $priorContextSection = "\nBackground from prior research:\n" . implode("\n", $prior) . "\n";
        }
    }

    // One text card per source; the leading [n] is what citations refer to.
    $sourcesContext = [];
    foreach ($numberedSources as $s) {
        $sourcesContext[] = sprintf(
            "[%d] (%s) %s%s\n Corpus: %s\n Authority: %s | Jurisdiction: %s\n Excerpt: %s",
            $s['n'],
            $s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus',
            $s['title'],
            !empty($s['section']) ? ' — ' . $s['section'] : '',
            $s['package_or_corpus'],
            $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
            $s['jurisdiction'] ?? 'n/a',
            $s['excerpt']
        );
    }
    $sourcesText = implode("\n\n", $sourcesContext);

    $subQText = '';
    if ($subQuestions) {
        $lines = array_map(
            fn(array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'], $sq['question']),
            $subQuestions,
            array_keys($subQuestions)
        );
        $subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
    }

    $sourceCount = count($numberedSources);
    $lengthGuidance = $sourceCount >= 3
        ? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
        : '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';

    $keySignalsLine = !empty($keySignals)
        ? "\nKey retrieval signals (statutory/factual terms that drove corpus search — ground your brief in these where sources permit):\n" . implode(', ', $keySignals) . "\n"
        : '';

    if ($advocateRole !== '') {
        $prompt = <<<PROMPT
        You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
        Your client: {$advocateRole}
        {$priorContextSection}
        User input:
        {$seedDescription}

        Research brief:
        {$brief}
        {$keySignalsLine}
        {$subQText}

        Sources ({$sourceCount} numbered):
        {$sourcesText}

        Rules — read ALL of these before writing a single word of output:
        - Every factual claim must end with one or more `[n]` markers. A citation is valid ONLY when that source's excerpt explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
        - Do NOT invent statute sections, case names, paragraph numbers, dates, or parties. Copy statute references (e.g. §43, §4-12) and ECHR citations verbatim from the excerpt text — never infer a section number that does not appear in an excerpt.
        - If no source supports a point, omit the point entirely — do NOT speculate.
        - Legal hierarchy: when multiple sources support a claim, prefer the highest-authority source — statute (Barneloven/Barnevernsloven/etc.) > Høyesterett decision > ECHR Grand Chamber > ECHR regular chamber > lower courts > Bufdir guidance.
        - Citation self-check: before writing each [n] marker, confirm that source [n] exists in the list and its excerpt actually supports the specific claim being made.
        - When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
        - `opposing_weaknesses`: OMIT this field by default. Populate it only when ≥2 retrieved sources explicitly support the identified weakness. Do not speculate or infer weaknesses from thin evidence.
        - `brief_markdown` must be {$lengthGuidance} Structure it as: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Procedural rights and obligations {$advocateRole} should assert, (4) Opposing weaknesses — only if `opposing_weaknesses` is non-empty. End with a one-line caveat that this is legal preparation support, not final legal advice.
        - `client_strengths`: 3-6 items, each must include at least one [n] citation.
        - `what_remains_uncertain`: 3-5 honest gaps where evidence is insufficient or law is unclear.
        - Respond in {$locale}.
        - Output valid JSON only — no markdown fences around the JSON object itself.

        Return JSON:
        {
        "brief_markdown": "<advocate brief>",
        "client_strengths": ["<strength with [n]>"],
        "opposing_weaknesses": ["<weakness with [n]>"],
        "what_we_found": "<2-sentence summary of the most relevant retrieved authority for {$advocateRole}>",
        "what_remains_uncertain": ["<gap>"],
        "next_practical_step": "<one concrete action for {$advocateRole} to take next>"
        }
        PROMPT;
    } else {
        $prompt = <<<PROMPT
        You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
        {$priorContextSection}
        User input:
        {$seedDescription}

        Research brief:
        {$brief}
        {$subQText}

        Sources ({$sourceCount} numbered):
        {$sourcesText}

        Return JSON only in {$locale}:
        {
        "brief_markdown": "Markdown legal brief. {$lengthGuidance} Every factual claim ends with one or more inline [n] markers keyed to the sources above. Use level-3 headings (###) sparingly to separate paragraphs by theme when helpful. End with a one-line caveat that this is research support, not legal advice.",
        "what_we_found": "2-4 sentence plain-language summary of the grounded finding",
        "what_remains_uncertain": ["specific gaps — what the corpus did not cover, conflicting authority, or where confidence is limited (3-6 items when sources >= 3)"],
        "next_practical_step": "one concrete next action the user can take to strengthen the case or close a gap"
        }

        Rules:
        - Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
        - A `[n]` citation is only valid when the excerpt for source [n] explicitly states or directly implies the claim — do not cite a source merely because it is on the same topic.
        - If no source supports a point, omit the point — DO NOT speculate.
        - Copy statute section numbers (e.g. §43, §4-12) and ECHR case citations verbatim from the excerpt text — never rephrase or infer a section number that does not appear in an excerpt.
        - When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
        - Respond in {$locale}.
        - Output valid JSON only — no markdown fences around the JSON object itself.
        PROMPT;
    }

    $messages = [
        ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences. Every legal claim must be supported by a source from the numbered list. Do not invent statute sections, case names, paragraph numbers, or dates. If no source supports a point, omit it entirely.'],
        ['role' => 'user', 'content' => $prompt],
    ];
    $synthTemp = ($advocateRole !== '') ? min($temperature, 0.20) : $temperature;
    // Advocate briefs require ~4-6K tokens (brief + strengths + weaknesses + uncertainty).
    // Non-advocate deep-research responses are shorter (~2-3K). Use separate limits.
    $synthMaxTokens = ($advocateRole !== '') ? 6000 : 4000;
    $opts = ['json' => true, 'temperature' => $synthTemp, 'max_tokens' => $synthMaxTokens, 'timeout' => 180];

    // Initialised up front so the code after the try block never reads an
    // undefined variable, whatever dbnToolsAbort() does.
    $raw = '';
    $deployLabel = '';
    $thinkingTrace = null;

    try {
        if ($engine === 'dbn_legal_v3') {
            $response = dbnToolsCallGpuLlm($messages, array_merge($opts, ['model' => 'dbn-legal-agent-v3', 'timeout' => 180]));
            $deployLabel = 'dbn-legal-agent-v3';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'dbn_legal') {
            $response = dbnToolsCallGpuLlm($messages, array_merge($opts, ['model' => 'dbn-legal-agent-v2', 'timeout' => 180]));
            $deployLabel = 'dbn-legal-agent-v2';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'gpu') {
            $response = dbnToolsCallGpuLlm($messages, $opts);
            $deployLabel = 'GPU (cuttlefish)';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'azure_full') {
            $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
            $deployLabel = 'gpt-4o';
        } elseif (
            ($engine === 'azure_mini' || $engine === 'claude_haiku')
            && $this->azure instanceof DbnBedrockGateway
        ) {
            // When Bedrock enabled, azure_mini → Haiku (fast, ~20-50s synthesis).
            // claude_haiku is routed here as well; it previously fell through to
            // the Sonnet branch below and was served and labelled as Sonnet.
            $haiku = $this->azure->withDeployment(DbnBedrockModelRouter::LITELLM_HAIKU);
            $raw = $haiku->chatText($messages, array_merge($opts, ['timeout' => 90]));
            $deployLabel = 'Claude Haiku 4.5 (AWS Bedrock)';
        } elseif ($engine === 'claude_sonnet' || ($this->azure instanceof DbnBedrockGateway)) {
            if (
                $this->azure instanceof DbnBedrockGateway
                && dbnToolsEnv('DBN_BEDROCK_THINKING_ENABLED', 'false') === 'true'
                && DbnBedrockModelRouter::supportsThinking($this->azure->chatDeployment())
            ) {
                // Extended thinking — Pro showcase
                $thinkResult = $this->azure->chatWithThinking($messages, [
                    'max_tokens' => 16000,
                    'thinking_budget' => (int)dbnToolsEnv('DBN_BEDROCK_THINKING_BUDGET', '8000'),
                    'timeout' => 300,
                ]);
                // Cast defensively so a missing/null text reaches the salvage path
                // below instead of raising on the JSON decode call.
                $raw = (string)($thinkResult['text'] ?? '');
                $thinkingTrace = $thinkResult['thinking'] ?? null;
                $deployLabel = 'Claude 3.5 Sonnet (extended thinking)';
            } else {
                $raw = $this->azure->chatText($messages, $opts);
                $deployLabel = $this->azure instanceof DbnBedrockGateway
                    ? 'Claude 3.5 Sonnet'
                    : $this->azure->chatDeployment();
            }
        } else {
            $raw = $this->azure->chatText($messages, $opts);
            $deployLabel = $this->azure->chatDeployment();
        }
    } catch (Throwable $e) {
        dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $json = $this->azure->decodeJsonObject($raw);
    if (!is_array($json) || empty($json['brief_markdown'])) {
        // Salvage as plain markdown
        $json = [
            'brief_markdown' => $raw,
            'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
            'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
            'next_practical_step' => 'Review the brief manually before relying on it.',
        ];
    }

    return [
        'json' => $json,
        'deploy_label' => $deployLabel,
        'thinking_trace' => $thinkingTrace,
    ];
}
|
||
|
||
/**
 * Grade the evidence behind a brief as 'high', 'medium' or 'low'.
 *
 * Each source contributes its `reranker_score`, falling back to `similarity`;
 * non-numeric values are ignored. The grade depends on the number of sources
 * and the best contributed score:
 *   - 'high'   : at least 6 sources and best score >= 0.5
 *   - 'medium' : at least 3 sources and best score >= 0.35
 *   - 'low'    : everything else, including an empty list
 *
 * @param array $sources List of source arrays.
 * @return string One of 'high', 'medium', 'low'.
 */
private function citationConfidence(array $sources): string
{
    if (!$sources) {
        return 'low';
    }

    $pickScore = fn(array $source) => $source['reranker_score'] ?? $source['similarity'] ?? null;

    // Track the largest numeric score; stays 0 when no source has one.
    $best = 0;
    $foundNumeric = false;
    foreach (array_map($pickScore, $sources) as $candidate) {
        if (!is_numeric($candidate)) {
            continue;
        }
        if (!$foundNumeric || $candidate > $best) {
            $best = $candidate;
            $foundNumeric = true;
        }
    }

    $total = count($sources);
    if ($total >= 6 && $best >= 0.5) {
        return 'high';
    }

    return ($total >= 3 && $best >= 0.35) ? 'medium' : 'low';
}
|
||
|
||
/**
 * Run only the interpretation and query-expansion steps and return the
 * proposed sub-questions, without retrieval or synthesis.
 *
 * Aborts with HTTP 422 ('missing_seed') when both the question and the pasted
 * text are empty after trimming. Requires an authenticated client and loads
 * the dbn_v6 helpers before calling the model.
 *
 * @param string     $seedQuery    User question.
 * @param string     $pastedText   Free text pasted by the user.
 * @param string     $engine       Engine key; unknown values fall back to 'azure_mini'.
 * @param string     $language     UI language code (normalised internally).
 * @param array      $controls     Raw controls; normalised via normalizeControls().
 * @param string     $advocateRole Optional advocate role passed to both steps.
 * @param array|null $priorContext Optional context from a prior run.
 * @param string     $branchNotes  Optional researcher notes.
 * @return array{ok: bool, interpretation: array, sub_questions: mixed, fallback: mixed}
 */
public function generateSubQPreview(
    string $seedQuery,
    string $pastedText,
    string $engine,
    string $language,
    array $controls,
    string $advocateRole = '',
    ?array $priorContext = null,
    string $branchNotes = ''
): array {
    $seedQuery = trim($seedQuery);
    $pastedText = trim($pastedText);

    $knownEngines = ['azure_mini', 'azure_full', 'gpu', 'dbn_legal', 'dbn_legal_v3', 'claude_sonnet', 'claude_haiku'];
    if (!in_array($engine, $knownEngines, true)) {
        $engine = 'azure_mini';
    }

    $language = dbnToolsNormalizeUiLanguage($language);
    $controls = $this->normalizeControls($controls);

    $hasSeed = ($seedQuery !== '' || $pastedText !== '');
    if (!$hasSeed) {
        dbnToolsAbort('Provide a question or pasted text.', 422, 'missing_seed');
    }

    // Auth and platform bootstrap must happen before any model call.
    dbnToolsRequireClient();
    dbnToolsBootCaveau();
    $aiPortalRoot = dbnToolsAiPortalRoot();
    require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

    // The preview never includes uploaded files, hence the empty list.
    $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, []);
    $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole, $priorContext, $branchNotes);

    $subQuestionCount = $controls['sub_q_count'];
    $expansion = $this->expandQueries(
        $seedDescription,
        $interpretation['brief'],
        $interpretation['key_signals'],
        $subQuestionCount,
        $language,
        $advocateRole
    );

    $preview = ['ok' => true];
    $preview['interpretation'] = $interpretation;
    $preview['sub_questions'] = $expansion['questions'];
    $preview['fallback'] = $expansion['fallback'] ?? false;

    return $preview;
}
|
||
|
||
/**
 * Build one trace entry describing a pipeline step.
 *
 * @param string $label  Short step name.
 * @param string $detail Human-readable detail for the step.
 * @param string $status Step status; defaults to 'complete'.
 * @return array{label: string, detail: string, status: string}
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    // compact() keeps the keys in the order listed: label, detail, status.
    return compact('label', 'detail', 'status');
}
|
||
|
||
/**
 * Whole milliseconds elapsed since $start.
 *
 * @param float $start A timestamp previously taken with microtime(true).
 * @return int Elapsed time in milliseconds, rounded to the nearest integer.
 */
private function elapsedMs(float $start): int
{
    $elapsedSeconds = microtime(true) - $start;
    $elapsedMillis = round($elapsedSeconds * 1000);

    return (int)$elapsedMillis;
}
|
||
}
|