Add Deep Research tool — agent + rank/rerank RAG

New surface at /deep-research.php where the user pastes a question or uploads PDF/DOCX/TXT case files and an LLM-orchestrated agent researches the Do Better Norge legal corpus from 3-5 angles, with hybrid retrieval, cross-encoder rerank, and synthesis that emits an inline-[n]-cited markdown brief plus a numbered sources panel. Uploaded documents are chunked + embedded in memory only (nomic-embed-text via LiteLLM) and searched alongside the shared corpus during the same request — never persisted to disk, DB, or Qdrant. Reuses ClientRagPipeline::searchAll (hybrid + rerank), dbnV6 slice helpers, and the existing extract.php text-extraction logic via a new dbnToolsExtractUploadedFile() helper. Also adds dbnToolsCallGpuLlm() helper in bootstrap.php — fixes a latent bug where LegalTools.php was already calling that name with no definition. Search.php is unchanged.
2026-05-15 10:30:47 +02:00
parent 55e11cb649
commit 4cbe0a4ac4
10 changed files with 2119 additions and 125 deletions
@@ -0,0 +1,727 @@
+<?php
+declare(strict_types=1);
+
+require_once __DIR__ . '/bootstrap.php';
+require_once __DIR__ . '/AzureOpenAiGateway.php';
+
+/**
+ * Deep-research agent: interprets the user's seed input, expands it into
+ * sub-questions, retrieves evidence from the shared corpus and from uploads
+ * embedded in memory for the current request, and synthesises a cited brief.
+ */
+final class DbnDeepResearchAgent
+{
+    // Character cap applied to the question and to the pasted text in the seed description.
+    private const MAX_SEED_CHARS = 16000;
+    // Character cap applied to each uploaded file's text in the seed description.
+    private const MAX_UPLOAD_CHARS = 64000;
+    // Number of words per upload chunk.
+    private const CHUNK_WORDS = 600;
+    // Number of words shared between consecutive upload chunks.
+    private const CHUNK_OVERLAP_WORDS = 75;
+    // Minimum size of a non-first chunk; smaller trailing slices are not emitted.
+    private const MIN_CHUNK_WORDS = 50;
+    // Maximum number of unique chunks kept after merge/dedupe.
+    private const POOL_CAP = 30;
+
+    // Gateway used for interpretation, expansion and (Azure engines) synthesis.
+    private DbnAzureOpenAiGateway $azure;
+    // Embedding gateway; created inside run() once the AI portal library is loaded.
+    private ?AiGateway $ai = null;
+    // Per-run list of ['meta' => chunk, 'vec' => embedding] entries for uploaded chunks.
+    private array $uploadVecs = [];
+    // Per-run map of step name => elapsed milliseconds.
+    private array $stepTimings = [];
+
+    /**
+     * @param DbnAzureOpenAiGateway|null $azure Gateway to use; a default instance is created when omitted.
+     */
+    public function __construct(?DbnAzureOpenAiGateway $azure = null)
+    {
+        if ($azure === null) {
+            $azure = new DbnAzureOpenAiGateway();
+        }
+        $this->azure = $azure;
+    }
+
+    /**
+     * Executes one deep-research request end to end and returns the response payload.
+     *
+     * Pipeline: seed interpretation, sub-question expansion, slice resolution,
+     * in-memory upload indexing, per-sub-question retrieval (corpus + uploads),
+     * synthesis, and a citation-confidence rating. Every stage appends an entry
+     * to the returned trace; all stages except the confidence rating also record
+     * their elapsed milliseconds.
+     *
+     * Input validation (missing seed, no active slice) happens before any LLM
+     * call so that a rejected request costs nothing.
+     *
+     * @param string $seedQuery      The user's question; may be empty when text or files are supplied.
+     * @param string $pastedText     Free text pasted by the user; may be empty.
+     * @param array  $uploadedFiles  Entries read via the 'filename' and 'text' keys.
+     * @param array  $sliceSelection Raw slice toggles, normalised via dbnV6NormalizeSliceSelection().
+     * @param string $engine         'azure_mini', 'azure_full' or 'gpu'; anything else becomes 'azure_mini'.
+     * @param string $language       'en' or 'no'; anything else becomes 'en'.
+     * @param array  $controls       Tuning knobs, clamped by normalizeControls().
+     * @return array Payload with the brief, numbered sources, sub-questions, trace and trace metadata.
+     */
+    public function run(
+        string $seedQuery,
+        string $pastedText,
+        array  $uploadedFiles,
+        array  $sliceSelection,
+        string $engine,
+        string $language,
+        array  $controls
+    ): array {
+        $seedQuery   = trim($seedQuery);
+        $pastedText  = trim($pastedText);
+        $engine      = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
+        $language    = in_array($language, ['en', 'no'], true) ? $language : 'en';
+
+        $controls = $this->normalizeControls($controls);
+
+        if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) {
+            dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed');
+        }
+
+        $client  = dbnToolsRequireClient();
+        $package = $this->requireFamilyPackage((int)$client['id']);
+
+        dbnToolsBootCaveau();
+        $aiPortalRoot = dbnToolsAiPortalRoot();
+        require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';
+        require_once $aiPortalRoot . '/lib/ai/AiGateway.php';
+
+        // Reject an empty slice selection up front, before the interpretation and
+        // expansion steps spend LLM calls on a request that cannot be served.
+        $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
+        if (!array_filter($sliceSelectionNormalized)) {
+            dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices');
+        }
+
+        // Reset per-run state so a reused agent instance never leaks a previous request's data.
+        $this->ai = new AiGateway();
+        $this->uploadVecs = [];
+        $this->stepTimings = [];
+
+        $trace = [];
+        $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles);
+
+        // STEP 1: Query interpretation — build research brief
+        $stepStart = microtime(true);
+        $interpretation = $this->interpretSeed($seedDescription, $language);
+        $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
+        $trace[] = $this->trace(
+            'Query interpretation',
+            $interpretation['detail'],
+            'complete'
+        );
+
+        // STEP 2: Query expansion
+        $stepStart = microtime(true);
+        $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language);
+        $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
+        $subQuestions = $expansion['questions'];
+        $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
+        $trace[] = $this->trace(
+            'Query expansion',
+            $expansion['fallback']
+                ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.'
+                : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions)),
+            $expansionStatus
+        );
+
+        // STEP 3: Slice resolution
+        $stepStart = microtime(true);
+        $ragDb = dbnToolsRagDb();
+        try {
+            $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
+            $sliceStatus = 'complete';
+            $sliceDetail = sprintf(
+                '%d slice(s) active → %d candidate documents constrain the corpus search.',
+                count(array_filter($sliceSelectionNormalized)),
+                count($sharedDocIds)
+            );
+        } catch (Throwable $e) {
+            // Best effort: a failed resolution downgrades to a warning instead of aborting.
+            error_log('DBN deep research slice resolve failed: ' . $e->getMessage());
+            $sharedDocIds = [];
+            $sliceStatus = 'warning';
+            $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
+        }
+        $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
+        $trace[] = $this->trace('Slice resolution', $sliceDetail, $sliceStatus);
+
+        // STEP 4: Upload indexing (in-memory, ephemeral)
+        $stepStart = microtime(true);
+        $uploadChunks = [];
+        foreach ($uploadedFiles as $idx => $file) {
+            $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
+            $text = (string)($file['text'] ?? '');
+            $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
+        }
+        $uploadStatus = 'complete';
+        $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
+        if ($uploadChunks) {
+            try {
+                $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
+                $vecs = $this->ai->embedBatch($texts, 'nomic-embed-text');
+                // Only trust the embeddings when there is exactly one vector per chunk.
+                if (count($vecs) === count($uploadChunks)) {
+                    foreach ($uploadChunks as $i => $chunk) {
+                        $this->uploadVecs[] = [
+                            'meta' => $chunk,
+                            'vec'  => $vecs[$i],
+                        ];
+                    }
+                } else {
+                    $uploadStatus = 'warning';
+                    $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.';
+                }
+            } catch (Throwable $e) {
+                error_log('DBN deep research upload embed failed: ' . $e->getMessage());
+                $uploadStatus = 'warning';
+                $uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.';
+                $this->uploadVecs = [];
+            }
+        } elseif (empty($uploadedFiles)) {
+            $uploadDetail = 'No files uploaded; agent will research the corpus only.';
+        }
+        $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
+        $trace[] = $this->trace('Upload indexing', $uploadDetail, $uploadStatus);
+
+        // STEP 5: Retrieval (per sub-question)
+        $stepStart = microtime(true);
+        // When expansion fell back, retrieve on a single synthetic sub-question built from the seed.
+        $retrievalQueries = $subQuestions ?: [[
+            'id'        => 'q1',
+            'question'  => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'),
+            'rationale' => 'Seed query (no sub-question expansion).',
+        ]];
+
+        try {
+            $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
+        } catch (Throwable $e) {
+            // Log the cause like every other failure path; the client only sees a generic message.
+            // NOTE(review): dbnToolsAbort() is assumed to terminate the request — confirm.
+            error_log('DBN deep research RAG init failed: ' . $e->getMessage());
+            dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
+        }
+
+        $rawPool = [];
+        $retrievalWarnings = 0;
+        foreach ($retrievalQueries as $sq) {
+            try {
+                $corpusChunks = $rag->searchAll(
+                    $sq['question'],
+                    $controls['chunk_limit'],
+                    null,
+                    [
+                        'search_private'   => false,
+                        'search_shared'    => true,
+                        'package_ids'      => [(int)$package['id']],
+                        'shared_doc_ids'   => $sharedDocIds,
+                        'chunk_limit'      => $controls['chunk_limit'],
+                        'search_method'    => 'hybrid',
+                        'reranker_enabled' => true,
+                    ]
+                );
+            } catch (Throwable $e) {
+                // One failing sub-question must not sink the whole run; count it as a warning.
+                error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage());
+                $corpusChunks = [];
+                $retrievalWarnings++;
+            }
+            foreach ($corpusChunks as $chunk) {
+                $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
+            }
+
+            // Upload chunk retrieval via cosine sim
+            if (!empty($this->uploadVecs)) {
+                $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
+                foreach ($uploadHits as $hit) {
+                    $hit['matched_sub_questions'] = [$sq['id']];
+                    $rawPool[] = $hit;
+                }
+            }
+        }
+
+        $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
+        $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
+        $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
+        $trace[] = $this->trace(
+            'Retrieval',
+            sprintf(
+                '%d sub-question(s) × hybrid + RRF + rerank → %d raw chunks → %d unique after dedupe.',
+                count($retrievalQueries),
+                count($rawPool),
+                count($merged)
+            ),
+            $retrievalStatus
+        );
+
+        // Cap pool to reranker top-K for synthesis
+        $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
+        $numberedSources = $this->numberSources($synthesisPool);
+
+        // STEP 6: Synthesis
+        $stepStart = microtime(true);
+        $synthesis = $this->synthesise(
+            $seedDescription,
+            $interpretation['brief'],
+            $retrievalQueries,
+            $numberedSources,
+            $engine,
+            $language,
+            $controls['temperature']
+        );
+        $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
+        $trace[] = $this->trace(
+            'Synthesis',
+            sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)),
+            'complete'
+        );
+
+        // STEP 7: Confidence
+        $confidence = $this->citationConfidence($numberedSources);
+        $trace[] = $this->trace(
+            'Citation confidence',
+            sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)),
+            $confidence === 'low' ? 'warning' : 'complete'
+        );
+
+        // Stitch sub-question chunk_ids
+        $subQOut = [];
+        foreach ($retrievalQueries as $sq) {
+            $matchedChunks = array_values(array_filter(
+                $numberedSources,
+                fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
+            ));
+            $subQOut[] = [
+                'id'        => $sq['id'],
+                'question'  => $sq['question'],
+                'rationale' => $sq['rationale'] ?? '',
+                'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
+            ];
+        }
+
+        return [
+            'tool'           => 'deep_research',
+            'language'       => $language,
+            'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''),
+            'sub_questions'  => $subQOut,
+            'sources'        => $numberedSources,
+            'what_we_found'  => (string)($synthesis['json']['what_we_found'] ?? ''),
+            'evidence_trail' => $numberedSources,
+            'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
+            'next_practical_step'    => (string)($synthesis['json']['next_practical_step'] ?? ''),
+            'trace' => $trace,
+            'trace_metadata' => [
+                'chunk_count'         => count($merged),
+                'source_count'        => count($numberedSources),
+                'sub_question_count'  => count($retrievalQueries),
+                'upload_chunk_count'  => count($this->uploadVecs),
+                'deployment'          => $synthesis['deploy_label'],
+                'engine_used'         => $engine,
+                'citation_confidence' => $confidence,
+                'elapsed_ms_per_step' => $this->stepTimings,
+                'slices_active'       => array_keys(array_filter($sliceSelectionNormalized)),
+            ],
+            'disclaimer' => dbnToolsDisclaimer($language),
+        ];
+    }
+
+    /**
+     * Clamps every tuning knob into its allowed range, substituting the default
+     * when a knob is absent.
+     *
+     * @param array $controls Raw control values keyed by knob name.
+     * @return array The five knobs, each cast to int or float and clamped.
+     */
+    private function normalizeControls(array $controls): array
+    {
+        // knob => [cast, lower bound, upper bound, default]
+        $spec = [
+            'sub_q_count'          => ['int', 3, 5, 4],
+            'chunk_limit'          => ['int', 4, 10, 6],
+            'similarity_threshold' => ['float', 0.2, 0.6, 0.30],
+            'reranker_top_k'       => ['int', 8, 14, 12],
+            'temperature'          => ['float', 0.05, 0.4, 0.15],
+        ];
+
+        $normalized = [];
+        foreach ($spec as $knob => [$cast, $lower, $upper, $default]) {
+            $raw = $controls[$knob] ?? $default;
+            $value = $cast === 'int' ? (int)$raw : (float)$raw;
+            $normalized[$knob] = max($lower, min($upper, $value));
+        }
+        return $normalized;
+    }
+
+    /**
+     * Loads the 'family-legal' package and aborts with a 503 unless it is active
+     * and the given client holds an active subscription to it.
+     *
+     * @param int $clientId Client whose subscription is checked.
+     * @return array The package row.
+     */
+    private function requireFamilyPackage(int $clientId): array
+    {
+        $package = dbnToolsFetchPackage('family-legal');
+
+        $isUsable = $package && !empty($package['is_active']);
+        if (!$isUsable) {
+            dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
+        }
+
+        $packageId = (int)$package['id'];
+        $isSubscribed = dbnToolsHasActiveSubscription($clientId, $packageId);
+        if (!$isSubscribed) {
+            dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
+        }
+
+        return $package;
+    }
+
+    /**
+     * Assembles the text block that describes the user's input to the LLM steps.
+     *
+     * The question and the pasted text are each cut to MAX_SEED_CHARS characters,
+     * every uploaded file's text to MAX_UPLOAD_CHARS; files without text are
+     * skipped. Sections are separated by a blank line.
+     *
+     * @param string $seedQuery     Trimmed question.
+     * @param string $pastedText    Trimmed pasted text.
+     * @param array  $uploadedFiles Entries read via the 'filename' and 'text' keys.
+     * @return string The combined description (empty when nothing was supplied).
+     */
+    private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string
+    {
+        $clip = static function (string $value, int $limit): string {
+            return mb_substr($value, 0, $limit, 'UTF-8');
+        };
+
+        $sections = [];
+
+        if ($seedQuery !== '') {
+            $sections[] = "Question:\n" . $clip($seedQuery, self::MAX_SEED_CHARS);
+        }
+
+        if ($pastedText !== '') {
+            $sections[] = "Pasted text:\n" . $clip($pastedText, self::MAX_SEED_CHARS);
+        }
+
+        foreach ($uploadedFiles as $idx => $file) {
+            $body = (string)($file['text'] ?? '');
+            $name = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
+            if ($body !== '') {
+                $sections[] = sprintf("Uploaded file [%s]:\n%s", $name, $clip($body, self::MAX_UPLOAD_CHARS));
+            }
+        }
+
+        return implode("\n\n", $sections);
+    }
+
+    /**
+     * Asks the Azure gateway for a short research brief and retrieval keywords.
+     *
+     * The step is best effort: any gateway error or unusable response is logged
+     * (errors only) and replaced by an empty brief so the run can continue on the
+     * raw seed input.
+     *
+     * @param string $seedDescription Combined user input from buildSeedDescription().
+     * @param string $language        'no' for Norwegian output, anything else for English.
+     * @return array ['brief' => string, 'detail' => string]; 'brief' is '' when the step was skipped.
+     */
+    private function interpretSeed(string $seedDescription, string $language): array
+    {
+        $locale = $language === 'no' ? 'Norwegian' : 'English';
+        $prompt = <<<PROMPT
+You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.
+
+Input:
+{$seedDescription}
+
+In {$locale}, produce JSON with:
+{
+  "brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
+  "key_signals": ["short keywords or terms that should drive retrieval"]
+}
+PROMPT;
+
+        try {
+            $raw = $this->azure->chatText([
+                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
+                ['role' => 'user',   'content' => $prompt],
+            ], ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]);
+            $json = $this->azure->decodeJsonObject($raw);
+
+            // Only accept a brief that really is non-blank text; the model may
+            // return an object, a list or whitespace in that slot.
+            $brief = (is_array($json) && is_string($json['brief'] ?? null)) ? trim($json['brief']) : '';
+            if ($brief !== '') {
+                // Keep scalar, non-blank signals only so implode() never sees nested arrays.
+                $signals = is_array($json['key_signals'] ?? null) ? $json['key_signals'] : [];
+                $signals = array_filter(
+                    array_map(
+                        static fn($signal): string => is_scalar($signal) ? trim((string)$signal) : '',
+                        $signals
+                    ),
+                    static fn(string $signal): bool => $signal !== ''
+                );
+                $signalText = implode(', ', array_slice($signals, 0, 6));
+                return [
+                    'brief' => $brief,
+                    'detail' => sprintf('Research focus: %s%s', $brief, $signalText !== '' ? ' — signals: ' . $signalText : ''),
+                ];
+            }
+        } catch (Throwable $e) {
+            error_log('DBN deep research interpretation failed: ' . $e->getMessage());
+        }
+
+        return [
+            'brief' => '',
+            'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
+        ];
+    }
+
+    /**
+     * Asks the Azure gateway to decompose the request into focused sub-questions.
+     *
+     * Returned ids are always sequential ('q1', 'q2', …) in acceptance order,
+     * independent of how the model keyed or numbered its items. Items without a
+     * non-blank string question are skipped. Fewer than two usable sub-questions,
+     * or any gateway error, yields the fallback result.
+     *
+     * @param string $seedDescription Combined user input from buildSeedDescription().
+     * @param string $brief           Research brief from interpretSeed() (may be empty).
+     * @param int    $targetCount     Number of sub-questions requested; also the upper bound kept.
+     * @param string $language        'no' for Norwegian output, anything else for English.
+     * @return array ['questions' => list of ['id','question','rationale'], 'fallback' => bool]
+     */
+    private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language): array
+    {
+        $locale = $language === 'no' ? 'Norwegian' : 'English';
+        $prompt = <<<PROMPT
+You are decomposing a Do Better Norge legal-research request into {$targetCount} focused sub-questions that should each be answered by the legal corpus (Norwegian family law, child welfare, ECHR/Hague).
+
+Research brief:
+{$brief}
+
+Raw input:
+{$seedDescription}
+
+Return JSON only:
+{
+  "sub_questions": [
+    {"id":"q1","question":"... ({$locale})","rationale":"why this angle matters (≤ 140 chars)"}
+  ]
+}
+
+Rules:
+- Exactly {$targetCount} sub-questions, no more, no fewer.
+- Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
+- Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
+- Sub-questions must be self-contained — readable without seeing the seed text.
+- Write the questions in {$locale}.
+PROMPT;
+
+        try {
+            $raw = $this->azure->chatText([
+                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
+                ['role' => 'user',   'content' => $prompt],
+            ], ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 35]);
+            $json = $this->azure->decodeJsonObject($raw);
+            $items = (is_array($json) && is_array($json['sub_questions'] ?? null)) ? $json['sub_questions'] : [];
+            $normalized = [];
+            foreach ($items as $item) {
+                if (!is_array($item) || !is_string($item['question'] ?? null)) {
+                    continue;
+                }
+                // Trim before testing: a whitespace-only question would otherwise
+                // become an empty retrieval query.
+                $question = trim($item['question']);
+                if ($question === '') {
+                    continue;
+                }
+                $rationale = is_scalar($item['rationale'] ?? null) ? trim((string)$item['rationale']) : '';
+                $normalized[] = [
+                    // Number by accepted position, not by the raw item key, so ids
+                    // stay contiguous and string-keyed responses cannot raise a TypeError.
+                    'id'        => 'q' . (count($normalized) + 1),
+                    'question'  => $question,
+                    'rationale' => $rationale,
+                ];
+                if (count($normalized) >= $targetCount) {
+                    break;
+                }
+            }
+            if (count($normalized) >= 2) {
+                return ['questions' => $normalized, 'fallback' => false];
+            }
+        } catch (Throwable $e) {
+            error_log('DBN deep research expansion failed: ' . $e->getMessage());
+        }
+
+        return ['questions' => [], 'fallback' => true];
+    }
+
+    /**
+     * Splits extracted text into overlapping word-window chunks.
+     *
+     * Windows are CHUNK_WORDS long and start every
+     * CHUNK_WORDS - CHUNK_OVERLAP_WORDS words. The first window is always kept;
+     * later windows are kept only when they hold at least MIN_CHUNK_WORDS words.
+     * Chunking stops as soon as a window reaches the end of the text, so no
+     * trailing chunk made purely of already-covered overlap words is produced.
+     *
+     * @param string $text     Extracted document text.
+     * @param string $filename Display name stored on every chunk.
+     * @param int    $fileIdx  Position of the file in the upload list; part of the chunk id.
+     * @return array List of ['chunk_id','file_index','chunk_index','filename','text'] entries.
+     */
+    private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
+    {
+        // Splitting on whitespace runs and dropping empties normalises the text
+        // in one pass; a PCRE failure (e.g. invalid UTF-8) returns false and
+        // results in no chunks.
+        $words = preg_split('/\s+/u', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];
+        if (!$words) {
+            return [];
+        }
+
+        $total = count($words);
+        $stride = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);
+        $chunks = [];
+
+        for ($start = 0; $start < $total; $start += $stride) {
+            $slice = array_slice($words, $start, self::CHUNK_WORDS);
+            if ($start === 0 || count($slice) >= self::MIN_CHUNK_WORDS) {
+                $chunkIdx = count($chunks);
+                $chunks[] = [
+                    'chunk_id'    => sprintf('upload:%d:%d', $fileIdx, $chunkIdx),
+                    'file_index'  => $fileIdx,
+                    'chunk_index' => $chunkIdx,
+                    'filename'    => $filename,
+                    'text'        => implode(' ', $slice),
+                ];
+            }
+            // This window already covers the last word: anything after it would
+            // only repeat the overlap tail.
+            if ($start + self::CHUNK_WORDS >= $total) {
+                break;
+            }
+        }
+
+        return $chunks;
+    }
+
+    /**
+     * Ranks the in-memory upload chunks against one question by cosine similarity.
+     *
+     * Chunks scoring below the threshold are dropped; the best
+     * ceil($limitPerSubQ / 2) (at least one) of the rest are returned. An embed
+     * failure is logged and produces an empty result.
+     *
+     * @param string $question     Sub-question text to embed.
+     * @param int    $limitPerSubQ Per-sub-question chunk limit; uploads get half of it.
+     * @param float  $threshold    Minimum cosine similarity for a chunk to qualify.
+     * @return array Source entries in the same shape as normalised corpus chunks, best first.
+     */
+    private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
+    {
+        if (empty($this->uploadVecs)) {
+            return [];
+        }
+
+        try {
+            $queryVector = $this->ai->embed($question, 'nomic-embed-text');
+        } catch (Throwable $e) {
+            error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage());
+            return [];
+        }
+        if (empty($queryVector)) {
+            return [];
+        }
+
+        $hits = [];
+        foreach ($this->uploadVecs as $entry) {
+            $score = $this->cosineSim($queryVector, $entry['vec']);
+            if ($score >= $threshold) {
+                $meta = $entry['meta'];
+                $hits[] = [
+                    'chunk_id'          => $meta['chunk_id'],
+                    'title'             => 'uploaded: ' . $meta['filename'],
+                    'section'           => null,
+                    'package_or_corpus' => 'Your upload',
+                    'excerpt'           => dbnToolsExcerpt($meta['text'], 620),
+                    'chunk_text'        => $meta['text'],
+                    'similarity'        => round($score, 4),
+                    'reranker_score'    => null,
+                    'document_id'       => null,
+                    'source_origin'     => 'upload',
+                    'authority_type'    => null,
+                    'jurisdiction'      => null,
+                ];
+            }
+        }
+
+        // Highest similarity first.
+        usort($hits, function (array $left, array $right): int {
+            return $right['similarity'] <=> $left['similarity'];
+        });
+
+        $quota = max(1, (int)ceil($limitPerSubQ / 2));
+        return array_slice($hits, 0, $quota);
+    }
+
+    /**
+     * Cosine similarity of two numeric vectors, compared over their common prefix.
+     *
+     * @param array $a First vector (indexed from 0).
+     * @param array $b Second vector (indexed from 0).
+     * @return float The similarity, or 0.0 when either vector is empty or has zero magnitude.
+     */
+    private function cosineSim(array $a, array $b): float
+    {
+        $dims = min(count($a), count($b));
+        if ($dims === 0) {
+            return 0.0;
+        }
+
+        $dotProduct = 0.0;
+        $sumSquaresA = 0.0;
+        $sumSquaresB = 0.0;
+        $k = 0;
+        while ($k < $dims) {
+            $left = (float)$a[$k];
+            $right = (float)$b[$k];
+            $dotProduct  += $left * $right;
+            $sumSquaresA += $left * $left;
+            $sumSquaresB += $right * $right;
+            $k++;
+        }
+
+        // A zero-magnitude vector has no direction; report no similarity.
+        if ($sumSquaresA === 0.0 || $sumSquaresB === 0.0) {
+            return 0.0;
+        }
+        return $dotProduct / (sqrt($sumSquaresA) * sqrt($sumSquaresB));
+    }
+
+    /**
+     * Maps one raw corpus search result onto the agent's source-entry shape.
+     *
+     * @param array  $chunk  Raw chunk as returned by the retrieval pipeline.
+     * @param string $subQId Id of the sub-question that retrieved the chunk.
+     * @return array Source entry tagged with 'source_origin' => 'corpus'.
+     */
+    private function normalizeCorpusChunk(array $chunk, string $subQId): array
+    {
+        // Scores are rounded to four decimals; an absent score stays null.
+        $roundedOrNull = static function (array $row, string $field): ?float {
+            return isset($row[$field]) ? round((float)$row[$field], 4) : null;
+        };
+        $intOrNull = static function (array $row, string $field): ?int {
+            return isset($row[$field]) ? (int)$row[$field] : null;
+        };
+
+        $content = (string)($chunk['content'] ?? '');
+
+        return [
+            'chunk_id'          => $intOrNull($chunk, 'id'),
+            'title'             => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
+            'section'           => $chunk['section_title'] ?? null,
+            'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Norge'),
+            'excerpt'           => dbnToolsExcerpt($content, 620),
+            'chunk_text'        => $content,
+            'similarity'        => $roundedOrNull($chunk, 'similarity'),
+            'reranker_score'    => $roundedOrNull($chunk, 'reranker_score'),
+            'document_id'       => $intOrNull($chunk, 'document_id'),
+            'source_origin'     => 'corpus',
+            'authority_type'    => $chunk['authority_type'] ?? null,
+            'jurisdiction'      => $chunk['jurisdiction'] ?? null,
+            'matched_sub_questions' => [$subQId],
+        ];
+    }
+
+    /**
+     * Collapses duplicate chunks retrieved by several sub-questions, ranks the
+     * result and caps it.
+     *
+     * Chunks are identified by origin plus chunk id; a chunk without an id gets a
+     * random key and is therefore never merged. For duplicates, the matched
+     * sub-question ids are unioned and, per score field, the highest known value
+     * is kept. A missing (null) score never replaces a real one, and a real score
+     * always replaces a missing one, including zero or negative scores.
+     * Ranking uses the reranker score when present, else the similarity, else 0.
+     *
+     * @param array $rawPool Source entries from all sub-questions.
+     * @param int   $cap     Maximum number of entries to return.
+     * @return array Unique entries, best first, at most $cap long.
+     */
+    private function mergeAndDedupe(array $rawPool, int $cap): array
+    {
+        // Null-aware maximum: null means "no score", not "score of zero".
+        $higherScore = static function ($current, $candidate) {
+            if ($candidate === null) {
+                return $current;
+            }
+            if ($current === null) {
+                return $candidate;
+            }
+            return $candidate > $current ? $candidate : $current;
+        };
+
+        $byKey = [];
+        foreach ($rawPool as $chunk) {
+            $key = ($chunk['source_origin'] ?? 'corpus') . ':' . ($chunk['chunk_id'] ?? bin2hex(random_bytes(4)));
+            if (!isset($byKey[$key])) {
+                $byKey[$key] = $chunk;
+                continue;
+            }
+            $existing = $byKey[$key];
+            $existing['matched_sub_questions'] = array_values(array_unique(array_merge(
+                $existing['matched_sub_questions'] ?? [],
+                $chunk['matched_sub_questions'] ?? []
+            )));
+            $existing['similarity'] = $higherScore($existing['similarity'] ?? null, $chunk['similarity'] ?? null);
+            $existing['reranker_score'] = $higherScore($existing['reranker_score'] ?? null, $chunk['reranker_score'] ?? null);
+            $byKey[$key] = $existing;
+        }
+
+        $merged = array_values($byKey);
+        usort($merged, function (array $a, array $b): int {
+            $aScore = $a['reranker_score'] ?? $a['similarity'] ?? 0;
+            $bScore = $b['reranker_score'] ?? $b['similarity'] ?? 0;
+            return $bScore <=> $aScore;
+        });
+        return array_slice($merged, 0, $cap);
+    }
+
+    /**
+     * Adds the citation number 'n' (array key + 1) to every chunk.
+     *
+     * @param array $chunks Ranked source entries.
+     * @return array The same entries, re-indexed from 0, each carrying 'n'.
+     */
+    private function numberSources(array $chunks): array
+    {
+        return array_map(
+            static function (array $entry, $position): array {
+                $entry['n'] = $position + 1;
+                return $entry;
+            },
+            $chunks,
+            array_keys($chunks)
+        );
+    }
+
+    /**
+     * Produces the cited brief from the numbered sources using the selected engine.
+     *
+     * Without sources no LLM is called and a canned "no grounded answer" result
+     * is returned. Otherwise the prompt is sent to the GPU helper, the 'gpt-4o'
+     * deployment, or the gateway's default deployment depending on $engine. A
+     * response that is not usable JSON is salvaged by returning the raw text as
+     * the brief; a JSON response that carries its text under 'answer' instead of
+     * 'brief_markdown' is accepted as the brief.
+     *
+     * A failed LLM request is logged server-side and aborts with a generic 502,
+     * so gateway internals are not echoed to the client.
+     *
+     * @param string $seedDescription Combined user input from buildSeedDescription().
+     * @param string $brief           Research brief (may be empty).
+     * @param array  $subQuestions    Sub-questions that were researched, each with 'id' and 'question'.
+     * @param array  $numberedSources Source entries carrying their citation number 'n'.
+     * @param string $engine          'gpu', 'azure_full', or anything else for the default deployment.
+     * @param string $language        'no' for Norwegian output, anything else for English.
+     * @param float  $temperature     Sampling temperature for the synthesis call.
+     * @return array ['json' => array of brief fields, 'deploy_label' => string]
+     */
+    private function synthesise(
+        string $seedDescription,
+        string $brief,
+        array  $subQuestions,
+        array  $numberedSources,
+        string $engine,
+        string $language,
+        float  $temperature
+    ): array {
+        $locale = $language === 'no' ? 'Norwegian' : 'English';
+
+        if (empty($numberedSources)) {
+            return [
+                'json' => [
+                    'brief_markdown' => $language === 'no'
+                        ? 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.'
+                        : 'I did not find enough source support in the corpus to give a grounded answer.',
+                    'what_we_found' => 'No retrieved sources passed the similarity threshold.',
+                    'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
+                    'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
+                ],
+                'deploy_label' => $engine === 'gpu' ? 'GPU (cuttlefish)' : ($engine === 'azure_full' ? 'gpt-4o' : $this->azure->chatDeployment()),
+            ];
+        }
+
+        // Render each source as a numbered entry; the model cites these numbers.
+        $sourcesContext = [];
+        foreach ($numberedSources as $s) {
+            $sourcesContext[] = sprintf(
+                "[%d] (%s) %s%s\n    Corpus: %s\n    Excerpt: %s",
+                $s['n'],
+                $s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus',
+                $s['title'],
+                !empty($s['section']) ? ' — ' . $s['section'] : '',
+                $s['package_or_corpus'],
+                $s['excerpt']
+            );
+        }
+        $sourcesText = implode("\n\n", $sourcesContext);
+
+        $subQText = '';
+        if ($subQuestions) {
+            $lines = array_map(
+                fn(array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'], $sq['question']),
+                $subQuestions,
+                array_keys($subQuestions)
+            );
+            $subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
+        }
+
+        $prompt = <<<PROMPT
+You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
+
+User input:
+{$seedDescription}
+
+Research brief:
+{$brief}
+{$subQText}
+
+Sources (numbered):
+{$sourcesText}
+
+Return JSON only in {$locale}:
+{
+  "brief_markdown": "Markdown legal brief, 250-700 words, with inline [n] citation markers keyed to the sources above. Use short paragraphs. End with a one-line caveat. Do NOT include headings above level 3 (###).",
+  "what_we_found": "1-2 sentence plain-language summary of the grounded finding",
+  "what_remains_uncertain": ["gaps or caveats — what the corpus did not cover or where confidence is limited"],
+  "next_practical_step": "one concrete next action the user can take"
+}
+
+Rules:
+- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
+- If no source supports a point, omit the point.
+- Respond in {$locale}.
+- Output valid JSON only — no markdown fences around the JSON.
+PROMPT;
+
+        $messages = [
+            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
+            ['role' => 'user',   'content' => $prompt],
+        ];
+        $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 2200, 'timeout' => 120];
+
+        // Pre-initialised so both are defined on every path after the try block.
+        $raw = '';
+        $deployLabel = '';
+        try {
+            if ($engine === 'gpu') {
+                $response = dbnToolsCallGpuLlm($messages, $opts);
+                $deployLabel = 'GPU (cuttlefish)';
+                $raw = (string)($response['choices'][0]['message']['content'] ?? '');
+            } elseif ($engine === 'azure_full') {
+                $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
+                $deployLabel = 'gpt-4o';
+            } else {
+                $raw = $this->azure->chatText($messages, $opts);
+                $deployLabel = $this->azure->chatDeployment();
+            }
+        } catch (Throwable $e) {
+            // Keep the cause in the server log only; exception text can contain
+            // endpoint or deployment details that do not belong in a client response.
+            error_log('DBN deep research synthesis failed: ' . $e->getMessage());
+            dbnToolsAbort('Synthesis LLM request failed.', 502, 'llm_error');
+        }
+
+        $json = $this->azure->decodeJsonObject($raw);
+
+        // Some responses carry the brief under 'answer'; promote it rather than
+        // discarding an otherwise valid structured response.
+        if (is_array($json) && empty($json['brief_markdown']) && is_string($json['answer'] ?? null) && trim($json['answer']) !== '') {
+            $json['brief_markdown'] = $json['answer'];
+        }
+
+        if (!is_array($json) || empty($json['brief_markdown'])) {
+            // Salvage as plain markdown
+            $json = [
+                'brief_markdown' => $raw,
+                'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
+                'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
+                'next_practical_step' => 'Review the brief manually before relying on it.',
+            ];
+        }
+
+        return [
+            'json'         => $json,
+            'deploy_label' => $deployLabel,
+        ];
+    }
+
+    /**
+     * Rates how well the brief is supported, from source count and best score.
+     *
+     * Each source contributes its reranker score, or its similarity when no
+     * reranker score is set. 'high' needs at least 6 sources and a best score of
+     * 0.5 or more; 'medium' needs at least 3 sources and 0.35 or more; everything
+     * else, including no sources at all, is 'low'.
+     *
+     * @param array $sources Numbered source entries.
+     * @return string 'high', 'medium' or 'low'.
+     */
+    private function citationConfidence(array $sources): string
+    {
+        $sourceCount = count($sources);
+        if ($sourceCount === 0) {
+            return 'low';
+        }
+
+        $numericScores = [];
+        foreach ($sources as $source) {
+            $candidate = $source['reranker_score'] ?? $source['similarity'] ?? null;
+            if (is_numeric($candidate)) {
+                $numericScores[] = $candidate;
+            }
+        }
+        $topScore = $numericScores ? max($numericScores) : 0;
+
+        if ($sourceCount >= 6 && $topScore >= 0.5) {
+            return 'high';
+        }
+        return ($sourceCount >= 3 && $topScore >= 0.35) ? 'medium' : 'low';
+    }
+
+    /**
+     * Builds one trace entry.
+     *
+     * @param string $label  Step name.
+     * @param string $detail Human-readable description of what the step did.
+     * @param string $status Step status; defaults to 'complete'.
+     * @return array ['label' => …, 'detail' => …, 'status' => …]
+     */
+    private function trace(string $label, string $detail, string $status = 'complete'): array
+    {
+        return compact('label', 'detail', 'status');
+    }
+
+    /**
+     * Milliseconds elapsed since the given microtime(true) reading, rounded to an integer.
+     *
+     * @param float $start Earlier microtime(true) value.
+     * @return int Elapsed time in milliseconds.
+     */
+    private function elapsedMs(float $start): int
+    {
+        $elapsedSeconds = microtime(true) - $start;
+        $elapsedMillis = round($elapsedSeconds * 1000);
+        return (int)$elapsedMillis;
+    }
+}