dobetternorge-tools/includes/DeepResearchAgent.php

<?php
declare(strict_types=1);

require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';

final class DbnDeepResearchAgent
{
    // Character cap applied separately to the question and to the pasted text
    // when the seed description is assembled.
    private const MAX_SEED_CHARS = 16000;
    // Character cap applied to each uploaded file's text, both in the seed
    // description and before the text is chunked for embedding.
    private const MAX_UPLOAD_CHARS = 64000;
    // Size of one upload chunk window, in whitespace-separated words.
    private const CHUNK_WORDS = 600;
    // Number of words shared between two consecutive upload chunks.
    private const CHUNK_OVERLAP_WORDS = 75;
    // Slices shorter than this are not emitted as chunks, except a file's first slice.
    private const MIN_CHUNK_WORDS = 50;
    // Cap handed to mergeAndDedupe() for the merged retrieval pool.
    private const POOL_CAP = 30;

    // Gateway used for the chat-completion calls made by this agent.
    private DbnAzureOpenAiGateway $azure;
    // Per-run list of ['meta' => chunk metadata, 'vec' => embedding] entries built from uploads.
    private array $uploadVecs = [];
    // Per-run map of step id => elapsed time as returned by elapsedMs().
    private array $stepTimings = [];

    /**
     * Builds the agent around an Azure OpenAI gateway.
     *
     * @param DbnAzureOpenAiGateway|null $azure Gateway to use; when omitted a
     *                                          default-constructed gateway is created.
     */
    public function __construct(?DbnAzureOpenAiGateway $azure = null)
    {
        if ($azure === null) {
            $azure = new DbnAzureOpenAiGateway();
        }
        $this->azure = $azure;
    }

    /**
     * Runs one deep-research (or advocate) pass end to end.
     *
     * Steps, in order: seed interpretation, sub-question expansion, slice
     * resolution, in-memory upload indexing, per-sub-question retrieval
     * (corpus + uploads), synthesis, and citation confidence. Each step is
     * recorded in the returned trace and, when $emit is given, streamed to it.
     *
     * Aborts through dbnToolsAbort() when no seed input is supplied (422),
     * when no corpus slice is enabled (422), or when the retrieval pipeline
     * cannot be constructed (503). Failures inside interpretation, expansion,
     * slice resolution, upload embedding and individual sub-question
     * retrievals are logged and downgraded to warnings instead.
     *
     * @param string        $seedQuery      User question (trimmed here).
     * @param string        $pastedText     Free text pasted by the user (trimmed here).
     * @param array         $uploadedFiles  Entries read via the 'filename' and 'text' keys.
     * @param array         $sliceSelection Raw slice toggles; normalised by dbnV6NormalizeSliceSelection().
     * @param string        $engine         'azure_mini', 'azure_full' or 'gpu'; anything else becomes 'azure_mini'.
     * @param string        $language       'en' or 'no'; anything else becomes 'en'.
     * @param array         $controls       Tuning knobs; clamped by normalizeControls().
     * @param callable|null $emit           Optional callback invoked as $emit(string $event, array $payload)
     *                                      with the events 'step', 'progress' and 'subq'.
     * @param string        $advocateRole   Non-empty switches the run to advocate mode.
     *
     * @return array Result payload: brief, sub-questions, numbered sources, trace and trace metadata.
     */
    public function run(
        string $seedQuery,
        string $pastedText,
        array  $uploadedFiles,
        array  $sliceSelection,
        string $engine,
        string $language,
        array  $controls,
        ?callable $emit = null,
        string $advocateRole = ''
    ): array {
        $seedQuery   = trim($seedQuery);
        $pastedText  = trim($pastedText);
        $engine      = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
        $language    = in_array($language, ['en', 'no'], true) ? $language : 'en';

        $controls = $this->normalizeControls($controls);

        if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) {
            dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed');
        }

        $client  = dbnToolsRequireClient();
        $package = $this->requireFamilyPackage((int)$client['id']);

        dbnToolsBootCaveau();
        $aiPortalRoot = dbnToolsAiPortalRoot();
        require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

        // Per-run state: never carry embeddings or timings over from a previous run().
        $this->uploadVecs = [];
        $this->stepTimings = [];

        $trace = [];
        $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles);

        // Records a finished step in the trace and streams it to the caller.
        $emitStep = function (string $stepId, string $label, string $detail, string $status) use (&$trace, $emit): void {
            $trace[] = $this->trace($label, $detail, $status);
            if ($emit) {
                $emit('step', [
                    'step'   => $stepId,
                    'label'  => $label,
                    'detail' => $detail,
                    'status' => $status,
                ]);
            }
        };
        // Streams a "running" notice only; nothing is added to the trace.
        $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
            if ($emit) {
                $emit('step', [
                    'step'   => $stepId,
                    'label'  => $label,
                    'detail' => $detail,
                    'status' => 'running',
                ]);
            }
        };

        // STEP 1: Query interpretation
        $emitRunning('interpretation', 'Query interpretation', 'Summarising the seed input…');
        $stepStart = microtime(true);
        $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole);
        $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
        $emitStep('interpretation', 'Query interpretation', $interpretation['detail'], 'complete');

        // STEP 2: Query expansion
        $emitRunning('expansion', 'Query expansion', 'Generating sub-questions…');
        $stepStart = microtime(true);
        $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language, $advocateRole);
        $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
        $subQuestions = $expansion['questions'];
        $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
        $expansionDetail = $expansion['fallback']
            ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.'
            : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions));
        $emitStep('expansion', 'Query expansion', $expansionDetail, $expansionStatus);

        // STEP 3: Slice resolution
        $emitRunning('slice_resolution', 'Slice resolution', 'Resolving slice toggles to document IDs…');
        $stepStart = microtime(true);
        $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
        if (!array_filter($sliceSelectionNormalized)) {
            dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices');
        }
        $ragDb = dbnToolsRagDb();
        try {
            $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
            $sliceStatus = 'complete';
            $sliceDetail = sprintf(
                '%d slice(s) active → %d candidate documents constrain the corpus search.',
                count(array_filter($sliceSelectionNormalized)),
                count($sharedDocIds)
            );
        } catch (Throwable $e) {
            error_log('DBN deep research slice resolve failed: ' . $e->getMessage());
            $sharedDocIds = [];
            $sliceStatus = 'warning';
            $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
        }
        $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
        $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

        // STEP 4: Upload indexing (in-memory, ephemeral)
        $emitRunning('upload_indexing', 'Upload indexing', empty($uploadedFiles)
            ? 'No uploads; skipping…'
            : sprintf('Chunking + embedding %d file(s) in memory…', count($uploadedFiles)));
        $stepStart = microtime(true);
        $uploadChunks = [];
        foreach ($uploadedFiles as $idx => $file) {
            $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
            // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
            $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
            $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
        }
        $uploadStatus = 'complete';
        $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
        if ($uploadChunks) {
            try {
                // Embed in small batches of 5, emitting progress between each so the stream
                // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
                $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
                $textCount = count($texts);
                $allVecs = [];
                $batchSize = 5;
                for ($b = 0; $b < $textCount; $b += $batchSize) {
                    $batch = array_slice($texts, $b, $batchSize);
                    if ($emit) {
                        $emit('progress', ['detail' => sprintf(
                            'Embedding chunks %d–%d of %d…',
                            $b + 1, $b + count($batch), $textCount
                        )]);
                    }
                    $batchVecs = dbnToolsLiteLLMEmbedBatch($batch);
                    $allVecs = array_merge($allVecs, $batchVecs);
                }
                $vecs = $allVecs;
                // Only index uploads when every chunk received exactly one vector.
                if (count($vecs) === count($uploadChunks)) {
                    foreach ($uploadChunks as $i => $chunk) {
                        $this->uploadVecs[] = [
                            'meta' => $chunk,
                            'vec'  => $vecs[$i],
                        ];
                    }
                } else {
                    $uploadStatus = 'warning';
                    $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.';
                }
            } catch (Throwable $e) {
                error_log('DBN deep research upload embed failed: ' . $e->getMessage());
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
                $this->uploadVecs = [];
            }
        } elseif (empty($uploadedFiles)) {
            $uploadDetail = 'No files uploaded; agent will research the corpus only.';
        }
        $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
        $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

        // STEP 5: Retrieval (per sub-question)
        // Without usable sub-questions, retrieve once on the seed query (or the brief).
        $retrievalQueries = $subQuestions ?: [[
            'id'        => 'q1',
            'question'  => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'),
            'rationale' => 'Seed query (no sub-question expansion).',
        ]];
        $emitRunning('retrieval', 'Retrieval', sprintf('Hybrid vector + keyword + rerank across %d sub-question(s)…', count($retrievalQueries)));
        $stepStart = microtime(true);

        try {
            $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
        } catch (Throwable $e) {
            // Keep the root cause in the server log; the client only sees the generic 503.
            error_log('DBN deep research RAG pipeline init failed: ' . $e->getMessage());
            dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
        }

        $rawPool = [];
        $retrievalWarnings = 0;
        $rawCorpusCount = 0;
        $rawUploadCount = 0;
        $filteredOutCount = 0;
        foreach ($retrievalQueries as $idx => $sq) {
            if ($emit) {
                $emit('subq', [
                    'index'    => $idx + 1,
                    'total'    => count($retrievalQueries),
                    'id'       => $sq['id'],
                    'question' => $sq['question'],
                ]);
            }
            try {
                $corpusChunks = $rag->searchAll(
                    $sq['question'],
                    $controls['chunk_limit'],
                    null,
                    [
                        'search_private'         => false,
                        'search_shared'          => true,
                        'package_ids'            => [(int)$package['id']],
                        'shared_doc_ids'         => $sharedDocIds,
                        'chunk_limit'            => $controls['chunk_limit'],
                        'search_method'          => 'hybrid',
                        'reranker_enabled'       => true,
                        'include_beta_website'   => false,
                        'include_primary_website'=> false,
                    ]
                );
            } catch (Throwable $e) {
                // One failed sub-question must not sink the whole run.
                error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage());
                $corpusChunks = [];
                $retrievalWarnings++;
            }
            $rawCorpusCount += count($corpusChunks);
            foreach ($corpusChunks as $chunk) {
                if ($this->isWebsiteChunk($chunk)) {
                    $filteredOutCount++;
                    continue;
                }
                $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
            }

            // Upload chunk retrieval via cosine sim
            if (!empty($this->uploadVecs)) {
                $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
                $rawUploadCount += count($uploadHits);
                foreach ($uploadHits as $hit) {
                    $hit['matched_sub_questions'] = [$sq['id']];
                    $rawPool[] = $hit;
                }
            }
        }

        $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
        $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
        $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
        $retrievalDetail = sprintf(
            '%d sub-question(s) × hybrid + RRF + rerank → %d corpus chunks (%d filtered) + %d upload hits → %d unique after dedupe.',
            count($retrievalQueries),
            $rawCorpusCount,
            $filteredOutCount,
            $rawUploadCount,
            count($merged)
        );
        $emitStep('retrieval', 'Retrieval', $retrievalDetail, $retrievalStatus);

        // Cap pool to reranker top-K for synthesis
        $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);

        // Hydrate corpus sources with source_url + authority_label via batched dbn_v6 query
        $this->hydrateSourceUrls($synthesisPool);

        $numberedSources = $this->numberSources($synthesisPool);

        $retrievalCounts = [
            'raw_corpus'         => $rawCorpusCount,
            'filtered_website'   => $filteredOutCount,
            'post_filter_corpus' => $rawCorpusCount - $filteredOutCount,
            'raw_upload'         => $rawUploadCount,
            'after_dedupe'       => count($merged),
            'after_topk'         => count($numberedSources),
        ];

        // STEP 6: Synthesis
        $synthesisEngineLabel = $engine === 'azure_full' ? 'Azure gpt-4o' : ($engine === 'gpu' ? 'GPU qwen2.5:14b' : 'Azure gpt-4o-mini');
        $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising cited brief with %s — this is the slowest step…', $synthesisEngineLabel));
        $stepStart = microtime(true);
        $synthesis = $this->synthesise(
            $seedDescription,
            $interpretation['brief'],
            $retrievalQueries,
            $numberedSources,
            $engine,
            $language,
            $controls['temperature'],
            $advocateRole
        );
        $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
        $emitStep(
            'synthesis',
            'Synthesis',
            sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)),
            'complete'
        );

        // STEP 7: Confidence
        $confidence = $this->citationConfidence($numberedSources);
        $emitStep(
            'confidence',
            'Citation confidence',
            sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)),
            $confidence === 'low' ? 'warning' : 'complete'
        );

        // Stitch sub-question chunk_ids + top_sources (top 3 sources matched by each sub-Q)
        $subQOut = [];
        foreach ($retrievalQueries as $sq) {
            $matchedChunks = array_values(array_filter(
                $numberedSources,
                fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
            ));
            $topSources = array_slice($matchedChunks, 0, 3);
            $subQOut[] = [
                'id'          => $sq['id'],
                'question'    => $sq['question'],
                'rationale'   => $sq['rationale'] ?? '',
                'chunk_ids'   => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
                'top_sources' => array_map(fn(array $s) => [
                    'n'              => $s['n'] ?? null,
                    'title'          => $s['title'] ?? '',
                    'section'        => $s['section'] ?? null,
                    'deep_link'      => $s['deep_link'] ?? $s['source_url'] ?? null,
                    'source_url'     => $s['source_url'] ?? null,
                    'source_origin'  => $s['source_origin'] ?? 'corpus',
                    'authority_label'=> $s['authority_label'] ?? null,
                    'excerpt'        => $s['excerpt'] ?? '',
                ], $topSources),
            ];
        }

        // Advocate-only fields are null (not empty) in plain deep-research mode.
        $isAdvocate = $advocateRole !== '';
        return [
            'tool'           => $isAdvocate ? 'advocate' : 'deep_research',
            'language'       => $language,
            'advocate_role'  => $isAdvocate ? $advocateRole : null,
            'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''),
            'client_strengths'    => $isAdvocate ? ($synthesis['json']['client_strengths'] ?? []) : null,
            'opposing_weaknesses' => $isAdvocate ? ($synthesis['json']['opposing_weaknesses'] ?? []) : null,
            'sub_questions'  => $subQOut,
            'sources'        => $numberedSources,
            'what_we_found'  => (string)($synthesis['json']['what_we_found'] ?? ''),
            'evidence_trail' => $numberedSources,
            'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
            'next_practical_step'    => (string)($synthesis['json']['next_practical_step'] ?? ''),
            'trace' => $trace,
            'trace_metadata' => [
                'chunk_count'         => count($merged),
                'source_count'        => count($numberedSources),
                'sub_question_count'  => count($retrievalQueries),
                'upload_chunk_count'  => count($this->uploadVecs),
                'deployment'          => $synthesis['deploy_label'],
                'engine_used'         => $engine,
                'citation_confidence' => $confidence,
                'elapsed_ms_per_step' => $this->stepTimings,
                'retrieval_counts'    => $retrievalCounts,
                'slices_active'       => array_keys(array_filter($sliceSelectionNormalized)),
            ],
            'disclaimer' => dbnToolsDisclaimer($language),
        ];
    }

    /**
     * Clamps caller-supplied tuning controls into their supported ranges,
     * substituting a default for every missing key.
     *
     * @param array $controls Raw controls; unknown keys are ignored.
     *
     * @return array{sub_q_count:int, chunk_limit:int, similarity_threshold:float, reranker_top_k:int, temperature:float}
     */
    private function normalizeControls(array $controls): array
    {
        // Pins $value inside [$low, $high].
        $clamp = static fn($value, $low, $high) => max($low, min($high, $value));

        $subQuestionCount = (int)($controls['sub_q_count'] ?? 4);
        $chunkLimit       = (int)($controls['chunk_limit'] ?? 6);
        $similarityFloor  = (float)($controls['similarity_threshold'] ?? 0.30);
        $rerankerTopK     = (int)($controls['reranker_top_k'] ?? 12);
        $temperature      = (float)($controls['temperature'] ?? 0.15);

        $normalized = [];
        $normalized['sub_q_count']          = $clamp($subQuestionCount, 3, 5);
        $normalized['chunk_limit']          = $clamp($chunkLimit, 4, 10);
        $normalized['similarity_threshold'] = $clamp($similarityFloor, 0.2, 0.6);
        $normalized['reranker_top_k']       = $clamp($rerankerTopK, 8, 14);
        $normalized['temperature']          = $clamp($temperature, 0.05, 0.4);

        return $normalized;
    }

    /**
     * Fetches the 'family-legal' package and verifies it can be used by the client.
     *
     * Calls dbnToolsAbort() with a 503 when the package is missing or inactive,
     * or when the client has no active subscription to it.
     *
     * @param int $clientId Client whose subscription is checked.
     *
     * @return array The package row as returned by dbnToolsFetchPackage().
     */
    private function requireFamilyPackage(int $clientId): array
    {
        $familyPackage = dbnToolsFetchPackage('family-legal');

        $isUsable = $familyPackage && !empty($familyPackage['is_active']);
        if (!$isUsable) {
            dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
        }

        $packageId = (int)$familyPackage['id'];
        $subscribed = dbnToolsHasActiveSubscription($clientId, $packageId);
        if (!$subscribed) {
            dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
        }

        return $familyPackage;
    }

    /**
     * Assembles the labelled text block that describes the user's seed input.
     *
     * Sections appear in the order question, pasted text, uploaded files and are
     * separated by a blank line. Empty inputs and uploads without text are omitted.
     * Question and pasted text are capped at MAX_SEED_CHARS each; every upload is
     * capped at MAX_UPLOAD_CHARS.
     *
     * @param string $seedQuery     User question.
     * @param string $pastedText    Pasted free text.
     * @param array  $uploadedFiles Entries read via the 'filename' and 'text' keys.
     *
     * @return string The combined description; empty when nothing was supplied.
     */
    private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string
    {
        $sections = [];

        $headed = [
            "Question:\n"    => $seedQuery,
            "Pasted text:\n" => $pastedText,
        ];
        foreach ($headed as $heading => $value) {
            if ($value !== '') {
                $sections[] = $heading . mb_substr($value, 0, self::MAX_SEED_CHARS, 'UTF-8');
            }
        }

        foreach ($uploadedFiles as $position => $upload) {
            // Files without a name get a 1-based placeholder.
            $label = (string)($upload['filename'] ?? sprintf('upload-%d', $position + 1));
            $body = (string)($upload['text'] ?? '');
            if ($body !== '') {
                $sections[] = sprintf(
                    "Uploaded file [%s]:\n%s",
                    $label,
                    mb_substr($body, 0, self::MAX_UPLOAD_CHARS, 'UTF-8')
                );
            }
        }

        return implode("\n\n", $sections);
    }

    /**
     * Asks the chat model for a short research brief and retrieval keywords
     * describing the seed input.
     *
     * The model reply is treated as untrusted: the brief must be a non-blank
     * string and only scalar, non-blank key signals are kept (first six shown).
     * Any failure — exception, unparsable JSON, missing brief — is logged where
     * applicable and answered with an empty brief so the run can continue.
     *
     * @param string $seedDescription Text produced by buildSeedDescription().
     * @param string $language        'no' requests Norwegian output; anything else English.
     * @param string $advocateRole    Non-empty adds an advocate framing to the prompt.
     *
     * @return array{brief:string, detail:string} 'brief' is '' when interpretation was skipped.
     */
    private function interpretSeed(string $seedDescription, string $language, string $advocateRole = ''): array
    {
        $locale = $language === 'no' ? 'Norwegian' : 'English';
        $rolePrefix = $advocateRole !== ''
            ? "You are preparing a case-research brief for: {$advocateRole}. Frame your interpretation to identify the strongest legal angles for this party.\n\n"
            : '';
        $prompt = <<<PROMPT
{$rolePrefix}You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.

Input:
{$seedDescription}

In {$locale}, produce JSON with:
{
  "brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
  "key_signals": ["short keywords or terms that should drive retrieval"]
}
PROMPT;

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]);
            $json = $this->azure->decodeJsonObject($raw);

            // Accept the brief only when it is real text; arrays/objects would
            // otherwise be string-cast to "Array" or raise.
            $brief = (is_array($json) && is_string($json['brief'] ?? null))
                ? trim($json['brief'])
                : '';

            if ($brief !== '') {
                $rawSignals = is_array($json['key_signals'] ?? null) ? $json['key_signals'] : [];
                // Keep scalar, non-blank signals only so implode() never sees nested values.
                $signals = [];
                foreach ($rawSignals as $signal) {
                    if (!is_scalar($signal)) {
                        continue;
                    }
                    $signal = trim((string)$signal);
                    if ($signal !== '') {
                        $signals[] = $signal;
                    }
                }
                $signalText = implode(', ', array_slice($signals, 0, 6));

                return [
                    'brief' => $brief,
                    'detail' => sprintf(
                        'Research focus: %s%s',
                        $brief,
                        $signalText !== '' ? ' — signals: ' . $signalText : ''
                    ),
                ];
            }
        } catch (Throwable $e) {
            // Interpretation is best-effort; the run continues on the raw seed.
            error_log('DBN deep research interpretation failed: ' . $e->getMessage());
        }

        return [
            'brief' => '',
            'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
        ];
    }

    /**
     * Asks the chat model to decompose the research request into focused
     * sub-questions, using an advocate-specific prompt when a role is given.
     *
     * Model output is validated item by item: entries that are not arrays, or
     * whose question is not scalar or is blank after trimming, are skipped.
     * Accepted questions receive sequential ids (q1, q2, …) regardless of the
     * keys or gaps in the model's array, and at most $targetCount are kept.
     * Fewer than two usable questions, or any exception, yields the fallback.
     *
     * @param string $seedDescription Text produced by buildSeedDescription().
     * @param string $brief           Research brief from interpretSeed() (may be empty).
     * @param int    $targetCount     Number of sub-questions requested from the model.
     * @param string $language        'no' requests Norwegian output; anything else English.
     * @param string $advocateRole    Non-empty selects the advocate prompt.
     *
     * @return array{questions: list<array{id:string, question:string, rationale:string}>, fallback: bool}
     */
    private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language, string $advocateRole = ''): array
    {
        $locale = $language === 'no' ? 'Norwegian' : 'English';

        if ($advocateRole !== '') {
            $prompt = <<<PROMPT
You are a Norwegian family-law research assistant building a case for: {$advocateRole}.
Generate exactly {$targetCount} targeted sub-questions designed to find:
1. Lovdata statutes and ECHR/Hague precedents that support {$advocateRole}'s position.
2. Procedural rights and obligations the opposing party must satisfy — failures here help {$advocateRole}.
3. Case law that exposes weaknesses in the opposing party's likely arguments.
4. Specific articles, paragraphs, or judgments {$advocateRole}'s representative should cite.

Research brief:
{$brief}

Raw input:
{$seedDescription}

Return JSON only in {$locale}:
{
  "sub_questions": [
    {"id":"q1","question":"...","rationale":"how finding this strengthens {$advocateRole}'s case (≤ 140 chars)"}
  ]
}

Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Every question must be answerable from Norwegian family-law, child-welfare, or ECHR/Hague sources.
- Each question must cover a DIFFERENT angle (supporting statute, procedural right, opposing weakness, ECHR precedent, evidentiary frame).
- Sub-questions must be self-contained — readable without the raw input.
- Write the questions in {$locale}.
PROMPT;
        } else {
            $prompt = <<<PROMPT
You are decomposing a Do Better Norge legal-research request into {$targetCount} focused sub-questions that should each be answered by the legal corpus (Norwegian family law, child welfare, ECHR/Hague).

Research brief:
{$brief}

Raw input:
{$seedDescription}

Return JSON only:
{
  "sub_questions": [
    {"id":"q1","question":"... ({$locale})","rationale":"why this angle matters (≤ 140 chars)"}
  ]
}

Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
- Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
- Sub-questions must be self-contained — readable without seeing the seed text.
- Write the questions in {$locale}.
PROMPT;
        }

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 35]);
            $json = $this->azure->decodeJsonObject($raw);
            $items = is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : [];

            $normalized = [];
            foreach ($items as $item) {
                if (!is_array($item) || !is_scalar($item['question'] ?? null)) {
                    continue;
                }
                // Trim before the emptiness check so a whitespace-only question
                // can never become an empty retrieval query.
                $question = trim((string)$item['question']);
                if ($question === '') {
                    continue;
                }
                $rationale = is_scalar($item['rationale'] ?? null) ? trim((string)$item['rationale']) : '';

                $normalized[] = [
                    // Number by accepted position, not by the model's array key:
                    // ids stay contiguous and keyed (object-style) arrays still work.
                    'id'        => 'q' . (count($normalized) + 1),
                    'question'  => $question,
                    'rationale' => $rationale,
                ];
                if (count($normalized) >= $targetCount) {
                    break;
                }
            }
            if (count($normalized) >= 2) {
                return ['questions' => $normalized, 'fallback' => false];
            }
        } catch (Throwable $e) {
            // Expansion is best-effort; the caller retrieves on the seed query instead.
            error_log('DBN deep research expansion failed: ' . $e->getMessage());
        }

        return ['questions' => [], 'fallback' => true];
    }

    /**
     * Splits upload text into overlapping word windows for embedding.
     *
     * Whitespace is collapsed first. Windows are CHUNK_WORDS long and start
     * every (CHUNK_WORDS - CHUNK_OVERLAP_WORDS) words. A window shorter than
     * MIN_CHUNK_WORDS is emitted only when it is the file's first window.
     * Windowing stops as soon as a window reaches the last word, so a text
     * whose length lands exactly on a window boundary does not get an extra
     * trailing chunk made solely of overlap.
     *
     * @param string $text     Raw upload text.
     * @param string $filename Name stored on every chunk.
     * @param int    $fileIdx  Index of the file; part of each chunk id.
     *
     * @return list<array{chunk_id:string, file_index:int, chunk_index:int, filename:string, text:string}>
     *         Empty when the text is blank (or whitespace normalisation fails).
     */
    private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
    {
        // preg_replace() returns null on failure; treat that as "no text".
        $text = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
        if ($text === '') {
            return [];
        }
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        if (!$words) {
            return [];
        }

        $total = count($words);
        // Guard against a non-positive stride if the constants are ever changed.
        $advance = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);

        $chunks = [];
        $chunkIdx = 0;
        for ($i = 0; $i < $total; $i += $advance) {
            $slice = array_slice($words, $i, self::CHUNK_WORDS);
            $sliceLen = count($slice);

            if ($sliceLen >= self::MIN_CHUNK_WORDS || $i === 0) {
                $chunks[] = [
                    'chunk_id'   => sprintf('upload:%d:%d', $fileIdx, $chunkIdx),
                    'file_index' => $fileIdx,
                    'chunk_index'=> $chunkIdx,
                    'filename'   => $filename,
                    'text'       => implode(' ', $slice),
                ];
                $chunkIdx++;
            }

            // This window already covers the final word: any further window
            // would contain nothing but words repeated from this one.
            if ($i + $sliceLen >= $total) {
                break;
            }
        }
        return $chunks;
    }

    /**
     * Ranks the in-memory upload chunks against one question by cosine similarity.
     *
     * The question is embedded with dbnToolsLiteLLMEmbedBatch(); an embedding
     * failure is logged and produces no hits. Chunks scoring below $threshold
     * are dropped, the rest are sorted by descending (rounded) similarity and
     * the top ceil($limitPerSubQ / 2) — at least one — are returned.
     *
     * @param string $question     Sub-question text to embed.
     * @param int    $limitPerSubQ Per-sub-question chunk limit; uploads get half of it.
     * @param float  $threshold    Minimum cosine similarity for a hit.
     *
     * @return array List of source-shaped hit arrays with source_origin 'upload'.
     */
    private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
    {
        if (empty($this->uploadVecs)) {
            return [];
        }

        try {
            $questionVec = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? [];
        } catch (Throwable $e) {
            error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage());
            return [];
        }
        if (empty($questionVec)) {
            return [];
        }

        $hits = [];
        foreach ($this->uploadVecs as $indexed) {
            $score = $this->cosineSim($questionVec, $indexed['vec']);
            if ($score < $threshold) {
                continue;
            }
            $meta = $indexed['meta'];
            $hit = [];
            $hit['chunk_id']          = $meta['chunk_id'];
            $hit['title']             = 'uploaded: ' . $meta['filename'];
            $hit['section']           = null;
            $hit['package_or_corpus'] = 'Your upload';
            $hit['excerpt']           = dbnToolsExcerpt($meta['text'], 620);
            $hit['chunk_text']        = $meta['text'];
            $hit['similarity']        = round($score, 4);
            $hit['reranker_score']    = null;
            $hit['document_id']       = null;
            $hit['source_origin']     = 'upload';
            $hit['authority_type']    = null;
            $hit['jurisdiction']      = null;
            $hits[] = $hit;
        }

        // Best match first.
        usort($hits, function (array $left, array $right): int {
            return $right['similarity'] <=> $left['similarity'];
        });

        $quota = max(1, (int)ceil($limitPerSubQ / 2));
        return array_slice($hits, 0, $quota);
    }

    /**
     * Cosine similarity of two numeric vectors.
     *
     * Only the first min(count($a), count($b)) positions are compared, read by
     * integer index. Returns 0.0 when either vector is empty or has zero norm
     * over the compared positions.
     *
     * @param array $a First vector (values are cast to float).
     * @param array $b Second vector (values are cast to float).
     *
     * @return float Similarity, or 0.0 when it is undefined.
     */
    private function cosineSim(array $a, array $b): float
    {
        $dims = min(count($a), count($b));
        if ($dims === 0) {
            return 0.0;
        }

        $dotProduct = 0.0;
        $sumSqA = 0.0;
        $sumSqB = 0.0;
        $k = 0;
        while ($k < $dims) {
            $left = (float)$a[$k];
            $right = (float)$b[$k];
            $dotProduct += $left * $right;
            $sumSqA += $left * $left;
            $sumSqB += $right * $right;
            $k++;
        }

        // A zero-length vector has no direction; report "no similarity".
        if ($sumSqA === 0.0 || $sumSqB === 0.0) {
            return 0.0;
        }

        return $dotProduct / (sqrt($sumSqA) * sqrt($sumSqB));
    }

    /**
     * Maps a raw corpus search hit onto the source shape used by the rest of the run.
     *
     * Scores are rounded to four decimals when present and null otherwise. The
     * URL/label fields are left null here; they are populated afterwards by
     * hydrateSourceUrls().
     *
     * @param array  $chunk  Raw hit from the retrieval pipeline.
     * @param string $subQId Id of the sub-question that produced the hit.
     *
     * @return array Normalised source with source_origin 'corpus'.
     */
    private function normalizeCorpusChunk(array $chunk, string $subQId): array
    {
        // Rounded score for $key, or null when the hit does not carry it.
        $scoreOrNull = static fn(string $key): ?float => isset($chunk[$key])
            ? round((float)$chunk[$key], 4)
            : null;
        // Integer value for $key, or null when the hit does not carry it.
        $intOrNull = static fn(string $key): ?int => isset($chunk[$key])
            ? (int)$chunk[$key]
            : null;

        $similarity = $scoreOrNull('similarity');
        $rerankerScore = $scoreOrNull('reranker_score');
        $content = (string)($chunk['content'] ?? '');

        return [
            'chunk_id'          => $intOrNull('id'),
            'title'             => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
            'section'           => $chunk['section_title'] ?? null,
            'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
            'excerpt'           => dbnToolsExcerpt($content, 620),
            'chunk_text'        => $content,
            'similarity'        => $similarity,
            'reranker_score'    => $rerankerScore,
            'document_id'       => $intOrNull('document_id'),
            'source_origin'     => 'corpus',
            'authority_type'    => $chunk['authority_type'] ?? null,
            'jurisdiction'      => $chunk['jurisdiction'] ?? null,
            'publication_year'  => $chunk['publication_year'] ?? null,
            // Placeholders populated later by hydrateSourceUrls()
            'source_url'        => null,
            'deep_link'         => null,
            'authority_label'   => null,
            'corpus_source_name'=> null,
            'publication_date'  => null,
            'matched_sub_questions' => [$subQId],
        ];
    }

    /**
     * Defensive post-filter that flags chunks which look like marketing-website hits.
     *
     * A chunk with no source_name is never flagged. Otherwise it is flagged when
     * the lower-cased source name contains 'website', or the lower-cased title
     * contains 'dobetternorge.no' or starts with 'homepage', 'landing',
     * 'about ' or 'contact '.
     *
     * NOTE(review): per the original author's note, marketing sources are
     * labelled upstream with source_group website-primary/website-beta, but the
     * chunk payload only carries source_name — hence this name/title heuristic.
     *
     * @param array $chunk Raw corpus hit.
     *
     * @return bool True when the chunk should be dropped from the pool.
     */
    private function isWebsiteChunk(array $chunk): bool
    {
        $sourceName = strtolower((string)($chunk['source_name'] ?? ''));
        $docTitle = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));

        if ($sourceName === '') {
            return false;
        }

        return str_contains($sourceName, 'website')
            || str_contains($docTitle, 'dobetternorge.no')
            || preg_match('/^(homepage|landing|about |contact )/i', $docTitle) === 1;
    }

    /**
     * Hydrate the synthesis pool in place with source_url, deep_link, authority_label,
     * corpus_source_name and publication_date for every corpus-origin chunk.
     *
     * Document metadata comes from a direct query against bnl_corpus.documents (only
     * columns that exist there — the temporal columns added in migration 136 are absent
     * on this instance). Source names are then looked up in bnl_admin.corpus_sources.
     *
     * The whole method is best-effort: if the document query fails, the failure is logged
     * and the pool is left untouched. The source-name lookup is optional on top of that —
     * if it fails, chunks are still hydrated and keep the default 'Do Better Legal' name.
     *
     * @param array<int|string, array<string, mixed>> $pool Chunks, modified by reference.
     *                                                      Chunks whose `source_origin` is
     *                                                      not 'corpus' are skipped.
     */
    private function hydrateSourceUrls(array &$pool): void
    {
        // Collect the distinct document ids of corpus chunks (array keys act as a set).
        $docIds = [];
        foreach ($pool as $chunk) {
            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
            $docId = (int)($chunk['document_id'] ?? 0);
            if ($docId > 0) $docIds[$docId] = true;
        }
        if (empty($docIds)) return;

        $docMeta   = [];
        $sourceIds = [];
        try {
            $ragDb = dbnToolsRagDb();
            $ids   = array_keys($docIds);
            $ph    = implode(',', array_fill(0, count($ids), '?'));

            $stmt = $ragDb->prepare("
                SELECT d.id, d.title, d.source_url, d.authority_type,
                       d.publication_date, d.source_id, d.jurisdiction
                FROM documents d
                WHERE d.id IN ({$ph})
            ");
            $stmt->execute($ids);

            foreach ($stmt as $row) {
                $dId = (int)$row['id'];
                $sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
                if ($sid) $sourceIds[] = $sid;
                $docMeta[$dId] = [
                    'source_url'       => $row['source_url']       ?? null,
                    'authority_label'  => dbnV6AuthorityLabel($row['authority_type'] ?? null),
                    'publication_date' => $row['publication_date'] ?? null,
                    'corpus_source_name' => 'Do Better Legal',
                    'source_id'        => $sid,
                ];
            }
        } catch (Throwable $e) {
            error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
            return;
        }

        // Enrich with corpus source name from bnl_admin.corpus_sources. This lookup is
        // kept in its own try block so that a failure here cannot throw away the URLs
        // and dates that were already loaded above.
        if (!empty($sourceIds)) {
            try {
                $uSids = array_values(array_unique($sourceIds));
                $sPh   = implode(',', array_fill(0, count($uSids), '?'));
                $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
                $sStmt->execute($uSids);
                $srcNames = [];
                foreach ($sStmt as $row) {
                    $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
                }
                foreach ($docMeta as &$meta) {
                    if ($meta['source_id'] && isset($srcNames[$meta['source_id']])) {
                        $meta['corpus_source_name'] = $srcNames[$meta['source_id']];
                    }
                }
                unset($meta); // drop the dangling reference left by the by-reference loop
            } catch (Throwable $e) {
                unset($meta);
                error_log('DBN deep research hydrateSourceUrls source-name lookup failed: ' . $e->getMessage());
            }
        }

        foreach ($pool as &$chunk) {
            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
            $docId = (int)($chunk['document_id'] ?? 0);
            if (!$docId || !isset($docMeta[$docId])) continue;
            $m = $docMeta[$docId];
            $sourceUrl = $m['source_url'] ?? null;
            $chunk['source_url']         = $sourceUrl;
            $chunk['deep_link']          = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
            // Fall back to the chunk's own label, which may itself be absent — guard the
            // read so a missing key yields null instead of an "undefined array key" warning.
            $chunk['authority_label']    = $m['authority_label'] ?? ($chunk['authority_label'] ?? null);
            $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null;
            $chunk['publication_date']   = $m['publication_date'] ?? null;
        }
        unset($chunk);
    }

    /**
     * Construct a clickable URL into the original article. Lovdata supports
     * path-style section anchors (e.g. /§43 or /§4-12). For other hosts, or when no
     * section number can be read from the title, the document root URL is returned.
     *
     * @param string|null $sourceUrl    Document URL; null, empty or whitespace-only
     *                                  yields null.
     * @param string|null $sectionTitle Section heading that may contain a "§ <number>"
     *                                  reference, e.g. "§ 43" or "§ 4-12 Vedtak".
     * @return string|null The deep link, the trimmed source URL, or null when there is
     *                     no usable URL.
     */
    private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
    {
        if (!$sourceUrl) return null;
        $sourceUrl = trim($sourceUrl);
        if ($sourceUrl === '') return null;

        if (!$sectionTitle || preg_match('~^https?://lovdata\.no/~i', $sourceUrl) !== 1) {
            return $sourceUrl;
        }

        // A path segment cannot be appended after a query string or fragment, and a URL
        // that already carries a section anchor (raw or percent-encoded §) must not get
        // a second one.
        if (preg_match('~[?#§]|%C2%A7~iu', $sourceUrl) === 1) {
            return $sourceUrl;
        }

        // Section numbers are digits, optionally hyphen-joined ("4-12"), optionally
        // followed directly by one letter ("43a"). A dangling hyphen is never captured.
        if (preg_match('/§\s*(\d+(?:-\d+)*[A-Za-z]?)/u', $sectionTitle, $m) !== 1) {
            return $sourceUrl;
        }

        return rtrim($sourceUrl, '/') . '/§' . $m[1];
    }

    /**
     * Collapse duplicate chunks, rank the survivors and keep the top `$cap`.
     *
     * Two chunks are duplicates when they share `source_origin` (default 'corpus') and
     * `chunk_id`. A chunk without `chunk_id` gets a random key, so it is kept as its own
     * entry. For duplicates the first occurrence is kept; its
     * `matched_sub_questions` becomes the de-duplicated union of both lists and its
     * `similarity` / `reranker_score` are raised to the larger of the two values.
     *
     * Ranking is descending by `reranker_score`, falling back to `similarity`, then 0.
     *
     * @param array<int|string, array<string, mixed>> $rawPool Candidate chunks.
     * @param int                                     $cap     Maximum number returned.
     * @return array<int, array<string, mixed>> Re-indexed, ranked, capped list.
     */
    private function mergeAndDedupe(array $rawPool, int $cap): array
    {
        $unique = [];
        foreach ($rawPool as $candidate) {
            $origin    = $candidate['source_origin'] ?? 'corpus';
            $chunkId   = $candidate['chunk_id'] ?? bin2hex(random_bytes(4));
            $dedupeKey = $origin . ':' . $chunkId;

            if (isset($unique[$dedupeKey])) {
                $kept = $unique[$dedupeKey];
                $kept['matched_sub_questions'] = array_values(array_unique(array_merge(
                    $kept['matched_sub_questions'] ?? [],
                    $candidate['matched_sub_questions'] ?? []
                )));
                // Keep the higher value of each score
                foreach (['similarity', 'reranker_score'] as $scoreField) {
                    if (($candidate[$scoreField] ?? 0) > ($kept[$scoreField] ?? 0)) {
                        $kept[$scoreField] = $candidate[$scoreField];
                    }
                }
                $unique[$dedupeKey] = $kept;
            } else {
                $unique[$dedupeKey] = $candidate;
            }
        }

        $ranked = array_values($unique);
        usort(
            $ranked,
            static fn (array $left, array $right): int =>
                ($right['reranker_score'] ?? $right['similarity'] ?? 0)
                <=> ($left['reranker_score'] ?? $left['similarity'] ?? 0)
        );

        return array_slice($ranked, 0, $cap);
    }

    /**
     * Assign each chunk its 1-based citation number in field `n`.
     *
     * Numbering follows iteration order, not the input array's keys, so a filtered or
     * string-keyed array still yields a gap-free 1..N sequence.
     *
     * @param array<int|string, array<string, mixed>> $chunks Chunks in citation order.
     * @return array<int, array<string, mixed>> Re-indexed copies, each with `n` set.
     */
    private function numberSources(array $chunks): array
    {
        $numbered = [];
        $position = 0;
        foreach ($chunks as $chunk) {
            $chunk['n'] = ++$position;
            $numbered[] = $chunk;
        }
        return $numbered;
    }

    /**
     * Ask the selected LLM engine for a citation-grounded brief over the numbered sources.
     *
     * With an empty source list no LLM call is made and a fixed "not enough source
     * support" payload is returned. Otherwise the sources and sub-questions are rendered
     * into a prompt (an advocate-style prompt when `$advocateRole` is non-empty, a neutral
     * research prompt otherwise), sent to the engine, and the reply is decoded as JSON.
     * A reply that does not decode to an object with a non-empty `brief_markdown` is
     * salvaged by returning the raw text as the brief.
     *
     * @param string $seedDescription Description of the user's input, embedded verbatim.
     * @param string $brief           Research brief, embedded verbatim.
     * @param array  $subQuestions    Sub-questions; `id` and `question` are read from each.
     * @param array  $numberedSources Sources carrying `n`, `title`, `excerpt`, etc.
     * @param string $engine          'gpu', 'azure_full', or anything else for the default
     *                                Azure deployment.
     * @param string $language        'no' selects Norwegian; any other value English.
     * @param float  $temperature     Sampling temperature passed to the engine.
     * @param string $advocateRole    Client role for the advocate prompt; '' disables it.
     * @return array{json: array<string, mixed>, deploy_label: string}
     */
    private function synthesise(
        string $seedDescription,
        string $brief,
        array  $subQuestions,
        array  $numberedSources,
        string $engine,
        string $language,
        float  $temperature,
        string $advocateRole = ''
    ): array {
        $locale = $language === 'no' ? 'Norwegian' : 'English';

        if (empty($numberedSources)) {
            return [
                'json' => [
                    'brief_markdown' => $language === 'no'
                        ? 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.'
                        : 'I did not find enough source support in the corpus to give a grounded answer.',
                    'what_we_found' => 'No retrieved sources passed the similarity threshold.',
                    'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
                    'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
                ],
                'deploy_label' => $engine === 'gpu' ? 'GPU (cuttlefish)' : ($engine === 'azure_full' ? 'gpt-4o' : $this->azure->chatDeployment()),
            ];
        }

        // Render one text block per source. Every field is read defensively so that a
        // chunk missing a key produces a placeholder instead of an undefined-key warning.
        $sourcesContext = [];
        foreach ($numberedSources as $s) {
            $sourcesContext[] = sprintf(
                "[%d] (%s) %s%s\n    Corpus: %s\n    Authority: %s | Jurisdiction: %s\n    Excerpt: %s",
                $s['n'] ?? 0,
                ($s['source_origin'] ?? 'corpus') === 'upload' ? 'uploaded doc' : 'corpus',
                $s['title'] ?? '',
                !empty($s['section']) ? ' — ' . $s['section'] : '',
                $s['package_or_corpus'] ?? 'n/a',
                $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
                $s['jurisdiction'] ?? 'n/a',
                $s['excerpt'] ?? ''
            );
        }
        $sourcesText = implode("\n\n", $sourcesContext);

        $subQText = '';
        if ($subQuestions) {
            // Re-index first so the displayed ordinal follows position, whatever the
            // caller's array keys are.
            $orderedSubQuestions = array_values($subQuestions);
            $lines = array_map(
                static fn (array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'] ?? '', $sq['question'] ?? ''),
                $orderedSubQuestions,
                array_keys($orderedSubQuestions)
            );
            $subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
        }

        // Thin evidence gets a shorter target length.
        $sourceCount = count($numberedSources);
        $lengthGuidance = $sourceCount >= 3
            ? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
            : '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';

        if ($advocateRole !== '') {
            $prompt = <<<PROMPT
You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
Your client: {$advocateRole}

You MUST ground every claim in the numbered sources below using inline `[n]` citation markers. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.

User input:
{$seedDescription}

Research brief:
{$brief}
{$subQText}

Sources ({$sourceCount} numbered):
{$sourcesText}

Return JSON only in {$locale}:
{
  "brief_markdown": "Partisan but factually grounded advocate brief. {$lengthGuidance} Structure: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Identified weaknesses in the opposing party's position with [n] citations, (4) Procedural rights and obligations {$advocateRole} should assert. End with a one-line caveat that this is legal preparation support, not final legal advice.",
  "client_strengths": ["3-6 strings — the strongest factual/legal points for {$advocateRole}, each anchored to at least one [n] source"],
  "opposing_weaknesses": ["2-5 strings — vulnerabilities in the opposing position supported by retrieved sources. Omit this array entirely if evidence is thin — do NOT invent weaknesses."],
  "what_we_found": "2-sentence summary of the most relevant retrieved authority for {$advocateRole}",
  "what_remains_uncertain": ["3-5 gaps where evidence is insufficient or law is unclear — be honest"],
  "next_practical_step": "one concrete action for {$advocateRole} to take next (legal filing, evidence gathering, consultation type, etc.)"
}

Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer citing statute sections (e.g. "Barneloven §43") and case names verbatim from source excerpts.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- `opposing_weaknesses` must be omitted or empty when no retrieved source actually supports the identified weakness.
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
PROMPT;
        } else {
            $prompt = <<<PROMPT
You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.

User input:
{$seedDescription}

Research brief:
{$brief}
{$subQText}

Sources ({$sourceCount} numbered):
{$sourcesText}

Return JSON only in {$locale}:
{
  "brief_markdown": "Markdown legal brief. {$lengthGuidance} Every factual claim ends with one or more inline [n] markers keyed to the sources above. Use level-3 headings (###) sparingly to separate paragraphs by theme when helpful. End with a one-line caveat that this is research support, not legal advice.",
  "what_we_found": "2-4 sentence plain-language summary of the grounded finding",
  "what_remains_uncertain": ["specific gaps — what the corpus did not cover, conflicting authority, or where confidence is limited (3-6 items when sources >= 3)"],
  "next_practical_step": "one concrete next action the user can take to strengthen the case or close a gap"
}

Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- If no source supports a point, omit the point — DO NOT speculate.
- Prefer pinpointing statute sections (e.g. "Barneloven §43") and case names verbatim from the source excerpts.
- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON object itself.
PROMPT;
        }

        $messages = [
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ];
        $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3200, 'timeout' => 180];

        // Initialised up front so both are defined after the catch block.
        // NOTE(review): dbnToolsAbort() presumably ends the request — confirm.
        $raw = '';
        $deployLabel = '';
        try {
            if ($engine === 'gpu') {
                $response = dbnToolsCallGpuLlm($messages, $opts);
                $deployLabel = 'GPU (cuttlefish)';
                $raw = (string)($response['choices'][0]['message']['content'] ?? '');
            } elseif ($engine === 'azure_full') {
                $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
                $deployLabel = 'gpt-4o';
            } else {
                $raw = $this->azure->chatText($messages, $opts);
                $deployLabel = $this->azure->chatDeployment();
            }
        } catch (Throwable $e) {
            dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
        }

        $json = $this->azure->decodeJsonObject($raw);
        if (!is_array($json) || empty($json['brief_markdown'])) {
            // Salvage as plain markdown
            $json = [
                'brief_markdown' => $raw,
                'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
                'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
                'next_practical_step' => 'Review the brief manually before relying on it.',
            ];
        }

        return [
            'json'         => $json,
            'deploy_label' => $deployLabel,
        ];
    }

    /**
     * Grade how well the answer is supported, from the number of sources and the best
     * retrieval score among them.
     *
     * A source's score is its `reranker_score`, else its `similarity`; non-numeric or
     * missing scores are ignored, and the best score is 0 when none is numeric.
     *
     * @param array<int|string, array<string, mixed>> $sources Sources backing the brief.
     * @return string 'high' for at least 6 sources with a best score of 0.5 or more,
     *                'medium' for at least 3 sources with a best score of 0.35 or more,
     *                otherwise 'low'.
     */
    private function citationConfidence(array $sources): string
    {
        $sourceCount = count($sources);
        if ($sourceCount === 0) {
            return 'low';
        }

        $scoreOf = static fn (array $source) => $source['reranker_score'] ?? $source['similarity'] ?? null;

        $numericScores = [];
        foreach ($sources as $source) {
            $score = $scoreOf($source);
            if (is_numeric($score)) {
                $numericScores[] = $score;
            }
        }
        $topScore = $numericScores ? max($numericScores) : 0;

        return match (true) {
            $sourceCount >= 6 && $topScore >= 0.5  => 'high',
            $sourceCount >= 3 && $topScore >= 0.35 => 'medium',
            default                                => 'low',
        };
    }

    /**
     * Build one trace entry.
     *
     * @param string $label  Short step label.
     * @param string $detail Free-text detail for the step.
     * @param string $status Step status; defaults to 'complete'.
     * @return array{label: string, detail: string, status: string}
     */
    private function trace(string $label, string $detail, string $status = 'complete'): array
    {
        // compact() keys the array by variable name, in the order listed.
        return compact('label', 'detail', 'status');
    }

    /**
     * Milliseconds elapsed since `$start`, rounded to the nearest integer.
     *
     * @param float $start A timestamp taken earlier with microtime(true).
     * @return int Elapsed wall-clock time in milliseconds.
     */
    private function elapsedMs(float $start): int
    {
        $elapsedSeconds = microtime(true) - $start;
        $elapsedMillis  = round($elapsedSeconds * 1000);

        return (int)$elapsedMillis;
    }
}