dobetternorge-tools/includes/BvjAnalyzerAgent.php

<?php
declare(strict_types=1);

require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';

/**
 * BVJ (Barnevernet) Analyzer Agent
 *
 * Standalone 7-step pipeline that:
 *  1. Classifies the uploaded document and extracts metadata
 *  2. Extracts all named parties with roles
 *  3. Builds a chronological timeline of events
 *  4. Generates partisan sub-questions for corpus RAG
 *  5. Retrieves from the legal corpus (hybrid dense+BM25)
 *  6. Synthesises an advocacy brief + procedural red flags
 *  7. Assesses citation confidence
 *
 * Steps 1-3 always use azure_mini regardless of the user's engine choice.
 * Step 6 (synthesis) uses the user's selected engine.
 */
final class DbnBvjAnalyzerAgent
{
    // Per-file character cap applied with mb_substr() wherever run() reads an uploaded
    // file's text; the combined multi-file document is capped at twice this value.
    private const MAX_DOC_CHARS        = 64000;
    // Chunking parameters (target size, overlap, minimum size — all in words, going by
    // the names). Not referenced in the visible part of this class; presumably consumed
    // by splitIntoChunks() — TODO confirm.
    private const CHUNK_WORDS          = 600;
    private const CHUNK_OVERLAP_WORDS  = 75;
    private const MIN_CHUNK_WORDS      = 50;
    // Cap handed to mergeAndDedupe() for the merged corpus + upload retrieval pool.
    private const POOL_CAP             = 30;
    // Steps 1-3 always use this engine — fast and cheap for structured extraction
    // NOTE(review): the visible step 1-3 methods call $this->azure->chatText() without
    // passing an engine option, so this constant may be unused there — confirm.
    private const EXTRACT_ENGINE       = 'azure_mini';

    // Gateway used for every chat completion issued by this agent.
    private DbnAzureOpenAiGateway $azure;
    // In-memory index of uploaded chunks for the current run: a list of
    // ['meta' => chunk, 'vec' => embedding] entries. Reset at the start of each run().
    private array $uploadVecs  = [];
    // Step id => elapsed time as returned by elapsedMs(). Reset at the start of each run().
    private array $stepTimings = [];

    /**
     * Creates the agent.
     *
     * @param DbnAzureOpenAiGateway|null $azure Gateway to use for chat completions;
     *                                          a fresh default gateway is built when omitted.
     */
    public function __construct(?DbnAzureOpenAiGateway $azure = null)
    {
        if ($azure === null) {
            $azure = new DbnAzureOpenAiGateway();
        }

        $this->azure = $azure;
    }

    /**
     * Main pipeline. At least 1 uploaded file is required.
     *
     * Runs, in order: document classification, party extraction, timeline
     * extraction, sub-question generation, slice resolution, upload indexing,
     * hybrid corpus retrieval, synthesis and citation-confidence scoring.
     * Progress is reported through $emit (when given) and every completed stage
     * is appended to the returned trace.
     *
     * Aborts through dbnToolsAbort() with 422 when no files are supplied, when no
     * text can be read from them, or when no corpus slice is enabled, and with 503
     * when the retrieval pipeline cannot be constructed.
     *
     * @param array     $uploadedFiles   [{filename, text, chars, truncated}]
     * @param string    $advocateRole    Party the user represents
     * @param string    $engine          Affects synthesis only: azure_mini|azure_full|gpu
     *                                   (any other value falls back to azure_mini)
     * @param string    $language        'en' or 'no'
     * @param array     $sliceSelection  Corpus slice toggles
     * @param array     $controls        sub_q_count, chunk_limit, similarity_threshold, reranker_top_k, temperature
     * @param string    $additionalNotes Optional user context to supplement the document
     * @param callable|null $emit        function(string $event, array $payload): void
     *
     * @return array Result payload: doc_meta, parties, timeline, advocacy brief and
     *               argument lists, sub_questions, numbered sources, trace,
     *               trace_metadata and disclaimer.
     */
    public function run(
        array     $uploadedFiles,
        string    $advocateRole,
        string    $engine,
        string    $language,
        array     $sliceSelection,
        array     $controls,
        string    $additionalNotes = '',
        ?callable $emit = null
    ): array {
        $engine   = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true)
                    ? $engine : 'azure_mini';
        $language = dbnToolsNormalizeUiLanguage($language);
        $controls = $this->normalizeControls($controls);

        if (empty($uploadedFiles)) {
            dbnToolsAbort('Upload at least one BVJ document before running the analyzer.', 422, 'no_uploads');
        }

        $client  = dbnToolsRequireClient();
        $package = $this->requireFamilyPackage((int)$client['id']);

        dbnToolsBootCaveau();
        $aiPortalRoot = dbnToolsAiPortalRoot();
        require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

        // Per-run state: a previous run on the same instance must not leak into this one.
        $this->uploadVecs  = [];
        $this->stepTimings = [];
        $trace             = [];

        // Records a finished step in the trace and forwards it to the caller's emitter.
        $emitStep = function (string $stepId, string $label, string $detail, string $status)
                    use (&$trace, $emit): void {
            $trace[] = $this->trace($label, $detail, $status);
            if ($emit) {
                $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => $status]);
            }
        };
        // Announces that a step has started; deliberately not written to the trace.
        $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
            if ($emit) {
                $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => 'running']);
            }
        };

        // Build combined document text (first file is primary; additional files appended)
        $docText = '';
        foreach ($uploadedFiles as $idx => $file) {
            $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
            if ($text === '') continue;
            $filename = (string)($file['filename'] ?? sprintf('document-%d', $idx + 1));
            $docText .= ($docText !== '' ? "\n\n--- Document: {$filename} ---\n\n" : '') . $text;
        }
        if ($docText === '') {
            dbnToolsAbort('Could not extract text from the uploaded file(s).', 422, 'empty_document');
        }
        $docText = mb_substr($docText, 0, self::MAX_DOC_CHARS * 2, 'UTF-8');

        // ── STEP 1: Document classification ────────────────────────────────────
        $emitRunning('doc_classify', 'Document classification', 'Classifying document and extracting metadata…');
        $stepStart = microtime(true);
        $docMeta = $this->classifyDocument($docText, $language);
        $this->stepTimings['doc_classify'] = $this->elapsedMs($stepStart);
        if ($emit) {
            $emit('doc_meta', ['result' => $docMeta]);
        }
        // Metadata comes from model output: only scalar values are rendered, and empty
        // parts are dropped so the detail line never carries a dangling " · " separator.
        $scalarText = static fn($value): string => is_scalar($value) ? trim((string)$value) : '';
        $docTypeBadge = $scalarText($docMeta['doc_type'] ?? '');
        if ($docTypeBadge === '') {
            $docTypeBadge = 'BVJ Document';
        }
        $authStr = $scalarText($docMeta['issuing_authority'] ?? '');
        $refNo   = $scalarText($docMeta['reference_number'] ?? '');
        $classifyDetail = implode(' · ', array_filter(
            [$docTypeBadge, $authStr, $refNo !== '' ? 'ref ' . $refNo : ''],
            static fn(string $part): bool => $part !== ''
        ));
        $emitStep('doc_classify', 'Document classification', $classifyDetail, 'complete');

        // ── STEP 2: Party extraction ────────────────────────────────────────────
        $emitRunning('party_extract', 'Party extraction', 'Identifying all named parties and their roles…');
        $stepStart = microtime(true);
        $parties = $this->extractParties($docText, $language);
        $this->stepTimings['party_extract'] = $this->elapsedMs($stepStart);
        if ($emit) {
            $emit('parties', ['parties' => $parties]);
        }
        $emitStep('party_extract', 'Party extraction',
            sprintf('%d %s identified.', count($parties), count($parties) === 1 ? 'party' : 'parties'),
            'complete');

        // ── STEP 3: Timeline extraction ─────────────────────────────────────────
        $emitRunning('timeline_extract', 'Timeline extraction', 'Building chronological event timeline…');
        $stepStart = microtime(true);
        $timelineEvents = $this->extractTimeline($docText, $language);
        $this->stepTimings['timeline_extract'] = $this->elapsedMs($stepStart);
        if ($emit) {
            $emit('timeline', ['events' => $timelineEvents]);
        }
        $highCount = count(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
        $emitStep('timeline_extract', 'Timeline extraction',
            sprintf('%d events extracted (%d high-significance).', count($timelineEvents), $highCount),
            'complete');

        // ── STEP 4: Sub-question generation ────────────────────────────────────
        $emitRunning('sub_question_gen', 'Sub-question generation',
            sprintf('Generating %d research angles for %s…', $controls['sub_q_count'], $advocateRole ?: 'selected role'));
        $stepStart = microtime(true);
        $subQuestions = $this->generateSubQuestions(
            $docMeta, $parties, $timelineEvents,
            $advocateRole, $controls['sub_q_count'], $language
        );
        $this->stepTimings['sub_question_gen'] = $this->elapsedMs($stepStart);
        $emitStep('sub_question_gen', 'Sub-question generation',
            sprintf('%d sub-questions generated for %s.', count($subQuestions), $advocateRole ?: 'selected role'),
            'complete');

        // ── STEP 5: Slice resolution + upload indexing + corpus retrieval ───────
        $emitRunning('slice_resolution', 'Slice resolution', 'Resolving corpus slice toggles…');
        $stepStart = microtime(true);
        $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
        if (!array_filter($sliceSelectionNormalized)) {
            dbnToolsAbort('Enable at least one corpus slice before running the analyzer.', 422, 'no_slices');
        }
        $ragDb = dbnToolsRagDb();
        try {
            $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
            $sliceDetail  = sprintf('%d slice(s) active → %d candidate documents.',
                count(array_filter($sliceSelectionNormalized)), count($sharedDocIds));
            $sliceStatus  = 'complete';
        } catch (Throwable $e) {
            // Best effort: degrade to an unconstrained search instead of failing the run.
            error_log('BVJ slice resolve failed: ' . $e->getMessage());
            $sharedDocIds = [];
            $sliceDetail  = 'Slice resolution failed; corpus search will run unconstrained.';
            $sliceStatus  = 'warning';
        }
        $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
        $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

        // Upload indexing
        $emitRunning('upload_indexing', 'Upload indexing',
            sprintf('Chunking + embedding %d file(s)…', count($uploadedFiles)));
        $stepStart    = microtime(true);
        $uploadChunks = [];
        foreach ($uploadedFiles as $idx => $file) {
            $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
            $text     = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
            $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
        }
        $uploadStatus = 'complete';
        $uploadDetail = sprintf('%d file(s) → %d in-memory chunks indexed.', count($uploadedFiles), count($uploadChunks));
        if ($uploadChunks) {
            try {
                $texts     = array_map(fn(array $c) => $c['text'], $uploadChunks);
                $textCount = count($texts);
                $allVecs   = [];
                $batchSz   = 5;
                for ($b = 0; $b < $textCount; $b += $batchSz) {
                    $batch = array_slice($texts, $b, $batchSz);
                    if ($emit) {
                        $emit('progress', ['detail' => sprintf(
                            'Embedding chunks %d–%d of %d…',
                            $b + 1, $b + count($batch), $textCount
                        )]);
                    }
                    $allVecs = array_merge($allVecs, dbnToolsLiteLLMEmbedBatch($batch));
                }
                // Vectors are paired with chunks by position, so a count mismatch means
                // the pairing cannot be trusted and uploads are left out of retrieval.
                if (count($allVecs) === count($uploadChunks)) {
                    foreach ($uploadChunks as $i => $chunk) {
                        $this->uploadVecs[] = ['meta' => $chunk, 'vec' => $allVecs[$i]];
                    }
                } else {
                    $uploadStatus = 'warning';
                    $uploadDetail = 'Upload embedding count mismatch; uploaded chunks will not participate in retrieval.';
                }
            } catch (Throwable $e) {
                error_log('BVJ upload embed failed: ' . $e->getMessage());
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding timed out; corpus-only retrieval will run.';
                $this->uploadVecs = [];
            }
        }
        $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
        $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

        // Corpus retrieval (per sub-question)
        // With no advocate role the fallback query would otherwise end in a dangling
        // "involving "; use the same neutral wording as sub-question generation.
        $fallbackRole = $advocateRole !== '' ? $advocateRole : 'the affected party';
        $retrievalQueries = $subQuestions ?: [[
            'id'        => 'q1',
            'question'  => sprintf('%s case involving %s', $docMeta['doc_type'] ?? 'BVJ document', $fallbackRole),
            'rationale' => 'Fallback query (sub-question generation returned empty).',
        ]];
        $emitRunning('retrieval', 'Corpus retrieval',
            sprintf('Hybrid vector + keyword across %d sub-question(s)…', count($retrievalQueries)));
        $stepStart = microtime(true);

        try {
            $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
        } catch (Throwable $e) {
            dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
        }

        $rawPool = [];
        $retrievalWarnings = 0;
        $rawCorpusCount    = 0;
        $rawUploadCount    = 0;
        $filteredOutCount  = 0;

        foreach ($retrievalQueries as $idx => $sq) {
            if ($emit) {
                $emit('subq', [
                    'index'    => $idx + 1,
                    'total'    => count($retrievalQueries),
                    'id'       => $sq['id'],
                    'question' => $sq['question'],
                ]);
            }
            try {
                $corpusChunks = $rag->searchAll(
                    $sq['question'],
                    $controls['chunk_limit'],
                    null,
                    [
                        'search_private'          => false,
                        'search_shared'           => true,
                        'package_ids'             => [(int)$package['id']],
                        'shared_doc_ids'          => $sharedDocIds,
                        'chunk_limit'             => $controls['chunk_limit'],
                        'search_method'           => 'hybrid',
                        'reranker_enabled'        => true,
                        'include_beta_website'    => false,
                        'include_primary_website' => false,
                    ]
                );
            } catch (Throwable $e) {
                // One failed sub-question downgrades the step to a warning; the rest still run.
                error_log('BVJ sub-Q retrieval failed: ' . $e->getMessage());
                $corpusChunks = [];
                $retrievalWarnings++;
            }
            $rawCorpusCount += count($corpusChunks);
            foreach ($corpusChunks as $chunk) {
                if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
                    $filteredOutCount++;
                    continue;
                }
                $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
            }
            if (!empty($this->uploadVecs)) {
                $uploadHits = $this->retrieveFromUploads(
                    $sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']
                );
                $rawUploadCount += count($uploadHits);
                foreach ($uploadHits as $hit) {
                    $hit['matched_sub_questions'] = [$sq['id']];
                    $rawPool[] = $hit;
                }
            }
        }

        $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
        $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
        $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
        $retrievalDetail = sprintf(
            '%d sub-Q(s) × hybrid → %d corpus (%d filtered) + %d upload → %d unique after dedupe.',
            count($retrievalQueries), $rawCorpusCount, $filteredOutCount, $rawUploadCount, count($merged)
        );
        $emitStep('retrieval', 'Corpus retrieval', $retrievalDetail, $retrievalStatus);

        $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
        $this->hydrateSourceUrls($synthesisPool);
        $numberedSources = $this->numberSources($synthesisPool);

        // Generate upload summaries for sources from uploaded files.
        // First work out which uploads the selected sources actually cite, so a summary
        // (one gateway call per file) is only requested for files that will display it.
        $citedUploadIdx = [];
        foreach ($numberedSources as $pos => $src) {
            if (($src['source_origin'] ?? '') !== 'upload') continue;
            if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
                $citedUploadIdx[$pos] = (int)$m[1];
            }
        }
        $uploadSummaries = [];
        foreach (array_unique($citedUploadIdx) as $fileIdx) {
            $file = $uploadedFiles[$fileIdx] ?? null;
            if (!is_array($file)) continue;
            $text     = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8');
            $filename = (string)($file['filename'] ?? "file-{$fileIdx}");
            if ($text === '') continue;
            try {
                $raw = $this->azure->chatText([
                    ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                    ['role' => 'user', 'content' => "Summarise this BVJ document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"],
                ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
                $uploadSummaries[$fileIdx] = trim($raw);
            } catch (Throwable $e) {
                // A missing summary is cosmetic; the source is kept with a null summary.
                error_log('BVJ upload summary gen failed for file ' . $fileIdx . ': ' . $e->getMessage());
                $uploadSummaries[$fileIdx] = null;
            }
        }
        foreach ($citedUploadIdx as $pos => $fileIdx) {
            $numberedSources[$pos]['summary'] = $uploadSummaries[$fileIdx] ?? null;
        }

        $retrievalCounts = [
            'raw_corpus'   => $rawCorpusCount,
            'filtered'     => $filteredOutCount,
            'raw_upload'   => $rawUploadCount,
            'after_dedupe' => count($merged),
            'after_topk'   => count($numberedSources),
        ];

        // ── STEP 6: Synthesis ───────────────────────────────────────────────────
        $engineLabel = match ($engine) {
            'azure_full' => 'Azure gpt-4o',
            'gpu'        => 'GPU qwen2.5:14b',
            default      => 'Azure gpt-4o-mini',
        };
        $emitRunning('synthesis', 'Synthesis',
            sprintf('Synthesising advocacy brief with %s…', $engineLabel));
        $stepStart = microtime(true);
        $synthesis = $this->synthesiseBvj(
            $docText, $docMeta, $parties, $timelineEvents,
            $subQuestions, $numberedSources,
            $advocateRole, $engine, $language, $controls['temperature'], $additionalNotes,
            $emit
        );
        $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
        $emitStep('synthesis', 'Synthesis',
            sprintf('%s synthesised advocacy brief using %d source(s) + document.',
                $synthesis['deploy_label'], count($numberedSources)),
            'complete');

        // ── STEP 7: Confidence ──────────────────────────────────────────────────
        $confidence = $this->citationConfidence($numberedSources);
        $emitStep('confidence', 'Citation confidence',
            sprintf('%s confidence based on %d source(s).', ucfirst($confidence), count($numberedSources)),
            $confidence === 'low' ? 'warning' : 'complete');

        // Build sub-question output with top_sources
        $subQOut = [];
        foreach ($retrievalQueries as $sq) {
            $matchedChunks = array_values(array_filter(
                $numberedSources,
                fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
            ));
            $topSources = array_slice($matchedChunks, 0, 3);
            $subQOut[] = [
                'id'          => $sq['id'],
                'question'    => $sq['question'],
                'rationale'   => $sq['rationale'] ?? '',
                'chunk_ids'   => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
                'top_sources' => array_map(fn(array $s) => [
                    'n'               => $s['n'] ?? null,
                    'title'           => $s['title'] ?? '',
                    'section'         => $s['section'] ?? null,
                    'deep_link'       => $s['deep_link'] ?? $s['source_url'] ?? null,
                    'source_url'      => $s['source_url'] ?? null,
                    'source_origin'   => $s['source_origin'] ?? 'corpus',
                    'authority_label' => $s['authority_label'] ?? null,
                    'excerpt'         => $s['excerpt'] ?? '',
                ], $topSources),
            ];
        }

        $synJson = $synthesis['json'];
        // Keep the list contract of the sibling fields: a bare non-empty string from the
        // model is wrapped rather than dropped, anything else non-array becomes [].
        $uncertain = $synJson['what_remains_uncertain'] ?? [];
        if (!is_array($uncertain)) {
            $uncertain = (is_string($uncertain) && trim($uncertain) !== '') ? [$uncertain] : [];
        }
        return [
            'tool'                   => 'bvj_analyzer',
            'language'               => $language,
            'advocate_role'          => $advocateRole,
            'doc_meta'               => $docMeta,
            'parties'                => $parties,
            'timeline'               => ['events' => $timelineEvents],
            'advocacy_brief'         => (string)($synJson['advocacy_brief'] ?? ''),
            'procedural_red_flags'   => is_array($synJson['procedural_red_flags'] ?? null)
                                        ? $synJson['procedural_red_flags'] : [],
            'client_strengths'       => is_array($synJson['client_strengths'] ?? null)
                                        ? $synJson['client_strengths'] : [],
            'opposing_weaknesses'    => is_array($synJson['opposing_weaknesses'] ?? null)
                                        ? $synJson['opposing_weaknesses'] : [],
            'sub_questions'          => $subQOut,
            'sources'                => $numberedSources,
            'what_we_found'          => (string)($synJson['what_we_found'] ?? ''),
            'what_remains_uncertain' => $uncertain,
            'next_practical_step'    => (string)($synJson['next_practical_step'] ?? ''),
            'trace'                  => $trace,
            'trace_metadata'         => [
                'chunk_count'         => count($merged),
                'source_count'        => count($numberedSources),
                'sub_question_count'  => count($retrievalQueries),
                'upload_chunk_count'  => count($this->uploadVecs),
                'deployment'          => $synthesis['deploy_label'],
                'engine_used'         => $engine,
                'citation_confidence' => $confidence,
                'elapsed_ms_per_step' => $this->stepTimings,
                'retrieval_counts'    => $retrievalCounts,
                'slices_active'       => array_keys(array_filter($sliceSelectionNormalized)),
            ],
            'disclaimer' => dbnToolsDisclaimer($language),
        ];
    }

    // ── Step 1: Document classification ──────────────────────────────────────

    /**
     * Step 1: asks the model to classify the document and pull out basic metadata.
     *
     * Only the first 6000 characters of the document are sent. Any failure (gateway
     * error, undecodable reply) is logged and the default metadata is returned.
     *
     * @param string $docText  Combined document text.
     * @param string $language UI language code, mapped through dbnToolsLanguageName().
     *
     * @return array Default metadata (doc_type, doc_date, issuing_authority,
     *               reference_number, child_info) overlaid with every non-null,
     *               non-empty-string value the model returned.
     */
    private function classifyDocument(string $docText, string $language): array
    {
        $fallback = [
            'doc_type'          => 'BVJ Document',
            'doc_date'          => null,
            'issuing_authority' => null,
            'reference_number'  => null,
            'child_info'        => null,
        ];

        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 6000, 'UTF-8');

        $prompt = <<<PROMPT
You are analysing a Norwegian child welfare (Barnevernet) document.
Extract the following metadata from the document text below.

Return JSON only in {$locale}:
{
  "doc_type": "The document type as a short phrase, e.g. Bekymringsmelding, Vedtak, Omsorgsovertakelse, Fylkesnemnda-kjennelse, Rapport, or the detected type",
  "doc_date": "Primary document date in ISO 8601 format (YYYY-MM-DD) if identifiable, otherwise null",
  "issuing_authority": "Name of the issuing authority or institution, e.g. Trondheim kommune barneverntjeneste",
  "reference_number": "Case or document reference number if present, otherwise null",
  "child_info": "Brief description of the child(ren) involved, e.g. name and birth date if visible — anonymise if clearly redacted"
}

Rules:
- If a field cannot be determined, use null.
- doc_type should be the Norwegian term if recognisable (e.g. Bekymringsmelding), otherwise English.
- Do not invent information not present in the text.

Document text (first 6000 chars):
{$excerpt}
PROMPT;

        $messages = [
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ];
        $options = ['json' => true, 'temperature' => 0.05, 'max_tokens' => 400, 'timeout' => 30];

        try {
            $reply   = $this->azure->chatText($messages, $options);
            $decoded = $this->azure->decodeJsonObject($reply);
        } catch (Throwable $e) {
            error_log('BVJ classifyDocument failed: ' . $e->getMessage());
            return $fallback;
        }

        if (!is_array($decoded)) {
            return $fallback;
        }

        // Blank answers must not overwrite the defaults, so strip them before merging.
        $answered = [];
        foreach ($decoded as $field => $value) {
            if ($value === null || $value === '') {
                continue;
            }
            $answered[$field] = $value;
        }

        return array_merge($fallback, $answered);
    }

    // ── Step 2: Party extraction ──────────────────────────────────────────────

    /**
     * Step 2: asks the model to list every named party in the document.
     *
     * Sends the first 20000 characters of the document. Accepts either the requested
     * {"parties": [...]} envelope or a bare list at the root. Because the entries are
     * model output, anything that is not an object (string, null, number) is discarded
     * so callers always receive a list of associative arrays.
     *
     * @param string $docText  Combined document text.
     * @param string $language UI language code, mapped through dbnToolsLanguageName().
     *
     * @return array Up to 25 party records (name, role, organization,
     *               relationship_to_child as requested from the model); an empty
     *               list on any failure or unexpected reply shape.
     */
    private function extractParties(string $docText, string $language): array
    {
        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 20000, 'UTF-8');

        $prompt = <<<PROMPT
You are analysing a Norwegian child welfare (Barnevernet) document.
Identify ALL named parties — every person or institution referred to by name or title.

Respond in {$locale}. Return a JSON object with a single key "parties" containing an array of objects.
Each object must have these four fields:
- "name": full name or institution name (string)
- "role": their role in the case, e.g. Biological mother, Biological father, Child, Barnevernarbeider, Saksbehandler, Leder, Melder, Politi, Lege, Psykolog, Advokat, Talsperson for barnet, Tilsynsfører, Sakkyndig, Foster carer (fosterforelder), Rusklinikk, Statsforvalter
- "organization": employer or institution if mentioned, otherwise null
- "relationship_to_child": relationship to the child in the document, e.g. Mother, Father, Sibling, Caseworker, Melder, Supervisor, or null

Rules:
- Include every named person and named institution — even peripheral ones.
- Include Barnevernvakta (bvv) as an institution even if no individual caseworkers are named.
- If a name appears to be redacted or anonymised (e.g. "mor", "far", "barnet", initials like "A.B."), include them with role inferred from context.
- Do not invent parties not present in the text.
- Maximum 25 parties.

Document text:
{$excerpt}
PROMPT;

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 2000, 'timeout' => 45]);
            $json = $this->azure->decodeJsonObject($raw);

            $candidates = null;
            if (is_array($json) && is_array($json['parties'] ?? null)) {
                $candidates = $json['parties'];
            } elseif (is_array($json) && isset($json[0]['name'])) {
                // Fallback: model returned an array at root level instead of {parties:[...]}
                $candidates = $json;
            }

            if ($candidates !== null) {
                // Drop malformed entries before applying the cap so they neither reach
                // the client nor take up one of the 25 slots.
                $valid = array_values(array_filter(
                    $candidates,
                    static fn($party): bool => is_array($party)
                ));
                return array_slice($valid, 0, 25);
            }

            // Multibyte-safe truncation: a byte-wise cut could split a UTF-8 character
            // (æ/ø/å are common here) and write an invalid sequence to the log.
            error_log('BVJ extractParties unexpected structure: ' . mb_substr((string)$raw, 0, 300, 'UTF-8'));
        } catch (Throwable $e) {
            error_log('BVJ extractParties failed: ' . $e->getMessage());
        }
        return [];
    }

    // ── Step 3: Timeline extraction ───────────────────────────────────────────

    /**
     * Step 3: asks the model to build a chronological timeline of events.
     *
     * Sends the first 20000 characters of the document. Because the entries are model
     * output, anything in "events" that is not an object is discarded so callers always
     * receive a list of associative arrays and event counts are not inflated by junk.
     *
     * @param string $docText  Combined document text.
     * @param string $language UI language code, mapped through dbnToolsLanguageName().
     *
     * @return array Up to 40 event records (date, time_of_day, actor, action,
     *               significance as requested from the model); an empty list on any
     *               failure or when the reply has no "events" array.
     */
    private function extractTimeline(string $docText, string $language): array
    {
        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 20000, 'UTF-8');

        $prompt = <<<PROMPT
Build a chronological timeline from this Norwegian child welfare (Barnevernet) document in {$locale}.

Extract ALL dates, times, and temporal references — including phone calls, home visits, meetings, decisions, and assessments.

IMPORTANT — Norwegian date and time formats to recognise:
- DD.MM.YY  (e.g. 18.07.20 = 2020-07-18)
- DD.MM.YYYY (e.g. 18.07.2020)
- D.M.YY    (e.g. 6.1.20 = 2020-01-06)
- DD.MM.    (day and month without year — infer year from surrounding context)
- Times: kl. HH:MM, klokken HH:MM, kl HH.MM
- Diary/log format: lines beginning with a date or time are always events.
- Two-digit years: interpret as 20YY (20 → 2020, 21 → 2021).

Barnevernet-specific events that are ALWAYS high significance:
- Akuttvedtak (emergency placement) under §4-6 or §4-25
- Omsorgsovertakelse (care order) under §4-12
- Police involvement or assistance (politibistand)
- Formal decision (vedtak) or court order (kjennelse)
- Deadline breaches: bekymringsmelding not processed within 7 days; investigation not opened within 6 weeks
- Forhandlingsmøte (negotiation hearing) or Fylkesnemnda hearing
- Supervised contact visits (samvær) being reduced or denied
- Placement in foster care or institution (fosterhjem, institusjon)

For each event provide:
- "date": ISO 8601 date (YYYY-MM-DD) if determinable, otherwise best-effort description
- "time_of_day": HH:MM if present, otherwise null
- "actor": person, institution, or party involved
- "action": concise description (≤ 80 chars) of what happened
- "significance": high (acute measure, removal, police involvement, formal decision, statutory deadline breach) | medium (home visit, phone call, meeting, assessment) | low (minor update, note)

Sort chronologically. Maximum 40 events.

Document text:
{$excerpt}

Return JSON only:
{
  "events": [{"date":"...","time_of_day":null,"actor":"...","action":"...","significance":"high|medium|low"}]
}
PROMPT;

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 4000, 'timeout' => 55]);
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json) && is_array($json['events'] ?? null)) {
                // Filter before capping so malformed entries do not use up the 40 slots;
                // array_filter keeps the model's (chronological) ordering.
                $valid = array_values(array_filter(
                    $json['events'],
                    static fn($event): bool => is_array($event)
                ));
                return array_slice($valid, 0, 40);
            }
        } catch (Throwable $e) {
            error_log('BVJ extractTimeline failed: ' . $e->getMessage());
        }
        return [];
    }

    // ── Step 4: Sub-question generation ──────────────────────────────────────

    private function generateSubQuestions(
        array  $docMeta,
        array  $parties,
        array  $timelineEvents,
        string $advocateRole,
        int    $count,
        string $language
    ): array {
        $locale    = dbnToolsLanguageName($language);
        $docType   = $docMeta['doc_type'] ?? 'BVJ document';
        $docDate   = $docMeta['doc_date'] ?? 'unknown date';
        $authority = $docMeta['issuing_authority'] ?? 'the municipality';
        $roleStr   = $advocateRole !== '' ? $advocateRole : 'the affected party';

        // Summarise high-significance events first, then others
        $highEvents  = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
        $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
        $topEvents   = array_slice(array_merge($highEvents, $otherEvents), 0, 12);
        $eventSummary = '';
        foreach ($topEvents as $ev) {
            $sig = ($ev['significance'] ?? 'low') === 'high' ? '[HIGH] ' : '';
            $eventSummary .= sprintf("- %s %s%s (%s)\n",
                $ev['date'] ?? '?', $sig, $ev['action'] ?? '', $ev['actor'] ?? '');
        }

        // Summarise parties
        $partyList = '';
        foreach (array_slice($parties, 0, 10) as $p) {
            $org = !empty($p['organization']) ? ' at ' . $p['organization'] : '';
            $partyList .= sprintf("- %s (%s%s)\n", $p['name'] ?? '?', $p['role'] ?? '?', $org);
        }

        $angleGuidance = match (true) {
            $count >= 5 => <<<ANGLES
Cover these five distinct legal angles (one per question):
1. Statutory rights and obligations under Barnevernloven (e.g. §4-2, §4-6, §4-12) specific to the measures taken
2. ECHR Article 8 proportionality and procedural safeguards — cite the specific measures and dates from this case
3. Procedural obligations BVV must fulfil (advance notice, documentation, hearing rights) — anchor to documented events
4. Bufdir/Statsforvalter guidance on investigation standards and thresholds for intervention
5. Norwegian appellate court decisions on comparable measures and family circumstances
ANGLES,
            $count === 4 => <<<ANGLES
Cover these four distinct legal angles (one per question):
1. Statutory rights under Barnevernloven anchored to the specific measures and dates in this case
2. ECHR Article 8 — proportionality of the specific intervention and any procedural violations
3. BVV's procedural obligations — documentation, notice, and hearing rights — as evidenced by the timeline
4. Bufdir guidance and Norwegian court decisions on comparable fact patterns
ANGLES,
            default => <<<ANGLES
Cover three distinct legal angles (one per question):
1. Statutory rights under Barnevernloven for the specific type of measure documented
2. ECHR Article 8 proportionality and procedural safeguards
3. BVV's procedural obligations and whether the documented timeline shows any breach
ANGLES,
        };

        $prompt = <<<PROMPT
You are a Norwegian family-law research assistant building a case for: {$roleStr}.

Case facts extracted from the uploaded document:
- Document type: {$docType}
- Date: {$docDate}
- Issuing authority: {$authority}
- Key events (chronological):
{$eventSummary}
- Key parties:
{$partyList}

Generate exactly {$count} sub-questions to search the Norwegian legal corpus for arguments that SUPPORT {$roleStr}'s position.

{$angleGuidance}

CRITICAL: Every question MUST embed specific facts from this case — use the actual authority name, document date, type of measure, and parties where relevant. Generic questions ("What are parental rights?") are useless for retrieval. Specific questions ("What notice requirements must {$authority} meet before issuing an emergency placement under Barnevernloven §4-6?") are highly effective.

Return JSON only in {$locale}:
{
  "sub_questions": [
    {"id":"q1","question":"...","rationale":"why this angle strengthens {$roleStr}'s position (≤ 120 chars)"}
  ]
}

Rules:
- Exactly {$count} sub-questions.
- Each question targets a DIFFERENT legal angle.
- Include specific case details (authority, date, measure type) in each question.
- Questions must be self-contained and answerable from Norwegian family-law, child-welfare, or ECHR sources.
- Respond in {$locale}.
PROMPT;

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.15, 'max_tokens' => 1000, 'timeout' => 40]);
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json) && is_array($json['sub_questions'] ?? null) && count($json['sub_questions']) >= 1) {
                $sqs = [];
                foreach (array_slice($json['sub_questions'], 0, $count) as $sq) {
                    if (!empty($sq['id']) && !empty($sq['question'])) {
                        $sqs[] = [
                            'id'        => (string)$sq['id'],
                            'question'  => (string)$sq['question'],
                            'rationale' => (string)($sq['rationale'] ?? ''),
                        ];
                    }
                }
                if ($sqs) return $sqs;
            }
        } catch (Throwable $e) {
            error_log('BVJ generateSubQuestions failed: ' . $e->getMessage());
        }

        // Fallback: generic sub-questions
        $role = $advocateRole ?: 'affected party';
        return [
            ['id' => 'q1', 'question' => "What procedural rights does {$role} have in Barnevernet proceedings under Barnevernloven?", 'rationale' => 'Procedural rights'],
            ['id' => 'q2', 'question' => "What does ECHR Article 8 require when child welfare authorities intervene in family life?", 'rationale' => 'ECHR Article 8'],
            ['id' => 'q3', 'question' => "What Bufdir guidance applies to the proportionality of Barnevernet interventions?", 'rationale' => 'Proportionality'],
            ['id' => 'q4', 'question' => "What are the documentation and notice obligations of BVV before taking acute measures?", 'rationale' => 'Documentation obligations'],
        ];
    }

    // ── Step 6: Synthesis ─────────────────────────────────────────────────────

    /**
     * Step 6: build the synthesis prompt from the extracted case facts and the numbered
     * sources, call the selected engine, and return the decoded advocacy brief.
     *
     * When no sources were retrieved, a localized "nothing found" payload is returned
     * without calling any model. When the model output cannot be decoded into an object
     * with a non-empty `advocacy_brief`, the raw text is wrapped in a fallback payload.
     * For the azure engines the result of dbnToolsRunLegalCheck() is appended to
     * `procedural_red_flags`.
     *
     * @param string        $docText         Full document text; only the first 8000 characters are sent
     * @param array         $docMeta         Step-1 metadata (doc_type, doc_date, issuing_authority, reference_number, child_info)
     * @param array         $parties         Step-2 parties; the first 12 are listed in the prompt
     * @param array         $timelineEvents  Step-3 events; high-significance first, capped at 20
     * @param array         $subQuestions    Sub-questions with `id` and `question`
     * @param array         $numberedSources Sources carrying `n` as produced by numberSources()
     * @param string        $advocateRole    Party the brief argues for; '' means "the affected party"
     * @param string        $engine          'gpu', 'azure_full', or anything else for the default Azure deployment
     * @param string        $language        UI language code used for the response language
     * @param float         $temperature     Sampling temperature for the synthesis call
     * @param string        $additionalNotes Optional advocate notes appended to the prompt
     * @param callable|null $emit            Accepted for interface compatibility; not used by this method
     * @return array{json: array, deploy_label: string}
     */
    private function synthesiseBvj(
        string    $docText,
        array     $docMeta,
        array     $parties,
        array     $timelineEvents,
        array     $subQuestions,
        array     $numberedSources,
        string    $advocateRole,
        string    $engine,
        string    $language,
        float     $temperature,
        string    $additionalNotes,
        ?callable $emit = null
    ): array {
        $locale   = dbnToolsLanguageName($language);
        $roleStr  = $advocateRole !== '' ? $advocateRole : 'the affected party';
        $docType  = $docMeta['doc_type'] ?? 'BVJ Document';
        $docDate  = $docMeta['doc_date'] ?? 'unknown date';
        $authority = $docMeta['issuing_authority'] ?? 'unknown authority';
        // Metadata comes from an LLM extraction step, so optional keys may be absent:
        // test with !empty() instead of reading the key directly.
        $refNo    = !empty($docMeta['reference_number']) ? ' (ref ' . $docMeta['reference_number'] . ')' : '';
        $childInfo = $docMeta['child_info'] ?? 'not specified';
        $sourceCount = count($numberedSources);

        // Resolved once; used by both the empty-sources return and the normal return.
        $deployLabel = match ($engine) {
            'gpu'        => 'GPU (cuttlefish)',
            'azure_full' => 'gpt-4o',
            default      => $this->azure->chatDeployment(),
        };

        if (empty($numberedSources)) {
            $emptyBrief = match (dbnToolsNormalizeUiLanguage($language)) {
                'no' => 'Ingen kildetreff ble funnet i korpuset for de valgte skivene og spørsmålene.',
                'uk' => 'Для вибраних розділів і підпитань не знайдено джерел у корпусі.',
                'pl' => 'Nie znaleziono źródeł w korpusie dla wybranych sekcji i pytań pomocniczych.',
                default => 'No corpus sources were retrieved for the selected slices and sub-questions.',
            };
            return [
                'json' => [
                    'advocacy_brief'         => $emptyBrief,
                    'procedural_red_flags'   => [],
                    'client_strengths'       => [],
                    'opposing_weaknesses'    => [],
                    'what_we_found'          => 'No retrieved sources passed the similarity threshold.',
                    'what_remains_uncertain' => ['No corpus evidence retrieved — widen slice selection or try different sub-questions.'],
                    'next_practical_step'    => 'Enable more corpus slices (Norwegian Courts, Bufdir Guidance) and re-run.',
                ],
                'deploy_label' => $deployLabel,
            ];
        }

        // Build parties summary (first 12)
        $partiesSummary = '';
        foreach (array_slice($parties, 0, 12) as $i => $p) {
            $org = !empty($p['organization']) ? ' (' . $p['organization'] . ')' : '';
            $rel = !empty($p['relationship_to_child']) ? ' — rel: ' . $p['relationship_to_child'] : '';
            $partiesSummary .= sprintf("%d. %s — %s%s%s\n", $i + 1, $p['name'] ?? '', $p['role'] ?? '', $org, $rel);
        }

        // Build timeline summary (high-significance events first, capped at 20)
        $highEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
        $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
        $topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 20);
        $timelineSummary = '';
        foreach ($topEvents as $ev) {
            $time = !empty($ev['time_of_day']) ? ' kl.' . $ev['time_of_day'] : '';
            $timelineSummary .= sprintf("- %s%s [%s] %s: %s\n",
                $ev['date'] ?? '?', $time,
                strtoupper($ev['significance'] ?? 'low'),
                $ev['actor'] ?? '', $ev['action'] ?? '');
        }

        // Build sources text
        $sourcesContext = [];
        foreach ($numberedSources as $s) {
            $sourcesContext[] = sprintf(
                "[%d] (%s) %s%s\n    Corpus: %s\n    Authority: %s | Jurisdiction: %s\n    Excerpt: %s",
                $s['n'],
                ($s['source_origin'] ?? 'corpus') === 'upload' ? 'uploaded doc' : 'corpus',
                $s['title'],
                !empty($s['section']) ? ' — ' . $s['section'] : '',
                $s['package_or_corpus'],
                $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
                $s['jurisdiction'] ?? 'n/a',
                $s['excerpt']
            );
        }
        $sourcesText = implode("\n\n", $sourcesContext);

        // Build sub-question text
        $subQText = '';
        if ($subQuestions) {
            $subQText = "\nSub-questions researched:\n";
            foreach ($subQuestions as $sq) {
                $subQText .= sprintf("- %s: %s\n", $sq['id'], $sq['question']);
            }
        }

        $notesSection = $additionalNotes !== ''
            ? "\n== ADDITIONAL CONTEXT FROM ADVOCATE ==\n{$additionalNotes}\n"
            : '';

        $docExcerpt = mb_substr($docText, 0, 8000, 'UTF-8');

        $prompt = <<<PROMPT
You are Do Better Norge Legal Tools. Produce a structured Barnevernet case analysis for: {$roleStr}.

HALLUCINATION RULES — READ FIRST:
- You may ONLY cite statute sections (§), ECHR article numbers, ECHR application numbers, case names, and Bufdir/Statsforvalter circular references that appear verbatim in the numbered corpus sources below.
- Do NOT cite statute sections, case names, or ECHR applications from your training memory — they may be misremembered or no longer in force.
- If no source supports a claim, omit the claim rather than invent support.
- Every factual legal claim in advocacy_brief MUST end with at least one [n] or [DOC] citation. Unsupported claims are a liability for the client.

Return valid JSON only. No markdown fences.

== DOCUMENT METADATA ==
Type: {$docType}{$refNo}
Date: {$docDate}
Issuing authority: {$authority}
Child: {$childInfo}

== KEY PARTIES ==
{$partiesSummary}

== TIMELINE (from document) ==
{$timelineSummary}

== CORPUS SOURCES ({$sourceCount} numbered — cite as [n]) ==
{$sourcesText}
{$notesSection}
{$subQText}

== DOCUMENT EXCERPT (first 8000 chars — cite as [DOC]) ==
{$docExcerpt}

== ADVOCACY BRIEF FORMAT ==
Write the advocacy_brief as a Markdown document with these sections:

## Case Overview
Summarise what happened: document type, issuing authority, key events from the timeline. Every factual statement must cite [DOC].

## {$roleStr}'s Core Legal Position
The strongest statutory and ECHR arguments in favour of {$roleStr}. Cite [n] for each legal point. Only cite statutes and cases that appear in the corpus sources above.

## Procedural Compliance Issues
Where BVV/the authority may have failed their own procedural obligations. Ground each point in a specific documented action from [DOC] and the applicable statute or guidance from [n].

## Client Strengths
3-6 factual and legal advantages for {$roleStr}, each anchored with [n] or [DOC].

## Counter-Arguments and Responses
The most likely opposing arguments and how to rebut them. Cite [n] for rebuttal sources.

## Recommended Next Steps
2-4 concrete legal actions {$roleStr} should take now.

End with one line: "*This brief is AI-assisted and for discussion purposes only — verify all legal references with a qualified Norwegian family-law lawyer.*"

Target length: 600-1000 words.

== JSON OUTPUT ==
{
  "advocacy_brief": "<the Markdown brief following the format above>",

  "procedural_red_flags": [
    {
      "description": "Concise description of the potential procedural violation",
      "legal_basis": "Statute or ECHR article from a corpus source — e.g. Barnevernloven §4-2 [3]",
      "severity": "high|medium|low",
      "source_refs": ["[n]", "[DOC]"],
      "what_to_check": "Exact document text or action to verify with a lawyer"
    }
  ],

  "client_strengths": ["3-6 items, each ending with [n] or [DOC]"],
  "opposing_weaknesses": ["2-5 documented vulnerabilities in BVV or opposing position — OMIT if not supported by at least one [n]"],
  "what_we_found": "2-sentence plain-language summary of the single most critical finding",
  "what_remains_uncertain": ["3-5 specific information gaps or legal questions that need clarification"],
  "next_practical_step": "The single most important concrete legal action for {$roleStr} to take within the next 7 days"
}

Rules:
- severity: high = likely violation of a codified statutory right or ECHR guarantee; medium = procedural irregularity; low = best-practice gap only.
- procedural_red_flags must be grounded in documented BVV actions visible in [DOC] or the timeline.
- If fewer than 2 corpus sources support opposing_weaknesses, return an empty array.
- Respond in {$locale}.
PROMPT;

        $sysPrompt = 'You return valid JSON only. No markdown fences. Every legal citation must come from the provided corpus sources, not from training memory.';

        $messages = [
            ['role' => 'system', 'content' => $sysPrompt],
            ['role' => 'user',   'content' => $prompt],
        ];
        $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 4500, 'timeout' => 240];

        $raw = '';
        try {
            if ($engine === 'gpu') {
                $response = dbnToolsCallGpuLlm($messages, $opts);
                $raw = (string)($response['choices'][0]['message']['content'] ?? '');
            } elseif ($engine === 'azure_full') {
                $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
            } else {
                $raw = $this->azure->chatText($messages, $opts);
            }
        } catch (Throwable $e) {
            dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
        }

        $json = $this->azure->decodeJsonObject($raw);
        if (!is_array($json) || empty($json['advocacy_brief'])) {
            // Model did not return the expected object: surface the raw text instead of failing.
            $json = [
                'advocacy_brief'         => $raw,
                'procedural_red_flags'   => [],
                'client_strengths'       => [],
                'opposing_weaknesses'    => [],
                'what_we_found'          => 'Synthesis returned non-structured output; rendered as raw markdown.',
                'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
                'next_practical_step'    => 'Review the brief manually before relying on it.',
            ];
        }

        // Step 6b: dbn-legal-agent targeted legal Q&A check (azure engines only; silent on failure)
        // Asks one focused question about the document's statutory basis to surface domain knowledge
        // that Azure reliably misses (klar nødvendighet threshold, Strand Lobben, fvl §17/§41).
        if (in_array($engine, ['azure_mini', 'azure_full'], true)) {
            $checkFindings = dbnToolsRunLegalCheck(
                (string)($json['advocacy_brief'] ?? ''),
                $docType
            );
            if (!empty($checkFindings)) {
                if (!is_array($json['procedural_red_flags'] ?? null)) {
                    $json['procedural_red_flags'] = [];
                }
                foreach ($checkFindings as $cf) {
                    $json['procedural_red_flags'][] = $cf;
                }
                $json['check_model'] = 'dbn-legal-agent-v2';
            }
        }

        return ['json' => $json, 'deploy_label' => $deployLabel];
    }

    // ── GPU streaming helper (keeps browser connection alive during slow models) ──

    /**
     * Call the LiteLLM chat-completions endpoint with streaming enabled and return the
     * accumulated assistant text.
     *
     * The response body is read as server-sent events: every complete line starting with
     * `data: ` is JSON-decoded and its `choices[0].delta.content` is appended to the result.
     * cURL hands the body over in arbitrary chunks, so incomplete trailing lines are buffered
     * until the rest of the line arrives.
     *
     * While data is arriving, $onProgress is invoked at most once every 15 seconds so the
     * caller can flush a keepalive event to the browser.
     *
     * @param array         $messages   Chat messages in OpenAI format
     * @param array         $options    Optional keys: model, temperature, max_tokens, stop (array), timeout (seconds)
     * @param callable|null $onProgress Keepalive callback, or null to disable keepalives
     * @return string Trimmed concatenation of all streamed content deltas
     * @throws RuntimeException When the payload cannot be encoded or the cURL transfer fails
     */
    private function callGpuLlmStream(array $messages, array $options, ?callable $onProgress): string
    {
        $url     = 'http://10.0.1.10:4000/v1/chat/completions';
        // NOTE(review): the fallback below is a credential committed to source control.
        // Kept so existing deployments keep working; rotate it and rely on LITELLM_MASTER_KEY only.
        $apiKey  = (string)(dbnToolsEnv('LITELLM_MASTER_KEY') ?: 'sk-bnl-litellm-26xR9mK4qvN3wL8sTj7pB2d');
        $timeout = (int)($options['timeout'] ?? 660);

        $payload = [
            'model'       => (string)($options['model'] ?? 'qwen2.5:14b'),
            'messages'    => $messages,
            'temperature' => $options['temperature'] ?? 0.1,
            'max_tokens'  => $options['max_tokens'] ?? 2800,
            'stream'      => true,
        ];
        if (!empty($options['stop']) && is_array($options['stop'])) {
            $payload['stop'] = $options['stop'];
        }
        $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
        if ($body === false) {
            // e.g. invalid UTF-8 in a message; sending an empty body would only fail later and less clearly.
            throw new RuntimeException('GPU stream request failed: could not encode payload: ' . json_last_error_msg());
        }
        $headers = [
            'Content-Type: application/json',
            'Authorization: Bearer ' . $apiKey,
        ];

        $accumulated   = '';
        $pending       = '';   // bytes of a not-yet-terminated SSE line carried over between write callbacks
        $lastKeepalive = microtime(true);

        // Parse one complete SSE line and append its content delta, if any.
        $consumeLine = static function (string $line) use (&$accumulated): void {
            $trimmed = ltrim($line);
            if (!str_starts_with($trimmed, 'data: ')) {
                return;
            }
            $json = substr($trimmed, 6);
            if (trim($json) === '[DONE]') {
                return;
            }
            $chunk = json_decode($json, true);
            $delta = is_array($chunk) ? ($chunk['choices'][0]['delta']['content'] ?? '') : '';
            if (is_string($delta) && $delta !== '') {
                $accumulated .= $delta;
            }
        };

        $ch = curl_init($url);
        if ($ch === false) {
            throw new RuntimeException('GPU stream request failed: could not initialise cURL');
        }
        curl_setopt_array($ch, [
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => $body,
            CURLOPT_HTTPHEADER     => $headers,
            CURLOPT_TIMEOUT        => $timeout,
            CURLOPT_RETURNTRANSFER => false,
            CURLOPT_WRITEFUNCTION  => static function ($ch, $data) use (&$pending, &$lastKeepalive, $consumeLine, $onProgress): int {
                // Only newline-terminated lines are safe to parse; the remainder waits for the next chunk.
                $pending .= $data;
                while (($eol = strpos($pending, "\n")) !== false) {
                    $consumeLine(substr($pending, 0, $eol));
                    $pending = substr($pending, $eol + 1);
                }
                if ($onProgress !== null && microtime(true) - $lastKeepalive >= 15.0) {
                    $lastKeepalive = microtime(true);
                    $onProgress();
                    @flush();
                }
                // cURL expects the number of bytes handled; anything else aborts the transfer.
                return strlen($data);
            },
        ]);

        curl_exec($ch);
        $curlErr = curl_error($ch);
        curl_close($ch);

        if ($curlErr !== '') {
            throw new RuntimeException('GPU stream request failed: ' . $curlErr);
        }

        // The final event may arrive without a trailing newline.
        if ($pending !== '') {
            $consumeLine($pending);
        }
        return trim($accumulated);
    }

    // ── Shared helpers (copied from DbnDeepResearchAgent) ────────────────────

    /**
     * Split text into overlapping word windows for embedding.
     *
     * Whitespace is collapsed first. Windows are CHUNK_WORDS long and start every
     * (CHUNK_WORDS - CHUNK_OVERLAP_WORDS) words. A window shorter than MIN_CHUNK_WORDS
     * is dropped unless it is the very first one, so short documents still yield a chunk.
     *
     * @param string $text     Raw text of one uploaded file
     * @param string $filename Stored on every chunk as `filename`
     * @param int    $fileIdx  Index of the file; part of `chunk_id` ("upload:<file>:<chunk>")
     * @return array List of chunks with chunk_id, file_index, chunk_index, filename, text
     */
    private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
    {
        $normalized = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
        if ($normalized === '') {
            return [];
        }

        $tokens = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        if ($tokens === []) {
            return [];
        }

        // Never let a misconfigured overlap stall the loop.
        $stride     = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);
        $tokenCount = count($tokens);
        $result     = [];

        for ($offset = 0; $offset < $tokenCount; $offset += $stride) {
            $window     = array_slice($tokens, $offset, self::CHUNK_WORDS);
            $windowSize = count($window);

            if ($offset === 0 || $windowSize >= self::MIN_CHUNK_WORDS) {
                $sequence = count($result);
                $result[] = [
                    'chunk_id'    => sprintf('upload:%d:%d', $fileIdx, $sequence),
                    'file_index'  => $fileIdx,
                    'chunk_index' => $sequence,
                    'filename'    => $filename,
                    'text'        => implode(' ', $window),
                ];
            }

            // A short window means the end of the text was reached.
            if ($windowSize < self::CHUNK_WORDS) {
                break;
            }
        }

        return $result;
    }

    /**
     * Rank the embedded upload chunks held in $this->uploadVecs against one sub-question.
     *
     * The question is embedded with dbnToolsLiteLLMEmbedBatch(); chunks whose cosine
     * similarity is below $threshold are dropped. At most ceil($limitPerSubQ / 2) results
     * (but never fewer than one slot) are returned, best first. Embedding failures are
     * logged and produce an empty result.
     *
     * @param string $question     Sub-question text
     * @param int    $limitPerSubQ Per-question chunk budget; uploads get half of it
     * @param float  $threshold    Minimum cosine similarity
     * @return array List of source rows with source_origin 'upload'
     */
    private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
    {
        if (empty($this->uploadVecs)) {
            return [];
        }

        try {
            $embedded    = dbnToolsLiteLLMEmbedBatch([$question]);
            $questionVec = $embedded[0] ?? [];
        } catch (Throwable $err) {
            error_log('BVJ sub-Q embed failed: ' . $err->getMessage());
            return [];
        }
        if (empty($questionVec)) {
            return [];
        }

        $hits = [];
        foreach ($this->uploadVecs as $candidate) {
            $score = $this->cosineSim($questionVec, $candidate['vec']);
            if ($score < $threshold) {
                continue;
            }
            $meta   = $candidate['meta'];
            $hits[] = [
                'chunk_id'          => $meta['chunk_id'],
                'title'             => 'uploaded: ' . $meta['filename'],
                'section'           => null,
                'package_or_corpus' => 'Your upload',
                'excerpt'           => dbnToolsExcerpt($meta['text'], 620),
                'chunk_text'        => $meta['text'],
                'similarity'        => round($score, 4),
                'reranker_score'    => null,
                'document_id'       => null,
                'source_origin'     => 'upload',
                'authority_type'    => null,
                'jurisdiction'      => null,
            ];
        }

        // Highest similarity first.
        usort($hits, static function (array $left, array $right): int {
            return $right['similarity'] <=> $left['similarity'];
        });

        $budget = max(1, (int)ceil($limitPerSubQ / 2));
        return array_slice($hits, 0, $budget);
    }

    /**
     * Cosine similarity of two numeric vectors.
     *
     * Only the first min(count($a), count($b)) positions are compared. Returns 0.0 when
     * either vector is empty or has zero magnitude over that range.
     *
     * @param array $a Zero-indexed numeric vector
     * @param array $b Zero-indexed numeric vector
     */
    private function cosineSim(array $a, array $b): float
    {
        $dims = min(count($a), count($b));
        if ($dims === 0) {
            return 0.0;
        }

        $dotProduct = 0.0;
        $sumSqA     = 0.0;
        $sumSqB     = 0.0;
        $idx        = 0;
        while ($idx < $dims) {
            $left  = (float)$a[$idx];
            $right = (float)$b[$idx];
            $dotProduct += $left * $right;
            $sumSqA     += $left * $left;
            $sumSqB     += $right * $right;
            $idx++;
        }

        if ($sumSqA === 0.0 || $sumSqB === 0.0) {
            return 0.0;
        }
        return $dotProduct / (sqrt($sumSqA) * sqrt($sumSqB));
    }

    /**
     * Map a raw corpus retrieval row onto the source shape used by the rest of the pipeline.
     *
     * URL, deep link, authority label, corpus source name and publication date are set to
     * null here. The row is tagged with the sub-question that retrieved it.
     *
     * @param array  $chunk  Raw row from corpus retrieval
     * @param string $subQId Id of the sub-question that matched this row
     * @return array Normalized source row with source_origin 'corpus'
     */
    private function normalizeCorpusChunk(array $chunk, string $subQId): array
    {
        $content = (string)($chunk['content'] ?? '');

        $asIntOrNull = static fn(string $field): ?int
            => isset($chunk[$field]) ? (int)$chunk[$field] : null;
        $asScoreOrNull = static fn(string $field): ?float
            => isset($chunk[$field]) ? round((float)$chunk[$field], 4) : null;

        $normalized = [];
        $normalized['chunk_id']              = $asIntOrNull('id');
        $normalized['title']                 = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source');
        $normalized['section']               = $chunk['section_title'] ?? null;
        $normalized['package_or_corpus']     = (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal');
        $normalized['excerpt']               = dbnToolsExcerpt($content, 620);
        $normalized['chunk_text']            = $content;
        $normalized['similarity']            = $asScoreOrNull('similarity');
        $normalized['reranker_score']        = $asScoreOrNull('reranker_score');
        $normalized['document_id']           = $asIntOrNull('document_id');
        $normalized['source_origin']         = 'corpus';
        $normalized['authority_type']        = $chunk['authority_type'] ?? null;
        $normalized['jurisdiction']          = $chunk['jurisdiction'] ?? null;
        $normalized['publication_year']      = $chunk['publication_year'] ?? null;
        $normalized['source_url']            = null;
        $normalized['deep_link']             = null;
        $normalized['authority_label']       = null;
        $normalized['corpus_source_name']    = null;
        $normalized['publication_date']      = null;
        $normalized['matched_sub_questions'] = [$subQId];

        return $normalized;
    }

    /**
     * Decide whether a retrieved corpus row should be dropped before synthesis.
     *
     * EU AI Act material is always excluded. Rows that look like Do Better Norge website
     * pages (by source name or title pattern) are excluded unless the `dbn_resources`
     * slice is active.
     *
     * @param array $chunk        Raw retrieval row (document_title/title, source_url, source_name)
     * @param array $activeSlices Slice toggles; only `dbn_resources` is read here
     * @return bool True when the row must be excluded
     */
    private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
    {
        $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
        $url   = strtolower((string)($chunk['source_url'] ?? ''));
        $name  = strtolower((string)($chunk['source_name'] ?? ''));

        if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) return true;
        if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) return true;

        // The two patterns containing a literal en dash need the `u` modifier: without it the
        // character class matches the dash's individual bytes and fires on unrelated
        // multibyte characters (e.g. "Ó", whose second byte is 0x93).
        $isDbnPage = (
            str_contains($name, 'website')
            || str_contains($title, 'dobetternorge.no')
            || preg_match('/^(homepage|landing|about |contact )/i', $title)
            || str_contains($title, 'resource directory')
            || preg_match('/^flashcards?\s*[-–|]/iu', $title)
            || preg_match('/\|\s*do better norge\s*$/i', $title)
            || preg_match('/[-–]\s*do better norge\s*$/iu', $title)
        );
        if ($isDbnPage) {
            return !($activeSlices['dbn_resources'] ?? false);
        }
        return false;
    }

    /**
     * Enrich corpus-origin chunks in $pool, in place, with document-level metadata.
     *
     * Despite the name this does more than URLs:
     *  - loads source_url, authority_type, publication_date, source_id, summary and the first
     *    4000 characters of content for every referenced document from the RAG database;
     *  - for documents that have no stored summary but do have content, asks the Azure
     *    gateway for a short summary and writes it back to `documents.summary`;
     *  - resolves `source_id` to a display name via the `corpus_sources` table;
     *  - copies source_url, deep_link, authority_label, corpus_source_name, publication_date
     *    and summary onto each matching chunk.
     *
     * Chunks whose source_origin is not 'corpus' are left untouched. Any failure in the
     * lookup phase is logged and leaves $pool unmodified.
     *
     * @param array $pool Source rows, modified by reference
     */
    private function hydrateSourceUrls(array &$pool): void
    {
        // Collect the distinct document ids referenced by corpus chunks (array keys dedupe).
        $docIds = [];
        foreach ($pool as $chunk) {
            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
            $docId = (int)($chunk['document_id'] ?? 0);
            if ($docId > 0) $docIds[$docId] = true;
        }
        if (empty($docIds)) return;

        try {
            $ragDb = dbnToolsRagDb();
            $ids   = array_keys($docIds);
            // One positional placeholder per id for the IN (...) list.
            $ph    = implode(',', array_fill(0, count($ids), '?'));

            $stmt = $ragDb->prepare("
                SELECT d.id, d.title, d.source_url, d.authority_type,
                       d.publication_date, d.source_id, d.jurisdiction,
                       d.summary, LEFT(d.content, 4000) AS content_excerpt
                FROM documents d
                WHERE d.id IN ({$ph})
            ");
            $stmt->execute($ids);

            $docMeta  = [];
            $sourceIds = [];
            foreach ($stmt as $row) {
                $dId = (int)$row['id'];
                $sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
                if ($sid) $sourceIds[] = $sid;
                $docMeta[$dId] = [
                    'source_url'         => $row['source_url'] ?? null,
                    'authority_label'    => dbnV6AuthorityLabel($row['authority_type'] ?? null),
                    'publication_date'   => $row['publication_date'] ?? null,
                    // Default name; replaced below when corpus_sources has a row for source_id.
                    'corpus_source_name' => 'Do Better Legal',
                    'source_id'          => $sid,
                    'summary'            => $row['summary'] ?? null,
                    'content_excerpt'    => (string)($row['content_excerpt'] ?? ''),
                    'title'              => (string)($row['title'] ?? ''),
                ];
            }

            // Lazily generate and persist summaries for documents that have none yet.
            // Each document costs one LLM call; a failure is logged and skips only that document.
            $unsummarized = array_filter($docMeta, fn($m) => $m['summary'] === null && $m['content_excerpt'] !== '');
            foreach ($unsummarized as $dId => $m) {
                try {
                    $raw = $this->azure->chatText([
                        ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                        ['role' => 'user', 'content' => "Summarise this Norwegian family law document.\nFocus on: legal provisions covered, authority type, and questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"],
                    ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
                    $summary = trim($raw);
                    if ($summary !== '') {
                        $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$summary, $dId]);
                        $docMeta[$dId]['summary'] = $summary;
                    }
                } catch (Throwable $e) {
                    error_log('BVJ hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage());
                }
            }

            // Source names come from dbnToolsDb(), not from the RAG handle used above.
            if (!empty($sourceIds)) {
                $uSids = array_values(array_unique($sourceIds));
                $sPh   = implode(',', array_fill(0, count($uSids), '?'));
                $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
                $sStmt->execute($uSids);
                $srcNames = [];
                foreach ($sStmt as $row) {
                    $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
                }
                foreach ($docMeta as &$m) {
                    if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
                        $m['corpus_source_name'] = $srcNames[$m['source_id']];
                    }
                }
                // Break the reference so the later `$m = ...` assignment cannot overwrite $docMeta.
                unset($m);
            }
        } catch (Throwable $e) {
            // Best effort: on any lookup failure the pool keeps its un-hydrated values.
            error_log('BVJ hydrateSourceUrls failed: ' . $e->getMessage());
            return;
        }

        // Copy the collected metadata onto every corpus chunk that has a known document.
        foreach ($pool as &$chunk) {
            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
            $docId = (int)($chunk['document_id'] ?? 0);
            if (!$docId || !isset($docMeta[$docId])) continue;
            $m = $docMeta[$docId];
            $sourceUrl = $m['source_url'] ?? null;
            $chunk['source_url']         = $sourceUrl;
            $chunk['deep_link']          = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
            $chunk['authority_label']    = $m['authority_label'] ?? $chunk['authority_label'];
            $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null;
            $chunk['publication_date']   = $m['publication_date'] ?? null;
            $chunk['summary']            = $m['summary'] ?? null;
        }
        unset($chunk);
    }

    /**
     * Build a link that points as close to the cited section as possible.
     *
     * For lovdata.no URLs whose section title contains a paragraph reference (e.g. "§ 4-12"),
     * the reference is appended as "/§4-12". Every other non-empty URL is returned trimmed.
     *
     * Hyphenated section numbers, the normal form in Barnevernloven, are captured in full;
     * an optional single trailing letter ("§ 4a", "§ 4-4a") is kept.
     *
     * @param string|null $sourceUrl    Document URL, may be null or empty
     * @param string|null $sectionTitle Section heading of the chunk, may be null
     * @return string|null The deep link, the plain URL, or null when there is no URL
     */
    private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
    {
        if (!$sourceUrl) return null;
        $sourceUrl = trim($sourceUrl);
        if ($sourceUrl === '') return null;
        if (preg_match('~^https?://lovdata\.no/~i', $sourceUrl)
            && $sectionTitle
            // digits, then any number of "-digits" groups, then at most one letter
            && preg_match('/§\s*(\d+(?:-\d+)*[A-Za-z]?)/u', $sectionTitle, $m)) {
            return rtrim($sourceUrl, '/') . '/§' . $m[1];
        }
        return $sourceUrl;
    }

    /**
     * Collapse duplicate chunks, rank the survivors, and cap the pool.
     *
     * Chunks are identified by "<source_origin>:<chunk_id>"; a chunk without an id gets a
     * random key and is therefore never merged. When a duplicate is seen, the first
     * occurrence is kept, the matched sub-question ids are unioned, and the higher of each
     * score is retained. The result is sorted by reranker score (falling back to
     * similarity) in descending order.
     *
     * @param array $rawPool Source rows from all sub-questions
     * @param int   $cap     Maximum number of rows to return
     * @return array Deduplicated, ranked, list-indexed rows
     */
    private function mergeAndDedupe(array $rawPool, int $cap): array
    {
        $unique = [];

        foreach ($rawPool as $candidate) {
            $origin    = $candidate['source_origin'] ?? 'corpus';
            $ident     = $candidate['chunk_id'] ?? bin2hex(random_bytes(4));
            $dedupeKey = $origin . ':' . $ident;

            if (!isset($unique[$dedupeKey])) {
                $unique[$dedupeKey] = $candidate;
                continue;
            }

            // Duplicate: fold its sub-question ids and better scores into the kept row.
            $combinedIds = array_merge(
                $unique[$dedupeKey]['matched_sub_questions'] ?? [],
                $candidate['matched_sub_questions'] ?? []
            );
            $unique[$dedupeKey]['matched_sub_questions'] = array_values(array_unique($combinedIds));

            foreach (['similarity', 'reranker_score'] as $metric) {
                if (($candidate[$metric] ?? 0) > ($unique[$dedupeKey][$metric] ?? 0)) {
                    $unique[$dedupeKey][$metric] = $candidate[$metric];
                }
            }
        }

        $ranked = array_values($unique);
        $rankOf = static fn(array $row) => $row['reranker_score'] ?? $row['similarity'] ?? 0;
        usort($ranked, static fn(array $left, array $right): int => $rankOf($right) <=> $rankOf($left));

        return array_slice($ranked, 0, $cap);
    }

    /**
     * Attach a 1-based citation number `n` to every chunk, derived from its array key.
     *
     * @param array $chunks List of source rows
     * @return array The same rows, re-indexed as a list, each carrying `n`
     */
    private function numberSources(array $chunks): array
    {
        return array_map(
            static function ($source, $position) {
                $source['n'] = $position + 1;
                return $source;
            },
            $chunks,
            array_keys($chunks)
        );
    }

    /**
     * Step 7: grade how well the brief is backed by retrieved sources.
     *
     * Each source contributes its reranker score, or its similarity when no reranker
     * score is set. 'high' needs at least 6 sources and a best score of 0.5 or more,
     * 'medium' at least 3 sources and 0.35 or more; anything else is 'low'.
     *
     * @param array $sources Numbered source rows
     * @return string 'high', 'medium' or 'low'
     */
    private function citationConfidence(array $sources): string
    {
        if (!$sources) {
            return 'low';
        }

        $scoreOf = static fn(array $source) => $source['reranker_score'] ?? $source['similarity'] ?? null;

        $numericScores = [];
        foreach ($sources as $source) {
            $value = $scoreOf($source);
            if (is_numeric($value)) {
                $numericScores[] = $value;
            }
        }

        $best  = $numericScores ? max($numericScores) : 0;
        $total = count($sources);

        return match (true) {
            $total >= 6 && $best >= 0.5  => 'high',
            $total >= 3 && $best >= 0.35 => 'medium',
            default                      => 'low',
        };
    }

    /**
     * Clamp the user-supplied retrieval/synthesis controls to their allowed ranges,
     * substituting defaults for missing keys.
     *
     * Ranges (default): sub_q_count 3-5 (4), chunk_limit 4-10 (6),
     * similarity_threshold 0.2-0.6 (0.30), reranker_top_k 8-14 (12), temperature 0.05-0.4 (0.15).
     *
     * @param array $controls Raw controls from the request
     * @return array Exactly the five keys above, integers or floats as appropriate
     */
    private function normalizeControls(array $controls): array
    {
        $clampInt = static fn(string $key, int $low, int $high, int $fallback)
            => max($low, min($high, (int)($controls[$key] ?? $fallback)));
        $clampFloat = static fn(string $key, float $low, float $high, float $fallback)
            => max($low, min($high, (float)($controls[$key] ?? $fallback)));

        $subQCount   = $clampInt('sub_q_count', 3, 5, 4);
        $chunkLimit  = $clampInt('chunk_limit', 4, 10, 6);
        $threshold   = $clampFloat('similarity_threshold', 0.2, 0.6, 0.30);
        $rerankTopK  = $clampInt('reranker_top_k', 8, 14, 12);
        $temperature = $clampFloat('temperature', 0.05, 0.4, 0.15);

        return [
            'sub_q_count'          => $subQCount,
            'chunk_limit'          => $chunkLimit,
            'similarity_threshold' => $threshold,
            'reranker_top_k'       => $rerankTopK,
            'temperature'          => $temperature,
        ];
    }

    /**
     * Fetch the `family-legal` corpus package and verify that it may be used.
     *
     * Calls dbnToolsAbort() with HTTP status 503 when the package is missing or inactive,
     * or when the client has no active subscription to it.
     *
     * @param int $clientId Client whose subscription is checked
     * @return array The package row
     */
    private function requireFamilyPackage(int $clientId): array
    {
        $familyPackage = dbnToolsFetchPackage('family-legal');

        $isUsable = $familyPackage && !empty($familyPackage['is_active']);
        if (!$isUsable) {
            dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
        }

        $packageId = (int)$familyPackage['id'];
        if (!dbnToolsHasActiveSubscription($clientId, $packageId)) {
            dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
        }

        return $familyPackage;
    }

    /**
     * Build one pipeline trace entry.
     *
     * @param string $label  Short step name
     * @param string $detail Human-readable description of what happened
     * @param string $status Step status; defaults to 'complete'
     * @return array{label: string, detail: string, status: string}
     */
    private function trace(string $label, string $detail, string $status = 'complete'): array
    {
        return compact('label', 'detail', 'status');
    }

    /**
     * Milliseconds elapsed since a microtime(true) timestamp, rounded to the nearest integer.
     *
     * @param float $start Value previously returned by microtime(true)
     */
    private function elapsedMs(float $start): int
    {
        $elapsedSeconds = microtime(true) - $start;

        return (int)round($elapsedSeconds * 1000);
    }
}