dobetternorge-tools/includes/BvjAnalyzerAgent.php

<?php
declare(strict_types=1);

require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';

/**
 * BVJ (Barnevernet) Analyzer Agent
 *
 * Standalone 7-step pipeline that:
 *  1. Classifies the uploaded document and extracts metadata
 *  2. Extracts all named parties with roles
 *  3. Builds a chronological timeline of events
 *  4. Generates partisan sub-questions for corpus RAG
 *  5. Retrieves from the legal corpus (hybrid dense+BM25)
 *  6. Synthesises an advocacy brief + procedural red flags
 *  7. Assesses citation confidence
 *
 * Steps 1-3 always use azure_mini regardless of the user's engine choice.
 * Step 6 (synthesis) uses the user's selected engine.
 */
final class DbnBvjAnalyzerAgent
{
    private const MAX_DOC_CHARS        = 64000;
    private const CHUNK_WORDS          = 600;
    private const CHUNK_OVERLAP_WORDS  = 75;
    private const MIN_CHUNK_WORDS      = 50;
    private const POOL_CAP             = 30;
    // Steps 1-3 always use this engine — fast and cheap for structured extraction
    private const EXTRACT_ENGINE       = 'azure_mini';

    private DbnAzureOpenAiGateway $azure;
    private array $uploadVecs  = [];
    private array $stepTimings = [];

    /**
     * Store the gateway used for the chat-completion calls made by this agent.
     *
     * @param DbnAzureOpenAiGateway|null $azure Gateway to use; a fresh default
     *                                          instance is created when omitted.
     */
    public function __construct(?DbnAzureOpenAiGateway $azure = null)
    {
        if ($azure === null) {
            $azure = new DbnAzureOpenAiGateway();
        }

        $this->azure = $azure;
    }

    /**
     * Main pipeline. At least 1 uploaded file is required.
     *
     * Executes, in order: document classification, party extraction, timeline
     * extraction, sub-question generation, slice resolution, upload indexing,
     * per-sub-question retrieval, synthesis and citation confidence. Each timed
     * stage records its wall-clock duration in $this->stepTimings and reports
     * progress through $emit when a callback is supplied.
     *
     * Request validation (uploads present, extractable text, at least one active
     * corpus slice) happens before the first model call, so an invalid request
     * does not pay for classification/extraction first.
     *
     * NOTE(review): dbnToolsAbort() is assumed to terminate the request (it is
     * never followed by a return here) — confirm against its definition.
     *
     * @param array     $uploadedFiles   [{filename, text, chars, truncated}]
     * @param string    $advocateRole    Party the user represents
     * @param string    $engine          Affects synthesis only: azure_mini|azure_full|gpu
     * @param string    $language        'en' or 'no'
     * @param array     $sliceSelection  Corpus slice toggles
     * @param array     $controls        sub_q_count, chunk_limit, similarity_threshold, reranker_top_k, temperature
     * @param string    $additionalNotes Optional user context to supplement the document
     * @param callable|null $emit        function(string $event, array $payload): void
     *
     * @return array Result payload (doc_meta, parties, timeline, advocacy_brief,
     *               red flags, sub_questions, sources, trace, trace_metadata, disclaimer).
     */
    public function run(
        array     $uploadedFiles,
        string    $advocateRole,
        string    $engine,
        string    $language,
        array     $sliceSelection,
        array     $controls,
        string    $additionalNotes = '',
        ?callable $emit = null
    ): array {
        // Unknown engine names silently fall back to the cheapest engine.
        $engine   = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true)
                    ? $engine : 'azure_mini';
        $language = dbnToolsNormalizeUiLanguage($language);
        $controls = $this->normalizeControls($controls);

        if (empty($uploadedFiles)) {
            dbnToolsAbort('Upload at least one BVJ document before running the analyzer.', 422, 'no_uploads');
        }

        $client  = dbnToolsRequireClient();
        $package = $this->requireFamilyPackage((int)$client['id']);

        dbnToolsBootCaveau();
        $aiPortalRoot = dbnToolsAiPortalRoot();
        require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

        // Reset per-run state so a reused agent instance does not leak data between runs.
        $this->uploadVecs  = [];
        $this->stepTimings = [];
        $trace             = [];

        // Records a finished step in the trace and forwards it to the caller.
        $emitStep = function (string $stepId, string $label, string $detail, string $status)
                    use (&$trace, $emit): void {
            $trace[] = $this->trace($label, $detail, $status);
            if ($emit) {
                $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => $status]);
            }
        };
        // "Running" notifications are progress-only and are not written to the trace.
        $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
            if ($emit) {
                $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => 'running']);
            }
        };
        // Model-produced metadata may be non-scalar; only scalars are safe to print.
        $metaText = static function (mixed $value): string {
            return is_scalar($value) ? trim((string)$value) : '';
        };

        // Build combined document text (first file is primary; additional files appended)
        $docText = '';
        foreach ($uploadedFiles as $idx => $file) {
            $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
            if ($text === '') continue;
            $filename = (string)($file['filename'] ?? sprintf('document-%d', $idx + 1));
            $docText .= ($docText !== '' ? "\n\n--- Document: {$filename} ---\n\n" : '') . $text;
        }
        if ($docText === '') {
            dbnToolsAbort('Could not extract text from the uploaded file(s).', 422, 'empty_document');
        }
        $docText = mb_substr($docText, 0, self::MAX_DOC_CHARS * 2, 'UTF-8');

        // Fail fast: reject a run with no active corpus slice before spending any LLM calls.
        $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
        if (!array_filter($sliceSelectionNormalized)) {
            dbnToolsAbort('Enable at least one corpus slice before running the analyzer.', 422, 'no_slices');
        }

        $roleLabel = $advocateRole ?: 'selected role';

        // ── STEP 1: Document classification ────────────────────────────────────
        $emitRunning('doc_classify', 'Document classification', 'Classifying document and extracting metadata…');
        $stepStart = microtime(true);
        $docMeta = $this->classifyDocument($docText, $language);
        $this->stepTimings['doc_classify'] = $this->elapsedMs($stepStart);
        if ($emit) {
            $emit('doc_meta', ['result' => $docMeta]);
        }
        $docTypeBadge = $metaText($docMeta['doc_type'] ?? null);
        if ($docTypeBadge === '') {
            $docTypeBadge = 'BVJ Document';
        }
        // Join only the parts that exist so the detail never carries a dangling " · ".
        $classifyParts = [$docTypeBadge];
        $authStr = $metaText($docMeta['issuing_authority'] ?? null);
        if ($authStr !== '') {
            $classifyParts[] = $authStr;
        }
        $refNo = $metaText($docMeta['reference_number'] ?? null);
        if ($refNo !== '') {
            $classifyParts[] = 'ref ' . $refNo;
        }
        $emitStep('doc_classify', 'Document classification', implode(' · ', $classifyParts), 'complete');

        // ── STEP 2: Party extraction ────────────────────────────────────────────
        $emitRunning('party_extract', 'Party extraction', 'Identifying all named parties and their roles…');
        $stepStart = microtime(true);
        $parties = $this->extractParties($docText, $language);
        $this->stepTimings['party_extract'] = $this->elapsedMs($stepStart);
        if ($emit) {
            $emit('parties', ['parties' => $parties]);
        }
        $emitStep('party_extract', 'Party extraction',
            sprintf('%d %s identified.', count($parties), count($parties) === 1 ? 'party' : 'parties'),
            'complete');

        // ── STEP 3: Timeline extraction ─────────────────────────────────────────
        $emitRunning('timeline_extract', 'Timeline extraction', 'Building chronological event timeline…');
        $stepStart = microtime(true);
        $timelineEvents = $this->extractTimeline($docText, $language);
        $this->stepTimings['timeline_extract'] = $this->elapsedMs($stepStart);
        if ($emit) {
            $emit('timeline', ['events' => $timelineEvents]);
        }
        $highCount = count(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
        $emitStep('timeline_extract', 'Timeline extraction',
            sprintf('%d events extracted (%d high-significance).', count($timelineEvents), $highCount),
            'complete');

        // ── STEP 4: Sub-question generation ────────────────────────────────────
        $emitRunning('sub_question_gen', 'Sub-question generation',
            sprintf('Generating %d research angles for %s…', $controls['sub_q_count'], $roleLabel));
        $stepStart = microtime(true);
        $subQuestions = $this->generateSubQuestions(
            $docMeta, $parties, $timelineEvents,
            $advocateRole, $controls['sub_q_count'], $language
        );
        $this->stepTimings['sub_question_gen'] = $this->elapsedMs($stepStart);
        $emitStep('sub_question_gen', 'Sub-question generation',
            sprintf('%d sub-questions generated for %s.', count($subQuestions), $roleLabel),
            'complete');

        // ── STEP 5: Slice resolution + upload indexing + corpus retrieval ───────
        $emitRunning('slice_resolution', 'Slice resolution', 'Resolving corpus slice toggles…');
        $stepStart = microtime(true);
        $ragDb = dbnToolsRagDb();
        try {
            $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
            $sliceDetail  = sprintf('%d slice(s) active → %d candidate documents.',
                count(array_filter($sliceSelectionNormalized)), count($sharedDocIds));
            $sliceStatus  = 'complete';
        } catch (Throwable $e) {
            // Best-effort: a failed resolve degrades to an unconstrained search rather than aborting.
            error_log('BVJ slice resolve failed: ' . $e->getMessage());
            $sharedDocIds = [];
            $sliceDetail  = 'Slice resolution failed; corpus search will run unconstrained.';
            $sliceStatus  = 'warning';
        }
        $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
        $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

        // Upload indexing
        $emitRunning('upload_indexing', 'Upload indexing',
            sprintf('Chunking + embedding %d file(s)…', count($uploadedFiles)));
        $stepStart    = microtime(true);
        $uploadChunks = [];
        foreach ($uploadedFiles as $idx => $file) {
            $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
            $text     = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
            $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
        }
        $uploadStatus = 'complete';
        $uploadDetail = sprintf('%d file(s) → %d in-memory chunks indexed.', count($uploadedFiles), count($uploadChunks));
        if ($uploadChunks) {
            try {
                $texts     = array_map(fn(array $c) => $c['text'], $uploadChunks);
                $textCount = count($texts);
                $allVecs   = [];
                $batchSz   = 5;
                for ($b = 0; $b < $textCount; $b += $batchSz) {
                    $batch = array_slice($texts, $b, $batchSz);
                    if ($emit) {
                        $emit('progress', ['detail' => sprintf(
                            'Embedding chunks %d–%d of %d…',
                            $b + 1, $b + count($batch), $textCount
                        )]);
                    }
                    $allVecs = array_merge($allVecs, dbnToolsLiteLLMEmbedBatch($batch));
                }
                // Vectors are paired with chunks by position, so the counts must line up exactly.
                if (count($allVecs) === count($uploadChunks)) {
                    foreach ($uploadChunks as $i => $chunk) {
                        $this->uploadVecs[] = ['meta' => $chunk, 'vec' => $allVecs[$i]];
                    }
                } else {
                    $uploadStatus = 'warning';
                    $uploadDetail = 'Upload embedding count mismatch; uploaded chunks will not participate in retrieval.';
                }
            } catch (Throwable $e) {
                error_log('BVJ upload embed failed: ' . $e->getMessage());
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding timed out; corpus-only retrieval will run.';
                $this->uploadVecs = [];
            }
        }
        $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
        $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

        // Corpus retrieval (per sub-question)
        $retrievalQueries = $subQuestions ?: [[
            'id'        => 'q1',
            'question'  => sprintf('%s case involving %s', $docTypeBadge,
                $advocateRole !== '' ? $advocateRole : 'the affected party'),
            'rationale' => 'Fallback query (sub-question generation returned empty).',
        ]];
        $emitRunning('retrieval', 'Corpus retrieval',
            sprintf('Hybrid vector + keyword across %d sub-question(s)…', count($retrievalQueries)));
        $stepStart = microtime(true);

        try {
            // NOTE(review): the endpoint is hard-coded to an internal address — consider moving it to configuration.
            $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
        } catch (Throwable $e) {
            // Keep the root cause in the log; the client only sees the generic message.
            error_log('BVJ RAG pipeline init failed: ' . $e->getMessage());
            dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
        }

        $rawPool = [];
        $retrievalWarnings = 0;
        $rawCorpusCount    = 0;
        $rawUploadCount    = 0;
        $filteredOutCount  = 0;

        foreach ($retrievalQueries as $idx => $sq) {
            if ($emit) {
                $emit('subq', [
                    'index'    => $idx + 1,
                    'total'    => count($retrievalQueries),
                    'id'       => $sq['id'],
                    'question' => $sq['question'],
                ]);
            }
            try {
                $corpusChunks = $rag->searchAll(
                    $sq['question'],
                    $controls['chunk_limit'],
                    null,
                    [
                        'search_private'          => false,
                        'search_shared'           => true,
                        'package_ids'             => [(int)$package['id']],
                        'shared_doc_ids'          => $sharedDocIds,
                        'chunk_limit'             => $controls['chunk_limit'],
                        'search_method'           => 'hybrid',
                        'reranker_enabled'        => true,
                        'include_beta_website'    => false,
                        'include_primary_website' => false,
                    ]
                );
            } catch (Throwable $e) {
                error_log('BVJ sub-Q retrieval failed: ' . $e->getMessage());
                $corpusChunks = [];
                $retrievalWarnings++;
            }
            $rawCorpusCount += count($corpusChunks);
            foreach ($corpusChunks as $chunk) {
                if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
                    $filteredOutCount++;
                    continue;
                }
                $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
            }
            if (!empty($this->uploadVecs)) {
                // Same best-effort policy as corpus retrieval: one failing query must not sink the run.
                try {
                    $uploadHits = $this->retrieveFromUploads(
                        $sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']
                    );
                } catch (Throwable $e) {
                    error_log('BVJ sub-Q upload retrieval failed: ' . $e->getMessage());
                    $uploadHits = [];
                    $retrievalWarnings++;
                }
                $rawUploadCount += count($uploadHits);
                foreach ($uploadHits as $hit) {
                    $hit['matched_sub_questions'] = [$sq['id']];
                    $rawPool[] = $hit;
                }
            }
        }

        $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
        $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
        $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
        $retrievalDetail = sprintf(
            '%d sub-Q(s) × hybrid → %d corpus (%d filtered) + %d upload → %d unique after dedupe.',
            count($retrievalQueries), $rawCorpusCount, $filteredOutCount, $rawUploadCount, count($merged)
        );
        $emitStep('retrieval', 'Corpus retrieval', $retrievalDetail, $retrievalStatus);

        $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
        $this->hydrateSourceUrls($synthesisPool);
        $numberedSources = $this->numberSources($synthesisPool);

        // Generate upload summaries — only for files that actually contributed a numbered
        // source, so no summary call is spent on a file nobody will see cited.
        // Assumes upload chunk ids look like "upload:<fileIndex>:…" — TODO confirm against splitIntoChunks().
        $uploadSourceFiles = [];
        foreach ($numberedSources as $pos => $src) {
            if (($src['source_origin'] ?? '') !== 'upload') continue;
            if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
                $uploadSourceFiles[$pos] = (int)$m[1];
            }
        }
        if ($uploadSourceFiles) {
            $uploadSummaries = [];
            foreach (array_unique($uploadSourceFiles) as $fileIdx) {
                $file = $uploadedFiles[$fileIdx] ?? null;
                if (!is_array($file)) continue;
                $text     = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8');
                $filename = (string)($file['filename'] ?? "file-{$fileIdx}");
                if ($text === '') continue;
                try {
                    $raw = $this->azure->chatText([
                        ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                        ['role' => 'user', 'content' => "Summarise this BVJ document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"],
                    ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
                    $uploadSummaries[$fileIdx] = trim($raw);
                } catch (Throwable $e) {
                    error_log('BVJ upload summary gen failed for file ' . $fileIdx . ': ' . $e->getMessage());
                    $uploadSummaries[$fileIdx] = null;
                }
            }
            foreach ($uploadSourceFiles as $pos => $fileIdx) {
                $numberedSources[$pos]['summary'] = $uploadSummaries[$fileIdx] ?? null;
            }
        }

        $retrievalCounts = [
            'raw_corpus'   => $rawCorpusCount,
            'filtered'     => $filteredOutCount,
            'raw_upload'   => $rawUploadCount,
            'after_dedupe' => count($merged),
            'after_topk'   => count($numberedSources),
        ];

        // ── STEP 6: Synthesis ───────────────────────────────────────────────────
        $engineLabel = match ($engine) {
            'azure_full' => 'Azure gpt-4o',
            'gpu'        => 'GPU qwen2.5:14b',
            default      => 'Azure gpt-4o-mini',
        };
        $emitRunning('synthesis', 'Synthesis',
            sprintf('Synthesising advocacy brief with %s…', $engineLabel));
        $stepStart = microtime(true);
        $synthesis = $this->synthesiseBvj(
            $docText, $docMeta, $parties, $timelineEvents,
            $subQuestions, $numberedSources,
            $advocateRole, $engine, $language, $controls['temperature'], $additionalNotes,
            $emit
        );
        $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
        $emitStep('synthesis', 'Synthesis',
            sprintf('%s synthesised advocacy brief using %d source(s) + document.',
                $synthesis['deploy_label'], count($numberedSources)),
            'complete');

        // ── STEP 7: Confidence ──────────────────────────────────────────────────
        $confidence = $this->citationConfidence($numberedSources);
        $emitStep('confidence', 'Citation confidence',
            sprintf('%s confidence based on %d source(s).', ucfirst($confidence), count($numberedSources)),
            $confidence === 'low' ? 'warning' : 'complete');

        // Build sub-question output with top_sources
        $subQOut = [];
        foreach ($retrievalQueries as $sq) {
            $matchedChunks = array_values(array_filter(
                $numberedSources,
                fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
            ));
            $topSources = array_slice($matchedChunks, 0, 3);
            $subQOut[] = [
                'id'          => $sq['id'],
                'question'    => $sq['question'],
                'rationale'   => $sq['rationale'] ?? '',
                'chunk_ids'   => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
                'top_sources' => array_map(fn(array $s) => [
                    'n'               => $s['n'] ?? null,
                    'title'           => $s['title'] ?? '',
                    'section'         => $s['section'] ?? null,
                    'deep_link'       => $s['deep_link'] ?? $s['source_url'] ?? null,
                    'source_url'      => $s['source_url'] ?? null,
                    'source_origin'   => $s['source_origin'] ?? 'corpus',
                    'authority_label' => $s['authority_label'] ?? null,
                    'excerpt'         => $s['excerpt'] ?? '',
                ], $topSources),
            ];
        }

        $synJson = is_array($synthesis['json'] ?? null) ? $synthesis['json'] : [];

        // Keep this field a list like its siblings: wrap a lone string, discard anything else.
        $uncertain = $synJson['what_remains_uncertain'] ?? [];
        if (is_string($uncertain)) {
            $uncertain = trim($uncertain) !== '' ? [trim($uncertain)] : [];
        } elseif (!is_array($uncertain)) {
            $uncertain = [];
        }

        return [
            'tool'                   => 'bvj_analyzer',
            'language'               => $language,
            'advocate_role'          => $advocateRole,
            'doc_meta'               => $docMeta,
            'parties'                => $parties,
            'timeline'               => ['events' => $timelineEvents],
            'advocacy_brief'         => (string)($synJson['advocacy_brief'] ?? ''),
            'procedural_red_flags'   => is_array($synJson['procedural_red_flags'] ?? null)
                                        ? $synJson['procedural_red_flags'] : [],
            'client_strengths'       => is_array($synJson['client_strengths'] ?? null)
                                        ? $synJson['client_strengths'] : [],
            'opposing_weaknesses'    => is_array($synJson['opposing_weaknesses'] ?? null)
                                        ? $synJson['opposing_weaknesses'] : [],
            'sub_questions'          => $subQOut,
            'sources'                => $numberedSources,
            'what_we_found'          => (string)($synJson['what_we_found'] ?? ''),
            'what_remains_uncertain' => $uncertain,
            'next_practical_step'    => (string)($synJson['next_practical_step'] ?? ''),
            'trace'                  => $trace,
            'trace_metadata'         => [
                'chunk_count'         => count($merged),
                'source_count'        => count($numberedSources),
                'sub_question_count'  => count($retrievalQueries),
                'upload_chunk_count'  => count($this->uploadVecs),
                'deployment'          => $synthesis['deploy_label'],
                'engine_used'         => $engine,
                'citation_confidence' => $confidence,
                'elapsed_ms_per_step' => $this->stepTimings,
                'retrieval_counts'    => $retrievalCounts,
                'slices_active'       => array_keys(array_filter($sliceSelectionNormalized)),
            ],
            'disclaimer' => dbnToolsDisclaimer($language),
        ];
    }

    // ── Step 1: Document classification ──────────────────────────────────────

    /**
     * Step 1 — ask the model for document-level metadata (type, date, issuing
     * authority, reference number, child info) from the first 6000 characters.
     *
     * The model's answer is sanitised before it is merged over the defaults:
     * only the five known fields are accepted, scalar values are trimmed and
     * cast to string, and nested arrays are flattened into a comma-separated
     * string. This keeps every field either null or a plain string, so callers
     * can concatenate it safely. Any failure is logged and yields the defaults.
     *
     * @param string $docText  Combined text of the uploaded document(s).
     * @param string $language UI language code, mapped to a language name for the prompt.
     *
     * @return array{doc_type: string, doc_date: ?string, issuing_authority: ?string, reference_number: ?string, child_info: ?string}
     */
    private function classifyDocument(string $docText, string $language): array
    {
        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 6000, 'UTF-8');

        $prompt = <<<PROMPT
You are analysing a Norwegian child welfare (Barnevernet) document.
Extract the following metadata from the document text below.

Return JSON only in {$locale}:
{
  "doc_type": "The document type as a short phrase, e.g. Bekymringsmelding, Vedtak, Omsorgsovertakelse, Fylkesnemnda-kjennelse, Rapport, or the detected type",
  "doc_date": "Primary document date in ISO 8601 format (YYYY-MM-DD) if identifiable, otherwise null",
  "issuing_authority": "Name of the issuing authority or institution, e.g. Trondheim kommune barneverntjeneste",
  "reference_number": "Case or document reference number if present, otherwise null",
  "child_info": "Brief description of the child(ren) involved, e.g. name and birth date if visible — anonymise if clearly redacted"
}

Rules:
- If a field cannot be determined, use null.
- doc_type should be the Norwegian term if recognisable (e.g. Bekymringsmelding), otherwise English.
- Do not invent information not present in the text.

Document text (first 6000 chars):
{$excerpt}
PROMPT;

        $default = [
            'doc_type'          => 'BVJ Document',
            'doc_date'          => null,
            'issuing_authority' => null,
            'reference_number'  => null,
            'child_info'        => null,
        ];

        // Reduce any model value to a plain string ('' when nothing usable is present).
        $toText = static function (mixed $value): string {
            if (is_array($value)) {
                $leaves = [];
                array_walk_recursive($value, static function ($leaf) use (&$leaves): void {
                    if ((is_string($leaf) || is_int($leaf) || is_float($leaf)) && trim((string)$leaf) !== '') {
                        $leaves[] = trim((string)$leaf);
                    }
                });
                return implode(', ', $leaves);
            }
            return (is_string($value) || is_int($value) || is_float($value)) ? trim((string)$value) : '';
        };

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 400, 'timeout' => 30]);
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json)) {
                $meta = $default;
                // Only the requested fields are honoured; extra keys from the model are dropped.
                foreach (array_keys($default) as $field) {
                    $text = $toText($json[$field] ?? null);
                    if ($text !== '') {
                        $meta[$field] = $text;
                    }
                }
                return $meta;
            }
        } catch (Throwable $e) {
            error_log('BVJ classifyDocument failed: ' . $e->getMessage());
        }
        return $default;
    }

    // ── Step 2: Party extraction ──────────────────────────────────────────────

    /**
     * Step 2 — ask the model to list every named person or institution found
     * in the first 12000 characters of the document.
     *
     * Accepts either the requested {"parties": [...]} envelope or a bare list
     * at the JSON root. Entries that are not objects are discarded so callers
     * always receive a list of associative arrays, capped at 20. Failures and
     * unexpected shapes are logged and produce an empty list.
     *
     * @param string $docText  Combined text of the uploaded document(s).
     * @param string $language UI language code, mapped to a language name for the prompt.
     *
     * @return array List of party records (name, role, organization, relationship_to_child as returned by the model).
     */
    private function extractParties(string $docText, string $language): array
    {
        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 12000, 'UTF-8');

        $prompt = <<<PROMPT
You are analysing a Norwegian child welfare (Barnevernet) document.
Identify ALL named parties — every person or institution referred to by name or title.

Respond in {$locale}. Return a JSON object with a single key "parties" containing an array of objects.
Each object must have these four fields:
- "name": full name or institution name (string)
- "role": their role in the case, e.g. Biological mother, Child, Barnevernarbeider, Saksbehandler, Melder, Politi, Lege, Advokat, Foster carer, Rusklinikk
- "organization": employer or institution if mentioned, otherwise null
- "relationship_to_child": relationship to the child in the document, e.g. Mother, Father, Caseworker, Melder, or null

Rules:
- Include every named person and named institution — even peripheral ones.
- Include Barnevernvakta (bvv) as an institution even if no individual caseworkers are named.
- Do not invent parties not present in the text.
- Maximum 20 parties.

Document text:
{$excerpt}
PROMPT;

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 1500, 'timeout' => 40]);
            $json = $this->azure->decodeJsonObject($raw);

            $candidates = null;
            if (is_array($json)) {
                if (is_array($json['parties'] ?? null)) {
                    $candidates = $json['parties'];
                } elseif (isset($json[0]['name'])) {
                    // Fallback: model returned an array at root level instead of {parties:[...]}
                    $candidates = $json;
                }
            }
            if ($candidates !== null) {
                // Drop stray scalars so every returned entry can be indexed by field name.
                $records = array_values(array_filter(
                    $candidates,
                    static fn($party): bool => is_array($party)
                ));
                return array_slice($records, 0, 20);
            }
            // Multibyte-safe cut: the reply is UTF-8 and must not be split mid-character in the log.
            error_log('BVJ extractParties unexpected structure: ' . mb_substr((string)$raw, 0, 300, 'UTF-8'));
        } catch (Throwable $e) {
            error_log('BVJ extractParties failed: ' . $e->getMessage());
        }
        return [];
    }

    // ── Step 3: Timeline extraction ───────────────────────────────────────────

    /**
     * Step 3 — ask the model for a chronological list of events found in the
     * first 12000 characters of the document.
     *
     * Ordering is whatever the model returns; no sorting is done here. Entries
     * that are not objects are discarded, and a string "significance" is
     * trimmed and lower-cased so strict comparisons against 'high', 'medium'
     * and 'low' work even when the model capitalises the value. At most 30
     * events are returned. Failures are logged and produce an empty list.
     *
     * @param string $docText  Combined text of the uploaded document(s).
     * @param string $language UI language code, mapped to a language name for the prompt.
     *
     * @return array List of event records (date, time_of_day, actor, action, significance).
     */
    private function extractTimeline(string $docText, string $language): array
    {
        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 12000, 'UTF-8');

        $prompt = <<<PROMPT
Build a chronological timeline from this Norwegian child welfare (Barnevernet) document in {$locale}.

Extract ALL dates, times, and temporal references — including phone calls, home visits, meetings, decisions, and assessments.

IMPORTANT — Norwegian date and time formats to recognise:
- DD.MM.YY  (e.g. 18.07.20 = 2020-07-18)
- DD.MM.YYYY (e.g. 18.07.2020)
- D.M.YY    (e.g. 6.1.20 = 2020-01-06)
- DD.MM.    (day and month without year — infer year from surrounding context)
- Times: kl. HH:MM, klokken HH:MM, kl HH.MM
- Diary/log format: lines beginning with a date or time are always events.
- Two-digit years: interpret as 20YY (20 → 2020, 21 → 2021).

For each event provide:
- "date": ISO 8601 date (YYYY-MM-DD) if determinable, otherwise best-effort description
- "time_of_day": HH:MM if present, otherwise null
- "actor": person, institution, or party involved
- "action": concise description (≤ 80 chars) of what happened
- "significance": high (acute measure, removal, police involvement, formal decision) | medium (home visit, phone call, meeting) | low (minor update, note)

Sort chronologically. Maximum 30 events.

Document text:
{$excerpt}

Return JSON only:
{
  "events": [{"date":"...","time_of_day":null,"actor":"...","action":"...","significance":"high|medium|low"}]
}
PROMPT;

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 3000, 'timeout' => 45]);
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json) && is_array($json['events'] ?? null)) {
                $events = [];
                foreach ($json['events'] as $event) {
                    if (!is_array($event)) {
                        continue; // stray scalar in the list — not a usable event record
                    }
                    if (is_string($event['significance'] ?? null)) {
                        $event['significance'] = strtolower(trim($event['significance']));
                    }
                    $events[] = $event;
                    if (count($events) === 30) {
                        break;
                    }
                }
                return $events;
            }
        } catch (Throwable $e) {
            error_log('BVJ extractTimeline failed: ' . $e->getMessage());
        }
        return [];
    }

    // ── Step 4: Sub-question generation ──────────────────────────────────────

    /**
     * Step 4 — generate the research questions that drive corpus retrieval.
     *
     * The prompt is built from the document type, up to eight timeline events
     * (high-significance first) and up to eight parties. The model's answer is
     * normalised so every returned item has a non-empty question and a unique
     * id: a missing or duplicate id is replaced with the next free "qN" rather
     * than dropping the question, because ids are later used to attribute
     * retrieved sources to questions.
     *
     * When the model call fails or yields nothing usable, a fixed set of
     * generic questions is returned, trimmed to the requested count.
     *
     * @param array  $docMeta        Metadata from document classification (doc_type is read).
     * @param array  $parties        Party records (name and role are read).
     * @param array  $timelineEvents Event records (date, action, actor, significance are read).
     * @param string $advocateRole   Party the user represents; empty means "the affected party".
     * @param int    $count          Number of sub-questions wanted (values below 1 are treated as 1).
     * @param string $language       UI language code, mapped to a language name for the prompt.
     *
     * @return array List of ['id' => string, 'question' => string, 'rationale' => string]; never empty.
     */
    private function generateSubQuestions(
        array  $docMeta,
        array  $parties,
        array  $timelineEvents,
        string $advocateRole,
        int    $count,
        string $language
    ): array {
        $locale  = dbnToolsLanguageName($language);
        $count   = max(1, $count);
        $roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party';

        // Model-produced fields may be arrays/objects; only scalars are safe to put into a prompt.
        $asText = static fn(mixed $value, string $fallback = ''): string
            => is_scalar($value) ? (string)$value : $fallback;

        $docType = $asText($docMeta['doc_type'] ?? null, 'BVJ document');

        // Summarise the top events to give the model context (high-significance ones first)
        $isHigh     = static fn($e): bool => ($e['significance'] ?? '') === 'high';
        $highEvents = array_values(array_filter($timelineEvents, $isHigh));
        $restEvents = array_values(array_filter($timelineEvents, static fn($e): bool => !$isHigh($e)));
        $eventSummary = '';
        foreach (array_slice(array_merge($highEvents, $restEvents), 0, 8) as $ev) {
            $eventSummary .= sprintf(
                "- %s: %s (%s)\n",
                $asText($ev['date'] ?? null, '?'),
                $asText($ev['action'] ?? null),
                $asText($ev['actor'] ?? null)
            );
        }

        // Summarise parties
        $partyList = '';
        foreach (array_slice($parties, 0, 8) as $p) {
            $partyList .= sprintf("- %s (%s)\n", $asText($p['name'] ?? null), $asText($p['role'] ?? null));
        }

        $prompt = <<<PROMPT
You are a Norwegian family-law research assistant building a case for: {$roleStr}.

A {$docType} has been uploaded. Key events:
{$eventSummary}
Key parties:
{$partyList}

Generate exactly {$count} targeted sub-questions to research the legal corpus for arguments that SUPPORT {$roleStr}'s position. Each question should explore a different angle:
1. Statutory rights and obligations (Barnevernloven, Barneloven)
2. ECHR Article 8 and 9 precedents vs Norway
3. Procedural requirements BVV must follow (notice, documentation, proportionality)
4. Bufdir guidance on case handling standards
5. Norwegian court decisions on similar fact patterns

Return JSON only in {$locale}:
{
  "sub_questions": [
    {"id":"q1","question":"...","rationale":"how this angle strengthens {$roleStr}'s position (≤ 120 chars)"}
  ]
}

Rules:
- Exactly {$count} sub-questions, no more no fewer.
- Every question must be answerable from Norwegian family-law, child-welfare, or ECHR sources.
- Each question must cover a DIFFERENT legal angle.
- Questions must be self-contained without needing the raw document.
- Respond in {$locale}.
PROMPT;

        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.15, 'max_tokens' => 1000, 'timeout' => 40]);
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json) && is_array($json['sub_questions'] ?? null) && count($json['sub_questions']) >= 1) {
                $sqs     = [];
                $usedIds = [];
                foreach (array_slice($json['sub_questions'], 0, $count) as $sq) {
                    if (!is_array($sq)) {
                        continue;
                    }
                    $question = trim($asText($sq['question'] ?? null));
                    if ($question === '') {
                        continue;
                    }
                    $id = trim($asText($sq['id'] ?? null));
                    if ($id === '' || isset($usedIds[$id])) {
                        // Missing or duplicate id: assign the next unused positional id instead.
                        $n = count($sqs) + 1;
                        do {
                            $id = 'q' . $n++;
                        } while (isset($usedIds[$id]));
                    }
                    $usedIds[$id] = true;
                    $sqs[] = [
                        'id'        => $id,
                        'question'  => $question,
                        'rationale' => $asText($sq['rationale'] ?? null),
                    ];
                }
                if ($sqs) return $sqs;
            }
        } catch (Throwable $e) {
            error_log('BVJ generateSubQuestions failed: ' . $e->getMessage());
        }

        // Fallback: generic sub-questions, never more than the caller asked for
        $fallback = [
            ['id' => 'q1', 'question' => "What procedural rights does {$roleStr} have in Barnevernet proceedings under Barnevernloven?", 'rationale' => 'Procedural rights'],
            ['id' => 'q2', 'question' => "What does ECHR Article 8 require when child welfare authorities intervene in family life?", 'rationale' => 'ECHR Article 8'],
            ['id' => 'q3', 'question' => "What Bufdir guidance applies to the proportionality of Barnevernet interventions?", 'rationale' => 'Proportionality'],
            ['id' => 'q4', 'question' => "What are the documentation and notice obligations of BVV before taking acute measures?", 'rationale' => 'Documentation obligations'],
        ];
        return array_slice($fallback, 0, $count);
    }

    // ── Step 6: Synthesis ─────────────────────────────────────────────────────

    /**
     * Step 6: build the synthesis prompt from the extracted case facts and the
     * numbered corpus sources, send it to the selected engine, and return the
     * structured advocacy brief.
     *
     * When no sources were retrieved, no LLM call is made and a localized
     * "nothing found" payload is returned instead. When the model output cannot
     * be decoded into an object with a non-empty `advocacy_brief`, the raw text
     * is returned as the brief together with placeholder fields.
     *
     * @param string    $docText         Full document text; only the first 3000 characters are quoted in the prompt
     * @param array     $docMeta         Step-1 metadata (doc_type, doc_date, issuing_authority, reference_number, child_info)
     * @param array     $parties         Step-2 parties; the first 8 are summarised
     * @param array     $timelineEvents  Step-3 events; up to 15 are summarised, high-significance first
     * @param array     $subQuestions    Sub-questions as [{id, question, ...}]
     * @param array     $numberedSources Retrieved sources, each carrying its citation number in `n`
     * @param string    $advocateRole    Party being represented; empty falls back to "the affected party"
     * @param string    $engine          'gpu', 'azure_full', or anything else for the default Azure deployment
     * @param string    $language        UI language code
     * @param float     $temperature     Sampling temperature for the synthesis call
     * @param string    $additionalNotes Optional free-text context appended to the prompt
     * @param ?callable $emit            Accepted for interface compatibility; not used by this method
     *
     * @return array{json: array, deploy_label: string}
     */
    private function synthesiseBvj(
        string    $docText,
        array     $docMeta,
        array     $parties,
        array     $timelineEvents,
        array     $subQuestions,
        array     $numberedSources,
        string    $advocateRole,
        string    $engine,
        string    $language,
        float     $temperature,
        string    $additionalNotes,
        ?callable $emit = null
    ): array {
        $locale      = dbnToolsLanguageName($language);
        $roleStr     = $advocateRole !== '' ? $advocateRole : 'the affected party';
        $docType     = $docMeta['doc_type'] ?? 'BVJ Document';
        $docDate     = $docMeta['doc_date'] ?? 'unknown date';
        $authority   = $docMeta['issuing_authority'] ?? 'unknown authority';
        // !empty() keeps the original truthiness test but tolerates a missing key.
        $refNo       = !empty($docMeta['reference_number']) ? ' (ref ' . $docMeta['reference_number'] . ')' : '';
        $childInfo   = $docMeta['child_info'] ?? 'not specified';
        $sourceCount = count($numberedSources);

        // Resolved once; both the early-return and the normal path report it.
        $deployLabel = match ($engine) {
            'gpu'        => 'GPU (cuttlefish)',
            'azure_full' => 'gpt-4o',
            default      => $this->azure->chatDeployment(),
        };

        if (empty($numberedSources)) {
            $emptyBrief = match (dbnToolsNormalizeUiLanguage($language)) {
                'no' => 'Ingen kildetreff ble funnet i korpuset for de valgte skivene og spørsmålene.',
                'uk' => 'Для вибраних розділів і підпитань не знайдено джерел у корпусі.',
                'pl' => 'Nie znaleziono źródeł w korpusie dla wybranych sekcji i pytań pomocniczych.',
                default => 'No corpus sources were retrieved for the selected slices and sub-questions.',
            };
            return [
                'json' => [
                    'advocacy_brief'         => $emptyBrief,
                    'procedural_red_flags'   => [],
                    'client_strengths'       => [],
                    'opposing_weaknesses'    => [],
                    'what_we_found'          => 'No retrieved sources passed the similarity threshold.',
                    'what_remains_uncertain' => ['No corpus evidence retrieved — widen slice selection or try different sub-questions.'],
                    'next_practical_step'    => 'Enable more corpus slices (Norwegian Courts, Bufdir Guidance) and re-run.',
                ],
                'deploy_label' => $deployLabel,
            ];
        }

        // Build parties summary (top 8)
        $partiesSummary = '';
        foreach (array_slice($parties, 0, 8) as $i => $p) {
            $org = !empty($p['organization']) ? ' (' . $p['organization'] . ')' : '';
            $rel = !empty($p['relationship_to_child']) ? ' — rel: ' . $p['relationship_to_child'] : '';
            $partiesSummary .= sprintf("%d. %s — %s%s%s\n", $i + 1, $p['name'] ?? '', $p['role'] ?? '', $org, $rel);
        }

        // Build timeline summary (top 15, high-significance events listed first)
        $highEvents  = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
        $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
        $topEvents   = array_slice(array_merge($highEvents, $otherEvents), 0, 15);
        $timelineSummary = '';
        foreach ($topEvents as $ev) {
            $time = !empty($ev['time_of_day']) ? ' kl.' . $ev['time_of_day'] : '';
            $timelineSummary .= sprintf("- %s%s [%s] %s: %s\n",
                $ev['date'] ?? '?', $time,
                strtoupper($ev['significance'] ?? 'low'),
                $ev['actor'] ?? '', $ev['action'] ?? '');
        }

        // Build sources text
        $sourcesContext = [];
        foreach ($numberedSources as $s) {
            $sourcesContext[] = sprintf(
                "[%d] (%s) %s%s\n    Corpus: %s\n    Authority: %s | Jurisdiction: %s\n    Excerpt: %s",
                $s['n'],
                ($s['source_origin'] ?? 'corpus') === 'upload' ? 'uploaded doc' : 'corpus',
                $s['title'] ?? '',
                !empty($s['section']) ? ' — ' . $s['section'] : '',
                $s['package_or_corpus'] ?? '',
                $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
                $s['jurisdiction'] ?? 'n/a',
                $s['excerpt'] ?? ''
            );
        }
        $sourcesText = implode("\n\n", $sourcesContext);

        // Build sub-question text
        $subQText = '';
        if ($subQuestions) {
            $subQText = "\nSub-questions researched:\n";
            foreach ($subQuestions as $sq) {
                $subQText .= sprintf("- %s: %s\n", $sq['id'], $sq['question']);
            }
        }

        $notesSection = $additionalNotes !== ''
            ? "\n== ADDITIONAL CONTEXT FROM ADVOCATE ==\n{$additionalNotes}\n"
            : '';

        $docExcerpt = mb_substr($docText, 0, 3000, 'UTF-8');

        $prompt = <<<PROMPT
You are Do Better Norge Legal Tools producing a structured Barnevernet case analysis brief.
You are representing: {$roleStr}

Ground every claim in the numbered corpus sources below using [n] markers, OR in the uploaded document using [DOC].
Do NOT invent statutes, paragraph numbers, case names, ECHR applications, dates, or parties.
Return valid JSON only. No markdown fences.

== DOCUMENT METADATA ==
Type: {$docType}{$refNo}
Date: {$docDate}
Issuing authority: {$authority}
Child: {$childInfo}

== KEY PARTIES ==
{$partiesSummary}

== TIMELINE (from document) ==
{$timelineSummary}

== CORPUS SOURCES ({$sourceCount} numbered) ==
{$sourcesText}
{$notesSection}
{$subQText}

== DOCUMENT EXCERPT (first 3000 chars — use [DOC] to cite) ==
{$docExcerpt}

Return JSON in {$locale}:
{
  "advocacy_brief": "Partisan legal brief in Markdown. Structure:\n## Case Overview\n(What happened according to [DOC] — doc type, authority, key events)\n\n## {$roleStr}'s Core Legal Position\n(Strongest statutory and ECHR arguments — cite [n] and [DOC])\n\n## Procedural Compliance Issues\n(Where BVV may have failed their own procedural obligations — cite [DOC][n])\n\n## Client Strengths\n(Factual and legal advantages for {$roleStr} — cite [n][DOC])\n\n## Counter-Arguments and Responses\n(Likely opposing arguments and how to rebut — cite [n])\n\n## Recommended Next Steps\n(Concrete legal actions)\n\nEnd with a one-line disclaimer. Length: 500-1000 words.",

  "procedural_red_flags": [
    {
      "description": "Concise description of the potential procedural violation",
      "legal_basis": "Statute or ECHR article potentially violated, e.g. Barnevernloven §6-1, ECHR Art.8",
      "severity": "high",
      "source_refs": ["[n]", "[DOC]"],
      "what_to_check": "Specific document text or action requiring legal verification"
    }
  ],

  "client_strengths": ["3-6 items anchored with [n] or [DOC]"],
  "opposing_weaknesses": ["2-5 vulnerabilities in BVV or opposing party position — omit if unsupported by sources"],
  "what_we_found": "2-sentence plain-language summary of the most critical finding",
  "what_remains_uncertain": ["3-5 specific gaps — missing information, unclear authority, conflicting sources"],
  "next_practical_step": "The single most important concrete legal action for {$roleStr}"
}

Rules:
- Every factual claim in advocacy_brief must end with [n] or [DOC].
- procedural_red_flags must be grounded in documented BVV actions — no speculation.
- severity: high = likely violation of a codified right; medium = procedural irregularity; low = best-practice gap.
- If no corpus source supports a claimed weakness, omit it from opposing_weaknesses.
- Cite statute sections and ECHR articles as they appear in the corpus excerpts.
- Respond in {$locale}.
PROMPT;

        $sysPrompt = 'You return valid JSON only. No markdown fences.';

        $messages = [
            ['role' => 'system', 'content' => $sysPrompt],
            ['role' => 'user',   'content' => $prompt],
        ];
        $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3000, 'timeout' => 200];

        $raw = '';
        try {
            if ($engine === 'gpu') {
                $response = dbnToolsCallGpuLlm($messages, $opts);
                $raw = (string)($response['choices'][0]['message']['content'] ?? '');
            } elseif ($engine === 'azure_full') {
                $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
            } else {
                $raw = $this->azure->chatText($messages, $opts);
            }
        } catch (Throwable $e) {
            // NOTE(review): dbnToolsAbort presumably ends the request; confirm it never returns.
            dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
        }

        $json = $this->azure->decodeJsonObject($raw);
        if (!is_array($json) || empty($json['advocacy_brief'])) {
            // Unstructured output: surface the raw text rather than dropping the answer.
            $json = [
                'advocacy_brief'         => $raw,
                'procedural_red_flags'   => [],
                'client_strengths'       => [],
                'opposing_weaknesses'    => [],
                'what_we_found'          => 'Synthesis returned non-structured output; rendered as raw markdown.',
                'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
                'next_practical_step'    => 'Review the brief manually before relying on it.',
            ];
        }

        return ['json' => $json, 'deploy_label' => $deployLabel];
    }

    // ── GPU streaming helper (keeps browser connection alive during slow models) ──

    /**
     * Call the LiteLLM chat-completions endpoint with streaming enabled and
     * return the accumulated assistant text (trimmed).
     *
     * The response is parsed as server-sent events: each `data:` line is decoded
     * as JSON and `choices[0].delta.content` is appended. cURL delivers the body
     * in arbitrary slices, so an incomplete trailing line is buffered until the
     * rest of it arrives.
     *
     * $onProgress, when given, is invoked at most once every 15 seconds, both
     * while data arrives and while the transfer is idle (via cURL's progress
     * callback), so the caller can flush a keepalive event to the browser.
     *
     * @param array     $messages   Chat messages in OpenAI format
     * @param array     $options    Optional: timeout, model, temperature, max_tokens, stop
     * @param ?callable $onProgress Keepalive hook, called with no arguments
     *
     * @throws RuntimeException On payload encoding failure, cURL transport failure, or HTTP status >= 400
     */
    private function callGpuLlmStream(array $messages, array $options, ?callable $onProgress): string
    {
        $url     = 'http://10.0.1.10:4000/v1/chat/completions';
        // NOTE(review): hard-coded fallback credential committed to source. Kept so
        // existing deployments keep working, but it should be rotated and removed
        // once LITELLM_MASTER_KEY is guaranteed to be set.
        $apiKey  = (string)(dbnToolsEnv('LITELLM_MASTER_KEY') ?: 'sk-bnl-litellm-26xR9mK4qvN3wL8sTj7pB2d');
        $timeout = (int)($options['timeout'] ?? 660);

        $payload = [
            'model'       => (string)($options['model'] ?? 'qwen2.5:14b'),
            'messages'    => $messages,
            'temperature' => $options['temperature'] ?? 0.1,
            'max_tokens'  => $options['max_tokens'] ?? 2800,
            'stream'      => true,
        ];
        if (!empty($options['stop']) && is_array($options['stop'])) {
            $payload['stop'] = $options['stop'];
        }
        // Substitute invalid UTF-8 instead of letting json_encode() return false.
        $body = json_encode(
            $payload,
            JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
        );
        if ($body === false) {
            throw new RuntimeException('GPU stream request failed: ' . json_last_error_msg());
        }
        $headers = [
            'Content-Type: application/json',
            'Authorization: Bearer ' . $apiKey,
        ];

        $accumulated   = '';
        $lineBuffer    = '';
        $lastKeepalive = microtime(true);

        // Shared by the write and progress callbacks so keepalives are rate-limited together.
        $keepalive = static function () use (&$lastKeepalive, $onProgress): void {
            if ($onProgress === null) {
                return;
            }
            $now = microtime(true);
            if ($now - $lastKeepalive < 15.0) {
                return;
            }
            $lastKeepalive = $now;
            $onProgress();
            @flush();
        };

        // Parse one complete SSE line and append its text delta, if any.
        $consumeLine = static function (string $line) use (&$accumulated): void {
            $line = trim($line);
            if (!str_starts_with($line, 'data:')) {
                return;
            }
            $data = trim(substr($line, 5));
            if ($data === '' || $data === '[DONE]') {
                return;
            }
            $chunk = json_decode($data, true);
            $delta = is_array($chunk) ? ($chunk['choices'][0]['delta']['content'] ?? '') : '';
            if (is_string($delta) && $delta !== '') {
                $accumulated .= $delta;
            }
        };

        $ch = curl_init($url);
        if ($ch === false) {
            throw new RuntimeException('GPU stream request failed: could not initialise cURL');
        }
        curl_setopt_array($ch, [
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => $body,
            CURLOPT_HTTPHEADER     => $headers,
            CURLOPT_TIMEOUT        => $timeout,
            CURLOPT_RETURNTRANSFER => false,
            CURLOPT_WRITEFUNCTION  => static function ($ch, $data) use (&$lineBuffer, $consumeLine, $keepalive): int {
                // Only complete lines are parsed; the unterminated tail waits for the next slice.
                $lineBuffer .= $data;
                $lines      = explode("\n", $lineBuffer);
                $lineBuffer = (string)array_pop($lines);
                foreach ($lines as $line) {
                    $consumeLine($line);
                }
                $keepalive();
                return strlen($data);
            },
            // The progress callback also runs while no bytes are flowing (e.g. before the first token).
            CURLOPT_NOPROGRESS       => false,
            CURLOPT_XFERINFOFUNCTION => static function (...$progress) use ($keepalive): int {
                $keepalive();
                return 0; // non-zero would abort the transfer
            },
        ]);

        curl_exec($ch);
        $curlErr = curl_error($ch);
        $status  = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($curlErr !== '') {
            throw new RuntimeException('GPU stream request failed: ' . $curlErr);
        }
        if ($status >= 400) {
            // Error bodies are not SSE, so without this check the caller would get an empty string.
            throw new RuntimeException('GPU stream request failed: HTTP ' . $status);
        }

        // The stream may end without a trailing newline.
        if ($lineBuffer !== '') {
            $consumeLine($lineBuffer);
        }
        return trim($accumulated);
    }

    // ── Shared helpers (copied from DbnDeepResearchAgent) ────────────────────

    /**
     * Split text into overlapping word windows for embedding.
     *
     * Whitespace is collapsed first. Windows hold up to CHUNK_WORDS words and
     * start every (CHUNK_WORDS - CHUNK_OVERLAP_WORDS) words. A window shorter
     * than MIN_CHUNK_WORDS is kept only when it is the first one.
     *
     * @param string $text     Raw text of one uploaded file
     * @param string $filename Stored on every chunk
     * @param int    $fileIdx  Index of the file, used in the chunk id
     *
     * @return array List of [chunk_id, file_index, chunk_index, filename, text]
     */
    private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
    {
        $normalized = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
        if ($normalized === '') {
            return [];
        }

        $tokens = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        if (!$tokens) {
            return [];
        }

        $stride     = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);
        $tokenCount = count($tokens);
        $result     = [];

        for ($offset = 0; $offset < $tokenCount; $offset += $stride) {
            $window     = array_slice($tokens, $offset, self::CHUNK_WORDS);
            $windowSize = count($window);

            if ($offset === 0 || $windowSize >= self::MIN_CHUNK_WORDS) {
                // Chunk indices are dense, so the next index equals the number emitted so far.
                $sequence = count($result);
                $result[] = [
                    'chunk_id'    => sprintf('upload:%d:%d', $fileIdx, $sequence),
                    'file_index'  => $fileIdx,
                    'chunk_index' => $sequence,
                    'filename'    => $filename,
                    'text'        => implode(' ', $window),
                ];
            }

            // A short window means the end of the text was reached.
            if ($windowSize < self::CHUNK_WORDS) {
                break;
            }
        }

        return $result;
    }

    /**
     * Find the uploaded-document chunks most similar to a sub-question.
     *
     * Embeds the question, scores it against every vector in $this->uploadVecs
     * by cosine similarity, drops scores below $threshold, and returns the best
     * ceil($limitPerSubQ / 2) hits (at least one slot). Returns [] when there
     * are no upload vectors or the embedding call fails or comes back empty.
     *
     * @param string $question     Sub-question text to embed
     * @param int    $limitPerSubQ Per-sub-question chunk budget; uploads get half of it
     * @param float  $threshold    Minimum cosine similarity to keep a chunk
     *
     * @return array Source records sorted by descending similarity
     */
    private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
    {
        if (empty($this->uploadVecs)) {
            return [];
        }

        try {
            $queryVec = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? [];
        } catch (Throwable $e) {
            error_log('BVJ sub-Q embed failed: ' . $e->getMessage());
            return [];
        }
        if (empty($queryVec)) {
            return [];
        }

        $hits = [];
        foreach ($this->uploadVecs as $candidate) {
            $score = $this->cosineSim($queryVec, $candidate['vec']);
            if ($score < $threshold) {
                continue;
            }
            $meta   = $candidate['meta'];
            $hits[] = [
                'chunk_id'          => $meta['chunk_id'],
                'title'             => 'uploaded: ' . $meta['filename'],
                'section'           => null,
                'package_or_corpus' => 'Your upload',
                'excerpt'           => dbnToolsExcerpt($meta['text'], 620),
                'chunk_text'        => $meta['text'],
                'similarity'        => round($score, 4),
                'reranker_score'    => null,
                'document_id'       => null,
                'source_origin'     => 'upload',
                'authority_type'    => null,
                'jurisdiction'      => null,
            ];
        }

        usort($hits, static function (array $left, array $right): int {
            return $right['similarity'] <=> $left['similarity'];
        });

        $quota = max(1, (int)ceil($limitPerSubQ / 2));
        return array_slice($hits, 0, $quota);
    }

    /**
     * Cosine similarity of two numeric vectors.
     *
     * Only the first min(count($a), count($b)) components are compared.
     * Returns 0.0 when either vector is empty or has zero magnitude over
     * that shared prefix.
     *
     * @param array $a First vector (0-indexed list of numbers)
     * @param array $b Second vector (0-indexed list of numbers)
     */
    private function cosineSim(array $a, array $b): float
    {
        $dims = min(count($a), count($b));
        if ($dims === 0) {
            return 0.0;
        }

        $dotProduct = 0.0;
        $sumSqA     = 0.0;
        $sumSqB     = 0.0;
        foreach (range(0, $dims - 1) as $k) {
            $left  = (float)$a[$k];
            $right = (float)$b[$k];
            $dotProduct += $left * $right;
            $sumSqA     += $left * $left;
            $sumSqB     += $right * $right;
        }

        return ($sumSqA === 0.0 || $sumSqB === 0.0)
            ? 0.0
            : $dotProduct / (sqrt($sumSqA) * sqrt($sumSqB));
    }

    /**
     * Map a raw corpus retrieval row onto the source-record shape used by the pipeline.
     *
     * Link and label fields (source_url, deep_link, authority_label,
     * corpus_source_name, publication_date) are initialised to null here.
     *
     * @param array  $chunk  Raw retrieval row
     * @param string $subQId Id of the sub-question that retrieved this chunk
     *
     * @return array Normalised source record
     */
    private function normalizeCorpusChunk(array $chunk, string $subQId): array
    {
        $content = (string)($chunk['content'] ?? '');

        // Small local casters keep the record literal below readable.
        $intOrNull     = static fn(string $key): ?int => isset($chunk[$key]) ? (int)$chunk[$key] : null;
        $rounded4OrNull = static fn(string $key): ?float => isset($chunk[$key]) ? round((float)$chunk[$key], 4) : null;

        $titleText  = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source');
        $corpusName = (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal');

        return [
            'chunk_id'               => $intOrNull('id'),
            'title'                  => $titleText,
            'section'                => $chunk['section_title'] ?? null,
            'package_or_corpus'      => $corpusName,
            'excerpt'                => dbnToolsExcerpt($content, 620),
            'chunk_text'             => $content,
            'similarity'             => $rounded4OrNull('similarity'),
            'reranker_score'         => $rounded4OrNull('reranker_score'),
            'document_id'            => $intOrNull('document_id'),
            'source_origin'          => 'corpus',
            'authority_type'         => $chunk['authority_type'] ?? null,
            'jurisdiction'           => $chunk['jurisdiction'] ?? null,
            'publication_year'       => $chunk['publication_year'] ?? null,
            'source_url'             => null,
            'deep_link'              => null,
            'authority_label'        => null,
            'corpus_source_name'     => null,
            'publication_date'       => null,
            'matched_sub_questions'  => [$subQId],
        ];
    }

    /**
     * Decide whether a retrieved corpus chunk should be dropped from the pool.
     *
     * EU AI Act material (matched by title, or by a eur-lex URL) is always
     * excluded. Chunks that look like Do Better Norge website pages are
     * excluded unless the `dbn_resources` slice is active.
     *
     * @param array $chunk        Raw retrieval row (document_title/title, source_url, source_name)
     * @param array $activeSlices Slice toggles; only `dbn_resources` is read here
     *
     * @return bool True when the chunk must be excluded
     */
    private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
    {
        $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
        $url   = strtolower((string)($chunk['source_url'] ?? ''));
        $name  = strtolower((string)($chunk['source_name'] ?? ''));

        if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) return true;
        if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) return true;

        // The en dash is written as an alternation branch, not inside [...]: these
        // patterns run in byte mode, where a class would match each of the dash's
        // three bytes on its own and so accept unrelated characters such as "Ó".
        $isDbnPage = (
            str_contains($name, 'website')
            || str_contains($title, 'dobetternorge.no')
            || preg_match('/^(homepage|landing|about |contact )/i', $title)
            || str_contains($title, 'resource directory')
            || preg_match('/^flashcards?\s*(?:-|–|\|)/i', $title)
            || preg_match('/\|\s*do better norge\s*$/i', $title)
            || preg_match('/(?:-|–)\s*do better norge\s*$/i', $title)
        );
        if ($isDbnPage) {
            return !($activeSlices['dbn_resources'] ?? false);
        }
        return false;
    }

    /**
     * Enrich the corpus chunks in $pool, in place, with document-level metadata.
     *
     * For every distinct document_id among corpus-origin chunks this loads the
     * document row, generates and stores a short LLM summary for documents that
     * have none, and resolves the corpus source name. Each matching chunk then
     * receives source_url, deep_link, authority_label, corpus_source_name,
     * publication_date and summary. Non-corpus chunks are left untouched.
     *
     * Failure handling is best-effort: if the document query fails the pool is
     * left unchanged; a failed summary or source-name lookup is logged and the
     * remaining metadata is still applied.
     *
     * @param array $pool Source records, modified by reference
     */
    private function hydrateSourceUrls(array &$pool): void
    {
        $docIds = [];
        foreach ($pool as $chunk) {
            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
            $docId = (int)($chunk['document_id'] ?? 0);
            if ($docId > 0) $docIds[$docId] = true;
        }
        if (empty($docIds)) return;

        $docMeta   = [];
        $sourceIds = [];

        try {
            $ragDb = dbnToolsRagDb();
            $ids   = array_keys($docIds);
            $ph    = implode(',', array_fill(0, count($ids), '?'));

            $stmt = $ragDb->prepare("
                SELECT d.id, d.title, d.source_url, d.authority_type,
                       d.publication_date, d.source_id, d.jurisdiction,
                       d.summary, LEFT(d.content, 4000) AS content_excerpt
                FROM documents d
                WHERE d.id IN ({$ph})
            ");
            $stmt->execute($ids);

            foreach ($stmt as $row) {
                $dId = (int)$row['id'];
                $sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
                if ($sid) $sourceIds[] = $sid;
                $docMeta[$dId] = [
                    'source_url'         => $row['source_url'] ?? null,
                    'authority_label'    => dbnV6AuthorityLabel($row['authority_type'] ?? null),
                    'publication_date'   => $row['publication_date'] ?? null,
                    'corpus_source_name' => 'Do Better Legal',
                    'source_id'          => $sid,
                    'summary'            => $row['summary'] ?? null,
                    'content_excerpt'    => (string)($row['content_excerpt'] ?? ''),
                    'title'              => (string)($row['title'] ?? ''),
                ];
            }

            // Generate and persist a summary for documents that have content but none stored.
            $unsummarized = array_filter($docMeta, fn($m) => $m['summary'] === null && $m['content_excerpt'] !== '');
            $updateStmt   = null;
            foreach ($unsummarized as $dId => $m) {
                try {
                    $raw = $this->azure->chatText([
                        ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                        ['role' => 'user', 'content' => "Summarise this Norwegian family law document.\nFocus on: legal provisions covered, authority type, and questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"],
                    ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
                    $summary = trim($raw);
                    if ($summary !== '') {
                        // Prepared lazily and reused, instead of re-preparing per document.
                        $updateStmt ??= $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?");
                        $updateStmt->execute([$summary, $dId]);
                        $docMeta[$dId]['summary'] = $summary;
                    }
                } catch (Throwable $e) {
                    error_log('BVJ hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage());
                }
            }
        } catch (Throwable $e) {
            error_log('BVJ hydrateSourceUrls failed: ' . $e->getMessage());
            return;
        }

        // The source-name lookup is secondary: if it fails, the default name set
        // above is kept and the already-loaded URLs and labels are still applied.
        if (!empty($sourceIds)) {
            try {
                $uSids = array_values(array_unique($sourceIds));
                $sPh   = implode(',', array_fill(0, count($uSids), '?'));
                $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
                $sStmt->execute($uSids);
                $srcNames = [];
                foreach ($sStmt as $row) {
                    $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
                }
                foreach ($docMeta as &$m) {
                    if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
                        $m['corpus_source_name'] = $srcNames[$m['source_id']];
                    }
                }
                unset($m);
            } catch (Throwable $e) {
                error_log('BVJ hydrateSourceUrls failed: ' . $e->getMessage());
            }
        }

        foreach ($pool as &$chunk) {
            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
            $docId = (int)($chunk['document_id'] ?? 0);
            if (!$docId || !isset($docMeta[$docId])) continue;
            $m = $docMeta[$docId];
            $sourceUrl = $m['source_url'] ?? null;
            $chunk['source_url']         = $sourceUrl;
            $chunk['deep_link']          = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
            // Guarded read: a chunk may not carry the key at all.
            $chunk['authority_label']    = $m['authority_label'] ?? ($chunk['authority_label'] ?? null);
            $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null;
            $chunk['publication_date']   = $m['publication_date'] ?? null;
            $chunk['summary']            = $m['summary'] ?? null;
        }
        unset($chunk);
    }

    /**
     * Build a link that points as precisely as possible at the cited provision.
     *
     * For lovdata.no URLs whose section title contains a paragraph reference,
     * the reference is appended as `/§<number>`. Chapter-section numbers such
     * as "§ 4-25" and letter suffixes such as "§ 30a" are captured whole.
     * Any other URL is returned trimmed and otherwise unchanged.
     *
     * @param ?string $sourceUrl    Document URL, or null
     * @param ?string $sectionTitle Section heading of the chunk, or null
     *
     * @return ?string Deep link, the plain URL, or null when there is no URL
     */
    private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
    {
        if (!$sourceUrl) return null;
        $sourceUrl = trim($sourceUrl);
        if ($sourceUrl === '') return null;

        // Digits, an optional "-digits" part, then an optional single letter.
        // Previously "§ 4-25" was captured as "4-", which produced a broken anchor.
        if (preg_match('~^https?://lovdata\.no/~i', $sourceUrl)
            && $sectionTitle
            && preg_match('/§\s*(\d+(?:-\d+)?[A-Za-z]?)/u', $sectionTitle, $m)) {
            return rtrim($sourceUrl, '/') . '/§' . $m[1];
        }
        return $sourceUrl;
    }

    /**
     * Collapse duplicate chunks, rank the survivors, and cap the pool size.
     *
     * Chunks are identified by "<source_origin>:<chunk_id>"; a chunk without a
     * chunk_id gets a random identity and therefore never merges. When a
     * duplicate is seen, the matched_sub_questions lists are unioned and the
     * higher similarity and reranker_score are kept. The result is sorted by
     * reranker_score (falling back to similarity) in descending order.
     *
     * @param array $rawPool Source records from all sub-questions
     * @param int   $cap     Maximum number of records to return
     *
     * @return array Deduplicated, ranked list of at most $cap records
     */
    private function mergeAndDedupe(array $rawPool, int $cap): array
    {
        $unique = [];

        foreach ($rawPool as $candidate) {
            $origin   = $candidate['source_origin'] ?? 'corpus';
            $identity = $origin . ':' . ($candidate['chunk_id'] ?? bin2hex(random_bytes(4)));

            if (!isset($unique[$identity])) {
                $unique[$identity] = $candidate;
                continue;
            }

            // Fold the duplicate into the record already kept.
            $kept = &$unique[$identity];
            $kept['matched_sub_questions'] = array_values(array_unique(array_merge(
                $kept['matched_sub_questions'] ?? [],
                $candidate['matched_sub_questions'] ?? []
            )));
            foreach (['similarity', 'reranker_score'] as $metric) {
                if (($candidate[$metric] ?? 0) > ($kept[$metric] ?? 0)) {
                    $kept[$metric] = $candidate[$metric];
                }
            }
            unset($kept);
        }

        $ranked = array_values($unique);
        usort(
            $ranked,
            static fn(array $left, array $right): int =>
                ($right['reranker_score'] ?? $right['similarity'] ?? 0)
                <=> ($left['reranker_score'] ?? $left['similarity'] ?? 0)
        );

        return array_slice($ranked, 0, $cap);
    }

    /**
     * Assign contiguous 1-based citation numbers to the sources.
     *
     * Each record gets an `n` field by its position in iteration order, not
     * by its array key, so input with gaps or string keys (for example after
     * array_filter) still yields [1], [2], [3], ...
     *
     * @param array $chunks Source records
     *
     * @return array Re-indexed list of the same records with `n` set
     */
    private function numberSources(array $chunks): array
    {
        $numbered = [];
        $position = 0;
        foreach ($chunks as $chunk) {
            $chunk['n'] = ++$position;
            $numbered[] = $chunk;
        }
        return $numbered;
    }

    /**
     * Grade how well the answer is backed by retrieved sources.
     *
     * Uses the number of sources and the best score among them, where a
     * source's score is its reranker_score or, failing that, its similarity:
     *  - 'high'   : at least 6 sources and best score >= 0.5
     *  - 'medium' : at least 3 sources and best score >= 0.35
     *  - 'low'    : otherwise, including when there are no sources
     *
     * @param array $sources Source records
     *
     * @return string 'high', 'medium' or 'low'
     */
    private function citationConfidence(array $sources): string
    {
        if (!$sources) {
            return 'low';
        }

        $scoreOf = static fn(array $s) => $s['reranker_score'] ?? $s['similarity'] ?? null;

        $numericScores = [];
        foreach ($sources as $source) {
            $score = $scoreOf($source);
            if (is_numeric($score)) {
                $numericScores[] = $score;
            }
        }

        $topScore    = $numericScores ? max($numericScores) : 0;
        $sourceTotal = count($sources);

        if ($sourceTotal >= 6 && $topScore >= 0.5) {
            return 'high';
        }
        if ($sourceTotal >= 3 && $topScore >= 0.35) {
            return 'medium';
        }
        return 'low';
    }

    /**
     * Apply defaults to the user-supplied pipeline controls and clamp each one
     * to its allowed range.
     *
     *  - sub_q_count          int   3..5    (default 4)
     *  - chunk_limit          int   4..10   (default 6)
     *  - similarity_threshold float 0.2..0.6 (default 0.30)
     *  - reranker_top_k       int   8..14   (default 12)
     *  - temperature          float 0.05..0.4 (default 0.15)
     *
     * @param array $controls Raw controls from the request
     *
     * @return array Normalised controls with exactly the five keys above
     */
    private function normalizeControls(array $controls): array
    {
        $clamp = static fn($value, $low, $high) => max($low, min($high, $value));

        $subQuestions = (int)($controls['sub_q_count'] ?? 4);
        $chunkLimit   = (int)($controls['chunk_limit'] ?? 6);
        $minSimilar   = (float)($controls['similarity_threshold'] ?? 0.30);
        $rerankTopK   = (int)($controls['reranker_top_k'] ?? 12);
        $temperature  = (float)($controls['temperature'] ?? 0.15);

        return [
            'sub_q_count'          => $clamp($subQuestions, 3, 5),
            'chunk_limit'          => $clamp($chunkLimit, 4, 10),
            'similarity_threshold' => $clamp($minSimilar, 0.2, 0.6),
            'reranker_top_k'       => $clamp($rerankTopK, 8, 14),
            'temperature'          => $clamp($temperature, 0.05, 0.4),
        ];
    }

    /**
     * Fetch the `family-legal` corpus package and check that it can be used.
     *
     * Calls dbnToolsAbort() with HTTP 503 when the package is missing or
     * inactive, or when the client has no active subscription to it.
     * NOTE(review): dbnToolsAbort presumably ends the request; confirm it never returns.
     *
     * @param int $clientId Client whose subscription is checked
     *
     * @return array The package record
     */
    private function requireFamilyPackage(int $clientId): array
    {
        $familyPackage = dbnToolsFetchPackage('family-legal');

        $isUsable = $familyPackage && !empty($familyPackage['is_active']);
        if (!$isUsable) {
            dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
        }

        $packageId = (int)$familyPackage['id'];
        if (!dbnToolsHasActiveSubscription($clientId, $packageId)) {
            dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
        }

        return $familyPackage;
    }

    /**
     * Build one pipeline trace entry.
     *
     * @param string $label  Short step name
     * @param string $detail Description of what the step did
     * @param string $status Step status; defaults to 'complete'
     *
     * @return array{label: string, detail: string, status: string}
     */
    private function trace(string $label, string $detail, string $status = 'complete'): array
    {
        // compact() keys the array by variable name, in the order listed.
        return compact('label', 'detail', 'status');
    }

    /**
     * Milliseconds elapsed since $start, rounded to the nearest integer.
     *
     * @param float $start A timestamp taken earlier with microtime(true)
     */
    private function elapsedMs(float $start): int
    {
        $elapsedSeconds = microtime(true) - $start;

        return (int)round($elapsedSeconds * 1000);
    }
}