dobetternorge-tools/includes/DiscrepancyAgent.php

<?php
declare(strict_types=1);

require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';

/**
 * Document Discrepancy Finder Agent
 *
 * 8-step pipeline comparing two Barnevernet document versions:
 *  1. Classify Document A
 *  2. Classify Document B
 *  3. Extract parties from both documents
 *  4. Build timelines from both documents
 *  5. Cross-reference parties (added / removed / changed)
 *  6. Cross-reference timelines (contradictions / deletions / additions)
 *  7. Generate legal research sub-questions from discrepancies
 *  8. Corpus retrieval + synthesis of discrepancy report
 *
 * Steps 1-7 always use azure_mini. Step 8 synthesis uses the user's chosen engine.
 */
final class DbnDiscrepancyAgent
{
    private const MAX_DOC_CHARS = 64000;
    private const POOL_CAP      = 20;

    private DbnAzureOpenAiGateway $azure;
    private array $stepTimings = [];

    public function __construct(?DbnAzureOpenAiGateway $azure = null)
    {
        // Use the injected gateway when one is supplied; otherwise build a default one.
        if ($azure === null) {
            $azure = new DbnAzureOpenAiGateway();
        }

        $this->azure = $azure;
    }

    /**
     * Run the discrepancy pipeline over two uploaded documents and build the report.
     *
     * Steps execute strictly in order. When $emit is given, progress is reported as
     * 'step' / 'progress' / 'subq' events plus one event per intermediate result
     * (doc_a_meta, doc_b_meta, parties_a, parties_b, timeline_a, timeline_b,
     * parties_diff, timeline_diff).
     *
     * @param array         $fileA           {filename, text, chars, truncated}
     * @param array         $fileB           {filename, text, chars, truncated}
     * @param string        $engine          'azure_mini'|'azure_full'|'gpu' (any other value falls back to 'azure_mini')
     * @param string        $language        'en'|'no'|'uk'|'pl'
     * @param array         $sliceSelection  Corpus slice toggles
     * @param callable|null $emit            function(string $event, array $payload): void
     *
     * @return array Report payload: document names and metadata, parties, timelines,
     *               both diffs, synthesis fields, numbered sources, sub-questions,
     *               trace, trace metadata (including per-step timings) and disclaimer.
     */
    public function run(
        array     $fileA,
        array     $fileB,
        string    $engine,
        string    $language,
        array     $sliceSelection,
        ?callable $emit = null
    ): array {
        $engine   = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
        $language = dbnToolsNormalizeUiLanguage($language);

        // Cap each document so the downstream prompts stay bounded.
        $textA = mb_substr((string)($fileA['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
        $textB = mb_substr((string)($fileB['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');

        // Whitespace-only text carries no content either, so treat it like an empty
        // extraction instead of running the whole pipeline on nothing.
        if (trim($textA) === '' || trim($textB) === '') {
            dbnToolsAbort('Could not extract text from one or both uploaded files.', 422, 'empty_document');
        }

        $nameA = (string)($fileA['filename'] ?? 'Document A');
        $nameB = (string)($fileB['filename'] ?? 'Document B');

        $client  = dbnToolsRequireClient();
        $package = $this->requireFamilyPackage((int)$client['id']);

        dbnToolsBootCaveau();
        $aiPortalRoot = dbnToolsAiPortalRoot();
        require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

        $this->stepTimings = [];
        $trace = [];

        // Records a finished step in the trace and forwards it to the caller.
        $emitStep = function (string $stepId, string $label, string $detail, string $status)
                    use (&$trace, $emit): void {
            $trace[] = ['label' => $label, 'detail' => $detail, 'status' => $status];
            if ($emit) {
                $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => $status]);
            }
        };
        // Announces that a step has started; not recorded in the trace.
        $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
            if ($emit) {
                $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => 'running']);
            }
        };
        // Diff fields come from model output and are not guaranteed to be arrays;
        // count() on a non-countable value throws TypeError on PHP 8.
        $countList = static fn ($value): int => is_array($value) ? count($value) : 0;

        // ── STEP 1+2: Classify both documents ─────────────────────────────────
        $emitRunning('doc_classify', 'Classify documents', "Classifying {$nameA}…");
        $stepStart = microtime(true);
        $metaA = $this->classifyDoc($textA, $nameA, $language);
        if ($emit) $emit('doc_a_meta', ['result' => $metaA]);
        if ($emit) $emit('progress', ['detail' => "Classifying {$nameB}…"]);
        $metaB = $this->classifyDoc($textB, $nameB, $language);
        if ($emit) $emit('doc_b_meta', ['result' => $metaB]);
        $this->stepTimings['doc_classify'] = $this->elapsedMs($stepStart);
        $emitStep('doc_classify', 'Classify documents',
            sprintf('%s (%s) → %s (%s)',
                $metaA['doc_type'] ?? 'Document A', $metaA['doc_date'] ?? '?',
                $metaB['doc_type'] ?? 'Document B', $metaB['doc_date'] ?? '?'),
            'complete');

        // ── STEP 3: Extract parties from both documents ─────────────────────────
        $emitRunning('party_extract', 'Extract parties', "Extracting parties from {$nameA}…");
        $stepStart = microtime(true);
        $partiesA = $this->extractPartiesDoc($textA, $nameA, $language);
        if ($emit) $emit('parties_a', ['parties' => $partiesA]);
        if ($emit) $emit('progress', ['detail' => "Extracting parties from {$nameB}…"]);
        $partiesB = $this->extractPartiesDoc($textB, $nameB, $language);
        if ($emit) $emit('parties_b', ['parties' => $partiesB]);
        $this->stepTimings['party_extract'] = $this->elapsedMs($stepStart);
        $emitStep('party_extract', 'Extract parties',
            sprintf('%d in %s · %d in %s', count($partiesA), $nameA, count($partiesB), $nameB),
            'complete');

        // ── STEP 4: Build timelines from both documents ─────────────────────────
        $emitRunning('timeline_extract', 'Build timelines', "Building timeline from {$nameA}…");
        $stepStart = microtime(true);
        $timelineA = $this->extractTimelineDoc($textA, $nameA, $language);
        if ($emit) $emit('timeline_a', ['events' => $timelineA]);
        if ($emit) $emit('progress', ['detail' => "Building timeline from {$nameB}…"]);
        $timelineB = $this->extractTimelineDoc($textB, $nameB, $language);
        if ($emit) $emit('timeline_b', ['events' => $timelineB]);
        $this->stepTimings['timeline_extract'] = $this->elapsedMs($stepStart);
        $emitStep('timeline_extract', 'Build timelines',
            sprintf('%d events in %s · %d events in %s',
                count($timelineA), $nameA, count($timelineB), $nameB),
            'complete');

        // ── STEP 5: Cross-reference parties ────────────────────────────────────
        $emitRunning('cross_parties', 'Cross-reference parties', 'Comparing parties across both documents…');
        $stepStart   = microtime(true);
        $partiesDiff = $this->crossReferenceParties($partiesA, $partiesB, $nameA, $nameB, $language);
        if ($emit) $emit('parties_diff', ['result' => $partiesDiff]);
        $this->stepTimings['cross_parties'] = $this->elapsedMs($stepStart);
        $pRemoved = $countList($partiesDiff['in_a_only'] ?? null);
        $pAdded   = $countList($partiesDiff['in_b_only'] ?? null);
        $pChanged = $countList($partiesDiff['changed_between'] ?? null);
        $emitStep('cross_parties', 'Cross-reference parties',
            sprintf('%d removed · %d added · %d changed', $pRemoved, $pAdded, $pChanged),
            'complete');

        // ── STEP 6: Cross-reference timelines ─────────────────────────────────
        $emitRunning('cross_timelines', 'Cross-reference timelines',
            'Scanning for contradictions, deletions, and new events…');
        $stepStart    = microtime(true);
        $timelineDiff = $this->crossReferenceTimelines(
            $timelineA, $timelineB, $textA, $textB, $nameA, $nameB, $language
        );
        if ($emit) $emit('timeline_diff', ['result' => $timelineDiff]);
        $this->stepTimings['cross_timelines'] = $this->elapsedMs($stepStart);
        $conflictCount = $countList($timelineDiff['conflicts'] ?? null);
        $deletedCount  = $countList($timelineDiff['in_a_only'] ?? null);
        $addedCount    = $countList($timelineDiff['in_b_only'] ?? null);
        $emitStep('cross_timelines', 'Cross-reference timelines',
            sprintf('%d contradictions · %d deleted events · %d new events',
                $conflictCount, $deletedCount, $addedCount),
            'complete');

        // ── STEP 7: Generate research sub-questions ────────────────────────────
        $emitRunning('sub_question_gen', 'Research questions',
            'Generating legal research questions from discrepancies…');
        $stepStart    = microtime(true);
        $subQuestions = $this->generateDiscrepancySubQ(
            $partiesDiff, $timelineDiff, $metaA, $metaB, $language
        );
        $this->stepTimings['sub_question_gen'] = $this->elapsedMs($stepStart);
        $emitStep('sub_question_gen', 'Research questions',
            sprintf('%d legal research question(s) generated.', count($subQuestions)),
            'complete');

        // ── STEP 8a: Corpus retrieval ───────────────────────────────────────────
        $emitRunning('retrieval', 'Retrieve legal context',
            sprintf('Hybrid vector + keyword search across %d question(s)…', count($subQuestions)));
        $stepStart = microtime(true);

        // With no slice enabled, fall back to the four default corpus slices.
        $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
        if (!array_filter($sliceSelectionNormalized)) {
            $sliceSelectionNormalized = [
                'child_welfare'   => true,
                'echr'            => true,
                'family_core'     => true,
                'bufdir_guidance' => true,
            ];
        }

        $ragDb = dbnToolsRagDb();
        try {
            $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
        } catch (Throwable $e) {
            // Best effort: continue with an empty shared-document ID list.
            error_log('Discrepancy slice resolve failed: ' . $e->getMessage());
            $sharedDocIds = [];
        }

        try {
            $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
        } catch (Throwable $e) {
            // Log the underlying cause; the client only receives the generic message.
            error_log('Discrepancy RAG pipeline init failed: ' . $e->getMessage());
            // NOTE(review): relies on dbnToolsAbort() not returning — confirm.
            dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
        }

        // Always run at least one query so synthesis has a chance of getting sources.
        $retrievalQueries = $subQuestions ?: [[
            'id'        => 'q1',
            'question'  => 'ECHR procedural requirements when Barnevernet changes facts between document versions',
            'rationale' => 'Fallback query',
        ]];

        $rawPool           = [];
        $retrievalWarnings = 0;
        $rawCorpusCount    = 0;

        foreach ($retrievalQueries as $idx => $sq) {
            if ($emit) {
                $emit('subq', [
                    'index'    => $idx + 1,
                    'total'    => count($retrievalQueries),
                    'id'       => $sq['id'],
                    'question' => $sq['question'],
                ]);
            }
            try {
                $corpusChunks = $rag->searchAll(
                    $sq['question'],
                    6,
                    null,
                    [
                        'search_private'          => false,
                        'search_shared'           => true,
                        'package_ids'             => [(int)$package['id']],
                        'shared_doc_ids'          => $sharedDocIds,
                        'chunk_limit'             => 6,
                        'search_method'           => 'hybrid',
                        'reranker_enabled'        => true,
                        'include_beta_website'    => false,
                        'include_primary_website' => false,
                    ]
                );
            } catch (Throwable $e) {
                // A failed query downgrades the step to 'warning' instead of aborting.
                error_log('Discrepancy sub-Q retrieval failed: ' . $e->getMessage());
                $corpusChunks = [];
                $retrievalWarnings++;
            }
            $rawCorpusCount += count($corpusChunks);
            foreach ($corpusChunks as $chunk) {
                $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
            }
        }

        $merged          = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
        $this->hydrateSourceUrls($merged);
        // Only the first 12 merged chunks are numbered and passed on to synthesis.
        $numberedSources = $this->numberSources(array_slice($merged, 0, 12));
        $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
        $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
        $emitStep('retrieval', 'Retrieve legal context',
            sprintf('%d sub-Q(s) → %d corpus chunks → %d unique sources.',
                count($retrievalQueries), $rawCorpusCount, count($numberedSources)),
            $retrievalStatus);

        // ── STEP 8b: Synthesis ──────────────────────────────────────────────────
        $engineLabel = match ($engine) {
            'azure_full' => 'Azure gpt-4o',
            'gpu'        => 'GPU qwen2.5:14b',
            default      => 'Azure gpt-4o-mini',
        };
        $emitRunning('synthesis', 'Synthesize report',
            sprintf('Synthesising discrepancy report with %s…', $engineLabel));
        $stepStart = microtime(true);
        $synthesis = $this->synthesize(
            $metaA, $metaB, $nameA, $nameB,
            $partiesDiff, $timelineDiff,
            $numberedSources, $engine, $language
        );
        $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
        $confidence = $this->citationConfidence($numberedSources);
        $emitStep('synthesis', 'Synthesize report',
            sprintf('Report complete · %d source(s) · %s confidence.',
                count($numberedSources), $confidence),
            'complete');

        $synJson = $synthesis['json'];
        return [
            'tool'                   => 'discrepancy',
            'language'               => $language,
            'doc_a_name'             => $nameA,
            'doc_b_name'             => $nameB,
            'doc_a_meta'             => $metaA,
            'doc_b_meta'             => $metaB,
            'parties_a'              => $partiesA,
            'parties_b'              => $partiesB,
            'timeline_a'             => $timelineA,
            'timeline_b'             => $timelineB,
            'parties_diff'           => $partiesDiff,
            'timeline_diff'          => $timelineDiff,
            'headline_finding'       => (string)($synJson['headline_finding'] ?? ''),
            'critical_discrepancies' => is_array($synJson['critical_discrepancies'] ?? null)
                                        ? $synJson['critical_discrepancies'] : [],
            'recommended_actions'    => is_array($synJson['recommended_actions'] ?? null)
                                        ? $synJson['recommended_actions'] : [],
            'what_remains_uncertain' => is_array($synJson['what_remains_uncertain'] ?? null)
                                        ? $synJson['what_remains_uncertain'] : [],
            'sources'                => $numberedSources,
            'sub_questions'          => $subQuestions,
            'citation_confidence'    => $confidence,
            'trace'                  => $trace,
            'trace_metadata'         => [
                'source_count'        => count($numberedSources),
                'sub_question_count'  => count($retrievalQueries),
                'conflict_count'      => $conflictCount,
                'deleted_count'       => $deletedCount,
                'added_count'         => $addedCount,
                'deployment'          => $synthesis['deploy_label'],
                'engine_used'         => $engine,
                'citation_confidence' => $confidence,
                'elapsed_ms_per_step' => $this->stepTimings,
            ],
            'disclaimer' => dbnToolsDisclaimer($language),
        ];
    }

    // ── Per-document classification ────────────────────────────────────────────

    /**
     * Ask the model for basic metadata about one document.
     *
     * Only the first 6000 characters of the text are sent. Any failure (exception
     * or non-array decode result) yields the fallback metadata, in which
     * 'doc_type' is the document label and every other field is null.
     *
     * @param string $docText  Full document text
     * @param string $label    Display label for the document; also the fallback doc_type
     * @param string $language UI language code, resolved via dbnToolsLanguageName()
     *
     * @return array Fallback metadata overlaid with the model's non-null, non-empty fields
     */
    private function classifyDoc(string $docText, string $label, string $language): array
    {
        $fallback = [
            'doc_type'          => $label,
            'doc_date'          => null,
            'issuing_authority' => null,
            'reference_number'  => null,
            'child_info'        => null,
        ];

        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 6000, 'UTF-8');

        $prompt = <<<PROMPT
You are analysing a Norwegian child welfare (Barnevernet) document labelled "{$label}".
Extract metadata. Return JSON only in {$locale}:
{
  "doc_type": "Document type, e.g. Bekymringsmelding, Vedtak, Rapport, Omsorgsovertakelse, Fylkesnemnda-kjennelse",
  "doc_date": "Primary date ISO 8601 (YYYY-MM-DD) or null",
  "issuing_authority": "Issuing authority name or null",
  "reference_number": "Case/reference number or null",
  "child_info": "Brief description of child(ren) — anonymise if redacted"
}
Use null for missing fields. Do not invent information.

Document text (first 6000 chars):
{$excerpt}
PROMPT;

        $messages = [
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ];
        $options = ['json' => true, 'temperature' => 0.05, 'max_tokens' => 400, 'timeout' => 30];

        try {
            $reply   = $this->azure->chatText($messages, $options);
            $decoded = $this->azure->decodeJsonObject($reply);
        } catch (Throwable $e) {
            error_log('Discrepancy classifyDoc failed (' . $label . '): ' . $e->getMessage());
            return $fallback;
        }

        if (!is_array($decoded)) {
            return $fallback;
        }

        // Null and empty-string values must not overwrite the fallback entries.
        $usable = array_filter(
            $decoded,
            static function ($value): bool {
                return !($value === null || $value === '');
            }
        );

        return array_merge($fallback, $usable);
    }

    // ── Per-document party extraction ──────────────────────────────────────────

    /**
     * Ask the model to list the named parties in one document.
     *
     * Only the first 20000 characters of the text are sent. Accepts either
     * {"parties": [...]} or a bare list whose first element has a "name" key.
     * Any failure or unrecognised shape yields an empty array.
     *
     * @param string $docText  Full document text
     * @param string $label    Display label for the document
     * @param string $language UI language code, resolved via dbnToolsLanguageName()
     *
     * @return array At most 20 party entries as returned by the model
     */
    private function extractPartiesDoc(string $docText, string $label, string $language): array
    {
        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 20000, 'UTF-8');

        $prompt = <<<PROMPT
You are analysing a Norwegian child welfare (Barnevernet) document labelled "{$label}".
Identify ALL named parties — every person or institution referred to by name or title.

Respond in {$locale}. Return JSON with key "parties" containing an array. Each object:
- "name": full name or institution name
- "role": e.g. Biological mother, Caseworker, Leder, Barnevernvakta, Politi, Sakkyndig, Talsperson
- "organization": employer/institution or null
- "relationship_to_child": relationship to the child or null

Rules: Include all named people and institutions. Maximum 20 parties.

Document text:
{$excerpt}
PROMPT;

        $messages = [
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ];
        $options = ['json' => true, 'temperature' => 0.05, 'max_tokens' => 2000, 'timeout' => 45];

        try {
            $reply   = $this->azure->chatText($messages, $options);
            $decoded = $this->azure->decodeJsonObject($reply);
        } catch (Throwable $e) {
            error_log('Discrepancy extractPartiesDoc failed (' . $label . '): ' . $e->getMessage());
            return [];
        }

        if (!is_array($decoded)) {
            return [];
        }

        // Preferred shape first, then the bare-list variant.
        if (is_array($decoded['parties'] ?? null)) {
            $found = $decoded['parties'];
        } elseif (isset($decoded[0]['name'])) {
            $found = $decoded;
        } else {
            return [];
        }

        return array_slice($found, 0, 20);
    }

    // ── Per-document timeline extraction ───────────────────────────────────────

    /**
     * Ask the model to build a chronological event list for one document.
     *
     * Only the first 20000 characters of the text are sent. Any failure, or a
     * response without an "events" array, yields an empty array.
     *
     * @param string $docText  Full document text
     * @param string $label    Display label for the document
     * @param string $language UI language code, resolved via dbnToolsLanguageName()
     *
     * @return array At most 40 event entries as returned by the model
     */
    private function extractTimelineDoc(string $docText, string $label, string $language): array
    {
        $locale  = dbnToolsLanguageName($language);
        $excerpt = mb_substr($docText, 0, 20000, 'UTF-8');

        $prompt = <<<PROMPT
Build a chronological timeline from this Norwegian Barnevernet document labelled "{$label}" in {$locale}.

Extract ALL dates and temporal references — visits, meetings, decisions, phone calls, assessments.

Norwegian date formats to recognise:
- DD.MM.YYYY, DD.MM.YY, D.M.YY, DD.MM. (infer year from context)
- Times: kl. HH:MM, klokken HH:MM
- Two-digit years: 20YY

Barnevernet events of HIGH significance:
- Akuttvedtak (§4-6, §4-25), Omsorgsovertakelse (§4-12), police involvement
- Formal vedtak or kjennelse, Fylkesnemnda hearing, Forhandlingsmøte
- Contact (samvær) reduced or denied, foster/institution placement
- Deadline breaches (§4-2 not processed within 7 days, investigation not opened within 6 weeks)

For each event:
- "date": ISO 8601 if determinable, else best-effort description
- "time_of_day": HH:MM or null
- "actor": person/institution involved
- "action": ≤ 80 chars describing what happened
- "significance": "high"|"medium"|"low"

Sort chronologically. Maximum 40 events. Return JSON: {"events":[...]}

Document text:
{$excerpt}
PROMPT;

        $messages = [
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ];
        $options = ['json' => true, 'temperature' => 0.05, 'max_tokens' => 4000, 'timeout' => 55];

        try {
            $reply   = $this->azure->chatText($messages, $options);
            $decoded = $this->azure->decodeJsonObject($reply);
        } catch (Throwable $e) {
            error_log('Discrepancy extractTimelineDoc failed (' . $label . '): ' . $e->getMessage());
            return [];
        }

        $events = is_array($decoded) ? ($decoded['events'] ?? null) : null;

        return is_array($events) ? array_slice($events, 0, 40) : [];
    }

    // ── Cross-reference: parties ───────────────────────────────────────────────

    /**
     * Ask the model to diff the two party lists.
     *
     * The result always has the keys 'in_a_only', 'in_b_only' and
     * 'changed_between', each holding a list of array rows. Model output is
     * validated before use: a field that is not an array is replaced by an empty
     * list, non-array rows are dropped (and logged), and unknown keys are
     * discarded. Callers can therefore count() and index the result safely.
     *
     * @param array  $partiesA Parties extracted from document A
     * @param array  $partiesB Parties extracted from document B
     * @param string $nameA    Display name of document A
     * @param string $nameB    Display name of document B
     * @param string $language UI language code, resolved via dbnToolsLanguageName()
     *
     * @return array {in_a_only: list, in_b_only: list, changed_between: list}
     */
    private function crossReferenceParties(
        array  $partiesA,
        array  $partiesB,
        string $nameA,
        string $nameB,
        string $language
    ): array {
        $locale       = dbnToolsLanguageName($language);
        $partiesAJson = json_encode($partiesA, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
        $partiesBJson = json_encode($partiesB, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);

        $prompt = <<<PROMPT
You are comparing parties across two versions of Norwegian Barnevernet documents.

Document A ({$nameA}) parties:
{$partiesAJson}

Document B ({$nameB}) parties:
{$partiesBJson}

Compare the two party lists and find:
1. Parties in A but absent from B — people/institutions removed from the later version
2. New parties in B not in A — new people/institutions introduced in the later version
3. The same person appearing in both but with a changed role, description, or relationship

For each entry explain the potential legal significance in a Barnevernet case context.

Return JSON only in {$locale}:
{
  "in_a_only": [
    {"name":"...","role_in_a":"...","significance":"One sentence why their removal may matter (≤ 130 chars)"}
  ],
  "in_b_only": [
    {"name":"...","role_in_b":"...","significance":"One sentence why their addition may matter (≤ 130 chars)"}
  ],
  "changed_between": [
    {"name":"...","in_a":"Role/details in A","in_b":"Role/details in B","significance":"One sentence on the change (≤ 130 chars)"}
  ]
}

Rules:
- Only flag genuine discrepancies. Match the same person with minor name spelling variations.
- Do not invent parties not present in the data above.
- If no discrepancies of a type exist, return an empty array.
PROMPT;

        $default = ['in_a_only' => [], 'in_b_only' => [], 'changed_between' => []];
        try {
            $raw  = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 2000, 'timeout' => 50]);
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json)) {
                $result = $default;
                foreach (array_keys($default) as $key) {
                    $candidate = $json[$key] ?? null;
                    if (!is_array($candidate)) {
                        // A string/number here would make count() throw in callers.
                        continue;
                    }
                    $rows = array_values(array_filter(
                        $candidate,
                        static fn ($row): bool => is_array($row)
                    ));
                    if (count($rows) !== count($candidate)) {
                        error_log(sprintf(
                            'Discrepancy crossReferenceParties dropped %d malformed "%s" row(s).',
                            count($candidate) - count($rows),
                            $key
                        ));
                    }
                    $result[$key] = $rows;
                }
                return $result;
            }
        } catch (Throwable $e) {
            error_log('Discrepancy crossReferenceParties failed: ' . $e->getMessage());
        }
        return $default;
    }

    // ── Cross-reference: timelines ─────────────────────────────────────────────

    /**
     * Ask the model to diff the two timelines, with short source excerpts as context.
     *
     * At most 30 events per timeline and the first 3000 characters of each
     * document are sent. The result always has the keys 'conflicts',
     * 'in_a_only', 'in_b_only', 'procedural_gaps' (lists of array rows) and
     * 'narrative_shifts' (with 'summary', 'new_in_b', 'removed_from_b').
     * Model output is validated before use: wrongly typed fields fall back to
     * their defaults, non-array rows are dropped (and logged), and unknown
     * top-level keys are discarded.
     *
     * @param array  $timelineA Events extracted from document A
     * @param array  $timelineB Events extracted from document B
     * @param string $textA     Text of document A
     * @param string $textB     Text of document B
     * @param string $nameA     Display name of document A
     * @param string $nameB     Display name of document B
     * @param string $language  UI language code, resolved via dbnToolsLanguageName()
     *
     * @return array {conflicts, in_a_only, in_b_only, procedural_gaps, narrative_shifts}
     */
    private function crossReferenceTimelines(
        array  $timelineA,
        array  $timelineB,
        string $textA,
        string $textB,
        string $nameA,
        string $nameB,
        string $language
    ): array {
        $locale  = dbnToolsLanguageName($language);
        $tlAJson = json_encode(array_slice($timelineA, 0, 30), JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
        $tlBJson = json_encode(array_slice($timelineB, 0, 30), JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
        $excerptA = mb_substr($textA, 0, 3000, 'UTF-8');
        $excerptB = mb_substr($textB, 0, 3000, 'UTF-8');

        $prompt = <<<PROMPT
You are comparing timelines from two versions of Norwegian Barnevernet documents to find legally significant discrepancies.

Document A ({$nameA}) timeline:
{$tlAJson}

Document B ({$nameB}) timeline:
{$tlBJson}

Source excerpt from Document A:
{$excerptA}

Source excerpt from Document B:
{$excerptB}

Find all discrepancies:
1. CONTRADICTIONS — same date/event described differently between A and B
2. DELETIONS — events in A that are absent or missing from B (removed facts)
3. ADDITIONS — events in B not present in A (new allegations or narrative elements)
4. DATE SHIFTS — same event but with a different date in A vs B
5. PROCEDURAL GAPS — actions referenced but not documented in either version

Also identify overall NARRATIVE SHIFTS — how the framing changed between A and B.

For significance: "high" (changes facts central to the decision), "medium" (changes context or procedure), "low" (minor wording).

Return JSON only in {$locale}:
{
  "conflicts": [
    {
      "date_a": "YYYY-MM-DD or description or null",
      "date_b": "YYYY-MM-DD or description or null",
      "doc_a_says": "What Document A says about this event",
      "doc_b_says": "What Document B says about this event",
      "conflict_type": "contradiction|deletion|addition|date_shift",
      "significance": "high|medium|low",
      "legal_significance": "One sentence why this matters legally (≤ 150 chars)"
    }
  ],
  "in_a_only": [
    {
      "date": "...",
      "actor": "...",
      "description": "Event in A not present in B",
      "significance": "high|medium|low",
      "legal_significance": "..."
    }
  ],
  "in_b_only": [
    {
      "date": "...",
      "actor": "...",
      "description": "New event in B not present in A",
      "significance": "high|medium|low",
      "legal_significance": "..."
    }
  ],
  "procedural_gaps": [
    {"gap": "Description of the gap", "significance": "high|medium|low"}
  ],
  "narrative_shifts": {
    "summary": "1-2 sentence description of how the overall narrative changed between A and B",
    "new_in_b": ["Key new allegation or narrative element added in B"],
    "removed_from_b": ["Key fact or narrative element present in A but absent in B"]
  }
}

Rules:
- Only report genuine discrepancies grounded in the data above. Do not invent events.
- If no discrepancies of a type exist, return an empty array.
- Maximum 15 conflicts, 10 in_a_only, 10 in_b_only.
PROMPT;

        $default = [
            'conflicts'        => [],
            'in_a_only'        => [],
            'in_b_only'        => [],
            'procedural_gaps'  => [],
            'narrative_shifts' => ['summary' => '', 'new_in_b' => [], 'removed_from_b' => []],
        ];
        try {
            $raw  = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 4000, 'timeout' => 90]);
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json)) {
                $result = $default;

                // Row lists: must be arrays of array rows so callers can count()/index them.
                foreach (['conflicts', 'in_a_only', 'in_b_only', 'procedural_gaps'] as $key) {
                    $candidate = $json[$key] ?? null;
                    if (!is_array($candidate)) {
                        continue;
                    }
                    $rows = array_values(array_filter(
                        $candidate,
                        static fn ($row): bool => is_array($row)
                    ));
                    if (count($rows) !== count($candidate)) {
                        error_log(sprintf(
                            'Discrepancy crossReferenceTimelines dropped %d malformed "%s" row(s).',
                            count($candidate) - count($rows),
                            $key
                        ));
                    }
                    $result[$key] = $rows;
                }

                // Narrative shifts: guarantee all three sub-keys with usable types.
                if (is_array($json['narrative_shifts'] ?? null)) {
                    $shifts = array_merge($default['narrative_shifts'], $json['narrative_shifts']);
                    $shifts['summary'] = is_scalar($shifts['summary']) ? (string)$shifts['summary'] : '';
                    foreach (['new_in_b', 'removed_from_b'] as $listKey) {
                        $shifts[$listKey] = is_array($shifts[$listKey])
                            ? array_values($shifts[$listKey])
                            : [];
                    }
                    $result['narrative_shifts'] = $shifts;
                }

                return $result;
            }
        } catch (Throwable $e) {
            error_log('Discrepancy crossReferenceTimelines failed: ' . $e->getMessage());
        }
        return $default;
    }

    // ── Sub-question generation ────────────────────────────────────────────────

    /**
     * Turn the detected discrepancies into legal research questions.
     *
     * Builds a short summary plus one example per discrepancy type, asks the
     * model for research questions, and keeps at most 5 entries that have a
     * non-empty scalar id and question. If the model call fails or yields no
     * usable entry, four fixed fallback questions are returned.
     *
     * The diff and metadata arrays originate from model output, so every field
     * is type-checked before it is counted, indexed or interpolated. Malformed
     * fields are treated as absent rather than raising a TypeError or warning.
     *
     * @param array  $partiesDiff  Result of crossReferenceParties()
     * @param array  $timelineDiff Result of crossReferenceTimelines()
     * @param array  $metaA        Metadata of document A (doc_type, issuing_authority are read)
     * @param array  $metaB        Metadata of document B (doc_type is read)
     * @param string $language     UI language code, resolved via dbnToolsLanguageName()
     *
     * @return array List of {id: string, question: string, rationale: string}
     */
    private function generateDiscrepancySubQ(
        array  $partiesDiff,
        array  $timelineDiff,
        array  $metaA,
        array  $metaB,
        string $language
    ): array {
        $locale = dbnToolsLanguageName($language);

        // Defensive accessors for model-derived data (see docblock).
        $asList = static fn ($value): array => is_array($value) ? array_values($value) : [];
        $firstRow = static function (array $list): array {
            $row = $list[0] ?? null;
            return is_array($row) ? $row : [];
        };
        $text = static fn ($value, string $fallback = ''): string
            => is_scalar($value) ? (string)$value : $fallback;

        $parts          = [];
        $changedParties = $asList($partiesDiff['changed_between'] ?? null);
        $pRemove        = count($asList($partiesDiff['in_a_only'] ?? null));
        $pAdd           = count($asList($partiesDiff['in_b_only'] ?? null));
        $pChange        = count($changedParties);
        if ($pRemove) $parts[] = "{$pRemove} parties removed between versions";
        if ($pAdd)    $parts[] = "{$pAdd} new parties added in later version";
        if ($pChange) $parts[] = "{$pChange} parties changed between versions";

        $conflicts = $asList($timelineDiff['conflicts'] ?? null);
        $deleted   = $asList($timelineDiff['in_a_only'] ?? null);
        $added     = $asList($timelineDiff['in_b_only'] ?? null);
        $procGaps  = $asList($timelineDiff['procedural_gaps'] ?? null);
        if ($conflicts) $parts[] = count($conflicts) . ' timeline contradictions';
        if ($deleted)   $parts[] = count($deleted)   . ' events deleted from later version';
        if ($added)     $parts[] = count($added)      . ' new events added in later version';
        if ($procGaps)  $parts[] = count($procGaps)   . ' procedural gaps identified';

        $summary  = $parts ? implode(', ', $parts) . '.' : 'Some discrepancies found.';
        $docTypeA = $text($metaA['doc_type'] ?? null, 'Document A');
        $docTypeB = $text($metaB['doc_type'] ?? null, 'Document B');
        $authA    = $text($metaA['issuing_authority'] ?? null, 'the municipality');

        // One concrete example per discrepancy type, taken from the first row of each list.
        $exampleFacts = '';
        $c = $firstRow($conflicts);
        if ($c) {
            $aSays = $text($c['doc_a_says'] ?? null);
            $bSays = $text($c['doc_b_says'] ?? null);
            $exampleFacts .= "- Contradiction: A says '{$aSays}', B says '{$bSays}'\n";
        }
        $firstDeleted = $firstRow($deleted);
        if ($firstDeleted) {
            $description   = $text($firstDeleted['description'] ?? null);
            $exampleFacts .= "- Deleted from B: '{$description}'\n";
        }
        $firstAdded = $firstRow($added);
        if ($firstAdded) {
            $description   = $text($firstAdded['description'] ?? null);
            $exampleFacts .= "- New in B: '{$description}'\n";
        }
        $firstGap = $firstRow($procGaps);
        if ($firstGap) {
            $gap           = $text($firstGap['gap'] ?? null);
            $exampleFacts .= "- Procedural gap: '{$gap}'\n";
        }
        $pc = $firstRow($changedParties);
        if ($pc) {
            $pcName        = $text($pc['name'] ?? null);
            $pcInA         = $text($pc['in_a'] ?? null);
            $pcInB         = $text($pc['in_b'] ?? null);
            $exampleFacts .= "- Party change: {$pcName}: '{$pcInA}' → '{$pcInB}'\n";
        }

        $prompt = <<<PROMPT
A family uploaded two Barnevernet documents for comparison:
- Document A: {$docTypeA} from {$authA}
- Document B: {$docTypeB}
- Discrepancies found: {$summary}

Most significant examples:
{$exampleFacts}

Generate exactly 4 specific legal research questions targeting the legal significance of these discrepancies.

Focus areas:
1. ECHR Article 8 procedural fairness when Barnevernet changes factual narrative between document versions
2. Barnevernloven requirements for changing the stated basis for an intervention
3. Procedural obligations when new allegations are introduced after initial filing
4. Documentation and evidence standards (Bufdir/Statsforvalter guidance)

Make each question specific to the discrepancies above — embed actual details.

Return JSON only in {$locale}:
{
  "sub_questions": [
    {"id":"q1","question":"...","rationale":"Why this angle matters (≤ 100 chars)"}
  ]
}
PROMPT;

        try {
            $raw  = $this->azure->chatText([
                ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
                ['role' => 'user',   'content' => $prompt],
            ], ['json' => true, 'temperature' => 0.15, 'max_tokens' => 1000, 'timeout' => 40]);
            $json = $this->azure->decodeJsonObject($raw);
            if (is_array($json) && is_array($json['sub_questions'] ?? null)) {
                $sqs = [];
                foreach (array_slice($json['sub_questions'], 0, 5) as $sq) {
                    if (!is_array($sq)) {
                        continue;
                    }
                    // Non-scalar id/question would otherwise be cast to the string "Array".
                    $id       = $text($sq['id'] ?? null);
                    $question = $text($sq['question'] ?? null);
                    if (!empty($id) && !empty($question)) {
                        $sqs[] = [
                            'id'        => $id,
                            'question'  => $question,
                            'rationale' => $text($sq['rationale'] ?? null),
                        ];
                    }
                }
                if ($sqs) return $sqs;
            }
        } catch (Throwable $e) {
            error_log('Discrepancy generateDiscrepancySubQ failed: ' . $e->getMessage());
        }

        // Fixed fallback questions when the model gave nothing usable.
        return [
            ['id' => 'q1', 'question' => 'What does ECHR Article 8 require when Barnevernet changes the factual basis of an intervention between document versions?', 'rationale' => 'ECHR procedural fairness'],
            ['id' => 'q2', 'question' => 'Under Barnevernloven, can new allegations be introduced after the initial care order application has been filed?', 'rationale' => 'New allegations validity'],
            ['id' => 'q3', 'question' => 'What are Barnevernloven documentation requirements for home visits and assessments?', 'rationale' => 'Documentation obligations'],
            ['id' => 'q4', 'question' => 'What Bufdir guidance exists on evidence standards and investigation quality for Barnevernet interventions?', 'rationale' => 'Evidence standards'],
        ];
    }

    // ── Synthesis ──────────────────────────────────────────────────────────────

    /**
     * Produce the final discrepancy report by sending the diffs and the
     * numbered corpus sources to the selected engine.
     *
     * When no corpus sources are supplied, no model call is made and a static
     * report asking the user to enable corpus slices is returned instead.
     *
     * @param array  $metaA           Document A metadata; reads doc_type, doc_date, issuing_authority.
     * @param array  $metaB           Document B metadata; reads doc_type, doc_date, issuing_authority.
     * @param string $nameA           Used as Document A's type label when doc_type is missing.
     * @param string $nameB           Used as Document B's type label when doc_type is missing.
     * @param array  $partiesDiff     Reads in_a_only, in_b_only, changed_between.
     * @param array  $timelineDiff    Reads conflicts, in_a_only, in_b_only, procedural_gaps, narrative_shifts.
     * @param array  $numberedSources Sources carrying n, title, package_or_corpus, excerpt and
     *                                optionally section, authority_label, authority_type.
     * @param string $engine          'gpu' and 'azure_full' are handled explicitly; anything else
     *                                uses the gateway's default deployment.
     * @param string $language        UI language code, mapped to a language name for the prompt.
     *
     * @return array{json: array, deploy_label: string}
     */
    private function synthesize(
        array  $metaA,
        array  $metaB,
        string $nameA,
        string $nameB,
        array  $partiesDiff,
        array  $timelineDiff,
        array  $numberedSources,
        string $engine,
        string $language
    ): array {
        $locale      = dbnToolsLanguageName($language);
        $sourceCount = count($numberedSources);
        $deployLabel = match ($engine) {
            'gpu'        => 'GPU (cuttlefish)',
            'azure_full' => 'gpt-4o',
            default      => $this->azure->chatDeployment(),
        };

        // Without corpus sources there is nothing to cite, so skip the model call.
        if (empty($numberedSources)) {
            return [
                'json' => [
                    'headline_finding'       => 'No corpus sources retrieved. Discrepancies were identified but could not be cross-referenced with the legal corpus for legal significance assessment.',
                    'critical_discrepancies' => [],
                    'recommended_actions'    => ['Enable corpus slices (Child Welfare, ECHR, Family Core, Bufdir Guidance) and re-run for legal significance mapping.'],
                    'what_remains_uncertain' => ['Legal significance of each discrepancy — re-run with corpus slices enabled.'],
                ],
                'deploy_label' => $deployLabel,
            ];
        }

        // Render each source as a numbered block the model can cite as [n].
        $sourcesContext = [];
        foreach ($numberedSources as $s) {
            $sourcesContext[] = sprintf(
                "[%d] %s%s\n    Corpus: %s | Authority: %s\n    Excerpt: %s",
                $s['n'],
                $s['title'],
                !empty($s['section']) ? ' — ' . $s['section'] : '',
                $s['package_or_corpus'],
                $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
                $s['excerpt']
            );
        }
        $sourcesText = implode("\n\n", $sourcesContext);

        // Lists are capped to bound prompt size. JSON_INVALID_UTF8_SUBSTITUTE
        // keeps json_encode() from returning false on malformed UTF-8 in the
        // diff text, which would otherwise interpolate as an empty string and
        // silently drop every discrepancy from the prompt.
        $discrepancyJson = json_encode([
            'timeline_conflicts'    => array_slice($timelineDiff['conflicts'] ?? [], 0, 10),
            'events_deleted_from_b' => array_slice($timelineDiff['in_a_only'] ?? [], 0, 8),
            'events_added_in_b'     => array_slice($timelineDiff['in_b_only'] ?? [], 0, 8),
            'procedural_gaps'       => array_slice($timelineDiff['procedural_gaps'] ?? [], 0, 5),
            'narrative_shifts'      => $timelineDiff['narrative_shifts'] ?? [],
            'parties_removed'       => $partiesDiff['in_a_only'] ?? [],
            'parties_added'         => $partiesDiff['in_b_only'] ?? [],
            'parties_changed'       => $partiesDiff['changed_between'] ?? [],
        ], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE);
        if ($discrepancyJson === false) {
            // Any remaining encoding failure is logged rather than hidden.
            error_log('Discrepancy synthesize: could not encode discrepancies: ' . json_last_error_msg());
            $discrepancyJson = '{}';
        }

        $docTypeA  = $metaA['doc_type'] ?? $nameA;
        $docDateA  = $metaA['doc_date'] ?? '?';
        $docTypeB  = $metaB['doc_type'] ?? $nameB;
        $docDateB  = $metaB['doc_date'] ?? '?';
        $authority = $metaA['issuing_authority'] ?? $metaB['issuing_authority'] ?? 'the authority';

        $prompt = <<<PROMPT
You are Do Better Norge Legal Tools evaluating discrepancies between two Barnevernet document versions.

HALLUCINATION RULES:
- Only cite statute sections (§), ECHR articles, and case names that appear verbatim in the corpus sources below.
- Do not cite from training memory. Every legal citation must use [n] notation.

== DOCUMENTS ==
Document A: {$docTypeA} · {$docDateA} · {$authority}
Document B: {$docTypeB} · {$docDateB}

== DISCREPANCIES IDENTIFIED ==
{$discrepancyJson}

== CORPUS SOURCES ({$sourceCount} numbered — cite as [n]) ==
{$sourcesText}

== OUTPUT ==
Return valid JSON only. No markdown fences.

{
  "headline_finding": "2-3 sentence plain-language summary of the most significant discrepancy and its legal implication.",

  "critical_discrepancies": [
    {
      "category": "timeline_conflict|narrative_shift|party_discrepancy|procedural_gap",
      "title": "Short title ≤ 60 chars",
      "document_a_says": "What Document A says",
      "document_b_says": "What Document B says or what is missing",
      "significance": "high|medium|low",
      "legal_relevance": "How this may affect the case — cite [n] if corpus supports",
      "citations": ["[1]", "[3]"]
    }
  ],

  "recommended_actions": [
    "2-5 specific concrete actions for the family or their lawyer"
  ],

  "what_remains_uncertain": [
    "2-4 specific questions needing legal professional verification"
  ]
}

Rules:
- critical_discrepancies: max 10 items, ordered high → low significance.
- Only include genuine discrepancies from the data provided.
- High-significance items must cite at least one [n] if corpus evidence exists.
- recommended_actions must be concrete, not generic.
- Respond in {$locale}.
PROMPT;

        $sysPrompt = 'You return valid JSON only. No markdown fences. Only cite legal sources from the provided corpus, not training memory.';
        $messages  = [
            ['role' => 'system', 'content' => $sysPrompt],
            ['role' => 'user',   'content' => $prompt],
        ];
        $opts = ['json' => true, 'temperature' => 0.15, 'max_tokens' => 4000, 'timeout' => 240];

        $raw = '';
        try {
            if ($engine === 'gpu') {
                $response = dbnToolsCallGpuLlm($messages, $opts);
                $raw = (string)($response['choices'][0]['message']['content'] ?? '');
            } elseif ($engine === 'azure_full') {
                $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
            } else {
                $raw = $this->azure->chatText($messages, $opts);
            }
        } catch (Throwable $e) {
            // NOTE(review): dbnToolsAbort presumably ends the request — confirm.
            dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
        }

        $emptyReport = [
            'critical_discrepancies' => [],
            'recommended_actions'    => [],
            'what_remains_uncertain' => [],
        ];

        $json = $this->azure->decodeJsonObject($raw);
        if (!is_array($json) || empty($json['headline_finding'])) {
            // Unparseable output: surface the raw text as the headline, but
            // never hand back an empty headline when the model said nothing.
            $headline = trim($raw) !== ''
                ? $raw
                : 'The synthesis model returned no usable output. Please re-run the comparison.';
            $json = ['headline_finding' => $headline] + $emptyReport;
        } else {
            // Back-fill any report keys the model omitted so the result has the
            // same shape as the other two return paths.
            $json += $emptyReport;
        }
        return ['json' => $json, 'deploy_label' => $deployLabel];
    }

    // ── Corpus helpers ─────────────────────────────────────────────────────────

    /**
     * Map a raw retrieval row onto the source shape used by the rest of the
     * pipeline.
     *
     * URL, deep-link and authority-label fields start as null; the chunk is
     * tagged with the sub-question that retrieved it.
     *
     * @param array  $chunk  Raw retrieval row.
     * @param string $subQId Id of the sub-question that produced this hit.
     */
    private function normalizeCorpusChunk(array $chunk, string $subQId): array
    {
        // Small local converters: absent values stay null instead of becoming 0.
        $intOrNull   = static fn (string $field): ?int => isset($chunk[$field]) ? (int)$chunk[$field] : null;
        $scoreOrNull = static fn (string $field): ?float => isset($chunk[$field])
            ? round((float)$chunk[$field], 4)
            : null;

        $fullText = (string)($chunk['content'] ?? '');
        $heading  = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source');
        $corpus   = (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal');

        $normalized = [];
        $normalized['chunk_id']              = $intOrNull('id');
        $normalized['title']                 = $heading;
        $normalized['section']               = $chunk['section_title'] ?? null;
        $normalized['package_or_corpus']     = $corpus;
        $normalized['excerpt']               = dbnToolsExcerpt($fullText, 620);
        $normalized['chunk_text']            = $fullText;
        $normalized['similarity']            = $scoreOrNull('similarity');
        $normalized['reranker_score']        = $scoreOrNull('reranker_score');
        $normalized['document_id']           = $intOrNull('document_id');
        $normalized['source_origin']         = 'corpus';
        $normalized['authority_type']        = $chunk['authority_type'] ?? null;
        $normalized['jurisdiction']          = $chunk['jurisdiction'] ?? null;
        $normalized['source_url']            = null;
        $normalized['deep_link']             = null;
        $normalized['authority_label']       = null;
        $normalized['matched_sub_questions'] = [$subQId];

        return $normalized;
    }

    /**
     * Collapse duplicate chunks, rank the survivors and keep the top $cap.
     *
     * Chunks sharing a chunk_id are merged: their matched_sub_questions are
     * unioned and the higher reranker_score / similarity is kept. Chunks
     * without a chunk_id are never merged with anything.
     *
     * Ranking is descending by reranker_score, falling back to similarity and
     * then 0 when a score is missing.
     *
     * @param array $rawPool Normalized chunks, possibly containing duplicates.
     * @param int   $cap     Maximum number of chunks to return.
     *
     * @return array Re-indexed list of at most $cap chunks.
     */
    private function mergeAndDedupe(array $rawPool, int $cap): array
    {
        $byKey    = [];
        $position = 0;
        foreach ($rawPool as $chunk) {
            // Id-less chunks get a positional key in a separate namespace. It is
            // deterministic and cannot collide with a real chunk id, unlike a
            // random hex string, which may be all digits.
            $chunkId = $chunk['chunk_id'] ?? null;
            $key     = $chunkId !== null ? 'corpus:' . $chunkId : 'anon:' . $position;
            $position++;

            if (!isset($byKey[$key])) {
                $byKey[$key] = $chunk;
                continue;
            }

            $existing = $byKey[$key];
            $existing['matched_sub_questions'] = array_values(array_unique(array_merge(
                $existing['matched_sub_questions'] ?? [],
                $chunk['matched_sub_questions'] ?? []
            )));
            // Keep the best score seen for this chunk across sub-questions.
            if (($chunk['reranker_score'] ?? 0) > ($existing['reranker_score'] ?? 0)) {
                $existing['reranker_score'] = $chunk['reranker_score'];
            }
            if (($chunk['similarity'] ?? 0) > ($existing['similarity'] ?? 0)) {
                $existing['similarity'] = $chunk['similarity'];
            }
            $byKey[$key] = $existing;
        }

        $merged = array_values($byKey);
        usort($merged, static function (array $a, array $b): int {
            $aScore = $a['reranker_score'] ?? $a['similarity'] ?? 0;
            $bScore = $b['reranker_score'] ?? $b['similarity'] ?? 0;
            return $bScore <=> $aScore;
        });
        return array_slice($merged, 0, $cap);
    }

    /**
     * Assign each chunk its citation number 'n', counting from 1.
     *
     * Numbers follow iteration order rather than the incoming array keys, so
     * the result is always contiguous 1..N even when the input has sparse or
     * string keys.
     *
     * @param array $chunks Chunks in the order they should be cited.
     *
     * @return array Re-indexed list of the same chunks, each with 'n' set.
     */
    private function numberSources(array $chunks): array
    {
        $out = [];
        $n   = 0;
        foreach ($chunks as $c) {
            $c['n'] = ++$n;
            $out[]  = $c;
        }
        return $out;
    }

    /**
     * Grade the retrieved sources as 'high', 'medium' or 'low' confidence.
     *
     * The grade combines the number of sources with the best numeric score
     * among them (reranker_score, else similarity):
     *   - high:   at least 5 sources and best score >= 0.5
     *   - medium: at least 3 sources and best score >= 0.35
     *   - low:    everything else, including an empty list
     *
     * @param array $sources Source records.
     */
    private function citationConfidence(array $sources): string
    {
        $total = count($sources);
        if ($total === 0) {
            return 'low';
        }

        // Collect only usable (numeric) scores; sources without one are ignored.
        $numericScores = [];
        foreach ($sources as $source) {
            $candidate = $source['reranker_score'] ?? $source['similarity'] ?? null;
            if (is_numeric($candidate)) {
                $numericScores[] = $candidate;
            }
        }
        $top = $numericScores === [] ? 0 : max($numericScores);

        if ($total >= 5 && $top >= 0.5) {
            return 'high';
        }
        if ($total >= 3 && $top >= 0.35) {
            return 'medium';
        }
        return 'low';
    }

    /**
     * Enrich pool entries in place with document-level metadata.
     *
     * Looks up every distinct positive document_id in the RAG database's
     * documents table, then resolves corpus source names from corpus_sources.
     * Matching entries receive source_url, deep_link (same value as
     * source_url), authority_label, corpus_source_name and publication_date.
     *
     * Any failure during the lookups is logged and leaves the pool untouched.
     *
     * @param array $pool Normalized chunks; modified by reference.
     */
    private function hydrateSourceUrls(array &$pool): void
    {
        // Distinct positive document ids, using array keys as a set.
        $wantedDocs = [];
        foreach ($pool as $entry) {
            $candidateId = (int)($entry['document_id'] ?? 0);
            if ($candidateId > 0) {
                $wantedDocs[$candidateId] = true;
            }
        }
        if ($wantedDocs === []) {
            return;
        }

        try {
            $rag      = dbnToolsRagDb();
            $idList   = array_keys($wantedDocs);
            $docMarks = implode(',', array_fill(0, count($idList), '?'));
            $docQuery = $rag->prepare(
                "SELECT d.id, d.source_url, d.authority_type, d.publication_date, d.source_id, d.title
                 FROM documents d WHERE d.id IN ({$docMarks})"
            );
            $docQuery->execute($idList);

            $docMeta   = [];
            $sourceIds = [];
            foreach ($docQuery as $record) {
                $sourceId = isset($record['source_id']) ? (int)$record['source_id'] : null;
                if ($sourceId) {
                    $sourceIds[] = $sourceId;
                }
                $docMeta[(int)$record['id']] = [
                    'source_url'       => $record['source_url'] ?? null,
                    'authority_label'  => dbnV6AuthorityLabel($record['authority_type'] ?? null),
                    'publication_date' => $record['publication_date'] ?? null,
                    'source_id'        => $sourceId,
                ];
            }

            if ($sourceIds) {
                // Second lookup: corpus source names for the sources just seen.
                $distinctSources = array_values(array_unique($sourceIds));
                $sourceMarks     = implode(',', array_fill(0, count($distinctSources), '?'));
                $nameQuery       = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sourceMarks})");
                $nameQuery->execute($distinctSources);

                $namesById = [];
                foreach ($nameQuery as $record) {
                    $namesById[(int)$record['id']] = dbnV6RepairText((string)($record['name'] ?? 'Do Better Legal'));
                }

                foreach ($docMeta as $metaDocId => $meta) {
                    $metaSourceId = $meta['source_id'];
                    if ($metaSourceId && isset($namesById[$metaSourceId])) {
                        $docMeta[$metaDocId]['corpus_source_name'] = $namesById[$metaSourceId];
                    }
                }
            }
        } catch (Throwable $e) {
            error_log('Discrepancy hydrateSourceUrls failed: ' . $e->getMessage());
            return;
        }

        // Write the metadata back onto the caller's pool.
        foreach ($pool as $index => $entry) {
            $entryDocId = (int)($entry['document_id'] ?? 0);
            if ($entryDocId === 0 || !isset($docMeta[$entryDocId])) {
                continue;
            }
            $meta = $docMeta[$entryDocId];
            $pool[$index]['source_url']         = $meta['source_url'] ?? null;
            $pool[$index]['deep_link']          = $meta['source_url'] ?? null;
            $pool[$index]['authority_label']    = $meta['authority_label'] ?? $entry['authority_label'];
            $pool[$index]['corpus_source_name'] = $meta['corpus_source_name'] ?? null;
            $pool[$index]['publication_date']   = $meta['publication_date'] ?? null;
        }
    }

    /**
     * Fetch the 'family-legal' corpus package and check that it is usable.
     *
     * Calls dbnToolsAbort with a 503 when the package is missing or inactive
     * ('package_unavailable') or when the client has no active subscription
     * to it ('subscription_missing').
     * NOTE(review): dbnToolsAbort presumably ends the request — confirm.
     *
     * @param int $clientId Client whose subscription is checked.
     *
     * @return array The package record.
     */
    private function requireFamilyPackage(int $clientId): array
    {
        $familyPackage = dbnToolsFetchPackage('family-legal');

        $isUsable = $familyPackage && !empty($familyPackage['is_active']);
        if (!$isUsable) {
            dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
        }

        $isSubscribed = dbnToolsHasActiveSubscription($clientId, (int)$familyPackage['id']);
        if (!$isSubscribed) {
            dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
        }

        return $familyPackage;
    }

    /**
     * Milliseconds elapsed since $start, rounded to the nearest integer.
     *
     * @param float $start A timestamp previously taken with microtime(true).
     */
    private function elapsedMs(float $start): int
    {
        $elapsedSeconds = microtime(true) - $start;

        return (int)round($elapsedSeconds * 1000);
    }
}