azure = $azure ?: DbnGatewayFactory::makeForTool('discrepancy-find'); } /** * @param array $fileA {filename, text, chars, truncated} * @param array $fileB {filename, text, chars, truncated} * @param string $engine 'azure_mini'|'azure_full'|'gpu' * @param string $language 'en'|'no'|'uk'|'pl' * @param array $sliceSelection Corpus slice toggles * @param callable|null $emit function(string $event, array $payload): void */ public function run( array $fileA, array $fileB, string $engine, string $language, array $sliceSelection, ?callable $emit = null ): array { $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini'; $language = dbnToolsNormalizeUiLanguage($language); $textA = mb_substr((string)($fileA['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8'); $textB = mb_substr((string)($fileB['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8'); if ($textA === '' || $textB === '') { dbnToolsAbort('Could not extract text from one or both uploaded files.', 422, 'empty_document'); } $nameA = (string)($fileA['filename'] ?? 'Document A'); $nameB = (string)($fileB['filename'] ?? 'Document B'); $client = dbnToolsRequireClient(); $package = $this->requireFamilyPackage((int)$client['id']); dbnToolsBootCaveau(); $aiPortalRoot = dbnToolsAiPortalRoot(); require_once $aiPortalRoot . '/platform/includes/dbn_v6.php'; $this->stepTimings = []; $trace = []; $emitStep = function (string $stepId, string $label, string $detail, string $status) use (&$trace, $emit): void { $trace[] = ['label' => $label, 'detail' => $detail, 'status' => $status]; if ($emit) { $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => $status]); } }; $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void { if ($emit) { $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => 'running']); } }; // ── STEP 1+2: Classify both documents ───────────────────────────────── $emitRunning('doc_classify', 'Classify documents', "Classifying {$nameA}…"); $stepStart = microtime(true); $metaA = $this->classifyDoc($textA, $nameA, $language); if ($emit) $emit('doc_a_meta', ['result' => $metaA]); if ($emit) $emit('progress', ['detail' => "Classifying {$nameB}…"]); $metaB = $this->classifyDoc($textB, $nameB, $language); if ($emit) $emit('doc_b_meta', ['result' => $metaB]); $this->stepTimings['doc_classify'] = $this->elapsedMs($stepStart); $emitStep('doc_classify', 'Classify documents', sprintf('%s (%s) → %s (%s)', $metaA['doc_type'] ?? 'Document A', $metaA['doc_date'] ?? '?', $metaB['doc_type'] ?? 'Document B', $metaB['doc_date'] ?? '?'), 'complete'); // ── STEP 3: Extract parties from both documents ───────────────────────── $emitRunning('party_extract', 'Extract parties', "Extracting parties from {$nameA}…"); $stepStart = microtime(true); $partiesA = $this->extractPartiesDoc($textA, $nameA, $language); if ($emit) $emit('parties_a', ['parties' => $partiesA]); if ($emit) $emit('progress', ['detail' => "Extracting parties from {$nameB}…"]); $partiesB = $this->extractPartiesDoc($textB, $nameB, $language); if ($emit) $emit('parties_b', ['parties' => $partiesB]); $this->stepTimings['party_extract'] = $this->elapsedMs($stepStart); $emitStep('party_extract', 'Extract parties', sprintf('%d in %s · %d in %s', count($partiesA), $nameA, count($partiesB), $nameB), 'complete'); // ── STEP 4: Build timelines from both documents ───────────────────────── $emitRunning('timeline_extract', 'Build timelines', "Building timeline from {$nameA}…"); $stepStart = microtime(true); $timelineA = $this->extractTimelineDoc($textA, $nameA, $language); if ($emit) $emit('timeline_a', ['events' => $timelineA]); if ($emit) $emit('progress', ['detail' => "Building timeline from {$nameB}…"]); $timelineB = $this->extractTimelineDoc($textB, $nameB, $language); if ($emit) $emit('timeline_b', ['events' => $timelineB]); $this->stepTimings['timeline_extract'] = $this->elapsedMs($stepStart); $emitStep('timeline_extract', 'Build timelines', sprintf('%d events in %s · %d events in %s', count($timelineA), $nameA, count($timelineB), $nameB), 'complete'); // ── STEP 5: Cross-reference parties ──────────────────────────────────── $emitRunning('cross_parties', 'Cross-reference parties', 'Comparing parties across both documents…'); $stepStart = microtime(true); $partiesDiff = $this->crossReferenceParties($partiesA, $partiesB, $nameA, $nameB, $language); if ($emit) $emit('parties_diff', ['result' => $partiesDiff]); $this->stepTimings['cross_parties'] = $this->elapsedMs($stepStart); $pRemoved = count($partiesDiff['in_a_only'] ?? []); $pAdded = count($partiesDiff['in_b_only'] ?? []); $pChanged = count($partiesDiff['changed_between'] ?? []); $emitStep('cross_parties', 'Cross-reference parties', sprintf('%d removed · %d added · %d changed', $pRemoved, $pAdded, $pChanged), 'complete'); // ── STEP 6: Cross-reference timelines ───────────────────────────────── $emitRunning('cross_timelines', 'Cross-reference timelines', 'Scanning for contradictions, deletions, and new events…'); $stepStart = microtime(true); $timelineDiff = $this->crossReferenceTimelines( $timelineA, $timelineB, $textA, $textB, $nameA, $nameB, $language ); if ($emit) $emit('timeline_diff', ['result' => $timelineDiff]); $this->stepTimings['cross_timelines'] = $this->elapsedMs($stepStart); $conflictCount = count($timelineDiff['conflicts'] ?? []); $deletedCount = count($timelineDiff['in_a_only'] ?? []); $addedCount = count($timelineDiff['in_b_only'] ?? []); $emitStep('cross_timelines', 'Cross-reference timelines', sprintf('%d contradictions · %d deleted events · %d new events', $conflictCount, $deletedCount, $addedCount), 'complete'); // ── STEP 7: Generate research sub-questions ──────────────────────────── $emitRunning('sub_question_gen', 'Research questions', 'Generating legal research questions from discrepancies…'); $stepStart = microtime(true); $subQuestions = $this->generateDiscrepancySubQ( $partiesDiff, $timelineDiff, $metaA, $metaB, $language ); $this->stepTimings['sub_question_gen'] = $this->elapsedMs($stepStart); $emitStep('sub_question_gen', 'Research questions', sprintf('%d legal research question(s) generated.', count($subQuestions)), 'complete'); // ── STEP 8: Corpus retrieval ──────────────────────────────────────────── $emitRunning('retrieval', 'Retrieve legal context', sprintf('Hybrid vector + keyword search across %d question(s)…', count($subQuestions))); $stepStart = microtime(true); $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection); if (!array_filter($sliceSelectionNormalized)) { $sliceSelectionNormalized = [ 'child_welfare' => true, 'echr' => true, 'family_core' => true, 'bufdir_guidance' => true, ]; } $ragDb = dbnToolsRagDb(); try { $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized); } catch (Throwable $e) { error_log('Discrepancy slice resolve failed: ' . $e->getMessage()); $sharedDocIds = []; } try { $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60); } catch (Throwable $e) { dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed'); } $retrievalQueries = $subQuestions ?: [[ 'id' => 'q1', 'question' => 'ECHR procedural requirements when Barnevernet changes facts between document versions', 'rationale' => 'Fallback query', ]]; $rawPool = []; $retrievalWarnings = 0; $rawCorpusCount = 0; foreach ($retrievalQueries as $idx => $sq) { if ($emit) { $emit('subq', [ 'index' => $idx + 1, 'total' => count($retrievalQueries), 'id' => $sq['id'], 'question' => $sq['question'], ]); } try { $corpusChunks = $rag->searchAll( $sq['question'], 6, null, [ 'search_private' => false, 'search_shared' => true, 'package_ids' => [(int)$package['id']], 'shared_doc_ids' => $sharedDocIds, 'chunk_limit' => 6, 'search_method' => 'hybrid', 'reranker_enabled' => true, 'include_beta_website' => false, 'include_primary_website' => false, ] ); } catch (Throwable $e) { error_log('Discrepancy sub-Q retrieval failed: ' . $e->getMessage()); $corpusChunks = []; $retrievalWarnings++; } $rawCorpusCount += count($corpusChunks); foreach ($corpusChunks as $chunk) { $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']); } } $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP); $this->hydrateSourceUrls($merged); $numberedSources = $this->numberSources(array_slice($merged, 0, 12)); $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart); $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete'; $emitStep('retrieval', 'Retrieve legal context', sprintf('%d sub-Q(s) → %d corpus chunks → %d unique sources.', count($retrievalQueries), $rawCorpusCount, count($numberedSources)), $retrievalStatus); // ── STEP 9: Synthesis ─────────────────────────────────────────────────── $engineLabel = match ($engine) { 'azure_full' => 'Azure gpt-4o', 'gpu' => 'GPU qwen2.5:14b', default => 'Azure gpt-4o-mini', }; $emitRunning('synthesis', 'Synthesize report', sprintf('Synthesising discrepancy report with %s…', $engineLabel)); $stepStart = microtime(true); $synthesis = $this->synthesize( $metaA, $metaB, $nameA, $nameB, $partiesDiff, $timelineDiff, $numberedSources, $engine, $language ); $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart); $confidence = $this->citationConfidence($numberedSources); $emitStep('synthesis', 'Synthesize report', sprintf('Report complete · %d source(s) · %s confidence.', count($numberedSources), $confidence), 'complete'); $synJson = $synthesis['json']; return [ 'tool' => 'discrepancy', 'language' => $language, 'doc_a_name' => $nameA, 'doc_b_name' => $nameB, 'doc_a_meta' => $metaA, 'doc_b_meta' => $metaB, 'parties_a' => $partiesA, 'parties_b' => $partiesB, 'timeline_a' => $timelineA, 'timeline_b' => $timelineB, 'parties_diff' => $partiesDiff, 'timeline_diff' => $timelineDiff, 'headline_finding' => (string)($synJson['headline_finding'] ?? ''), 'critical_discrepancies' => is_array($synJson['critical_discrepancies'] ?? null) ? $synJson['critical_discrepancies'] : [], 'recommended_actions' => is_array($synJson['recommended_actions'] ?? null) ? $synJson['recommended_actions'] : [], 'what_remains_uncertain' => is_array($synJson['what_remains_uncertain'] ?? null) ? $synJson['what_remains_uncertain'] : [], 'sources' => $numberedSources, 'sub_questions' => $subQuestions, 'citation_confidence' => $confidence, 'trace' => $trace, 'trace_metadata' => [ 'source_count' => count($numberedSources), 'sub_question_count' => count($retrievalQueries), 'conflict_count' => $conflictCount, 'deleted_count' => $deletedCount, 'added_count' => $addedCount, 'deployment' => $synthesis['deploy_label'], 'engine_used' => $engine, 'citation_confidence' => $confidence, 'elapsed_ms_per_step' => $this->stepTimings, ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } // ── Per-document classification ──────────────────────────────────────────── private function classifyDoc(string $docText, string $label, string $language): array { $locale = dbnToolsLanguageName($language); $excerpt = mb_substr($docText, 0, 6000, 'UTF-8'); $prompt = << $label, 'doc_date' => null, 'issuing_authority' => null, 'reference_number' => null, 'child_info' => null, ]; try { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 400, 'timeout' => 30]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json)) { return array_merge($default, array_filter($json, fn($v) => $v !== null && $v !== '')); } } catch (Throwable $e) { error_log('Discrepancy classifyDoc failed (' . $label . '): ' . $e->getMessage()); } return $default; } // ── Per-document party extraction ────────────────────────────────────────── private function extractPartiesDoc(string $docText, string $label, string $language): array { $locale = dbnToolsLanguageName($language); $excerpt = mb_substr($docText, 0, 20000, 'UTF-8'); $prompt = <<azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 2000, 'timeout' => 45]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && is_array($json['parties'] ?? null)) { return array_slice($json['parties'], 0, 20); } if (is_array($json) && isset($json[0]['name'])) { return array_slice($json, 0, 20); } } catch (Throwable $e) { error_log('Discrepancy extractPartiesDoc failed (' . $label . '): ' . $e->getMessage()); } return []; } // ── Per-document timeline extraction ─────────────────────────────────────── private function extractTimelineDoc(string $docText, string $label, string $language): array { $locale = dbnToolsLanguageName($language); $excerpt = mb_substr($docText, 0, 20000, 'UTF-8'); $prompt = <<azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 4000, 'timeout' => 55]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && is_array($json['events'] ?? null)) { return array_slice($json['events'], 0, 40); } } catch (Throwable $e) { error_log('Discrepancy extractTimelineDoc failed (' . $label . '): ' . $e->getMessage()); } return []; } // ── Cross-reference: parties ─────────────────────────────────────────────── private function crossReferenceParties( array $partiesA, array $partiesB, string $nameA, string $nameB, string $language ): array { $locale = dbnToolsLanguageName($language); $partiesAJson = json_encode($partiesA, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT); $partiesBJson = json_encode($partiesB, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT); $prompt = << [], 'in_b_only' => [], 'changed_between' => []]; try { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 2000, 'timeout' => 50]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json)) { return array_merge($default, array_intersect_key($json, $default)); } } catch (Throwable $e) { error_log('Discrepancy crossReferenceParties failed: ' . $e->getMessage()); } return $default; } // ── Cross-reference: timelines ───────────────────────────────────────────── private function crossReferenceTimelines( array $timelineA, array $timelineB, string $textA, string $textB, string $nameA, string $nameB, string $language ): array { $locale = dbnToolsLanguageName($language); $tlAJson = json_encode(array_slice($timelineA, 0, 30), JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT); $tlBJson = json_encode(array_slice($timelineB, 0, 30), JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT); $excerptA = mb_substr($textA, 0, 3000, 'UTF-8'); $excerptB = mb_substr($textB, 0, 3000, 'UTF-8'); $prompt = << [], 'in_a_only' => [], 'in_b_only' => [], 'procedural_gaps' => [], 'narrative_shifts' => ['summary' => '', 'new_in_b' => [], 'removed_from_b' => []], ]; try { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 4000, 'timeout' => 90]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json)) { return array_merge($default, array_intersect_key($json, $default)); } } catch (Throwable $e) { error_log('Discrepancy crossReferenceTimelines failed: ' . $e->getMessage()); } return $default; } // ── Sub-question generation ──────────────────────────────────────────────── private function generateDiscrepancySubQ( array $partiesDiff, array $timelineDiff, array $metaA, array $metaB, string $language ): array { $locale = dbnToolsLanguageName($language); $parts = []; $pRemove = count($partiesDiff['in_a_only'] ?? []); $pAdd = count($partiesDiff['in_b_only'] ?? []); $pChange = count($partiesDiff['changed_between'] ?? []); if ($pRemove) $parts[] = "{$pRemove} parties removed between versions"; if ($pAdd) $parts[] = "{$pAdd} new parties added in later version"; if ($pChange) $parts[] = "{$pChange} parties changed between versions"; $conflicts = $timelineDiff['conflicts'] ?? []; $deleted = $timelineDiff['in_a_only'] ?? []; $added = $timelineDiff['in_b_only'] ?? []; $procGaps = $timelineDiff['procedural_gaps'] ?? []; if ($conflicts) $parts[] = count($conflicts) . ' timeline contradictions'; if ($deleted) $parts[] = count($deleted) . ' events deleted from later version'; if ($added) $parts[] = count($added) . ' new events added in later version'; if ($procGaps) $parts[] = count($procGaps) . ' procedural gaps identified'; $summary = $parts ? implode(', ', $parts) . '.' : 'Some discrepancies found.'; $docTypeA = $metaA['doc_type'] ?? 'Document A'; $docTypeB = $metaB['doc_type'] ?? 'Document B'; $authA = $metaA['issuing_authority'] ?? 'the municipality'; $exampleFacts = ''; if (!empty($conflicts[0])) { $c = $conflicts[0]; $exampleFacts .= "- Contradiction: A says '{$c['doc_a_says']}', B says '{$c['doc_b_says']}'\n"; } if (!empty($deleted[0])) { $exampleFacts .= "- Deleted from B: '{$deleted[0]['description']}'\n"; } if (!empty($added[0])) { $exampleFacts .= "- New in B: '{$added[0]['description']}'\n"; } if (!empty($procGaps[0])) { $exampleFacts .= "- Procedural gap: '{$procGaps[0]['gap']}'\n"; } if (!empty(($partiesDiff['changed_between'] ?? [])[0])) { $pc = $partiesDiff['changed_between'][0]; $exampleFacts .= "- Party change: {$pc['name']}: '{$pc['in_a']}' → '{$pc['in_b']}'\n"; } $prompt = <<azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.15, 'max_tokens' => 1000, 'timeout' => 40]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && is_array($json['sub_questions'] ?? null)) { $sqs = []; foreach (array_slice($json['sub_questions'], 0, 5) as $sq) { if (!empty($sq['id']) && !empty($sq['question'])) { $sqs[] = [ 'id' => (string)$sq['id'], 'question' => (string)$sq['question'], 'rationale' => (string)($sq['rationale'] ?? ''), ]; } } if ($sqs) return $sqs; } } catch (Throwable $e) { error_log('Discrepancy generateDiscrepancySubQ failed: ' . $e->getMessage()); } return [ ['id' => 'q1', 'question' => 'What does ECHR Article 8 require when Barnevernet changes the factual basis of an intervention between document versions?', 'rationale' => 'ECHR procedural fairness'], ['id' => 'q2', 'question' => 'Under Barnevernloven, can new allegations be introduced after the initial care order application has been filed?', 'rationale' => 'New allegations validity'], ['id' => 'q3', 'question' => 'What are Barnevernloven documentation requirements for home visits and assessments?', 'rationale' => 'Documentation obligations'], ['id' => 'q4', 'question' => 'What Bufdir guidance exists on evidence standards and investigation quality for Barnevernet interventions?', 'rationale' => 'Evidence standards'], ]; } // ── Synthesis ────────────────────────────────────────────────────────────── private function synthesize( array $metaA, array $metaB, string $nameA, string $nameB, array $partiesDiff, array $timelineDiff, array $numberedSources, string $engine, string $language ): array { $locale = dbnToolsLanguageName($language); $sourceCount = count($numberedSources); $deployLabel = match ($engine) { 'gpu' => 'GPU (cuttlefish)', 'dbn_legal_v3' => 'dbn-legal-agent-v3', 'azure_full' => 'gpt-4o', default => $this->azure->chatDeployment(), }; if (empty($numberedSources)) { return [ 'json' => [ 'headline_finding' => 'No corpus sources retrieved. Discrepancies were identified but could not be cross-referenced with the legal corpus for legal significance assessment.', 'critical_discrepancies' => [], 'recommended_actions' => ['Enable corpus slices (Child Welfare, ECHR, Family Core, Bufdir Guidance) and re-run for legal significance mapping.'], 'what_remains_uncertain' => ['Legal significance of each discrepancy — re-run with corpus slices enabled.'], ], 'deploy_label' => $deployLabel, ]; } $sourcesContext = []; foreach ($numberedSources as $s) { $sourcesContext[] = sprintf( "[%d] %s%s\n Corpus: %s | Authority: %s\n Excerpt: %s", $s['n'], $s['title'], !empty($s['section']) ? ' — ' . $s['section'] : '', $s['package_or_corpus'], $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'), $s['excerpt'] ); } $sourcesText = implode("\n\n", $sourcesContext); $discrepancyJson = json_encode([ 'timeline_conflicts' => array_slice($timelineDiff['conflicts'] ?? [], 0, 10), 'events_deleted_from_b' => array_slice($timelineDiff['in_a_only'] ?? [], 0, 8), 'events_added_in_b' => array_slice($timelineDiff['in_b_only'] ?? [], 0, 8), 'procedural_gaps' => array_slice($timelineDiff['procedural_gaps'] ?? [], 0, 5), 'narrative_shifts' => $timelineDiff['narrative_shifts'] ?? [], 'parties_removed' => $partiesDiff['in_a_only'] ?? [], 'parties_added' => $partiesDiff['in_b_only'] ?? [], 'parties_changed' => $partiesDiff['changed_between'] ?? [], ], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT); $docTypeA = $metaA['doc_type'] ?? $nameA; $docDateA = $metaA['doc_date'] ?? '?'; $docTypeB = $metaB['doc_type'] ?? $nameB; $docDateB = $metaB['doc_date'] ?? '?'; $authority = $metaA['issuing_authority'] ?? $metaB['issuing_authority'] ?? 'the authority'; $product = dbnToolsProductName(); $prompt = << 'system', 'content' => $sysPrompt], ['role' => 'user', 'content' => $prompt], ]; $opts = ['json' => true, 'temperature' => 0.15, 'max_tokens' => 4000, 'timeout' => 240]; $raw = ''; try { if ($engine === 'dbn_legal_v3') { $response = dbnToolsCallGpuLlm($messages, array_merge($opts, ['model' => 'dbn-legal-agent-v3', 'timeout' => 180])); $raw = (string)($response['choices'][0]['message']['content'] ?? ''); } elseif ($engine === 'gpu') { $response = dbnToolsCallGpuLlm($messages, $opts); $raw = (string)($response['choices'][0]['message']['content'] ?? ''); } elseif ($engine === 'azure_full') { $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts); } else { $raw = $this->azure->chatText($messages, $opts); } } catch (Throwable $e) { dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error'); } $json = $this->azure->decodeJsonObject($raw); if (!is_array($json) || empty($json['headline_finding'])) { $json = [ 'headline_finding' => $raw, 'critical_discrepancies' => [], 'recommended_actions' => [], 'what_remains_uncertain' => [], ]; } return ['json' => $json, 'deploy_label' => $deployLabel]; } // ── Corpus helpers ───────────────────────────────────────────────────────── private function normalizeCorpusChunk(array $chunk, string $subQId): array { return [ 'chunk_id' => isset($chunk['id']) ? (int)$chunk['id'] : null, 'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'), 'section' => $chunk['section_title'] ?? null, 'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'), 'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620), 'chunk_text' => (string)($chunk['content'] ?? ''), 'similarity' => isset($chunk['similarity']) ? round((float)$chunk['similarity'], 4) : null, 'reranker_score' => isset($chunk['reranker_score']) ? round((float)$chunk['reranker_score'], 4) : null, 'document_id' => isset($chunk['document_id']) ? (int)$chunk['document_id'] : null, 'source_origin' => 'corpus', 'authority_type' => $chunk['authority_type'] ?? null, 'jurisdiction' => $chunk['jurisdiction'] ?? null, 'source_url' => null, 'deep_link' => null, 'authority_label' => null, 'matched_sub_questions' => [$subQId], ]; } private function mergeAndDedupe(array $rawPool, int $cap): array { $byKey = []; foreach ($rawPool as $chunk) { $key = 'corpus:' . ($chunk['chunk_id'] ?? bin2hex(random_bytes(4))); if (!isset($byKey[$key])) { $byKey[$key] = $chunk; continue; } $existing = $byKey[$key]; $existing['matched_sub_questions'] = array_values(array_unique(array_merge( $existing['matched_sub_questions'] ?? [], $chunk['matched_sub_questions'] ?? [] ))); if (($chunk['reranker_score'] ?? 0) > ($existing['reranker_score'] ?? 0)) { $existing['reranker_score'] = $chunk['reranker_score']; } if (($chunk['similarity'] ?? 0) > ($existing['similarity'] ?? 0)) { $existing['similarity'] = $chunk['similarity']; } $byKey[$key] = $existing; } $merged = array_values($byKey); usort($merged, function (array $a, array $b): int { $aScore = $a['reranker_score'] ?? $a['similarity'] ?? 0; $bScore = $b['reranker_score'] ?? $b['similarity'] ?? 0; return $bScore <=> $aScore; }); return array_slice($merged, 0, $cap); } private function numberSources(array $chunks): array { $out = []; foreach ($chunks as $i => $c) { $c['n'] = $i + 1; $out[] = $c; } return $out; } private function citationConfidence(array $sources): string { if (!$sources) return 'low'; $scores = array_values(array_filter(array_map( fn(array $s) => $s['reranker_score'] ?? $s['similarity'] ?? null, $sources ), 'is_numeric')); $best = $scores ? max($scores) : 0; if (count($sources) >= 5 && $best >= 0.5) return 'high'; if (count($sources) >= 3 && $best >= 0.35) return 'medium'; return 'low'; } private function hydrateSourceUrls(array &$pool): void { $docIds = []; foreach ($pool as $chunk) { $docId = (int)($chunk['document_id'] ?? 0); if ($docId > 0) $docIds[$docId] = true; } if (empty($docIds)) return; try { $ragDb = dbnToolsRagDb(); $ids = array_keys($docIds); $ph = implode(',', array_fill(0, count($ids), '?')); $stmt = $ragDb->prepare( "SELECT d.id, d.source_url, d.authority_type, d.publication_date, d.source_id, d.title FROM documents d WHERE d.id IN ({$ph})" ); $stmt->execute($ids); $docMeta = []; $sourceIds = []; foreach ($stmt as $row) { $dId = (int)$row['id']; $sid = isset($row['source_id']) ? (int)$row['source_id'] : null; if ($sid) $sourceIds[] = $sid; $docMeta[$dId] = [ 'source_url' => $row['source_url'] ?? null, 'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null), 'publication_date' => $row['publication_date'] ?? null, 'source_id' => $sid, ]; } if ($sourceIds) { $uSids = array_values(array_unique($sourceIds)); $sPh = implode(',', array_fill(0, count($uSids), '?')); $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})"); $sStmt->execute($uSids); $srcNames = []; foreach ($sStmt as $row) { $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal')); } foreach ($docMeta as &$m) { if ($m['source_id'] && isset($srcNames[$m['source_id']])) { $m['corpus_source_name'] = $srcNames[$m['source_id']]; } } unset($m); } } catch (Throwable $e) { error_log('Discrepancy hydrateSourceUrls failed: ' . $e->getMessage()); return; } foreach ($pool as &$chunk) { $docId = (int)($chunk['document_id'] ?? 0); if (!$docId || !isset($docMeta[$docId])) continue; $m = $docMeta[$docId]; $chunk['source_url'] = $m['source_url'] ?? null; $chunk['deep_link'] = $m['source_url'] ?? null; $chunk['authority_label'] = $m['authority_label'] ?? $chunk['authority_label']; $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null; $chunk['publication_date'] = $m['publication_date'] ?? null; } unset($chunk); } private function requireFamilyPackage(int $clientId): array { $package = dbnToolsFetchPackage('family-legal'); if (!$package || empty($package['is_active'])) { dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable'); } if (!dbnToolsHasActiveSubscription($clientId, (int)$package['id'])) { dbnToolsAbort(dbnToolsProductName() . ' does not have an active family-legal subscription.', 503, 'subscription_missing'); } return $package; } private function elapsedMs(float $start): int { return (int)round((microtime(true) - $start) * 1000); } }