azure = $azure ?: new DbnAzureOpenAiGateway(); } /** * Main pipeline. At least 1 uploaded file is required. * * @param array $uploadedFiles [{filename, text, chars, truncated}] * @param string $advocateRole Party the user represents * @param string $engine Affects synthesis only: azure_mini|azure_full|gpu|dbn_legal * @param string $language 'en' or 'no' * @param array $sliceSelection Corpus slice toggles * @param array $controls sub_q_count, chunk_limit, similarity_threshold, reranker_top_k, temperature * @param string $additionalNotes Optional user context to supplement the document * @param callable|null $emit function(string $event, array $payload): void */ public function run( array $uploadedFiles, string $advocateRole, string $engine, string $language, array $sliceSelection, array $controls, string $additionalNotes = '', ?callable $emit = null ): array { $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu', 'dbn_legal'], true) ? $engine : 'azure_mini'; $language = in_array($language, ['en', 'no'], true) ? $language : 'en'; $controls = $this->normalizeControls($controls); if (empty($uploadedFiles)) { dbnToolsAbort('Upload at least one BVJ document before running the analyzer.', 422, 'no_uploads'); } $client = dbnToolsRequireClient(); $package = $this->requireFamilyPackage((int)$client['id']); dbnToolsBootCaveau(); $aiPortalRoot = dbnToolsAiPortalRoot(); require_once $aiPortalRoot . '/platform/includes/dbn_v6.php'; $this->uploadVecs = []; $this->stepTimings = []; $trace = []; $emitStep = function (string $stepId, string $label, string $detail, string $status) use (&$trace, $emit): void { $trace[] = $this->trace($label, $detail, $status); if ($emit) { $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => $status]); } }; $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void { if ($emit) { $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => 'running']); } }; // Build combined document text (first file is primary; additional files appended) $docText = ''; foreach ($uploadedFiles as $idx => $file) { $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8'); if ($text === '') continue; $filename = (string)($file['filename'] ?? sprintf('document-%d', $idx + 1)); $docText .= ($docText !== '' ? "\n\n--- Document: {$filename} ---\n\n" : '') . $text; } if ($docText === '') { dbnToolsAbort('Could not extract text from the uploaded file(s).', 422, 'empty_document'); } $docText = mb_substr($docText, 0, self::MAX_DOC_CHARS * 2, 'UTF-8'); // ── STEP 1: Document classification ──────────────────────────────────── $emitRunning('doc_classify', 'Document classification', 'Classifying document and extracting metadata…'); $stepStart = microtime(true); $docMeta = $this->classifyDocument($docText, $language); $this->stepTimings['doc_classify'] = $this->elapsedMs($stepStart); if ($emit) { $emit('doc_meta', ['result' => $docMeta]); } $docTypeBadge = $docMeta['doc_type'] ?? 'BVJ Document'; $refStr = $docMeta['reference_number'] ? ' · ref ' . $docMeta['reference_number'] : ''; $authStr = $docMeta['issuing_authority'] ? $docMeta['issuing_authority'] : ''; $emitStep('doc_classify', 'Document classification', trim("{$docTypeBadge} · {$authStr}{$refStr}"), 'complete'); // ── STEP 2: Party extraction ──────────────────────────────────────────── $emitRunning('party_extract', 'Party extraction', 'Identifying all named parties and their roles…'); $stepStart = microtime(true); $parties = $this->extractParties($docText, $language); $this->stepTimings['party_extract'] = $this->elapsedMs($stepStart); if ($emit) { $emit('parties', ['parties' => $parties]); } $emitStep('party_extract', 'Party extraction', sprintf('%d %s identified.', count($parties), count($parties) === 1 ? 'party' : 'parties'), 'complete'); // ── STEP 3: Timeline extraction ───────────────────────────────────────── $emitRunning('timeline_extract', 'Timeline extraction', 'Building chronological event timeline…'); $stepStart = microtime(true); $timelineEvents = $this->extractTimeline($docText, $language); $this->stepTimings['timeline_extract'] = $this->elapsedMs($stepStart); if ($emit) { $emit('timeline', ['events' => $timelineEvents]); } $highCount = count(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high')); $emitStep('timeline_extract', 'Timeline extraction', sprintf('%d events extracted (%d high-significance).', count($timelineEvents), $highCount), 'complete'); // ── STEP 4: Sub-question generation ──────────────────────────────────── $emitRunning('sub_question_gen', 'Sub-question generation', sprintf('Generating %d research angles for %s…', $controls['sub_q_count'], $advocateRole ?: 'selected role')); $stepStart = microtime(true); $subQuestions = $this->generateSubQuestions( $docMeta, $parties, $timelineEvents, $advocateRole, $controls['sub_q_count'], $language ); $this->stepTimings['sub_question_gen'] = $this->elapsedMs($stepStart); $emitStep('sub_question_gen', 'Sub-question generation', sprintf('%d sub-questions generated for %s.', count($subQuestions), $advocateRole ?: 'selected role'), 'complete'); // ── STEP 5: Slice resolution + upload indexing + corpus retrieval ─────── $emitRunning('slice_resolution', 'Slice resolution', 'Resolving corpus slice toggles…'); $stepStart = microtime(true); $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection); if (!array_filter($sliceSelectionNormalized)) { dbnToolsAbort('Enable at least one corpus slice before running the analyzer.', 422, 'no_slices'); } $ragDb = dbnToolsRagDb(); try { $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized); $sliceDetail = sprintf('%d slice(s) active → %d candidate documents.', count(array_filter($sliceSelectionNormalized)), count($sharedDocIds)); $sliceStatus = 'complete'; } catch (Throwable $e) { error_log('BVJ slice resolve failed: ' . $e->getMessage()); $sharedDocIds = []; $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.'; $sliceStatus = 'warning'; } $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart); $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus); // Upload indexing $emitRunning('upload_indexing', 'Upload indexing', sprintf('Chunking + embedding %d file(s)…', count($uploadedFiles))); $stepStart = microtime(true); $uploadChunks = []; foreach ($uploadedFiles as $idx => $file) { $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1)); $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8'); $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx)); } $uploadStatus = 'complete'; $uploadDetail = sprintf('%d file(s) → %d in-memory chunks indexed.', count($uploadedFiles), count($uploadChunks)); if ($uploadChunks) { try { $texts = array_map(fn(array $c) => $c['text'], $uploadChunks); $allVecs = []; $batchSz = 5; for ($b = 0; $b < count($texts); $b += $batchSz) { $batch = array_slice($texts, $b, $batchSz); if ($emit) { $emit('progress', ['detail' => sprintf( 'Embedding chunks %d–%d of %d…', $b + 1, $b + count($batch), count($texts) )]); } $allVecs = array_merge($allVecs, dbnToolsLiteLLMEmbedBatch($batch)); } if (count($allVecs) === count($uploadChunks)) { foreach ($uploadChunks as $i => $chunk) { $this->uploadVecs[] = ['meta' => $chunk, 'vec' => $allVecs[$i]]; } } else { $uploadStatus = 'warning'; $uploadDetail = 'Upload embedding count mismatch; uploaded chunks will not participate in retrieval.'; } } catch (Throwable $e) { error_log('BVJ upload embed failed: ' . $e->getMessage()); $uploadStatus = 'warning'; $uploadDetail = 'Upload embedding timed out; corpus-only retrieval will run.'; $this->uploadVecs = []; } } $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart); $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus); // Corpus retrieval (per sub-question) $retrievalQueries = $subQuestions ?: [[ 'id' => 'q1', 'question' => sprintf('%s case involving %s', $docMeta['doc_type'] ?? 'BVJ document', $advocateRole), 'rationale' => 'Fallback query (sub-question generation returned empty).', ]]; $emitRunning('retrieval', 'Corpus retrieval', sprintf('Hybrid vector + keyword across %d sub-question(s)…', count($retrievalQueries))); $stepStart = microtime(true); try { $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60); } catch (Throwable $e) { dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed'); } $rawPool = []; $retrievalWarnings = 0; $rawCorpusCount = 0; $rawUploadCount = 0; $filteredOutCount = 0; foreach ($retrievalQueries as $idx => $sq) { if ($emit) { $emit('subq', [ 'index' => $idx + 1, 'total' => count($retrievalQueries), 'id' => $sq['id'], 'question' => $sq['question'], ]); } try { $corpusChunks = $rag->searchAll( $sq['question'], $controls['chunk_limit'], null, [ 'search_private' => false, 'search_shared' => true, 'package_ids' => [(int)$package['id']], 'shared_doc_ids' => $sharedDocIds, 'chunk_limit' => $controls['chunk_limit'], 'search_method' => 'hybrid', 'reranker_enabled' => true, 'include_beta_website' => false, 'include_primary_website' => false, ] ); } catch (Throwable $e) { error_log('BVJ sub-Q retrieval failed: ' . $e->getMessage()); $corpusChunks = []; $retrievalWarnings++; } $rawCorpusCount += count($corpusChunks); foreach ($corpusChunks as $chunk) { if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) { $filteredOutCount++; continue; } $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']); } if (!empty($this->uploadVecs)) { $uploadHits = $this->retrieveFromUploads( $sq['question'], $controls['chunk_limit'], $controls['similarity_threshold'] ); $rawUploadCount += count($uploadHits); foreach ($uploadHits as $hit) { $hit['matched_sub_questions'] = [$sq['id']]; $rawPool[] = $hit; } } } $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP); $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart); $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete'; $retrievalDetail = sprintf( '%d sub-Q(s) × hybrid → %d corpus (%d filtered) + %d upload → %d unique after dedupe.', count($retrievalQueries), $rawCorpusCount, $filteredOutCount, $rawUploadCount, count($merged) ); $emitStep('retrieval', 'Corpus retrieval', $retrievalDetail, $retrievalStatus); $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']); $this->hydrateSourceUrls($synthesisPool); $numberedSources = $this->numberSources($synthesisPool); // Generate upload summaries for sources from uploaded files if (!empty($uploadedFiles) && !empty($numberedSources)) { $uploadSummaries = []; foreach ($uploadedFiles as $idx => $file) { $text = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8'); $filename = (string)($file['filename'] ?? "file-{$idx}"); if ($text === '') continue; try { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'], ['role' => 'user', 'content' => "Summarise this BVJ document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"], ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]); $uploadSummaries[$idx] = trim($raw); } catch (Throwable $e) { error_log('BVJ upload summary gen failed for file ' . $idx . ': ' . $e->getMessage()); $uploadSummaries[$idx] = null; } } foreach ($numberedSources as &$src) { if (($src['source_origin'] ?? '') !== 'upload') continue; if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) { $src['summary'] = $uploadSummaries[(int)$m[1]] ?? null; } } unset($src); } $retrievalCounts = [ 'raw_corpus' => $rawCorpusCount, 'filtered' => $filteredOutCount, 'raw_upload' => $rawUploadCount, 'after_dedupe' => count($merged), 'after_topk' => count($numberedSources), ]; // ── STEP 6: Synthesis ─────────────────────────────────────────────────── $engineLabel = match ($engine) { 'azure_full' => 'Azure gpt-4o', 'gpu' => 'GPU qwen2.5:14b', 'dbn_legal' => 'dbn-legal-agent', default => 'Azure gpt-4o-mini', }; $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising advocacy brief with %s…', $engineLabel)); $stepStart = microtime(true); $synthesis = $this->synthesiseBvj( $docText, $docMeta, $parties, $timelineEvents, $subQuestions, $numberedSources, $advocateRole, $engine, $language, $controls['temperature'], $additionalNotes, $emit ); $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart); $emitStep('synthesis', 'Synthesis', sprintf('%s synthesised advocacy brief using %d source(s) + document.', $synthesis['deploy_label'], count($numberedSources)), 'complete'); // ── STEP 7: Confidence ────────────────────────────────────────────────── $confidence = $this->citationConfidence($numberedSources); $emitStep('confidence', 'Citation confidence', sprintf('%s confidence based on %d source(s).', ucfirst($confidence), count($numberedSources)), $confidence === 'low' ? 'warning' : 'complete'); // Build sub-question output with top_sources $subQOut = []; foreach ($retrievalQueries as $sq) { $matchedChunks = array_values(array_filter( $numberedSources, fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true) )); $topSources = array_slice($matchedChunks, 0, 3); $subQOut[] = [ 'id' => $sq['id'], 'question' => $sq['question'], 'rationale' => $sq['rationale'] ?? '', 'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)), 'top_sources' => array_map(fn(array $s) => [ 'n' => $s['n'] ?? null, 'title' => $s['title'] ?? '', 'section' => $s['section'] ?? null, 'deep_link' => $s['deep_link'] ?? $s['source_url'] ?? null, 'source_url' => $s['source_url'] ?? null, 'source_origin' => $s['source_origin'] ?? 'corpus', 'authority_label' => $s['authority_label'] ?? null, 'excerpt' => $s['excerpt'] ?? '', ], $topSources), ]; } $synJson = $synthesis['json']; return [ 'tool' => 'bvj_analyzer', 'language' => $language, 'advocate_role' => $advocateRole, 'doc_meta' => $docMeta, 'parties' => $parties, 'timeline' => ['events' => $timelineEvents], 'advocacy_brief' => (string)($synJson['advocacy_brief'] ?? ''), 'procedural_red_flags' => is_array($synJson['procedural_red_flags'] ?? null) ? $synJson['procedural_red_flags'] : [], 'client_strengths' => is_array($synJson['client_strengths'] ?? null) ? $synJson['client_strengths'] : [], 'opposing_weaknesses' => is_array($synJson['opposing_weaknesses'] ?? null) ? $synJson['opposing_weaknesses'] : [], 'sub_questions' => $subQOut, 'sources' => $numberedSources, 'what_we_found' => (string)($synJson['what_we_found'] ?? ''), 'what_remains_uncertain' => $synJson['what_remains_uncertain'] ?? [], 'next_practical_step' => (string)($synJson['next_practical_step'] ?? ''), 'trace' => $trace, 'trace_metadata' => [ 'chunk_count' => count($merged), 'source_count' => count($numberedSources), 'sub_question_count' => count($retrievalQueries), 'upload_chunk_count' => count($this->uploadVecs), 'deployment' => $synthesis['deploy_label'], 'engine_used' => $engine, 'citation_confidence' => $confidence, 'elapsed_ms_per_step' => $this->stepTimings, 'retrieval_counts' => $retrievalCounts, 'slices_active' => array_keys(array_filter($sliceSelectionNormalized)), ], 'disclaimer' => dbnToolsDisclaimer($language), ]; } // ── Step 1: Document classification ────────────────────────────────────── private function classifyDocument(string $docText, string $language): array { $locale = $language === 'no' ? 'Norwegian' : 'English'; $excerpt = mb_substr($docText, 0, 6000, 'UTF-8'); $prompt = << 'BVJ Document', 'doc_date' => null, 'issuing_authority' => null, 'reference_number' => null, 'child_info' => null, ]; try { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 400, 'timeout' => 30]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json)) { return array_merge($default, array_filter($json, fn($v) => $v !== null && $v !== '')); } } catch (Throwable $e) { error_log('BVJ classifyDocument failed: ' . $e->getMessage()); } return $default; } // ── Step 2: Party extraction ────────────────────────────────────────────── private function extractParties(string $docText, string $language): array { $locale = $language === 'no' ? 'Norwegian' : 'English'; $excerpt = mb_substr($docText, 0, 12000, 'UTF-8'); $prompt = <<azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 1500, 'timeout' => 40]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && is_array($json['parties'] ?? null)) { return array_slice($json['parties'], 0, 20); } // Fallback: model returned an array at root level instead of {parties:[...]} if (is_array($json) && isset($json[0]['name'])) { return array_slice($json, 0, 20); } error_log('BVJ extractParties unexpected structure: ' . substr($raw, 0, 300)); } catch (Throwable $e) { error_log('BVJ extractParties failed: ' . $e->getMessage()); } return []; } // ── Step 3: Timeline extraction ─────────────────────────────────────────── private function extractTimeline(string $docText, string $language): array { $locale = $language === 'no' ? 'Norwegian' : 'English'; $excerpt = mb_substr($docText, 0, 12000, 'UTF-8'); $prompt = <<azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 3000, 'timeout' => 45]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && is_array($json['events'] ?? null)) { return array_slice($json['events'], 0, 30); } } catch (Throwable $e) { error_log('BVJ extractTimeline failed: ' . $e->getMessage()); } return []; } // ── Step 4: Sub-question generation ────────────────────────────────────── private function generateSubQuestions( array $docMeta, array $parties, array $timelineEvents, string $advocateRole, int $count, string $language ): array { $locale = $language === 'no' ? 'Norwegian' : 'English'; $docType = $docMeta['doc_type'] ?? 'BVJ document'; $roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party'; // Summarise the top events to give the model context $eventSummary = ''; $highEvents = array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'); $topEvents = array_slice(array_merge(array_values($highEvents), array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'))), 0, 8); foreach ($topEvents as $ev) { $eventSummary .= sprintf("- %s: %s (%s)\n", $ev['date'] ?? '?', $ev['action'] ?? '', $ev['actor'] ?? ''); } // Summarise parties $partyList = ''; foreach (array_slice($parties, 0, 8) as $p) { $partyList .= sprintf("- %s (%s)\n", $p['name'] ?? '', $p['role'] ?? ''); } $prompt = <<azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ], ['json' => true, 'temperature' => 0.15, 'max_tokens' => 1000, 'timeout' => 40]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && is_array($json['sub_questions'] ?? null) && count($json['sub_questions']) >= 1) { $sqs = []; foreach (array_slice($json['sub_questions'], 0, $count) as $sq) { if (!empty($sq['id']) && !empty($sq['question'])) { $sqs[] = [ 'id' => (string)$sq['id'], 'question' => (string)$sq['question'], 'rationale' => (string)($sq['rationale'] ?? ''), ]; } } if ($sqs) return $sqs; } } catch (Throwable $e) { error_log('BVJ generateSubQuestions failed: ' . $e->getMessage()); } // Fallback: generic sub-questions $role = $advocateRole ?: 'affected party'; return [ ['id' => 'q1', 'question' => "What procedural rights does {$role} have in Barnevernet proceedings under Barnevernloven?", 'rationale' => 'Procedural rights'], ['id' => 'q2', 'question' => "What does ECHR Article 8 require when child welfare authorities intervene in family life?", 'rationale' => 'ECHR Article 8'], ['id' => 'q3', 'question' => "What Bufdir guidance applies to the proportionality of Barnevernet interventions?", 'rationale' => 'Proportionality'], ['id' => 'q4', 'question' => "What are the documentation and notice obligations of BVV before taking acute measures?", 'rationale' => 'Documentation obligations'], ]; } // ── Step 6: Synthesis ───────────────────────────────────────────────────── private function synthesiseBvj( string $docText, array $docMeta, array $parties, array $timelineEvents, array $subQuestions, array $numberedSources, string $advocateRole, string $engine, string $language, float $temperature, string $additionalNotes, ?callable $emit = null ): array { $locale = $language === 'no' ? 'Norwegian' : 'English'; $roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party'; $docType = $docMeta['doc_type'] ?? 'BVJ Document'; $docDate = $docMeta['doc_date'] ?? 'unknown date'; $authority = $docMeta['issuing_authority'] ?? 'unknown authority'; $refNo = $docMeta['reference_number'] ? ' (ref ' . $docMeta['reference_number'] . ')' : ''; $childInfo = $docMeta['child_info'] ?? 'not specified'; $sourceCount = count($numberedSources); if (empty($numberedSources)) { $emptyBrief = $language === 'no' ? 'Ingen kildetreff ble funnet i korpuset for de valgte skivene og spørsmålene.' : 'No corpus sources were retrieved for the selected slices and sub-questions.'; return [ 'json' => [ 'advocacy_brief' => $emptyBrief, 'procedural_red_flags' => [], 'client_strengths' => [], 'opposing_weaknesses' => [], 'what_we_found' => 'No retrieved sources passed the similarity threshold.', 'what_remains_uncertain' => ['No corpus evidence retrieved — widen slice selection or try different sub-questions.'], 'next_practical_step' => 'Enable more corpus slices (Norwegian Courts, Bufdir Guidance) and re-run.', ], 'deploy_label' => match($engine) { 'gpu' => 'GPU (cuttlefish)', 'dbn_legal' => 'dbn-legal-agent', 'azure_full' => 'gpt-4o', default => $this->azure->chatDeployment(), }, ]; } // Build parties summary (top 8) $partiesSummary = ''; foreach (array_slice($parties, 0, 8) as $i => $p) { $org = $p['organization'] ? ' (' . $p['organization'] . ')' : ''; $rel = $p['relationship_to_child'] ? ' — rel: ' . $p['relationship_to_child'] : ''; $partiesSummary .= sprintf("%d. %s — %s%s%s\n", $i + 1, $p['name'] ?? '', $p['role'] ?? '', $org, $rel); } // Build timeline summary (top 15 most significant events) $highEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high')); $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high')); $topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 15); $timelineSummary = ''; foreach ($topEvents as $ev) { $time = $ev['time_of_day'] ? ' kl.' . $ev['time_of_day'] : ''; $timelineSummary .= sprintf("- %s%s [%s] %s: %s\n", $ev['date'] ?? '?', $time, strtoupper($ev['significance'] ?? 'low'), $ev['actor'] ?? '', $ev['action'] ?? ''); } // Build sources text $sourcesContext = []; foreach ($numberedSources as $s) { $sourcesContext[] = sprintf( "[%d] (%s) %s%s\n Corpus: %s\n Authority: %s | Jurisdiction: %s\n Excerpt: %s", $s['n'], $s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus', $s['title'], !empty($s['section']) ? ' — ' . $s['section'] : '', $s['package_or_corpus'], $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'), $s['jurisdiction'] ?? 'n/a', $s['excerpt'] ); } $sourcesText = implode("\n\n", $sourcesContext); // Build sub-question text $subQText = ''; if ($subQuestions) { $subQText = "\nSub-questions researched:\n"; foreach ($subQuestions as $sq) { $subQText .= sprintf("- %s: %s\n", $sq['id'], $sq['question']); } } $notesSection = $additionalNotes !== '' ? "\n== ADDITIONAL CONTEXT FROM ADVOCATE ==\n{$additionalNotes}\n" : ''; $docExcerpt = mb_substr($docText, 0, 3000, 'UTF-8'); $prompt = << 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], ]; $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3000, 'timeout' => 200]; $deployLabel = match ($engine) { 'gpu' => 'GPU (cuttlefish)', 'dbn_legal' => 'dbn-legal-agent', 'azure_full' => 'gpt-4o', default => $this->azure->chatDeployment(), }; $raw = ''; try { if ($engine === 'dbn_legal') { // dbn-legal-agent is slow (~6 t/s on cuttlefish). Stream the response and emit // keepalive events every 15 s so the browser connection stays alive. $raw = $this->callGpuLlmStream($messages, [ 'model' => 'dbn-legal-agent', 'temperature' => $temperature, 'max_tokens' => 2800, 'timeout' => 660, ], $emit ? static function () use ($emit): void { $emit('progress', ['detail' => 'dbn-legal-agent generating…']); } : null); } elseif ($engine === 'gpu') { $response = dbnToolsCallGpuLlm($messages, $opts); $raw = (string)($response['choices'][0]['message']['content'] ?? ''); } elseif ($engine === 'azure_full') { $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts); } else { $raw = $this->azure->chatText($messages, $opts); } } catch (Throwable $e) { dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error'); } $json = $this->azure->decodeJsonObject($raw); if (!is_array($json) || empty($json['advocacy_brief'])) { $json = [ 'advocacy_brief' => $raw, 'procedural_red_flags' => [], 'client_strengths' => [], 'opposing_weaknesses' => [], 'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.', 'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'], 'next_practical_step' => 'Review the brief manually before relying on it.', ]; } return ['json' => $json, 'deploy_label' => $deployLabel]; } // ── GPU streaming helper (keeps browser connection alive during slow models) ── /** * Call the LiteLLM endpoint with streaming enabled and accumulate the full text. * Every 15 seconds, calls $onProgress() so PHP can flush a keepalive event to the browser. */ private function callGpuLlmStream(array $messages, array $options, ?callable $onProgress): string { $url = 'http://10.0.1.10:4000/v1/chat/completions'; $apiKey = (string)(dbnToolsEnv('LITELLM_MASTER_KEY') ?: 'sk-bnl-litellm-26xR9mK4qvN3wL8sTj7pB2d'); $timeout = (int)($options['timeout'] ?? 660); $payload = [ 'model' => (string)($options['model'] ?? 'qwen2.5:14b'), 'messages' => $messages, 'temperature' => $options['temperature'] ?? 0.1, 'max_tokens' => $options['max_tokens'] ?? 2800, 'stream' => true, ]; $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); $headers = [ 'Content-Type: application/json', 'Authorization: Bearer ' . $apiKey, ]; $accumulated = ''; $lastKeepalive = microtime(true); $curlErr = ''; $ch = curl_init($url); curl_setopt_array($ch, [ CURLOPT_POST => true, CURLOPT_POSTFIELDS => $body, CURLOPT_HTTPHEADER => $headers, CURLOPT_TIMEOUT => $timeout, CURLOPT_RETURNTRANSFER => false, CURLOPT_WRITEFUNCTION => static function ($ch, $data) use (&$accumulated, &$lastKeepalive, $onProgress): int { foreach (explode("\n", $data) as $line) { $trimmed = ltrim($line); if (!str_starts_with($trimmed, 'data: ')) continue; $json = substr($trimmed, 6); if (trim($json) === '[DONE]') continue; $chunk = json_decode($json, true); $delta = $chunk['choices'][0]['delta']['content'] ?? ''; if ($delta !== '') $accumulated .= $delta; } if ($onProgress !== null && microtime(true) - $lastKeepalive >= 15.0) { $lastKeepalive = microtime(true); $onProgress(); @flush(); } return strlen($data); }, ]); curl_exec($ch); $curlErr = curl_error($ch); curl_close($ch); if ($curlErr !== '') { throw new RuntimeException('GPU stream request failed: ' . $curlErr); } return trim($accumulated); } // ── Shared helpers (copied from DbnDeepResearchAgent) ──────────────────── private function splitIntoChunks(string $text, string $filename, int $fileIdx): array { $text = preg_replace('/\s+/u', ' ', trim($text)) ?? ''; if ($text === '') return []; $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: []; if (!$words) return []; $chunks = []; $i = 0; $chunkIdx = 0; $total = count($words); while ($i < $total) { $slice = array_slice($words, $i, self::CHUNK_WORDS); if (count($slice) >= self::MIN_CHUNK_WORDS || $i === 0) { $chunks[] = [ 'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $chunkIdx), 'file_index' => $fileIdx, 'chunk_index' => $chunkIdx, 'filename' => $filename, 'text' => implode(' ', $slice), ]; $chunkIdx++; } $advance = self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS; if ($advance < 1) $advance = 1; $i += $advance; if (count($slice) < self::CHUNK_WORDS) break; } return $chunks; } private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array { if (empty($this->uploadVecs)) return []; try { $qVec = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? []; } catch (Throwable $e) { error_log('BVJ sub-Q embed failed: ' . $e->getMessage()); return []; } if (empty($qVec)) return []; $scored = []; foreach ($this->uploadVecs as $entry) { $sim = $this->cosineSim($qVec, $entry['vec']); if ($sim < $threshold) continue; $scored[] = [ 'chunk_id' => $entry['meta']['chunk_id'], 'title' => 'uploaded: ' . $entry['meta']['filename'], 'section' => null, 'package_or_corpus' => 'Your upload', 'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 620), 'chunk_text' => $entry['meta']['text'], 'similarity' => round($sim, 4), 'reranker_score' => null, 'document_id' => null, 'source_origin' => 'upload', 'authority_type' => null, 'jurisdiction' => null, ]; } usort($scored, fn(array $a, array $b) => ($b['similarity'] <=> $a['similarity'])); $keep = (int)ceil($limitPerSubQ / 2); return array_slice($scored, 0, max(1, $keep)); } private function cosineSim(array $a, array $b): float { $len = min(count($a), count($b)); if ($len === 0) return 0.0; $dot = $na = $nb = 0.0; for ($i = 0; $i < $len; $i++) { $x = (float)$a[$i]; $y = (float)$b[$i]; $dot += $x * $y; $na += $x * $x; $nb += $y * $y; } if ($na === 0.0 || $nb === 0.0) return 0.0; return $dot / (sqrt($na) * sqrt($nb)); } private function normalizeCorpusChunk(array $chunk, string $subQId): array { return [ 'chunk_id' => isset($chunk['id']) ? (int)$chunk['id'] : null, 'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'), 'section' => $chunk['section_title'] ?? null, 'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'), 'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620), 'chunk_text' => (string)($chunk['content'] ?? ''), 'similarity' => isset($chunk['similarity']) ? round((float)$chunk['similarity'], 4) : null, 'reranker_score' => isset($chunk['reranker_score']) ? round((float)$chunk['reranker_score'], 4) : null, 'document_id' => isset($chunk['document_id']) ? (int)$chunk['document_id'] : null, 'source_origin' => 'corpus', 'authority_type' => $chunk['authority_type'] ?? null, 'jurisdiction' => $chunk['jurisdiction'] ?? null, 'publication_year' => $chunk['publication_year'] ?? null, 'source_url' => null, 'deep_link' => null, 'authority_label' => null, 'corpus_source_name' => null, 'publication_date' => null, 'matched_sub_questions' => [$subQId], ]; } private function shouldExcludeChunk(array $chunk, array $activeSlices): bool { $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? '')); $url = strtolower((string)($chunk['source_url'] ?? '')); $name = strtolower((string)($chunk['source_name'] ?? '')); if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) return true; if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) return true; $isDbnPage = ( str_contains($name, 'website') || str_contains($title, 'dobetternorge.no') || preg_match('/^(homepage|landing|about |contact )/i', $title) || str_contains($title, 'resource directory') || preg_match('/^flashcards?\s*[-–|]/i', $title) || preg_match('/\|\s*do better norge\s*$/i', $title) || preg_match('/[-–]\s*do better norge\s*$/i', $title) ); if ($isDbnPage) { return !($activeSlices['dbn_resources'] ?? false); } return false; } private function hydrateSourceUrls(array &$pool): void { $docIds = []; foreach ($pool as $chunk) { if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue; $docId = (int)($chunk['document_id'] ?? 0); if ($docId > 0) $docIds[$docId] = true; } if (empty($docIds)) return; try { $ragDb = dbnToolsRagDb(); $ids = array_keys($docIds); $ph = implode(',', array_fill(0, count($ids), '?')); $stmt = $ragDb->prepare(" SELECT d.id, d.title, d.source_url, d.authority_type, d.publication_date, d.source_id, d.jurisdiction, d.summary, LEFT(d.content, 4000) AS content_excerpt FROM documents d WHERE d.id IN ({$ph}) "); $stmt->execute($ids); $docMeta = []; $sourceIds = []; foreach ($stmt as $row) { $dId = (int)$row['id']; $sid = isset($row['source_id']) ? (int)$row['source_id'] : null; if ($sid) $sourceIds[] = $sid; $docMeta[$dId] = [ 'source_url' => $row['source_url'] ?? null, 'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null), 'publication_date' => $row['publication_date'] ?? null, 'corpus_source_name' => 'Do Better Legal', 'source_id' => $sid, 'summary' => $row['summary'] ?? null, 'content_excerpt' => (string)($row['content_excerpt'] ?? ''), 'title' => (string)($row['title'] ?? ''), ]; } $unsummarized = array_filter($docMeta, fn($m) => $m['summary'] === null && $m['content_excerpt'] !== ''); foreach ($unsummarized as $dId => $m) { try { $raw = $this->azure->chatText([ ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'], ['role' => 'user', 'content' => "Summarise this Norwegian family law document.\nFocus on: legal provisions covered, authority type, and questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"], ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]); $summary = trim($raw); if ($summary !== '') { $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$summary, $dId]); $docMeta[$dId]['summary'] = $summary; } } catch (Throwable $e) { error_log('BVJ hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage()); } } if (!empty($sourceIds)) { $uSids = array_values(array_unique($sourceIds)); $sPh = implode(',', array_fill(0, count($uSids), '?')); $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})"); $sStmt->execute($uSids); $srcNames = []; foreach ($sStmt as $row) { $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal')); } foreach ($docMeta as &$m) { if ($m['source_id'] && isset($srcNames[$m['source_id']])) { $m['corpus_source_name'] = $srcNames[$m['source_id']]; } } unset($m); } } catch (Throwable $e) { error_log('BVJ hydrateSourceUrls failed: ' . $e->getMessage()); return; } foreach ($pool as &$chunk) { if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue; $docId = (int)($chunk['document_id'] ?? 0); if (!$docId || !isset($docMeta[$docId])) continue; $m = $docMeta[$docId]; $sourceUrl = $m['source_url'] ?? null; $chunk['source_url'] = $sourceUrl; $chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null); $chunk['authority_label'] = $m['authority_label'] ?? $chunk['authority_label']; $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null; $chunk['publication_date'] = $m['publication_date'] ?? null; $chunk['summary'] = $m['summary'] ?? null; } unset($chunk); } private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string { if (!$sourceUrl) return null; $sourceUrl = trim($sourceUrl); if ($sourceUrl === '') return null; if (preg_match('~^https?://lovdata\.no/~i', $sourceUrl) && $sectionTitle && preg_match('/§\s?(\d+[A-Za-z\-]?)/u', $sectionTitle, $m)) { return rtrim($sourceUrl, '/') . '/§' . $m[1]; } return $sourceUrl; } private function mergeAndDedupe(array $rawPool, int $cap): array { $byKey = []; foreach ($rawPool as $chunk) { $key = ($chunk['source_origin'] ?? 'corpus') . ':' . ($chunk['chunk_id'] ?? bin2hex(random_bytes(4))); if (!isset($byKey[$key])) { $byKey[$key] = $chunk; continue; } $existing = $byKey[$key]; $existing['matched_sub_questions'] = array_values(array_unique(array_merge( $existing['matched_sub_questions'] ?? [], $chunk['matched_sub_questions'] ?? [] ))); if (($chunk['similarity'] ?? 0) > ($existing['similarity'] ?? 0)) { $existing['similarity'] = $chunk['similarity']; } if (($chunk['reranker_score'] ?? 0) > ($existing['reranker_score'] ?? 0)) { $existing['reranker_score'] = $chunk['reranker_score']; } $byKey[$key] = $existing; } $merged = array_values($byKey); usort($merged, function (array $a, array $b): int { $aScore = $a['reranker_score'] ?? $a['similarity'] ?? 0; $bScore = $b['reranker_score'] ?? $b['similarity'] ?? 0; return $bScore <=> $aScore; }); return array_slice($merged, 0, $cap); } private function numberSources(array $chunks): array { $out = []; foreach ($chunks as $i => $c) { $c['n'] = $i + 1; $out[] = $c; } return $out; } private function citationConfidence(array $sources): string { if (!$sources) return 'low'; $scores = array_values(array_filter(array_map( fn(array $s) => $s['reranker_score'] ?? $s['similarity'] ?? null, $sources ), 'is_numeric')); $best = $scores ? max($scores) : 0; if (count($sources) >= 6 && $best >= 0.5) return 'high'; if (count($sources) >= 3 && $best >= 0.35) return 'medium'; return 'low'; } private function normalizeControls(array $controls): array { return [ 'sub_q_count' => max(3, min(5, (int)($controls['sub_q_count'] ?? 4))), 'chunk_limit' => max(4, min(10, (int)($controls['chunk_limit'] ?? 6))), 'similarity_threshold' => max(0.2, min(0.6, (float)($controls['similarity_threshold'] ?? 0.30))), 'reranker_top_k' => max(8, min(14, (int)($controls['reranker_top_k'] ?? 12))), 'temperature' => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.15))), ]; } private function requireFamilyPackage(int $clientId): array { $package = dbnToolsFetchPackage('family-legal'); if (!$package || empty($package['is_active'])) { dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable'); } if (!dbnToolsHasActiveSubscription($clientId, (int)$package['id'])) { dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing'); } return $package; } private function trace(string $label, string $detail, string $status = 'complete'): array { return ['label' => $label, 'detail' => $detail, 'status' => $status]; } private function elapsedMs(float $start): int { return (int)round((microtime(true) - $start) * 1000); } }