a3d46f9756
- Public landing page at / for unauthenticated users (EN/NO/UK/PL) - Authenticated / shows Case Workbench dashboard with manifesto strip, stats, and launched-tool grid (Transcribe, Timeline, BVJ, Advocate, Deep Research, Corpus) - Added includes/i18n.php with full 4-language translation layer - Extended layout.php to Case Workbench shell with tool rail, lang switcher - AI output language normalization extended to en/no/uk/pl in PHP agents - SSO token validation in bootstrap.php / index.php (dobetternorge.no bridge) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1284 lines
60 KiB
PHP
1284 lines
60 KiB
PHP
<?php
|
||
declare(strict_types=1);
|
||
|
||
require_once __DIR__ . '/bootstrap.php';
|
||
require_once __DIR__ . '/AzureOpenAiGateway.php';
|
||
|
||
/**
 * BVJ (Barnevernet) Analyzer Agent
 *
 * Standalone 7-step pipeline that:
 *   1. Classifies the uploaded document and extracts metadata
 *   2. Extracts all named parties with roles
 *   3. Builds a chronological timeline of events
 *   4. Generates partisan sub-questions for corpus RAG
 *   5. Retrieves from the legal corpus (hybrid dense+BM25)
 *   6. Synthesises an advocacy brief + procedural red flags
 *   7. Assesses citation confidence
 *
 * Steps 1-3 always use azure_mini regardless of the user's engine choice.
 * Step 6 (synthesis) uses the user's selected engine.
 */
|
||
final class DbnBvjAnalyzerAgent
|
||
{
|
||
/** Character cap applied to each uploaded file's text (the combined document is capped at twice this). */
private const MAX_DOC_CHARS = 64000;

// Chunking parameters for uploaded text.
// NOTE(review): presumably consumed by splitIntoChunks(), which is not visible here — confirm.
private const CHUNK_WORDS = 600;
private const CHUNK_OVERLAP_WORDS = 75;
private const MIN_CHUNK_WORDS = 50;

/** Cap handed to mergeAndDedupe() when the per-sub-question hits are pooled. */
private const POOL_CAP = 30;

// Steps 1-3 always use this engine — fast and cheap for structured extraction
private const EXTRACT_ENGINE = 'azure_mini';

/** Chat gateway used for every LLM call made by this agent. */
private DbnAzureOpenAiGateway $azure;

/** In-memory upload index: list of ['meta' => chunk, 'vec' => embedding], rebuilt on every run(). */
private array $uploadVecs = [];

/** Elapsed milliseconds per pipeline step, keyed by step id; reset on every run(). */
private array $stepTimings = [];
|
||
|
||
/**
 * @param DbnAzureOpenAiGateway|null $azure Gateway to use; a default instance is created when omitted.
 */
public function __construct(?DbnAzureOpenAiGateway $azure = null)
{
    // An object is always truthy, so null-coalescing selects exactly the same value.
    $this->azure = $azure ?? new DbnAzureOpenAiGateway();
}
|
||
|
||
/**
 * Main pipeline. At least 1 uploaded file is required.
 *
 * Runs, in order: document classification, party extraction, timeline
 * extraction, sub-question generation, slice resolution, upload indexing,
 * corpus retrieval, synthesis and citation confidence. Progress is reported
 * through $emit (when supplied) and mirrored into the returned trace.
 *
 * @param array         $uploadedFiles   [{filename, text, chars, truncated}]
 * @param string        $advocateRole    Party the user represents
 * @param string        $engine          Affects synthesis only: azure_mini|azure_full|gpu (anything else falls back to azure_mini)
 * @param string        $language        UI language code (en, no, uk, pl); normalised via dbnToolsNormalizeUiLanguage()
 * @param array         $sliceSelection  Corpus slice toggles
 * @param array         $controls        sub_q_count, chunk_limit, similarity_threshold, reranker_top_k, temperature
 * @param string        $additionalNotes Optional user context to supplement the document
 * @param callable|null $emit            function(string $event, array $payload): void
 *
 * @return array Result payload (doc_meta, parties, timeline, advocacy_brief, sources, trace, trace_metadata, …)
 */
public function run(
    array $uploadedFiles,
    string $advocateRole,
    string $engine,
    string $language,
    array $sliceSelection,
    array $controls,
    string $additionalNotes = '',
    ?callable $emit = null
): array {
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true)
        ? $engine : 'azure_mini';
    $language = dbnToolsNormalizeUiLanguage($language);
    $controls = $this->normalizeControls($controls);

    if (empty($uploadedFiles)) {
        dbnToolsAbort('Upload at least one BVJ document before running the analyzer.', 422, 'no_uploads');
    }

    $client  = dbnToolsRequireClient();
    $package = $this->requireFamilyPackage((int)$client['id']);

    dbnToolsBootCaveau();
    $aiPortalRoot = dbnToolsAiPortalRoot();
    require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

    // Per-run state: the agent instance may be reused, so start clean.
    $this->uploadVecs  = [];
    $this->stepTimings = [];
    $trace = [];

    // Records a finished step in the trace and forwards it to the caller.
    $emitStep = function (string $stepId, string $label, string $detail, string $status)
        use (&$trace, $emit): void {
        $trace[] = $this->trace($label, $detail, $status);
        if ($emit) {
            $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => $status]);
        }
    };
    // Announces that a step has started; not recorded in the trace.
    $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
        if ($emit) {
            $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => 'running']);
        }
    };

    // Build combined document text (first file is primary; additional files appended)
    $docText = '';
    foreach ($uploadedFiles as $idx => $file) {
        $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
        if ($text === '') continue;
        $filename = (string)($file['filename'] ?? sprintf('document-%d', $idx + 1));
        $docText .= ($docText !== '' ? "\n\n--- Document: {$filename} ---\n\n" : '') . $text;
    }
    if ($docText === '') {
        dbnToolsAbort('Could not extract text from the uploaded file(s).', 422, 'empty_document');
    }
    $docText = mb_substr($docText, 0, self::MAX_DOC_CHARS * 2, 'UTF-8');

    // ── STEP 1: Document classification ────────────────────────────────────
    $emitRunning('doc_classify', 'Document classification', 'Classifying document and extracting metadata…');
    $stepStart = microtime(true);
    $docMeta = $this->classifyDocument($docText, $language);
    $this->stepTimings['doc_classify'] = $this->elapsedMs($stepStart);
    if ($emit) {
        $emit('doc_meta', ['result' => $docMeta]);
    }
    // Build the status line from the parts that are actually present so that a
    // missing authority or reference never leaves a dangling "·" separator.
    // Metadata comes from the model, so only scalar values are trusted here.
    $metaText = static fn(string $key): string
        => is_scalar($docMeta[$key] ?? null) ? trim((string)$docMeta[$key]) : '';
    $classifyParts = [$metaText('doc_type') !== '' ? $metaText('doc_type') : 'BVJ Document'];
    if ($metaText('issuing_authority') !== '') {
        $classifyParts[] = $metaText('issuing_authority');
    }
    if ($metaText('reference_number') !== '') {
        $classifyParts[] = 'ref ' . $metaText('reference_number');
    }
    $emitStep('doc_classify', 'Document classification', implode(' · ', $classifyParts), 'complete');

    // ── STEP 2: Party extraction ────────────────────────────────────────────
    $emitRunning('party_extract', 'Party extraction', 'Identifying all named parties and their roles…');
    $stepStart = microtime(true);
    $parties = $this->extractParties($docText, $language);
    $this->stepTimings['party_extract'] = $this->elapsedMs($stepStart);
    if ($emit) {
        $emit('parties', ['parties' => $parties]);
    }
    $emitStep('party_extract', 'Party extraction',
        sprintf('%d %s identified.', count($parties), count($parties) === 1 ? 'party' : 'parties'),
        'complete');

    // ── STEP 3: Timeline extraction ─────────────────────────────────────────
    $emitRunning('timeline_extract', 'Timeline extraction', 'Building chronological event timeline…');
    $stepStart = microtime(true);
    $timelineEvents = $this->extractTimeline($docText, $language);
    $this->stepTimings['timeline_extract'] = $this->elapsedMs($stepStart);
    if ($emit) {
        $emit('timeline', ['events' => $timelineEvents]);
    }
    $highCount = count(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
    $emitStep('timeline_extract', 'Timeline extraction',
        sprintf('%d events extracted (%d high-significance).', count($timelineEvents), $highCount),
        'complete');

    // ── STEP 4: Sub-question generation ────────────────────────────────────
    $emitRunning('sub_question_gen', 'Sub-question generation',
        sprintf('Generating %d research angles for %s…', $controls['sub_q_count'], $advocateRole ?: 'selected role'));
    $stepStart = microtime(true);
    $subQuestions = $this->generateSubQuestions(
        $docMeta, $parties, $timelineEvents,
        $advocateRole, $controls['sub_q_count'], $language
    );
    $this->stepTimings['sub_question_gen'] = $this->elapsedMs($stepStart);
    $emitStep('sub_question_gen', 'Sub-question generation',
        sprintf('%d sub-questions generated for %s.', count($subQuestions), $advocateRole ?: 'selected role'),
        'complete');

    // ── STEP 5: Slice resolution + upload indexing + corpus retrieval ───────
    $emitRunning('slice_resolution', 'Slice resolution', 'Resolving corpus slice toggles…');
    $stepStart = microtime(true);
    $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
    if (!array_filter($sliceSelectionNormalized)) {
        dbnToolsAbort('Enable at least one corpus slice before running the analyzer.', 422, 'no_slices');
    }
    $ragDb = dbnToolsRagDb();
    try {
        $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
        $sliceDetail  = sprintf('%d slice(s) active → %d candidate documents.',
            count(array_filter($sliceSelectionNormalized)), count($sharedDocIds));
        $sliceStatus  = 'complete';
    } catch (Throwable $e) {
        // Best effort: a failed slice lookup degrades to an unconstrained search.
        error_log('BVJ slice resolve failed: ' . $e->getMessage());
        $sharedDocIds = [];
        $sliceDetail  = 'Slice resolution failed; corpus search will run unconstrained.';
        $sliceStatus  = 'warning';
    }
    $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
    $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

    // Upload indexing
    $emitRunning('upload_indexing', 'Upload indexing',
        sprintf('Chunking + embedding %d file(s)…', count($uploadedFiles)));
    $stepStart = microtime(true);
    $uploadChunks = [];
    foreach ($uploadedFiles as $idx => $file) {
        $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
        $text     = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
        $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
    }
    $uploadStatus = 'complete';
    $uploadDetail = sprintf('%d file(s) → %d in-memory chunks indexed.', count($uploadedFiles), count($uploadChunks));
    if ($uploadChunks) {
        try {
            $texts     = array_map(fn(array $c) => $c['text'], $uploadChunks);
            $textCount = count($texts);
            $allVecs   = [];
            $batchSz   = 5;
            for ($b = 0; $b < $textCount; $b += $batchSz) {
                $batch = array_slice($texts, $b, $batchSz);
                if ($emit) {
                    $emit('progress', ['detail' => sprintf(
                        'Embedding chunks %d–%d of %d…',
                        $b + 1, $b + count($batch), $textCount
                    )]);
                }
                $allVecs = array_merge($allVecs, dbnToolsLiteLLMEmbedBatch($batch));
            }
            // Vectors are paired with chunks by position, so the counts must agree.
            if (count($allVecs) === count($uploadChunks)) {
                foreach ($uploadChunks as $i => $chunk) {
                    $this->uploadVecs[] = ['meta' => $chunk, 'vec' => $allVecs[$i]];
                }
            } else {
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding count mismatch; uploaded chunks will not participate in retrieval.';
            }
        } catch (Throwable $e) {
            error_log('BVJ upload embed failed: ' . $e->getMessage());
            $uploadStatus = 'warning';
            $uploadDetail = 'Upload embedding timed out; corpus-only retrieval will run.';
            $this->uploadVecs = [];
        }
    }
    $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
    $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

    // Corpus retrieval (per sub-question)
    $retrievalQueries = $subQuestions ?: [[
        'id'        => 'q1',
        'question'  => sprintf('%s case involving %s', $docMeta['doc_type'] ?? 'BVJ document', $advocateRole),
        'rationale' => 'Fallback query (sub-question generation returned empty).',
    ]];
    $emitRunning('retrieval', 'Corpus retrieval',
        sprintf('Hybrid vector + keyword across %d sub-question(s)…', count($retrievalQueries)));
    $stepStart = microtime(true);

    try {
        $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
    } catch (Throwable $e) {
        // Keep the underlying cause in the server log; the client only sees the generic message.
        error_log('BVJ RAG pipeline init failed: ' . $e->getMessage());
        dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
    }

    $rawPool           = [];
    $retrievalWarnings = 0;
    $rawCorpusCount    = 0;
    $rawUploadCount    = 0;
    $filteredOutCount  = 0;

    foreach ($retrievalQueries as $idx => $sq) {
        if ($emit) {
            $emit('subq', [
                'index'    => $idx + 1,
                'total'    => count($retrievalQueries),
                'id'       => $sq['id'],
                'question' => $sq['question'],
            ]);
        }
        try {
            $corpusChunks = $rag->searchAll(
                $sq['question'],
                $controls['chunk_limit'],
                null,
                [
                    'search_private'          => false,
                    'search_shared'           => true,
                    'package_ids'             => [(int)$package['id']],
                    'shared_doc_ids'          => $sharedDocIds,
                    'chunk_limit'             => $controls['chunk_limit'],
                    'search_method'           => 'hybrid',
                    'reranker_enabled'        => true,
                    'include_beta_website'    => false,
                    'include_primary_website' => false,
                ]
            );
        } catch (Throwable $e) {
            // One failed sub-question must not sink the run; it is surfaced as a warning.
            error_log('BVJ sub-Q retrieval failed: ' . $e->getMessage());
            $corpusChunks = [];
            $retrievalWarnings++;
        }
        $rawCorpusCount += count($corpusChunks);
        foreach ($corpusChunks as $chunk) {
            if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
                $filteredOutCount++;
                continue;
            }
            $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
        }
        if (!empty($this->uploadVecs)) {
            $uploadHits = $this->retrieveFromUploads(
                $sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']
            );
            $rawUploadCount += count($uploadHits);
            foreach ($uploadHits as $hit) {
                $hit['matched_sub_questions'] = [$sq['id']];
                $rawPool[] = $hit;
            }
        }
    }

    $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
    $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
    $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
    $retrievalDetail = sprintf(
        '%d sub-Q(s) × hybrid → %d corpus (%d filtered) + %d upload → %d unique after dedupe.',
        count($retrievalQueries), $rawCorpusCount, $filteredOutCount, $rawUploadCount, count($merged)
    );
    $emitStep('retrieval', 'Corpus retrieval', $retrievalDetail, $retrievalStatus);

    $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
    $this->hydrateSourceUrls($synthesisPool);
    $numberedSources = $this->numberSources($synthesisPool);

    // Generate upload summaries for sources from uploaded files.
    // Each summary is a separate chat call, so only files that actually have a
    // chunk in the final source list are summarised; the attached summaries are
    // the same as if every file had been summarised.
    $citedUploads = [];
    foreach ($numberedSources as $candidate) {
        if (($candidate['source_origin'] ?? '') !== 'upload') continue;
        if (preg_match('/^upload:(\d+):/', (string)($candidate['chunk_id'] ?? ''), $m)) {
            $citedUploads[(int)$m[1]] = true;
        }
    }
    if ($citedUploads) {
        $uploadSummaries = [];
        foreach ($uploadedFiles as $idx => $file) {
            if (!isset($citedUploads[$idx])) continue;
            $text     = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8');
            $filename = (string)($file['filename'] ?? "file-{$idx}");
            if ($text === '') continue;
            try {
                $raw = $this->azure->chatText([
                    ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                    ['role' => 'user',   'content' => "Summarise this BVJ document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"],
                ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
                $uploadSummaries[$idx] = trim($raw);
            } catch (Throwable $e) {
                error_log('BVJ upload summary gen failed for file ' . $idx . ': ' . $e->getMessage());
                $uploadSummaries[$idx] = null;
            }
        }
        foreach ($numberedSources as &$src) {
            if (($src['source_origin'] ?? '') !== 'upload') continue;
            if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
                $src['summary'] = $uploadSummaries[(int)$m[1]] ?? null;
            }
        }
        unset($src); // drop the dangling reference left by the by-reference loop
    }

    $retrievalCounts = [
        'raw_corpus'   => $rawCorpusCount,
        'filtered'     => $filteredOutCount,
        'raw_upload'   => $rawUploadCount,
        'after_dedupe' => count($merged),
        'after_topk'   => count($numberedSources),
    ];

    // ── STEP 6: Synthesis ───────────────────────────────────────────────────
    $engineLabel = match ($engine) {
        'azure_full' => 'Azure gpt-4o',
        'gpu'        => 'GPU qwen2.5:14b',
        default      => 'Azure gpt-4o-mini',
    };
    $emitRunning('synthesis', 'Synthesis',
        sprintf('Synthesising advocacy brief with %s…', $engineLabel));
    $stepStart = microtime(true);
    $synthesis = $this->synthesiseBvj(
        $docText, $docMeta, $parties, $timelineEvents,
        $subQuestions, $numberedSources,
        $advocateRole, $engine, $language, $controls['temperature'], $additionalNotes,
        $emit
    );
    $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
    $emitStep('synthesis', 'Synthesis',
        sprintf('%s synthesised advocacy brief using %d source(s) + document.',
            $synthesis['deploy_label'], count($numberedSources)),
        'complete');

    // ── STEP 7: Confidence ──────────────────────────────────────────────────
    $confidence = $this->citationConfidence($numberedSources);
    $emitStep('confidence', 'Citation confidence',
        sprintf('%s confidence based on %d source(s).', ucfirst($confidence), count($numberedSources)),
        $confidence === 'low' ? 'warning' : 'complete');

    // Build sub-question output with top_sources
    $subQOut = [];
    foreach ($retrievalQueries as $sq) {
        $matchedChunks = array_values(array_filter(
            $numberedSources,
            fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
        ));
        $topSources = array_slice($matchedChunks, 0, 3);
        $subQOut[] = [
            'id'          => $sq['id'],
            'question'    => $sq['question'],
            'rationale'   => $sq['rationale'] ?? '',
            'chunk_ids'   => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
            'top_sources' => array_map(fn(array $s) => [
                'n'               => $s['n'] ?? null,
                'title'           => $s['title'] ?? '',
                'section'         => $s['section'] ?? null,
                'deep_link'       => $s['deep_link'] ?? $s['source_url'] ?? null,
                'source_url'      => $s['source_url'] ?? null,
                'source_origin'   => $s['source_origin'] ?? 'corpus',
                'authority_label' => $s['authority_label'] ?? null,
                'excerpt'         => $s['excerpt'] ?? '',
            ], $topSources),
        ];
    }

    $synJson = $synthesis['json'];

    // The model occasionally returns a single sentence instead of a list; keep
    // the field a list like the other array-valued fields in the payload.
    $uncertain = $synJson['what_remains_uncertain'] ?? [];
    if (is_string($uncertain)) {
        $uncertain = trim($uncertain) !== '' ? [trim($uncertain)] : [];
    } elseif (!is_array($uncertain)) {
        $uncertain = [];
    }

    return [
        'tool'                   => 'bvj_analyzer',
        'language'               => $language,
        'advocate_role'          => $advocateRole,
        'doc_meta'               => $docMeta,
        'parties'                => $parties,
        'timeline'               => ['events' => $timelineEvents],
        'advocacy_brief'         => (string)($synJson['advocacy_brief'] ?? ''),
        'procedural_red_flags'   => is_array($synJson['procedural_red_flags'] ?? null)
            ? $synJson['procedural_red_flags'] : [],
        'client_strengths'       => is_array($synJson['client_strengths'] ?? null)
            ? $synJson['client_strengths'] : [],
        'opposing_weaknesses'    => is_array($synJson['opposing_weaknesses'] ?? null)
            ? $synJson['opposing_weaknesses'] : [],
        'sub_questions'          => $subQOut,
        'sources'                => $numberedSources,
        'what_we_found'          => (string)($synJson['what_we_found'] ?? ''),
        'what_remains_uncertain' => $uncertain,
        'next_practical_step'    => (string)($synJson['next_practical_step'] ?? ''),
        'trace'                  => $trace,
        'trace_metadata'         => [
            'chunk_count'         => count($merged),
            'source_count'        => count($numberedSources),
            'sub_question_count'  => count($retrievalQueries),
            'upload_chunk_count'  => count($this->uploadVecs),
            'deployment'          => $synthesis['deploy_label'],
            'engine_used'         => $engine,
            'citation_confidence' => $confidence,
            'elapsed_ms_per_step' => $this->stepTimings,
            'retrieval_counts'    => $retrievalCounts,
            'slices_active'       => array_keys(array_filter($sliceSelectionNormalized)),
        ],
        'disclaimer'             => dbnToolsDisclaimer($language),
    ];
}
|
||
|
||
// ── Step 1: Document classification ──────────────────────────────────────
|
||
|
||
/**
 * Step 1: ask the chat model for header metadata describing the document.
 *
 * Only the first 6000 characters of the document are sent. The model's
 * answer is untrusted: just the five documented keys are accepted and every
 * accepted value is reduced to a non-empty string, because callers
 * concatenate these values straight into status lines and prompts.
 *
 * @param string $docText  Combined text of the uploaded document(s)
 * @param string $language Language code passed to dbnToolsLanguageName() for the reply language
 *
 * @return array{doc_type: string, doc_date: ?string, issuing_authority: ?string, reference_number: ?string, child_info: ?string}
 *               The defaults are returned when the call fails or the reply is not a JSON object.
 */
private function classifyDocument(string $docText, string $language): array
{
    $locale  = dbnToolsLanguageName($language);
    $excerpt = mb_substr($docText, 0, 6000, 'UTF-8');

    $prompt = <<<PROMPT
    You are analysing a Norwegian child welfare (Barnevernet) document.
    Extract the following metadata from the document text below.

    Return JSON only in {$locale}:
    {
    "doc_type": "The document type as a short phrase, e.g. Bekymringsmelding, Vedtak, Omsorgsovertakelse, Fylkesnemnda-kjennelse, Rapport, or the detected type",
    "doc_date": "Primary document date in ISO 8601 format (YYYY-MM-DD) if identifiable, otherwise null",
    "issuing_authority": "Name of the issuing authority or institution, e.g. Trondheim kommune barneverntjeneste",
    "reference_number": "Case or document reference number if present, otherwise null",
    "child_info": "Brief description of the child(ren) involved, e.g. name and birth date if visible — anonymise if clearly redacted"
    }

    Rules:
    - If a field cannot be determined, use null.
    - doc_type should be the Norwegian term if recognisable (e.g. Bekymringsmelding), otherwise English.
    - Do not invent information not present in the text.

    Document text (first 6000 chars):
    {$excerpt}
    PROMPT;

    $default = [
        'doc_type'          => 'BVJ Document',
        'doc_date'          => null,
        'issuing_authority' => null,
        'reference_number'  => null,
        'child_info'        => null,
    ];

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 400, 'timeout' => 30]);
        $json = $this->azure->decodeJsonObject($raw);
        if (is_array($json)) {
            $meta = $default;
            foreach (array_keys($default) as $key) {
                $value = $json[$key] ?? null;
                if (is_array($value)) {
                    // e.g. several children returned as a list: flatten so the value stays printable.
                    $allScalar = $value === array_filter($value, 'is_scalar');
                    $value = $allScalar
                        ? implode(', ', array_map('strval', $value))
                        : json_encode($value, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
                }
                if (!is_scalar($value) || is_bool($value)) {
                    continue; // null, objects, booleans, failed json_encode → keep the default
                }
                $value = trim((string)$value);
                // Models sometimes spell a missing value as the string "null".
                if ($value === '' || strcasecmp($value, 'null') === 0) {
                    continue;
                }
                $meta[$key] = $value;
            }
            return $meta;
        }
    } catch (Throwable $e) {
        // Classification is best effort; the pipeline continues with defaults.
        error_log('BVJ classifyDocument failed: ' . $e->getMessage());
    }
    return $default;
}
|
||
|
||
// ── Step 2: Party extraction ──────────────────────────────────────────────
|
||
|
||
/**
 * Step 2: ask the chat model for every named person or institution in the document.
 *
 * Only the first 12000 characters are sent. Each row of the reply is
 * normalised to exactly the four documented fields so later steps can read
 * them without guarding against missing keys or non-array rows. Rows without
 * a name are dropped and at most 20 parties are returned.
 *
 * @param string $docText  Combined text of the uploaded document(s)
 * @param string $language Language code passed to dbnToolsLanguageName() for the reply language
 *
 * @return list<array{name: string, role: string, organization: ?string, relationship_to_child: ?string}>
 *         Empty when the call fails or the reply has an unexpected structure.
 */
private function extractParties(string $docText, string $language): array
{
    $locale  = dbnToolsLanguageName($language);
    $excerpt = mb_substr($docText, 0, 12000, 'UTF-8');

    $prompt = <<<PROMPT
    You are analysing a Norwegian child welfare (Barnevernet) document.
    Identify ALL named parties — every person or institution referred to by name or title.

    Respond in {$locale}. Return a JSON object with a single key "parties" containing an array of objects.
    Each object must have these four fields:
    - "name": full name or institution name (string)
    - "role": their role in the case, e.g. Biological mother, Child, Barnevernarbeider, Saksbehandler, Melder, Politi, Lege, Advokat, Foster carer, Rusklinikk
    - "organization": employer or institution if mentioned, otherwise null
    - "relationship_to_child": relationship to the child in the document, e.g. Mother, Father, Caseworker, Melder, or null

    Rules:
    - Include every named person and named institution — even peripheral ones.
    - Include Barnevernvakta (bvv) as an institution even if no individual caseworkers are named.
    - Do not invent parties not present in the text.
    - Maximum 20 parties.

    Document text:
    {$excerpt}
    PROMPT;

    // Reduces an untrusted model value to a trimmed string ('' when unusable or the literal "null").
    $clean = static function (mixed $value): string {
        if (!is_scalar($value) || is_bool($value)) {
            return '';
        }
        $text = trim((string)$value);
        return strcasecmp($text, 'null') === 0 ? '' : $text;
    };

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 1500, 'timeout' => 40]);
        $json = $this->azure->decodeJsonObject($raw);

        $rows = null;
        if (is_array($json) && is_array($json['parties'] ?? null)) {
            $rows = $json['parties'];
        } elseif (is_array($json) && isset($json[0]['name'])) {
            // Fallback: model returned an array at root level instead of {parties:[...]}
            $rows = $json;
        }

        if ($rows !== null) {
            $parties = [];
            foreach ($rows as $row) {
                if (!is_array($row)) {
                    continue;
                }
                $name = $clean($row['name'] ?? null);
                if ($name === '') {
                    continue; // the prompt asks for named parties only
                }
                $organization = $clean($row['organization'] ?? null);
                $relationship = $clean($row['relationship_to_child'] ?? null);
                $parties[] = [
                    'name'                  => $name,
                    'role'                  => $clean($row['role'] ?? null),
                    'organization'          => $organization !== '' ? $organization : null,
                    'relationship_to_child' => $relationship !== '' ? $relationship : null,
                ];
                if (count($parties) === 20) {
                    break;
                }
            }
            return $parties;
        }
        // mb_substr keeps the log excerpt valid UTF-8 (a byte-wise cut can split a character).
        error_log('BVJ extractParties unexpected structure: ' . mb_substr((string)$raw, 0, 300, 'UTF-8'));
    } catch (Throwable $e) {
        // Party extraction is best effort; the pipeline continues with no parties.
        error_log('BVJ extractParties failed: ' . $e->getMessage());
    }
    return [];
}
|
||
|
||
// ── Step 3: Timeline extraction ───────────────────────────────────────────
|
||
|
||
/**
 * Step 3: ask the chat model for a chronological list of events in the document.
 *
 * Only the first 12000 characters are sent. Each row of the reply is
 * normalised to the five documented fields: `significance` is lower-cased and
 * restricted to high|medium|low (anything else becomes low) so the
 * `=== 'high'` checks elsewhere are reliable, and `time_of_day` is always
 * present. Rows with neither a date nor an action are dropped. Ordering is
 * left as returned by the model. At most 30 events are returned.
 *
 * @param string $docText  Combined text of the uploaded document(s)
 * @param string $language Language code passed to dbnToolsLanguageName() for the reply language
 *
 * @return list<array{date: ?string, time_of_day: ?string, actor: string, action: string, significance: string}>
 *         Empty when the call fails or the reply has no "events" array.
 */
private function extractTimeline(string $docText, string $language): array
{
    $locale  = dbnToolsLanguageName($language);
    $excerpt = mb_substr($docText, 0, 12000, 'UTF-8');

    $prompt = <<<PROMPT
    Build a chronological timeline from this Norwegian child welfare (Barnevernet) document in {$locale}.

    Extract ALL dates, times, and temporal references — including phone calls, home visits, meetings, decisions, and assessments.

    IMPORTANT — Norwegian date and time formats to recognise:
    - DD.MM.YY (e.g. 18.07.20 = 2020-07-18)
    - DD.MM.YYYY (e.g. 18.07.2020)
    - D.M.YY (e.g. 6.1.20 = 2020-01-06)
    - DD.MM. (day and month without year — infer year from surrounding context)
    - Times: kl. HH:MM, klokken HH:MM, kl HH.MM
    - Diary/log format: lines beginning with a date or time are always events.
    - Two-digit years: interpret as 20YY (20 → 2020, 21 → 2021).

    For each event provide:
    - "date": ISO 8601 date (YYYY-MM-DD) if determinable, otherwise best-effort description
    - "time_of_day": HH:MM if present, otherwise null
    - "actor": person, institution, or party involved
    - "action": concise description (≤ 80 chars) of what happened
    - "significance": high (acute measure, removal, police involvement, formal decision) | medium (home visit, phone call, meeting) | low (minor update, note)

    Sort chronologically. Maximum 30 events.

    Document text:
    {$excerpt}

    Return JSON only:
    {
    "events": [{"date":"...","time_of_day":null,"actor":"...","action":"...","significance":"high|medium|low"}]
    }
    PROMPT;

    // Reduces an untrusted model value to a trimmed string ('' when unusable or the literal "null").
    $clean = static function (mixed $value): string {
        if (!is_scalar($value) || is_bool($value)) {
            return '';
        }
        $text = trim((string)$value);
        return strcasecmp($text, 'null') === 0 ? '' : $text;
    };

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 3000, 'timeout' => 45]);
        $json = $this->azure->decodeJsonObject($raw);
        if (is_array($json) && is_array($json['events'] ?? null)) {
            $events = [];
            foreach ($json['events'] as $row) {
                if (!is_array($row)) {
                    continue;
                }
                $date   = $clean($row['date'] ?? null);
                $action = $clean($row['action'] ?? null);
                if ($date === '' && $action === '') {
                    continue; // nothing usable in this row
                }
                $time         = $clean($row['time_of_day'] ?? null);
                $significance = strtolower($clean($row['significance'] ?? null));
                if (!in_array($significance, ['high', 'medium', 'low'], true)) {
                    $significance = 'low';
                }
                $events[] = [
                    'date'         => $date !== '' ? $date : null,
                    'time_of_day'  => $time !== '' ? $time : null,
                    'actor'        => $clean($row['actor'] ?? null),
                    'action'       => $action,
                    'significance' => $significance,
                ];
                if (count($events) === 30) {
                    break;
                }
            }
            return $events;
        }
    } catch (Throwable $e) {
        // Timeline extraction is best effort; the pipeline continues with no events.
        error_log('BVJ extractTimeline failed: ' . $e->getMessage());
    }
    return [];
}
|
||
|
||
// ── Step 4: Sub-question generation ──────────────────────────────────────
|
||
|
||
/**
 * Step 4: generate research sub-questions that support the advocate's side.
 *
 * The prompt is built from the document type, up to 8 events (high
 * significance first) and up to 8 parties. Rows in the model's reply that
 * have a question but a missing or duplicate id are given the next free
 * `qN` id instead of being dropped, because ids are what later link sources
 * to sub-questions. When the call fails or yields nothing usable, a fixed
 * English fallback list is returned, trimmed to the requested count.
 *
 * @param array  $docMeta        Metadata from classifyDocument() (only doc_type is read)
 * @param array  $parties        Parties from extractParties()
 * @param array  $timelineEvents Events from extractTimeline()
 * @param string $advocateRole   Party the user represents; empty means "the affected party"
 * @param int    $count          Number of sub-questions wanted (values below 1 are treated as 1)
 * @param string $language       Language code passed to dbnToolsLanguageName() for the reply language
 *
 * @return list<array{id: string, question: string, rationale: string}> Never empty.
 */
private function generateSubQuestions(
    array $docMeta,
    array $parties,
    array $timelineEvents,
    string $advocateRole,
    int $count,
    string $language
): array {
    $count   = max(1, $count);
    $locale  = dbnToolsLanguageName($language);
    $docType = $docMeta['doc_type'] ?? 'BVJ document';
    $roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party';

    // Events and parties come from the model, so only scalar values are printed.
    $printable = static fn(mixed $value, string $fallback = ''): string
        => (is_scalar($value) && !is_bool($value)) ? (string)$value : $fallback;

    // Summarise the top events to give the model context
    $highEvents  = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
    $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
    $topEvents   = array_slice(array_merge($highEvents, $otherEvents), 0, 8);
    $eventSummary = '';
    foreach ($topEvents as $ev) {
        $eventSummary .= sprintf(
            "- %s: %s (%s)\n",
            $printable($ev['date'] ?? null, '?'),
            $printable($ev['action'] ?? null),
            $printable($ev['actor'] ?? null)
        );
    }

    // Summarise parties
    $partyList = '';
    foreach (array_slice($parties, 0, 8) as $p) {
        $partyList .= sprintf("- %s (%s)\n", $printable($p['name'] ?? null), $printable($p['role'] ?? null));
    }

    $prompt = <<<PROMPT
    You are a Norwegian family-law research assistant building a case for: {$roleStr}.

    A {$docType} has been uploaded. Key events:
    {$eventSummary}
    Key parties:
    {$partyList}

    Generate exactly {$count} targeted sub-questions to research the legal corpus for arguments that SUPPORT {$roleStr}'s position. Each question should explore a different angle:
    1. Statutory rights and obligations (Barnevernloven, Barneloven)
    2. ECHR Article 8 and 9 precedents vs Norway
    3. Procedural requirements BVV must follow (notice, documentation, proportionality)
    4. Bufdir guidance on case handling standards
    5. Norwegian court decisions on similar fact patterns

    Return JSON only in {$locale}:
    {
    "sub_questions": [
    {"id":"q1","question":"...","rationale":"how this angle strengthens {$roleStr}'s position (≤ 120 chars)"}
    ]
    }

    Rules:
    - Exactly {$count} sub-questions, no more no fewer.
    - Every question must be answerable from Norwegian family-law, child-welfare, or ECHR sources.
    - Each question must cover a DIFFERENT legal angle.
    - Questions must be self-contained without needing the raw document.
    - Respond in {$locale}.
    PROMPT;

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user',   'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.15, 'max_tokens' => 1000, 'timeout' => 40]);
        $json = $this->azure->decodeJsonObject($raw);
        if (is_array($json) && is_array($json['sub_questions'] ?? null) && count($json['sub_questions']) >= 1) {
            $sqs     = [];
            $usedIds = [];
            foreach (array_slice($json['sub_questions'], 0, $count) as $row) {
                if (!is_array($row)) {
                    continue;
                }
                $question = trim($printable($row['question'] ?? null));
                if ($question === '') {
                    continue;
                }
                $id = trim($printable($row['id'] ?? null));
                if ($id === '' || isset($usedIds[$id])) {
                    // Ids must be unique: they tie retrieved sources back to a sub-question.
                    $n = count($sqs) + 1;
                    do {
                        $id = 'q' . $n++;
                    } while (isset($usedIds[$id]));
                }
                $usedIds[$id] = true;
                $sqs[] = [
                    'id'        => $id,
                    'question'  => $question,
                    'rationale' => $printable($row['rationale'] ?? null),
                ];
            }
            if ($sqs) return $sqs;
        }
    } catch (Throwable $e) {
        // Fall through to the generic questions below.
        error_log('BVJ generateSubQuestions failed: ' . $e->getMessage());
    }

    // Fallback: generic sub-questions (English only), limited to the requested count
    $fallback = [
        ['id' => 'q1', 'question' => "What procedural rights does {$roleStr} have in Barnevernet proceedings under Barnevernloven?", 'rationale' => 'Procedural rights'],
        ['id' => 'q2', 'question' => "What does ECHR Article 8 require when child welfare authorities intervene in family life?", 'rationale' => 'ECHR Article 8'],
        ['id' => 'q3', 'question' => "What Bufdir guidance applies to the proportionality of Barnevernet interventions?", 'rationale' => 'Proportionality'],
        ['id' => 'q4', 'question' => "What are the documentation and notice obligations of BVV before taking acute measures?", 'rationale' => 'Documentation obligations'],
    ];
    return array_slice($fallback, 0, $count);
}
|
||
|
||
// ── Step 6: Synthesis ─────────────────────────────────────────────────────
|
||
|
||
/**
 * Pipeline synthesis step: ask the selected engine for the structured advocacy brief.
 *
 * Builds a single prompt from the document metadata, at most 8 parties, at most 15
 * timeline events (high-significance events first), the numbered sources, the
 * researched sub-questions and the first 3000 characters of the document, then
 * decodes the model reply as a JSON object.
 *
 * No model call is made when $numberedSources is empty; a placeholder result with a
 * localised brief is returned instead. When the reply cannot be decoded into an
 * object with a non-empty "advocacy_brief", the raw reply text is returned as the
 * brief together with explanatory filler fields.
 *
 * @param string        $docText         Full text of the uploaded document.
 * @param array         $docMeta         Metadata; keys read: doc_type, doc_date, issuing_authority,
 *                                       reference_number, child_info (all optional).
 * @param array         $parties         Party rows; keys read: name, role, organization, relationship_to_child.
 * @param array         $timelineEvents  Event rows; keys read: date, time_of_day, significance, actor, action.
 * @param array         $subQuestions    Sub-question rows; keys read: id, question.
 * @param array         $numberedSources Sources carrying an "n" index plus title/excerpt/authority fields.
 * @param string        $advocateRole    Party being represented; empty falls back to "the affected party".
 * @param string        $engine          'gpu', 'azure_full', or anything else for the default Azure deployment.
 * @param string        $language        UI language code used for the response locale.
 * @param float         $temperature     Sampling temperature forwarded to the engine.
 * @param string        $additionalNotes Free-text context from the advocate; omitted from the prompt when empty.
 * @param callable|null $emit            Accepted for signature compatibility; not used by this method.
 *
 * @return array Array with keys 'json' (the structured brief) and 'deploy_label' (engine label).
 */
private function synthesiseBvj(
    string $docText,
    array $docMeta,
    array $parties,
    array $timelineEvents,
    array $subQuestions,
    array $numberedSources,
    string $advocateRole,
    string $engine,
    string $language,
    float $temperature,
    string $additionalNotes,
    ?callable $emit = null
): array {
    $locale      = dbnToolsLanguageName($language);
    $roleStr     = $advocateRole !== '' ? $advocateRole : 'the affected party';
    $docType     = $docMeta['doc_type'] ?? 'BVJ Document';
    $docDate     = $docMeta['doc_date'] ?? 'unknown date';
    $authority   = $docMeta['issuing_authority'] ?? 'unknown authority';
    // !empty() instead of a bare truthiness test: the key is optional in extracted metadata.
    $refNo       = !empty($docMeta['reference_number']) ? ' (ref ' . $docMeta['reference_number'] . ')' : '';
    $childInfo   = $docMeta['child_info'] ?? 'not specified';
    $sourceCount = count($numberedSources);

    // Computed once; used by both the empty-source shortcut and the normal return.
    $deployLabel = match ($engine) {
        'gpu' => 'GPU (cuttlefish)',
        'azure_full' => 'gpt-4o',
        default => $this->azure->chatDeployment(),
    };

    if (empty($numberedSources)) {
        $emptyBrief = match (dbnToolsNormalizeUiLanguage($language)) {
            'no' => 'Ingen kildetreff ble funnet i korpuset for de valgte skivene og spørsmålene.',
            'uk' => 'Для вибраних розділів і підпитань не знайдено джерел у корпусі.',
            'pl' => 'Nie znaleziono źródeł w korpusie dla wybranych sekcji i pytań pomocniczych.',
            default => 'No corpus sources were retrieved for the selected slices and sub-questions.',
        };

        return [
            'json' => [
                'advocacy_brief' => $emptyBrief,
                'procedural_red_flags' => [],
                'client_strengths' => [],
                'opposing_weaknesses' => [],
                'what_we_found' => 'No retrieved sources passed the similarity threshold.',
                'what_remains_uncertain' => ['No corpus evidence retrieved — widen slice selection or try different sub-questions.'],
                'next_practical_step' => 'Enable more corpus slices (Norwegian Courts, Bufdir Guidance) and re-run.',
            ],
            'deploy_label' => $deployLabel,
        ];
    }

    // Build parties summary (top 8). An explicit counter keeps the numbering
    // correct even if $parties arrives with non-sequential or string keys.
    $partiesSummary = '';
    $partyNo = 0;
    foreach (array_slice($parties, 0, 8) as $p) {
        $partyNo++;
        $org = !empty($p['organization']) ? ' (' . $p['organization'] . ')' : '';
        $rel = !empty($p['relationship_to_child']) ? ' — rel: ' . $p['relationship_to_child'] : '';
        $partiesSummary .= sprintf("%d. %s — %s%s%s\n", $partyNo, $p['name'] ?? '', $p['role'] ?? '', $org, $rel);
    }

    // Build timeline summary (top 15 events, high-significance ones first).
    $highEvents  = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
    $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
    $topEvents   = array_slice(array_merge($highEvents, $otherEvents), 0, 15);

    $timelineSummary = '';
    foreach ($topEvents as $ev) {
        $time = !empty($ev['time_of_day']) ? ' kl.' . $ev['time_of_day'] : '';
        $timelineSummary .= sprintf(
            "- %s%s [%s] %s: %s\n",
            $ev['date'] ?? '?',
            $time,
            // Cast guards strtoupper() against non-string values under strict_types.
            strtoupper((string)($ev['significance'] ?? 'low')),
            $ev['actor'] ?? '',
            $ev['action'] ?? ''
        );
    }

    // Build sources text: one numbered paragraph per retrieved source.
    $sourcesContext = [];
    foreach ($numberedSources as $s) {
        $sourcesContext[] = sprintf(
            "[%d] (%s) %s%s\n Corpus: %s\n Authority: %s | Jurisdiction: %s\n Excerpt: %s",
            $s['n'] ?? 0,
            ($s['source_origin'] ?? 'corpus') === 'upload' ? 'uploaded doc' : 'corpus',
            $s['title'] ?? '',
            !empty($s['section']) ? ' — ' . $s['section'] : '',
            $s['package_or_corpus'] ?? '',
            $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
            $s['jurisdiction'] ?? 'n/a',
            $s['excerpt'] ?? ''
        );
    }
    $sourcesText = implode("\n\n", $sourcesContext);

    // Build sub-question text.
    $subQText = '';
    if ($subQuestions) {
        $subQText = "\nSub-questions researched:\n";
        foreach ($subQuestions as $sq) {
            $subQText .= sprintf("- %s: %s\n", $sq['id'] ?? '', $sq['question'] ?? '');
        }
    }

    $notesSection = $additionalNotes !== ''
        ? "\n== ADDITIONAL CONTEXT FROM ADVOCATE ==\n{$additionalNotes}\n"
        : '';

    $docExcerpt = mb_substr($docText, 0, 3000, 'UTF-8');

    // The closing marker's indentation is stripped from every heredoc line, so the
    // prompt text itself starts at column 0.
    $prompt = <<<PROMPT
    You are Do Better Norge Legal Tools producing a structured Barnevernet case analysis brief.
    You are representing: {$roleStr}

    Ground every claim in the numbered corpus sources below using [n] markers, OR in the uploaded document using [DOC].
    Do NOT invent statutes, paragraph numbers, case names, ECHR applications, dates, or parties.
    Return valid JSON only. No markdown fences.

    == DOCUMENT METADATA ==
    Type: {$docType}{$refNo}
    Date: {$docDate}
    Issuing authority: {$authority}
    Child: {$childInfo}

    == KEY PARTIES ==
    {$partiesSummary}

    == TIMELINE (from document) ==
    {$timelineSummary}

    == CORPUS SOURCES ({$sourceCount} numbered) ==
    {$sourcesText}
    {$notesSection}
    {$subQText}

    == DOCUMENT EXCERPT (first 3000 chars — use [DOC] to cite) ==
    {$docExcerpt}

    Return JSON in {$locale}:
    {
    "advocacy_brief": "Partisan legal brief in Markdown. Structure:\n## Case Overview\n(What happened according to [DOC] — doc type, authority, key events)\n\n## {$roleStr}'s Core Legal Position\n(Strongest statutory and ECHR arguments — cite [n] and [DOC])\n\n## Procedural Compliance Issues\n(Where BVV may have failed their own procedural obligations — cite [DOC][n])\n\n## Client Strengths\n(Factual and legal advantages for {$roleStr} — cite [n][DOC])\n\n## Counter-Arguments and Responses\n(Likely opposing arguments and how to rebut — cite [n])\n\n## Recommended Next Steps\n(Concrete legal actions)\n\nEnd with a one-line disclaimer. Length: 500-1000 words.",

    "procedural_red_flags": [
    {
    "description": "Concise description of the potential procedural violation",
    "legal_basis": "Statute or ECHR article potentially violated, e.g. Barnevernloven §6-1, ECHR Art.8",
    "severity": "high",
    "source_refs": ["[n]", "[DOC]"],
    "what_to_check": "Specific document text or action requiring legal verification"
    }
    ],

    "client_strengths": ["3-6 items anchored with [n] or [DOC]"],
    "opposing_weaknesses": ["2-5 vulnerabilities in BVV or opposing party position — omit if unsupported by sources"],
    "what_we_found": "2-sentence plain-language summary of the most critical finding",
    "what_remains_uncertain": ["3-5 specific gaps — missing information, unclear authority, conflicting sources"],
    "next_practical_step": "The single most important concrete legal action for {$roleStr}"
    }

    Rules:
    - Every factual claim in advocacy_brief must end with [n] or [DOC].
    - procedural_red_flags must be grounded in documented BVV actions — no speculation.
    - severity: high = likely violation of a codified right; medium = procedural irregularity; low = best-practice gap.
    - If no corpus source supports a claimed weakness, omit it from opposing_weaknesses.
    - Cite statute sections and ECHR articles as they appear in the corpus excerpts.
    - Respond in {$locale}.
    PROMPT;

    $sysPrompt = 'You return valid JSON only. No markdown fences.';

    $messages = [
        ['role' => 'system', 'content' => $sysPrompt],
        ['role' => 'user', 'content' => $prompt],
    ];
    $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3000, 'timeout' => 200];

    $raw = '';
    try {
        if ($engine === 'gpu') {
            $response = dbnToolsCallGpuLlm($messages, $opts);
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'azure_full') {
            $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
        } else {
            $raw = $this->azure->chatText($messages, $opts);
        }
    } catch (Throwable $e) {
        dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $json = $this->azure->decodeJsonObject($raw);
    if (!is_array($json) || empty($json['advocacy_brief'])) {
        // Reply was not usable structured JSON: surface the raw text as the brief.
        $json = [
            'advocacy_brief' => $raw,
            'procedural_red_flags' => [],
            'client_strengths' => [],
            'opposing_weaknesses' => [],
            'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
            'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
            'next_practical_step' => 'Review the brief manually before relying on it.',
        ];
    }

    return ['json' => $json, 'deploy_label' => $deployLabel];
}
|
||
|
||
// ── GPU streaming helper (keeps browser connection alive during slow models) ──
|
||
|
||
/**
 * Call the LiteLLM endpoint with streaming enabled and accumulate the full text.
 *
 * The response is parsed as a server-sent-event stream: every "data: " line is
 * JSON-decoded and its choices[0].delta.content is appended to the result.
 * Incomplete lines are buffered between cURL write callbacks so that an event
 * split across two network chunks is not lost.
 *
 * While the transfer is running, $onProgress() is invoked at most once every
 * 15 seconds (from both the write and the progress callbacks, so it also fires
 * while the model has not produced any bytes yet) so PHP can flush a keepalive
 * event to the browser.
 *
 * @param array         $messages   Chat messages sent as the "messages" payload field.
 * @param array         $options    Optional keys: timeout, model, temperature, max_tokens, stop.
 * @param callable|null $onProgress Keepalive hook, called without arguments; may be null.
 *
 * @return string The accumulated completion text, trimmed.
 *
 * @throws RuntimeException When the payload cannot be encoded, cURL fails, or the
 *                          endpoint answers with an HTTP status >= 400.
 */
private function callGpuLlmStream(array $messages, array $options, ?callable $onProgress): string
{
    $url = 'http://10.0.1.10:4000/v1/chat/completions';
    // NOTE(review): a credential is hard-coded as the fallback below. It should be
    // rotated and supplied only through LITELLM_MASTER_KEY; left unchanged here so
    // existing deployments keep working.
    $apiKey  = (string)(dbnToolsEnv('LITELLM_MASTER_KEY') ?: 'sk-bnl-litellm-26xR9mK4qvN3wL8sTj7pB2d');
    $timeout = (int)($options['timeout'] ?? 660);

    $payload = [
        'model' => (string)($options['model'] ?? 'qwen2.5:14b'),
        'messages' => $messages,
        'temperature' => $options['temperature'] ?? 0.1,
        'max_tokens' => $options['max_tokens'] ?? 2800,
        'stream' => true,
    ];
    if (!empty($options['stop']) && is_array($options['stop'])) {
        $payload['stop'] = $options['stop'];
    }

    $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
    if ($body === false) {
        throw new RuntimeException('GPU stream request failed: could not encode payload: ' . json_last_error_msg());
    }
    $headers = [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ];

    $accumulated   = '';
    $pending       = ''; // trailing partial SSE line carried over to the next write callback
    $lastKeepalive = microtime(true);

    // Fire the keepalive hook when at least 15 seconds passed since the last one.
    $keepalive = static function () use (&$lastKeepalive, $onProgress): void {
        if ($onProgress !== null && microtime(true) - $lastKeepalive >= 15.0) {
            $lastKeepalive = microtime(true);
            $onProgress();
            @flush();
        }
    };

    // Parse one complete SSE line and append its text delta, if any.
    $consumeLine = static function (string $line) use (&$accumulated): void {
        $trimmed = ltrim($line);
        if (!str_starts_with($trimmed, 'data: ')) {
            return;
        }
        $json = substr($trimmed, 6);
        if (trim($json) === '[DONE]') {
            return;
        }
        $chunk = json_decode($json, true);
        $delta = is_array($chunk) ? ($chunk['choices'][0]['delta']['content'] ?? '') : '';
        if (is_string($delta) && $delta !== '') {
            $accumulated .= $delta;
        }
    };

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException('GPU stream request failed: could not initialise cURL');
    }
    curl_setopt_array($ch, [
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $body,
        CURLOPT_HTTPHEADER => $headers,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_RETURNTRANSFER => false,
        CURLOPT_WRITEFUNCTION => static function ($ch, $data) use (&$pending, $consumeLine, $keepalive): int {
            // cURL hands over arbitrary byte chunks; only lines terminated by "\n"
            // are complete. The last fragment stays in $pending until more arrives.
            $pending .= $data;
            $lines   = explode("\n", $pending);
            $pending = (string)array_pop($lines);
            foreach ($lines as $line) {
                $consumeLine($line);
            }
            $keepalive();
            return strlen($data);
        },
        // The progress callback runs even when no bytes arrive, which keeps the
        // keepalive ticking during a slow time-to-first-token.
        CURLOPT_NOPROGRESS => false,
        CURLOPT_PROGRESSFUNCTION => static function (...$progressArgs) use ($keepalive): int {
            $keepalive();
            return 0; // non-zero would abort the transfer
        },
    ]);

    curl_exec($ch);
    $curlErr    = curl_error($ch);
    $httpStatus = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // A final line without a trailing newline is still a complete event.
    if ($pending !== '') {
        $consumeLine($pending);
    }

    if ($curlErr !== '') {
        throw new RuntimeException('GPU stream request failed: ' . $curlErr);
    }
    if ($httpStatus >= 400) {
        throw new RuntimeException('GPU stream request failed: HTTP ' . $httpStatus);
    }
    return trim($accumulated);
}
|
||
|
||
// ── Shared helpers (copied from DbnDeepResearchAgent) ────────────────────
|
||
|
||
/**
 * Cut an uploaded text into overlapping word windows.
 *
 * Whitespace is collapsed first. Windows are CHUNK_WORDS long and start every
 * (CHUNK_WORDS - CHUNK_OVERLAP_WORDS) words. A window shorter than MIN_CHUNK_WORDS
 * is kept only when it is the very first one, and iteration stops after the first
 * window that is not full.
 *
 * @param string $text     Raw document text.
 * @param string $filename Name stored on each chunk.
 * @param int    $fileIdx  Index of the file, used in the chunk id.
 *
 * @return array List of chunks with keys chunk_id, file_index, chunk_index, filename, text.
 */
private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
{
    $collapsed = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
    if ($collapsed === '') {
        return [];
    }

    $tokens = preg_split('/\s+/u', $collapsed, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if (!$tokens) {
        return [];
    }

    // Distance between window starts; never allowed to drop below one word.
    $stride = self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS;
    if ($stride < 1) {
        $stride = 1;
    }

    $result     = [];
    $sequence   = 0;
    $tokenCount = count($tokens);

    for ($offset = 0; $offset < $tokenCount; $offset += $stride) {
        $window     = array_slice($tokens, $offset, self::CHUNK_WORDS);
        $windowSize = count($window);

        if ($offset === 0 || $windowSize >= self::MIN_CHUNK_WORDS) {
            $result[] = [
                'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $sequence),
                'file_index' => $fileIdx,
                'chunk_index' => $sequence,
                'filename' => $filename,
                'text' => implode(' ', $window),
            ];
            $sequence++;
        }

        // A short window means the end of the text was reached.
        if ($windowSize < self::CHUNK_WORDS) {
            break;
        }
    }

    return $result;
}
|
||
|
||
/**
 * Rank the embedded upload chunks against one question.
 *
 * The question is embedded with dbnToolsLiteLLMEmbedBatch(); chunks whose cosine
 * similarity is below $threshold are dropped. The best ceil($limitPerSubQ / 2)
 * hits (at least one) are returned, highest similarity first. An embedding
 * failure is logged and yields an empty list.
 *
 * @param string $question     Question text to embed.
 * @param int    $limitPerSubQ Per-sub-question chunk budget; uploads get half of it.
 * @param float  $threshold    Minimum cosine similarity for a hit.
 *
 * @return array List of source rows with source_origin 'upload'.
 */
private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
{
    if (empty($this->uploadVecs)) {
        return [];
    }

    try {
        $questionVec = dbnToolsLiteLLMEmbedBatch([$question])[0] ?? [];
    } catch (Throwable $e) {
        error_log('BVJ sub-Q embed failed: ' . $e->getMessage());
        return [];
    }
    if (empty($questionVec)) {
        return [];
    }

    $hits = [];
    foreach ($this->uploadVecs as $candidate) {
        $score = $this->cosineSim($questionVec, $candidate['vec']);
        if ($score >= $threshold) {
            $meta   = $candidate['meta'];
            $hits[] = [
                'chunk_id' => $meta['chunk_id'],
                'title' => 'uploaded: ' . $meta['filename'],
                'section' => null,
                'package_or_corpus' => 'Your upload',
                'excerpt' => dbnToolsExcerpt($meta['text'], 620),
                'chunk_text' => $meta['text'],
                'similarity' => round($score, 4),
                'reranker_score' => null,
                'document_id' => null,
                'source_origin' => 'upload',
                'authority_type' => null,
                'jurisdiction' => null,
            ];
        }
    }

    // Highest similarity first.
    usort($hits, fn(array $left, array $right) => ($right['similarity'] <=> $left['similarity']));

    $quota = max(1, (int)ceil($limitPerSubQ / 2));
    return array_slice($hits, 0, $quota);
}
|
||
|
||
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first min(count($a), count($b)) components are compared. Returns 0.0
 * when either vector is empty or has zero magnitude over that range.
 *
 * @param array $a First vector (0-indexed list of numbers).
 * @param array $b Second vector (0-indexed list of numbers).
 *
 * @return float Similarity value.
 */
private function cosineSim(array $a, array $b): float
{
    $dims = min(count($a), count($b));
    if ($dims === 0) {
        return 0.0;
    }

    $dotProduct = 0.0;
    $sumSqA     = 0.0;
    $sumSqB     = 0.0;

    $k = 0;
    while ($k < $dims) {
        $left  = (float)$a[$k];
        $right = (float)$b[$k];

        $dotProduct += $left * $right;
        $sumSqA     += $left * $left;
        $sumSqB     += $right * $right;
        $k++;
    }

    // A zero-length vector has no direction; treat it as dissimilar.
    if ($sumSqA === 0.0 || $sumSqB === 0.0) {
        return 0.0;
    }

    return $dotProduct / (sqrt($sumSqA) * sqrt($sumSqB));
}
|
||
|
||
/**
 * Map a raw corpus retrieval row onto the source shape used by the pipeline.
 *
 * Missing fields fall back to null, or to a default label for title and corpus
 * name. The URL, deep link, authority label, corpus source name and publication
 * date fields are initialised to null here.
 *
 * @param array  $chunk  Raw row from corpus retrieval.
 * @param string $subQId Id of the sub-question that retrieved the row.
 *
 * @return array Normalised source row with source_origin 'corpus'.
 */
private function normalizeCorpusChunk(array $chunk, string $subQId): array
{
    $body = (string)($chunk['content'] ?? '');

    // Cast an optional field to int, keeping "absent" as null.
    $intOrNull = static fn(string $field) => isset($chunk[$field]) ? (int)$chunk[$field] : null;
    // Round an optional score to four decimals, keeping "absent" as null.
    $scoreOrNull = static fn(string $field) => isset($chunk[$field]) ? round((float)$chunk[$field], 4) : null;

    return [
        'chunk_id' => $intOrNull('id'),
        'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
        'section' => $chunk['section_title'] ?? null,
        'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
        'excerpt' => dbnToolsExcerpt($body, 620),
        'chunk_text' => $body,
        'similarity' => $scoreOrNull('similarity'),
        'reranker_score' => $scoreOrNull('reranker_score'),
        'document_id' => $intOrNull('document_id'),
        'source_origin' => 'corpus',
        'authority_type' => $chunk['authority_type'] ?? null,
        'jurisdiction' => $chunk['jurisdiction'] ?? null,
        'publication_year' => $chunk['publication_year'] ?? null,
        'source_url' => null,
        'deep_link' => null,
        'authority_label' => null,
        'corpus_source_name' => null,
        'publication_date' => null,
        'matched_sub_questions' => [$subQId],
    ];
}
|
||
|
||
/**
 * Decide whether a retrieved corpus row must be dropped from the pool.
 *
 * Rows about the EU AI Act (matched on title or a EUR-Lex URL) are always
 * excluded. Rows that look like Do Better Norge website pages are excluded unless
 * the 'dbn_resources' slice is active.
 *
 * @param array $chunk        Raw corpus row; reads document_title/title, source_url, source_name.
 * @param array $activeSlices Slice flags keyed by slice name.
 *
 * @return bool True when the row should be excluded.
 */
private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
{
    $lowerTitle  = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
    $lowerUrl    = strtolower((string)($chunk['source_url'] ?? ''));
    $lowerSource = strtolower((string)($chunk['source_name'] ?? ''));

    // EU AI Act material is never relevant for this tool.
    $aiActByTitle = (bool)preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $lowerTitle);
    if ($aiActByTitle) {
        return true;
    }
    $aiActByUrl = str_contains($lowerUrl, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $lowerUrl);
    if ($aiActByUrl) {
        return true;
    }

    // Heuristics that identify a page of the organisation's own website.
    $looksLikeSitePage = str_contains($lowerSource, 'website')
        || str_contains($lowerTitle, 'dobetternorge.no')
        || str_contains($lowerTitle, 'resource directory');

    if (!$looksLikeSitePage) {
        $titlePatterns = [
            '/^(homepage|landing|about |contact )/i',
            '/^flashcards?\s*[-–|]/i',
            '/\|\s*do better norge\s*$/i',
            '/[-–]\s*do better norge\s*$/i',
        ];
        foreach ($titlePatterns as $pattern) {
            if (preg_match($pattern, $lowerTitle)) {
                $looksLikeSitePage = true;
                break;
            }
        }
    }

    if (!$looksLikeSitePage) {
        return false;
    }

    // Website pages are allowed only when their slice was switched on.
    return !($activeSlices['dbn_resources'] ?? false);
}
|
||
|
||
/**
 * Enrich corpus-origin pool entries in place with document-level metadata.
 *
 * Steps, in order:
 *  1. Load the documents referenced by the pool from the RAG database.
 *  2. For documents with content but no stored summary, generate one through the
 *     Azure gateway and write it back to the documents table (failures are logged
 *     per document and skipped).
 *  3. Resolve corpus source names from corpus_sources.
 *  4. Copy source_url, deep_link, authority_label, corpus_source_name,
 *     publication_date and summary onto each matching pool entry.
 *
 * If steps 1-3 throw, the error is logged and the pool is left untouched.
 *
 * @param array $pool Source pool, modified by reference.
 */
private function hydrateSourceUrls(array &$pool): void
{
    // Distinct positive document ids of corpus-origin entries.
    $wantedIds = [];
    foreach ($pool as $entry) {
        if (($entry['source_origin'] ?? 'corpus') === 'corpus') {
            $candidateId = (int)($entry['document_id'] ?? 0);
            if ($candidateId > 0) {
                $wantedIds[$candidateId] = true;
            }
        }
    }
    if (empty($wantedIds)) {
        return;
    }

    try {
        $ragDb        = dbnToolsRagDb();
        $idList       = array_keys($wantedIds);
        $placeholders = implode(',', array_fill(0, count($idList), '?'));

        $docQuery = $ragDb->prepare("
            SELECT d.id, d.title, d.source_url, d.authority_type,
            d.publication_date, d.source_id, d.jurisdiction,
            d.summary, LEFT(d.content, 4000) AS content_excerpt
            FROM documents d
            WHERE d.id IN ({$placeholders})
        ");
        $docQuery->execute($idList);

        $docMeta         = [];
        $seenSourceIds   = [];
        foreach ($docQuery as $record) {
            $documentId = (int)$record['id'];
            $sourceId   = isset($record['source_id']) ? (int)$record['source_id'] : null;
            if ($sourceId) {
                $seenSourceIds[] = $sourceId;
            }
            $docMeta[$documentId] = [
                'source_url' => $record['source_url'] ?? null,
                'authority_label' => dbnV6AuthorityLabel($record['authority_type'] ?? null),
                'publication_date' => $record['publication_date'] ?? null,
                'corpus_source_name' => 'Do Better Legal',
                'source_id' => $sourceId,
                'summary' => $record['summary'] ?? null,
                'content_excerpt' => (string)($record['content_excerpt'] ?? ''),
                'title' => (string)($record['title'] ?? ''),
            ];
        }

        // Generate and persist summaries for documents that have none yet.
        foreach ($docMeta as $documentId => $meta) {
            if ($meta['summary'] !== null || $meta['content_excerpt'] === '') {
                continue;
            }
            try {
                $reply = $this->azure->chatText([
                    ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                    ['role' => 'user', 'content' => "Summarise this Norwegian family law document.\nFocus on: legal provisions covered, authority type, and questions it helps answer.\n\nTitle: {$meta['title']}\n\nContent:\n{$meta['content_excerpt']}"],
                ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
                $generated = trim($reply);
                if ($generated !== '') {
                    $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$generated, $documentId]);
                    $docMeta[$documentId]['summary'] = $generated;
                }
            } catch (Throwable $e) {
                error_log('BVJ hydrateSourceUrls summary gen failed for doc ' . $documentId . ': ' . $e->getMessage());
            }
        }

        // Replace the default corpus name with the real source name where known.
        if (!empty($seenSourceIds)) {
            $distinctSourceIds  = array_values(array_unique($seenSourceIds));
            $sourcePlaceholders = implode(',', array_fill(0, count($distinctSourceIds), '?'));
            $sourceQuery = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sourcePlaceholders})");
            $sourceQuery->execute($distinctSourceIds);

            $namesById = [];
            foreach ($sourceQuery as $record) {
                $namesById[(int)$record['id']] = dbnV6RepairText((string)($record['name'] ?? 'Do Better Legal'));
            }
            foreach ($docMeta as $documentId => $meta) {
                if ($meta['source_id'] && isset($namesById[$meta['source_id']])) {
                    $docMeta[$documentId]['corpus_source_name'] = $namesById[$meta['source_id']];
                }
            }
        }
    } catch (Throwable $e) {
        error_log('BVJ hydrateSourceUrls failed: ' . $e->getMessage());
        return;
    }

    // Copy the collected metadata onto the matching corpus entries.
    foreach ($pool as $position => $entry) {
        if (($entry['source_origin'] ?? 'corpus') !== 'corpus') {
            continue;
        }
        $documentId = (int)($entry['document_id'] ?? 0);
        if (!$documentId || !isset($docMeta[$documentId])) {
            continue;
        }
        $meta = $docMeta[$documentId];
        $link = $meta['source_url'] ?? null;

        $pool[$position]['source_url']         = $link;
        $pool[$position]['deep_link']          = $this->buildDeepLink($link, $entry['section'] ?? null);
        $pool[$position]['authority_label']    = $meta['authority_label'] ?? $entry['authority_label'];
        $pool[$position]['corpus_source_name'] = $meta['corpus_source_name'] ?? null;
        $pool[$position]['publication_date']   = $meta['publication_date'] ?? null;
        $pool[$position]['summary']            = $meta['summary'] ?? null;
    }
}
|
||
|
||
/**
 * Turn a document URL into a link that points at a specific statute section.
 *
 * For lovdata.no URLs whose section title contains a paragraph sign followed by a
 * section number, "/§<number>" is appended to the URL. Any other non-empty URL is
 * returned trimmed and otherwise unchanged.
 *
 * The section number may be hyphenated ("§ 4-12", "§6-1") and may carry one
 * directly attached letter ("§ 36a"); the whole number is captured so that
 * "§ 4-12" links to ".../§4-12" rather than the truncated ".../§4-".
 *
 * @param string|null $sourceUrl    Document URL, or null when unknown.
 * @param string|null $sectionTitle Section heading of the chunk, if any.
 *
 * @return string|null The deep link, the plain URL, or null when no URL is available.
 */
private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
{
    if (!$sourceUrl) {
        return null;
    }
    $sourceUrl = trim($sourceUrl);
    if ($sourceUrl === '') {
        return null;
    }

    $isLovdata = preg_match('~^https?://lovdata\.no/~i', $sourceUrl) === 1;
    if (
        $isLovdata
        && $sectionTitle
        // digits, optional "-digits" groups, optional attached letter
        && preg_match('/§\s*(\d+(?:-\d+)*[A-Za-z]?)/u', $sectionTitle, $m)
    ) {
        return rtrim($sourceUrl, '/') . '/§' . $m[1];
    }

    return $sourceUrl;
}
|
||
|
||
/**
 * Collapse duplicate chunks, rank the result and cap its size.
 *
 * Chunks are identified by "<source_origin>:<chunk_id>"; a chunk without an id
 * gets a random key and is therefore never merged. On a duplicate, the
 * matched_sub_questions lists are unioned and the higher similarity and
 * reranker_score are kept. The merged list is sorted by reranker_score (falling
 * back to similarity, then 0) in descending order.
 *
 * @param array $rawPool Chunks gathered across all sub-questions.
 * @param int   $cap     Maximum number of chunks to return.
 *
 * @return array Ranked list of at most $cap chunks.
 */
private function mergeAndDedupe(array $rawPool, int $cap): array
{
    $unique = [];

    foreach ($rawPool as $candidate) {
        $origin    = $candidate['source_origin'] ?? 'corpus';
        $dedupeKey = $origin . ':' . ($candidate['chunk_id'] ?? bin2hex(random_bytes(4)));

        if (isset($unique[$dedupeKey])) {
            $kept = $unique[$dedupeKey];

            $combinedQuestions = array_merge(
                $kept['matched_sub_questions'] ?? [],
                $candidate['matched_sub_questions'] ?? []
            );
            $kept['matched_sub_questions'] = array_values(array_unique($combinedQuestions));

            // Keep the better of each score.
            if (($candidate['similarity'] ?? 0) > ($kept['similarity'] ?? 0)) {
                $kept['similarity'] = $candidate['similarity'];
            }
            if (($candidate['reranker_score'] ?? 0) > ($kept['reranker_score'] ?? 0)) {
                $kept['reranker_score'] = $candidate['reranker_score'];
            }

            $unique[$dedupeKey] = $kept;
        } else {
            $unique[$dedupeKey] = $candidate;
        }
    }

    $ranked = array_values($unique);
    usort(
        $ranked,
        static fn(array $left, array $right): int =>
            ($right['reranker_score'] ?? $right['similarity'] ?? 0)
            <=> ($left['reranker_score'] ?? $left['similarity'] ?? 0)
    );

    return array_slice($ranked, 0, $cap);
}
|
||
|
||
/**
 * Attach a 1-based citation number to every chunk.
 *
 * The number is stored under the key 'n' and is derived from the chunk's array
 * key plus one.
 *
 * @param array $chunks Ranked chunks.
 *
 * @return array Re-indexed list of the same chunks, each with an 'n' entry.
 */
private function numberSources(array $chunks): array
{
    return array_map(
        static function ($position, $source) {
            $source['n'] = $position + 1;
            return $source;
        },
        array_keys($chunks),
        array_values($chunks)
    );
}
|
||
|
||
/**
 * Grade how well the answer is backed by retrieved sources.
 *
 * Uses the number of sources and the best numeric score among them (the
 * reranker score when present, otherwise the similarity):
 *  - 'high'   : at least 6 sources and best score >= 0.5
 *  - 'medium' : at least 3 sources and best score >= 0.35
 *  - 'low'    : everything else, including an empty source list
 *
 * @param array $sources Numbered source rows.
 *
 * @return string 'high', 'medium' or 'low'.
 */
private function citationConfidence(array $sources): string
{
    if (!$sources) {
        return 'low';
    }

    $pickScore = static fn(array $source) => $source['reranker_score'] ?? $source['similarity'] ?? null;

    $numericScores = [];
    foreach ($sources as $source) {
        $score = $pickScore($source);
        if (is_numeric($score)) {
            $numericScores[] = $score;
        }
    }

    $topScore    = $numericScores ? max($numericScores) : 0;
    $sourceTotal = count($sources);

    return match (true) {
        $sourceTotal >= 6 && $topScore >= 0.5 => 'high',
        $sourceTotal >= 3 && $topScore >= 0.35 => 'medium',
        default => 'low',
    };
}
|
||
|
||
/**
 * Clamp user-supplied tuning controls to their allowed ranges.
 *
 * Ranges (default in parentheses): sub_q_count 3-5 (4), chunk_limit 4-10 (6),
 * similarity_threshold 0.2-0.6 (0.30), reranker_top_k 8-14 (12),
 * temperature 0.05-0.4 (0.15).
 *
 * @param array $controls Raw control values keyed by control name.
 *
 * @return array The five controls, cast and clamped.
 */
private function normalizeControls(array $controls): array
{
    // Restrict $value to the closed interval [$low, $high].
    $clamp = static fn($low, $high, $value) => max($low, min($high, $value));

    $subQuestionCount = (int)($controls['sub_q_count'] ?? 4);
    $chunkLimit       = (int)($controls['chunk_limit'] ?? 6);
    $threshold        = (float)($controls['similarity_threshold'] ?? 0.30);
    $rerankerTopK     = (int)($controls['reranker_top_k'] ?? 12);
    $temperature      = (float)($controls['temperature'] ?? 0.15);

    return [
        'sub_q_count' => $clamp(3, 5, $subQuestionCount),
        'chunk_limit' => $clamp(4, 10, $chunkLimit),
        'similarity_threshold' => $clamp(0.2, 0.6, $threshold),
        'reranker_top_k' => $clamp(8, 14, $rerankerTopK),
        'temperature' => $clamp(0.05, 0.4, $temperature),
    ];
}
|
||
|
||
/**
 * Fetch the 'family-legal' corpus package and verify the client may use it.
 *
 * Calls dbnToolsAbort() with status 503 when the package is missing or inactive,
 * or when the client has no active subscription to it.
 *
 * @param int $clientId Client whose subscription is checked.
 *
 * @return array The package row.
 */
private function requireFamilyPackage(int $clientId): array
{
    $familyPackage = dbnToolsFetchPackage('family-legal');

    $packageUsable = $familyPackage && !empty($familyPackage['is_active']);
    if (!$packageUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $packageId  = (int)$familyPackage['id'];
    $subscribed = dbnToolsHasActiveSubscription($clientId, $packageId);
    if (!$subscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $familyPackage;
}
|
||
|
||
/**
 * Build one trace entry.
 *
 * @param string $label  Short step name.
 * @param string $detail Description of what the step did.
 * @param string $status Step status; defaults to 'complete'.
 *
 * @return array Array with keys label, detail and status.
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    return compact('label', 'detail', 'status');
}
|
||
|
||
/**
 * Milliseconds elapsed since a microtime(true) timestamp.
 *
 * @param float $start Start time as returned by microtime(true).
 *
 * @return int Elapsed time rounded to whole milliseconds.
 */
private function elapsedMs(float $start): int
{
    $elapsedSeconds = microtime(true) - $start;

    return (int)round($elapsedSeconds * 1000);
}
|
||
}
|