Files
dobetternorge-tools/includes/BvjAnalyzerAgent.php
T
daveadmin 0e167bf464 Integrate dbn-legal-agent-v2: upgrade all v1 refs + add Korrespond legal-check
- Replace dbn-legal-agent with dbn-legal-agent-v2 in bootstrap.php
  (dbnToolsRunLegalCheck), DeepResearchAgent.php (interpretSeed,
  expandQueries, synthesis fallback, deploy label), BvjAnalyzerAgent.php
  (check_model label) — 8 locations total
- Add dbn-legal-agent-v2 legal threshold check to KorrespondAgent:
  called after selfCheck() in both generate() and refine(); result
  surfaced as legal_check[] in the API response
- Render legal_check card in korrespond.js using existing bvj-red-flag
  styles; shows only when non-empty
- Add .korr-legal-check CSS block in tools.css

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 23:59:07 +02:00

1372 lines
65 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';
/**
* BVJ (Barnevernet) Analyzer Agent
*
* Standalone 7-step pipeline that:
* 1. Classifies the uploaded document and extracts metadata
* 2. Extracts all named parties with roles
* 3. Builds a chronological timeline of events
* 4. Generates partisan sub-questions for corpus RAG
* 5. Retrieves from the legal corpus (hybrid dense+BM25)
* 6. Synthesises an advocacy brief + procedural red flags
* 7. Assesses citation confidence
*
* Steps 1-3 always use azure_mini regardless of the user's engine choice.
* Step 6 (synthesis) uses the user's selected engine.
*/
final class DbnBvjAnalyzerAgent
{
// Per-file character cap applied with mb_substr() before uploaded text is used;
// run() caps the combined multi-file document at twice this value.
private const MAX_DOC_CHARS = 64000;
// Chunking parameters for uploaded text.
// NOTE(review): not referenced in the visible part of this file — presumably
// consumed by splitIntoChunks(); confirm there.
private const CHUNK_WORDS = 600;
private const CHUNK_OVERLAP_WORDS = 75;
private const MIN_CHUNK_WORDS = 50;
// Upper bound passed to mergeAndDedupe() for the merged retrieval pool.
private const POOL_CAP = 30;
// Steps 1-3 always use this engine — fast and cheap for structured extraction
// NOTE(review): no visible extraction step references this constant — confirm it is still used.
private const EXTRACT_ENGINE = 'azure_mini';
// Gateway used for the chat completions issued by the extraction and summary steps.
private DbnAzureOpenAiGateway $azure;
// In-memory upload index: list of ['meta' => chunk, 'vec' => embedding]; reset at the start of run().
private array $uploadVecs = [];
// Elapsed milliseconds keyed by step id; reset at the start of run().
private array $stepTimings = [];
/**
 * Creates the agent, building a default gateway when none is injected.
 *
 * @param DbnAzureOpenAiGateway|null $azure Optional pre-built gateway.
 */
public function __construct(?DbnAzureOpenAiGateway $azure = null)
{
    if ($azure === null) {
        $azure = new DbnAzureOpenAiGateway();
    }
    $this->azure = $azure;
}
/**
 * Main pipeline. At least 1 uploaded file is required.
 *
 * Runs, in order: document classification, party extraction, timeline
 * extraction, sub-question generation, slice resolution, upload indexing,
 * corpus retrieval, synthesis and citation-confidence scoring. Every step is
 * appended to the returned trace and, when $emit is given, streamed to it.
 *
 * @param array $uploadedFiles [{filename, text, chars, truncated}]
 * @param string $advocateRole Party the user represents
 * @param string $engine Affects synthesis only: azure_mini|azure_full|gpu (anything else becomes azure_mini)
 * @param string $language UI language code; normalised with dbnToolsNormalizeUiLanguage()
 * @param array $sliceSelection Corpus slice toggles
 * @param array $controls sub_q_count, chunk_limit, similarity_threshold, reranker_top_k, temperature
 * @param string $additionalNotes Optional user context to supplement the document
 * @param callable|null $emit function(string $event, array $payload): void
 * @return array Result payload with doc_meta, parties, timeline, advocacy brief fields,
 *               sub_questions, sources, trace, trace_metadata and disclaimer.
 */
public function run(
    array $uploadedFiles,
    string $advocateRole,
    string $engine,
    string $language,
    array $sliceSelection,
    array $controls,
    string $additionalNotes = '',
    ?callable $emit = null
): array {
    // Unknown engine names fall back to the default engine instead of failing.
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true)
        ? $engine : 'azure_mini';
    $language = dbnToolsNormalizeUiLanguage($language);
    $controls = $this->normalizeControls($controls);
    if (empty($uploadedFiles)) {
        dbnToolsAbort('Upload at least one BVJ document before running the analyzer.', 422, 'no_uploads');
    }
    $client = dbnToolsRequireClient();
    $package = $this->requireFamilyPackage((int)$client['id']);
    dbnToolsBootCaveau();
    $aiPortalRoot = dbnToolsAiPortalRoot();
    require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';

    // Per-run state: the agent instance may be reused, so start from a clean slate.
    $this->uploadVecs = [];
    $this->stepTimings = [];
    $trace = [];

    // Records a finished step in the trace and forwards it to the caller's stream.
    $emitStep = function (string $stepId, string $label, string $detail, string $status)
        use (&$trace, $emit): void {
        $trace[] = $this->trace($label, $detail, $status);
        if ($emit) {
            $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => $status]);
        }
    };
    // Announces a step as in-progress; not recorded in the trace.
    $emitRunning = function (string $stepId, string $label, string $detail = 'Running…') use ($emit): void {
        if ($emit) {
            $emit('step', ['step' => $stepId, 'label' => $label, 'detail' => $detail, 'status' => 'running']);
        }
    };

    // Build combined document text (first file is primary; additional files appended)
    $docText = '';
    foreach ($uploadedFiles as $idx => $file) {
        $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
        if ($text === '') continue;
        $filename = (string)($file['filename'] ?? sprintf('document-%d', $idx + 1));
        $docText .= ($docText !== '' ? "\n\n--- Document: {$filename} ---\n\n" : '') . $text;
    }
    if ($docText === '') {
        dbnToolsAbort('Could not extract text from the uploaded file(s).', 422, 'empty_document');
    }
    $docText = mb_substr($docText, 0, self::MAX_DOC_CHARS * 2, 'UTF-8');

    // ── STEP 1: Document classification ────────────────────────────────────
    $emitRunning('doc_classify', 'Document classification', 'Classifying document and extracting metadata…');
    $stepStart = microtime(true);
    $docMeta = $this->classifyDocument($docText, $language);
    $this->stepTimings['doc_classify'] = $this->elapsedMs($stepStart);
    if ($emit) {
        $emit('doc_meta', ['result' => $docMeta]);
    }
    // Join only the parts that are present so a missing authority or reference
    // does not leave a dangling " · " separator in the step detail.
    $classifyParts = [$docMeta['doc_type'] ?? 'BVJ Document'];
    if (!empty($docMeta['issuing_authority'])) {
        $classifyParts[] = $docMeta['issuing_authority'];
    }
    if (!empty($docMeta['reference_number'])) {
        $classifyParts[] = 'ref ' . $docMeta['reference_number'];
    }
    $emitStep('doc_classify', 'Document classification', implode(' · ', $classifyParts), 'complete');

    // ── STEP 2: Party extraction ────────────────────────────────────────────
    $emitRunning('party_extract', 'Party extraction', 'Identifying all named parties and their roles…');
    $stepStart = microtime(true);
    $parties = $this->extractParties($docText, $language);
    $this->stepTimings['party_extract'] = $this->elapsedMs($stepStart);
    if ($emit) {
        $emit('parties', ['parties' => $parties]);
    }
    $emitStep('party_extract', 'Party extraction',
        sprintf('%d %s identified.', count($parties), count($parties) === 1 ? 'party' : 'parties'),
        'complete');

    // ── STEP 3: Timeline extraction ─────────────────────────────────────────
    $emitRunning('timeline_extract', 'Timeline extraction', 'Building chronological event timeline…');
    $stepStart = microtime(true);
    $timelineEvents = $this->extractTimeline($docText, $language);
    $this->stepTimings['timeline_extract'] = $this->elapsedMs($stepStart);
    if ($emit) {
        $emit('timeline', ['events' => $timelineEvents]);
    }
    $highCount = count(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
    $emitStep('timeline_extract', 'Timeline extraction',
        sprintf('%d events extracted (%d high-significance).', count($timelineEvents), $highCount),
        'complete');

    // ── STEP 4: Sub-question generation ────────────────────────────────────
    $emitRunning('sub_question_gen', 'Sub-question generation',
        sprintf('Generating %d research angles for %s…', $controls['sub_q_count'], $advocateRole ?: 'selected role'));
    $stepStart = microtime(true);
    $subQuestions = $this->generateSubQuestions(
        $docMeta, $parties, $timelineEvents,
        $advocateRole, $controls['sub_q_count'], $language
    );
    $this->stepTimings['sub_question_gen'] = $this->elapsedMs($stepStart);
    $emitStep('sub_question_gen', 'Sub-question generation',
        sprintf('%d sub-questions generated for %s.', count($subQuestions), $advocateRole ?: 'selected role'),
        'complete');

    // ── STEP 5: Slice resolution + upload indexing + corpus retrieval ───────
    $emitRunning('slice_resolution', 'Slice resolution', 'Resolving corpus slice toggles…');
    $stepStart = microtime(true);
    $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
    if (!array_filter($sliceSelectionNormalized)) {
        dbnToolsAbort('Enable at least one corpus slice before running the analyzer.', 422, 'no_slices');
    }
    $ragDb = dbnToolsRagDb();
    try {
        $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
        $sliceDetail = sprintf('%d slice(s) active → %d candidate documents.',
            count(array_filter($sliceSelectionNormalized)), count($sharedDocIds));
        $sliceStatus = 'complete';
    } catch (Throwable $e) {
        // Best effort: a failed slice lookup degrades to an unconstrained search.
        error_log('BVJ slice resolve failed: ' . $e->getMessage());
        $sharedDocIds = [];
        $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
        $sliceStatus = 'warning';
    }
    $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
    $emitStep('slice_resolution', 'Slice resolution', $sliceDetail, $sliceStatus);

    // Upload indexing
    $emitRunning('upload_indexing', 'Upload indexing',
        sprintf('Chunking + embedding %d file(s)…', count($uploadedFiles)));
    $stepStart = microtime(true);
    $uploadChunks = [];
    foreach ($uploadedFiles as $idx => $file) {
        $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
        $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_DOC_CHARS, 'UTF-8');
        $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
    }
    $uploadStatus = 'complete';
    $uploadDetail = sprintf('%d file(s) → %d in-memory chunks indexed.', count($uploadedFiles), count($uploadChunks));
    if ($uploadChunks) {
        try {
            $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
            $textCount = count($texts);
            $allVecs = [];
            $batchSz = 5;
            for ($b = 0; $b < $textCount; $b += $batchSz) {
                $batch = array_slice($texts, $b, $batchSz);
                if ($emit) {
                    // The range needs a separator: "%d%d" rendered chunks 1-5 as "15".
                    $emit('progress', ['detail' => sprintf(
                        'Embedding chunks %d–%d of %d…',
                        $b + 1, $b + count($batch), $textCount
                    )]);
                }
                $allVecs = array_merge($allVecs, dbnToolsLiteLLMEmbedBatch($batch));
            }
            // Only index uploads when every chunk received exactly one vector.
            if (count($allVecs) === count($uploadChunks)) {
                foreach ($uploadChunks as $i => $chunk) {
                    $this->uploadVecs[] = ['meta' => $chunk, 'vec' => $allVecs[$i]];
                }
            } else {
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding count mismatch; uploaded chunks will not participate in retrieval.';
            }
        } catch (Throwable $e) {
            error_log('BVJ upload embed failed: ' . $e->getMessage());
            $uploadStatus = 'warning';
            $uploadDetail = 'Upload embedding timed out; corpus-only retrieval will run.';
            $this->uploadVecs = [];
        }
    }
    $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
    $emitStep('upload_indexing', 'Upload indexing', $uploadDetail, $uploadStatus);

    // Corpus retrieval (per sub-question)
    $retrievalQueries = $subQuestions ?: [[
        'id' => 'q1',
        'question' => sprintf(
            '%s case involving %s',
            $docMeta['doc_type'] ?? 'BVJ document',
            // Avoid a query that ends in "involving " when no role was given.
            $advocateRole !== '' ? $advocateRole : 'the affected party'
        ),
        'rationale' => 'Fallback query (sub-question generation returned empty).',
    ]];
    $emitRunning('retrieval', 'Corpus retrieval',
        sprintf('Hybrid vector + keyword across %d sub-question(s)…', count($retrievalQueries)));
    $stepStart = microtime(true);
    try {
        // NOTE(review): the endpoint URL is hard-coded here; consider moving it to configuration.
        $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
    } catch (Throwable $e) {
        // Log the cause like every other failure path before aborting the request.
        error_log('BVJ retrieval pipeline init failed: ' . $e->getMessage());
        dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
    }
    $rawPool = [];
    $retrievalWarnings = 0;
    $rawCorpusCount = 0;
    $rawUploadCount = 0;
    $filteredOutCount = 0;
    foreach ($retrievalQueries as $idx => $sq) {
        if ($emit) {
            $emit('subq', [
                'index' => $idx + 1,
                'total' => count($retrievalQueries),
                'id' => $sq['id'],
                'question' => $sq['question'],
            ]);
        }
        try {
            $corpusChunks = $rag->searchAll(
                $sq['question'],
                $controls['chunk_limit'],
                null,
                [
                    'search_private' => false,
                    'search_shared' => true,
                    'package_ids' => [(int)$package['id']],
                    'shared_doc_ids' => $sharedDocIds,
                    'chunk_limit' => $controls['chunk_limit'],
                    'search_method' => 'hybrid',
                    'reranker_enabled' => true,
                    'include_beta_website' => false,
                    'include_primary_website' => false,
                ]
            );
        } catch (Throwable $e) {
            // One failed sub-question must not sink the run; it is surfaced as a warning.
            error_log('BVJ sub-Q retrieval failed: ' . $e->getMessage());
            $corpusChunks = [];
            $retrievalWarnings++;
        }
        $rawCorpusCount += count($corpusChunks);
        foreach ($corpusChunks as $chunk) {
            if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
                $filteredOutCount++;
                continue;
            }
            $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
        }
        if (!empty($this->uploadVecs)) {
            $uploadHits = $this->retrieveFromUploads(
                $sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']
            );
            $rawUploadCount += count($uploadHits);
            foreach ($uploadHits as $hit) {
                $hit['matched_sub_questions'] = [$sq['id']];
                $rawPool[] = $hit;
            }
        }
    }
    $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
    $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
    $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
    $retrievalDetail = sprintf(
        '%d sub-Q(s) × hybrid → %d corpus (%d filtered) + %d upload → %d unique after dedupe.',
        count($retrievalQueries), $rawCorpusCount, $filteredOutCount, $rawUploadCount, count($merged)
    );
    $emitStep('retrieval', 'Corpus retrieval', $retrievalDetail, $retrievalStatus);
    $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
    $this->hydrateSourceUrls($synthesisPool);
    $numberedSources = $this->numberSources($synthesisPool);

    // Generate upload summaries for sources from uploaded files.
    // Map each upload-origin source to its file index first, so that only files
    // which actually surfaced as a source cost a summary request.
    $sourceFileIdx = [];
    foreach ($numberedSources as $srcKey => $src) {
        if (($src['source_origin'] ?? '') !== 'upload') continue;
        if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
            $sourceFileIdx[$srcKey] = (int)$m[1];
        }
    }
    $uploadSummaries = [];
    foreach (array_unique($sourceFileIdx) as $fileIdx) {
        $file = $uploadedFiles[$fileIdx] ?? null;
        if (!is_array($file)) continue;
        $text = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8');
        if ($text === '') continue;
        $filename = (string)($file['filename'] ?? "file-{$fileIdx}");
        try {
            $raw = $this->azure->chatText([
                ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
                ['role' => 'user', 'content' => "Summarise this BVJ document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"],
            ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
            $uploadSummaries[$fileIdx] = trim($raw);
        } catch (Throwable $e) {
            error_log('BVJ upload summary gen failed for file ' . $fileIdx . ': ' . $e->getMessage());
            $uploadSummaries[$fileIdx] = null;
        }
    }
    foreach ($sourceFileIdx as $srcKey => $fileIdx) {
        $numberedSources[$srcKey]['summary'] = $uploadSummaries[$fileIdx] ?? null;
    }

    $retrievalCounts = [
        'raw_corpus' => $rawCorpusCount,
        'filtered' => $filteredOutCount,
        'raw_upload' => $rawUploadCount,
        'after_dedupe' => count($merged),
        'after_topk' => count($numberedSources),
    ];

    // ── STEP 6: Synthesis ───────────────────────────────────────────────────
    $engineLabel = match ($engine) {
        'azure_full' => 'Azure gpt-4o',
        'gpu' => 'GPU qwen2.5:14b',
        default => 'Azure gpt-4o-mini',
    };
    $emitRunning('synthesis', 'Synthesis',
        sprintf('Synthesising advocacy brief with %s…', $engineLabel));
    $stepStart = microtime(true);
    $synthesis = $this->synthesiseBvj(
        $docText, $docMeta, $parties, $timelineEvents,
        $subQuestions, $numberedSources,
        $advocateRole, $engine, $language, $controls['temperature'], $additionalNotes,
        $emit
    );
    $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
    $emitStep('synthesis', 'Synthesis',
        sprintf('%s synthesised advocacy brief using %d source(s) + document.',
            $synthesis['deploy_label'], count($numberedSources)),
        'complete');

    // ── STEP 7: Confidence ──────────────────────────────────────────────────
    $confidence = $this->citationConfidence($numberedSources);
    $emitStep('confidence', 'Citation confidence',
        sprintf('%s confidence based on %d source(s).', ucfirst($confidence), count($numberedSources)),
        $confidence === 'low' ? 'warning' : 'complete');

    // Build sub-question output with top_sources
    $subQOut = [];
    foreach ($retrievalQueries as $sq) {
        $matchedChunks = array_values(array_filter(
            $numberedSources,
            fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
        ));
        $topSources = array_slice($matchedChunks, 0, 3);
        $subQOut[] = [
            'id' => $sq['id'],
            'question' => $sq['question'],
            'rationale' => $sq['rationale'] ?? '',
            'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
            'top_sources' => array_map(fn(array $s) => [
                'n' => $s['n'] ?? null,
                'title' => $s['title'] ?? '',
                'section' => $s['section'] ?? null,
                'deep_link' => $s['deep_link'] ?? $s['source_url'] ?? null,
                'source_url' => $s['source_url'] ?? null,
                'source_origin' => $s['source_origin'] ?? 'corpus',
                'authority_label' => $s['authority_label'] ?? null,
                'excerpt' => $s['excerpt'] ?? '',
            ], $topSources),
        ];
    }

    $synJson = $synthesis['json'];
    return [
        'tool' => 'bvj_analyzer',
        'language' => $language,
        'advocate_role' => $advocateRole,
        'doc_meta' => $docMeta,
        'parties' => $parties,
        'timeline' => ['events' => $timelineEvents],
        'advocacy_brief' => (string)($synJson['advocacy_brief'] ?? ''),
        'procedural_red_flags' => is_array($synJson['procedural_red_flags'] ?? null)
            ? $synJson['procedural_red_flags'] : [],
        'client_strengths' => is_array($synJson['client_strengths'] ?? null)
            ? $synJson['client_strengths'] : [],
        'opposing_weaknesses' => is_array($synJson['opposing_weaknesses'] ?? null)
            ? $synJson['opposing_weaknesses'] : [],
        'sub_questions' => $subQOut,
        'sources' => $numberedSources,
        'what_we_found' => (string)($synJson['what_we_found'] ?? ''),
        // Same array guard as the sibling list fields: the model may return a string here.
        'what_remains_uncertain' => is_array($synJson['what_remains_uncertain'] ?? null)
            ? $synJson['what_remains_uncertain'] : [],
        'next_practical_step' => (string)($synJson['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($merged),
            'source_count' => count($numberedSources),
            'sub_question_count' => count($retrievalQueries),
            'upload_chunk_count' => count($this->uploadVecs),
            'deployment' => $synthesis['deploy_label'],
            'engine_used' => $engine,
            'citation_confidence' => $confidence,
            'elapsed_ms_per_step' => $this->stepTimings,
            'retrieval_counts' => $retrievalCounts,
            'slices_active' => array_keys(array_filter($sliceSelectionNormalized)),
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
// ── Step 1: Document classification ──────────────────────────────────────
/**
 * Asks the model for document-level metadata from the first 6000 characters.
 *
 * Only the five known fields are kept and each is coerced to a trimmed string
 * (nested values are JSON-encoded), because callers concatenate and interpolate
 * these values directly. Any failure is logged and yields the defaults.
 *
 * @param string $docText Combined document text.
 * @param string $language UI language code used to pick the response language.
 * @return array{doc_type: string, doc_date: ?string, issuing_authority: ?string, reference_number: ?string, child_info: ?string}
 */
private function classifyDocument(string $docText, string $language): array
{
    $locale = dbnToolsLanguageName($language);
    $excerpt = mb_substr($docText, 0, 6000, 'UTF-8');
    $prompt = <<<PROMPT
    You are analysing a Norwegian child welfare (Barnevernet) document.
    Extract the following metadata from the document text below.
    Return JSON only in {$locale}:
    {
    "doc_type": "The document type as a short phrase, e.g. Bekymringsmelding, Vedtak, Omsorgsovertakelse, Fylkesnemnda-kjennelse, Rapport, or the detected type",
    "doc_date": "Primary document date in ISO 8601 format (YYYY-MM-DD) if identifiable, otherwise null",
    "issuing_authority": "Name of the issuing authority or institution, e.g. Trondheim kommune barneverntjeneste",
    "reference_number": "Case or document reference number if present, otherwise null",
    "child_info": "Brief description of the child(ren) involved, e.g. name and birth date if visible — anonymise if clearly redacted"
    }
    Rules:
    - If a field cannot be determined, use null.
    - doc_type should be the Norwegian term if recognisable (e.g. Bekymringsmelding), otherwise English.
    - Do not invent information not present in the text.
    Document text (first 6000 chars):
    {$excerpt}
    PROMPT;
    $default = [
        'doc_type' => 'BVJ Document',
        'doc_date' => null,
        'issuing_authority' => null,
        'reference_number' => null,
        'child_info' => null,
    ];
    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 400, 'timeout' => 30]);
        $json = $this->azure->decodeJsonObject($raw);
        if (is_array($json)) {
            $meta = $default;
            foreach (array_keys($default) as $field) {
                $value = $json[$field] ?? null;
                if (is_array($value) && $value !== []) {
                    // e.g. several children returned as a list: keep the information, but as text.
                    $encoded = json_encode($value, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
                    $value = $encoded === false ? null : $encoded;
                } elseif (is_int($value) || is_float($value)) {
                    $value = (string)$value;
                }
                if (!is_string($value)) continue;
                $value = trim($value);
                // Models sometimes spell a missing value as the literal string "null".
                if ($value === '' || strtolower($value) === 'null') continue;
                $meta[$field] = $value;
            }
            return $meta;
        }
    } catch (Throwable $e) {
        error_log('BVJ classifyDocument failed: ' . $e->getMessage());
    }
    return $default;
}
// ── Step 2: Party extraction ──────────────────────────────────────────────
/**
 * Asks the model for every named party in the first 20000 characters.
 *
 * Each returned row is guaranteed to be an array carrying the keys name, role,
 * organization and relationship_to_child (null when the model omitted them),
 * so downstream code can read those keys without guarding. Non-array rows are
 * dropped. Failures are logged and yield an empty list.
 *
 * @param string $docText Combined document text.
 * @param string $language UI language code used to pick the response language.
 * @return array<int, array<string, mixed>> At most 25 party rows.
 */
private function extractParties(string $docText, string $language): array
{
    $locale = dbnToolsLanguageName($language);
    $excerpt = mb_substr($docText, 0, 20000, 'UTF-8');
    $prompt = <<<PROMPT
    You are analysing a Norwegian child welfare (Barnevernet) document.
    Identify ALL named parties — every person or institution referred to by name or title.
    Respond in {$locale}. Return a JSON object with a single key "parties" containing an array of objects.
    Each object must have these four fields:
    - "name": full name or institution name (string)
    - "role": their role in the case, e.g. Biological mother, Biological father, Child, Barnevernarbeider, Saksbehandler, Leder, Melder, Politi, Lege, Psykolog, Advokat, Talsperson for barnet, Tilsynsfører, Sakkyndig, Foster carer (fosterforelder), Rusklinikk, Statsforvalter
    - "organization": employer or institution if mentioned, otherwise null
    - "relationship_to_child": relationship to the child in the document, e.g. Mother, Father, Sibling, Caseworker, Melder, Supervisor, or null
    Rules:
    - Include every named person and named institution — even peripheral ones.
    - Include Barnevernvakta (bvv) as an institution even if no individual caseworkers are named.
    - If a name appears to be redacted or anonymised (e.g. "mor", "far", "barnet", initials like "A.B."), include them with role inferred from context.
    - Do not invent parties not present in the text.
    - Maximum 25 parties.
    Document text:
    {$excerpt}
    PROMPT;

    // Drops malformed rows and fills the four documented keys.
    $normalise = static function (array $rows): array {
        $shape = ['name' => null, 'role' => null, 'organization' => null, 'relationship_to_child' => null];
        $parties = [];
        foreach ($rows as $row) {
            if (!is_array($row)) continue;
            // Array union keeps the model's values and only adds the missing keys.
            $parties[] = $row + $shape;
            if (count($parties) === 25) break;
        }
        return $parties;
    };

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 2000, 'timeout' => 45]);
        $json = $this->azure->decodeJsonObject($raw);
        if (is_array($json) && is_array($json['parties'] ?? null)) {
            return $normalise($json['parties']);
        }
        // Fallback: model returned an array at root level instead of {parties:[...]}
        if (is_array($json) && isset($json[0]['name'])) {
            return $normalise($json);
        }
        // mb_substr keeps the log excerpt valid UTF-8 (the text is typically Norwegian).
        error_log('BVJ extractParties unexpected structure: ' . mb_substr((string)$raw, 0, 300, 'UTF-8'));
    } catch (Throwable $e) {
        error_log('BVJ extractParties failed: ' . $e->getMessage());
    }
    return [];
}
// ── Step 3: Timeline extraction ───────────────────────────────────────────
/**
 * Asks the model for a chronological event list from the first 20000 characters.
 *
 * Each returned event is guaranteed to be an array carrying the keys date,
 * time_of_day, actor, action and significance (null when omitted), and a string
 * significance is lower-cased so the `=== 'high'` comparisons used elsewhere
 * also match "High" or "HIGH". Non-array rows are dropped. Failures are logged
 * and yield an empty list.
 *
 * @param string $docText Combined document text.
 * @param string $language UI language code used to pick the response language.
 * @return array<int, array<string, mixed>> At most 40 events, in the order the model returned them.
 */
private function extractTimeline(string $docText, string $language): array
{
    $locale = dbnToolsLanguageName($language);
    $excerpt = mb_substr($docText, 0, 20000, 'UTF-8');
    $prompt = <<<PROMPT
    Build a chronological timeline from this Norwegian child welfare (Barnevernet) document in {$locale}.
    Extract ALL dates, times, and temporal references — including phone calls, home visits, meetings, decisions, and assessments.
    IMPORTANT — Norwegian date and time formats to recognise:
    - DD.MM.YY (e.g. 18.07.20 = 2020-07-18)
    - DD.MM.YYYY (e.g. 18.07.2020)
    - D.M.YY (e.g. 6.1.20 = 2020-01-06)
    - DD.MM. (day and month without year — infer year from surrounding context)
    - Times: kl. HH:MM, klokken HH:MM, kl HH.MM
    - Diary/log format: lines beginning with a date or time are always events.
    - Two-digit years: interpret as 20YY (20 → 2020, 21 → 2021).
    Barnevernet-specific events that are ALWAYS high significance:
    - Akuttvedtak (emergency placement) under §4-6 or §4-25
    - Omsorgsovertakelse (care order) under §4-12
    - Police involvement or assistance (politibistand)
    - Formal decision (vedtak) or court order (kjennelse)
    - Deadline breaches: bekymringsmelding not processed within 7 days; investigation not opened within 6 weeks
    - Forhandlingsmøte (negotiation hearing) or Fylkesnemnda hearing
    - Supervised contact visits (samvær) being reduced or denied
    - Placement in foster care or institution (fosterhjem, institusjon)
    For each event provide:
    - "date": ISO 8601 date (YYYY-MM-DD) if determinable, otherwise best-effort description
    - "time_of_day": HH:MM if present, otherwise null
    - "actor": person, institution, or party involved
    - "action": concise description (≤ 80 chars) of what happened
    - "significance": high (acute measure, removal, police involvement, formal decision, statutory deadline breach) | medium (home visit, phone call, meeting, assessment) | low (minor update, note)
    Sort chronologically. Maximum 40 events.
    Document text:
    {$excerpt}
    Return JSON only:
    {
    "events": [{"date":"...","time_of_day":null,"actor":"...","action":"...","significance":"high|medium|low"}]
    }
    PROMPT;
    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 4000, 'timeout' => 55]);
        $json = $this->azure->decodeJsonObject($raw);
        if (is_array($json) && is_array($json['events'] ?? null)) {
            $shape = ['date' => null, 'time_of_day' => null, 'actor' => null, 'action' => null, 'significance' => null];
            $events = [];
            foreach ($json['events'] as $event) {
                if (!is_array($event)) continue;
                // Array union keeps the model's values and only adds the missing keys.
                $event += $shape;
                if (is_string($event['significance'])) {
                    $event['significance'] = strtolower(trim($event['significance']));
                }
                $events[] = $event;
                if (count($events) === 40) break;
            }
            return $events;
        }
    } catch (Throwable $e) {
        error_log('BVJ extractTimeline failed: ' . $e->getMessage());
    }
    return [];
}
// ── Step 4: Sub-question generation ──────────────────────────────────────
/**
 * Builds case-specific research questions for corpus retrieval.
 *
 * The prompt is seeded with the document metadata, up to 12 timeline events
 * (high-significance first) and up to 10 parties. The model's answer is
 * validated row by row; every returned question gets a unique id, because ids
 * are later used to tie retrieved sources back to their sub-question. When the
 * model call fails or yields nothing usable, a generic fallback list trimmed to
 * $count (at most five) is returned instead, so the result is never empty.
 *
 * @param array $docMeta Output of classifyDocument().
 * @param array $parties Output of extractParties().
 * @param array $timelineEvents Output of extractTimeline().
 * @param string $advocateRole Party the user represents; may be empty.
 * @param int $count Number of sub-questions requested.
 * @param string $language UI language code used to pick the response language.
 * @return array<int, array{id: string, question: string, rationale: string}>
 */
private function generateSubQuestions(
    array $docMeta,
    array $parties,
    array $timelineEvents,
    string $advocateRole,
    int $count,
    string $language
): array {
    $locale = dbnToolsLanguageName($language);
    $docType = $docMeta['doc_type'] ?? 'BVJ document';
    $docDate = $docMeta['doc_date'] ?? 'unknown date';
    $authority = $docMeta['issuing_authority'] ?? 'the municipality';
    $roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party';

    // Summarise high-significance events first, then others
    $highEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
    $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
    $topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 12);
    $eventSummary = '';
    foreach ($topEvents as $ev) {
        $sig = ($ev['significance'] ?? 'low') === 'high' ? '[HIGH] ' : '';
        $eventSummary .= sprintf("- %s %s%s (%s)\n",
            $ev['date'] ?? '?', $sig, $ev['action'] ?? '', $ev['actor'] ?? '');
    }

    // Summarise parties
    $partyList = '';
    foreach (array_slice($parties, 0, 10) as $p) {
        $org = !empty($p['organization']) ? ' at ' . $p['organization'] : '';
        $partyList .= sprintf("- %s (%s%s)\n", $p['name'] ?? '?', $p['role'] ?? '?', $org);
    }

    $angleGuidance = match (true) {
        $count >= 5 => <<<ANGLES
            Cover these five distinct legal angles (one per question):
            1. Statutory rights and obligations under Barnevernloven (e.g. §4-2, §4-6, §4-12) specific to the measures taken
            2. ECHR Article 8 proportionality and procedural safeguards — cite the specific measures and dates from this case
            3. Procedural obligations BVV must fulfil (advance notice, documentation, hearing rights) — anchor to documented events
            4. Bufdir/Statsforvalter guidance on investigation standards and thresholds for intervention
            5. Norwegian appellate court decisions on comparable measures and family circumstances
            ANGLES,
        $count === 4 => <<<ANGLES
            Cover these four distinct legal angles (one per question):
            1. Statutory rights under Barnevernloven anchored to the specific measures and dates in this case
            2. ECHR Article 8 proportionality of the specific intervention and any procedural violations
            3. BVV's procedural obligations — documentation, notice, and hearing rights — as evidenced by the timeline
            4. Bufdir guidance and Norwegian court decisions on comparable fact patterns
            ANGLES,
        default => <<<ANGLES
            Cover three distinct legal angles (one per question):
            1. Statutory rights under Barnevernloven for the specific type of measure documented
            2. ECHR Article 8 proportionality and procedural safeguards
            3. BVV's procedural obligations and whether the documented timeline shows any breach
            ANGLES,
    };

    $prompt = <<<PROMPT
    You are a Norwegian family-law research assistant building a case for: {$roleStr}.
    Case facts extracted from the uploaded document:
    - Document type: {$docType}
    - Date: {$docDate}
    - Issuing authority: {$authority}
    - Key events (chronological):
    {$eventSummary}
    - Key parties:
    {$partyList}
    Generate exactly {$count} sub-questions to search the Norwegian legal corpus for arguments that SUPPORT {$roleStr}'s position.
    {$angleGuidance}
    CRITICAL: Every question MUST embed specific facts from this case — use the actual authority name, document date, type of measure, and parties where relevant. Generic questions ("What are parental rights?") are useless for retrieval. Specific questions ("What notice requirements must {$authority} meet before issuing an emergency placement under Barnevernloven §4-6?") are highly effective.
    Return JSON only in {$locale}:
    {
    "sub_questions": [
    {"id":"q1","question":"...","rationale":"why this angle strengthens {$roleStr}'s position (≤ 120 chars)"}
    ]
    }
    Rules:
    - Exactly {$count} sub-questions.
    - Each question targets a DIFFERENT legal angle.
    - Include specific case details (authority, date, measure type) in each question.
    - Questions must be self-contained and answerable from Norwegian family-law, child-welfare, or ECHR sources.
    - Respond in {$locale}.
    PROMPT;

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.15, 'max_tokens' => 1000, 'timeout' => 40]);
        $json = $this->azure->decodeJsonObject($raw);
        if (is_array($json) && is_array($json['sub_questions'] ?? null) && count($json['sub_questions']) >= 1) {
            $sqs = [];
            $seenIds = [];
            foreach (array_slice($json['sub_questions'], 0, $count) as $sq) {
                if (!is_array($sq)) continue;
                $question = is_string($sq['question'] ?? null) ? trim($sq['question']) : '';
                if ($question === '') continue;
                $id = is_scalar($sq['id'] ?? null) ? trim((string)$sq['id']) : '';
                if ($id === '' || isset($seenIds[$id])) {
                    // A usable question with a missing or repeated id is kept under a
                    // fresh sequential id rather than dropped or conflated with a sibling.
                    $n = count($sqs) + 1;
                    do {
                        $id = 'q' . $n++;
                    } while (isset($seenIds[$id]));
                }
                $seenIds[$id] = true;
                $sqs[] = [
                    'id' => $id,
                    'question' => $question,
                    'rationale' => is_scalar($sq['rationale'] ?? null) ? (string)$sq['rationale'] : '',
                ];
            }
            if ($sqs) return $sqs;
        }
    } catch (Throwable $e) {
        error_log('BVJ generateSubQuestions failed: ' . $e->getMessage());
    }

    // Fallback: generic sub-questions, trimmed to the requested count
    $fallback = [
        ['id' => 'q1', 'question' => "What procedural rights does {$roleStr} have in Barnevernet proceedings under Barnevernloven?", 'rationale' => 'Procedural rights'],
        ['id' => 'q2', 'question' => "What does ECHR Article 8 require when child welfare authorities intervene in family life?", 'rationale' => 'ECHR Article 8'],
        ['id' => 'q3', 'question' => "What Bufdir guidance applies to the proportionality of Barnevernet interventions?", 'rationale' => 'Proportionality'],
        ['id' => 'q4', 'question' => "What are the documentation and notice obligations of BVV before taking acute measures?", 'rationale' => 'Documentation obligations'],
        ['id' => 'q5', 'question' => "What have Norwegian appellate courts decided on comparable Barnevernet measures and family circumstances?", 'rationale' => 'Comparable case law'],
    ];
    // max(1, …) keeps at least one retrieval query even for a non-positive count.
    return array_slice($fallback, 0, max(1, $count));
}
// ── Step 6: Synthesis ─────────────────────────────────────────────────────
/**
 * Step 6: synthesise the advocacy brief and procedural red flags.
 *
 * Builds one prompt from the document metadata, parties, timeline, numbered
 * sources and sub-questions, sends it to the selected engine, and decodes the
 * reply as a JSON object. If no sources were retrieved, a localized stub
 * result is returned without calling any model. If the reply cannot be decoded
 * into an object with a non-empty `advocacy_brief`, the raw text is returned
 * as the brief with placeholder values for the other fields.
 *
 * For the `azure_mini` / `azure_full` engines the brief is additionally passed
 * to dbnToolsRunLegalCheck(); any findings are appended to
 * `procedural_red_flags` and `check_model` is set.
 *
 * @param string        $docText         Full document text; only the first 8000 characters enter the prompt.
 * @param array         $docMeta         Step-1 metadata (doc_type, doc_date, issuing_authority, reference_number, child_info); every key is optional.
 * @param array         $parties         Extracted parties; the first 12 are summarised.
 * @param array         $timelineEvents  Timeline events; up to 20 are summarised, `significance === 'high'` first.
 * @param array         $subQuestions    Sub-questions, each with `id` and `question`.
 * @param array         $numberedSources Sources carrying an `n` index, as cited in the prompt.
 * @param string        $advocateRole    Party the brief argues for; empty falls back to "the affected party".
 * @param string        $engine          'gpu', 'azure_full', or anything else for the default Azure deployment.
 * @param string        $language        UI language code, used for the response locale.
 * @param float         $temperature     Sampling temperature passed to the model.
 * @param string        $additionalNotes Free-text advocate notes; empty omits the section.
 * @param callable|null $emit            Not used in this method body; kept for signature compatibility.
 *
 * @return array Two keys: 'json' (the decoded or fallback result) and 'deploy_label' (string).
 */
private function synthesiseBvj(
    string $docText,
    array $docMeta,
    array $parties,
    array $timelineEvents,
    array $subQuestions,
    array $numberedSources,
    string $advocateRole,
    string $engine,
    string $language,
    float $temperature,
    string $additionalNotes,
    ?callable $emit = null
): array {
    $locale = dbnToolsLanguageName($language);
    $roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party';
    $docType = $docMeta['doc_type'] ?? 'BVJ Document';
    $docDate = $docMeta['doc_date'] ?? 'unknown date';
    $authority = $docMeta['issuing_authority'] ?? 'unknown authority';
    // !empty() keeps the original truthiness test but tolerates a missing key.
    $refNo = !empty($docMeta['reference_number']) ? ' (ref ' . $docMeta['reference_number'] . ')' : '';
    $childInfo = $docMeta['child_info'] ?? 'not specified';
    $sourceCount = count($numberedSources);

    // Resolved once; used by both the early-return stub and the normal result.
    $deployLabel = match ($engine) {
        'gpu' => 'GPU (cuttlefish)',
        'azure_full' => 'gpt-4o',
        default => $this->azure->chatDeployment(),
    };

    if (empty($numberedSources)) {
        $emptyBrief = match (dbnToolsNormalizeUiLanguage($language)) {
            'no' => 'Ingen kildetreff ble funnet i korpuset for de valgte skivene og spørsmålene.',
            'uk' => 'Для вибраних розділів і підпитань не знайдено джерел у корпусі.',
            'pl' => 'Nie znaleziono źródeł w korpusie dla wybranych sekcji i pytań pomocniczych.',
            default => 'No corpus sources were retrieved for the selected slices and sub-questions.',
        };
        return [
            'json' => [
                'advocacy_brief' => $emptyBrief,
                'procedural_red_flags' => [],
                'client_strengths' => [],
                'opposing_weaknesses' => [],
                'what_we_found' => 'No retrieved sources passed the similarity threshold.',
                'what_remains_uncertain' => ['No corpus evidence retrieved — widen slice selection or try different sub-questions.'],
                'next_practical_step' => 'Enable more corpus slices (Norwegian Courts, Bufdir Guidance) and re-run.',
            ],
            'deploy_label' => $deployLabel,
        ];
    }

    // Build parties summary (first 12)
    $partiesSummary = '';
    foreach (array_slice($parties, 0, 12) as $i => $p) {
        $org = !empty($p['organization']) ? ' (' . $p['organization'] . ')' : '';
        $rel = !empty($p['relationship_to_child']) ? ' — rel: ' . $p['relationship_to_child'] : '';
        $partiesSummary .= sprintf("%d. %s — %s%s%s\n", $i + 1, $p['name'] ?? '', $p['role'] ?? '', $org, $rel);
    }

    // Build timeline summary (up to 20 events, high-significance ones first)
    $highEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'));
    $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'));
    $topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 20);
    $timelineSummary = '';
    foreach ($topEvents as $ev) {
        $time = !empty($ev['time_of_day']) ? ' kl.' . $ev['time_of_day'] : '';
        $timelineSummary .= sprintf("- %s%s [%s] %s: %s\n",
            $ev['date'] ?? '?', $time,
            strtoupper($ev['significance'] ?? 'low'),
            $ev['actor'] ?? '', $ev['action'] ?? '');
    }

    // Build sources text
    $sourcesContext = [];
    foreach ($numberedSources as $s) {
        $sourcesContext[] = sprintf(
            "[%d] (%s) %s%s\n Corpus: %s\n Authority: %s | Jurisdiction: %s\n Excerpt: %s",
            $s['n'],
            ($s['source_origin'] ?? 'corpus') === 'upload' ? 'uploaded doc' : 'corpus',
            $s['title'],
            !empty($s['section']) ? ' — ' . $s['section'] : '',
            $s['package_or_corpus'],
            $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
            $s['jurisdiction'] ?? 'n/a',
            $s['excerpt']
        );
    }
    $sourcesText = implode("\n\n", $sourcesContext);

    // Build sub-question text
    $subQText = '';
    if ($subQuestions) {
        $subQText = "\nSub-questions researched:\n";
        foreach ($subQuestions as $sq) {
            $subQText .= sprintf("- %s: %s\n", $sq['id'], $sq['question']);
        }
    }
    $notesSection = $additionalNotes !== ''
        ? "\n== ADDITIONAL CONTEXT FROM ADVOCATE ==\n{$additionalNotes}\n"
        : '';
    $docExcerpt = mb_substr($docText, 0, 8000, 'UTF-8');

    // The heredoc body and its closing marker stay at column 0 so the prompt text is unchanged.
    $prompt = <<<PROMPT
You are Do Better Norge Legal Tools. Produce a structured Barnevernet case analysis for: {$roleStr}.
HALLUCINATION RULES — READ FIRST:
- You may ONLY cite statute sections (§), ECHR article numbers, ECHR application numbers, case names, and Bufdir/Statsforvalter circular references that appear verbatim in the numbered corpus sources below.
- Do NOT cite statute sections, case names, or ECHR applications from your training memory — they may be misremembered or no longer in force.
- If no source supports a claim, omit the claim rather than invent support.
- Every factual legal claim in advocacy_brief MUST end with at least one [n] or [DOC] citation. Unsupported claims are a liability for the client.
Return valid JSON only. No markdown fences.
== DOCUMENT METADATA ==
Type: {$docType}{$refNo}
Date: {$docDate}
Issuing authority: {$authority}
Child: {$childInfo}
== KEY PARTIES ==
{$partiesSummary}
== TIMELINE (from document) ==
{$timelineSummary}
== CORPUS SOURCES ({$sourceCount} numbered — cite as [n]) ==
{$sourcesText}
{$notesSection}
{$subQText}
== DOCUMENT EXCERPT (first 8000 chars — cite as [DOC]) ==
{$docExcerpt}
== ADVOCACY BRIEF FORMAT ==
Write the advocacy_brief as a Markdown document with these sections:
## Case Overview
Summarise what happened: document type, issuing authority, key events from the timeline. Every factual statement must cite [DOC].
## {$roleStr}'s Core Legal Position
The strongest statutory and ECHR arguments in favour of {$roleStr}. Cite [n] for each legal point. Only cite statutes and cases that appear in the corpus sources above.
## Procedural Compliance Issues
Where BVV/the authority may have failed their own procedural obligations. Ground each point in a specific documented action from [DOC] and the applicable statute or guidance from [n].
## Client Strengths
3-6 factual and legal advantages for {$roleStr}, each anchored with [n] or [DOC].
## Counter-Arguments and Responses
The most likely opposing arguments and how to rebut them. Cite [n] for rebuttal sources.
## Recommended Next Steps
2-4 concrete legal actions {$roleStr} should take now.
End with one line: "*This brief is AI-assisted and for discussion purposes only — verify all legal references with a qualified Norwegian family-law lawyer.*"
Target length: 600-1000 words.
== JSON OUTPUT ==
{
"advocacy_brief": "<the Markdown brief following the format above>",
"procedural_red_flags": [
{
"description": "Concise description of the potential procedural violation",
"legal_basis": "Statute or ECHR article from a corpus source — e.g. Barnevernloven §4-2 [3]",
"severity": "high|medium|low",
"source_refs": ["[n]", "[DOC]"],
"what_to_check": "Exact document text or action to verify with a lawyer"
}
],
"client_strengths": ["3-6 items, each ending with [n] or [DOC]"],
"opposing_weaknesses": ["2-5 documented vulnerabilities in BVV or opposing position — OMIT if not supported by at least one [n]"],
"what_we_found": "2-sentence plain-language summary of the single most critical finding",
"what_remains_uncertain": ["3-5 specific information gaps or legal questions that need clarification"],
"next_practical_step": "The single most important concrete legal action for {$roleStr} to take within the next 7 days"
}
Rules:
- severity: high = likely violation of a codified statutory right or ECHR guarantee; medium = procedural irregularity; low = best-practice gap only.
- procedural_red_flags must be grounded in documented BVV actions visible in [DOC] or the timeline.
- If fewer than 2 corpus sources support opposing_weaknesses, return an empty array.
- Respond in {$locale}.
PROMPT;
    $sysPrompt = 'You return valid JSON only. No markdown fences. Every legal citation must come from the provided corpus sources, not from training memory.';
    $messages = [
        ['role' => 'system', 'content' => $sysPrompt],
        ['role' => 'user', 'content' => $prompt],
    ];
    $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 4500, 'timeout' => 240];

    $raw = '';
    try {
        if ($engine === 'gpu') {
            $response = dbnToolsCallGpuLlm($messages, $opts);
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'azure_full') {
            $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
        } else {
            $raw = $this->azure->chatText($messages, $opts);
        }
    } catch (Throwable $e) {
        // NOTE(review): presumably dbnToolsAbort() terminates the request — confirm; otherwise execution continues with $raw === ''.
        dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $json = $this->azure->decodeJsonObject($raw);
    if (!is_array($json) || empty($json['advocacy_brief'])) {
        // Unstructured reply: surface the raw text as the brief instead of failing.
        $json = [
            'advocacy_brief' => $raw,
            'procedural_red_flags' => [],
            'client_strengths' => [],
            'opposing_weaknesses' => [],
            'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
            'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
            'next_practical_step' => 'Review the brief manually before relying on it.',
        ];
    }

    // Step 6b: dbn-legal-agent targeted legal Q&A check (azure engines only).
    // Asks one focused question about the document's statutory basis to surface domain knowledge
    // that Azure reliably misses (klar nødvendighet threshold, Strand Lobben, fvl §17/§41).
    // NOTE(review): the original comment says this is silent on failure; that depends on
    // dbnToolsRunLegalCheck(), which is not visible here — confirm it never throws.
    if (in_array($engine, ['azure_mini', 'azure_full'], true)) {
        $checkFindings = dbnToolsRunLegalCheck(
            (string)($json['advocacy_brief'] ?? ''),
            $docType
        );
        if (!empty($checkFindings)) {
            if (!is_array($json['procedural_red_flags'] ?? null)) {
                $json['procedural_red_flags'] = [];
            }
            foreach ($checkFindings as $cf) {
                $json['procedural_red_flags'][] = $cf;
            }
            $json['check_model'] = 'dbn-legal-agent-v2';
        }
    }

    return ['json' => $json, 'deploy_label' => $deployLabel];
}
// ── GPU streaming helper (keeps browser connection alive during slow models) ──
/**
 * Call the LiteLLM chat-completions endpoint with streaming enabled and
 * return the accumulated assistant text.
 *
 * The response is parsed as server-sent events: every `data: {json}` line
 * contributes its `choices[0].delta.content` string. cURL hands the body to
 * the write callback in arbitrary-sized pieces, so a partial trailing line is
 * buffered and completed by the next piece instead of being discarded.
 *
 * $onProgress, when given, is invoked at most once every 15 seconds, both
 * when data arrives and from cURL's transfer-progress callback, so keepalives
 * are still sent while the model has not produced any output yet.
 *
 * @param array         $messages   Chat messages sent as the `messages` field.
 * @param array         $options    Optional keys: model, temperature, max_tokens, timeout (seconds), stop (array).
 * @param callable|null $onProgress Keepalive hook; called with no arguments.
 *
 * @return string The concatenated streamed text, trimmed.
 *
 * @throws RuntimeException When the payload cannot be encoded, cURL cannot be
 *                          initialised, the transfer fails, or the endpoint
 *                          answers with an HTTP status of 400 or above.
 */
private function callGpuLlmStream(array $messages, array $options, ?callable $onProgress): string
{
    $url = 'http://10.0.1.10:4000/v1/chat/completions';
    // NOTE(review): SECURITY — a credential is hard-coded as the fallback below. It is kept so
    // deployments without LITELLM_MASTER_KEY keep working, but it should be rotated and removed.
    $apiKey = (string)(dbnToolsEnv('LITELLM_MASTER_KEY') ?: 'sk-bnl-litellm-26xR9mK4qvN3wL8sTj7pB2d');
    $timeout = (int)($options['timeout'] ?? 660);

    $payload = [
        'model' => (string)($options['model'] ?? 'qwen2.5:14b'),
        'messages' => $messages,
        'temperature' => $options['temperature'] ?? 0.1,
        'max_tokens' => $options['max_tokens'] ?? 2800,
        'stream' => true,
    ];
    if (!empty($options['stop']) && is_array($options['stop'])) {
        $payload['stop'] = $options['stop'];
    }

    $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
    if ($body === false) {
        throw new RuntimeException('GPU stream request failed: could not encode payload: ' . json_last_error_msg());
    }

    $headers = [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ];

    $accumulated = '';
    $lineBuffer = '';   // unterminated tail of the previous write-callback chunk
    $responseHead = ''; // first bytes of the raw response, for HTTP error diagnostics
    $lastKeepalive = microtime(true);

    // Parse one complete SSE line and append its delta text, if it carries any.
    $consumeLine = static function (string $line) use (&$accumulated): void {
        $trimmed = ltrim($line);
        if (!str_starts_with($trimmed, 'data: ')) {
            return;
        }
        $json = substr($trimmed, 6);
        if (trim($json) === '[DONE]') {
            return;
        }
        $chunk = json_decode($json, true);
        if (!is_array($chunk)) {
            return;
        }
        $delta = $chunk['choices'][0]['delta']['content'] ?? '';
        if (is_string($delta) && $delta !== '') {
            $accumulated .= $delta;
        }
    };

    // Invoke the keepalive hook at most once per 15 seconds.
    $keepalive = static function () use (&$lastKeepalive, $onProgress): void {
        if ($onProgress === null) {
            return;
        }
        if (microtime(true) - $lastKeepalive >= 15.0) {
            $lastKeepalive = microtime(true);
            $onProgress();
            @flush();
        }
    };

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException('GPU stream request failed: could not initialise cURL');
    }

    curl_setopt_array($ch, [
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $body,
        CURLOPT_HTTPHEADER => $headers,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_RETURNTRANSFER => false,
        CURLOPT_WRITEFUNCTION => static function ($ch, $data) use (&$lineBuffer, &$responseHead, $consumeLine, $keepalive): int {
            if (strlen($responseHead) < 500) {
                $responseHead .= substr($data, 0, 500 - strlen($responseHead));
            }
            // Only lines terminated by "\n" are complete; keep the remainder for the next chunk.
            $lineBuffer .= $data;
            $lines = explode("\n", $lineBuffer);
            $lineBuffer = (string)array_pop($lines);
            foreach ($lines as $line) {
                $consumeLine($line);
            }
            $keepalive();
            // cURL treats any return value other than the chunk length as an error.
            return strlen($data);
        },
        // The progress callback also fires while no data is arriving (time to first token).
        CURLOPT_NOPROGRESS => false,
        CURLOPT_XFERINFOFUNCTION => static function (...$progress) use ($keepalive): int {
            $keepalive();
            return 0; // a non-zero return would abort the transfer
        },
    ]);

    curl_exec($ch);
    $curlErr = curl_error($ch);
    $httpStatus = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($curlErr !== '') {
        throw new RuntimeException('GPU stream request failed: ' . $curlErr);
    }
    if ($httpStatus >= 400) {
        throw new RuntimeException(
            'GPU stream request failed: HTTP ' . $httpStatus . ' ' . trim($responseHead)
        );
    }

    // The stream may end without a trailing newline; flush the last buffered line.
    if ($lineBuffer !== '') {
        $consumeLine($lineBuffer);
    }

    return trim($accumulated);
}
// ── Shared helpers (copied from DbnDeepResearchAgent) ────────────────────
/**
 * Split text into overlapping word windows.
 *
 * Whitespace is collapsed first. Windows are CHUNK_WORDS long and start every
 * (CHUNK_WORDS - CHUNK_OVERLAP_WORDS) words. A window shorter than
 * MIN_CHUNK_WORDS is kept only when it is the first one.
 *
 * @return array List of chunks with chunk_id, file_index, chunk_index, filename and text.
 */
private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
{
    $normalized = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
    if ($normalized === '') {
        return [];
    }

    $tokens = preg_split('/\s+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if (!$tokens) {
        return [];
    }

    // Distance between window starts; never less than one word.
    $stride = self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS;
    if ($stride < 1) {
        $stride = 1;
    }

    $result = [];
    $tokenCount = count($tokens);
    for ($offset = 0; $offset < $tokenCount; $offset += $stride) {
        $window = array_slice($tokens, $offset, self::CHUNK_WORDS);
        $windowSize = count($window);

        if ($windowSize >= self::MIN_CHUNK_WORDS || $offset === 0) {
            $sequence = count($result);
            $result[] = [
                'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $sequence),
                'file_index' => $fileIdx,
                'chunk_index' => $sequence,
                'filename' => $filename,
                'text' => implode(' ', $window),
            ];
        }

        // A short window means the end of the text was reached.
        if ($windowSize < self::CHUNK_WORDS) {
            break;
        }
    }

    return $result;
}
/**
 * Score the embedded upload chunks in $this->uploadVecs against a question.
 *
 * The question is embedded with dbnToolsLiteLLMEmbedBatch(); chunks whose
 * cosine similarity is below $threshold are dropped. The best
 * ceil($limitPerSubQ / 2) matches (at least one) are returned, highest
 * similarity first. An embedding failure is logged and yields an empty list.
 *
 * @return array Source rows with 'source_origin' set to 'upload'.
 */
private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
{
    if (empty($this->uploadVecs)) {
        return [];
    }

    try {
        $embedded = dbnToolsLiteLLMEmbedBatch([$question]);
        $queryVec = $embedded[0] ?? [];
    } catch (Throwable $e) {
        error_log('BVJ sub-Q embed failed: ' . $e->getMessage());
        return [];
    }
    if (empty($queryVec)) {
        return [];
    }

    $matches = [];
    foreach ($this->uploadVecs as $candidate) {
        $score = $this->cosineSim($queryVec, $candidate['vec']);
        if ($score < $threshold) {
            continue;
        }

        $meta = $candidate['meta'];
        $matches[] = [
            'chunk_id' => $meta['chunk_id'],
            'title' => 'uploaded: ' . $meta['filename'],
            'section' => null,
            'package_or_corpus' => 'Your upload',
            'excerpt' => dbnToolsExcerpt($meta['text'], 620),
            'chunk_text' => $meta['text'],
            'similarity' => round($score, 4),
            'reranker_score' => null,
            'document_id' => null,
            'source_origin' => 'upload',
            'authority_type' => null,
            'jurisdiction' => null,
        ];
    }

    // Best match first.
    usort($matches, static function (array $left, array $right): int {
        return $right['similarity'] <=> $left['similarity'];
    });

    // Uploads get half of the per-question budget, but never fewer than one slot.
    $quota = max(1, (int)ceil($limitPerSubQ / 2));

    return array_slice($matches, 0, $quota);
}
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first min(count($a), count($b)) components are compared. Returns
 * 0.0 when either vector is empty or has zero magnitude over that range.
 */
private function cosineSim(array $a, array $b): float
{
    $dims = min(count($a), count($b));
    if ($dims === 0) {
        return 0.0;
    }

    $dotProduct = 0.0;
    $sumSqA = 0.0;
    $sumSqB = 0.0;

    $idx = 0;
    while ($idx < $dims) {
        $left = (float)$a[$idx];
        $right = (float)$b[$idx];

        $dotProduct += $left * $right;
        $sumSqA += $left * $left;
        $sumSqB += $right * $right;

        $idx++;
    }

    // A zero-length vector has no direction; treat it as unrelated.
    if ($sumSqA === 0.0 || $sumSqB === 0.0) {
        return 0.0;
    }

    return $dotProduct / (sqrt($sumSqA) * sqrt($sumSqB));
}
/**
 * Map a raw corpus retrieval row to the source shape used by the pipeline.
 *
 * URL, deep-link, authority-label, corpus-source-name and publication-date
 * fields are initialised to null here. The row is tagged with the
 * sub-question that retrieved it.
 */
private function normalizeCorpusChunk(array $chunk, string $subQId): array
{
    $content = (string)($chunk['content'] ?? '');
    $heading = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source');
    $origin = (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal');

    // Optional integer / score fields become null when the row does not carry them.
    $intOrNull = static fn (string $key): ?int => isset($chunk[$key]) ? (int)$chunk[$key] : null;
    $scoreOrNull = static fn (string $key): ?float => isset($chunk[$key]) ? round((float)$chunk[$key], 4) : null;

    return [
        'chunk_id' => $intOrNull('id'),
        'title' => $heading,
        'section' => $chunk['section_title'] ?? null,
        'package_or_corpus' => $origin,
        'excerpt' => dbnToolsExcerpt($content, 620),
        'chunk_text' => $content,
        'similarity' => $scoreOrNull('similarity'),
        'reranker_score' => $scoreOrNull('reranker_score'),
        'document_id' => $intOrNull('document_id'),
        'source_origin' => 'corpus',
        'authority_type' => $chunk['authority_type'] ?? null,
        'jurisdiction' => $chunk['jurisdiction'] ?? null,
        'publication_year' => $chunk['publication_year'] ?? null,
        'source_url' => null,
        'deep_link' => null,
        'authority_label' => null,
        'corpus_source_name' => null,
        'publication_date' => null,
        'matched_sub_questions' => [$subQId],
    ];
}
/**
 * Decide whether a retrieved chunk must be dropped from the pool.
 *
 * EU AI Act material is always excluded. Chunks that look like Do Better
 * Norge website pages are excluded unless the 'dbn_resources' slice is
 * active. Everything else is kept.
 */
private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
{
    $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
    $url = strtolower((string)($chunk['source_url'] ?? ''));
    $name = strtolower((string)($chunk['source_name'] ?? ''));

    // EU AI Act documents, recognised by title or by their EUR-Lex URL.
    if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) {
        return true;
    }
    if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) {
        return true;
    }

    // Website pages: plain substring markers first, then title patterns.
    $looksLikeSitePage = str_contains($name, 'website')
        || str_contains($title, 'dobetternorge.no')
        || str_contains($title, 'resource directory');

    if (!$looksLikeSitePage) {
        $titlePatterns = [
            '/^(homepage|landing|about |contact )/i',
            '/^flashcards?\s*[-|]/i',
            '/\|\s*do better norge\s*$/i',
            '/[-]\s*do better norge\s*$/i',
        ];
        foreach ($titlePatterns as $pattern) {
            if (preg_match($pattern, $title)) {
                $looksLikeSitePage = true;
                break;
            }
        }
    }

    if (!$looksLikeSitePage) {
        return false;
    }

    return !($activeSlices['dbn_resources'] ?? false);
}
/**
 * Enrich corpus chunks in $pool, in place, with document-level metadata.
 *
 * For every chunk whose source_origin is 'corpus' and that carries a positive
 * document_id, this sets source_url, deep_link, authority_label,
 * corpus_source_name, publication_date and summary from the `documents` row.
 * Documents without a stored summary get one generated through
 * $this->azure->chatText() and written back to the `documents` table.
 *
 * Any Throwable raised while loading metadata is logged and the pool is left
 * untouched; a failed summary generation is logged and only skips that document.
 *
 * @param array $pool Chunk list, modified by reference.
 */
private function hydrateSourceUrls(array &$pool): void
{
// Collect the distinct document ids referenced by corpus chunks (array keys dedupe them).
$docIds = [];
foreach ($pool as $chunk) {
if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
$docId = (int)($chunk['document_id'] ?? 0);
if ($docId > 0) $docIds[$docId] = true;
}
if (empty($docIds)) return;
try {
$ragDb = dbnToolsRagDb();
$ids = array_keys($docIds);
// One positional placeholder per id; the ids themselves are bound in execute().
$ph = implode(',', array_fill(0, count($ids), '?'));
$stmt = $ragDb->prepare("
SELECT d.id, d.title, d.source_url, d.authority_type,
d.publication_date, d.source_id, d.jurisdiction,
d.summary, LEFT(d.content, 4000) AS content_excerpt
FROM documents d
WHERE d.id IN ({$ph})
");
$stmt->execute($ids);
$docMeta = [];
$sourceIds = [];
foreach ($stmt as $row) {
$dId = (int)$row['id'];
$sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
if ($sid) $sourceIds[] = $sid;
$docMeta[$dId] = [
'source_url' => $row['source_url'] ?? null,
'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null),
'publication_date' => $row['publication_date'] ?? null,
// Default name; replaced below when the corpus_sources lookup finds a match.
'corpus_source_name' => 'Do Better Legal',
'source_id' => $sid,
'summary' => $row['summary'] ?? null,
'content_excerpt' => (string)($row['content_excerpt'] ?? ''),
'title' => (string)($row['title'] ?? ''),
];
}
// Generate and persist a summary for documents that have content but no stored summary.
$unsummarized = array_filter($docMeta, fn($m) => $m['summary'] === null && $m['content_excerpt'] !== '');
foreach ($unsummarized as $dId => $m) {
try {
$raw = $this->azure->chatText([
['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
['role' => 'user', 'content' => "Summarise this Norwegian family law document.\nFocus on: legal provisions covered, authority type, and questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"],
], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
$summary = trim($raw);
if ($summary !== '') {
$ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$summary, $dId]);
$docMeta[$dId]['summary'] = $summary;
}
} catch (Throwable $e) {
// Best effort: a failed summary must not block hydration of the other documents.
error_log('BVJ hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage());
}
}
if (!empty($sourceIds)) {
$uSids = array_values(array_unique($sourceIds));
$sPh = implode(',', array_fill(0, count($uSids), '?'));
// NOTE(review): corpus_sources is read through dbnToolsDb(), not the $ragDb handle used above — presumably a different database; confirm.
$sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
$sStmt->execute($uSids);
$srcNames = [];
foreach ($sStmt as $row) {
$srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
}
foreach ($docMeta as &$m) {
if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
$m['corpus_source_name'] = $srcNames[$m['source_id']];
}
}
// Break the reference so the later `$m = ...` assignment cannot overwrite the last $docMeta entry.
unset($m);
}
} catch (Throwable $e) {
// Metadata lookup failed: log and leave every chunk as it was.
error_log('BVJ hydrateSourceUrls failed: ' . $e->getMessage());
return;
}
// Copy the collected metadata onto each matching corpus chunk.
foreach ($pool as &$chunk) {
if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
$docId = (int)($chunk['document_id'] ?? 0);
if (!$docId || !isset($docMeta[$docId])) continue;
$m = $docMeta[$docId];
$sourceUrl = $m['source_url'] ?? null;
$chunk['source_url'] = $sourceUrl;
$chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
$chunk['authority_label'] = $m['authority_label'] ?? $chunk['authority_label'];
$chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null;
$chunk['publication_date'] = $m['publication_date'] ?? null;
$chunk['summary'] = $m['summary'] ?? null;
}
unset($chunk);
}
/**
 * Build a link that points as close to the cited section as possible.
 *
 * For lovdata.no URLs whose section title contains a paragraph reference
 * ("§ 17", "§ 17a", "§ 4-2"), the reference is appended as "/§<number>".
 * Hyphenated section numbers are captured in full; previously "§ 4-2" was
 * cut to "4-". A non-breaking space between "§" and the number is accepted.
 * Any other URL is returned trimmed.
 *
 * @param string|null $sourceUrl    Document URL; null or blank yields null.
 * @param string|null $sectionTitle Section heading of the chunk, if any.
 *
 * @return string|null The deep link, the trimmed URL, or null when no URL is available.
 */
private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
{
    if (!$sourceUrl) {
        return null;
    }
    $sourceUrl = trim($sourceUrl);
    if ($sourceUrl === '') {
        return null;
    }

    $isLovdata = preg_match('~^https?://lovdata\.no/~i', $sourceUrl) === 1;
    if ($isLovdata && $sectionTitle) {
        // digits, optional "-digits" groups (chapter-section numbering), optional letter suffix
        $sectionPattern = '/§[\s\x{A0}]*(\d+(?:-\d+)*[A-Za-z]?)/u';
        if (preg_match($sectionPattern, $sectionTitle, $m) === 1) {
            return rtrim($sourceUrl, '/') . '/§' . $m[1];
        }
    }

    return $sourceUrl;
}
/**
 * Collapse duplicate chunks, rank the result and cap it.
 *
 * Chunks are identified by source origin plus chunk id; a chunk without an
 * id gets a random key and is therefore never merged. When a duplicate is
 * seen, the sub-question lists are unioned and the higher similarity and
 * reranker score are kept. The result is sorted by reranker score (falling
 * back to similarity), best first, and cut to $cap entries.
 */
private function mergeAndDedupe(array $rawPool, int $cap): array
{
    $unique = [];

    foreach ($rawPool as $candidate) {
        $origin = $candidate['source_origin'] ?? 'corpus';
        $ident = $candidate['chunk_id'] ?? bin2hex(random_bytes(4));
        $key = $origin . ':' . $ident;

        if (!isset($unique[$key])) {
            $unique[$key] = $candidate;
            continue;
        }

        $kept = $unique[$key];

        $combined = array_merge(
            $kept['matched_sub_questions'] ?? [],
            $candidate['matched_sub_questions'] ?? []
        );
        $kept['matched_sub_questions'] = array_values(array_unique($combined));

        // Keep the stronger of the two values for each score field.
        foreach (['similarity', 'reranker_score'] as $metric) {
            if (($candidate[$metric] ?? 0) > ($kept[$metric] ?? 0)) {
                $kept[$metric] = $candidate[$metric];
            }
        }

        $unique[$key] = $kept;
    }

    $ranked = array_values($unique);
    usort(
        $ranked,
        static fn (array $a, array $b): int =>
            ($b['reranker_score'] ?? $b['similarity'] ?? 0) <=> ($a['reranker_score'] ?? $a['similarity'] ?? 0)
    );

    return array_slice($ranked, 0, $cap);
}
/**
 * Attach a 1-based citation number 'n' to every chunk, following the order
 * of the input, and return the chunks as a re-indexed list.
 */
private function numberSources(array $chunks): array
{
    return array_map(
        static function ($source, $position) {
            $source['n'] = $position + 1;
            return $source;
        },
        $chunks,
        array_keys($chunks)
    );
}
/**
 * Grade the retrieved evidence as 'high', 'medium' or 'low'.
 *
 * The grade combines the number of sources with the best score among them,
 * where a source's score is its reranker score or, failing that, its
 * similarity: 6+ sources and a best score of 0.5+ is 'high'; 3+ sources and
 * 0.35+ is 'medium'; anything else is 'low'.
 */
private function citationConfidence(array $sources): string
{
    if (!$sources) {
        return 'low';
    }

    $candidates = array_map(
        static fn (array $source) => $source['reranker_score'] ?? $source['similarity'] ?? null,
        $sources
    );
    $numeric = array_values(array_filter($candidates, static fn ($value): bool => is_numeric($value)));

    $top = $numeric ? max($numeric) : 0;
    $total = count($sources);

    return match (true) {
        $total >= 6 && $top >= 0.5 => 'high',
        $total >= 3 && $top >= 0.35 => 'medium',
        default => 'low',
    };
}
/**
 * Clamp user-supplied tuning controls to their allowed ranges, substituting
 * the default for any control that is missing.
 *
 * @return array sub_q_count (3-5), chunk_limit (4-10), similarity_threshold
 *               (0.2-0.6), reranker_top_k (8-14) and temperature (0.05-0.4).
 */
private function normalizeControls(array $controls): array
{
    $boundedInt = static fn (string $key, int $fallback, int $floor, int $ceiling): int
        => max($floor, min($ceiling, (int)($controls[$key] ?? $fallback)));

    $boundedFloat = static fn (string $key, float $fallback, float $floor, float $ceiling): float
        => max($floor, min($ceiling, (float)($controls[$key] ?? $fallback)));

    return [
        'sub_q_count' => $boundedInt('sub_q_count', 4, 3, 5),
        'chunk_limit' => $boundedInt('chunk_limit', 6, 4, 10),
        'similarity_threshold' => $boundedFloat('similarity_threshold', 0.30, 0.2, 0.6),
        'reranker_top_k' => $boundedInt('reranker_top_k', 12, 8, 14),
        'temperature' => $boundedFloat('temperature', 0.15, 0.05, 0.4),
    ];
}
/**
 * Load the 'family-legal' corpus package and check that it can be used.
 *
 * Calls dbnToolsAbort() with status 503 when the package is missing or
 * inactive, or when the client has no active subscription to it.
 *
 * @return array The package record.
 */
private function requireFamilyPackage(int $clientId): array
{
    $familyPackage = dbnToolsFetchPackage('family-legal');

    $isUsable = $familyPackage && !empty($familyPackage['is_active']);
    if (!$isUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $isSubscribed = dbnToolsHasActiveSubscription($clientId, (int)$familyPackage['id']);
    if (!$isSubscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $familyPackage;
}
/**
 * Build one pipeline trace entry with the keys label, detail and status.
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    return compact('label', 'detail', 'status');
}
/**
 * Milliseconds elapsed since $start, rounded to the nearest integer.
 *
 * @param float $start A timestamp previously taken with microtime(true).
 */
private function elapsedMs(float $start): int
{
    $elapsedSeconds = microtime(true) - $start;

    return (int)round($elapsedSeconds * 1000);
}
}