From 7bccd8c010a8417e3a89fca8d73e6d108a157ca0 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Fri, 15 May 2026 16:01:05 +0200 Subject: [PATCH] Expand corpus slices to 8: split ECHR/Hague, add Norwegian Courts, Bufdir, DBN Resources - Replace combined echr_hague slice with echr (Art.8+9, HUDOC, NIM) and hague (INCADAT, cross-border abduction) as separate toggles; echr defaults ON, hague defaults OFF - Add norwegian_courts slice: Domstol (src 5,26) + Rettspraksis.no (src 33, 482 docs) - Add bufdir_guidance slice: Barneombudet (19), Bufdir (20), Statsforvalteren (31) - Add dbn_resources slice: DBN website pages (flashcards, resource directory), defaults OFF - Replace isWebsiteChunk() with slice-aware shouldExcludeChunk(): always strips EU AI Act chunks (EUR-Lex source 7 leaks through when Qdrant runs unconstrained) and DBN website pages unless dbn_resources slice is explicitly ON - Update SLICE_DEFS in advocate.js and deep-research.js to match all 8 slices - Backward compat: echr_hague key in incoming requests fans out to echr+hague Co-Authored-By: Claude Sonnet 4.6 --- advocate.php | 36 ++++++++++++++++++++++++---- assets/js/advocate.js | 12 ++++++---- assets/js/deep-research.js | 12 ++++++---- deep-research.php | 36 ++++++++++++++++++++++++---- includes/DeepResearchAgent.php | 44 ++++++++++++++++++++++++---------- 5 files changed, 112 insertions(+), 28 deletions(-) diff --git a/advocate.php b/advocate.php index 2d8cded..d97bace 100644 --- a/advocate.php +++ b/advocate.php @@ -44,7 +44,7 @@ require_once __DIR__ . '/includes/layout.php';

Corpus slices

-

Select which slices the agent searches when building your case. All three legal slices are on by default.

+

Three core legal slices are on by default. Enable ECHR Article 9, Hague Convention, Norwegian Courts, Bufdir guidance, or DBN Resources for more targeted research.

- + + + +
diff --git a/assets/js/advocate.js b/assets/js/advocate.js index e0877c2..b22f66e 100644 --- a/assets/js/advocate.js +++ b/assets/js/advocate.js @@ -8,10 +8,14 @@ let lastResult = null; const SLICE_DEFS = [ - { id: 'family_core', label: 'Family Law Core' }, - { id: 'child_welfare', label: 'Child Welfare' }, - { id: 'echr_hague', label: 'ECHR and Hague' }, - { id: 'broader_legal', label: 'Broader Legal Support' }, + { id: 'family_core', label: 'Family Law Core' }, + { id: 'child_welfare', label: 'Child Welfare' }, + { id: 'echr', label: 'ECHR' }, + { id: 'hague', label: 'Hague Convention' }, + { id: 'norwegian_courts', label: 'Norwegian Courts' }, + { id: 'bufdir_guidance', label: 'Bufdir Guidance' }, + { id: 'broader_legal', label: 'Broader Legal Support' }, + { id: 'dbn_resources', label: 'DBN Resources' }, ]; const STEP_LABELS = [ diff --git a/assets/js/deep-research.js b/assets/js/deep-research.js index 16abde6..7d8ff1b 100644 --- a/assets/js/deep-research.js +++ b/assets/js/deep-research.js @@ -8,10 +8,14 @@ let lastResult = null; const SLICE_DEFS = [ - { id: 'family_core', label: 'Family Law Core' }, - { id: 'child_welfare', label: 'Child Welfare' }, - { id: 'echr_hague', label: 'ECHR and Hague' }, - { id: 'broader_legal', label: 'Broader Legal Support' }, + { id: 'family_core', label: 'Family Law Core' }, + { id: 'child_welfare', label: 'Child Welfare' }, + { id: 'echr', label: 'ECHR' }, + { id: 'hague', label: 'Hague Convention' }, + { id: 'norwegian_courts', label: 'Norwegian Courts' }, + { id: 'bufdir_guidance', label: 'Bufdir Guidance' }, + { id: 'broader_legal', label: 'Broader Legal Support' }, + { id: 'dbn_resources', label: 'DBN Resources' }, ]; const STEP_LABELS = [ diff --git a/deep-research.php b/deep-research.php index 25df596..78ffc0f 100644 --- a/deep-research.php +++ b/deep-research.php @@ -24,7 +24,7 @@ require_once __DIR__ . '/includes/layout.php';

Corpus slices

-

Select which slices of the Do Better Norge legal corpus the agent searches. Toggle Broader Legal on when the question reaches beyond family law.

+

Three core legal slices are on by default. Enable Hague Convention, Norwegian Courts, Bufdir guidance, or DBN Resources for more targeted research.

- + + + +
diff --git a/includes/DeepResearchAgent.php b/includes/DeepResearchAgent.php index f4d1f61..d880ff0 100644 --- a/includes/DeepResearchAgent.php +++ b/includes/DeepResearchAgent.php @@ -232,7 +232,7 @@ final class DbnDeepResearchAgent } $rawCorpusCount += count($corpusChunks); foreach ($corpusChunks as $chunk) { - if ($this->isWebsiteChunk($chunk)) { + if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) { $filteredOutCount++; continue; } @@ -666,20 +666,40 @@ PROMPT; } /** - * Defensive post-filter: drop any chunk that smells like a marketing-website hit - * (dobetternorge.no marketing pages have source_group 'website-primary'/'website-beta' - * but the chunk payload only carries `source_name` — use a name+title regex check). + * Post-retrieval filter: drop chunks that don't belong in a family-law research pass. + * + * EU AI Act (source_id=7, EUR-Lex) leaks through when the Qdrant search runs + * unconstrained (e.g. empty shared_doc_ids on slice-resolution failure). It is + * never relevant to Norwegian family law and is always excluded. + * + * DBN website pages (Resource Directory, Flashcards, etc.) are indexed with + * NULL source_id and score artificially high on broad queries. They are excluded + * unless the dbn_resources slice is explicitly ON. */ - private function isWebsiteChunk(array $chunk): bool + private function shouldExcludeChunk(array $chunk, array $activeSlices): bool { - $name = strtolower((string)($chunk['source_name'] ?? '')); + $name = strtolower((string)($chunk['source_name'] ?? '')); $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? '')); - if ($name === '') return false; - // Trusted shared-corpus packages do not contain the word 'website'. Marketing - // sources are explicitly labelled with source_group=website-primary/beta upstream. - if (str_contains($name, 'website')) return true; - if (str_contains($title, 'dobetternorge.no')) return true; - if (preg_match('/^(homepage|landing|about |contact )/i', $title)) return true; + $url = strtolower((string)($chunk['source_url'] ?? '')); + + // EU AI Act — never relevant to family law research + if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) return true; + if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) return true; + + // DBN website pages — allow through only when dbn_resources slice is ON + $isDbnPage = ( + str_contains($name, 'website') + || str_contains($title, 'dobetternorge.no') + || preg_match('/^(homepage|landing|about |contact )/i', $title) + || str_contains($title, 'resource directory') + || preg_match('/^flashcards?\s*[-–|]/i', $title) + || preg_match('/\|\s*do better norge\s*$/i', $title) + || preg_match('/[-–]\s*do better norge\s*$/i', $title) + ); + if ($isDbnPage) { + return !($activeSlices['dbn_resources'] ?? false); + } + return false; }