diff --git a/advocate.php b/advocate.php index 2d8cded..d97bace 100644 --- a/advocate.php +++ b/advocate.php @@ -44,7 +44,7 @@ require_once __DIR__ . '/includes/layout.php';

Corpus slices

-

Select which slices the agent searches when building your case. All three legal slices are on by default.

+

Three core legal slices are on by default. Enable ECHR Article 9, Hague Convention, Norwegian Courts, Bufdir Guidance, or DBN Resources for more targeted research.

- + + + +
diff --git a/assets/js/advocate.js b/assets/js/advocate.js index e0877c2..b22f66e 100644 --- a/assets/js/advocate.js +++ b/assets/js/advocate.js @@ -8,10 +8,14 @@ let lastResult = null; const SLICE_DEFS = [ - { id: 'family_core', label: 'Family Law Core' }, - { id: 'child_welfare', label: 'Child Welfare' }, - { id: 'echr_hague', label: 'ECHR and Hague' }, - { id: 'broader_legal', label: 'Broader Legal Support' }, + { id: 'family_core', label: 'Family Law Core' }, + { id: 'child_welfare', label: 'Child Welfare' }, + { id: 'echr', label: 'ECHR' }, + { id: 'hague', label: 'Hague Convention' }, + { id: 'norwegian_courts', label: 'Norwegian Courts' }, + { id: 'bufdir_guidance', label: 'Bufdir Guidance' }, + { id: 'broader_legal', label: 'Broader Legal Support' }, + { id: 'dbn_resources', label: 'DBN Resources' }, ]; const STEP_LABELS = [ diff --git a/assets/js/deep-research.js b/assets/js/deep-research.js index 16abde6..7d8ff1b 100644 --- a/assets/js/deep-research.js +++ b/assets/js/deep-research.js @@ -8,10 +8,14 @@ let lastResult = null; const SLICE_DEFS = [ - { id: 'family_core', label: 'Family Law Core' }, - { id: 'child_welfare', label: 'Child Welfare' }, - { id: 'echr_hague', label: 'ECHR and Hague' }, - { id: 'broader_legal', label: 'Broader Legal Support' }, + { id: 'family_core', label: 'Family Law Core' }, + { id: 'child_welfare', label: 'Child Welfare' }, + { id: 'echr', label: 'ECHR' }, + { id: 'hague', label: 'Hague Convention' }, + { id: 'norwegian_courts', label: 'Norwegian Courts' }, + { id: 'bufdir_guidance', label: 'Bufdir Guidance' }, + { id: 'broader_legal', label: 'Broader Legal Support' }, + { id: 'dbn_resources', label: 'DBN Resources' }, ]; const STEP_LABELS = [ diff --git a/deep-research.php b/deep-research.php index 25df596..78ffc0f 100644 --- a/deep-research.php +++ b/deep-research.php @@ -24,7 +24,7 @@ require_once __DIR__ . '/includes/layout.php';

Corpus slices

-

Select which slices of the Do Better Norge legal corpus the agent searches. Toggle Broader Legal on when the question reaches beyond family law.

+

Three core legal slices are on by default. Enable Hague Convention, Norwegian Courts, Bufdir Guidance, or DBN Resources for more targeted research.

- + + + +
diff --git a/includes/DeepResearchAgent.php b/includes/DeepResearchAgent.php index f4d1f61..d880ff0 100644 --- a/includes/DeepResearchAgent.php +++ b/includes/DeepResearchAgent.php @@ -232,7 +232,7 @@ final class DbnDeepResearchAgent } $rawCorpusCount += count($corpusChunks); foreach ($corpusChunks as $chunk) { - if ($this->isWebsiteChunk($chunk)) { + if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) { $filteredOutCount++; continue; } @@ -666,20 +666,40 @@ PROMPT; } /** - * Defensive post-filter: drop any chunk that smells like a marketing-website hit - * (dobetternorge.no marketing pages have source_group 'website-primary'/'website-beta' - * but the chunk payload only carries `source_name` — use a name+title regex check). + * Post-retrieval filter: drop chunks that don't belong in a family-law research pass. + * + * EU AI Act (source_id=7, EUR-Lex) leaks through when the Qdrant search runs + * unconstrained (e.g. empty shared_doc_ids on slice-resolution failure). It is + * never relevant to Norwegian family law and is always excluded. + * + * DBN website pages (Resource Directory, Flashcards, etc.) are indexed with + * NULL source_id and score artificially high on broad queries. They are excluded + * unless the dbn_resources slice is explicitly ON. */ - private function isWebsiteChunk(array $chunk): bool + private function shouldExcludeChunk(array $chunk, array $activeSlices): bool { - $name = strtolower((string)($chunk['source_name'] ?? '')); + $name = strtolower((string)($chunk['source_name'] ?? '')); $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? '')); - if ($name === '') return false; - // Trusted shared-corpus packages do not contain the word 'website'. Marketing - // sources are explicitly labelled with source_group=website-primary/beta upstream. 
- if (str_contains($name, 'website')) return true; - if (str_contains($title, 'dobetternorge.no')) return true; - if (preg_match('/^(homepage|landing|about |contact )/i', $title)) return true; + $url = strtolower((string)($chunk['source_url'] ?? '')); + + // EU AI Act — never relevant to family law research + if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) return true; + if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) return true; + + // DBN website pages — allow through only when dbn_resources slice is ON + $isDbnPage = ( + str_contains($name, 'website') + || str_contains($title, 'dobetternorge.no') + || preg_match('/^(homepage|landing|about |contact )/i', $title) + || str_contains($title, 'resource directory') + || preg_match('/^flashcards?\s*[-–|]/i', $title) + || preg_match('/\|\s*do better norge\s*$/i', $title) + || preg_match('/[-–]\s*do better norge\s*$/i', $title) + ); + if ($isDbnPage) { + return !($activeSlices['dbn_resources'] ?? false); + } + return false; }