Expand corpus slices to 8: split ECHR/Hague, add Norwegian Courts, Bufdir, DBN Resources

- Replace combined echr_hague slice with echr (Art.8+9, HUDOC, NIM) and hague (INCADAT,
  cross-border abduction) as separate toggles; echr defaults ON, hague defaults OFF
- Add norwegian_courts slice: Domstol (src 5,26) + Rettspraksis.no (src 33, 482 docs)
- Add bufdir_guidance slice: Barneombudet (src 19), Bufdir (src 20), Statsforvalteren (src 31)
- Add dbn_resources slice: DBN website pages (flashcards, resource directory), defaults OFF
- Replace isWebsiteChunk() with slice-aware shouldExcludeChunk(): it always strips EU AI Act
  chunks (EUR-Lex source 7 leaks through when Qdrant runs unconstrained), and it strips DBN
  website pages unless the dbn_resources slice is explicitly ON
- Update SLICE_DEFS in advocate.js and deep-research.js to match all 8 slices
- Backward compat: echr_hague key in incoming requests fans out to echr+hague

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-15 16:01:05 +02:00
parent 464b8572d3
commit 7bccd8c010
5 changed files with 112 additions and 28 deletions
+32 -12
View File
@@ -232,7 +232,7 @@ final class DbnDeepResearchAgent
}
$rawCorpusCount += count($corpusChunks);
foreach ($corpusChunks as $chunk) {
if ($this->isWebsiteChunk($chunk)) {
if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
$filteredOutCount++;
continue;
}
@@ -666,20 +666,40 @@ PROMPT;
}
/**
* Defensive post-filter: drop any chunk that smells like a marketing-website hit
* (dobetternorge.no marketing pages have source_group 'website-primary'/'website-beta'
* but the chunk payload only carries `source_name` — use a name+title regex check).
* Post-retrieval filter: drop chunks that don't belong in a family-law research pass.
*
* EU AI Act (source_id=7, EUR-Lex) leaks through when the Qdrant search runs
* unconstrained (e.g. empty shared_doc_ids on slice-resolution failure). It is
* never relevant to Norwegian family law and is always excluded.
*
* DBN website pages (Resource Directory, Flashcards, etc.) are indexed with
* NULL source_id and score artificially high on broad queries. They are excluded
* unless the dbn_resources slice is explicitly ON.
*/
private function isWebsiteChunk(array $chunk): bool
private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
{
$name = strtolower((string)($chunk['source_name'] ?? ''));
$name = strtolower((string)($chunk['source_name'] ?? ''));
$title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
if ($name === '') return false;
// Trusted shared-corpus packages do not contain the word 'website'. Marketing
// sources are explicitly labelled with source_group=website-primary/beta upstream.
if (str_contains($name, 'website')) return true;
if (str_contains($title, 'dobetternorge.no')) return true;
if (preg_match('/^(homepage|landing|about |contact )/i', $title)) return true;
$url = strtolower((string)($chunk['source_url'] ?? ''));
// EU AI Act — never relevant to family law research
if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) return true;
if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) return true;
// DBN website pages — allow through only when dbn_resources slice is ON
$isDbnPage = (
str_contains($name, 'website')
|| str_contains($title, 'dobetternorge.no')
|| preg_match('/^(homepage|landing|about |contact )/i', $title)
|| str_contains($title, 'resource directory')
|| preg_match('/^flashcards?\s*[-|]/i', $title)
|| preg_match('/\|\s*do better norge\s*$/i', $title)
|| preg_match('/[-]\s*do better norge\s*$/i', $title)
);
if ($isDbnPage) {
return !($activeSlices['dbn_resources'] ?? false);
}
return false;
}