Expand corpus slices to 8: split ECHR/Hague, add Norwegian Courts, Bufdir, DBN Resources
- Replace combined echr_hague slice with echr (Art.8+9, HUDOC, NIM) and hague (INCADAT, cross-border abduction) as separate toggles; echr defaults ON, hague defaults OFF
- Add norwegian_courts slice: Domstol (src 5,26) + Rettspraksis.no (src 33, 482 docs)
- Add bufdir_guidance slice: Barneombudet (19), Bufdir (20), Statsforvalteren (31)
- Add dbn_resources slice: DBN website pages (flashcards, resource directory), defaults OFF
- Replace isWebsiteChunk() with slice-aware shouldExcludeChunk(): always strips EU AI Act chunks (EUR-Lex source 7 leaks through when Qdrant runs unconstrained) and DBN website pages unless dbn_resources slice is explicitly ON
- Update SLICE_DEFS in advocate.js and deep-research.js to match all 8 slices
- Backward compat: echr_hague key in incoming requests fans out to echr+hague

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -232,7 +232,7 @@ final class DbnDeepResearchAgent
|
||||
}
|
||||
$rawCorpusCount += count($corpusChunks);
|
||||
foreach ($corpusChunks as $chunk) {
|
||||
if ($this->isWebsiteChunk($chunk)) {
|
||||
if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
|
||||
$filteredOutCount++;
|
||||
continue;
|
||||
}
|
||||
@@ -666,20 +666,40 @@ PROMPT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Defensive post-filter: drop any chunk that smells like a marketing-website hit
|
||||
* (dobetternorge.no marketing pages have source_group 'website-primary'/'website-beta'
|
||||
* but the chunk payload only carries `source_name` — use a name+title regex check).
|
||||
* Post-retrieval filter: drop chunks that don't belong in a family-law research pass.
|
||||
*
|
||||
* EU AI Act (source_id=7, EUR-Lex) leaks through when the Qdrant search runs
|
||||
* unconstrained (e.g. empty shared_doc_ids on slice-resolution failure). It is
|
||||
* never relevant to Norwegian family law and is always excluded.
|
||||
*
|
||||
* DBN website pages (Resource Directory, Flashcards, etc.) are indexed with
|
||||
* NULL source_id and score artificially high on broad queries. They are excluded
|
||||
* unless the dbn_resources slice is explicitly ON.
|
||||
*/
|
||||
private function isWebsiteChunk(array $chunk): bool
|
||||
private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
|
||||
{
|
||||
$name = strtolower((string)($chunk['source_name'] ?? ''));
|
||||
$name = strtolower((string)($chunk['source_name'] ?? ''));
|
||||
$title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
|
||||
if ($name === '') return false;
|
||||
// Trusted shared-corpus packages do not contain the word 'website'. Marketing
|
||||
// sources are explicitly labelled with source_group=website-primary/beta upstream.
|
||||
if (str_contains($name, 'website')) return true;
|
||||
if (str_contains($title, 'dobetternorge.no')) return true;
|
||||
if (preg_match('/^(homepage|landing|about |contact )/i', $title)) return true;
|
||||
$url = strtolower((string)($chunk['source_url'] ?? ''));
|
||||
|
||||
// EU AI Act — never relevant to family law research
|
||||
if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) return true;
|
||||
if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) return true;
|
||||
|
||||
// DBN website pages — allow through only when dbn_resources slice is ON
|
||||
$isDbnPage = (
|
||||
str_contains($name, 'website')
|
||||
|| str_contains($title, 'dobetternorge.no')
|
||||
|| preg_match('/^(homepage|landing|about |contact )/i', $title)
|
||||
|| str_contains($title, 'resource directory')
|
||||
|| preg_match('/^flashcards?\s*[-–|]/i', $title)
|
||||
|| preg_match('/\|\s*do better norge\s*$/i', $title)
|
||||
|| preg_match('/[-–]\s*do better norge\s*$/i', $title)
|
||||
);
|
||||
if ($isDbnPage) {
|
||||
return !($activeSlices['dbn_resources'] ?? false);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user