Expand corpus slices to 8: split ECHR/Hague, add Norwegian Courts, Bufdir, DBN Resources

- Replace combined echr_hague slice with echr (Art.8+9, HUDOC, NIM) and hague (INCADAT, cross-border abduction) as separate toggles; echr defaults ON, hague defaults OFF
- Add norwegian_courts slice: Domstol (src 5,26) + Rettspraksis.no (src 33, 482 docs)
- Add bufdir_guidance slice: Barneombudet (19), Bufdir (20), Statsforvalteren (31)
- Add dbn_resources slice: DBN website pages (flashcards, resource directory), defaults OFF
- Replace isWebsiteChunk() with slice-aware shouldExcludeChunk(): always strips EU AI Act chunks (EUR-Lex source 7 leaks through when Qdrant runs unconstrained) and DBN website pages unless dbn_resources slice is explicitly ON
- Update SLICE_DEFS in advocate.js and deep-research.js to match all 8 slices
- Backward compat: echr_hague key in incoming requests fans out to echr+hague

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 16:01:05 +02:00
parent 464b8572d3
commit 7bccd8c010
5 changed files with 112 additions and 28 deletions
@@ -232,7 +232,7 @@ final class DbnDeepResearchAgent
            }
            $rawCorpusCount += count($corpusChunks);
            foreach ($corpusChunks as $chunk) {
-                if ($this->isWebsiteChunk($chunk)) {
+                if ($this->shouldExcludeChunk($chunk, $sliceSelectionNormalized)) {
                    $filteredOutCount++;
                    continue;
                }
@@ -666,20 +666,40 @@ PROMPT;
    }

    /**
-     * Defensive post-filter: drop any chunk that smells like a marketing-website hit
-     * (dobetternorge.no marketing pages have source_group 'website-primary'/'website-beta'
-     * but the chunk payload only carries `source_name` — use a name+title regex check).
+     * Post-retrieval filter: drop chunks that don't belong in a family-law research pass.
+     *
+     * EU AI Act (source_id=7, EUR-Lex) leaks through when the Qdrant search runs
+     * unconstrained (e.g. empty shared_doc_ids on slice-resolution failure). It is
+     * never relevant to Norwegian family law and is always excluded.
+     *
+     * DBN website pages (Resource Directory, Flashcards, etc.) are indexed with
+     * NULL source_id and score artificially high on broad queries. They are excluded
+     * unless the dbn_resources slice is explicitly ON.
     */
-    private function isWebsiteChunk(array $chunk): bool
+    private function shouldExcludeChunk(array $chunk, array $activeSlices): bool
    {
-        $name = strtolower((string)($chunk['source_name'] ?? ''));
+        $name  = strtolower((string)($chunk['source_name'] ?? ''));
        $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
-        if ($name === '') return false;
-        // Trusted shared-corpus packages do not contain the word 'website'. Marketing
-        // sources are explicitly labelled with source_group=website-primary/beta upstream.
-        if (str_contains($name, 'website')) return true;
-        if (str_contains($title, 'dobetternorge.no')) return true;
-        if (preg_match('/^(homepage|landing|about |contact )/i', $title)) return true;
+        $url   = strtolower((string)($chunk['source_url'] ?? ''));
+
+        // EU AI Act — never relevant to family law research
+        if (preg_match('/eu\s+ai\s+act|2024[\/.]1689|regulation.*\bai\b.*act/i', $title)) return true;
+        if (str_contains($url, 'eur-lex') && preg_match('/2024.1689|ai.act/i', $url)) return true;
+
+        // DBN website pages — allow through only when dbn_resources slice is ON
+        $isDbnPage = (
+            str_contains($name, 'website')
+            || str_contains($title, 'dobetternorge.no')
+            || preg_match('/^(homepage|landing|about |contact )/i', $title)
+            || str_contains($title, 'resource directory')
+            || preg_match('/^flashcards?\s*[-–|]/i', $title)
+            || preg_match('/\|\s*do better norge\s*$/i', $title)
+            || preg_match('/[-–]\s*do better norge\s*$/i', $title)
+        );
+        if ($isDbnPage) {
+            return !($activeSlices['dbn_resources'] ?? false);
+        }
+
        return false;
    }