From 85a6bc8134aa89201e3bb41e343cfbf93195388f Mon Sep 17 00:00:00 2001
From: davegilligan
Date: Fri, 15 May 2026 12:10:46 +0200
Subject: [PATCH] Exclude dobetternorge.no docs from all corpus search modes

BM25: adds a NULL-safe NOT LIKE filter to SQL WHERE in both FULLTEXT and
LIKE paths, so documents without a source_url are still returned.
Hybrid + Vector: post-filter hits array by source_url after results return,
comparing case-insensitively.

Co-Authored-By: Claude Sonnet 4.6
---
 api/corpus-search.php | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/api/corpus-search.php b/api/corpus-search.php
index dbd94c2..5d0a2ea 100644
--- a/api/corpus-search.php
+++ b/api/corpus-search.php
@@ -14,6 +14,9 @@ $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
 $limit = max(1, min(20, (int)($input['limit'] ?? 8)));
 $category = isset($input['category']) && $input['category'] !== '' ? trim((string)$input['category']) : null;
 
+// Lower-case host fragment; documents whose source_url contains it are hidden from every search mode.
+const EXCLUDED_DOMAIN = 'dobetternorge.no';
+
 if (mb_strlen($query, 'UTF-8') < 3) {
     dbnToolsError('Query must be at least 3 characters.', 422, 'query_too_short');
 }
@@ -33,6 +36,7 @@ try {
             'source_url' => $h['source_url'] ?? null,
             'language' => null,
         ], $result['hits'] ?? []);
+        $hits = array_values(array_filter($hits, fn($h) => !str_contains(strtolower((string)($h['source_url'] ?? '')), EXCLUDED_DOMAIN)));
         dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'hybrid', 'query' => $query]);
     }
 
@@ -41,6 +45,8 @@ try {
     // ── BM25: FULLTEXT with LIKE fallback ───────────────────────────────────
     if ($mode === 'bm25') {
         $catClause = $category !== null ? ' AND d.category = ?' : '';
+        // SQL "NOT LIKE" evaluates to NULL for a NULL source_url, so both queries keep NULL rows explicitly.
+        $excludeLike = '%' . EXCLUDED_DOMAIN . '%';
 
         // Try FULLTEXT index first
         try {
@@ -52,10 +58,11 @@ try {
                     JOIN documents d ON c.document_id = d.id
                     WHERE d.corpus_id = ? AND d.status = 'ready'
                       AND MATCH(c.content) AGAINST (? IN BOOLEAN MODE) > 0
+                      AND (d.source_url IS NULL OR d.source_url NOT LIKE ?)
                       $catClause
                     ORDER BY score DESC
                     LIMIT ?";
-            $params = [$query, 1, $query];
+            $params = [$query, 1, $query, $excludeLike];
             if ($category !== null) $params[] = $category;
             $params[] = $limit;
             $stmt = $ragDb->prepare($sql);
@@ -72,10 +79,11 @@ try {
                     JOIN documents d ON c.document_id = d.id
                     WHERE d.corpus_id = ? AND d.status = 'ready'
                       AND (c.content LIKE ? OR d.title LIKE ?)
+                      AND (d.source_url IS NULL OR d.source_url NOT LIKE ?)
                       $catClause
                     ORDER BY (d.title LIKE ?) DESC
                     LIMIT ?";
-            $params = [1, $like, $like];
+            $params = [1, $like, $like, $excludeLike];
             if ($category !== null) $params[] = $category;
             $params[] = $like;
             $params[] = $limit;
@@ -152,6 +160,7 @@ try {
             ];
         }
 
+        $hits = array_values(array_filter($hits, fn($h) => !str_contains(strtolower((string)($h['source_url'] ?? '')), EXCLUDED_DOMAIN)));
         dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]);
     }
 