From 38255669a9560bb65fe458e95ffc84f1f861932e Mon Sep 17 00:00:00 2001 From: davegilligan Date: Fri, 15 May 2026 11:55:54 +0200 Subject: [PATCH] Add corpus explorer: search bar (Hybrid/BM25/Vector), category drill-down, source row expand - api/corpus-search.php: new endpoint with three search modes (hybrid RAG, BM25 keyword, Qdrant vector) - api/corpus-documents.php: paginated document browser by category or source name - corpus.php: search bar with mode+language pills, Browse docs button on each category card with drill-down panel, expand toggle on each source row showing doc count and scraper class - tools.css: all new corpus interactive styles appended Co-Authored-By: Claude Sonnet 4.6 --- api/corpus-documents.php | 92 +++++++++ api/corpus-search.php | 163 ++++++++++++++++ assets/css/tools.css | 344 +++++++++++++++++++++++++++++++++ corpus.php | 405 +++++++++++++++++++++++++++++++++++---- 4 files changed, 962 insertions(+), 42 deletions(-) create mode 100644 api/corpus-documents.php create mode 100644 api/corpus-search.php diff --git a/api/corpus-documents.php b/api/corpus-documents.php new file mode 100644 index 0000000..8e2a858 --- /dev/null +++ b/api/corpus-documents.php @@ -0,0 +1,92 @@ +prepare( + "SELECT url FROM corpus_sources WHERE corpus_id = 1 AND name = ? LIMIT 1" + ); + $srcStmt->execute([$sourceName]); + $srcRow = $srcStmt->fetch(PDO::FETCH_ASSOC); + if ($srcRow && !empty($srcRow['url'])) { + $parsed = parse_url($srcRow['url']); + $host = $parsed['host'] ?? ''; + if ($host !== '') { + $where[] = "d.source_url LIKE ?"; + $params[] = '%' . $host . 
'%'; + } + } + } + + $whereStr = implode(' AND ', $where); + + // Total count + $countParams = $params; + $countStmt = $ragDb->prepare("SELECT COUNT(*) FROM documents d WHERE $whereStr"); + $countStmt->execute($countParams); + $total = (int)$countStmt->fetchColumn(); + + // Paginated rows + $dataParams = $params; + $dataParams[] = $limit; + $dataParams[] = $offset; + $dataStmt = $ragDb->prepare( + "SELECT d.id, d.title, d.category, d.source_url, d.language, d.updated_at, + COUNT(c.id) AS chunk_count + FROM documents d + LEFT JOIN chunks c ON c.document_id = d.id + WHERE $whereStr + GROUP BY d.id + ORDER BY d.updated_at DESC + LIMIT ? OFFSET ?" + ); + $dataStmt->execute($dataParams); + $documents = $dataStmt->fetchAll(PDO::FETCH_ASSOC); + + // Normalise chunk_count to int + foreach ($documents as &$doc) { + $doc['chunk_count'] = (int)$doc['chunk_count']; + } + unset($doc); + + dbnToolsRespond([ + 'ok' => true, + 'documents' => $documents, + 'total' => $total, + 'offset' => $offset, + 'limit' => $limit, + 'filter' => [ + 'category' => $category, + 'source_name' => $sourceName, + ], + ]); +} catch (Throwable $e) { + dbnToolsError('Could not load documents: ' . $e->getMessage(), 500, 'documents_error'); +} diff --git a/api/corpus-search.php b/api/corpus-search.php new file mode 100644 index 0000000..dbd94c2 --- /dev/null +++ b/api/corpus-search.php @@ -0,0 +1,163 @@ +search($query, $language, $limit, 'disabled', null); + $hits = array_map(fn($h) => [ + 'title' => $h['title'] ?? '', + 'category' => $h['category'] ?? '', + 'section' => $h['section'] ?? null, + 'excerpt' => $h['excerpt'] ?? ($h['chunk_text'] ?? ''), + 'score' => $h['score'] ?? null, + 'document_id' => $h['document_id'] ?? null, + 'chunk_id' => $h['chunk_id'] ?? null, + 'source_url' => $h['source_url'] ?? null, + 'language' => null, + ], $result['hits'] ?? 
[]);
+        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'hybrid', 'query' => $query]);
+    }
+
+    $ragDb = dbnToolsRagDb();
+
+    // ── BM25: FULLTEXT with LIKE fallback ───────────────────────────────────
+    // Keyword search over chunks joined to their documents (corpus 1, status
+    // 'ready'), optionally narrowed to one category.
+    if ($mode === 'bm25') {
+        $catClause = $category !== null ? ' AND d.category = ?' : '';
+
+        // Prepare, bind and fetch in one place. Values are bound with an explicit
+        // type because execute(array) sends every value as a string, and a quoted
+        // string in "LIMIT ?" is a syntax error when PDO emulates prepares.
+        $runQuery = static function (string $sql, array $params) use ($ragDb): array {
+            $stmt = $ragDb->prepare($sql);
+            foreach (array_values($params) as $i => $value) {
+                $stmt->bindValue($i + 1, $value, is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR);
+            }
+            $stmt->execute();
+            return $stmt->fetchAll(PDO::FETCH_ASSOC);
+        };
+
+        // Try FULLTEXT index first
+        try {
+            $sql = "SELECT d.id AS document_id, d.title, d.category,
+                           d.source_url, c.id AS chunk_id, c.content AS excerpt,
+                           c.section_title AS section, d.language,
+                           MATCH(c.content) AGAINST (? IN BOOLEAN MODE) AS score
+                    FROM chunks c
+                    JOIN documents d ON c.document_id = d.id
+                    WHERE d.corpus_id = ? AND d.status = 'ready'
+                      AND MATCH(c.content) AGAINST (? IN BOOLEAN MODE) > 0
+                      $catClause
+                    ORDER BY score DESC
+                    LIMIT ?";
+            // Placeholder order: SELECT match, corpus id, WHERE match, [category], limit.
+            $params = [$query, 1, $query];
+            if ($category !== null) {
+                $params[] = $category;
+            }
+            $params[] = (int)$limit;
+            $rows = $runQuery($sql, $params);
+        } catch (Throwable $e) {
+            // Any failure of the FULLTEXT query lands here (a missing index, or a
+            // query string that is not valid boolean-mode syntax) — use LIKE.
+            // The backslash is escaped together with % and _ : it is the LIKE
+            // escape character, so a raw trailing "\" would otherwise swallow the
+            // closing wildcard.
+            $like = '%' . addcslashes($query, '\\%_') . '%';
+            $sql = "SELECT d.id AS document_id, d.title, d.category,
+                           d.source_url, c.id AS chunk_id, c.content AS excerpt,
+                           c.section_title AS section, d.language,
+                           0.25 AS score
+                    FROM chunks c
+                    JOIN documents d ON c.document_id = d.id
+                    WHERE d.corpus_id = ? AND d.status = 'ready'
+                      AND (c.content LIKE ? OR d.title LIKE ?)
+                      $catClause
+                    ORDER BY (d.title LIKE ?) DESC
+                    LIMIT ?";
+            // Placeholder order: corpus id, content LIKE, title LIKE, [category],
+            // ORDER BY title LIKE, limit.
+            $params = [1, $like, $like];
+            if ($category !== null) {
+                $params[] = $category;
+            }
+            $params[] = $like;
+            $params[] = (int)$limit;
+            $rows = $runQuery($sql, $params);
+        }
+
+        // Shape rows into the hit structure used by the other modes; excerpts are
+        // capped at 600 characters.
+        $hits = array_map(fn($r) => [
+            'title'       => $r['title'] ?? '',
+            'category'    => $r['category'] ?? '',
+            'section'     => $r['section'] ?? null,
+            'excerpt'     => mb_substr((string)($r['excerpt'] ?? ''), 0, 600, 'UTF-8'),
+            'score'       => isset($r['score']) ? round((float)$r['score'], 4) : null,
+            'document_id' => (int)$r['document_id'],
+            'chunk_id'    => isset($r['chunk_id']) ? (int)$r['chunk_id'] : null,
+            'source_url'  => $r['source_url'] ?? null,
+            'language'    => $r['language'] ?? null,
+        ], $rows);
+        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'bm25', 'query' => $query]);
+    }
+
+    // ── VECTOR: embed → Qdrant ─────────────────────────────────────────────
+    // Embeds the query, then runs a filtered nearest-neighbour search against the
+    // bnl_chunks collection.
+    if ($mode === 'vector') {
+        $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
+        if (empty($embeddings) || !is_array($embeddings[0] ?? null)) {
+            dbnToolsError('Embedding failed — vector search unavailable.', 502, 'embed_error');
+        }
+
+        $filter = ['must' => [['key' => 'corpus_id', 'match' => ['value' => 1]]]];
+        if ($category !== null) {
+            $filter['must'][] = ['key' => 'category', 'match' => ['value' => $category]];
+        }
+
+        $qdrantPayload = json_encode([
+            'vector'       => $embeddings[0],
+            'limit'        => $limit,
+            'with_payload' => true,
+            'filter'       => $filter,
+        ]);
+
+        $ch = curl_init('http://10.0.2.10:6333/collections/bnl_chunks/points/search');
+        curl_setopt_array($ch, [
+            CURLOPT_RETURNTRANSFER => true,
+            CURLOPT_POST           => true,
+            CURLOPT_POSTFIELDS     => $qdrantPayload,
+            CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
+            CURLOPT_TIMEOUT        => 15,
+        ]);
+        $resp     = curl_exec($ch);
+        $curlErr  = curl_error($ch);
+        // Read the status before closing the handle.
+        $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
+        curl_close($ch);
+
+        if ($resp === false) {
+            dbnToolsError('Qdrant unreachable: ' . $curlErr, 502, 'qdrant_error');
+        }
+
+        // A non-2xx status or a body without a "result" list is an upstream
+        // failure; report it instead of returning it as "ok" with zero hits.
+        $qdrantResult = json_decode($resp, true);
+        if ($httpCode < 200 || $httpCode >= 300
+            || !is_array($qdrantResult) || !is_array($qdrantResult['result'] ?? null)) {
+            dbnToolsError('Qdrant search failed (HTTP ' . $httpCode . ').', 502, 'qdrant_error');
+        }
+        $points = $qdrantResult['result'];
+
+        $hits = [];
+        foreach ($points as $pt) {
+            $p = $pt['payload'] ?? [];
+            $hits[] = [
+                'title'       => $p['title'] ?? $p['document_title'] ?? '',
+                'category'    => $p['category'] ?? '',
+                'section'     => $p['section_title'] ?? null,
+                'excerpt'     => mb_substr((string)($p['content'] ?? ''), 0, 600, 'UTF-8'),
+                'score'       => round((float)($pt['score'] ?? 0), 4),
+                'document_id' => isset($p['document_id']) ? 
(int)$p['document_id'] : null, + 'chunk_id' => $pt['id'] ?? null, + 'source_url' => $p['source_url'] ?? null, + 'language' => $p['language'] ?? null, + ]; + } + + dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]); + } + + dbnToolsError('Unknown search mode.', 422, 'invalid_mode'); +} catch (DbnToolsHttpException $e) { + throw $e; +} catch (Throwable $e) { + dbnToolsError('Corpus search failed: ' . $e->getMessage(), 500, 'search_error'); +} diff --git a/assets/css/tools.css b/assets/css/tools.css index 4d57a75..b9cd6bb 100644 --- a/assets/css/tools.css +++ b/assets/css/tools.css @@ -2708,3 +2708,347 @@ a.dr-source-title-link:hover { color: var(--teal-dark); text-decoration: underline; } + +/* ── Corpus Explorer: Search bar ─────────────────────────────────────── */ +.corpus-search-box { + padding: 0 0 24px; + border-bottom: 1px solid var(--line); + margin-bottom: 32px; +} + +.corpus-search-row { + display: flex; + gap: 8px; + align-items: center; +} + +.corpus-search-input { + flex: 1; + height: 40px; + padding: 0 12px; + border: 1px solid var(--line); + border-radius: 6px; + font-size: 0.9rem; + background: var(--panel); + color: var(--ink); + outline: none; + transition: border-color 0.15s; +} +.corpus-search-input:focus { border-color: var(--teal); } + +.corpus-search-controls { + display: flex; + align-items: center; + justify-content: space-between; + margin-top: 10px; + gap: 12px; + flex-wrap: wrap; +} + +.search-modes, +.lang-pills { + display: flex; + gap: 4px; +} + +.mode-pill { + padding: 3px 12px; + border: 1px solid var(--line); + border-radius: 999px; + background: transparent; + color: var(--muted); + font-size: 0.8rem; + cursor: pointer; + transition: background 0.12s, color 0.12s, border-color 0.12s; +} +.mode-pill:hover { border-color: var(--teal); color: var(--teal); } +.mode-pill.is-active { + background: var(--teal); + border-color: var(--teal); + color: #fff; +} + +/* ── Search results 
───────────────────────────────────────────────────── */ +.corpus-search-results { + margin: 0 0 32px; +} + +.search-results-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 12px; +} +.search-results-count { + font-size: 0.8rem; + color: var(--muted); +} + +.search-loading, +.search-empty, +.search-error, +.search-hint { + font-size: 0.85rem; + color: var(--muted); + padding: 12px 0; +} +.search-error { color: var(--coral); } + +.passage-card { + background: var(--panel); + border: 1px solid var(--line); + border-left: 3px solid var(--teal); + border-radius: 6px; + padding: 14px 16px; + margin-bottom: 10px; +} + +.passage-card__meta { + display: flex; + align-items: center; + gap: 8px; + margin-bottom: 6px; + flex-wrap: wrap; +} + +.passage-section { + font-size: 0.78rem; + color: var(--muted); + font-style: italic; +} + +.passage-score { + font-size: 0.75rem; + background: var(--soft-teal); + color: var(--teal-dark); + padding: 1px 7px; + border-radius: 999px; + margin-left: auto; +} + +.passage-card__title { + display: block; + font-size: 0.88rem; + font-weight: 600; + color: var(--ink); + text-decoration: none; + margin-bottom: 6px; +} +.passage-card__title:hover { color: var(--teal); text-decoration: underline; } + +.passage-card__excerpt { + font-size: 0.82rem; + color: var(--muted); + line-height: 1.55; + margin: 0; + display: -webkit-box; + -webkit-line-clamp: 4; + -webkit-box-orient: vertical; + overflow: hidden; +} + +.passage-card mark { + background: var(--soft-teal); + color: var(--teal-dark); + border-radius: 2px; + padding: 0 1px; +} + +/* ── Category card browse button ──────────────────────────────────────── */ +.cat-browse-btn { + display: inline-block; + margin-top: 10px; + padding: 4px 12px; + background: var(--soft-teal); + border: 1px solid var(--teal); + border-radius: 5px; + color: var(--teal); + font-size: 0.8rem; + cursor: pointer; + transition: background 0.12s, color 0.12s; +} 
+.cat-browse-btn:hover { + background: var(--teal); + color: #fff; +} + +/* ── Drill-down panel ─────────────────────────────────────────────────── */ +.corpus-drill-panel { + margin-top: 20px; + background: var(--panel); + border: 1px solid var(--line); + border-top: 3px solid var(--teal); + border-radius: 0 0 8px 8px; + padding: 24px; +} + +.drill-header { + display: flex; + justify-content: space-between; + align-items: flex-start; + margin-bottom: 20px; +} +.drill-header h3 { + margin: 4px 0 0; + font-size: 1.05rem; +} + +.drill-close-btn { + background: transparent; + border: 1px solid var(--line); + border-radius: 50%; + width: 28px; + height: 28px; + cursor: pointer; + color: var(--muted); + font-size: 0.9rem; + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; +} +.drill-close-btn:hover { border-color: var(--teal); color: var(--teal); } + +.drill-loading, +.drill-empty, +.drill-error { + font-size: 0.85rem; + color: var(--muted); + padding: 8px 0; +} +.drill-error { color: var(--coral); } + +.doc-list { display: flex; flex-direction: column; gap: 8px; } + +.doc-list__item { + display: flex; + align-items: flex-start; + justify-content: space-between; + gap: 12px; + padding: 10px 12px; + background: var(--bg); + border: 1px solid var(--line); + border-radius: 5px; +} +.doc-list__item:hover { border-color: var(--teal); } + +.doc-list__info { flex: 1; min-width: 0; } + +.doc-list__title { + display: block; + font-size: 0.88rem; + font-weight: 500; + color: var(--ink); + text-decoration: none; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} +.doc-list__title:hover { color: var(--teal); text-decoration: underline; } + +.doc-list__meta { + display: flex; + align-items: center; + gap: 8px; + margin-top: 4px; + flex-wrap: wrap; +} + +.doc-list__date { + font-size: 0.78rem; + color: var(--muted); +} + +.doc-list__chunks { + flex-shrink: 0; + font-size: 0.75rem; + background: var(--soft-teal); + color: 
var(--teal-dark); + padding: 2px 8px; + border-radius: 999px; + white-space: nowrap; +} + +.doc-list__more-wrap { + text-align: center; + margin-top: 16px; +} +.doc-list__more { + padding: 7px 20px; + border: 1px solid var(--teal); + border-radius: 5px; + background: transparent; + color: var(--teal); + font-size: 0.85rem; + cursor: pointer; + transition: background 0.12s, color 0.12s; +} +.doc-list__more:hover { background: var(--teal); color: #fff; } + +/* ── Sources table expand column ──────────────────────────────────────── */ +.source-expand-cell { width: 32px; padding: 0 4px !important; text-align: center; } + +.source-expand-btn { + background: transparent; + border: 1px solid var(--line); + border-radius: 4px; + width: 22px; + height: 22px; + font-size: 0.65rem; + cursor: pointer; + color: var(--muted); + display: inline-flex; + align-items: center; + justify-content: center; + transition: border-color 0.12s, color 0.12s; +} +.source-expand-btn:hover { border-color: var(--teal); color: var(--teal); } + +.source-expand-row > td { + padding: 0 !important; + background: var(--soft-teal); + border-top: none; +} + +.source-expand-inner { + padding: 16px 20px; +} + +.source-expand-loading, +.source-expand-error { + font-size: 0.82rem; + color: var(--muted); +} + +.source-expand-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 16px; +} + +.source-expand-dl { + display: grid; + grid-template-columns: auto 1fr; + gap: 4px 12px; + font-size: 0.82rem; + margin: 0; +} +.source-expand-dl dt { color: var(--muted); white-space: nowrap; } +.source-expand-dl dd { margin: 0; } + +.source-expand-url { + font-size: 0.78rem; + word-break: break-all; + margin: 0 0 12px; +} +.source-expand-url a { color: var(--teal); } + +.source-browse-btn { + font-size: 0.82rem !important; + padding: 5px 14px !important; +} + +@media (max-width: 760px) { + .source-expand-grid { grid-template-columns: 1fr; } + .corpus-search-controls { flex-direction: column; align-items: 
flex-start; } +} diff --git a/corpus.php b/corpus.php index e1cfa68..e029097 100644 --- a/corpus.php +++ b/corpus.php @@ -37,6 +37,7 @@ $reasoningPanelOverride = ob_get_clean(); require_once __DIR__ . '/includes/layout.php'; ?> +
@@ -56,6 +57,28 @@ require_once __DIR__ . '/includes/layout.php';
+ + + +

Coverage

@@ -68,6 +91,7 @@ require_once __DIR__ . '/includes/layout.php';

Family Law

Barneloven, child custody (foreldreansvar), samvær, mediation (mekling), separation and divorce proceedings.

+
@@ -76,6 +100,7 @@ require_once __DIR__ . '/includes/layout.php';

Child Welfare

Barnevernloven, omsorgsovertakelse, emergency care orders, foster placement, CPS (barnevernet) case law.

+
@@ -84,6 +109,7 @@ require_once __DIR__ . '/includes/layout.php';

Labour Law

Arbeidsmiljøloven, collective agreements (tariffavtaler), Arbeidsretten rulings, dismissal, sick leave obligations.

+
@@ -92,6 +118,7 @@ require_once __DIR__ . '/includes/layout.php';

Social Welfare

NAV guidance on sykepenger, dagpenger, AAP, uføretrygd, alderspensjon, yrkesskade and social assistance.

+
@@ -100,6 +127,7 @@ require_once __DIR__ . '/includes/layout.php';

Tax Law

Skatteetaten's Skatte-ABC, binding advance rulings (BFU), Skatteklagenemnda decisions, income and capital tax.

+
@@ -108,6 +136,7 @@ require_once __DIR__ . '/includes/layout.php';

Administrative Law

Sivilombudet reports, Forvaltningsloven, procedural rights, official complaints, Stortinget oversight.

+
@@ -116,6 +145,7 @@ require_once __DIR__ . '/includes/layout.php';

Consumer & Housing

HTU (rental disputes), Finansklagenemnda, Forbrukertilsynet, Forbrukerrådet, Pakkereisenemnda decisions.

+
@@ -124,6 +154,7 @@ require_once __DIR__ . '/includes/layout.php';

Immigration & International

UNE (Utlendingsnemnda) decisions, ECHR Art. 8 family rights, EMD case law, Hague Convention (cross-border child abduction).

+
@@ -132,6 +163,22 @@ require_once __DIR__ . '/includes/layout.php';

Government Documents

NOUer, Stortingsmeldinger, government white papers and regulatory guidance from Regjeringen.no.

+ +
+ + + + @@ -144,6 +191,7 @@ require_once __DIR__ . '/includes/layout.php'; + @@ -153,7 +201,7 @@ require_once __DIR__ . '/includes/layout.php'; - +
Source Type Category
Loading sources…
Loading sources…
@@ -280,6 +328,13 @@ require_once __DIR__ . '/includes/layout.php';