Add corpus explorer: search bar (Hybrid/BM25/Vector), category drill-down, source row expand

- api/corpus-search.php: new endpoint with three search modes (hybrid RAG, BM25 keyword, Qdrant vector)
- api/corpus-documents.php: paginated document browser by category or source name
- corpus.php: search bar with mode+language pills, Browse docs button on each category card with drill-down panel, expand toggle on each source row showing doc count and scraper class
- tools.css: all new corpus interactive styles appended

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-15 11:55:54 +02:00
parent 785de04f05
commit 38255669a9
4 changed files with 962 additions and 42 deletions
+92
View File
@@ -0,0 +1,92 @@
<?php
declare(strict_types=1);
require_once __DIR__ . '/../includes/bootstrap.php';
dbnToolsRequireMethod('GET');
dbnToolsRequireAuth();

/*
 * Paginated document browser for corpus 1 (only documents with status 'ready').
 *
 * Optional query parameters:
 *   category     Exact match on documents.category.
 *   source_name  Name of a corpus_sources row. The row's URL host is looked up
 *                and documents whose source_url contains that host are returned.
 *   offset       Zero-based row offset (negative values are clamped to 0).
 *   limit        Page size, clamped to 1..50 (default 20).
 *
 * Responds with: ok, documents[], total, offset, limit and the effective filter.
 */
try {
    $ragDb = dbnToolsRagDb();
    $bnlDb = dbnToolsDb();

    // Read an optional string filter. The value is trimmed BEFORE the emptiness
    // check so whitespace-only input means "no filter" instead of matching the
    // empty string, and array-valued input (?category[]=x) is ignored instead
    // of being cast to the literal "Array".
    $readFilter = static function (string $key): ?string {
        $raw = $_GET[$key] ?? null;
        if (!is_string($raw)) {
            return null;
        }
        $value = trim($raw);

        return $value !== '' ? $value : null;
    };

    $category = $readFilter('category');
    $sourceName = $readFilter('source_name');
    $offset = max(0, (int)($_GET['offset'] ?? 0));
    $limit = max(1, min(50, (int)($_GET['limit'] ?? 20)));

    // Build WHERE clause
    $where = ['d.corpus_id = 1', "d.status = 'ready'"];
    $params = [];

    if ($category !== null) {
        $where[] = 'd.category = ?';
        $params[] = $category;
    }

    if ($sourceName !== null) {
        // corpus_sources is reached through a different connection than the
        // documents table, so a JOIN is not possible. Resolve the source's URL
        // first and filter documents by its host instead.
        $srcStmt = $bnlDb->prepare(
            "SELECT url FROM corpus_sources WHERE corpus_id = 1 AND name = ? LIMIT 1"
        );
        $srcStmt->execute([$sourceName]);
        $srcRow = $srcStmt->fetch(PDO::FETCH_ASSOC);

        $host = '';
        if ($srcRow && !empty($srcRow['url'])) {
            $parsedHost = parse_url((string)$srcRow['url'], PHP_URL_HOST);
            $host = is_string($parsedHost) ? $parsedHost : '';
        }

        if ($host !== '') {
            // Escape the LIKE escape character and both wildcards so the host
            // is matched literally.
            $escapedHost = str_replace(['\\', '%', '_'], ['\\\\', '\\%', '\\_'], $host);
            $where[] = 'd.source_url LIKE ?';
            $params[] = '%' . $escapedHost . '%';
        } else {
            // Unknown source, or a source without a usable URL host: the
            // requested filter cannot be applied. Match nothing rather than
            // silently returning the unfiltered corpus under that source name.
            $where[] = '1 = 0';
        }
    }

    $whereStr = implode(' AND ', $where);

    // Total count
    $countStmt = $ragDb->prepare("SELECT COUNT(*) FROM documents d WHERE {$whereStr}");
    $countStmt->execute($params);
    $total = (int)$countStmt->fetchColumn();

    // Paginated rows. d.id is a tie-breaker so that rows sharing the same
    // updated_at keep a stable order from one page to the next.
    $dataStmt = $ragDb->prepare(
        "SELECT d.id, d.title, d.category, d.source_url, d.language, d.updated_at,
                COUNT(c.id) AS chunk_count
         FROM documents d
         LEFT JOIN chunks c ON c.document_id = d.id
         WHERE {$whereStr}
         GROUP BY d.id
         ORDER BY d.updated_at DESC, d.id DESC
         LIMIT ? OFFSET ?"
    );

    // LIMIT/OFFSET must be bound as integers: values handed to execute() are
    // always sent as strings, and with emulated prepares MySQL rejects
    // LIMIT '20' OFFSET '0' as a syntax error.
    $position = 1;
    foreach ($params as $value) {
        $dataStmt->bindValue($position++, $value, PDO::PARAM_STR);
    }
    $dataStmt->bindValue($position++, $limit, PDO::PARAM_INT);
    $dataStmt->bindValue($position, $offset, PDO::PARAM_INT);
    $dataStmt->execute();

    // Normalise chunk_count to int (the driver may return it as a string).
    $documents = array_map(
        static function (array $doc): array {
            $doc['chunk_count'] = (int)$doc['chunk_count'];

            return $doc;
        },
        $dataStmt->fetchAll(PDO::FETCH_ASSOC)
    );

    dbnToolsRespond([
        'ok' => true,
        'documents' => $documents,
        'total' => $total,
        'offset' => $offset,
        'limit' => $limit,
        'filter' => [
            'category' => $category,
            'source_name' => $sourceName,
        ],
    ]);
} catch (Throwable $e) {
    // NOTE(review): the raw exception message is returned to the client; for
    // database errors this can expose SQL details. Consider logging it
    // server-side and returning a generic message instead.
    dbnToolsError('Could not load documents: ' . $e->getMessage(), 500, 'documents_error');
}
+163
View File
@@ -0,0 +1,163 @@
<?php
declare(strict_types=1);
require_once __DIR__ . '/../includes/LegalTools.php';
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

/*
 * Corpus search endpoint with three modes selected by the JSON body:
 *   hybrid  delegates to DbnLegalToolsService::search()
 *   bm25    MySQL FULLTEXT (boolean mode) over chunks.content, LIKE fallback
 *   vector  embeds the query and searches the Qdrant collection bnl_chunks
 *
 * JSON body fields:
 *   query     required, at least 3 characters after trimming
 *   mode      'hybrid' | 'bm25' | 'vector' (anything else falls back to 'hybrid')
 *   language  passed through dbnToolsNormalizeLanguage(), default 'en'
 *   limit     clamped to 1..20, default 8
 *   category  optional exact category filter (bm25 and vector modes only)
 *
 * Every mode responds with: ok, hits[], mode, query. Each hit carries title,
 * category, section, excerpt, score, document_id, chunk_id, source_url, language.
 */
$input = dbnToolsJsonInput(4000);
$query = trim(dbnToolsString($input, 'query', 1000));
$rawMode = $input['mode'] ?? 'hybrid';
$mode = in_array($rawMode, ['hybrid', 'bm25', 'vector'], true) ? $rawMode : 'hybrid';
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
$limit = max(1, min(20, (int)($input['limit'] ?? 8)));

// Trim before the emptiness check so whitespace-only input means "no filter",
// and ignore non-scalar input instead of casting an array to "Array".
$rawCategory = $input['category'] ?? null;
$categoryText = is_scalar($rawCategory) ? trim((string)$rawCategory) : '';
$category = $categoryText !== '' ? $categoryText : null;

if (mb_strlen($query, 'UTF-8') < 3) {
    dbnToolsError('Query must be at least 3 characters.', 422, 'query_too_short');
}

try {
    // ── HYBRID: delegate to the existing RAG pipeline ──────────────────────
    if ($mode === 'hybrid') {
        // NOTE(review): $category is not forwarded here, so the category
        // filter has no effect in hybrid mode.
        $result = (new DbnLegalToolsService())->search($query, $language, $limit, 'disabled', null);
        $hits = array_map(static fn($h) => [
            'title' => $h['title'] ?? '',
            'category' => $h['category'] ?? '',
            'section' => $h['section'] ?? null,
            'excerpt' => $h['excerpt'] ?? ($h['chunk_text'] ?? ''),
            'score' => $h['score'] ?? null,
            'document_id' => $h['document_id'] ?? null,
            'chunk_id' => $h['chunk_id'] ?? null,
            'source_url' => $h['source_url'] ?? null,
            'language' => null,
        ], $result['hits'] ?? []);
        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'hybrid', 'query' => $query]);
    }

    $ragDb = dbnToolsRagDb();

    // ── BM25: FULLTEXT with LIKE fallback ───────────────────────────────────
    if ($mode === 'bm25') {
        $catClause = $category !== null ? ' AND d.category = ?' : '';

        // Run a positional-parameter query with typed binds. Integers
        // (corpus id, LIMIT) must be bound as PDO::PARAM_INT: values handed to
        // execute() are always sent as strings, and with emulated prepares
        // MySQL rejects LIMIT '8' as a syntax error.
        $runQuery = static function (string $sql, array $values) use ($ragDb): array {
            $stmt = $ragDb->prepare($sql);
            foreach (array_values($values) as $index => $value) {
                $stmt->bindValue(
                    $index + 1,
                    $value,
                    is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR
                );
            }
            $stmt->execute();

            return $stmt->fetchAll(PDO::FETCH_ASSOC);
        };

        // Try FULLTEXT index first
        try {
            $sql = "SELECT d.id AS document_id, d.title, d.category,
                           d.source_url, c.id AS chunk_id, c.content AS excerpt,
                           c.section_title AS section, d.language,
                           MATCH(c.content) AGAINST (? IN BOOLEAN MODE) AS score
                    FROM chunks c
                    JOIN documents d ON c.document_id = d.id
                    WHERE d.corpus_id = ? AND d.status = 'ready'
                      AND MATCH(c.content) AGAINST (? IN BOOLEAN MODE) > 0
                      $catClause
                    ORDER BY score DESC
                    LIMIT ?";
            $params = [$query, 1, $query];
            if ($category !== null) {
                $params[] = $category;
            }
            $params[] = $limit;
            $rows = $runQuery($sql, $params);
        } catch (Throwable $e) {
            // FULLTEXT query failed (for example the index is absent, or the
            // query is not valid boolean-mode syntax) — use LIKE instead.
            // Escape the LIKE escape character itself as well as both
            // wildcards so the query text is matched literally.
            $like = '%' . str_replace(['\\', '%', '_'], ['\\\\', '\\%', '\\_'], $query) . '%';
            $sql = "SELECT d.id AS document_id, d.title, d.category,
                           d.source_url, c.id AS chunk_id, c.content AS excerpt,
                           c.section_title AS section, d.language,
                           0.25 AS score
                    FROM chunks c
                    JOIN documents d ON c.document_id = d.id
                    WHERE d.corpus_id = ? AND d.status = 'ready'
                      AND (c.content LIKE ? OR d.title LIKE ?)
                      $catClause
                    ORDER BY (d.title LIKE ?) DESC
                    LIMIT ?";
            $params = [1, $like, $like];
            if ($category !== null) {
                $params[] = $category;
            }
            $params[] = $like;
            $params[] = $limit;
            $rows = $runQuery($sql, $params);
        }

        $hits = array_map(static fn($r) => [
            'title' => $r['title'] ?? '',
            'category' => $r['category'] ?? '',
            'section' => $r['section'] ?? null,
            'excerpt' => mb_substr((string)($r['excerpt'] ?? ''), 0, 600, 'UTF-8'),
            'score' => isset($r['score']) ? round((float)$r['score'], 4) : null,
            'document_id' => (int)$r['document_id'],
            'chunk_id' => isset($r['chunk_id']) ? (int)$r['chunk_id'] : null,
            'source_url' => $r['source_url'] ?? null,
            'language' => $r['language'] ?? null,
        ], $rows);
        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'bm25', 'query' => $query]);
    }

    // ── VECTOR: embed → Qdrant ─────────────────────────────────────────────
    if ($mode === 'vector') {
        $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
        // Guard the index access: a missing first element must produce the
        // embed error, not an undefined-offset warning.
        $vector = is_array($embeddings) ? ($embeddings[0] ?? null) : null;
        if (!is_array($vector) || $vector === []) {
            dbnToolsError('Embedding failed — vector search unavailable.', 502, 'embed_error');
        }

        $filter = ['must' => [['key' => 'corpus_id', 'match' => ['value' => 1]]]];
        if ($category !== null) {
            $filter['must'][] = ['key' => 'category', 'match' => ['value' => $category]];
        }

        // JSON_THROW_ON_ERROR: an unencodable payload raises instead of
        // posting an empty body (json_encode would otherwise return false).
        $qdrantPayload = json_encode([
            'vector' => $vector,
            'limit' => $limit,
            'with_payload' => true,
            'filter' => $filter,
        ], JSON_THROW_ON_ERROR);

        $ch = curl_init('http://10.0.2.10:6333/collections/bnl_chunks/points/search');
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $qdrantPayload,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
            CURLOPT_CONNECTTIMEOUT => 5,
            CURLOPT_TIMEOUT => 15,
        ]);
        $resp = curl_exec($ch);
        $curlErr = curl_error($ch);
        $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($resp === false) {
            dbnToolsError('Qdrant unreachable: ' . $curlErr, 502, 'qdrant_error');
        }

        // A non-2xx status or a body without a "result" list is a failed
        // search; report it instead of returning ok=true with zero hits.
        $qdrantResult = json_decode((string)$resp, true);
        $points = is_array($qdrantResult) ? ($qdrantResult['result'] ?? null) : null;
        if ($httpCode < 200 || $httpCode >= 300 || !is_array($points)) {
            dbnToolsError('Qdrant search failed (HTTP ' . $httpCode . ').', 502, 'qdrant_error');
        }

        $hits = [];
        foreach ($points as $pt) {
            if (!is_array($pt)) {
                continue;
            }
            $p = is_array($pt['payload'] ?? null) ? $pt['payload'] : [];
            $hits[] = [
                'title' => $p['title'] ?? $p['document_title'] ?? '',
                'category' => $p['category'] ?? '',
                'section' => $p['section_title'] ?? null,
                'excerpt' => mb_substr((string)($p['content'] ?? ''), 0, 600, 'UTF-8'),
                'score' => round((float)($pt['score'] ?? 0), 4),
                'document_id' => isset($p['document_id']) ? (int)$p['document_id'] : null,
                'chunk_id' => $pt['id'] ?? null,
                'source_url' => $p['source_url'] ?? null,
                'language' => $p['language'] ?? null,
            ];
        }
        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]);
    }

    // Defensive guard: $mode is normalised to one of the three values above.
    dbnToolsError('Unknown search mode.', 422, 'invalid_mode');
} catch (DbnToolsHttpException $e) {
    // Let HTTP errors (presumably what dbnToolsError() raises — confirm)
    // pass through untouched instead of re-wrapping them as a 500.
    throw $e;
} catch (Throwable $e) {
    dbnToolsError('Corpus search failed: ' . $e->getMessage(), 500, 'search_error');
}