d2f9831472
Adds /corpus.php — a data transparency page showing what powers the legal tools: 9 coverage categories with live doc counts, a full sources table pulled from the corpus DB, the AI stack (LLMs, Whisper, Qdrant, Azure AI Search, embeddings, chunking), and a pipeline flow diagram. Stats are live via a new /api/corpus-stats.php endpoint (queries dobetter_rag + bnl_admin). The reasoning sidebar is repurposed as a Corpus health panel on this page. Also ships the in-progress timeline background events toggle: API and UI wired together via include_background param. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
69 lines
2.1 KiB
PHP
69 lines
2.1 KiB
PHP
<?php

declare(strict_types=1);

/**
 * Corpus statistics endpoint (GET, authenticated).
 *
 * Hands the following payload to dbnToolsRespond():
 *   - ok:                   always true on success
 *   - stats.total_chunks:   chunk rows joined to documents of the corpus
 *   - stats.total_docs:     document rows of the corpus
 *   - stats.active_sources: number of returned sources with a truthy is_active
 *   - stats.last_updated:   MAX(documents.updated_at), or null when there is none
 *   - stats.by_category:    list of {category, doc_count}, largest first
 *   - sources:              every corpus_sources row of the corpus (active or not)
 *
 * Any failure is reported through dbnToolsError() with HTTP 500 and the
 * error code 'corpus_stats_error'.
 */

require_once __DIR__ . '/../includes/bootstrap.php';

// Request guards from the bootstrap include. Presumably they abort the request
// on a wrong HTTP method / missing authentication — confirm against bootstrap.php.
dbnToolsRequireMethod('GET');
dbnToolsRequireAuth();

try {
    $bnlDb = dbnToolsDb();
    $ragDb = dbnToolsRagDb();

    // The do-better-legal corpus. Kept in one place and bound as a parameter
    // instead of repeating the literal in every query.
    $corpusId = 1;
    $corpusParam = ['corpus_id' => $corpusId];

    // Total documents in the corpus.
    $stmt = $ragDb->prepare('SELECT COUNT(*) FROM documents WHERE corpus_id = :corpus_id');
    $stmt->execute($corpusParam);
    $totalDocs = (int) $stmt->fetchColumn();

    // Total chunks belonging to documents of the corpus.
    $stmt = $ragDb->prepare(
        'SELECT COUNT(*) FROM chunks c
         JOIN documents d ON c.document_id = d.id
         WHERE d.corpus_id = :corpus_id'
    );
    $stmt->execute($corpusParam);
    $totalChunks = (int) $stmt->fetchColumn();

    // Document counts per category; rows without a category are left out.
    // The secondary sort key keeps the order stable when counts are equal.
    $stmt = $ragDb->prepare(
        'SELECT category, COUNT(*) AS doc_count
         FROM documents
         WHERE corpus_id = :corpus_id AND category IS NOT NULL AND category != \'\'
         GROUP BY category
         ORDER BY doc_count DESC, category ASC'
    );
    $stmt->execute($corpusParam);
    // PDO may hand integers back as numeric strings depending on driver and
    // settings; cast so doc_count has the same type as the other counters.
    $byCategory = array_map(
        static fn (array $row): array => [
            'category' => $row['category'],
            'doc_count' => (int) $row['doc_count'],
        ],
        $stmt->fetchAll(PDO::FETCH_ASSOC),
    );

    // Most recent document update. fetchColumn() yields false (no row) or
    // null (MAX over zero rows); both are reported as null.
    $stmt = $ragDb->prepare('SELECT MAX(updated_at) FROM documents WHERE corpus_id = :corpus_id');
    $stmt->execute($corpusParam);
    $lastUpdated = $stmt->fetchColumn() ?: null;

    // Configured sources for the corpus, read from the second connection.
    $stmt = $bnlDb->prepare(
        'SELECT name, url, category, authority_type, language, schedule, is_active, scraper_class
         FROM corpus_sources
         WHERE corpus_id = :corpus_id
         ORDER BY category, name'
    );
    $stmt->execute($corpusParam);
    $sources = $stmt->fetchAll(PDO::FETCH_ASSOC);

    // empty() treats 0, '0', null and '' alike, so this works whether the
    // driver returns is_active as an integer or as a string.
    $activeSources = count(array_filter(
        $sources,
        static fn (array $source): bool => !empty($source['is_active']),
    ));

    dbnToolsRespond([
        'ok' => true,
        'stats' => [
            'total_chunks' => $totalChunks,
            'total_docs' => $totalDocs,
            'active_sources' => $activeSources,
            'last_updated' => $lastUpdated,
            'by_category' => $byCategory,
        ],
        'sources' => $sources,
    ]);
} catch (Throwable $e) {
    // Keep the underlying message in the server log only: PDO exception text
    // can contain SQL, schema or connection details that must not reach clients.
    error_log(sprintf('corpus-stats failed: %s: %s', get_class($e), $e->getMessage()));
    dbnToolsError('Could not load corpus statistics.', 500, 'corpus_stats_error');
}