d2f9831472
Adds /corpus.php — a data transparency page showing what powers the legal tools: 9 coverage categories with live doc counts, a full sources table pulled from the corpus DB, the AI stack (LLMs, Whisper, Qdrant, Azure AI Search, embeddings, chunking), and a pipeline flow diagram. Stats are live via a new /api/corpus-stats.php endpoint (queries dobetter_rag + bnl_admin). The reasoning sidebar is repurposed as a Corpus health panel on this page. Also ships the in-progress timeline background events toggle: API and UI wired together via include_background param. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
416 lines
19 KiB
PHP
416 lines
19 KiB
PHP
<?php

declare(strict_types=1);

// Page metadata. These variables are assigned before includes/layout.php is
// required further down this file — presumably the shared layout reads them;
// confirm against layout.php.
$toolName  = 'corpus';
$toolTitle = 'Legal Knowledge Base';
$toolKind  = 'Corpus Intelligence';
$toolBadge = '~220 K passages';

// Begin output buffering: the sidebar markup that follows is captured and
// later retrieved with ob_get_clean() instead of being sent straight away.
ob_start();
|
||
?>
|
||
<div class="reasoning-head">
|
||
<p class="eyebrow">Corpus health</p>
|
||
<h2 id="reasoningTitle">Vector index</h2>
|
||
</div>
|
||
<dl class="corpus-health-dl">
|
||
<dt>Collection</dt>
|
||
<dd><code>bnl_chunks</code></dd>
|
||
<dt>Dimensions</dt>
|
||
<dd>768 (nomic-embed-text)</dd>
|
||
<dt>Similarity</dt>
|
||
<dd>Cosine</dd>
|
||
<dt>RAG strategy</dt>
|
||
<dd>Hybrid vector + keyword<br>Reciprocal rank fusion</dd>
|
||
<dt>Private boost</dt>
|
||
<dd>1.5×</dd>
|
||
<dt>Temporal mode</dt>
|
||
<dd>legal_conservative</dd>
|
||
<dt>Chunk target</dt>
|
||
<dd>600 words · 75 overlap</dd>
|
||
<dt>Vector DB</dt>
|
||
<dd>Qdrant on Colin Docker<br><code>10.0.2.10:6333</code></dd>
|
||
<dt>Hybrid search</dt>
|
||
<dd>Azure AI Search<br><code>bnl-legal-search</code><br>West Europe · Basic SKU</dd>
|
||
</dl>
|
||
<?php

// Take everything emitted since ob_start() (the "Corpus health" sidebar markup)
// out of the buffer and keep it as a string.
// NOTE(review): presumably includes/layout.php renders this in place of its
// default reasoning panel — confirm against layout.php.
$reasoningPanelOverride = ob_get_clean();

require_once __DIR__ . '/includes/layout.php';

?>
|
||
|
||
<div class="corpus-stats-bar" id="corpusStatsBar">
|
||
<div class="corpus-stat" id="statChunks">
|
||
<span class="corpus-stat__value is-loading">—</span>
|
||
<span class="corpus-stat__label">Indexed passages</span>
|
||
</div>
|
||
<div class="corpus-stat" id="statDocs">
|
||
<span class="corpus-stat__value is-loading">—</span>
|
||
<span class="corpus-stat__label">Source documents</span>
|
||
</div>
|
||
<div class="corpus-stat" id="statSources">
|
||
<span class="corpus-stat__value is-loading">—</span>
|
||
<span class="corpus-stat__label">Active scrapers</span>
|
||
</div>
|
||
<div class="corpus-stat" id="statUpdated">
|
||
<span class="corpus-stat__value is-loading">—</span>
|
||
<span class="corpus-stat__label">Last ingested</span>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- COVERAGE -->
|
||
<div class="corpus-section">
|
||
<p class="eyebrow">Coverage</p>
|
||
<h3 class="corpus-section__title">Legal categories</h3>
|
||
<div class="corpus-categories" id="corpusCategories">
|
||
<div class="category-card" data-category="family-law">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">⚖</span>
|
||
<span class="category-card__count is-loading" id="cat-family-law">—</span>
|
||
</div>
|
||
<h4>Family Law</h4>
|
||
<p>Barneloven, child custody (foreldreansvar), samvær, mediation (mekling), separation and divorce proceedings.</p>
|
||
</div>
|
||
<div class="category-card" data-category="child-welfare">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">🧒</span>
|
||
<span class="category-card__count is-loading" id="cat-child-welfare">—</span>
|
||
</div>
|
||
<h4>Child Welfare</h4>
|
||
<p>Barnevernloven, omsorgsovertakelse, emergency care orders, foster placement, CPS (barnevernet) case law.</p>
|
||
</div>
|
||
<div class="category-card" data-category="labour-law">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">🏭</span>
|
||
<span class="category-card__count is-loading" id="cat-labour-law">—</span>
|
||
</div>
|
||
<h4>Labour Law</h4>
|
||
<p>Arbeidsmiljøloven, collective agreements (tariffavtaler), Arbeidsretten rulings, dismissal, sick leave obligations.</p>
|
||
</div>
|
||
<div class="category-card" data-category="social-welfare">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">🛡</span>
|
||
<span class="category-card__count is-loading" id="cat-social-welfare">—</span>
|
||
</div>
|
||
<h4>Social Welfare</h4>
|
||
<p>NAV guidance on sykepenger, dagpenger, AAP, uføretrygd, alderspensjon, yrkesskade and social assistance.</p>
|
||
</div>
|
||
<div class="category-card" data-category="tax-law">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">📊</span>
|
||
<span class="category-card__count is-loading" id="cat-tax-law">—</span>
|
||
</div>
|
||
<h4>Tax Law</h4>
|
||
<p>Skatteetaten's Skatte-ABC, binding advance rulings (BFU), Skatteklagenemnda decisions, income and capital tax.</p>
|
||
</div>
|
||
<div class="category-card" data-category="administrative-law">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">🏛</span>
|
||
<span class="category-card__count is-loading" id="cat-administrative-law">—</span>
|
||
</div>
|
||
<h4>Administrative Law</h4>
|
||
<p>Sivilombudet reports, Forvaltningsloven, procedural rights, official complaints, Stortinget oversight.</p>
|
||
</div>
|
||
<div class="category-card" data-category="consumer-law">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">🏠</span>
|
||
<span class="category-card__count is-loading" id="cat-consumer-law">—</span>
|
||
</div>
|
||
<h4>Consumer &amp; Housing</h4>
|
||
<p>HTU (rental disputes), Finansklagenemnda, Forbrukertilsynet, Forbrukerrådet, Pakkereisenemnda decisions.</p>
|
||
</div>
|
||
<div class="category-card" data-category="immigration-law">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">🌐</span>
|
||
<span class="category-card__count is-loading" id="cat-immigration-law">—</span>
|
||
</div>
|
||
<h4>Immigration &amp; International</h4>
|
||
<p>UNE (Utlendingsnemnda) decisions, ECHR Art. 8 family rights, EMD case law, Hague Convention (cross-border child abduction).</p>
|
||
</div>
|
||
<div class="category-card" data-category="government-documents">
|
||
<div class="category-card__top">
|
||
<span class="category-card__icon" aria-hidden="true">📄</span>
|
||
<span class="category-card__count is-loading" id="cat-government-documents">—</span>
|
||
</div>
|
||
<h4>Government Documents</h4>
|
||
<p>NOUer, Stortingsmeldinger, government white papers and regulatory guidance from Regjeringen.no.</p>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- SOURCES TABLE -->
|
||
<div class="corpus-section">
|
||
<p class="eyebrow">Data sources</p>
|
||
<h3 class="corpus-section__title">Active scrapers</h3>
|
||
<div class="corpus-table-wrap">
|
||
<table class="sources-table" id="sourcesTable">
|
||
<thead>
|
||
<tr>
|
||
<th>Source</th>
|
||
<th>Type</th>
|
||
<th>Category</th>
|
||
<th>Lang</th>
|
||
<th>Schedule</th>
|
||
<th>Status</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody id="sourcesTableBody">
|
||
<tr class="sources-skeleton"><td colspan="6">Loading sources…</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- AI STACK -->
|
||
<div class="corpus-section">
|
||
<p class="eyebrow">Software</p>
|
||
<h3 class="corpus-section__title">AI stack</h3>
|
||
<div class="stack-grid">
|
||
|
||
<div class="stack-card">
|
||
<h3>Reasoning LLMs</h3>
|
||
<ul class="stack-list">
|
||
<li><span class="stack-badge stack-badge--azure">Azure</span> <strong>gpt-4o-mini</strong> <span class="stack-star">★ default</span> — fast, cost-efficient</li>
|
||
<li><span class="stack-badge stack-badge--azure">Azure</span> <strong>gpt-4o</strong> — highest quality</li>
|
||
<li><span class="stack-badge stack-badge--gpu">GPU</span> <strong>qwen2.5:14b</strong> — local, private</li>
|
||
<li><span class="stack-badge stack-badge--gpu">GPU</span> <strong>qwen3:14b</strong> — reasoning mode</li>
|
||
<li><span class="stack-badge stack-badge--gpu">GPU</span> <strong>dbn-legal-agent</strong> — Norwegian law fine-tune (QLoRA on qwen2.5:7b, NorwAI-24B distillation)</li>
|
||
</ul>
|
||
<p class="stack-note">All routed via LiteLLM on Colin · <code>10.0.1.10:4000</code></p>
|
||
</div>
|
||
|
||
<div class="stack-card">
|
||
<h3>Transcription</h3>
|
||
<ul class="stack-list">
|
||
<li><span class="stack-badge stack-badge--gpu">GPU</span> <strong>Whisper large-v3</strong> <span class="stack-star">★ primary</span><br>Cuttlefish · RTX 3060 12 GB VRAM</li>
|
||
<li><span class="stack-badge stack-badge--api">API</span> OpenAI Whisper API</li>
|
||
<li><span class="stack-badge stack-badge--azure">Azure</span> AI Speech <code>nb-NO</code> (Norway East)</li>
|
||
</ul>
|
||
<p class="stack-note">Speaker diarization · VAD silence filter · beam size 5 · vocabulary presets (barnerett, mediation)</p>
|
||
</div>
|
||
|
||
<div class="stack-card">
|
||
<h3>Embeddings</h3>
|
||
<ul class="stack-list">
|
||
<li><strong>nomic-embed-text</strong> — 768-dim dense vectors</li>
|
||
<li>Ollama on Chloe <code>10.0.1.11:11434</code></li>
|
||
<li>Cosine similarity in Qdrant</li>
|
||
</ul>
|
||
<p class="stack-note">All documents chunked and embedded before indexing; chunks stored in both Qdrant (vector) and MariaDB (keyword fallback)</p>
|
||
</div>
|
||
|
||
<div class="stack-card">
|
||
<h3>Vector &amp; Hybrid Search</h3>
|
||
<ul class="stack-list">
|
||
<li><strong>Qdrant</strong> <code>bnl_chunks</code> · ~220 K vectors<br>Colin Docker · <code>10.0.2.10:6333</code></li>
|
||
<li><strong>Azure AI Search</strong> <code>bnl-legal-search</code><br>Basic SKU · West Europe · hybrid keyword + semantic</li>
|
||
<li>Reciprocal rank fusion (vector + keyword)</li>
|
||
<li>Private corpus boosted 1.5×</li>
|
||
</ul>
|
||
</div>
|
||
|
||
<div class="stack-card">
|
||
<h3>Chunking pipeline</h3>
|
||
<ul class="stack-list">
|
||
<li>Heading-aware semantic splitting</li>
|
||
<li>600-word target · 75-word overlap</li>
|
||
<li>50-word minimum chunk</li>
|
||
<li>SHA-256 deduplication</li>
|
||
<li>PDF, DOCX, HTML text extraction</li>
|
||
<li>Temporal metadata (valid_from / valid_until)</li>
|
||
</ul>
|
||
<p class="stack-note">Legal temporal reranking: <code>legal_conservative</code> — surfaces current versions first</p>
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
<!-- DATA PIPELINE -->
|
||
<div class="corpus-section">
|
||
<p class="eyebrow">How it works</p>
|
||
<h3 class="corpus-section__title">Ingestion pipeline</h3>
|
||
<div class="pipeline-flow" role="list" aria-label="Data pipeline steps">
|
||
<div class="pipeline-step" role="listitem">
|
||
<span class="pipeline-step__icon">🌐</span>
|
||
<span>Source</span>
|
||
<small>gov websites, APIs, PDFs</small>
|
||
</div>
|
||
<div class="pipeline-arrow" aria-hidden="true"></div>
|
||
<div class="pipeline-step" role="listitem">
|
||
<span class="pipeline-step__icon">🕷</span>
|
||
<span>Scraper</span>
|
||
<small>HTTP / API / PDF</small>
|
||
</div>
|
||
<div class="pipeline-arrow" aria-hidden="true"></div>
|
||
<div class="pipeline-step" role="listitem">
|
||
<span class="pipeline-step__icon">📝</span>
|
||
<span>Text extract</span>
|
||
<small>PDF, DOCX, HTML</small>
|
||
</div>
|
||
<div class="pipeline-arrow" aria-hidden="true"></div>
|
||
<div class="pipeline-step" role="listitem">
|
||
<span class="pipeline-step__icon">✂</span>
|
||
<span>TextChunker</span>
|
||
<small>600w · 75w overlap</small>
|
||
</div>
|
||
<div class="pipeline-arrow" aria-hidden="true"></div>
|
||
<div class="pipeline-step" role="listitem">
|
||
<span class="pipeline-step__icon">🔢</span>
|
||
<span>Embed</span>
|
||
<small>nomic · 768-dim</small>
|
||
</div>
|
||
<div class="pipeline-arrow" aria-hidden="true"></div>
|
||
<div class="pipeline-step" role="listitem">
|
||
<span class="pipeline-step__icon">⚡</span>
|
||
<span>Qdrant</span>
|
||
<small>cosine upsert</small>
|
||
</div>
|
||
<div class="pipeline-arrow" aria-hidden="true"></div>
|
||
<div class="pipeline-step" role="listitem">
|
||
<span class="pipeline-step__icon">🤖</span>
|
||
<span>LiteLLM</span>
|
||
<small>RAG + LLM</small>
|
||
</div>
|
||
<div class="pipeline-arrow" aria-hidden="true"></div>
|
||
<div class="pipeline-step pipeline-step--end" role="listitem">
|
||
<span class="pipeline-step__icon">🔍</span>
|
||
<span>Your tool</span>
|
||
<small>Ask, Search, Research…</small>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
<script>
|
||
(function () {
|
||
/**
 * Format a count for display using English thousands separators.
 *
 * @param {*} n Number or numeric string; null/undefined mean "no value".
 * @returns {string} e.g. "1,234,567", or an em dash when the value is
 *                   missing or cannot be read as a finite number.
 */
function fmt(n) {
    if (n === null || n === undefined) return '—';
    const num = Number(n);
    // Without this guard a non-numeric value is rendered as the text "NaN".
    if (!Number.isFinite(num)) return '—';
    return num.toLocaleString('en');
}
|
||
|
||
/**
 * Format a date/time value as a short British-style date, e.g. "5 Mar 2024".
 *
 * @param {*} s Date string (or anything `new Date()` accepts).
 * @returns {string|*} The formatted date; an em dash when `s` is falsy; the
 *                     input itself when it cannot be parsed as a date.
 */
function fmtDate(s) {
    if (!s) return '—';
    // "YYYY-MM-DD HH:MM[:SS]" (SQL style) is not ISO 8601 and is rejected by
    // some engines. Swapping the space for "T" keeps the same local-time
    // meaning while making it parseable everywhere.
    const input = typeof s === 'string'
        ? s.replace(/^(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2})/, '$1T$2')
        : s;
    const d = new Date(input);
    // `new Date()` does not throw on bad input; it yields an Invalid Date,
    // which would otherwise be displayed as the text "Invalid Date".
    if (Number.isNaN(d.getTime())) return s;
    return d.toLocaleDateString('en-GB', { day: 'numeric', month: 'short', year: 'numeric' });
}
|
||
|
||
// Display label and badge CSS class for each `authority_type` value that the
// sources table knows how to render. Unknown types are handled by the caller.
const authorityLabels = (() => {
    const badge = (label, cls) => ({ label, cls });
    return {
        case_law:   badge('Case law',   'badge--teal'),
        guidance:   badge('Guidance',   'badge--amber'),
        report:     badge('Report',     'badge--muted'),
        ombudsman:  badge('Ombudsman',  'badge--muted'),
        tribunal:   badge('Tribunal',   'badge--coral'),
        regulatory: badge('Regulatory', 'badge--coral'),
        law:        badge('Statute',    'badge--teal'),
        treaty:     badge('Treaty',     'badge--muted'),
    };
})();
|
||
|
||
// Human-readable names for the scraper `schedule` values shown in the
// "Schedule" column of the sources table.
const scheduleLabels = {
    'daily':   'Daily',
    'weekly':  'Weekly',
    'monthly': 'Monthly',
    'manual':  'Manual',
};
|
||
|
||
// Maps every category slug the stats payload may contain to the id of the
// count badge that displays it. Several slugs can share one badge (hyphen and
// underscore spellings, plus related categories folded into a broader card).
const catIds = (() => {
    const slugsByBadge = {
        'cat-family-law':           ['family-law', 'family_law'],
        'cat-child-welfare':        ['child-welfare', 'child_welfare'],
        'cat-labour-law':           ['labour-law', 'labour_law'],
        'cat-social-welfare':       ['social-welfare', 'social_welfare'],
        'cat-tax-law':              ['tax-law', 'tax_law'],
        'cat-administrative-law':   ['administrative-law', 'administrative_law',
                                     'case-law', 'victim-compensation', 'procurement-law'],
        'cat-consumer-law':         ['consumer-law', 'consumer_law',
                                     'tenancy-law', 'financial-law'],
        'cat-immigration-law':      ['immigration-law', 'immigration_law'],
        'cat-government-documents': ['government-documents', 'government_documents'],
    };

    const bySlug = {};
    for (const [badgeId, slugs] of Object.entries(slugsByBadge)) {
        for (const slug of slugs) {
            bySlug[slug] = badgeId;
        }
    }
    return bySlug;
})();
|
||
|
||
/**
 * Mark a placeholder element as populated by dropping its `is-loading` class.
 *
 * @param {Element} el Element whose loading state should be cleared.
 */
function setLoaded(el) {
    const { classList } = el;
    classList.remove('is-loading');
}
|
||
|
||
// Load live corpus statistics and render the headline stat bar, the
// per-category document counts and the sources table.
//
// Every failure path (network error, non-2xx status, invalid JSON, a payload
// with ok !== true, or an exception while rendering) ends in the .catch()
// handler, which clears all loading placeholders instead of leaving them
// stuck in the loading state.
fetch('/api/corpus-stats.php', { credentials: 'same-origin' })
    .then(r => {
        if (!r.ok) throw new Error('corpus-stats: HTTP ' + r.status);
        return r.json();
    })
    .then(data => {
        if (!data || !data.ok) throw new Error('corpus-stats: payload not ok');
        const s = data.stats || {};

        // Own-property lookup, so a key such as "constructor" coming from the
        // API can never resolve to an Object.prototype member.
        const own = (map, key) =>
            Object.prototype.hasOwnProperty.call(map, key) ? map[key] : undefined;

        // --- Headline stats -------------------------------------------------
        const setStat = (containerSelector, text) => {
            const el = document.querySelector(containerSelector + ' .corpus-stat__value');
            if (el) { el.textContent = text; setLoaded(el); }
        };
        setStat('#statChunks',  fmt(s.total_chunks));
        setStat('#statDocs',    fmt(s.total_docs));
        setStat('#statSources', fmt(s.active_sources));
        setStat('#statUpdated', fmtDate(s.last_updated));

        // --- Category counts ------------------------------------------------
        // Several slugs share one badge, so sum numerically first and format
        // once. Re-reading the badge text to accumulate breaks as soon as a
        // total reaches 1,000: parseInt("1,234", 10) === 1.
        const totals = new Map();
        (Array.isArray(s.by_category) ? s.by_category : []).forEach(row => {
            if (!row) return;
            const slug = String(row.category ?? '');
            // Fall back to the hyphenated spelling for underscore variants
            // that have no explicit entry in catIds.
            const elId = own(catIds, slug) || own(catIds, slug.replace(/_/g, '-'));
            if (!elId) return;
            const count = parseInt(row.doc_count, 10);
            if (!Number.isFinite(count)) return;
            totals.set(elId, (totals.get(elId) || 0) + count);
        });
        totals.forEach((total, elId) => {
            const el = document.getElementById(elId);
            if (!el) return;
            el.textContent = fmt(total);
            setLoaded(el);
        });
        // Categories absent from the payload have no documents.
        document.querySelectorAll('.category-card__count.is-loading').forEach(el => {
            el.textContent = '0';
            setLoaded(el);
        });

        // --- Sources table --------------------------------------------------
        const tbody = document.getElementById('sourcesTableBody');
        if (!tbody) return;

        // Rows are built with DOM APIs (text nodes, never innerHTML) so no
        // API-supplied value can be interpreted as markup.
        const makeEl = (tag, className, content) => {
            const el = document.createElement(tag);
            if (className) el.className = className;
            if (content !== undefined) el.append(content);
            return el;
        };

        // Only http(s) targets become links; this keeps javascript: and other
        // script-capable schemes out of href.
        const httpUrl = raw => {
            try {
                const u = new URL(String(raw), window.location.href);
                return (u.protocol === 'http:' || u.protocol === 'https:') ? u.href : null;
            } catch {
                return null;
            }
        };

        const rows = document.createDocumentFragment();
        (Array.isArray(data.sources) ? data.sources : []).forEach(src => {
            if (!src) return;

            const auth = own(authorityLabels, String(src.authority_type ?? ''))
                || { label: src.authority_type || '—', cls: 'badge--muted' };
            const sched = own(scheduleLabels, String(src.schedule ?? ''))
                || src.schedule || 'Manual';
            const lang = src.language === 'no' ? '🇳🇴'
                : src.language === 'en' ? '🇬🇧'
                : (src.language || '—');
            // NOTE(review): the type of is_active is not visible here. If the
            // API sends DB values as strings, "0" would be truthy, so treat
            // the usual string spellings of false as inactive too.
            const isActive = Boolean(src.is_active)
                && src.is_active !== '0'
                && src.is_active !== 'false';

            const name = String(src.name ?? '');
            const href = src.url ? httpUrl(src.url) : null;
            let nameNode = name;
            if (href) {
                nameNode = makeEl('a', '', name);
                nameNode.href = href;
                nameNode.target = '_blank';
                nameNode.rel = 'noopener';
            }

            const tr = document.createElement('tr');
            tr.append(
                makeEl('td', 'source-name', nameNode),
                makeEl('td', '', makeEl('span', 'source-badge ' + auth.cls, String(auth.label))),
                makeEl('td', '', makeEl('span', 'source-cat', String(src.category || '—'))),
                makeEl('td', '', String(lang)),
                makeEl('td', '', String(sched)),
                makeEl('td', '', makeEl(
                    'span',
                    isActive ? 'status-active' : 'status-inactive',
                    isActive ? '● Active' : '○ Inactive'
                ))
            );
            rows.appendChild(tr);
        });

        if (!rows.hasChildNodes()) {
            // Show an explicit empty state rather than a blank table body.
            const emptyCell = makeEl('td', '', 'No sources available.');
            emptyCell.colSpan = 6;
            rows.appendChild(makeEl('tr', '', emptyCell));
        }

        tbody.textContent = '';
        tbody.appendChild(rows);
    })
    .catch(err => {
        console.error('Failed to load corpus stats:', err);

        // Clear every placeholder that may still be in its loading state.
        document
            .querySelectorAll('.corpus-stat__value, .category-card__count.is-loading')
            .forEach(el => {
                el.textContent = '—';
                el.classList.remove('is-loading');
            });

        // Replace the "Loading sources…" row if it is still showing.
        const skeletonCell = document.querySelector('#sourcesTableBody .sources-skeleton td');
        if (skeletonCell) skeletonCell.textContent = 'Sources unavailable.';
    });
|
||
|
||
/**
 * Escape a value for safe insertion into HTML text or a quoted attribute.
 *
 * @param {*} s Value to escape; null/undefined become the empty string.
 * @returns {string} The value as a string with &, <, >, " and ' replaced by
 *                   their character references.
 */
function escHtml(s) {
    return String(s ?? '')
        // "&" must be handled first so the entities produced below are not
        // themselves re-escaped.
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}
|
||
})();
|
||
</script>
|
||
|
||
<?php
// Shared page footer. Presumably closes the markup opened by
// includes/layout.php — TODO confirm against layout_footer.php.
require_once __DIR__ . '/includes/layout_footer.php'; ?>
|