662fbf7d6d
Generalize the family-locked legal tools into caveauAI persona profiles (client 57 chat profiles, resolved in-process via the chat_profiles bridge). Each tool accepts an optional `profile` slug that scopes the corpus package(s), search method, system prompt and synthesis model; omitting it falls back to the family-legal package so existing behaviour is unchanged. - dbnToolsResolvePersona / dbnToolsListPersonas / dbnToolsBootChatProfiles in bootstrap.php; new api/personas.php + dbn.list_personas MCP tool. - LegalTools search/ask/corpusContextForSummarize and the BvjAnalyzer / LegalAnalysis / translate paths take the persona's packages + prompt + model. - Persona <select> on ask/search/summarize (populated from api/personas.php). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
308 lines
14 KiB
PHP
308 lines
14 KiB
PHP
<?php
|
|
declare(strict_types=1);
|
|
|
|
require_once __DIR__ . '/../includes/LegalTools.php';
|
|
|
|
// Endpoint guard: only authenticated POST requests are served.
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Decode the JSON request body. NOTE(review): 4000 and 1000 look like size /
// length caps enforced by the helpers — confirm against their definitions.
$input = dbnToolsJsonInput(4000);
$query = trim(dbnToolsString($input, 'query', 1000));

// Any value outside the four supported modes silently falls back to 'hybrid'.
$rawMode = $input['mode'] ?? 'hybrid';
$mode = in_array($rawMode, ['hybrid', 'bm25', 'vector', 'azure'], true) ? $rawMode : 'hybrid';

$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');

// Result count is clamped to 1..20 and defaults to 8.
$limit = max(1, min(20, (int)($input['limit'] ?? 8)));

// Optional category filter. The value is trimmed BEFORE the emptiness check so
// a whitespace-only category means "no filter" rather than a filter on the
// empty string, and non-scalar values (arrays/objects) are ignored instead of
// being cast to a bogus string.
$category = null;
if (isset($input['category']) && is_scalar($input['category'])) {
    $trimmedCategory = trim((string)$input['category']);
    if ($trimmedCategory !== '') {
        $category = $trimmedCategory;
    }
}

// Optional persona slug taken from the `profile` field; null when the field
// is missing, not a string, or blank.
$persona = (isset($input['profile']) && is_string($input['profile']) && trim($input['profile']) !== '')
    ? trim($input['profile'])
    : null;

// Hits whose source_url contains this domain are removed from every mode's results.
const EXCLUDED_DOMAIN = 'dobetternorge.no';

// Reject queries shorter than three characters (counted as UTF-8 characters, not bytes).
if (mb_strlen($query, 'UTF-8') < 3) {
    dbnToolsError('Query must be at least 3 characters.', 422, 'query_too_short');
}
|
|
|
|
try {
|
|
// ── HYBRID: delegate to the existing RAG pipeline ──────────────────────
if ($mode === 'hybrid') {
    $service = new DbnLegalToolsService();
    $searchResult = $service->search($query, $language, $limit, 'disabled', null, 'both', $persona);

    // Discard hits served from the excluded domain before reshaping them.
    $allowedHits = array_filter(
        $searchResult['hits'] ?? [],
        static fn($hit) => !str_contains($hit['source_url'] ?? '', EXCLUDED_DOMAIN)
    );

    // Project each remaining service hit onto the response shape shared by
    // all search modes. The hybrid branch always reports language as null.
    $hits = array_values(array_map(
        static function ($hit) {
            return [
                'title'       => $hit['title'] ?? '',
                'category'    => $hit['category'] ?? '',
                'section'     => $hit['section'] ?? null,
                'excerpt'     => $hit['excerpt'] ?? ($hit['chunk_text'] ?? ''),
                'full_text'   => $hit['full_text'] ?? $hit['chunk_text'] ?? $hit['excerpt'] ?? '',
                'score'       => $hit['score'] ?? null,
                'document_id' => $hit['document_id'] ?? null,
                'chunk_id'    => $hit['chunk_id'] ?? null,
                'source_url'  => $hit['source_url'] ?? null,
                'language'    => null,
            ];
        },
        $allowedHits
    ));

    dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'hybrid', 'query' => $query]);
}
|
|
|
|
// ── BM25: FULLTEXT with LIKE fallback ───────────────────────────────────
if ($mode === 'bm25') {
    // Only this branch queries the RAG database, so the handle is acquired
    // here; the vector and azure branches never use it and no longer depend
    // on the database being reachable.
    $ragDb = dbnToolsRagDb();

    $catClause = $category !== null ? ' AND d.category = ?' : '';
    $excludeLike = '%' . EXCLUDED_DOMAIN . '%';

    // Wraps text in %…% for a "contains" LIKE match. The backslash is escaped
    // together with % and _ because it is MySQL's default LIKE escape
    // character; left raw, a backslash in user input would change how the
    // following pattern character is interpreted.
    $likeContains = static fn(string $text): string => '%' . addcslashes($text, '\\%_') . '%';

    // Exact-identifier routing: the FULLTEXT tokenizer drops "§ 4-12" / "Art. 8"
    // to stopword fragments, so a citation query never matches. Route those to a
    // verbatim LIKE lookup and pin the hits ahead of the fuzzy BM25 results.
    $exactHits = [];
    $exactChunkIds = [];
    $citationAtoms = DbnLegalToolsService::citationAtoms($query);
    if (!empty($citationAtoms)) {
        $atomClauses = [];
        $atomParams = [1]; // first placeholder: d.corpus_id (fixed to corpus 1)
        foreach ($citationAtoms as $atom) {
            $like = $likeContains((string)$atom);
            $atomClauses[] = '(c.content LIKE ? OR c.section_title LIKE ?)';
            $atomParams[] = $like;
            $atomParams[] = $like;
        }
        $atomParams[] = $excludeLike;

        // $limit is an integer clamped to 1..20 during input parsing, so
        // interpolating it into the LIMIT clause is safe.
        $exactSql = "SELECT d.id AS document_id, d.title, d.category,
                            d.source_url, c.id AS chunk_id, c.content AS excerpt,
                            c.section_title AS section, d.language, 1.0 AS score
                     FROM chunks c
                     JOIN documents d ON c.document_id = d.id
                     WHERE d.corpus_id = ? AND d.status = 'ready'
                       AND (" . implode(' OR ', $atomClauses) . ")
                       AND d.source_url NOT LIKE ?
                       $catClause
                     LIMIT $limit";
        $exactParams = $atomParams;
        if ($category !== null) {
            $exactParams[] = $category;
        }

        try {
            $stmt = $ragDb->prepare($exactSql);
            $stmt->execute($exactParams);
            foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $r) {
                $exactHits[] = $r;
                if (isset($r['chunk_id'])) {
                    $exactChunkIds[(int)$r['chunk_id']] = true;
                }
            }
        } catch (Throwable $e) {
            // Non-fatal — fall through to fuzzy BM25.
        }
    }

    // Try FULLTEXT index first
    try {
        $sql = "SELECT d.id AS document_id, d.title, d.category,
                       d.source_url, c.id AS chunk_id, c.content AS excerpt,
                       c.section_title AS section, d.language,
                       MATCH(c.content) AGAINST (? IN BOOLEAN MODE) AS score
                FROM chunks c
                JOIN documents d ON c.document_id = d.id
                WHERE d.corpus_id = ? AND d.status = 'ready'
                  AND MATCH(c.content) AGAINST (? IN BOOLEAN MODE) > 0
                  AND d.source_url NOT LIKE ?
                  $catClause
                ORDER BY score DESC
                LIMIT $limit";
        $params = [$query, 1, $query, $excludeLike];
        if ($category !== null) {
            $params[] = $category;
        }
        $stmt = $ragDb->prepare($sql);
        $stmt->execute($params);
        $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
    } catch (Throwable $e) {
        // FULLTEXT index absent — use LIKE. Any other failure of the FULLTEXT
        // statement lands here as well and gets the same fallback.
        $like = $likeContains($query);
        $sql = "SELECT d.id AS document_id, d.title, d.category,
                       d.source_url, c.id AS chunk_id, c.content AS excerpt,
                       c.section_title AS section, d.language,
                       0.25 AS score
                FROM chunks c
                JOIN documents d ON c.document_id = d.id
                WHERE d.corpus_id = ? AND d.status = 'ready'
                  AND (c.content LIKE ? OR d.title LIKE ?)
                  AND d.source_url NOT LIKE ?
                  $catClause
                ORDER BY (d.title LIKE ?) DESC
                LIMIT $limit";
        $params = [1, $like, $like, $excludeLike];
        if ($category !== null) {
            $params[] = $category;
        }
        $params[] = $like; // ORDER BY placeholder: title matches sort first
        $stmt = $ragDb->prepare($sql);
        $stmt->execute($params);
        $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    // Exact identifier hits lead; drop fuzzy rows that duplicate them.
    if (!empty($exactChunkIds)) {
        $rows = array_values(array_filter($rows, fn($r) => empty($exactChunkIds[(int)($r['chunk_id'] ?? 0)])));
    }
    $rows = array_slice(array_merge($exactHits, $rows), 0, $limit);

    // Shape rows for the response: excerpt is capped at 600 characters while
    // full_text carries the whole chunk; exact_match flags the pinned rows.
    $hits = array_map(fn($r) => [
        'title'       => $r['title'] ?? '',
        'category'    => $r['category'] ?? '',
        'section'     => $r['section'] ?? null,
        'excerpt'     => mb_substr((string)($r['excerpt'] ?? ''), 0, 600, 'UTF-8'),
        'full_text'   => (string)($r['excerpt'] ?? ''),
        'score'       => isset($r['score']) ? round((float)$r['score'], 4) : null,
        'document_id' => (int)$r['document_id'],
        'chunk_id'    => isset($r['chunk_id']) ? (int)$r['chunk_id'] : null,
        'source_url'  => $r['source_url'] ?? null,
        'language'    => $r['language'] ?? null,
        'exact_match' => !empty($exactChunkIds[(int)($r['chunk_id'] ?? 0)]),
    ], $rows);

    dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'bm25', 'query' => $query]);
}
|
|
|
|
// ── VECTOR: embed → Qdrant ─────────────────────────────────────────────
if ($mode === 'vector') {
    // Embed the query; without a usable vector this mode cannot run at all.
    $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
    if (empty($embeddings) || !is_array($embeddings[0] ?? null)) {
        dbnToolsError('Embedding failed — vector search unavailable.', 502, 'embed_error');
    }

    // Restrict the search to corpus 1, plus the optional category.
    $filter = ['must' => [['key' => 'corpus_id', 'match' => ['value' => 1]]]];
    if ($category !== null) {
        $filter['must'][] = ['key' => 'category', 'match' => ['value' => $category]];
    }

    $qdrantPayload = json_encode([
        'vector'       => $embeddings[0],
        'limit'        => $limit,
        'with_payload' => true,
        'filter'       => $filter,
    ]);

    $ch = curl_init('http://10.0.2.10:6333/collections/bnl_chunks/points/search');
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $qdrantPayload,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
        CURLOPT_TIMEOUT        => 15,
    ]);
    $resp = curl_exec($ch);
    $status = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr = curl_error($ch);
    curl_close($ch);

    if ($resp === false) {
        dbnToolsError('Qdrant unreachable: ' . $curlErr, 502, 'qdrant_error');
    }

    // A non-200 answer is a search failure. Without this check it would be
    // decoded as "no result key" and reported to the caller as zero hits.
    if ($status !== 200) {
        dbnToolsError("Qdrant search failed: HTTP $status", 502, 'qdrant_error');
    }

    // Tolerate a malformed body: anything that is not a list of points is
    // treated as an empty result set.
    $qdrantResult = json_decode((string)$resp, true);
    $points = is_array($qdrantResult) && is_array($qdrantResult['result'] ?? null)
        ? $qdrantResult['result']
        : [];

    $hits = [];
    foreach ($points as $pt) {
        if (!is_array($pt)) {
            continue;
        }
        $p = is_array($pt['payload'] ?? null) ? $pt['payload'] : [];
        $hits[] = [
            'title'       => $p['title'] ?? $p['document_title'] ?? '',
            'category'    => $p['category'] ?? '',
            'section'     => $p['section_title'] ?? null,
            'excerpt'     => mb_substr((string)($p['content'] ?? ''), 0, 600, 'UTF-8'),
            'full_text'   => (string)($p['content'] ?? ''),
            'score'       => round((float)($pt['score'] ?? 0), 4),
            'document_id' => isset($p['document_id']) ? (int)$p['document_id'] : null,
            'chunk_id'    => $pt['id'] ?? null,
            'source_url'  => $p['source_url'] ?? null,
            'language'    => $p['language'] ?? null,
        ];
    }

    // Remove hits served from the excluded domain.
    $hits = array_values(array_filter($hits, fn($h) => !str_contains($h['source_url'] ?? '', EXCLUDED_DOMAIN)));

    dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]);
}
|
|
|
|
// ── AZURE AI SEARCH: semantic + vector via Azure AI Search ────────────────
if ($mode === 'azure') {
    $searchEndpoint = rtrim((string)dbnToolsEnv('DBN_AZURE_SEARCH_ENDPOINT', ''), '/');
    $searchKey      = (string)dbnToolsEnv('DBN_AZURE_SEARCH_KEY', '');
    $searchIndex    = (string)dbnToolsEnv('DBN_AZURE_SEARCH_INDEX', 'bnl-legal-v2');

    if (!$searchEndpoint || !$searchKey) {
        dbnToolsError('Azure AI Search is not configured on this server.', 503, 'azure_search_not_configured');
    }

    // Try to embed the query for hybrid (semantic + vector) search. A failed
    // embedding is not fatal here: the request is sent without vectorQueries.
    $vector = null;
    $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
    if (!empty($embeddings) && is_array($embeddings[0] ?? null)) {
        $vector = $embeddings[0];
    }

    // Expanded keep-list: original 11 + government-policy, health-law,
    // social-services, labour-law, immigration (unblocked after contamination cleanup)
    $keepCats = [
        'child-welfare', 'echr-case-law', 'child-abduction', 'legislation',
        'anti-discrimination', 'legal', 'children-rights', 'family-law',
        'civil-litigation', 'patient-rights', 'parliamentary',
        'government-policy', 'health-law', 'social-services', 'labour-law', 'immigration',
    ];
    $catFilter = implode(' or ', array_map(fn($c) => "category eq '$c'", $keepCats));
    if ($category !== null) {
        // $category comes straight from the request body. OData string
        // literals escape a single quote by doubling it; without this a value
        // containing ' could break out of the literal and rewrite the filter.
        $catFilter = "category eq '" . str_replace("'", "''", $category) . "'";
    }

    $payload = [
        'search'                => $query,
        'top'                   => $limit,
        'select'                => 'id,chunk_id,content,title,section_title,category,source_url',
        'queryType'             => 'semantic',
        'semanticConfiguration' => 'bnl-semantic',
        'filter'                => $catFilter,
    ];
    if ($vector) {
        $payload['vectorQueries'] = [[
            'kind'   => 'vector',
            'vector' => $vector,
            'fields' => 'content_vector',
            'k'      => $limit,
        ]];
    }

    $url = "$searchEndpoint/indexes/" . rawurlencode($searchIndex) . '/docs/search?api-version=2024-05-01-preview';
    $ch = curl_init($url);
    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 15,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json', "api-key: $searchKey"],
        CURLOPT_POSTFIELDS     => json_encode($payload, JSON_UNESCAPED_SLASHES),
    ]);
    $resp    = curl_exec($ch);
    $code    = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr = curl_error($ch);
    curl_close($ch);

    // Transport failures and non-200 answers both surface as a 502 to the caller.
    if ($curlErr) {
        dbnToolsError("Azure Search unreachable: $curlErr", 502, 'azure_search_error');
    }
    if ($code !== 200) {
        $errBody = json_decode((string)$resp, true);
        $errMsg  = $errBody['error']['message'] ?? "HTTP $code";
        dbnToolsError("Azure AI Search error: $errMsg", 502, 'azure_search_error');
    }

    // Shape the returned documents for the response. The title is joined with
    // the section title, the reranker score is preferred over the plain search
    // score, and document_id / language are always null in this mode.
    $data = json_decode((string)$resp, true);
    $hits = array_map(fn($d) => [
        'title'       => trim(implode(' — ', array_filter([$d['title'] ?? '', $d['section_title'] ?? '']))),
        'category'    => $d['category'] ?? '',
        'section'     => $d['section_title'] ?? null,
        'excerpt'     => mb_substr((string)($d['content'] ?? ''), 0, 600, 'UTF-8'),
        'full_text'   => (string)($d['content'] ?? ''),
        'score'       => round((float)($d['@search.rerankerScore'] ?? $d['@search.score'] ?? 0), 4),
        'document_id' => null,
        'chunk_id'    => $d['chunk_id'] ?? $d['id'] ?? null,
        'source_url'  => $d['source_url'] ?? null,
        'language'    => null,
    ], $data['value'] ?? []);

    // Remove hits served from the excluded domain.
    $hits = array_values(array_filter($hits, fn($h) => !str_contains($h['source_url'] ?? '', EXCLUDED_DOMAIN)));

    dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'azure', 'query' => $query]);
}
|
|
|
|
dbnToolsError('Unknown search mode.', 422, 'invalid_mode');
|
|
} catch (DbnToolsHttpException $e) {
|
|
throw $e;
|
|
} catch (Throwable $e) {
|
|
dbnToolsError('Corpus search failed: ' . $e->getMessage(), 500, 'search_error');
|
|
}
|