Wire Azure AI Search into dobetternorge-tools

health.php: Add azure_search check — calls /$count endpoint and
  reports doc count in the index. Reads DBN_AZURE_SEARCH_{ENDPOINT,KEY,INDEX}.

corpus-search.php: Add azure mode — semantic + vector hybrid search
  via Azure AI Search bnl-legal-v2. Embeds query with LiteLLM
  nomic-embed-text; expands keepCats to include government-policy,
  health-law, social-services, labour-law, immigration (previously
  blocked by contamination workaround, now safe to include).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-15 13:32:15 +02:00
parent d5e61d656a
commit 464b8572d3
2 changed files with 109 additions and 1 deletions
+86 -1
View File
@@ -9,7 +9,7 @@ dbnToolsRequireAuth();
// Read the JSON request body once. (The 4000 argument is presumably a size cap
// enforced by dbnToolsJsonInput — confirm against the helper.)
$input = dbnToolsJsonInput(4000);
// Free-text query, trimmed of surrounding whitespace.
$query = trim(dbnToolsString($input, 'query', 1000));
// Requested search mode; anything outside the whitelist falls back to 'hybrid'.
$rawMode = $input['mode'] ?? 'hybrid';
$mode = in_array($rawMode, ['hybrid', 'bm25', 'vector', 'azure'], true) ? $rawMode : 'hybrid';
// Language defaults to 'en' before normalisation.
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
// Result count: default 8, clamped to the range 1..20.
$limit = max(1, min(20, (int)($input['limit'] ?? 8)));
// Optional category restriction; a missing or empty value means "no restriction" (null).
$category = isset($input['category']) && $input['category'] !== '' ? trim((string)$input['category']) : null;
@@ -160,6 +160,91 @@ try {
dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]); dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]);
} }
// ── AZURE AI SEARCH: semantic + vector via Azure AI Search ────────────────
if ($mode === 'azure') {
    // Connection settings come from the environment; only the index name has a default.
    $searchEndpoint = rtrim((string)dbnToolsEnv('DBN_AZURE_SEARCH_ENDPOINT', ''), '/');
    $searchKey      = (string)dbnToolsEnv('DBN_AZURE_SEARCH_KEY', '');
    $searchIndex    = (string)dbnToolsEnv('DBN_AZURE_SEARCH_INDEX', 'bnl-legal-v2');
    if (!$searchEndpoint || !$searchKey) {
        dbnToolsError('Azure AI Search is not configured on this server.', 503, 'azure_search_not_configured');
    }

    // Try to embed the query so the request can combine semantic ranking with a
    // vector query. Without a usable embedding the search still runs, semantic-only.
    $vector = null;
    $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
    $firstEmbedding = is_array($embeddings) ? ($embeddings[0] ?? null) : null;
    if (is_array($firstEmbedding) && $firstEmbedding !== []) {
        $vector = $firstEmbedding;
    }

    // Expanded keep-list: original 11 + government-policy, health-law,
    // social-services, labour-law, immigration (unblocked after contamination cleanup)
    $keepCats = [
        'child-welfare', 'echr-case-law', 'child-abduction', 'legislation',
        'anti-discrimination', 'legal', 'children-rights', 'family-law',
        'civil-litigation', 'patient-rights', 'parliamentary',
        'government-policy', 'health-law', 'social-services', 'labour-law', 'immigration',
    ];

    // OData string literals escape a single quote by doubling it. $category comes
    // straight from the request body, so it must be escaped before it is embedded
    // in the filter; otherwise a value containing ' can break or rewrite the filter.
    $odataLiteral = static fn(string $value): string => "'" . str_replace("'", "''", $value) . "'";
    if ($category !== null) {
        // An explicit category replaces the keep-list entirely.
        $catFilter = 'category eq ' . $odataLiteral($category);
    } else {
        $catFilter = implode(' or ', array_map(
            static fn(string $c): string => 'category eq ' . $odataLiteral($c),
            $keepCats
        ));
    }

    $payload = [
        'search'                => $query,
        'top'                   => $limit,
        'select'                => 'id,chunk_id,content,title,section_title,category,source_url',
        'queryType'             => 'semantic',
        'semanticConfiguration' => 'bnl-semantic',
        'filter'                => $catFilter,
    ];
    if ($vector !== null) {
        $payload['vectorQueries'] = [[
            'kind'   => 'vector',
            'vector' => $vector,
            'fields' => 'content_vector',
            'k'      => $limit,
        ]];
    }

    $url = "$searchEndpoint/indexes/" . rawurlencode($searchIndex) . '/docs/search?api-version=2024-05-01-preview';
    $ch = curl_init($url);
    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 15,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json', "api-key: $searchKey"],
        CURLOPT_POSTFIELDS     => json_encode($payload, JSON_UNESCAPED_SLASHES),
    ]);
    $resp    = curl_exec($ch);
    $code    = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr = curl_error($ch);
    curl_close($ch);

    // Transport-level failure (DNS, TLS, timeout, ...).
    if ($curlErr !== '') {
        dbnToolsError("Azure Search unreachable: $curlErr", 502, 'azure_search_error');
    }
    // Any non-200 status is surfaced as a 502, using the service's own message when it sent one.
    if ($code !== 200) {
        $errBody = json_decode((string)$resp, true);
        $errMsg  = is_array($errBody) ? ($errBody['error']['message'] ?? null) : null;
        if (!is_string($errMsg) || $errMsg === '') {
            $errMsg = "HTTP $code";
        }
        dbnToolsError("Azure AI Search error: $errMsg", 502, 'azure_search_error');
    }

    // A 200 with an unparsable or unexpectedly shaped body is treated as "no hits".
    $data = json_decode((string)$resp, true);
    $docs = (is_array($data) && is_array($data['value'] ?? null)) ? $data['value'] : [];
    $docs = array_filter($docs, 'is_array');

    // Map Azure documents onto the hit shape returned by this endpoint.
    $hits = array_map(static fn(array $d): array => [
        'title'       => trim(implode(' — ', array_filter([$d['title'] ?? '', $d['section_title'] ?? '']))),
        'category'    => $d['category'] ?? '',
        'section'     => $d['section_title'] ?? null,
        'excerpt'     => mb_substr((string)($d['content'] ?? ''), 0, 600, 'UTF-8'),
        // Prefer the semantic reranker score; fall back to the base search score.
        'score'       => round((float)($d['@search.rerankerScore'] ?? $d['@search.score'] ?? 0), 4),
        'document_id' => null,
        'chunk_id'    => $d['chunk_id'] ?? $d['id'] ?? null,
        'source_url'  => $d['source_url'] ?? null,
        'language'    => null,
    ], $docs);

    // Drop hits whose source URL contains EXCLUDED_DOMAIN (constant defined outside this hunk).
    // This runs after the service applied `top`, so fewer than $limit hits may be returned.
    $hits = array_values(array_filter(
        $hits,
        static fn(array $h): bool => !str_contains((string)($h['source_url'] ?? ''), EXCLUDED_DOMAIN)
    ));

    dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'azure', 'query' => $query]);
}
dbnToolsError('Unknown search mode.', 422, 'invalid_mode'); dbnToolsError('Unknown search mode.', 422, 'invalid_mode');
} catch (DbnToolsHttpException $e) { } catch (DbnToolsHttpException $e) {
throw $e; throw $e;
+23
View File
@@ -66,6 +66,29 @@ try {
$checks['family_legal_subscription'] = ['ok' => false, 'detail' => 'Not checked']; $checks['family_legal_subscription'] = ['ok' => false, 'detail' => 'Not checked'];
} }
// Health check for Azure AI Search: ask the index for its document count.
// The index name defaults to 'bnl-legal-v2', the same fallback the corpus-search
// azure mode uses, so health reports on the index that search would actually query.
$searchEndpoint = rtrim((string)dbnToolsEnv('DBN_AZURE_SEARCH_ENDPOINT', ''), '/');
$searchKey      = (string)dbnToolsEnv('DBN_AZURE_SEARCH_KEY', '');
$searchIndex    = (string)dbnToolsEnv('DBN_AZURE_SEARCH_INDEX', 'bnl-legal-v2');
if ($searchEndpoint && $searchKey && $searchIndex) {
    // Encode the index name: it is a URL path segment taken from configuration.
    $countUrl = "$searchEndpoint/indexes/" . rawurlencode($searchIndex) . "/docs/\$count?api-version=2024-05-01-preview";
    $ch = curl_init($countUrl);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 8,
        CURLOPT_HTTPHEADER     => ["api-key: $searchKey"],
    ]);
    $resp    = curl_exec($ch);
    $code    = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr = curl_error($ch);
    curl_close($ch);

    // The count arrives as a plain-text integer. Strip a leading UTF-8 BOM
    // defensively so it cannot make a valid count look non-numeric.
    $body = (string)$resp;
    if (strncmp($body, "\xEF\xBB\xBF", 3) === 0) {
        $body = substr($body, 3);
    }
    $body  = trim($body);
    $count = preg_match('/^\d+$/', $body) === 1 ? (int)$body : null;

    if ($curlErr !== '') {
        // Transport failure: no meaningful HTTP status to report.
        $detail = "Unreachable: $curlErr";
    } elseif ($code !== 200) {
        $detail = "HTTP $code";
    } elseif ($count === null) {
        // 200 but the body was not an integer; avoid the misleading " docs in <index>".
        $detail = 'Unexpected $count response';
    } else {
        $detail = "$count docs in $searchIndex";
    }

    $checks['azure_search'] = [
        'ok'     => $code === 200 && $count !== null,
        'detail' => $detail,
    ];
} else {
    $checks['azure_search'] = ['ok' => false, 'detail' => 'Azure Search env vars not configured'];
}
$logPath = dbnToolsMetadataLogPath(); $logPath = dbnToolsMetadataLogPath();
$dir = dirname($logPath); $dir = dirname($logPath);
$checks['metadata_log'] = [ $checks['metadata_log'] = [