Wire Azure AI Search into dobetternorge-tools
health.php: Add azure_search check — calls /$count endpoint and
reports doc count in the index. Reads DBN_AZURE_SEARCH_{ENDPOINT,KEY,INDEX}.
corpus-search.php: Add azure mode — semantic + vector hybrid search
via Azure AI Search bnl-legal-v2. Embeds query with LiteLLM
nomic-embed-text; expands keepCats to include government-policy,
health-law, social-services, labour-law, immigration (previously
blocked by contamination workaround, now safe to include).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+86
-1
@@ -9,7 +9,7 @@ dbnToolsRequireAuth();
|
||||
$input = dbnToolsJsonInput(4000);
|
||||
$query = trim(dbnToolsString($input, 'query', 1000));
|
||||
$rawMode = $input['mode'] ?? 'hybrid';
|
||||
$mode = in_array($rawMode, ['hybrid', 'bm25', 'vector'], true) ? $rawMode : 'hybrid';
|
||||
$mode = in_array($rawMode, ['hybrid', 'bm25', 'vector', 'azure'], true) ? $rawMode : 'hybrid';
|
||||
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
|
||||
$limit = max(1, min(20, (int)($input['limit'] ?? 8)));
|
||||
$category = isset($input['category']) && $input['category'] !== '' ? trim((string)$input['category']) : null;
|
||||
@@ -160,6 +160,91 @@ try {
|
||||
dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]);
|
||||
}
|
||||
|
||||
// ── AZURE AI SEARCH: semantic + vector via Azure AI Search ────────────────
|
||||
if ($mode === 'azure') {
    // Azure AI Search mode: semantic-ranked query against the configured index,
    // upgraded to hybrid (semantic + vector) when a query embedding is available.
    $searchEndpoint = rtrim((string)dbnToolsEnv('DBN_AZURE_SEARCH_ENDPOINT', ''), '/');
    $searchKey      = (string)dbnToolsEnv('DBN_AZURE_SEARCH_KEY', '');
    $searchIndex    = (string)dbnToolsEnv('DBN_AZURE_SEARCH_INDEX', 'bnl-legal-v2');

    if (!$searchEndpoint || !$searchKey) {
        dbnToolsError('Azure AI Search is not configured on this server.', 503, 'azure_search_not_configured');
    }

    // Embed the query so the request can carry a vector clause. If no usable
    // embedding comes back the search still runs, just without the vector part.
    // NOTE(review): presumably dbnToolsLiteLLMEmbedBatch() returns an empty list
    // on failure rather than throwing — confirm against its definition.
    $vector = null;
    $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
    if (is_array($embeddings) && is_array($embeddings[0] ?? null)) {
        $vector = $embeddings[0];
    }

    // Expanded keep-list: original 11 + government-policy, health-law,
    // social-services, labour-law, immigration (unblocked after contamination cleanup)
    $keepCats = [
        'child-welfare', 'echr-case-law', 'child-abduction', 'legislation',
        'anti-discrimination', 'legal', 'children-rights', 'family-law',
        'civil-litigation', 'patient-rights', 'parliamentary',
        'government-policy', 'health-law', 'social-services', 'labour-law', 'immigration',
    ];

    if ($category !== null) {
        // $category comes straight from the request body. Inside an OData string
        // literal a single quote is escaped by doubling it; without this a crafted
        // value could terminate the literal and append arbitrary filter clauses.
        $catFilter = "category eq '" . str_replace("'", "''", $category) . "'";
    } else {
        $catFilter = implode(
            ' or ',
            array_map(static fn(string $c): string => "category eq '$c'", $keepCats)
        );
    }

    $payload = [
        'search'                => $query,
        'top'                   => $limit,
        'select'                => 'id,chunk_id,content,title,section_title,category,source_url',
        'queryType'             => 'semantic',
        'semanticConfiguration' => 'bnl-semantic',
        'filter'                => $catFilter,
    ];
    if ($vector) {
        $payload['vectorQueries'] = [[
            'kind'   => 'vector',
            'vector' => $vector,
            'fields' => 'content_vector',
            'k'      => $limit,
        ]];
    }

    $url = "$searchEndpoint/indexes/" . rawurlencode($searchIndex) . '/docs/search?api-version=2024-05-01-preview';
    $ch = curl_init($url);
    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 15,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json', "api-key: $searchKey"],
        CURLOPT_POSTFIELDS     => json_encode($payload, JSON_UNESCAPED_SLASHES),
    ]);
    $resp    = curl_exec($ch);
    $code    = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr = curl_error($ch);
    curl_close($ch);

    // Transport-level failure: curl_exec() yields false and/or an error string.
    if ($resp === false || $curlErr !== '') {
        dbnToolsError("Azure Search unreachable: $curlErr", 502, 'azure_search_error');
    }

    $data = json_decode((string)$resp, true);

    if ($code !== 200) {
        $errMsg = is_array($data) ? ($data['error']['message'] ?? "HTTP $code") : "HTTP $code";
        dbnToolsError("Azure AI Search error: $errMsg", 502, 'azure_search_error');
    }

    // A 200 whose body is not the expected {"value": [...]} document is an
    // upstream fault; report it instead of silently returning zero hits.
    if (!is_array($data) || !is_array($data['value'] ?? null)) {
        dbnToolsError('Azure AI Search error: invalid response body', 502, 'azure_search_error');
    }

    // Map Azure documents onto the hit structure this endpoint responds with.
    $hits = array_map(static fn($d) => [
        'title'       => trim(implode(' — ', array_filter([$d['title'] ?? '', $d['section_title'] ?? '']))),
        'category'    => $d['category'] ?? '',
        'section'     => $d['section_title'] ?? null,
        'excerpt'     => mb_substr((string)($d['content'] ?? ''), 0, 600, 'UTF-8'),
        // Prefer the semantic reranker score; fall back to the base search score.
        'score'       => round((float)($d['@search.rerankerScore'] ?? $d['@search.score'] ?? 0), 4),
        'document_id' => null,
        'chunk_id'    => $d['chunk_id'] ?? $d['id'] ?? null,
        'source_url'  => $d['source_url'] ?? null,
        'language'    => null,
    ], $data['value']);

    // Drop hits whose source URL contains the excluded domain; reindex so the
    // JSON output stays a list.
    $hits = array_values(array_filter(
        $hits,
        static fn($h) => !str_contains($h['source_url'] ?? '', EXCLUDED_DOMAIN)
    ));

    dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'azure', 'query' => $query]);
}
|
||||
|
||||
dbnToolsError('Unknown search mode.', 422, 'invalid_mode');
|
||||
} catch (DbnToolsHttpException $e) {
|
||||
throw $e;
|
||||
|
||||
@@ -66,6 +66,29 @@ try {
|
||||
$checks['family_legal_subscription'] = ['ok' => false, 'detail' => 'Not checked'];
|
||||
}
|
||||
|
||||
// Azure AI Search health check: request the document count of the configured
// index and record the outcome under $checks['azure_search'].
$searchEndpoint = rtrim((string)dbnToolsEnv('DBN_AZURE_SEARCH_ENDPOINT', ''), '/');
$searchKey = (string)dbnToolsEnv('DBN_AZURE_SEARCH_KEY', '');
// Same default index as the azure search mode in corpus-search.php, so the
// health check probes the index that searches actually hit.
$searchIndex = (string)dbnToolsEnv('DBN_AZURE_SEARCH_INDEX', 'bnl-legal-v2');

if ($searchEndpoint && $searchKey && $searchIndex) {
    // The index name is a URL path segment; encode it as the search code does.
    $countUrl = "$searchEndpoint/indexes/" . rawurlencode($searchIndex)
        . '/docs/$count?api-version=2024-05-01-preview';
    $ch = curl_init($countUrl);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 8,
        CURLOPT_HTTPHEADER     => ["api-key: $searchKey"],
    ]);
    $resp    = curl_exec($ch);
    $code    = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $curlErr = curl_error($ch);
    curl_close($ch);

    // The count is returned as plain text. NOTE(review): the service may prefix
    // the body with a UTF-8 byte-order mark, which would defeat is_numeric();
    // stripping it is harmless when it is absent.
    $rawCount = is_string($resp) ? $resp : '';
    if (str_starts_with($rawCount, "\xEF\xBB\xBF")) {
        $rawCount = substr($rawCount, 3);
    }
    $rawCount = trim($rawCount);
    $count = is_numeric($rawCount) ? (int)$rawCount : null;

    $checks['azure_search'] = [
        'ok'     => $code === 200 && $count !== null,
        'detail' => match (true) {
            $resp === false => "Azure Search unreachable: $curlErr",
            $code !== 200   => "HTTP $code",
            $count === null => 'Unexpected count response',
            default         => "$count docs in $searchIndex",
        },
    ];
} else {
    $checks['azure_search'] = ['ok' => false, 'detail' => 'Azure Search env vars not configured'];
}
|
||||
|
||||
$logPath = dbnToolsMetadataLogPath();
|
||||
$dir = dirname($logPath);
|
||||
$checks['metadata_log'] = [
|
||||
|
||||
Reference in New Issue
Block a user