From 464b8572d31ff41baaaf8841637b5e7e2311dc6e Mon Sep 17 00:00:00 2001
From: davegilligan
Date: Fri, 15 May 2026 13:32:15 +0200
Subject: [PATCH] Wire Azure AI Search into dobetternorge-tools
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

health.php: Add azure_search check — calls /$count endpoint and reports
doc count in the index. Reads DBN_AZURE_SEARCH_{ENDPOINT,KEY,INDEX}.

corpus-search.php: Add azure mode — semantic + vector hybrid search via
Azure AI Search bnl-legal-v2. Embeds query with LiteLLM nomic-embed-text;
expands keepCats to include government-policy, health-law, social-services,
labour-law, immigration (previously blocked by contamination workaround,
now safe to include).

Co-Authored-By: Claude Sonnet 4.6
---
 api/corpus-search.php | 104 ++++++++++++++++++++++++++++++++++++++++-
 api/health.php        |  40 ++++++++++++++++
 2 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/api/corpus-search.php b/api/corpus-search.php
index 6155708..b2ecee1 100644
--- a/api/corpus-search.php
+++ b/api/corpus-search.php
@@ -9,7 +9,7 @@ dbnToolsRequireAuth();
 $input = dbnToolsJsonInput(4000);
 $query = trim(dbnToolsString($input, 'query', 1000));
 $rawMode = $input['mode'] ?? 'hybrid';
-$mode = in_array($rawMode, ['hybrid', 'bm25', 'vector'], true) ? $rawMode : 'hybrid';
+$mode = in_array($rawMode, ['hybrid', 'bm25', 'vector', 'azure'], true) ? $rawMode : 'hybrid';
 $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
 $limit = max(1, min(20, (int)($input['limit'] ?? 8)));
 $category = isset($input['category']) && $input['category'] !== '' ? trim((string)$input['category']) : null;
@@ -160,6 +160,108 @@ try {
         dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]);
     }
 
+    // ── AZURE AI SEARCH: semantic + vector via Azure AI Search ────────────────
+    // Sends the query to the configured index as a semantic query, adds a vector
+    // query when the query could be embedded, and maps each returned document
+    // onto the hit array built below.
+    if ($mode === 'azure') {
+        $searchEndpoint = rtrim((string)dbnToolsEnv('DBN_AZURE_SEARCH_ENDPOINT', ''), '/');
+        $searchKey = (string)dbnToolsEnv('DBN_AZURE_SEARCH_KEY', '');
+        $searchIndex = (string)dbnToolsEnv('DBN_AZURE_SEARCH_INDEX', 'bnl-legal-v2');
+
+        if (!$searchEndpoint || !$searchKey) {
+            dbnToolsError('Azure AI Search is not configured on this server.', 503, 'azure_search_not_configured');
+        }
+
+        // Try to embed the query for hybrid (semantic + vector) search. Without a
+        // usable embedding the request is sent without vectorQueries.
+        $vector = null;
+        $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
+        // "?? null" avoids an undefined-offset warning when index 0 is absent.
+        if (!empty($embeddings) && is_array($embeddings[0] ?? null)) {
+            $vector = $embeddings[0];
+        }
+
+        // Expanded keep-list: original 11 + government-policy, health-law,
+        // social-services, labour-law, immigration (unblocked after contamination cleanup)
+        $keepCats = [
+            'child-welfare', 'echr-case-law', 'child-abduction', 'legislation',
+            'anti-discrimination', 'legal', 'children-rights', 'family-law',
+            'civil-litigation', 'patient-rights', 'parliamentary',
+            'government-policy', 'health-law', 'social-services', 'labour-law', 'immigration',
+        ];
+        // OData string literals escape a single quote by doubling it. $category
+        // comes from the request body, so it is escaped before it is embedded in
+        // the filter; otherwise a quote in it could rewrite the filter expression.
+        $odataQuote = static fn(string $value): string => "'" . str_replace("'", "''", $value) . "'";
+        // An explicit category replaces the keep-list filter instead of narrowing it.
+        $catFilter = implode(' or ', array_map(
+            static fn(string $c): string => 'category eq ' . $odataQuote($c),
+            $category !== null ? [$category] : $keepCats,
+        ));
+
+        $payload = [
+            'search' => $query,
+            'top' => $limit,
+            'select' => 'id,chunk_id,content,title,section_title,category,source_url',
+            'queryType' => 'semantic',
+            'semanticConfiguration' => 'bnl-semantic',
+            'filter' => $catFilter,
+        ];
+        if ($vector) {
+            $payload['vectorQueries'] = [[
+                'kind' => 'vector',
+                'vector' => $vector,
+                'fields' => 'content_vector',
+                'k' => $limit,
+            ]];
+        }
+
+        $url = "$searchEndpoint/indexes/" . rawurlencode($searchIndex) . '/docs/search?api-version=2024-05-01-preview';
+        $ch = curl_init($url);
+        curl_setopt_array($ch, [
+            CURLOPT_POST => true,
+            CURLOPT_RETURNTRANSFER => true,
+            CURLOPT_TIMEOUT => 15,
+            CURLOPT_HTTPHEADER => ['Content-Type: application/json', "api-key: $searchKey"],
+            CURLOPT_POSTFIELDS => json_encode($payload, JSON_UNESCAPED_SLASHES),
+        ]);
+        $resp = curl_exec($ch);
+        $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+        $curlErr = curl_error($ch);
+        curl_close($ch);
+
+        if ($curlErr) {
+            dbnToolsError("Azure Search unreachable: $curlErr", 502, 'azure_search_error');
+        }
+        if ($code !== 200) {
+            // Prefer the service's own error message when the body carries one.
+            $errBody = json_decode((string)$resp, true);
+            $errMsg = $errBody['error']['message'] ?? "HTTP $code";
+            dbnToolsError("Azure AI Search error: $errMsg", 502, 'azure_search_error');
+        }
+
+        $data = json_decode((string)$resp, true);
+        // A 200 body that is not JSON, or has no "value" list, yields zero hits.
+        $docs = is_array($data['value'] ?? null) ? $data['value'] : [];
+        $hits = array_map(fn($d) => [
+            'title' => trim(implode(' — ', array_filter([$d['title'] ?? '', $d['section_title'] ?? '']))),
+            'category' => $d['category'] ?? '',
+            'section' => $d['section_title'] ?? null,
+            'excerpt' => mb_substr((string)($d['content'] ?? ''), 0, 600, 'UTF-8'),
+            // Semantic reranker score when present, otherwise the base search score.
+            'score' => round((float)($d['@search.rerankerScore'] ?? $d['@search.score'] ?? 0), 4),
+            'document_id' => null,
+            'chunk_id' => $d['chunk_id'] ?? $d['id'] ?? null,
+            'source_url' => $d['source_url'] ?? null,
+            'language' => null,
+        ], $docs);
+
+        // Drop hits whose source URL contains EXCLUDED_DOMAIN, then reindex the list.
+        $hits = array_values(array_filter($hits, fn($h) => !str_contains($h['source_url'] ?? '', EXCLUDED_DOMAIN)));
+        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'azure', 'query' => $query]);
+    }
+
     dbnToolsError('Unknown search mode.', 422, 'invalid_mode');
 } catch (DbnToolsHttpException $e) {
     throw $e;
diff --git a/api/health.php b/api/health.php
index 94b9692..1e2aa6b 100644
--- a/api/health.php
+++ b/api/health.php
@@ -66,6 +66,46 @@ try {
     $checks['family_legal_subscription'] = ['ok' => false, 'detail' => 'Not checked'];
 }
 
+// Azure AI Search: ask the index for its document count via /docs/$count and
+// report the result as the azure_search check.
+$searchEndpoint = rtrim((string)dbnToolsEnv('DBN_AZURE_SEARCH_ENDPOINT', ''), '/');
+$searchKey = (string)dbnToolsEnv('DBN_AZURE_SEARCH_KEY', '');
+// Same default index as the azure mode in corpus-search.php, so the check
+// probes the index that search falls back to when the variable is unset.
+$searchIndex = (string)dbnToolsEnv('DBN_AZURE_SEARCH_INDEX', 'bnl-legal-v2');
+if ($searchEndpoint && $searchKey && $searchIndex) {
+    // Encode the index name as a path segment; single quotes keep "$count" literal.
+    $countUrl = "$searchEndpoint/indexes/" . rawurlencode($searchIndex) . '/docs/$count?api-version=2024-05-01-preview';
+    $ch = curl_init($countUrl);
+    curl_setopt_array($ch, [
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_TIMEOUT => 8,
+        CURLOPT_HTTPHEADER => ["api-key: $searchKey"],
+    ]);
+    $resp = curl_exec($ch);
+    $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+    $curlErr = curl_error($ch);
+    curl_close($ch);
+    // Trim whitespace and UTF-8 BOM bytes so a BOM-prefixed plain-text count still parses.
+    $body = trim((string)$resp, " \t\n\r\0\x0B\xEF\xBB\xBF");
+    $count = is_numeric($body) ? (int)$body : null;
+    if ($code === 200 && $count !== null) {
+        $detail = "$count docs in $searchIndex";
+    } elseif ($code === 200) {
+        $detail = "Unexpected count response from $searchIndex";
+    } elseif ($curlErr !== '') {
+        $detail = "Unreachable: $curlErr";
+    } else {
+        $detail = "HTTP $code";
+    }
+    $checks['azure_search'] = [
+        'ok' => $code === 200 && $count !== null,
+        'detail' => $detail,
+    ];
+} else {
+    $checks['azure_search'] = ['ok' => false, 'detail' => 'Azure Search env vars not configured'];
+}
+
 $logPath = dbnToolsMetadataLogPath();
 $dir = dirname($logPath);
 $checks['metadata_log'] = [