endpoint = rtrim($endpoint ?? ($cfg['endpoint'] ?? ''), '/'); $this->adminKey = $adminKey ?? ($cfg['admin_key'] ?? ''); if ($this->endpoint === '' || $this->adminKey === '') { throw new RuntimeException('AzureSearchAdmin: endpoint or admin key not configured.'); } }

    /**
     * Resolve the service endpoint and admin key.
     *
     * Reads /etc/bnl/azure.php when that file is readable; otherwise falls back
     * to the AZURE_SEARCH_ENDPOINT / AZURE_SEARCH_ADMIN_KEY environment variables.
     * A missing endpoint falls back to the built-in default URL; a missing key
     * becomes an empty string.
     *
     * @return array{endpoint: string, admin_key: string}
     */
    private static function loadConfig(): array
    {
        $path = '/etc/bnl/azure.php';
        if (is_readable($path)) {
            $cfg = require $path;

            return [
                'endpoint' => (string)($cfg['SEARCH_ENDPOINT'] ?? 'https://bnl-legal-search.search.windows.net'),
                'admin_key' => (string)($cfg['SEARCH_ADMIN_KEY'] ?? ''),
            ];
        }

        return [
            'endpoint' => (string)(getenv('AZURE_SEARCH_ENDPOINT') ?: 'https://bnl-legal-search.search.windows.net'),
            'admin_key' => (string)(getenv('AZURE_SEARCH_ADMIN_KEY') ?: ''),
        ];
    }

    /** Name of the per-user index: "case-" followed by the user id. */
    public static function indexName(int $userId): string
    {
        return 'case-' . $userId;
    }

    /**
     * Create the per-user index if it does not exist. Idempotent.
     *
     * @return string The index name.
     * @throws RuntimeException When a request fails at transport level or the
     *                          create/update call returns HTTP >= 400.
     */
    public function ensureUserIndex(int $userId): string
    {
        $name = self::indexName($userId);
        if ($this->indexExists($name)) {
            return $name;
        }

        $body = [
            'name' => $name,
            'fields' => [
                ['name' => 'id', 'type' => 'Edm.String', 'key' => true, 'filterable' => true],
                ['name' => 'doc_id', 'type' => 'Edm.Int32', 'filterable' => true, 'facetable' => true],
                ['name' => 'user_id', 'type' => 'Edm.Int32', 'filterable' => true],
                [
                    'name' => 'filename',
                    'type' => 'Edm.String',
                    'filterable' => true,
                    'sortable' => true,
                    'searchable' => true,
                    'analyzer' => 'standard.lucene',
                ],
                ['name' => 'page', 'type' => 'Edm.Int32', 'filterable' => true, 'sortable' => true],
                ['name' => 'chunk_text', 'type' => 'Edm.String', 'searchable' => true, 'analyzer' => 'nb.microsoft'],
                ['name' => 'doc_type', 'type' => 'Edm.String', 'filterable' => true, 'facetable' => true],
                ['name' => 'detected_date', 'type' => 'Edm.DateTimeOffset', 'filterable' => true, 'sortable' => true],
                [
                    'name' => 'vector',
                    'type' => 'Collection(Edm.Single)',
                    'searchable' => true,
                    'dimensions' => 1536,
                    'vectorSearchProfile' => 'caseVectorProfile',
                ],
            ],
            'vectorSearch' => [
                'algorithms' => [[
                    'name' => 'caseHnsw',
                    'kind' => 'hnsw',
                    'hnswParameters' => ['m' => 4, 'efConstruction' => 400, 'efSearch' => 500, 'metric' => 'cosine'],
                ]],
                'profiles' => [['name' => 'caseVectorProfile', 'algorithm' => 'caseHnsw']],
            ],
            'semantic' => [
                'configurations' => [[
                    'name' => 'caseSemantic',
                    'prioritizedFields' => [
                        // The REST wire name is "prioritizedContentFields"; "contentFields" is
                        // only the SDK-side property name and is not part of the REST schema.
                        'prioritizedContentFields' => [['fieldName' => 'chunk_text']],
                        'titleField' => ['fieldName' => 'filename'],
                    ],
                ]],
            ],
        ];

        $this->request('PUT', '/indexes/' . rawurlencode($name) . '?api-version=' . self::API_VERSION, $body);

        return $name;
    }

    /**
     * Whether an index with this name exists.
     *
     * Only HTTP 200 counts as "exists"; every other status (404, but also
     * 403 or 5xx) yields false.
     *
     * @throws RuntimeException On transport (curl) failure.
     */
    public function indexExists(string $name): bool
    {
        $code = $this->request('GET', '/indexes/' . rawurlencode($name) . '?api-version=' . self::API_VERSION, null, true);

        return $code === 200;
    }

    /**
     * Upsert a batch of documents (chunks) into the user's index.
     *
     * Chunks are sent with the "mergeOrUpload" action in batches of at most
     * 1000 documents, the service's per-request action limit.
     *
     * @param array $chunks List of documents; each is an associative array of index fields.
     * @throws RuntimeException On transport/HTTP failure or when the service
     *                          reports that individual documents were not indexed.
     */
    public function upsertChunks(int $userId, array $chunks): void
    {
        if (empty($chunks)) {
            return;
        }

        $path = '/indexes/' . rawurlencode(self::indexName($userId)) . '/docs/index?api-version=' . self::API_VERSION;

        // array_chunk() re-indexes every batch from 0, so "value" is always encoded
        // as a JSON array even if the caller passed a sparse or string-keyed array.
        foreach (array_chunk($chunks, 1000) as $batch) {
            $result = $this->request('POST', $path, [
                'value' => array_map(
                    static fn($c) => array_merge(['@search.action' => 'mergeOrUpload'], $c),
                    $batch
                ),
            ]);
            $this->assertBatchSucceeded($result);
        }
    }

    /**
     * Delete all chunks for a given doc_id (used on document deletion).
     *
     * A search returns at most 1000 hits per request, so the method repeats
     * search-then-delete passes until a pass returns a short page. Ids already
     * sent for deletion are remembered; a pass that yields no new id ends the
     * loop, which guarantees termination even if the service still returns
     * documents whose deletion has not yet become visible to queries.
     *
     * @throws RuntimeException On transport/HTTP failure or when the service
     *                          reports that individual deletions failed.
     */
    public function deleteDoc(int $userId, int $docId): void
    {
        $indexPath = '/indexes/' . rawurlencode(self::indexName($userId));
        $searchPath = $indexPath . '/docs/search?api-version=' . self::API_VERSION;
        $batchPath = $indexPath . '/docs/index?api-version=' . self::API_VERSION;
        $pageSize = 1000;
        $submitted = [];

        do {
            $resp = $this->request('POST', $searchPath, [
                'search' => '*',
                'filter' => 'doc_id eq ' . $docId,
                'select' => 'id',
                'top' => $pageSize,
            ]);
            $hits = is_array($resp['value'] ?? null) ? $resp['value'] : [];

            $fresh = [];
            foreach ($hits as $hit) {
                $id = $hit['id'] ?? null;
                // Explicit null/empty check: a plain truthiness filter would drop the valid key "0".
                if ($id === null || $id === '' || isset($submitted[$id])) {
                    continue;
                }
                $submitted[$id] = true;
                $fresh[] = $id;
            }

            if ($fresh === []) {
                return;
            }

            $result = $this->request('POST', $batchPath, [
                'value' => array_map(static fn($id) => ['@search.action' => 'delete', 'id' => $id], $fresh),
            ]);
            $this->assertBatchSucceeded($result);
        } while (count($hits) >= $pageSize);
    }

    /**
     * Delete the entire index (account deletion / GDPR).
     *
     * A 404 is treated as success because the index is already gone.
     *
     * @throws RuntimeException On transport failure or any other non-2xx status,
     *                          since the user's data may then still exist.
     */
    public function deleteIndex(int $userId): void
    {
        $name = self::indexName($userId);
        $status = $this->request('DELETE', '/indexes/' . rawurlencode($name) . '?api-version=' . self::API_VERSION, null, true);

        if ($status !== 404 && ($status < 200 || $status >= 300)) {
            throw new RuntimeException("AzureSearch HTTP {$status}: failed to delete index {$name}");
        }
    }

    /**
     * Hybrid search: BM25 (Norwegian analyzer) + vector + semantic ranker.
     * Returns ['value' => [{id, doc_id, filename, page, chunk_text, @search.score, @search.rerankerScore}, ...]]
     *
     * @param array $queryVector Embedding of $query; sent as the vector query against the "vector" field.
     * @param int   $k           Used both as "top" and as the vector query's "k".
     * @throws RuntimeException On transport failure or HTTP >= 400.
     */
    public function hybridSearch(int $userId, string $query, array $queryVector, int $k = 5): array
    {
        $name = self::indexName($userId);
        $body = [
            'search' => $query,
            'queryType' => 'semantic',
            'semanticConfiguration' => 'caseSemantic',
            'searchFields' => 'chunk_text,filename',
            'select' => 'id,doc_id,filename,page,chunk_text,doc_type,detected_date',
            'top' => $k,
            'vectorQueries' => [[
                'kind' => 'vector',
                'vector' => $queryVector,
                'k' => $k,
                'fields' => 'vector',
            ]],
        ];

        return $this->request('POST', '/indexes/' . rawurlencode($name) . '/docs/search?api-version=' . self::API_VERSION, $body);
    }

    /**
     * Throw when a /docs/index response reports per-document failures.
     *
     * The service answers a batch with a "value" list holding one entry per
     * document; an entry whose "status" is false was not applied, even though
     * the overall HTTP status (207) is below 400.
     *
     * @param array $resp Decoded response body of a /docs/index call.
     * @throws RuntimeException Listing up to five failed keys with their status codes.
     */
    private function assertBatchSucceeded(array $resp): void
    {
        $items = is_array($resp['value'] ?? null) ? $resp['value'] : [];
        $failed = [];
        foreach ($items as $item) {
            if (is_array($item) && ($item['status'] ?? true) === false) {
                $failed[] = ($item['key'] ?? '?') . ' (' . ($item['statusCode'] ?? '?') . ')';
            }
        }

        if ($failed !== []) {
            throw new RuntimeException(
                'AzureSearch batch: ' . count($failed) . ' document(s) failed: '
                . implode(', ', array_slice($failed, 0, 5))
            );
        }
    }

    /**
     * Low-level HTTP. If $returnStatusOnly, returns http code instead of decoded body.
     *
     * @param string     $method           HTTP verb (upper-cased before sending).
     * @param string     $path             Path and query string appended to the configured endpoint.
     * @param array|null $body             Payload to JSON-encode; null sends no body.
     * @param bool       $returnStatusOnly Return the HTTP status code and skip the HTTP-error check.
     * @return int|array Status code in status-only mode; otherwise the decoded JSON body
     *                   (empty array when the body is empty or not a JSON object/array).
     * @throws RuntimeException When the body cannot be JSON-encoded, on any transport (curl)
     *                          failure, or - outside status-only mode - on HTTP >= 400.
     */
    private function request(string $method, string $path, ?array $body = null, bool $returnStatusOnly = false)
    {
        // Encode first so an un-encodable payload (invalid UTF-8, NAN/INF in a vector)
        // fails loudly instead of being sent as an empty body.
        $payload = null;
        if ($body !== null) {
            $payload = json_encode($body, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
            if ($payload === false) {
                throw new RuntimeException('AzureSearch: request body is not JSON-encodable: ' . json_last_error_msg());
            }
        }

        $ch = curl_init();
        if ($ch === false) {
            throw new RuntimeException('AzureSearch curl error: unable to initialise curl handle');
        }

        $options = [
            CURLOPT_URL => $this->endpoint . $path,
            CURLOPT_CUSTOMREQUEST => strtoupper($method),
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HTTPHEADER => [
                'api-key: ' . $this->adminKey,
                'Content-Type: application/json',
            ],
            CURLOPT_TIMEOUT => 30,
        ];
        if ($payload !== null) {
            $options[CURLOPT_POSTFIELDS] = $payload;
        }
        curl_setopt_array($ch, $options);

        $raw = curl_exec($ch);
        $status = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $errno = curl_errno($ch);
        curl_close($ch);

        // A transport failure has no meaningful HTTP status, so it is reported
        // in status-only mode as well instead of being returned as status 0.
        if ($errno !== 0) {
            throw new RuntimeException('AzureSearch curl error: ' . curl_strerror($errno));
        }
        if ($returnStatusOnly) {
            return $status;
        }
        if ($status >= 400) {
            throw new RuntimeException("AzureSearch HTTP {$status}: " . substr((string)$raw, 0, 300));
        }

        $decoded = json_decode((string)$raw, true);

        return is_array($decoded) ? $decoded : [];
    }
}