search($query, $language, $limit, 'disabled', null);

        // Project each hybrid hit onto the response keys used by the other modes.
        // The hybrid service's hit schema is not visible here, so every field is
        // read defensively with a fallback.
        $hits = array_map(fn($h) => [
            'title'       => $h['title'] ?? '',
            'category'    => $h['category'] ?? '',
            'section'     => $h['section'] ?? null,
            'excerpt'     => $h['excerpt'] ?? ($h['chunk_text'] ?? ''),
            'full_text'   => $h['full_text'] ?? $h['chunk_text'] ?? $h['excerpt'] ?? '',
            'score'       => $h['score'] ?? null,
            'document_id' => $h['document_id'] ?? null,
            'chunk_id'    => $h['chunk_id'] ?? null,
            'source_url'  => $h['source_url'] ?? null,
            'language'    => null,
        ], $result['hits'] ?? []);
        // Drop hits whose source URL contains the excluded domain.
        $hits = array_values(array_filter($hits, fn($h) => !str_contains($h['source_url'] ?? '', EXCLUDED_DOMAIN)));
        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'hybrid', 'query' => $query]);
    }

    $ragDb = dbnToolsRagDb();

    // ── BM25: FULLTEXT with LIKE fallback ───────────────────────────────────
    if ($mode === 'bm25') {
        $catClause   = $category !== null ? ' AND d.category = ?' : '';
        $excludeLike = '%' . EXCLUDED_DOMAIN . '%';

        // LIMIT is interpolated into the SQL text (it cannot be bound reliably
        // through execute([...]) string params), so force it to an integer here.
        $sqlLimit = (int)$limit;

        // Escape user text for use inside a LIKE pattern. The escape character
        // itself must be escaped first, otherwise a literal backslash in the
        // input would change the meaning of the character that follows it.
        // str_replace applies array replacements in order, so '\' is handled
        // before the backslashes introduced for '%' and '_'.
        $escapeLike = static fn(string $text): string
            => str_replace(['\\', '%', '_'], ['\\\\', '\\%', '\\_'], $text);

        // Exact-identifier routing: the FULLTEXT tokenizer drops "§ 4-12" / "Art. 8"
        // to stopword fragments, so a citation query never matches. Route those to a
        // verbatim LIKE lookup and pin the hits ahead of the fuzzy BM25 results.
        $exactHits     = [];
        $exactChunkIds = [];
        $citationAtoms = DbnLegalToolsService::citationAtoms($query);
        if (!empty($citationAtoms)) {
            $atomClauses = [];
            $atomParams  = [1]; // corpus_id
            foreach ($citationAtoms as $atom) {
                $like = '%' . $escapeLike((string)$atom) . '%';
                $atomClauses[] = '(c.content LIKE ? OR c.section_title LIKE ?)';
                $atomParams[]  = $like;
                $atomParams[]  = $like;
            }
            $atomParams[] = $excludeLike;
            $exactSql = "SELECT d.id AS document_id, d.title, d.category, d.source_url,
                                c.id AS chunk_id, c.content AS excerpt, c.section_title AS section,
                                d.language, 1.0 AS score
                         FROM chunks c
                         JOIN documents d ON c.document_id = d.id
                         WHERE d.corpus_id = ? AND d.status = 'ready'
                           AND (" . implode(' OR ', $atomClauses) . ")
                           AND d.source_url NOT LIKE ?
                           $catClause
                         LIMIT $sqlLimit";
            $exactParams = $atomParams;
            if ($category !== null) {
                $exactParams[] = $category;
            }
            try {
                $stmt = $ragDb->prepare($exactSql);
                $stmt->execute($exactParams);
                foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $r) {
                    $exactHits[] = $r;
                    if (isset($r['chunk_id'])) {
                        // Remember the chunk so the fuzzy pass can skip duplicates.
                        $exactChunkIds[(int)$r['chunk_id']] = true;
                    }
                }
            } catch (Throwable $e) {
                // Non-fatal — fall through to fuzzy BM25.
            }
        }

        // Try FULLTEXT index first
        try {
            $sql = "SELECT d.id AS document_id, d.title, d.category, d.source_url,
                           c.id AS chunk_id, c.content AS excerpt, c.section_title AS section,
                           d.language,
                           MATCH(c.content) AGAINST (? IN BOOLEAN MODE) AS score
                    FROM chunks c
                    JOIN documents d ON c.document_id = d.id
                    WHERE d.corpus_id = ? AND d.status = 'ready'
                      AND MATCH(c.content) AGAINST (? IN BOOLEAN MODE) > 0
                      AND d.source_url NOT LIKE ?
                      $catClause
                    ORDER BY score DESC
                    LIMIT $sqlLimit";
            $params = [$query, 1, $query, $excludeLike];
            if ($category !== null) {
                $params[] = $category;
            }
            $stmt = $ragDb->prepare($sql);
            $stmt->execute($params);
            $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
        } catch (Throwable $e) {
            // FULLTEXT index absent — use LIKE
            // (any failure of the FULLTEXT statement lands here, not only a
            // missing index; a failure of the LIKE query itself propagates).
            $like = '%' . $escapeLike((string)$query) . '%';
            $sql = "SELECT d.id AS document_id, d.title, d.category, d.source_url,
                           c.id AS chunk_id, c.content AS excerpt, c.section_title AS section,
                           d.language, 0.25 AS score
                    FROM chunks c
                    JOIN documents d ON c.document_id = d.id
                    WHERE d.corpus_id = ? AND d.status = 'ready'
                      AND (c.content LIKE ? OR d.title LIKE ?)
                      AND d.source_url NOT LIKE ?
                      $catClause
                    ORDER BY (d.title LIKE ?) DESC
                    LIMIT $sqlLimit";
            $params = [1, $like, $like, $excludeLike];
            if ($category !== null) {
                $params[] = $category;
            }
            $params[] = $like; // ORDER BY placeholder comes after the optional category one
            $stmt = $ragDb->prepare($sql);
            $stmt->execute($params);
            $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
        }

        // Exact identifier hits lead; drop fuzzy rows that duplicate them.
        if (!empty($exactChunkIds)) {
            $rows = array_values(array_filter(
                $rows,
                fn($r) => empty($exactChunkIds[(int)($r['chunk_id'] ?? 0)])
            ));
        }
        $rows = array_slice(array_merge($exactHits, $rows), 0, $sqlLimit);

        $hits = array_map(fn($r) => [
            'title'       => $r['title'] ?? '',
            'category'    => $r['category'] ?? '',
            'section'     => $r['section'] ?? null,
            'excerpt'     => mb_substr((string)($r['excerpt'] ?? ''), 0, 600, 'UTF-8'),
            'full_text'   => (string)($r['excerpt'] ?? ''),
            'score'       => isset($r['score']) ? round((float)$r['score'], 4) : null,
            'document_id' => (int)$r['document_id'],
            'chunk_id'    => isset($r['chunk_id']) ? (int)$r['chunk_id'] : null,
            'source_url'  => $r['source_url'] ?? null,
            'language'    => $r['language'] ?? null,
            // Only rows from the exact pass remain under these ids after the dedupe above.
            'exact_match' => !empty($exactChunkIds[(int)($r['chunk_id'] ?? 0)]),
        ], $rows);

        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'bm25', 'query' => $query]);
    }

    // ── VECTOR: embed → Qdrant ─────────────────────────────────────────────
    if ($mode === 'vector') {
        $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
        // `?? null` guards a non-empty result that lacks index 0.
        if (empty($embeddings) || !is_array($embeddings[0] ?? null)) {
            dbnToolsError('Embedding failed — vector search unavailable.', 502, 'embed_error');
        }

        $filter = ['must' => [['key' => 'corpus_id', 'match' => ['value' => 1]]]];
        if ($category !== null) {
            $filter['must'][] = ['key' => 'category', 'match' => ['value' => $category]];
        }

        $qdrantPayload = json_encode([
            'vector'       => $embeddings[0],
            'limit'        => $limit,
            'with_payload' => true,
            'filter'       => $filter,
        ]);

        $ch = curl_init('http://10.0.2.10:6333/collections/bnl_chunks/points/search');
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => $qdrantPayload,
            CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
            CURLOPT_TIMEOUT        => 15,
        ]);
        $resp    = curl_exec($ch);
        $code    = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $curlErr = curl_error($ch);
        curl_close($ch);

        if ($resp === false) {
            dbnToolsError('Qdrant unreachable: ' . $curlErr, 502, 'qdrant_error');
        }

        $qdrantResult = json_decode((string)$resp, true);
        // A non-200 status or an undecodable body is an upstream failure, not
        // "no results"; report it the same way the Azure branch reports its own.
        if ($code !== 200 || !is_array($qdrantResult)) {
            dbnToolsError("Qdrant search failed (HTTP $code).", 502, 'qdrant_error');
        }

        $points = is_array($qdrantResult['result'] ?? null) ? $qdrantResult['result'] : [];
        $hits   = [];
        foreach ($points as $pt) {
            $p = $pt['payload'] ?? [];
            $hits[] = [
                'title'       => $p['title'] ?? $p['document_title'] ?? '',
                'category'    => $p['category'] ?? '',
                'section'     => $p['section_title'] ?? null,
                'excerpt'     => mb_substr((string)($p['content'] ?? ''), 0, 600, 'UTF-8'),
                'full_text'   => (string)($p['content'] ?? ''),
                'score'       => round((float)($pt['score'] ?? 0), 4),
                'document_id' => isset($p['document_id']) ? (int)$p['document_id'] : null,
                'chunk_id'    => $pt['id'] ?? null,
                'source_url'  => $p['source_url'] ?? null,
                'language'    => $p['language'] ?? null,
            ];
        }
        // The Qdrant filter above does not exclude by URL, so filter here.
        $hits = array_values(array_filter($hits, fn($h) => !str_contains($h['source_url'] ?? '', EXCLUDED_DOMAIN)));

        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'vector', 'query' => $query]);
    }

    // ── AZURE AI SEARCH: semantic + vector via Azure AI Search ────────────────
    if ($mode === 'azure') {
        $searchEndpoint = rtrim((string)dbnToolsEnv('DBN_AZURE_SEARCH_ENDPOINT', ''), '/');
        $searchKey      = (string)dbnToolsEnv('DBN_AZURE_SEARCH_KEY', '');
        $searchIndex    = (string)dbnToolsEnv('DBN_AZURE_SEARCH_INDEX', 'bnl-legal-v2');

        if (!$searchEndpoint || !$searchKey) {
            dbnToolsError('Azure AI Search is not configured on this server.', 503, 'azure_search_not_configured');
        }

        // Try to embed the query for hybrid (semantic + vector) search.
        // An unusable embedding result is tolerated: the request then goes out
        // without a vector query.
        $vector     = null;
        $embeddings = dbnToolsLiteLLMEmbedBatch([$query]);
        if (!empty($embeddings) && is_array($embeddings[0] ?? null)) {
            $vector = $embeddings[0];
        }

        // Expanded keep-list: original 11 + government-policy, health-law,
        // social-services, labour-law, immigration (unblocked after contamination cleanup)
        $keepCats = [
            'child-welfare', 'echr-case-law', 'child-abduction', 'legislation',
            'anti-discrimination', 'legal', 'children-rights', 'family-law',
            'civil-litigation', 'patient-rights', 'parliamentary',
            'government-policy', 'health-law', 'social-services', 'labour-law', 'immigration',
        ];
        $catFilter = implode(' or ', array_map(fn($c) => "category eq '$c'", $keepCats));
        if ($category !== null) {
            // OData string literals escape a single quote by doubling it; without
            // this a category containing "'" breaks out of the literal.
            $catFilter = "category eq '" . str_replace("'", "''", (string)$category) . "'";
        }

        $payload = [
            'search'                => $query,
            'top'                   => $limit,
            'select'                => 'id,chunk_id,content,title,section_title,category,source_url',
            'queryType'             => 'semantic',
            'semanticConfiguration' => 'bnl-semantic',
            'filter'                => $catFilter,
        ];
        if ($vector) {
            $payload['vectorQueries'] = [[
                'kind'   => 'vector',
                'vector' => $vector,
                'fields' => 'content_vector',
                'k'      => $limit,
            ]];
        }

        $url = "$searchEndpoint/indexes/" . rawurlencode($searchIndex) . '/docs/search?api-version=2024-05-01-preview';
        $ch  = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_POST           => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => 15,
            CURLOPT_HTTPHEADER     => ['Content-Type: application/json', "api-key: $searchKey"],
            CURLOPT_POSTFIELDS     => json_encode($payload, JSON_UNESCAPED_SLASHES),
        ]);
        $resp    = curl_exec($ch);
        $code    = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $curlErr = curl_error($ch);
        curl_close($ch);

        if ($curlErr) {
            dbnToolsError("Azure Search unreachable: $curlErr", 502, 'azure_search_error');
        }
        if ($code !== 200) {
            $errBody = json_decode((string)$resp, true);
            $errMsg  = $errBody['error']['message'] ?? "HTTP $code";
            dbnToolsError("Azure AI Search error: $errMsg", 502, 'azure_search_error');
        }

        $data = json_decode((string)$resp, true);
        $hits = array_map(fn($d) => [
            // Title and section title joined with an em dash; empty parts are dropped.
            'title'       => trim(implode(' — ', array_filter([$d['title'] ?? '', $d['section_title'] ?? '']))),
            'category'    => $d['category'] ?? '',
            'section'     => $d['section_title'] ?? null,
            'excerpt'     => mb_substr((string)($d['content'] ?? ''), 0, 600, 'UTF-8'),
            'full_text'   => (string)($d['content'] ?? ''),
            // Prefer the semantic reranker score when the response carries one.
            'score'       => round((float)($d['@search.rerankerScore'] ?? $d['@search.score'] ?? 0), 4),
            'document_id' => null,
            'chunk_id'    => $d['chunk_id'] ?? $d['id'] ?? null,
            'source_url'  => $d['source_url'] ?? null,
            'language'    => null,
        ], $data['value'] ?? []);
        $hits = array_values(array_filter($hits, fn($h) => !str_contains($h['source_url'] ?? '', EXCLUDED_DOMAIN)));

        dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'azure', 'query' => $query]);
    }

    // No branch above handled $mode.
    dbnToolsError('Unknown search mode.', 422, 'invalid_mode');
} catch (DbnToolsHttpException $e) {
    // Already an HTTP-shaped error; rethrow unchanged so the generic handler
    // below does not rewrap it as a 500.
    throw $e;
} catch (Throwable $e) {
    // NOTE(review): the raw exception message is returned to the caller —
    // confirm this cannot expose SQL or infrastructure details.
    dbnToolsError('Corpus search failed: ' . $e->getMessage(), 500, 'search_error');
}