From 5a0ef89dcaff5803023e69fbcc2ef43a2034d65f Mon Sep 17 00:00:00 2001
From: davegilligan
Date: Mon, 1 Jun 2026 16:45:41 +0200
Subject: [PATCH] feat(mcp): expose corpus_search, korrespond_refine, extract_text tools

Restores the 3 tools (manifest + invoke arms + invokeExtract helper), the
citation-atom RAG lever in LegalTools/corpus-search, and the catalog icons.
These were live on prod via rsync but uncommitted, so a git-pull deploy
reverted the manifest from 22 to 19 tools.

Co-Authored-By: Claude Opus 4.7
---
Review notes (ignored by git am; not part of the commit message):

 * Line structure and indentation were restored from a whitespace-collapsed
   copy of this patch. Hunk line counts match the @@ headers, but context
   whitespace is reconstructed; if it differs from the tree, apply with
   `git apply --ignore-whitespace`.
 * DbnMcpRuntime::invokeExtract(): the file_url branch used to replace the
   sanitised $filename with the raw URL basename. The basename now passes
   through the same [A-Za-z0-9._-] whitelist before it reaches CURLFile.
 * api/corpus-search.php: a failed exact-identifier lookup is now written to
   error_log() before falling back to BM25 instead of being dropped silently.
 * Both fixes replace a line one-for-one, so the diffstat below is unchanged;
   the post-image blob ids on the `index` lines are informational only.

 api/corpus-search.php      | 47 +++++++++++++++++++++
 includes/DbnMcpRuntime.php | 83 +++++++++++++++++++++++++++++++++++++-
 includes/LegalTools.php    | 55 ++++++++++++++++++++++++-
 mcp.php                    |  3 ++
 4 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/api/corpus-search.php b/api/corpus-search.php
index 683f5c0..df7ada0 100644
--- a/api/corpus-search.php
+++ b/api/corpus-search.php
@@ -47,6 +47,46 @@ try {
     $catClause = $category !== null ? ' AND d.category = ?' : '';
     $excludeLike = '%' . EXCLUDED_DOMAIN . '%';
 
+    // Exact-identifier routing: the FULLTEXT tokenizer drops "§ 4-12" / "Art. 8"
+    // to stopword fragments, so a citation query never matches. Route those to a
+    // verbatim LIKE lookup and pin the hits ahead of the fuzzy BM25 results.
+    $exactHits = [];
+    $exactChunkIds = [];
+    $citationAtoms = DbnLegalToolsService::citationAtoms($query);
+    if (!empty($citationAtoms)) {
+        $atomClauses = [];
+        $atomParams = [1];
+        foreach ($citationAtoms as $atom) {
+            $like = '%' . str_replace(['%', '_'], ['\\%', '\\_'], $atom) . '%';
+            $atomClauses[] = '(c.content LIKE ? OR c.section_title LIKE ?)';
+            $atomParams[] = $like;
+            $atomParams[] = $like;
+        }
+        $atomParams[] = $excludeLike;
+        $exactSql = "SELECT d.id AS document_id, d.title, d.category,
+                            d.source_url, c.id AS chunk_id, c.content AS excerpt,
+                            c.section_title AS section, d.language, 1.0 AS score
+                     FROM chunks c
+                     JOIN documents d ON c.document_id = d.id
+                     WHERE d.corpus_id = ? AND d.status = 'ready'
+                       AND (" . implode(' OR ', $atomClauses) . ")
+                       AND d.source_url NOT LIKE ?
+                       $catClause
+                     LIMIT $limit";
+        $exactParams = $atomParams;
+        if ($category !== null) $exactParams[] = $category;
+        try {
+            $stmt = $ragDb->prepare($exactSql);
+            $stmt->execute($exactParams);
+            foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $r) {
+                $exactHits[] = $r;
+                if (isset($r['chunk_id'])) $exactChunkIds[(int)$r['chunk_id']] = true;
+            }
+        } catch (Throwable $e) {
+            error_log('corpus-search: exact-identifier lookup failed: ' . $e->getMessage()); // non-fatal — fall through to fuzzy BM25
+        }
+    }
+
     // Try FULLTEXT index first
     try {
         $sql = "SELECT d.id AS document_id, d.title, d.category,
@@ -89,6 +129,12 @@ try {
         $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
     }
 
+    // Exact identifier hits lead; drop fuzzy rows that duplicate them.
+    if (!empty($exactChunkIds)) {
+        $rows = array_values(array_filter($rows, fn($r) => empty($exactChunkIds[(int)($r['chunk_id'] ?? 0)])));
+    }
+    $rows = array_slice(array_merge($exactHits, $rows), 0, $limit);
+
     $hits = array_map(fn($r) => [
         'title' => $r['title'] ?? '',
         'category' => $r['category'] ?? '',
@@ -100,6 +146,7 @@ try {
         'chunk_id' => isset($r['chunk_id']) ? (int)$r['chunk_id'] : null,
         'source_url' => $r['source_url'] ?? null,
         'language' => $r['language'] ?? null,
+        'exact_match' => !empty($exactChunkIds[(int)($r['chunk_id'] ?? 0)]),
     ], $rows);
     dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'bm25', 'query' => $query]);
 }
diff --git a/includes/DbnMcpRuntime.php b/includes/DbnMcpRuntime.php
index d60ab70..9028f6e 100644
--- a/includes/DbnMcpRuntime.php
+++ b/includes/DbnMcpRuntime.php
@@ -20,6 +20,13 @@ final class DbnMcpRuntime
                 'limit' => ['type' => 'integer', 'minimum' => 1, 'maximum' => 10],
                 'corpus_scope' => ['type' => 'string', 'enum' => ['shared', 'private', 'both']],
             ], ['query']),
+            self::tool('dbn.corpus_search', 'Advanced corpus search', 'Search the DBN legal corpus with a chosen retrieval mode (hybrid, bm25, vector, azure) and optional category filter.', [
+                'query' => ['type' => 'string', 'minLength' => 3],
+                'language' => $lang,
+                'mode' => ['type' => 'string', 'enum' => ['hybrid', 'bm25', 'vector', 'azure']],
+                'limit' => ['type' => 'integer', 'minimum' => 1, 'maximum' => 20],
+                'category' => ['type' => 'string'],
+            ], ['query']),
             self::tool('dbn.ask', 'Ask a legal question', 'Answer a legal preparation question with source-grounded DBN context.', [
                 'question' => ['type' => 'string', 'minLength' => 5],
                 'language' => $lang,
@@ -64,6 +71,15 @@ final class DbnMcpRuntime
                 'use_case_context' => $useCase,
                 'force_draft' => ['type' => 'boolean'],
             ]),
+            self::tool('dbn.korrespond_refine', 'Refine authority correspondence', 'Refine an existing Norwegian draft letter to an authority into a stronger, source-grounded version.', [
+                'original_draft' => ['type' => 'string', 'minLength' => 10],
+                'language' => $lang,
+                'jurisdiction' => ['type' => 'string', 'enum' => ['norwegian', 'echr', 'both']],
+                'recipient_body' => ['type' => 'string'],
+                'output_type' => ['type' => 'string', 'enum' => ['email', 'formal', 'filing', 'call_prep']],
+                'tone' => ['type' => 'string', 'enum' => ['cooperative', 'neutral', 'firm', 'adversarial', 'warm']],
+                'goal' => ['type' => 'string'],
+            ], ['original_draft']),
             self::tool('dbn.barnevernet_analyze', 'Analyze Barnevernet document', 'Analyze child-welfare documents for red flags and legal issues.', [
                 'document_text' => $text,
                 'filename' => ['type' => 'string'],
@@ -98,6 +114,11 @@ final class DbnMcpRuntime
                 'language' => ['type' => 'string'],
                 'diarize' => ['type' => 'boolean'],
             ]),
+            self::tool('dbn.extract_text', 'Extract document text', 'Extract plain text from a document (PDF, DOCX, TXT, etc.) supplied as base64 or a URL.', [
+                'file_base64' => ['type' => 'string'],
+                'file_url' => ['type' => 'string'],
+                'filename' => ['type' => 'string'],
+            ]),
             self::tool('dbn.corpus_stats', 'Corpus statistics', 'Return document/chunk counts and active legal sources.', []),
             self::tool('dbn.list_documents', 'List corpus documents', 'List DBN legal corpus documents with filters.', [
                 'category' => ['type' => 'string'],
@@ -142,6 +163,13 @@ final class DbnMcpRuntime
                 'limit' => (int)($args['limit'] ?? $args['top_k'] ?? 8),
                 'corpus_scope' => self::corpusScope($args['corpus_scope'] ?? 'both'),
             ]),
+            'dbn.corpus_search' => self::callJson('api/corpus-search.php', [
+                'query' => (string)($args['query'] ?? ''),
+                'language' => self::language($args['language'] ?? 'en'),
+                'mode' => in_array($args['mode'] ?? 'hybrid', ['hybrid', 'bm25', 'vector', 'azure'], true) ? (string)$args['mode'] : 'hybrid',
+                'limit' => (int)($args['limit'] ?? 8),
+                'category' => (string)($args['category'] ?? ''),
+            ]),
             'dbn.ask' => self::callJson('api/ask.php', [
                 'question' => (string)($args['question'] ?? ''),
                 'language' => self::language($args['language'] ?? 'en'),
@@ -188,6 +216,18 @@ final class DbnMcpRuntime
                 'use_my_case' => !empty($args['use_case_context']),
                 'force_draft' => ($args['force_draft'] ?? true) !== false,
             ]),
+            'dbn.korrespond_refine' => self::callJson('api/korrespond-refine.php', [
+                'original_draft_no' => (string)($args['original_draft'] ?? ''),
+                'language' => self::language($args['language'] ?? 'en'),
+                'jurisdiction' => in_array($args['jurisdiction'] ?? 'norwegian', ['norwegian', 'echr', 'both'], true) ? (string)$args['jurisdiction'] : 'norwegian',
+                'intake' => [
+                    'recipient_body' => (string)($args['recipient_body'] ?? 'other'),
+                    'output_type' => (string)($args['output_type'] ?? 'email'),
+                    'tone' => (string)($args['tone'] ?? 'neutral'),
+                    'goal' => (string)($args['goal'] ?? ''),
+                ],
+                'classify' => [],
+            ]),
             'dbn.barnevernet_analyze' => self::callMultipart('api/barnevernet.php', [
                 'language' => self::language($args['language'] ?? 'en'),
                 'advocate_role' => (string)($args['advocate_role'] ?? ''),
@@ -213,6 +253,7 @@ final class DbnMcpRuntime
                 'file_b' => self::tempTextFile((string)($args['document_b_text'] ?? ''), (string)($args['filename_b'] ?? 'document-b.txt')),
             ]),
             'dbn.transcribe_audio' => self::invokeTranscribe($args),
+            'dbn.extract_text' => self::invokeExtract($args),
             'dbn.corpus_stats' => self::callGet('api/corpus-stats.php', []),
             'dbn.list_documents' => self::callGet('api/corpus-documents.php', [
                 'category' => (string)($args['category'] ?? ''),
@@ -468,6 +509,43 @@ final class DbnMcpRuntime
         }
     }
 
+    private static function invokeExtract(array $args): array
+    {
+        $filename = preg_replace('/[^A-Za-z0-9._-]/', '_', (string)($args['filename'] ?? 'document.pdf')) ?: 'document.pdf';
+        $path = tempnam(sys_get_temp_dir(), 'dbn-extract-');
+        if ($path === false) {
+            throw new DbnToolsHttpException('Could not create temporary file.', 500, 'temp_failed');
+        }
+
+        try {
+            if (!empty($args['file_base64'])) {
+                $data = base64_decode((string)$args['file_base64'], true);
+                if ($data === false || strlen($data) < 8) {
+                    throw new DbnToolsHttpException('file_base64 is invalid.', 422, 'bad_file_base64');
+                }
+                if (strlen($data) > 25 * 1024 * 1024) {
+                    throw new DbnToolsHttpException('file_base64 is too large for MCP upload. Use file_url.', 413, 'file_too_large');
+                }
+                file_put_contents($path, $data);
+            } elseif (!empty($args['file_url'])) {
+                self::downloadToFile((string)$args['file_url'], $path, 100 * 1024 * 1024);
+                $filename = preg_replace('/[^A-Za-z0-9._-]/', '_', basename(parse_url((string)$args['file_url'], PHP_URL_PATH) ?: $filename)) ?: $filename; // URL-derived name gets the same whitelist as the caller-supplied one
+            } else {
+                throw new DbnToolsHttpException('Provide file_base64 or file_url.', 422, 'missing_file');
+            }
+
+            $body = [
+                'tool' => 'extract',
+                'file' => new CURLFile($path, mime_content_type($path) ?: 'application/octet-stream', $filename),
+            ];
+            return self::curl('api/extract.php', 'POST', $body, ['Accept: application/json']);
+        } finally {
+            if (is_file($path)) {
+                @unlink($path);
+            }
+        }
+    }
+
     private static function downloadToFile(string $url, string $path, int $maxBytes): void
     {
         if (!preg_match('#^https?://#i', $url)) {
@@ -577,7 +655,7 @@ final class DbnMcpRuntime
 
     private static function summaryText(array $payload): string
     {
-        foreach (['summary_text', 'answer', 'what_we_found', 'overall_assessment', 'translated_text', 'redacted_text', 'transcript'] as $key) {
+        foreach (['summary_text', 'answer', 'what_we_found', 'overall_assessment', 'translated_text', 'redacted_text', 'transcript', 'draft_no', 'draft_user', 'text'] as $key) {
             if (!empty($payload[$key]) && is_string($payload[$key])) {
                 return $payload[$key];
             }
@@ -585,6 +663,9 @@ final class DbnMcpRuntime
         if (!empty($payload['document']['title'])) {
             return 'Document: ' . (string)$payload['document']['title'];
         }
+        if (isset($payload['hits']) && is_array($payload['hits'])) {
+            return 'Found ' . count($payload['hits']) . ' source excerpt(s) from the legal corpus.';
+        }
         if (!empty($payload['stats'])) {
             return 'Corpus statistics: ' . json_encode($payload['stats'], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
         }
diff --git a/includes/LegalTools.php b/includes/LegalTools.php
index b31d502..825f88a 100644
--- a/includes/LegalTools.php
+++ b/includes/LegalTools.php
@@ -1380,6 +1380,12 @@ PROMPT;
 
     private function searchTerms(string $query): array
     {
+        // Citation atoms first: "§ 4-12", "Art. 8(2)", "Rt. 2020 s. 1234" tokenize
+        // to fragments shorter than the 3-char floor and get dropped, so a citation
+        // query loses its only meaningful term (EDI Vol.1 #2, §2.1). Extract them
+        // verbatim and route them ahead of the word tokens.
+        $citations = $this->extractCitationAtoms($query);
+
         $parts = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: [];
         $stop = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og'];
         $terms = [];
@@ -1389,7 +1395,54 @@ PROMPT;
             }
             $terms[] = $part;
         }
-        return array_slice(array_values(array_unique($terms)), 0, 6);
+
+        // Citation atoms are authoritative — prepend, keep verbatim, dedupe.
+        $terms = array_merge($citations, $terms);
+        return array_slice(array_values(array_unique($terms)), 0, 8);
+    }
+
+    /**
+     * Extract exact legal-identifier substrings that must survive tokenization.
+     * Each is kept as a whole LIKE term. For § sections we also emit spaced /
+     * unspaced variants so "§4-12" matches stored "§ 4-12" and vice versa.
+     *
+     * @return string[]
+     */
+    private function extractCitationAtoms(string $query): array
+    {
+        return self::citationAtoms($query);
+    }
+
+    /**
+     * Static, reusable citation extractor (also used by api/corpus-search.php to
+     * route identifier queries around the FULLTEXT tokenizer).
+     *
+     * @return string[]
+     */
+    public static function citationAtoms(string $query): array
+    {
+        $patterns = [
+            '/§\s*\d+(?:-\d+)?[a-z]?/u',  // § 4-12, § 1a
+            '/\bArt(?:ikkel|icle|\.)?\s*\d+(?:\(\d+\))?/iu',  // Art. 8, Article 3, Art. 8(2)
+            '/\b3\d{4}[A-Z]\d{4}\b/',  // EU CELEX: 32016R0679
+            '/\bRt[\.\s]*\d{4}[\.\s]*s[\.\s]*\d+/u',  // Rt. 2020 s. 1234
+            '/\bHR-\d{4}-\d+(?:-[A-Z])?/u',  // HR-2020-1789-A
+        ];
+
+        $out = [];
+        foreach ($patterns as $rx) {
+            if (!preg_match_all($rx, $query, $m)) continue;
+            foreach ($m[0] as $hit) {
+                $hit = trim((string)$hit);
+                if ($hit === '') continue;
+                $out[$hit] = true;
+                if (mb_strpos($hit, '§') !== false) {
+                    $out[preg_replace('/§\s*/u', '§ ', $hit)] = true;  // force single space
+                    $out[preg_replace('/§\s*/u', '§', $hit)] = true;  // no space
+                }
+            }
+        }
+        return array_keys($out);
     }
 
     private function requirePasteText(string $text, ?int $maxChars = null): string
diff --git a/mcp.php b/mcp.php
index 5d1723d..f96db96 100644
--- a/mcp.php
+++ b/mcp.php
@@ -25,6 +25,7 @@ $toolCatalog = DbnMcpRuntime::tools();
 
 $toolIcons = [
     'dbn.search_legal' => '🔍',
+    'dbn.corpus_search' => '🧭',
     'dbn.ask' => '💬',
     'dbn.summarize' => '📋',
     'dbn.timeline' => '📅',
@@ -32,11 +33,13 @@ $toolIcons = [
     'dbn.translate' => '🌍',
     'dbn.legal_analysis' => '⚖️',
     'dbn.korrespond' => '✉️',
+    'dbn.korrespond_refine' => '✨',
     'dbn.barnevernet_analyze' => '📄',
     'dbn.advocate_brief' => '🏛️',
     'dbn.deep_research' => '🔬',
     'dbn.discrepancy_find' => '🔄',
     'dbn.transcribe_audio' => '🎤',
+    'dbn.extract_text' => '📑',
     'dbn.corpus_stats' => '📊',
     'dbn.list_documents' => '📚',
     'dbn.get_document' => '📖',