feat(mcp): expose corpus_search, korrespond_refine, extract_text tools

Restores the 3 tools (manifest + invoke arms + invokeExtract helper),
the citation-atom RAG lever in LegalTools/corpus-search, and the catalog
icons. These were live on prod via rsync but uncommitted, so a git-pull
deploy reverted the manifest from 22 to 19 tools.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-01 16:45:41 +02:00
parent 2d2502a037
commit 5a0ef89dca
4 changed files with 186 additions and 2 deletions
+47
View File
@@ -47,6 +47,46 @@ try {
$catClause = $category !== null ? ' AND d.category = ?' : ''; $catClause = $category !== null ? ' AND d.category = ?' : '';
$excludeLike = '%' . EXCLUDED_DOMAIN . '%'; $excludeLike = '%' . EXCLUDED_DOMAIN . '%';
// Exact-identifier routing: the FULLTEXT tokenizer drops "§ 4-12" / "Art. 8"
// to stopword fragments, so a citation query never matches. Route those to a
// verbatim LIKE lookup and pin the hits ahead of the fuzzy BM25 results.
//
// Outputs consumed further down:
//   $exactHits     - rows found by the verbatim lookup, in result order
//   $exactChunkIds - set (chunk id => true) used to de-duplicate fuzzy rows
$exactHits = [];
$exactChunkIds = [];
$citationAtoms = DbnLegalToolsService::citationAtoms($query);
if (!empty($citationAtoms)) {
    $atomClauses = [];
    // First placeholder binds d.corpus_id.
    // NOTE(review): the id is hard-coded to 1 here — confirm it matches the
    // corpus used by the fuzzy query below.
    $atomParams = [1];
    foreach ($citationAtoms as $atom) {
        // Escape the LIKE wildcards and the escape character itself, so the
        // atom is always matched literally.
        $like = '%' . addcslashes($atom, '%_\\') . '%';
        $atomClauses[] = '(c.content LIKE ? OR c.section_title LIKE ?)';
        $atomParams[] = $like;
        $atomParams[] = $like;
    }
    $atomParams[] = $excludeLike;
    // LIMIT cannot be bound portably, so it is interpolated as a forced integer.
    $exactLimit = (int)$limit;
    $exactSql = "SELECT d.id AS document_id, d.title, d.category,
        d.source_url, c.id AS chunk_id, c.content AS excerpt,
        c.section_title AS section, d.language, 1.0 AS score
        FROM chunks c
        JOIN documents d ON c.document_id = d.id
        WHERE d.corpus_id = ? AND d.status = 'ready'
        AND (" . implode(' OR ', $atomClauses) . ")
        AND d.source_url NOT LIKE ?
        $catClause
        LIMIT $exactLimit";
    // Parameter order mirrors placeholder order: corpus id, atom pairs,
    // excluded-domain pattern, then the optional category.
    $exactParams = $atomParams;
    if ($category !== null) {
        $exactParams[] = $category;
    }
    try {
        $stmt = $ragDb->prepare($exactSql);
        $stmt->execute($exactParams);
        foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $r) {
            $exactHits[] = $r;
            if (isset($r['chunk_id'])) {
                $exactChunkIds[(int)$r['chunk_id']] = true;
            }
        }
    } catch (Throwable $e) {
        // Non-fatal — fall through to fuzzy BM25, but leave a trace so a
        // broken exact lookup does not go unnoticed.
        error_log('corpus-search: exact citation lookup failed: ' . $e->getMessage());
    }
}
// Try FULLTEXT index first // Try FULLTEXT index first
try { try {
$sql = "SELECT d.id AS document_id, d.title, d.category, $sql = "SELECT d.id AS document_id, d.title, d.category,
@@ -89,6 +129,12 @@ try {
$rows = $stmt->fetchAll(PDO::FETCH_ASSOC); $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
} }
// Verbatim identifier matches go first. A fuzzy row is kept only when its
// chunk id is not already present among the exact hits; the combined list is
// then cut back to the requested limit.
$fuzzyOnlyRows = $rows;
if (!empty($exactChunkIds)) {
    $fuzzyOnlyRows = [];
    foreach ($rows as $fuzzyCandidate) {
        $candidateChunkId = (int)($fuzzyCandidate['chunk_id'] ?? 0);
        if (empty($exactChunkIds[$candidateChunkId])) {
            $fuzzyOnlyRows[] = $fuzzyCandidate;
        }
    }
}
$rows = array_slice(array_merge($exactHits, $fuzzyOnlyRows), 0, $limit);
$hits = array_map(fn($r) => [ $hits = array_map(fn($r) => [
'title' => $r['title'] ?? '', 'title' => $r['title'] ?? '',
'category' => $r['category'] ?? '', 'category' => $r['category'] ?? '',
@@ -100,6 +146,7 @@ try {
'chunk_id' => isset($r['chunk_id']) ? (int)$r['chunk_id'] : null, 'chunk_id' => isset($r['chunk_id']) ? (int)$r['chunk_id'] : null,
'source_url' => $r['source_url'] ?? null, 'source_url' => $r['source_url'] ?? null,
'language' => $r['language'] ?? null, 'language' => $r['language'] ?? null,
'exact_match' => !empty($exactChunkIds[(int)($r['chunk_id'] ?? 0)]),
], $rows); ], $rows);
dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'bm25', 'query' => $query]); dbnToolsRespond(['ok' => true, 'hits' => $hits, 'mode' => 'bm25', 'query' => $query]);
} }
+82 -1
View File
@@ -20,6 +20,13 @@ final class DbnMcpRuntime
'limit' => ['type' => 'integer', 'minimum' => 1, 'maximum' => 10], 'limit' => ['type' => 'integer', 'minimum' => 1, 'maximum' => 10],
'corpus_scope' => ['type' => 'string', 'enum' => ['shared', 'private', 'both']], 'corpus_scope' => ['type' => 'string', 'enum' => ['shared', 'private', 'both']],
], ['query']), ], ['query']),
self::tool('dbn.corpus_search', 'Advanced corpus search', 'Search the DBN legal corpus with a chosen retrieval mode (hybrid, bm25, vector, azure) and optional category filter.', [
'query' => ['type' => 'string', 'minLength' => 3],
'language' => $lang,
'mode' => ['type' => 'string', 'enum' => ['hybrid', 'bm25', 'vector', 'azure']],
'limit' => ['type' => 'integer', 'minimum' => 1, 'maximum' => 20],
'category' => ['type' => 'string'],
], ['query']),
self::tool('dbn.ask', 'Ask a legal question', 'Answer a legal preparation question with source-grounded DBN context.', [ self::tool('dbn.ask', 'Ask a legal question', 'Answer a legal preparation question with source-grounded DBN context.', [
'question' => ['type' => 'string', 'minLength' => 5], 'question' => ['type' => 'string', 'minLength' => 5],
'language' => $lang, 'language' => $lang,
@@ -64,6 +71,15 @@ final class DbnMcpRuntime
'use_case_context' => $useCase, 'use_case_context' => $useCase,
'force_draft' => ['type' => 'boolean'], 'force_draft' => ['type' => 'boolean'],
]), ]),
self::tool('dbn.korrespond_refine', 'Refine authority correspondence', 'Refine an existing Norwegian draft letter to an authority into a stronger, source-grounded version.', [
'original_draft' => ['type' => 'string', 'minLength' => 10],
'language' => $lang,
'jurisdiction' => ['type' => 'string', 'enum' => ['norwegian', 'echr', 'both']],
'recipient_body' => ['type' => 'string'],
'output_type' => ['type' => 'string', 'enum' => ['email', 'formal', 'filing', 'call_prep']],
'tone' => ['type' => 'string', 'enum' => ['cooperative', 'neutral', 'firm', 'adversarial', 'warm']],
'goal' => ['type' => 'string'],
], ['original_draft']),
self::tool('dbn.barnevernet_analyze', 'Analyze Barnevernet document', 'Analyze child-welfare documents for red flags and legal issues.', [ self::tool('dbn.barnevernet_analyze', 'Analyze Barnevernet document', 'Analyze child-welfare documents for red flags and legal issues.', [
'document_text' => $text, 'document_text' => $text,
'filename' => ['type' => 'string'], 'filename' => ['type' => 'string'],
@@ -98,6 +114,11 @@ final class DbnMcpRuntime
'language' => ['type' => 'string'], 'language' => ['type' => 'string'],
'diarize' => ['type' => 'boolean'], 'diarize' => ['type' => 'boolean'],
]), ]),
self::tool('dbn.extract_text', 'Extract document text', 'Extract plain text from a document (PDF, DOCX, TXT, etc.) supplied as base64 or a URL.', [
'file_base64' => ['type' => 'string'],
'file_url' => ['type' => 'string'],
'filename' => ['type' => 'string'],
]),
self::tool('dbn.corpus_stats', 'Corpus statistics', 'Return document/chunk counts and active legal sources.', []), self::tool('dbn.corpus_stats', 'Corpus statistics', 'Return document/chunk counts and active legal sources.', []),
self::tool('dbn.list_documents', 'List corpus documents', 'List DBN legal corpus documents with filters.', [ self::tool('dbn.list_documents', 'List corpus documents', 'List DBN legal corpus documents with filters.', [
'category' => ['type' => 'string'], 'category' => ['type' => 'string'],
@@ -142,6 +163,13 @@ final class DbnMcpRuntime
'limit' => (int)($args['limit'] ?? $args['top_k'] ?? 8), 'limit' => (int)($args['limit'] ?? $args['top_k'] ?? 8),
'corpus_scope' => self::corpusScope($args['corpus_scope'] ?? 'both'), 'corpus_scope' => self::corpusScope($args['corpus_scope'] ?? 'both'),
]), ]),
// Advanced corpus search: forwards the request to api/corpus-search.php.
// The whitelist check compares against null when "mode" is absent, so an
// omitted mode falls back to 'hybrid' instead of reading an undefined key
// (which previously produced a warning and sent an empty string).
'dbn.corpus_search' => self::callJson('api/corpus-search.php', [
    'query' => (string)($args['query'] ?? ''),
    'language' => self::language($args['language'] ?? 'en'),
    'mode' => in_array($args['mode'] ?? null, ['hybrid', 'bm25', 'vector', 'azure'], true) ? (string)$args['mode'] : 'hybrid',
    'limit' => (int)($args['limit'] ?? 8),
    'category' => (string)($args['category'] ?? ''),
]),
'dbn.ask' => self::callJson('api/ask.php', [ 'dbn.ask' => self::callJson('api/ask.php', [
'question' => (string)($args['question'] ?? ''), 'question' => (string)($args['question'] ?? ''),
'language' => self::language($args['language'] ?? 'en'), 'language' => self::language($args['language'] ?? 'en'),
@@ -188,6 +216,18 @@ final class DbnMcpRuntime
'use_my_case' => !empty($args['use_case_context']), 'use_my_case' => !empty($args['use_case_context']),
'force_draft' => ($args['force_draft'] ?? true) !== false, 'force_draft' => ($args['force_draft'] ?? true) !== false,
]), ]),
// Refine an existing draft letter via api/korrespond-refine.php.
// The tool argument "original_draft" is sent under the endpoint's
// "original_draft_no" key. The jurisdiction whitelist is checked against null
// when the argument is absent, so an omitted value falls back to 'norwegian'
// instead of reading an undefined key and sending an empty string.
'dbn.korrespond_refine' => self::callJson('api/korrespond-refine.php', [
    'original_draft_no' => (string)($args['original_draft'] ?? ''),
    'language' => self::language($args['language'] ?? 'en'),
    'jurisdiction' => in_array($args['jurisdiction'] ?? null, ['norwegian', 'echr', 'both'], true) ? (string)$args['jurisdiction'] : 'norwegian',
    'intake' => [
        'recipient_body' => (string)($args['recipient_body'] ?? 'other'),
        'output_type' => (string)($args['output_type'] ?? 'email'),
        'tone' => (string)($args['tone'] ?? 'neutral'),
        'goal' => (string)($args['goal'] ?? ''),
    ],
    'classify' => [],
]),
'dbn.barnevernet_analyze' => self::callMultipart('api/barnevernet.php', [ 'dbn.barnevernet_analyze' => self::callMultipart('api/barnevernet.php', [
'language' => self::language($args['language'] ?? 'en'), 'language' => self::language($args['language'] ?? 'en'),
'advocate_role' => (string)($args['advocate_role'] ?? ''), 'advocate_role' => (string)($args['advocate_role'] ?? ''),
@@ -213,6 +253,7 @@ final class DbnMcpRuntime
'file_b' => self::tempTextFile((string)($args['document_b_text'] ?? ''), (string)($args['filename_b'] ?? 'document-b.txt')), 'file_b' => self::tempTextFile((string)($args['document_b_text'] ?? ''), (string)($args['filename_b'] ?? 'document-b.txt')),
]), ]),
'dbn.transcribe_audio' => self::invokeTranscribe($args), 'dbn.transcribe_audio' => self::invokeTranscribe($args),
'dbn.extract_text' => self::invokeExtract($args),
'dbn.corpus_stats' => self::callGet('api/corpus-stats.php', []), 'dbn.corpus_stats' => self::callGet('api/corpus-stats.php', []),
'dbn.list_documents' => self::callGet('api/corpus-documents.php', [ 'dbn.list_documents' => self::callGet('api/corpus-documents.php', [
'category' => (string)($args['category'] ?? ''), 'category' => (string)($args['category'] ?? ''),
@@ -468,6 +509,43 @@ final class DbnMcpRuntime
} }
} }
/**
 * Extract plain text from a document supplied inline (base64) or by URL.
 *
 * The payload is staged in a temporary file, posted as a multipart upload to
 * api/extract.php, and the temporary file is removed again in all cases.
 *
 * Upload filename resolution: a caller-supplied `filename` wins; otherwise the
 * (percent-decoded) last path segment of `file_url` is used; otherwise
 * "document.pdf". Every candidate is reduced to [A-Za-z0-9._-].
 *
 * @param array $args Tool arguments: `file_base64` or `file_url` (one is required), optional `filename`.
 * @return array Result of self::curl() for the extract endpoint.
 * @throws DbnToolsHttpException 422 for missing or invalid input, 413 when the decoded
 *                               base64 payload exceeds 25 MiB, 500 when the temporary
 *                               file cannot be created or written.
 */
private static function invokeExtract(array $args): array
{
    $sanitize = static function (string $name): string {
        return (string)preg_replace('/[^A-Za-z0-9._-]/', '_', $name);
    };

    $requestedName = $sanitize((string)($args['filename'] ?? ''));
    $filename = $requestedName !== '' ? $requestedName : 'document.pdf';

    $path = tempnam(sys_get_temp_dir(), 'dbn-extract-');
    if ($path === false) {
        throw new DbnToolsHttpException('Could not create temporary file.', 500, 'temp_failed');
    }

    try {
        if (!empty($args['file_base64'])) {
            $data = base64_decode((string)$args['file_base64'], true);
            if ($data === false || strlen($data) < 8) {
                throw new DbnToolsHttpException('file_base64 is invalid.', 422, 'bad_file_base64');
            }
            if (strlen($data) > 25 * 1024 * 1024) {
                throw new DbnToolsHttpException('file_base64 is too large for MCP upload. Use file_url.', 413, 'file_too_large');
            }
            // A failed write would otherwise post an empty file to the extractor.
            if (file_put_contents($path, $data) === false) {
                throw new DbnToolsHttpException('Could not write temporary file.', 500, 'temp_failed');
            }
        } elseif (!empty($args['file_url'])) {
            $url = (string)$args['file_url'];
            self::downloadToFile($url, $path, 100 * 1024 * 1024);
            if ($requestedName === '') {
                // Derive a name from the URL path only when the caller gave none,
                // and pass it through the same sanitizer as a supplied filename.
                $urlPath = parse_url($url, PHP_URL_PATH);
                $urlName = is_string($urlPath) ? $sanitize(rawurldecode(basename($urlPath))) : '';
                if ($urlName !== '') {
                    $filename = $urlName;
                }
            }
        } else {
            throw new DbnToolsHttpException('Provide file_base64 or file_url.', 422, 'missing_file');
        }

        $body = [
            'tool' => 'extract',
            'file' => new CURLFile($path, mime_content_type($path) ?: 'application/octet-stream', $filename),
        ];

        return self::curl('api/extract.php', 'POST', $body, ['Accept: application/json']);
    } finally {
        // Always remove the staged copy, whether the request succeeded or threw.
        if (is_file($path)) {
            @unlink($path);
        }
    }
}
private static function downloadToFile(string $url, string $path, int $maxBytes): void private static function downloadToFile(string $url, string $path, int $maxBytes): void
{ {
if (!preg_match('#^https?://#i', $url)) { if (!preg_match('#^https?://#i', $url)) {
@@ -577,7 +655,7 @@ final class DbnMcpRuntime
private static function summaryText(array $payload): string private static function summaryText(array $payload): string
{ {
foreach (['summary_text', 'answer', 'what_we_found', 'overall_assessment', 'translated_text', 'redacted_text', 'transcript'] as $key) { foreach (['summary_text', 'answer', 'what_we_found', 'overall_assessment', 'translated_text', 'redacted_text', 'transcript', 'draft_no', 'draft_user', 'text'] as $key) {
if (!empty($payload[$key]) && is_string($payload[$key])) { if (!empty($payload[$key]) && is_string($payload[$key])) {
return $payload[$key]; return $payload[$key];
} }
@@ -585,6 +663,9 @@ final class DbnMcpRuntime
if (!empty($payload['document']['title'])) { if (!empty($payload['document']['title'])) {
return 'Document: ' . (string)$payload['document']['title']; return 'Document: ' . (string)$payload['document']['title'];
} }
if (isset($payload['hits']) && is_array($payload['hits'])) {
return 'Found ' . count($payload['hits']) . ' source excerpt(s) from the legal corpus.';
}
if (!empty($payload['stats'])) { if (!empty($payload['stats'])) {
return 'Corpus statistics: ' . json_encode($payload['stats'], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); return 'Corpus statistics: ' . json_encode($payload['stats'], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
} }
+54 -1
View File
@@ -1380,6 +1380,12 @@ PROMPT;
private function searchTerms(string $query): array private function searchTerms(string $query): array
{ {
// Citation atoms first: "§ 4-12", "Art. 8(2)", "Rt. 2020 s. 1234" tokenize
// to fragments shorter than the 3-char floor and get dropped, so a citation
// query loses its only meaningful term (EDI Vol.1 #2, §2.1). Extract them
// verbatim and route them ahead of the word tokens.
$citations = $this->extractCitationAtoms($query);
$parts = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: []; $parts = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: [];
$stop = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og']; $stop = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og'];
$terms = []; $terms = [];
@@ -1389,7 +1395,54 @@ PROMPT;
} }
$terms[] = $part; $terms[] = $part;
} }
return array_slice(array_values(array_unique($terms)), 0, 6);
// Citation atoms are authoritative — prepend, keep verbatim, dedupe.
$terms = array_merge($citations, $terms);
return array_slice(array_values(array_unique($terms)), 0, 8);
}
/**
 * Instance-level convenience wrapper around the static citation extractor.
 *
 * Returns the legal-identifier substrings found in the query, each kept whole
 * so it can be used as a single LIKE term. All extraction logic lives in
 * self::citationAtoms(); this method only delegates to it.
 *
 * @return string[]
 */
private function extractCitationAtoms(string $query): array
{
    $atoms = self::citationAtoms($query);

    return $atoms;
}
/**
 * Pull exact legal identifiers out of a free-text query.
 *
 * Each pattern below is run against the query in turn; every match is trimmed
 * and collected once, in first-seen order. For matches containing "§", the
 * single-space form ("§ 4-12") and the unspaced form ("§4-12") are collected
 * too, so either spelling in the query yields both.
 *
 * Static so it can be called without an instance (api/corpus-search.php uses
 * it to route identifier queries around the FULLTEXT tokenizer).
 *
 * @return string[]
 */
public static function citationAtoms(string $query): array
{
    $identifierPatterns = [
        '/§\s*\d+(?:-\d+)?[a-z]?/u', // § 4-12, § 1a
        '/\bArt(?:ikkel|icle|\.)?\s*\d+(?:\(\d+\))?/iu', // Art. 8, Article 3, Art. 8(2)
        '/\b3\d{4}[A-Z]\d{4}\b/', // EU CELEX: 32016R0679
        '/\bRt[\.\s]*\d{4}[\.\s]*s[\.\s]*\d+/u', // Rt. 2020 s. 1234
        '/\bHR-\d{4}-\d+(?:-[A-Z])?/u', // HR-2020-1789-A
    ];

    // Used as an ordered set: keys are the atoms, values are irrelevant.
    $seen = [];
    foreach ($identifierPatterns as $pattern) {
        $matches = [];
        if (preg_match_all($pattern, $query, $matches) > 0) {
            foreach ($matches[0] as $rawMatch) {
                $atom = trim((string)$rawMatch);
                if ($atom !== '') {
                    $seen[$atom] = true;
                    if (mb_strpos($atom, '§') !== false) {
                        $seen[preg_replace('/§\s*/u', '§ ', $atom)] = true; // exactly one space after §
                        $seen[preg_replace('/§\s*/u', '§', $atom)] = true; // no space after §
                    }
                }
            }
        }
    }

    return array_keys($seen);
}
private function requirePasteText(string $text, ?int $maxChars = null): string private function requirePasteText(string $text, ?int $maxChars = null): string
+3
View File
@@ -25,6 +25,7 @@ $toolCatalog = DbnMcpRuntime::tools();
$toolIcons = [ $toolIcons = [
'dbn.search_legal' => '🔍', 'dbn.search_legal' => '🔍',
'dbn.corpus_search' => '🧭',
'dbn.ask' => '💬', 'dbn.ask' => '💬',
'dbn.summarize' => '📋', 'dbn.summarize' => '📋',
'dbn.timeline' => '📅', 'dbn.timeline' => '📅',
@@ -32,11 +33,13 @@ $toolIcons = [
'dbn.translate' => '🌍', 'dbn.translate' => '🌍',
'dbn.legal_analysis' => '⚖️', 'dbn.legal_analysis' => '⚖️',
'dbn.korrespond' => '✉️', 'dbn.korrespond' => '✉️',
'dbn.korrespond_refine' => '✨',
'dbn.barnevernet_analyze' => '📄', 'dbn.barnevernet_analyze' => '📄',
'dbn.advocate_brief' => '🏛️', 'dbn.advocate_brief' => '🏛️',
'dbn.deep_research' => '🔬', 'dbn.deep_research' => '🔬',
'dbn.discrepancy_find' => '🔄', 'dbn.discrepancy_find' => '🔄',
'dbn.transcribe_audio' => '🎤', 'dbn.transcribe_audio' => '🎤',
'dbn.extract_text' => '📑',
'dbn.corpus_stats' => '📊', 'dbn.corpus_stats' => '📊',
'dbn.list_documents' => '📚', 'dbn.list_documents' => '📚',
'dbn.get_document' => '📖', 'dbn.get_document' => '📖',