feat(mcp): expose corpus_search, korrespond_refine, extract_text tools
Restores the 3 tools (manifest + invoke arms + invokeExtract helper), the citation-atom RAG lever in LegalTools/corpus-search, and the catalog icons. These were live on prod via rsync but uncommitted, so a git-pull deploy reverted the manifest from 22 to 19 tools. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+54
-1
@@ -1380,6 +1380,12 @@ PROMPT;
|
||||
|
||||
private function searchTerms(string $query): array
|
||||
{
|
||||
// Citation atoms first: "§ 4-12", "Art. 8(2)", "Rt. 2020 s. 1234" tokenize
|
||||
// to fragments shorter than the 3-char floor and get dropped, so a citation
|
||||
// query loses its only meaningful term (EDI Vol.1 #2, §2.1). Extract them
|
||||
// verbatim and route them ahead of the word tokens.
|
||||
$citations = $this->extractCitationAtoms($query);
|
||||
|
||||
$parts = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: [];
|
||||
$stop = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og'];
|
||||
$terms = [];
|
||||
@@ -1389,7 +1395,54 @@ PROMPT;
|
||||
}
|
||||
$terms[] = $part;
|
||||
}
|
||||
return array_slice(array_values(array_unique($terms)), 0, 6);
|
||||
|
||||
// Citation atoms are authoritative — prepend, keep verbatim, dedupe.
|
||||
$terms = array_merge($citations, $terms);
|
||||
return array_slice(array_values(array_unique($terms)), 0, 8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract exact legal-identifier substrings that must survive tokenization.
|
||||
* Each is kept as a whole LIKE term. For § sections we also emit spaced /
|
||||
* unspaced variants so "§4-12" matches stored "§ 4-12" and vice versa.
|
||||
*
|
||||
* @return string[]
|
||||
*/
|
||||
private function extractCitationAtoms(string $query): array
{
    // Instance-side convenience: the extraction itself lives in the static
    // citationAtoms() so it can be called without an object.
    $atoms = self::citationAtoms($query);

    return $atoms;
}
|
||||
|
||||
/**
|
||||
* Static, reusable citation extractor (also used by api/corpus-search.php to
|
||||
* route identifier queries around the FULLTEXT tokenizer).
|
||||
*
|
||||
* @return string[]
|
||||
*/
|
||||
public static function citationAtoms(string $query): array
{
    // One regex per identifier family; they are applied in this order, and the
    // result preserves first-seen order across families.
    $regexes = [
        '/§\s*\d+(?:-\d+)?[a-z]?/u',                         // § 4-12, § 1a
        '/\bArt(?:ikkel|icle|\.)?\s*\d+(?:\(\d+\))?/iu',     // Art. 8, Article 3, Art. 8(2)
        '/\b3\d{4}[A-Z]\d{4}\b/',                            // EU CELEX: 32016R0679
        '/\bRt[\.\s]*\d{4}[\.\s]*s[\.\s]*\d+/u',             // Rt. 2020 s. 1234
        '/\bHR-\d{4}-\d+(?:-[A-Z])?/u',                      // HR-2020-1789-A
    ];

    // Used as an ordered set: keys are the atoms, values are irrelevant.
    $seen = [];

    foreach ($regexes as $regex) {
        $matches = [];
        $found = preg_match_all($regex, $query, $matches);
        if (!$found) {
            // Either no match (0) or a PCRE failure (false): nothing to collect.
            continue;
        }

        foreach ($matches[0] as $raw) {
            $atom = trim((string)$raw);
            if ($atom === '') {
                continue;
            }

            // The verbatim hit always goes first.
            $variants = [$atom];

            // Section references additionally get a "§ N" and a "§N" spelling
            // so either form of the stored text can be matched.
            if (mb_strpos($atom, '§') !== false) {
                $variants[] = preg_replace('/§\s*/u', '§ ', $atom); // force single space
                $variants[] = preg_replace('/§\s*/u', '§', $atom);  // no space
            }

            foreach ($variants as $variant) {
                $seen[$variant] = true;
            }
        }
    }

    return array_keys($seen);
}
|
||||
|
||||
private function requirePasteText(string $text, ?int $maxChars = null): string
|
||||
|
||||
Reference in New Issue
Block a user