Add Summarize Document tool — engine selector, file upload, optional corpus enrichment
- summarize.php: full custom inline form (replaces tool_form.php wrapper) with lang switcher, azure_mini/azure_full/gpu engine selector, 8 corpus-slice toggles (all off by default), doc picker, file upload zone, and textarea
- api/summarize.php: rewritten to streaming NDJSON (matches barnevernet pattern); accepts JSON payload with text, language, engine, slices[], doc_ids[]
- includes/LegalTools.php: adds corpusContextForSummarize() (keyword search via ClientRagPipeline) and summarizeWithContext() (engine-aware LLM call with optional corpus prepend); returns structured JSON matching existing summarize format
- assets/js/summarize.js: self-contained IIFE handling file upload via api/extract.php, slice toggles, NDJSON stream reader, result rendering, and trace panel update
- includes/i18n.php: adds 'summarize' to nav in all 4 languages (EN/NO/UK/PL), inserted after 'redact' in the tool order with icon 'SZ'

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1211,6 +1211,153 @@ PROMPT;
|
||||
return dbnToolsCallGpuLlm($messages, $options);
|
||||
}
|
||||
|
||||
// ── Summarize: corpus context + engine-aware summary ─────────────────────
|
||||
|
||||
/**
 * Search the legal corpus for passages related to $query and format them as
 * one context string for the summarisation prompt.
 *
 * Runs a keyword search through ClientRagPipeline with both the private and
 * shared search flags enabled, restricted to the package returned by
 * requireFamilyPackage() for the current client. Every hit with non-blank
 * content is rendered as "=== {title} ===\n{content}"; hits are joined with
 * a blank line.
 *
 * Best-effort by design: any Throwable raised along the way is logged and ''
 * is returned, so the caller can fall back to summarising without context.
 *
 * @param string $query Text to search the corpus for.
 * @param int    $limit Maximum number of passages requested from the search.
 *
 * @return string Formatted passages, or '' when nothing usable was found or
 *                the search failed.
 */
public function corpusContextForSummarize(string $query, int $limit = 8): string
{
    try {
        $client = dbnToolsRequireClient();
        $clientId = (int)$client['id'];
        $package = $this->requireFamilyPackage($clientId);
        dbnToolsBootCaveau();

        // Default gateway endpoint; replaced by ai_gateway.url from config when set.
        $gatewayUrl = 'http://10.0.1.10:4000';
        try {
            $config = getConfig();
            $configuredUrl = trim((string)($config['ai_gateway']['url'] ?? ''));
            if ($configuredUrl !== '') {
                $gatewayUrl = $configuredUrl;
            }
        } catch (Throwable $e) {
            // Keep the default endpoint, but record why the configured one was not used.
            error_log('summarize gateway config unavailable, using default: ' . $e->getMessage());
        }

        // NOTE(review): the third constructor argument (20) is presumably a
        // timeout in seconds — confirm against ClientRagPipeline.
        $rag = new ClientRagPipeline($clientId, $gatewayUrl, 20);
        $chunks = $rag->searchAll($query, $limit, null, [
            'search_private'       => true,
            'search_shared'        => true,
            'package_ids'          => [(int)$package['id']],
            'chunk_limit'          => $limit,
            'search_method'        => 'keyword',
            'min_private'          => 0,
            'include_beta_website' => true,
        ]);

        if (!is_iterable($chunks)) {
            return '';
        }

        $parts = [];
        foreach ($chunks as $chunk) {
            // A single malformed hit must not discard the usable ones.
            if (!is_array($chunk) && !($chunk instanceof \ArrayAccess)) {
                continue;
            }

            $title = (string)($chunk['title'] ?? ($chunk['source'] ?? 'Legal source'));
            $content = (string)($chunk['content'] ?? ($chunk['text'] ?? ''));

            // Whitespace-only content adds a header with nothing under it.
            if (trim($content) === '') {
                continue;
            }

            $parts[] = "=== {$title} ===\n{$content}";
        }

        return implode("\n\n", $parts);
    } catch (Throwable $e) {
        error_log('summarize corpus search failed: ' . $e->getMessage());

        return '';
    }
}
|
||||
|
||||
/**
 * Produce a structured summary of a document with the selected engine,
 * optionally prefixing the document with legal-corpus context.
 *
 * The text is validated via requirePasteText(), wrapped in a prompt that asks
 * for a fixed JSON structure, and sent to one of three back ends:
 *   - 'gpu'        → callGpuLlm()
 *   - 'azure_full' → Azure deployment 'gpt-4o' (max_tokens 8000)
 *   - 'azure_mini' → Azure deployment 'gpt-4o-mini' (max_tokens 4000)
 * Any unrecognised engine value falls back to 'azure_mini'.
 *
 * Failures of the LLM call or of JSON decoding are reported through
 * dbnToolsAbort() with code 502 (presumably terminating the request —
 * confirm against its definition).
 *
 * @param string $text          Document text to summarise.
 * @param string $language      Language code; used for the prompt locale, the system prompt and the disclaimer.
 * @param string $engine        One of 'azure_mini', 'azure_full', 'gpu'.
 * @param string $corpusContext Pre-formatted corpus passages to prepend; blank disables enrichment.
 *
 * @return array Summary fields (what_we_found, key_facts, dates, parties,
 *               legal_references_detected, what_remains_uncertain,
 *               next_practical_step) plus tool, language, corpus_used,
 *               trace, trace_metadata and disclaimer.
 */
public function summarizeWithContext(
    string $text,
    string $language = 'en',
    string $engine = 'azure_mini',
    string $corpusContext = ''
): array {
    $text = $this->requirePasteText($text);
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';

    $locale = dbnToolsLanguageName($language);

    // Whitespace-only context carries no passages, so it does not count as enrichment.
    $corpusUsed = trim($corpusContext) !== '';
    $enriched = $text;
    if ($corpusUsed) {
        $enriched = "[Relevant legal context from Do Better Norge corpus]\n"
            . $corpusContext
            . "\n\n---\n\nDocument to summarise:\n"
            . $text;
    }

    $prompt = <<<PROMPT
Summarise the following document in {$locale}. Do not invent facts not present in the text.
Return JSON only — no extra text before or after the JSON object.

{$enriched}

Return this JSON structure:
{
  "what_we_found": "plain-language summary (2-4 sentences)",
  "key_facts": ["fact 1", "fact 2"],
  "dates": ["date or event phrase"],
  "parties": ["party or role"],
  "legal_references_detected": ["statute, article, or case name"],
  "what_remains_uncertain": ["uncertainty or gap"],
  "next_practical_step": "one concrete next action"
}
PROMPT;

    $messages = [
        ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language)],
        ['role' => 'user', 'content' => $prompt],
    ];
    $maxTok = ($engine === 'azure_full') ? 8000 : 4000;
    $chatOpts = ['json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTok, 'timeout' => 120];

    // Initial label; every successful engine branch below replaces it.
    $deployLabel = $this->azure->chatDeployment();
    try {
        if ($engine === 'gpu') {
            $response = $this->callGpuLlm($messages, $chatOpts);
            $deployLabel = 'GPU (local)';
        } elseif ($engine === 'azure_full') {
            $response = $this->azure->withDeployment('gpt-4o')->chat($messages, $chatOpts);
            $deployLabel = 'gpt-4o';
        } else {
            $response = $this->azure->withDeployment('gpt-4o-mini')->chat($messages, $chatOpts);
            $deployLabel = 'gpt-4o-mini';
        }
    } catch (Throwable $e) {
        dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $raw = (string)($response['choices'][0]['message']['content'] ?? '');
    $json = $this->azure->decodeJsonObject($raw);
    if (!$json) {
        dbnToolsAbort('LLM returned unparseable JSON.', 502, 'llm_parse_error');
    }

    // Model output is untrusted: a list field may arrive as a bare string and
    // a text field as an array. Coerce both so downstream code sees stable types.
    $asList = static function (mixed $value): array {
        if (is_array($value)) {
            return $value;
        }

        return ($value === null || $value === '') ? [] : [$value];
    };
    $asText = static fn (mixed $value, string $default = ''): string
        => is_scalar($value) ? (string)$value : $default;

    $uncertain = $asList($json['what_remains_uncertain'] ?? null);

    if ($corpusUsed) {
        // Count passage header lines ("=== title ===" starts a line) instead of
        // every "=== " occurrence, which over-counts when passage content
        // contains that sequence. Non-blank context is at least one passage.
        $passageCount = max(1, (int)preg_match_all('/^=== /m', $corpusContext));
        $corpusNote = 'Summary enriched with ' . $passageCount . ' passage(s) from the Do Better Norge legal corpus.';
    } else {
        $corpusNote = 'No corpus search performed; summarised from document text only.';
    }

    $trace = [
        $this->trace('Document preparation', 'Text validated and prepared for summarisation.', 'complete'),
        // Reported as 'complete' whether or not corpus context was supplied.
        $this->trace('Corpus enrichment', $corpusNote, 'complete'),
        $this->trace('Summary generation', 'Structured summary generated via ' . $deployLabel . '.', 'complete'),
        $this->trace('Uncertainty', $this->uncertaintySummary($uncertain), 'complete'),
        $this->trace(
            'Next practical step',
            $asText($json['next_practical_step'] ?? null, 'Review the summary against the original document.'),
            'complete'
        ),
    ];

    return [
        'tool' => 'summarize',
        'language' => $language,
        'what_we_found' => $asText($json['what_we_found'] ?? null),
        'key_facts' => $asList($json['key_facts'] ?? null),
        'dates' => $asList($json['dates'] ?? null),
        'parties' => $asList($json['parties'] ?? null),
        'legal_references_detected' => $asList($json['legal_references_detected'] ?? null),
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => $asText($json['next_practical_step'] ?? null),
        'corpus_used' => $corpusUsed,
        'trace' => $trace,
        'trace_metadata' => [
            // NOTE(review): both counts are fixed at 1 even when corpus
            // passages are included — confirm this is what consumers expect.
            'chunk_count' => 1,
            'source_count' => 1,
            'deployment' => $deployLabel,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
|
||||
|
||||
private function applyGenericTags(string $text): string
|
||||
{
|
||||
// Collapse contextual role tags (e.g. [FATHER], [JUDGE: Andersen], [CHILD_1]) → [PERSON]
|
||||
|
||||
Reference in New Issue
Block a user