Add Summarize Document tool — engine selector, file upload, optional corpus enrichment

- summarize.php: full custom inline form (replaces tool_form.php wrapper) with
  lang switcher, azure_mini/azure_full/gpu engine selector, 8 corpus-slice
  toggles (all off by default), doc picker, file upload zone, and textarea
- api/summarize.php: rewritten to streaming NDJSON (matches barnevernet pattern);
  accepts JSON payload with text, language, engine, slices[], doc_ids[]
- includes/LegalTools.php: adds corpusContextForSummarize() (keyword search via
  ClientRagPipeline) and summarizeWithContext() (engine-aware LLM call with
  optional corpus prepend); returns structured JSON matching existing summarize format
- assets/js/summarize.js: self-contained IIFE handling file upload via
  api/extract.php, slice toggles, NDJSON stream reader, result rendering,
  and trace panel update
- includes/i18n.php: adds 'summarize' to nav in all 4 languages (EN/NO/UK/PL),
  inserted after 'redact' in the tool order with icon 'SZ'

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 23:25:40 +02:00
parent 8587ec372f
commit e768662efe
5 changed files with 742 additions and 12 deletions
+147
View File
@@ -1211,6 +1211,153 @@ PROMPT;
return dbnToolsCallGpuLlm($messages, $options);
}
// ── Summarize: corpus context + engine-aware summary ─────────────────────
/**
 * Search the legal corpus for passages relevant to $query and format them
 * as one context string for the summarizer.
 *
 * Each passage is rendered as "=== {title} ===\n{content}" and passages are
 * joined with a blank line. The search is requested in keyword mode with both
 * the private and shared search flags enabled, scoped to the client's family
 * package.
 *
 * Best-effort: any Throwable raised while resolving the client, booting the
 * pipeline or searching is logged and '' is returned, so the caller can fall
 * back to summarising the document text alone.
 *
 * @param string $query Text to search the corpus for; a blank query yields ''.
 * @param int    $limit Maximum number of passages; values below 1 yield ''.
 *
 * @return string Formatted passages, or '' when nothing usable was found or
 *                the search failed.
 */
public function corpusContextForSummarize(string $query, int $limit = 8): string
{
    try {
        // Kept ahead of the early return below so that whatever validation
        // these two calls perform still runs for every request.
        $client = dbnToolsRequireClient();
        $package = $this->requireFamilyPackage((int)$client['id']);

        // Nothing to search for, or no passages requested: skip the backend call.
        if ($limit < 1 || trim($query) === '') {
            return '';
        }

        dbnToolsBootCaveau();

        // Fallback gateway address, overridden by ai_gateway.url when configured.
        $gatewayUrl = 'http://10.0.1.10:4000';
        try {
            $config = getConfig();
            $configured = trim((string)($config['ai_gateway']['url'] ?? ''));
            if ($configured !== '') {
                $gatewayUrl = $configured;
            }
        } catch (Throwable $configError) {
            // Deliberately non-fatal: keep the fallback URL, but leave a trace
            // instead of swallowing the error silently.
            error_log('summarize corpus search: config unavailable, using default gateway: ' . $configError->getMessage());
        }

        // NOTE(review): the third constructor argument (20) is presumably a
        // timeout in seconds — confirm against ClientRagPipeline.
        $rag = new ClientRagPipeline((int)$client['id'], $gatewayUrl, 20);
        $chunks = $rag->searchAll($query, $limit, null, [
            'search_private' => true,
            'search_shared' => true,
            'package_ids' => [(int)$package['id']],
            'chunk_limit' => $limit,
            'search_method' => 'keyword',
            'min_private' => 0,
            'include_beta_website' => true,
        ]);

        $parts = [];
        foreach ($chunks as $chunk) {
            $content = trim((string)($chunk['content'] ?? ($chunk['text'] ?? '')));
            if ($content === '') {
                // Skip chunks with no usable body (including whitespace-only ones).
                continue;
            }

            // Keep the header on a single line: collapse any whitespace runs
            // (including newlines) so "=== title ===" stays one parseable line.
            $title = (string)($chunk['title'] ?? ($chunk['source'] ?? 'Legal source'));
            $title = trim(preg_replace('/\s+/', ' ', $title) ?? $title);
            if ($title === '') {
                $title = 'Legal source';
            }

            $parts[] = "=== {$title} ===\n{$content}";
        }

        return implode("\n\n", $parts);
    } catch (Throwable $e) {
        error_log('summarize corpus search failed: ' . $e->getMessage());
        return '';
    }
}
/**
 * Engine-aware structured summarisation, optionally enriched with corpus context.
 *
 * Builds a prompt asking for a fixed JSON structure, sends it to the selected
 * engine, decodes the JSON reply and returns it together with a processing
 * trace and disclaimer.
 *
 * @param string $text          Document text; passed through requirePasteText() first.
 * @param string $language      Language code used for the summary language, system
 *                              prompt and disclaimer.
 * @param string $engine        One of 'azure_mini', 'azure_full' or 'gpu'; any other
 *                              value falls back to 'azure_mini'.
 * @param string $corpusContext Pre-formatted corpus passages to prepend to the
 *                              document; '' (or whitespace only) disables enrichment.
 *
 * @return array Summary payload with keys: tool, language, what_we_found, key_facts,
 *               dates, parties, legal_references_detected, what_remains_uncertain,
 *               next_practical_step, corpus_used, trace, trace_metadata, disclaimer.
 *
 * Failures of the LLM call or of JSON decoding are reported through
 * dbnToolsAbort() with HTTP status 502.
 */
public function summarizeWithContext(
    string $text,
    string $language = 'en',
    string $engine = 'azure_mini',
    string $corpusContext = ''
): array {
    $text = $this->requirePasteText($text);
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
    $locale = dbnToolsLanguageName($language);

    // A whitespace-only context carries no passages, so treat it as "no corpus".
    $corpusContext = trim($corpusContext);
    $corpusUsed = $corpusContext !== '';
    $enriched = $corpusUsed
        ? "[Relevant legal context from Do Better Norge corpus]\n"
            . $corpusContext
            . "\n\n---\n\nDocument to summarise:\n"
            . $text
        : $text;

    $prompt = <<<PROMPT
    Summarise the following document in {$locale}. Do not invent facts not present in the text.
    Return JSON only — no extra text before or after the JSON object.
    {$enriched}
    Return this JSON structure:
    {
    "what_we_found": "plain-language summary (2-4 sentences)",
    "key_facts": ["fact 1", "fact 2"],
    "dates": ["date or event phrase"],
    "parties": ["party or role"],
    "legal_references_detected": ["statute, article, or case name"],
    "what_remains_uncertain": ["uncertainty or gap"],
    "next_practical_step": "one concrete next action"
    }
    PROMPT;

    $messages = [
        ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language)],
        ['role' => 'user', 'content' => $prompt],
    ];

    // Per-engine settings: Azure deployment name (null = local GPU path),
    // label shown in the trace, and the completion token budget.
    [$deployment, $deployLabel, $maxTok] = match ($engine) {
        'gpu' => [null, 'GPU (local)', 4000],
        'azure_full' => ['gpt-4o', 'gpt-4o', 8000],
        default => ['gpt-4o-mini', 'gpt-4o-mini', 4000],
    };
    $chatOpts = ['json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTok, 'timeout' => 120];

    // Pre-initialised so the decode step below sees a defined value even if
    // the abort helper were ever to return instead of terminating.
    $response = [];
    try {
        $response = $deployment === null
            ? $this->callGpuLlm($messages, $chatOpts)
            : $this->azure->withDeployment($deployment)->chat($messages, $chatOpts);
    } catch (Throwable $e) {
        dbnToolsAbort('LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $raw = (string)($response['choices'][0]['message']['content'] ?? '');
    $json = $this->azure->decodeJsonObject($raw);
    if (!$json) {
        dbnToolsAbort('LLM returned unparseable JSON.', 502, 'llm_parse_error');
    }

    // The model's output is untrusted: a field meant to be a list may come
    // back as a bare string (or be missing). Coerce to an array so downstream
    // consumers always receive the documented shape.
    $toList = static function (mixed $value): array {
        if (is_array($value)) {
            return $value;
        }
        if ((is_string($value) || is_int($value) || is_float($value)) && trim((string)$value) !== '') {
            return [(string)$value];
        }
        return [];
    };
    // Text fields: anything non-scalar becomes '' instead of the literal "Array".
    $toText = static fn (mixed $value): string => is_scalar($value) ? (string)$value : '';

    $uncertain = $toList($json['what_remains_uncertain'] ?? null);
    $hasNextStep = is_scalar($json['next_practical_step'] ?? null);
    $nextStep = $hasNextStep ? (string)$json['next_practical_step'] : '';

    // Count passages by their "=== " header at the start of a line, so that
    // "=== " appearing inside passage text does not inflate the number.
    // A non-empty context without any header still counts as one passage.
    $passageCount = $corpusUsed
        ? max(1, (int)preg_match_all('/^=== /m', $corpusContext))
        : 0;
    $corpusNote = $corpusUsed
        ? 'Summary enriched with ' . $passageCount . ' passage(s) from the Do Better Norge legal corpus.'
        : 'No corpus search performed; summarised from document text only.';

    $trace = [
        $this->trace('Document preparation', 'Text validated and prepared for summarisation.', 'complete'),
        // Status is 'complete' whether or not the corpus was used; the note carries the distinction.
        $this->trace('Corpus enrichment', $corpusNote, 'complete'),
        $this->trace('Summary generation', 'Structured summary generated via ' . $deployLabel . '.', 'complete'),
        $this->trace('Uncertainty', $this->uncertaintySummary($uncertain), 'complete'),
        $this->trace(
            'Next practical step',
            $hasNextStep ? $nextStep : 'Review the summary against the original document.',
            'complete'
        ),
    ];

    return [
        'tool' => 'summarize',
        'language' => $language,
        'what_we_found' => $toText($json['what_we_found'] ?? ''),
        'key_facts' => $toList($json['key_facts'] ?? null),
        'dates' => $toList($json['dates'] ?? null),
        'parties' => $toList($json['parties'] ?? null),
        'legal_references_detected' => $toList($json['legal_references_detected'] ?? null),
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => $nextStep,
        'corpus_used' => $corpusUsed,
        'trace' => $trace,
        'trace_metadata' => [
            // NOTE(review): both counts are fixed at 1 regardless of how many
            // corpus passages were prepended — confirm this is intended.
            'chunk_count' => 1,
            'source_count' => 1,
            'deployment' => $deployLabel,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
private function applyGenericTags(string $text): string
{
// Collapse contextual role tags (e.g. [FATHER], [JUDGE: Andersen], [CHILD_1]) → [PERSON]