From c84ed2ed787fe483ea636863c251d6eb48509feb Mon Sep 17 00:00:00 2001 From: davegilligan Date: Tue, 2 Jun 2026 17:36:35 +0200 Subject: [PATCH] fix(tools): parse-harden Do Better Legal ask against leaky fine-tune output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dbn-legal-agent-v3 fine-tune (Track 1 / family) emits a labelled-prose template — duplicate `answer:` prefixes, markdown-escaped underscores (`\_`), and a trailing raw JSON blob — rather than the strict JSON the Azure/gpt-4o path produces via response_format. decodeJsonObject() returned null on that invalid JSON, so ask() dumped the entire raw blob into `answer`. Fix at the parse layer (no upstream response_format change, to avoid fighting the fine-tune's training): - dbnToolsRepairJsonText(): strip fences, drop only invalid `\_`/`\*` escapes, then balanced-brace scan collecting every top-level {...} (longest first) to recover an appended JSON object. Shared by both gateways' decodeJsonObject(), so all JSON tools benefit. - dbnToolsParseLabeledFields(): parse labelled-prose into real fields when no JSON decodes, tolerating escaped key names and collapsing duplicate prefixes. - ask() null-fallback now builds clean structured fields from the parsed prose instead of dumping raw; what_remains_uncertain becomes a proper list. 
Co-Authored-By: Claude Opus 4.7
---
 includes/AzureOpenAiGateway.php |  21 +-----
 includes/DbnBedrockGateway.php  |  21 +-----
 includes/LegalTools.php         |  29 ++++++--
 includes/bootstrap.php          | 130 ++++++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+), 44 deletions(-)

diff --git a/includes/AzureOpenAiGateway.php b/includes/AzureOpenAiGateway.php
index 275bd13..dd15c22 100644
--- a/includes/AzureOpenAiGateway.php
+++ b/includes/AzureOpenAiGateway.php
@@ -148,26 +148,7 @@ final class DbnAzureOpenAiGateway
 
     public function decodeJsonObject(string $content): ?array
     {
-        $content = trim($content);
-        $content = (string)preg_replace('/^```(?:json)?\s*\n?/i', '', $content);
-        $content = (string)preg_replace('/\n?```\s*$/', '', $content);
-        $content = trim($content);
-
-        $decoded = json_decode($content, true);
-        if (is_array($decoded)) {
-            return $decoded;
-        }
-
-        $start = strpos($content, '{');
-        $end = strrpos($content, '}');
-        if ($start !== false && $end !== false && $end > $start) {
-            $candidate = substr($content, $start, $end - $start + 1);
-            $decoded = json_decode($candidate, true);
-            if (is_array($decoded)) {
-                return $decoded;
-            }
-        }
-        return null;
+        return dbnToolsRepairJsonText($content);
     }
 
     private function postJson(string $url, array $payload, int $timeout): array
diff --git a/includes/DbnBedrockGateway.php b/includes/DbnBedrockGateway.php
index 846521c..3cc4090 100644
--- a/includes/DbnBedrockGateway.php
+++ b/includes/DbnBedrockGateway.php
@@ -140,26 +140,7 @@ final class DbnBedrockGateway
 
     public function decodeJsonObject(string $content): ?array
     {
-        $content = trim($content);
-        $content = (string)preg_replace('/^```(?:json)?\s*\n?/i', '', $content);
-        $content = (string)preg_replace('/\n?```\s*$/', '', $content);
-        $content = trim($content);
-
-        $decoded = json_decode($content, true);
-        if (is_array($decoded)) {
-            return $decoded;
-        }
-
-        $start = strpos($content, '{');
-        $end = strrpos($content, '}');
-        if ($start !== false && $end !== false && $end > $start) {
-            $candidate = substr($content, $start, $end - $start + 1);
-            $decoded = json_decode($candidate, true);
-            if (is_array($decoded)) {
-                return $decoded;
-            }
-        }
-        return null;
+        return dbnToolsRepairJsonText($content);
     }
 
     // ── Bedrock-specific ──────────────────────────────────────────────────────
diff --git a/includes/LegalTools.php b/includes/LegalTools.php
index 8b0a3b5..9edc780 100644
--- a/includes/LegalTools.php
+++ b/includes/LegalTools.php
@@ -268,12 +268,33 @@ PROMPT;
 
         $json = $gateway->decodeJsonObject($raw);
         if (!$json) {
+            // Some fine-tuned models emit a labelled-prose template instead of JSON.
+            // Parse those labels into the real fields rather than dumping the raw blob.
+            $fields = dbnToolsParseLabeledFields($raw, [
+                'answer', 'what_we_found', 'evidence_trail', 'what_remains_uncertain', 'next_practical_step',
+            ]);
+            $uncertain = trim((string)($fields['what_remains_uncertain'] ?? ''));
+            $uncertainList = $uncertain !== ''
+                ? array_values(array_filter(array_map(
+                    // Strip list bullets as whole characters: a byte-wise ltrim() mask holding "•"
+                    // would also eat the lead bytes of "–" or "“" and leave invalid UTF-8 behind.
+                    static fn(string $l): string => trim((string)preg_replace('/^(?:[-* \t]|•)+/', '', $l)),
+                    preg_split('/\r?\n/', $uncertain) ?: []
+                ), static fn(string $l): bool => $l !== ''))
+                : ['The response format could not be validated as structured JSON.'];
+            $cleanAnswer = trim((string)($fields['answer'] ?? ''));
+            if ($cleanAnswer === '') {
+                // No usable label — strip the trailing appended JSON blob from raw.
+                $cleanAnswer = trim((string)preg_replace('/\s*\{[\s\S]*$/', '', (string)preg_replace('/\\\\([_*])/', '$1', $raw)));
+            }
             $json = [
-                'answer' => $raw,
-                'what_we_found' => 'Azure returned a plain-text answer based on the retrieved excerpts.',
+                'answer' => $cleanAnswer !== '' ? $cleanAnswer : $raw,
+                'what_we_found' => trim((string)($fields['what_we_found'] ?? ''))
+                    ?: 'The model returned a plain-text answer based on the retrieved excerpts.',
                 'evidence_trail' => [],
-                'what_remains_uncertain' => ['The response format could not be validated as structured JSON.'],
-                'next_practical_step' => 'Review the source excerpts manually before relying on the answer.',
+                'what_remains_uncertain' => $uncertainList,
+                'next_practical_step' => trim((string)($fields['next_practical_step'] ?? ''))
+                    ?: 'Review the source excerpts manually before relying on the answer.',
             ];
         }
 
diff --git a/includes/bootstrap.php b/includes/bootstrap.php
index 35ffb8e..090da9c 100644
--- a/includes/bootstrap.php
+++ b/includes/bootstrap.php
@@ -1461,6 +1461,136 @@ function dbnToolsExtractCleanAnswer(string $text): string
     return trim($text);
 }
 
+/**
+ * Robustly extract a JSON object from a model reply, tolerating the artifacts the
+ * fine-tuned models leak: ```fences```, markdown-escaped underscores/asterisks
+ * (`\_`, `\*` — never valid JSON escapes), and prose wrapped around a real JSON
+ * blob. Returns the decoded array, or null if nothing parses. Shared by both
+ * gateways' decodeJsonObject(), so every JSON tool benefits.
+ *
+ * Input that already decodes is returned untouched; the markdown-escape repair
+ * only runs after a strict decode fails, and it skips a backslash that is itself
+ * escaped, so a legitimate `\\_` (escaped backslash + underscore) stays intact.
+ */
+function dbnToolsRepairJsonText(string $content): ?array
+{
+    $content = trim($content);
+    $content = (string)preg_replace('/^```(?:json)?\s*\n?/i', '', $content);
+    $content = (string)preg_replace('/\n?```\s*$/', '', $content);
+    $content = trim($content);
+
+    // Decode as-is first so valid JSON is never rewritten by the repair below.
+    $decoded = json_decode($content, true);
+    if (is_array($decoded)) {
+        return $decoded;
+    }
+
+    // Drop only invalid markdown escapes; leave legitimate \n \" \\ \/ \t intact.
+    // The lookbehind plus the captured run of backslash pairs guarantee that the
+    // backslash being removed is not the second half of an escaped backslash.
+    $content = (string)preg_replace('/(?<!\\\\)((?:\\\\\\\\)*)\\\\([_*])/', '$1$2', $content);
+    $decoded = json_decode($content, true);
+    if (is_array($decoded)) {
+        return $decoded;
+    }
+
+    // Collect every balanced top-level {...} block (ignoring braces inside JSON
+    // strings), then try the longest first — handles "prose then appended JSON".
+    $candidates = [];
+    $depth = 0;
+    $start = -1;
+    $inStr = false;
+    $escaped = false;
+    $len = strlen($content);
+    for ($i = 0; $i < $len; $i++) {
+        $ch = $content[$i];
+        if ($inStr) {
+            if ($escaped) {
+                $escaped = false;
+            } elseif ($ch === '\\') {
+                $escaped = true;
+            } elseif ($ch === '"') {
+                $inStr = false;
+            }
+            continue;
+        }
+        if ($ch === '"') {
+            // Quotes only delimit strings inside an object; a stray quote in the
+            // surrounding prose must not hide the JSON that follows it.
+            $inStr = $depth > 0;
+        } elseif ($ch === '{') {
+            if ($depth === 0) {
+                $start = $i;
+            }
+            $depth++;
+        } elseif ($ch === '}') {
+            if ($depth > 0) {
+                $depth--;
+                if ($depth === 0 && $start >= 0) {
+                    $candidates[] = substr($content, $start, $i - $start + 1);
+                    $start = -1;
+                }
+            }
+        }
+    }
+    usort($candidates, static fn(string $a, string $b): int => strlen($b) <=> strlen($a));
+    foreach ($candidates as $candidate) {
+        $decoded = json_decode($candidate, true);
+        if (is_array($decoded)) {
+            return $decoded;
+        }
+    }
+    return null;
+}
+
+/**
+ * Parse a labelled-prose reply (`answer: ...`, `what_we_found: ...`) into an assoc
+ * array keyed by $keys, for fine-tunes that ignore the JSON contract. Tolerates
+ * markdown-escaped key names (`what\_we\_found`). Each value runs until the next
+ * known key label or a trailing JSON blob (discarded). Returns only found keys.
+ *
+ * @param string       $text Raw model reply.
+ * @param list<string> $keys Field labels to look for.
+ * @return array<string, string> Trimmed values keyed by the labels that were found.
+ */
+function dbnToolsParseLabeledFields(string $text, array $keys): array
+{
+    $text = (string)preg_replace('/\\\\([_*])/', '$1', trim($text));
+    if ($text === '' || empty($keys)) {
+        return [];
+    }
+    // Find each "key:" label position (line start, case-insensitive).
+    $labels = [];
+    foreach ($keys as $key) {
+        if (preg_match('/^\s*' . preg_quote($key, '/') . '\s*:/im', $text, $m, PREG_OFFSET_CAPTURE)) {
+            $labelStart = $m[0][1];
+            $valueStart = $labelStart + strlen($m[0][0]);
+            $labels[] = ['key' => $key, 'start' => $labelStart, 'value_start' => $valueStart];
+        }
+    }
+    if (!$labels) {
+        return [];
+    }
+    usort($labels, static fn(array $a, array $b): int => $a['start'] <=> $b['start']);
+
+    $out = [];
+    $count = count($labels);
+    for ($i = 0; $i < $count; $i++) {
+        $end = ($i + 1 < $count) ? $labels[$i + 1]['start'] : strlen($text);
+        $value = substr($text, $labels[$i]['value_start'], $end - $labels[$i]['value_start']);
+        // Drop a trailing appended JSON blob from the last field's value. Only a
+        // brace that opens an object (`{"` or `{}`) counts, so a literal "{" in the
+        // prose does not truncate the answer.
+        if ($i + 1 === $count && preg_match('/\{\s*["}]/', $value, $blob, PREG_OFFSET_CAPTURE)) {
+            $value = substr($value, 0, $blob[0][1]);
+        }
+        // Collapse every duplicated "key:" prefix the model repeats inside the value.
+        $value = (string)preg_replace('/^(?:\s*' . preg_quote($labels[$i]['key'], '/') . '\s*:\s*)+/i', '', trim($value));
+        $out[$labels[$i]['key']] = trim($value);
+    }
+    return $out;
+}
+
 function dbnToolsInferCheckSeverity(string $text): string
 {
     if (preg_match('/ugyldig|§\s*41|kontradiksjon|klar nødvendighet|strand lobben|biologiske bånd/i', $text)) {