fix(tools): parse-harden Do Better Legal ask against leaky fine-tune output
The dbn-legal-agent-v3 fine-tune (Track 1 / family) emits a labelled-prose
template — duplicate `answer:` prefixes, markdown-escaped underscores (`\_`),
and a trailing raw JSON blob — rather than the strict JSON the Azure/gpt-4o
path produces via response_format. decodeJsonObject() returned null on that
invalid JSON, so ask() dumped the entire raw blob into `answer`.
Fix at the parse layer (no upstream response_format change, to avoid fighting
the fine-tune's training):
- dbnToolsRepairJsonText(): strip fences, drop only invalid `\_`/`\*` escapes,
then run a balanced-brace scan that collects every top-level {...} (longest
first) to recover an appended JSON object. Shared by both gateways'
decodeJsonObject(), so all JSON tools benefit.
- dbnToolsParseLabeledFields(): parse labelled-prose into real fields when no
JSON decodes, tolerating escaped key names and collapsing duplicate prefixes.
- ask() null-fallback now builds clean structured fields from the parsed prose
instead of dumping the raw output; what_remains_uncertain becomes a proper list.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+23
-4
@@ -268,12 +268,31 @@ PROMPT;
|
||||
|
||||
$json = $gateway->decodeJsonObject($raw);
|
||||
if (!$json) {
|
||||
// Some fine-tuned models emit a labelled-prose template instead of JSON.
|
||||
// Parse those labels into the real fields rather than dumping the raw blob.
|
||||
$fields = dbnToolsParseLabeledFields($raw, [
|
||||
'answer', 'what_we_found', 'evidence_trail', 'what_remains_uncertain', 'next_practical_step',
|
||||
]);
|
||||
$uncertain = trim((string)($fields['what_remains_uncertain'] ?? ''));
|
||||
$uncertainList = $uncertain !== ''
|
||||
? array_values(array_filter(array_map(
|
||||
static fn(string $l): string => trim(ltrim($l, "-*• \t")),
|
||||
preg_split('/\r?\n/', $uncertain) ?: []
|
||||
), static fn(string $l): bool => $l !== ''))
|
||||
: ['The response format could not be validated as structured JSON.'];
|
||||
$cleanAnswer = trim((string)($fields['answer'] ?? ''));
|
||||
if ($cleanAnswer === '') {
|
||||
// No usable label — strip the trailing appended JSON blob from raw.
|
||||
$cleanAnswer = trim((string)preg_replace('/\s*\{[\s\S]*$/', '', (string)preg_replace('/\\\\([_*])/', '$1', $raw)));
|
||||
}
|
||||
$json = [
|
||||
'answer' => $raw,
|
||||
'what_we_found' => 'Azure returned a plain-text answer based on the retrieved excerpts.',
|
||||
'answer' => $cleanAnswer !== '' ? $cleanAnswer : $raw,
|
||||
'what_we_found' => trim((string)($fields['what_we_found'] ?? ''))
|
||||
?: 'The model returned a plain-text answer based on the retrieved excerpts.',
|
||||
'evidence_trail' => [],
|
||||
'what_remains_uncertain' => ['The response format could not be validated as structured JSON.'],
|
||||
'next_practical_step' => 'Review the source excerpts manually before relying on the answer.',
|
||||
'what_remains_uncertain' => $uncertainList,
|
||||
'next_practical_step' => trim((string)($fields['next_practical_step'] ?? ''))
|
||||
?: 'Review the source excerpts manually before relying on the answer.',
|
||||
];
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user