feat(tools): graceful degradation when GPU fine-tune is offline

The persona-pinned legal model (dbn-legal-agent-v3, served from the home GPU
pod) hard-failed ask/legal-analysis whenever the GPU was powered off.
Add a cached health-ping gate plus a reactive try/catch fallback: if the
fine-tune is unreachable, transparently route to gpt-4o and surface a
localized notice in what_remains_uncertain stating that the specialized model
is temporarily offline while the corpus, retrieval, and sources remain live.
Cloud models are excluded from the gate so gpt-4o personas never degrade.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-05 06:19:05 +02:00
parent c22f54bb7b
commit b217f18118
+172 -14
View File
@@ -245,9 +245,6 @@ final class DbnLegalToolsService
];
}
[$gateway, $personaModel] = $this->personaGateway($personaResolved, $engine);
$gateway->requireChat();
$context = $this->buildEvidenceContext($hits);
$locale = dbnToolsLanguageName($language);
$prompt = <<<PROMPT
@@ -270,8 +267,7 @@ PROMPT;
// Persona voice/domain folded into the JSON-enforcing scaffold (keeps the
// structured-output contract while applying the persona's legal framing).
$system = $this->legalJsonSystemPrompt($language, $personaResolved['system_prompt'] ?? null);
$askDeployment = $personaModel;
$raw = $gateway->withDeployment($askDeployment)->chatText([
$synth = $this->personaSynthesize($personaResolved, $engine, [
['role' => 'system', 'content' => $system],
['role' => 'user', 'content' => $prompt],
], [
@@ -279,6 +275,9 @@ PROMPT;
'temperature' => 0.15,
'max_tokens' => 1300,
]);
$gateway = $synth['gateway'];
$askDeployment = $synth['model'];
$raw = $synth['raw'];
$json = $gateway->decodeJsonObject($raw);
if (!$json) {
@@ -310,6 +309,16 @@ PROMPT;
];
}
if ($synth['degraded']) {
$uncertain = $json['what_remains_uncertain'] ?? [];
if (!is_array($uncertain)) {
$uncertain = ($uncertain === '' || $uncertain === null) ? [] : [(string)$uncertain];
}
array_unshift($uncertain, $this->degradedModelNotice($language));
$json['what_remains_uncertain'] = $uncertain;
$trace[] = $this->trace('Model routing', 'Fine-tuned legal model unavailable; answered with gpt-4o fallback (corpus + retrieval unaffected).', 'warning');
}
$trace[] = $this->trace('Synthesis', 'Azure OpenAI generated an answer using only the retrieved source excerpts.', 'complete');
$trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete');
$trace[] = $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the evidence trail.'), 'complete');
@@ -330,6 +339,9 @@ PROMPT;
'source_count' => count($hits),
'deployment' => $askDeployment,
'citation_confidence' => $search['trace_metadata']['citation_confidence'] ?? 'medium',
'model_degraded' => $synth['degraded'],
'requested_model' => $synth['failed_model'],
'used_model' => $askDeployment,
],
'disclaimer' => dbnToolsDisclaimer($language),
];
@@ -1222,26 +1234,172 @@ PROMPT;
return [$this->azure, ($engine === 'azure_full') ? 'gpt-4o' : 'gpt-4o-mini'];
}
/**
 * Decide whether a model name refers to a self-hosted / fine-tuned model served from
 * the GPU pod (e.g. dbn-legal-agent-v3, dobetter-norge-v4, bnl-legal).
 *
 * Names that start with one of the known cloud prefixes (gpt-*, claude-*, …) are
 * reported as NOT GPU-backed, so they are never health-gated or treated as degradable.
 * An empty or whitespace-only name is also reported as not GPU-backed.
 *
 * @param string $model Model / deployment name; compared trimmed and lower-cased.
 * @return bool True when the name is non-empty and matches none of the cloud prefixes.
 */
private function isGpuBackedModel(string $model): bool
{
    $normalized = strtolower(trim($model));
    if ($normalized === '') {
        return false;
    }

    $cloudPrefixes = ['gpt-', 'o1', 'o3', 'o4', 'claude', 'azure', 'nova', 'gemini', 'command', 'mistral-large'];
    $matchingPrefixes = array_filter(
        $cloudPrefixes,
        static fn (string $prefix): bool => str_starts_with($normalized, $prefix),
    );

    // No cloud prefix matched → the model is assumed to live on the GPU pod.
    return $matchingPrefixes === [];
}
/**
 * Gateway/model pair used as the cloud fallback when a GPU-backed fine-tune is offline.
 *
 * @return array Two-element list: the Azure gateway held by this service, then the
 *               model name 'gpt-4o'.
 */
private function cloudFallbackGateway(): array
{
    $fallbackModel = 'gpt-4o';

    return [$this->azure, $fallbackModel];
}
/**
 * Cached reachability gate for a GPU-served model.
 *
 * A still-fresh cached verdict (30s when up, 60s when down — see readGpuHealth) is
 * returned without touching the network. Otherwise the gateway is pinged once and the
 * verdict is cached, so we neither probe on every request nor wait out a long synthesis
 * timeout repeatedly while the GPU is powered off.
 *
 * A ping that throws is treated as "unavailable" and cached as such: this gate is called
 * outside the caller's try/catch, so a propagating exception would turn an offline pod
 * into a hard failure instead of a graceful fallback.
 *
 * @param DbnBedrockGateway $gw    Gateway that fronts the GPU-served model.
 * @param string            $model Model / deployment name to probe.
 * @return bool True when the model is believed reachable.
 */
private function gpuModelAvailable(DbnBedrockGateway $gw, string $model): bool
{
    $cached = $this->readGpuHealth($model);
    if ($cached !== null) {
        return $cached;
    }

    try {
        // NOTE(review): 8 is presumably the ping timeout in seconds — confirm against ping().
        $ok = (bool)$gw->withDeployment($model)->ping(8);
    } catch (Throwable $e) {
        error_log('[dbn-persona] GPU health ping for ' . $model . ' threw; treating as unavailable: ' . $e->getMessage());
        $ok = false;
    }

    $this->writeGpuHealth($model, $ok);

    return $ok;
}
/**
 * Load the whole GPU health map from the shared store.
 *
 * When APCu is installed and enabled the map is read from the 'dbn_gpu_health' APCu
 * key; otherwise it is read from dbn_gpu_health.json in the system temp directory.
 * Anything that is missing, unreadable, or does not decode to an array yields [].
 *
 * @return array The stored map (written by writeGpuHealth as model name => entry),
 *               or [] when nothing usable is stored.
 */
private function gpuHealthStore(): array
{
    $apcuUsable = function_exists('apcu_enabled') && apcu_enabled();
    if ($apcuUsable) {
        // apcu_fetch() yields false on a miss, which is_array() rejects.
        $stored = apcu_fetch('dbn_gpu_health');

        return is_array($stored) ? $stored : [];
    }

    $path = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'dbn_gpu_health.json';
    if (!is_file($path)) {
        return [];
    }

    // Read errors are deliberately silenced: an unreadable file is just a cache miss.
    $parsed = json_decode((string)@file_get_contents($path), true);

    return is_array($parsed) ? $parsed : [];
}
/**
 * Look up the cached reachability verdict for one model.
 *
 * An "up" verdict stays valid for 30 seconds and a "down" verdict for 60 seconds,
 * measured from the entry's 'ts' timestamp.
 *
 * @param string $model Model / deployment name used as the key in the health map.
 * @return bool|null The cached verdict, or null when there is no entry, the entry is
 *                   malformed, or it is no longer fresh (the caller should re-probe).
 */
private function readGpuHealth(string $model): ?bool
{
    $entry = $this->gpuHealthStore()[$model] ?? null;
    if (!is_array($entry)) {
        return null;
    }

    $ok = (bool)($entry['ok'] ?? false);
    $ts = (int)($entry['ts'] ?? 0);
    $ttl = $ok ? 30 : 60;
    $age = time() - $ts;

    // A negative age means the entry is stamped in the future (system clock stepped
    // back, or a hand-edited temp file). Without this check such an entry never
    // expires in the file-backed store and could pin the model as offline; treat it
    // as stale so the next request re-probes.
    if ($age < 0 || $age > $ttl) {
        return null;
    }

    return $ok;
}
/**
 * Record a reachability verdict for one model in the shared health map.
 *
 * The current map is loaded, the model's entry is replaced with the verdict and the
 * current Unix time, and the whole map is written back: to the 'dbn_gpu_health' APCu
 * key (120-second TTL) when APCu is enabled, otherwise to dbn_gpu_health.json in the
 * system temp directory under an exclusive lock. The file write is best-effort and
 * its errors are suppressed.
 *
 * @param string $model Model / deployment name used as the map key.
 * @param bool   $ok    True when the model was reachable.
 */
private function writeGpuHealth(string $model, bool $ok): void
{
    $snapshot = $this->gpuHealthStore();
    $snapshot[$model] = ['ok' => $ok, 'ts' => time()];

    $apcuUsable = function_exists('apcu_enabled') && apcu_enabled();
    if (!$apcuUsable) {
        $path = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'dbn_gpu_health.json';
        @file_put_contents($path, json_encode($snapshot), LOCK_EX);

        return;
    }

    apcu_store('dbn_gpu_health', $snapshot, 120);
}
/**
 * Persona-pinned synthesis that degrades gracefully instead of hard-failing.
 *
 * The persona's gateway and model are resolved via personaGateway(). When that model
 * is a GPU-backed fine-tune behind a DbnBedrockGateway, two safety nets apply:
 *   1. the cached health gate — a model already known to be down is skipped up front;
 *   2. a reactive catch — if the chat call itself throws, the model is marked down
 *      and the same request is retried once on the cloud fallback (gpt-4o).
 * Failures of a non-GPU model, or of the fallback itself, are rethrown unchanged.
 *
 * @param array  $persona  Resolved persona, passed through to personaGateway().
 * @param string $engine   Engine selector, passed through to personaGateway().
 * @param array  $messages Chat messages handed to chatText().
 * @param array  $options  Chat options handed to chatText().
 * @return array{raw:string, gateway:(DbnAzureOpenAiGateway|DbnBedrockGateway), model:string, degraded:bool, failed_model:?string}
 * @throws Throwable When no fallback applies or the fallback call also fails.
 */
private function personaSynthesize(array $persona, string $engine, array $messages, array $options): array
{
    [$activeGateway, $activeModel] = $this->personaGateway($persona, $engine);
    $pinnedToGpu = $activeGateway instanceof DbnBedrockGateway && $this->isGpuBackedModel($activeModel);
    $fellBack = false;
    $unavailableModel = null;

    // Proactive path: the health gate says the pod is down → go straight to cloud.
    if ($pinnedToGpu && !$this->gpuModelAvailable($activeGateway, $activeModel)) {
        error_log('[dbn-persona] GPU model ' . $activeModel . ' marked unavailable; using cloud fallback.');
        $unavailableModel = $activeModel;
        $fellBack = true;
        [$activeGateway, $activeModel] = $this->cloudFallbackGateway();
    }

    try {
        $raw = $activeGateway->withDeployment($activeModel)->chatText($messages, $options);
    } catch (Throwable $failure) {
        // Only a first-attempt failure of a pinned GPU model earns a retry on cloud.
        $canFallBack = $pinnedToGpu && !$fellBack;
        if (!$canFallBack) {
            throw $failure;
        }

        error_log('[dbn-persona] pinned GPU model ' . $activeModel . ' failed; falling back to cloud: ' . $failure->getMessage());
        // Remember the outage so subsequent requests skip the doomed attempt.
        $this->writeGpuHealth($activeModel, false);
        $unavailableModel = $activeModel;
        $fellBack = true;
        [$activeGateway, $activeModel] = $this->cloudFallbackGateway();
        $raw = $activeGateway->withDeployment($activeModel)->chatText($messages, $options);
    }

    return [
        'raw' => $raw,
        'gateway' => $activeGateway,
        'model' => $activeModel,
        'degraded' => $fellBack,
        'failed_model' => $unavailableModel,
    ];
}
/**
 * Localized notice shown when the fine-tuned legal model was offline and gpt-4o answered.
 *
 * The language is normalized with dbnToolsNormalizeUiLanguage(); 'no', 'uk' and 'pl'
 * have dedicated translations and every other value falls back to English.
 *
 * @param string $language UI language as supplied by the caller.
 * @return string The user-facing notice text.
 */
private function degradedModelNotice(string $language): string
{
    return match (dbnToolsNormalizeUiLanguage($language)) {
        // The " — " before "gjennomgå" was missing, fusing two sentences; it now matches the other locales.
        'no' => 'Den spesialiserte, finjusterte norske juridiske modellen er midlertidig utilgjengelig, så dette svaret ble generert av den generelle modellen (gpt-4o). Det juridiske korpuset, kildene og bevisene er fullt tilgjengelige — gjennomgå de siterte kildene som vanlig.',
        'uk' => 'Спеціалізована доточена норвезька юридична модель тимчасово недоступна, тому цю відповідь згенерувала загальна модель (gpt-4o). Юридичний корпус, джерела та докази повністю доступні — перегляньте цитовані джерела, як зазвичай.',
        'pl' => 'Wyspecjalizowany, dostrojony norweski model prawny jest tymczasowo niedostępny, więc tę odpowiedź wygenerował model ogólny (gpt-4o). Korpus prawny, źródła i dowody są w pełni dostępne — przejrzyj cytowane źródła jak zwykle.',
        default => 'The specialized fine-tuned Norwegian legal model is temporarily offline, so this answer was generated by the general model (gpt-4o). The legal corpus, sources, and evidence are fully live — review the cited sources as usual.',
    };
}
private function runJsonTool(string $prompt, string $language, int $maxTokens, ?array $persona = null): array
{
// With a persona, route to its pinned engine (Track-1 → tuned Qwen, Track-2 → gpt-4o)
// and fold its domain framing into the system prompt. Without one (e.g. pasted-text
// tools), keep the default Azure routing with the neutral base prompt.
$personaPrompt = $persona['system_prompt'] ?? null;
if ($persona !== null) {
[$gateway, $model] = $this->personaGateway($persona, 'azure_mini');
$gateway = $gateway->withDeployment($model);
} else {
$gateway = $this->azure;
}
$raw = $gateway->chatText([
$messages = [
['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language, $personaPrompt)],
['role' => 'user', 'content' => $prompt],
], [
];
$options = [
'json' => true,
'temperature' => 0.1,
'max_tokens' => $maxTokens,
]);
];
if ($persona !== null) {
// Persona-pinned synthesis with graceful GPU→cloud fallback.
$synth = $this->personaSynthesize($persona, 'azure_mini', $messages, $options);
$gateway = $synth['gateway'];
$raw = $synth['raw'];
} else {
$gateway = $this->azure;
$raw = $gateway->chatText($messages, $options);
}
$json = $gateway->decodeJsonObject($raw);
if (!$json) {
dbnToolsAbort('The model did not return valid structured JSON.', 502, 'invalid_json');