From b217f18118c395be0c26823aeb15cba68678a42f Mon Sep 17 00:00:00 2001 From: davegilligan Date: Fri, 5 Jun 2026 06:19:05 +0200 Subject: [PATCH] feat(tools): graceful degradation when GPU fine-tune is offline Persona-pinned legal model (dbn-legal-agent-v3, served from the home GPU pod) hard-failed ask/legal-analysis whenever the GPU was powered off. Add a cached health-ping gate plus reactive try/catch fallback: if the fine-tune is unreachable, transparently route to gpt-4o and surface a localized notice in what_remains_uncertain that the specialized model is temporarily offline while corpus, retrieval, and sources remain live. Cloud models are excluded from the gate so gpt-4o personas never degrade. Co-Authored-By: Claude Opus 4.7 --- includes/LegalTools.php | 186 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 172 insertions(+), 14 deletions(-) diff --git a/includes/LegalTools.php b/includes/LegalTools.php index e6aa9c6..54c311f 100644 --- a/includes/LegalTools.php +++ b/includes/LegalTools.php @@ -245,9 +245,6 @@ final class DbnLegalToolsService ]; } - [$gateway, $personaModel] = $this->personaGateway($personaResolved, $engine); - $gateway->requireChat(); - $context = $this->buildEvidenceContext($hits); $locale = dbnToolsLanguageName($language); $prompt = <<legalJsonSystemPrompt($language, $personaResolved['system_prompt'] ?? null); - $askDeployment = $personaModel; - $raw = $gateway->withDeployment($askDeployment)->chatText([ + $synth = $this->personaSynthesize($personaResolved, $engine, [ ['role' => 'system', 'content' => $system], ['role' => 'user', 'content' => $prompt], ], [ @@ -279,6 +275,9 @@ PROMPT; 'temperature' => 0.15, 'max_tokens' => 1300, ]); + $gateway = $synth['gateway']; + $askDeployment = $synth['model']; + $raw = $synth['raw']; $json = $gateway->decodeJsonObject($raw); if (!$json) { @@ -310,6 +309,16 @@ PROMPT; ]; } + if ($synth['degraded']) { + $uncertain = $json['what_remains_uncertain'] ?? 
[];
+            if (!is_array($uncertain)) {
+                $uncertain = ($uncertain === '' || $uncertain === null) ? [] : [(string)$uncertain];
+            }
+            array_unshift($uncertain, $this->degradedModelNotice($language));
+            $json['what_remains_uncertain'] = $uncertain;
+            $trace[] = $this->trace('Model routing', 'Fine-tuned legal model unavailable; answered with gpt-4o fallback (corpus + retrieval unaffected).', 'warning');
+        }
+
         $trace[] = $this->trace('Synthesis', 'Azure OpenAI generated an answer using only the retrieved source excerpts.', 'complete');
         $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($json['what_remains_uncertain'] ?? []), 'complete');
         $trace[] = $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the evidence trail.'), 'complete');
@@ -330,6 +339,9 @@ PROMPT;
                 'source_count' => count($hits),
                 'deployment' => $askDeployment,
                 'citation_confidence' => $search['trace_metadata']['citation_confidence'] ?? 'medium',
+                'model_degraded' => $synth['degraded'],
+                'requested_model' => $synth['failed_model'],
+                'used_model' => $askDeployment,
             ],
             'disclaimer' => dbnToolsDisclaimer($language),
         ];
@@ -1222,26 +1234,172 @@ PROMPT;
         return [$this->azure, ($engine === 'azure_full') ? 'gpt-4o' : 'gpt-4o-mini'];
     }
 
+    /**
+     * Classify a deployment name: true when it does not start with any known cloud prefix,
+     * i.e. it is taken to be a self-hosted / fine-tuned model that may be health-gated.
+     * Names are trimmed and lower-cased first; a blank name is never treated as GPU-backed.
+     */
+    private function isGpuBackedModel(string $model): bool
+    {
+        $normalized = strtolower(trim($model));
+        if ($normalized === '') {
+            return false;
+        }
+        $cloudPrefixes = ['gpt-', 'o1', 'o3', 'o4', 'claude', 'azure', 'nova', 'gemini', 'command', 'mistral-large'];
+        $cloudHits = array_filter(
+            $cloudPrefixes,
+            static fn (string $prefix): bool => str_starts_with($normalized, $prefix),
+        );
+        return $cloudHits === [];
+    }
+
+    /** The always-up cloud fallback used when a GPU-backed fine-tune is offline.
*/
+    private function cloudFallbackGateway(): array
+    {
+        return [$this->azure, 'gpt-4o'];
+    }
+
+    /**
+     * Cached reachability gate for a GPU-served model. Pings the gateway at most once per TTL
+     * window (30s after an "up" result, 60s after a "down" result) so requests neither probe
+     * every time nor repeatedly wait out a long synthesis timeout while the GPU is powered off.
+     *
+     * A probe that throws is recorded as "down" instead of escaping: personaSynthesize() calls
+     * this gate before its own try/catch, so an exception here would otherwise surface as the
+     * very hard failure the fallback exists to prevent.
+     *
+     * @param DbnBedrockGateway $gw    Gateway that serves the pinned model.
+     * @param string            $model Deployment name to probe.
+     * @return bool Cached verdict when one is still fresh, otherwise the result of a new ping.
+     */
+    private function gpuModelAvailable(DbnBedrockGateway $gw, string $model): bool
+    {
+        $cached = $this->readGpuHealth($model);
+        if ($cached !== null) {
+            return $cached;
+        }
+        try {
+            // NOTE(review): 8 is handed to ping() as-is; presumably a timeout in seconds — confirm.
+            $ok = (bool)$gw->withDeployment($model)->ping(8);
+        } catch (Throwable $e) {
+            error_log('[dbn-persona] GPU health ping for ' . $model . ' threw; treating as down: ' . $e->getMessage());
+            $ok = false;
+        }
+        $this->writeGpuHealth($model, $ok);
+        return $ok;
+    }
+
+    /** Path of the JSON file that holds health state when APCu is not enabled. */
+    private function gpuHealthFile(): string
+    {
+        return sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'dbn_gpu_health.json';
+    }
+
+    /**
+     * Load the whole health map (model name => ['ok' => bool, 'ts' => int]).
+     *
+     * Reads the 'dbn_gpu_health' APCu entry when APCu is enabled, otherwise the JSON file in
+     * the system temp directory. Missing, unreadable or malformed state yields an empty map,
+     * which readGpuHealth() reports as "no cached verdict".
+     *
+     * @return array<string, mixed>
+     */
+    private function gpuHealthStore(): array
+    {
+        if (function_exists('apcu_enabled') && apcu_enabled()) {
+            $raw = apcu_fetch('dbn_gpu_health');
+            return is_array($raw) ? $raw : [];
+        }
+        $file = $this->gpuHealthFile();
+        if (!is_file($file)) {
+            return [];
+        }
+        // Best-effort read: the file can vanish or be mid-rewrite between is_file() and the read,
+        // so the warning is suppressed and a failed/empty read simply means "nothing cached".
+        $contents = @file_get_contents($file);
+        if ($contents === false || $contents === '') {
+            return [];
+        }
+        $decoded = json_decode($contents, true);
+        return is_array($decoded) ? $decoded : [];
+    }
+
+    /**
+     * Return the cached verdict for $model, or null when there is none or it has expired.
+     *
+     * "Up" verdicts are honoured for 30 seconds and "down" verdicts for 60. An entry whose
+     * timestamp lies in the future (clock step, corrupt or foreign file) is treated as expired;
+     * otherwise it would stay "fresh" for far longer than the intended TTL.
+     */
+    private function readGpuHealth(string $model): ?bool
+    {
+        $data = $this->gpuHealthStore();
+        if (!isset($data[$model]) || !is_array($data[$model])) {
+            return null;
+        }
+        $ok = (bool)($data[$model]['ok'] ?? false);
+        $ts = (int)($data[$model]['ts'] ?? 0);
+        $ttl = $ok ? 30 : 60;
+        $age = time() - $ts;
+        if ($age < 0 || $age > $ttl) {
+            return null;
+        }
+        return $ok;
+    }
+
+    /**
+     * Record a fresh verdict for $model, keeping the entries of other models.
+     *
+     * Stored in APCu (whole map, 120s TTL) when enabled, otherwise written to the JSON temp
+     * file under an exclusive lock. Persisting is best-effort: a failed write only means the
+     * next request probes again.
+     */
+    private function writeGpuHealth(string $model, bool $ok): void
+    {
+        $data = $this->gpuHealthStore();
+        $data[$model] = ['ok' => $ok, 'ts' => time()];
+        if (function_exists('apcu_enabled') && apcu_enabled()) {
+            apcu_store('dbn_gpu_health', $data, 120);
+            return;
+        }
+        $encoded = json_encode($data);
+        if ($encoded === false) {
+            // Unencodable map (e.g. invalid UTF-8 in a model name): keep the previous file
+            // rather than truncating it to an empty string.
+            return;
+        }
+        @file_put_contents($this->gpuHealthFile(), $encoded, LOCK_EX);
+    }
+
+    /**
+     * Run persona-pinned synthesis with graceful degradation.
If the persona pins a
+     * GPU-backed fine-tune that cannot be reached (pod powered off), the same request is
+     * re-issued against cloud gpt-4o so a grounded answer is still returned, not a hard failure.
+     *
+     * @return array{raw:string, gateway:(DbnAzureOpenAiGateway|DbnBedrockGateway), model:string, degraded:bool, failed_model:?string}
+     */
+    private function personaSynthesize(array $persona, string $engine, array $messages, array $options): array
+    {
+        [$gateway, $deployment] = $this->personaGateway($persona, $engine);
+        $pinnedToGpu = ($gateway instanceof DbnBedrockGateway) && $this->isGpuBackedModel($deployment);
+        $fellBack = false;
+        $offlineModel = null;
+
+        // Health gate: a GPU model already reported down is not called at all.
+        if ($pinnedToGpu && !$this->gpuModelAvailable($gateway, $deployment)) {
+            error_log('[dbn-persona] GPU model ' . $deployment . ' marked unavailable; using cloud fallback.');
+            $offlineModel = $deployment;
+            $fellBack = true;
+            [$gateway, $deployment] = $this->cloudFallbackGateway();
+        }
+
+        try {
+            $raw = $gateway->withDeployment($deployment)->chatText($messages, $options);
+        } catch (Throwable $e) {
+            if ($fellBack || !$pinnedToGpu) {
+                // Already running on the cloud fallback, or never pinned to a GPU model: rethrow.
+                throw $e;
+            }
+            error_log('[dbn-persona] pinned GPU model ' . $deployment . ' failed; falling back to cloud: ' . $e->getMessage());
+            // Record the failure before switching so the health cache marks this model as down.
+            $this->writeGpuHealth($deployment, false);
+            $offlineModel = $deployment;
+            $fellBack = true;
+            [$gateway, $deployment] = $this->cloudFallbackGateway();
+            $raw = $gateway->withDeployment($deployment)->chatText($messages, $options);
+        }
+
+        return [
+            'raw' => $raw,
+            'gateway' => $gateway,
+            'model' => $deployment,
+            'degraded' => $fellBack,
+            'failed_model' => $offlineModel,
+        ];
+    }
+
+    /** Localized notice shown when the fine-tuned legal model was offline and gpt-4o answered.
*/
+    private function degradedModelNotice(string $language): string
+    {
+        // Translations keyed by the normalized UI language code; any other code gets English.
+        $localized = [
+            'no' => 'Den spesialiserte, finjusterte norske juridiske modellen er midlertidig utilgjengelig, så dette svaret ble generert av den generelle modellen (gpt-4o). Det juridiske korpuset, kildene og bevisene er fullt tilgjengelige – gjennomgå de siterte kildene som vanlig.',
+            'uk' => 'Спеціалізована доточена норвезька юридична модель тимчасово недоступна, тому цю відповідь згенерувала загальна модель (gpt-4o). Юридичний корпус, джерела та докази повністю доступні — перегляньте цитовані джерела, як зазвичай.',
+            'pl' => 'Wyspecjalizowany, dostrojony norweski model prawny jest tymczasowo niedostępny, więc tę odpowiedź wygenerował model ogólny (gpt-4o). Korpus prawny, źródła i dowody są w pełni dostępne — przejrzyj cytowane źródła jak zwykle.',
+        ];
+        $englishNotice = 'The specialized fine-tuned Norwegian legal model is temporarily offline, so this answer was generated by the general model (gpt-4o). The legal corpus, sources, and evidence are fully live — review the cited sources as usual.';
+
+        $uiLanguage = dbnToolsNormalizeUiLanguage($language);
+        // Only an exact string key selects a translation, mirroring a strict comparison.
+        if (is_string($uiLanguage) && isset($localized[$uiLanguage])) {
+            return $localized[$uiLanguage];
+        }
+        return $englishNotice;
+    }
+
     private function runJsonTool(string $prompt, string $language, int $maxTokens, ?array $persona = null): array
     {
         // With a persona, route to its pinned engine (Track-1 → tuned Qwen, Track-2 → gpt-4o)
         // and fold its domain framing into the system prompt. Without one (e.g. pasted-text
         // tools), keep the default Azure routing with the neutral base prompt.
         $personaPrompt = $persona['system_prompt'] ??
null; - if ($persona !== null) { - [$gateway, $model] = $this->personaGateway($persona, 'azure_mini'); - $gateway = $gateway->withDeployment($model); - } else { - $gateway = $this->azure; - } - $raw = $gateway->chatText([ + $messages = [ ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language, $personaPrompt)], ['role' => 'user', 'content' => $prompt], - ], [ + ]; + $options = [ 'json' => true, 'temperature' => 0.1, 'max_tokens' => $maxTokens, - ]); + ]; + if ($persona !== null) { + // Persona-pinned synthesis with graceful GPU→cloud fallback. + $synth = $this->personaSynthesize($persona, 'azure_mini', $messages, $options); + $gateway = $synth['gateway']; + $raw = $synth['raw']; + } else { + $gateway = $this->azure; + $raw = $gateway->chatText($messages, $options); + } $json = $gateway->decodeJsonObject($raw); if (!$json) { dbnToolsAbort('The model did not return valid structured JSON.', 502, 'invalid_json');