feat: auto-select STT engine (Azure → Google Cloud → Whisper) and show provider in results

Removes user-facing engine/model/key/beam controls. The server now picks
the best available engine automatically:
1. Microsoft Azure Speech — short clips only (≤ 1 MB, audio/* MIME type, no diarization)
2. Google Cloud Speech v2 — long audio, diarization, all languages
3. OpenAI Whisper (local GPU) — local fallback

Results display which provider was used (e.g. "Transcribed with Google
Cloud Speech") via transcript-engine-badge and traceMeta.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-16 13:22:24 +02:00
parent c6a9cc9199
commit 08d1e3cee3
14 changed files with 2937 additions and 416 deletions
+1 -48
View File
@@ -3,11 +3,9 @@ declare(strict_types=1);
$toolName = 'transcribe';
$toolTitle = 'Transcribe audio';
$toolKind = 'Audio Transcription';
$toolBadge = 'Whisper / GPU';
$toolBadge = 'Azure · Google · Whisper';
require_once __DIR__ . '/includes/layout.php';
$azureConfigured = !empty(dbnToolsEnv('DBN_AZURE_SPEECH_KEY'));
?>
<script>window.DBN_AZURE_SPEECH_CONFIGURED = <?= $azureConfigured ? 'true' : 'false' ?>;</script>
<form id="toolForm" class="tool-form">
<div class="lang-switcher" id="uiLangSwitcher" role="group" aria-label="UI language">
@@ -17,33 +15,6 @@ $azureConfigured = !empty(dbnToolsEnv('DBN_AZURE_SPEECH_KEY'));
<button type="button" class="lang-btn" data-lang="pl">&#127477;&#127473; PL</button>
</div>
<div class="control-row" id="engineControl">
<span class="control-label" data-i18n="engine">Engine</span>
<label><input type="radio" name="engine" value="gpu" checked id="engineGpu"> <span data-i18n="engineGpuLabel">GPU (cuttlefish RTX 3060)</span></label>
<label><input type="radio" name="engine" value="openai" id="engineOpenai"> <span data-i18n="engineOpenaiLabel">OpenAI Whisper API</span></label>
<label><input type="radio" name="engine" value="azure" id="engineAzure"> <span data-i18n="engineAzureLabel">Azure AI Speech (nb-NO)</span></label>
</div>
<div class="control-row is-hidden" id="openaiKeyControl">
<span class="control-label" data-i18n="apiKey">API Key</span>
<input type="password" id="openaiKeyInput" name="openai_key" placeholder="sk-…" class="byok-input" autocomplete="off">
<small class="control-hint inline-hint" data-i18n="apiKeyHint">Used for this request only, never stored. Max 25&thinsp;MB.</small>
</div>
<div class="control-row is-hidden" id="azureKeyControl">
<span class="control-label" data-i18n="apiKey">API Key</span>
<input type="password" id="azureKeyInput" name="azure_key" placeholder="Azure Speech key" class="byok-input" autocomplete="off">
<span class="control-label" style="margin-left:1.25rem" data-i18n="region">Region</span>
<input type="text" id="azureRegionInput" name="azure_region" placeholder="norwayeast" class="byok-input byok-input--short" value="norwayeast">
</div>
<div class="control-row" id="modelControl">
<span class="control-label" data-i18n="model">Model</span>
<label><input type="radio" name="model" value="small"> <span data-i18n="modelFastest">Fastest</span> <small class="control-hint">(small)</small></label>
<label><input type="radio" name="model" value="medium"> <span data-i18n="modelBalanced">Balanced</span> <small class="control-hint">(medium)</small></label>
<label><input type="radio" name="model" value="large-v3" checked> <span data-i18n="modelBest">Best quality</span> &#9733; <small class="control-hint">(large-v3)</small></label>
</div>
<div class="control-row" id="transcribeLangControl">
<span class="control-label" data-i18n="transcribeLang">Audio language</span>
<label><input type="radio" name="transcribeLang" value="no" checked> Norsk (nb)</label>
@@ -93,24 +64,6 @@ $azureConfigured = !empty(dbnToolsEnv('DBN_AZURE_SPEECH_KEY'));
</div>
</div>
<details class="expert-settings" id="expertSettings">
<summary class="expert-summary" data-i18n="expertSettings">Advanced settings</summary>
<div class="expert-body">
<div class="control-row">
<span class="control-label" data-i18n="beamSize">Beam size</span>
<label><input type="radio" name="beam_size" value="1"> 1 <small class="control-hint" data-i18n="beamFastest">(fastest)</small></label>
<label><input type="radio" name="beam_size" value="3"> 3</label>
<label><input type="radio" name="beam_size" value="5" checked> 5 <small class="control-hint" data-i18n="beamBest">(best)</small></label>
</div>
<p class="upload-hint" data-i18n="beamSizeHint">Controls search breadth — higher values improve accuracy but take longer. 5 is recommended for legal recordings.</p>
<div class="control-row">
<span class="control-label" data-i18n="vadFilter">VAD filter</span>
<label><input type="checkbox" name="vad_filter" id="vadFilterCheck" value="1" checked> <span data-i18n="vadFilterLabel">Remove silence</span></label>
</div>
<p class="upload-hint" data-i18n="vadFilterHint">Voice Activity Detection — skips silent passages before transcribing. Speeds up processing and prevents the model hallucinating on silence.</p>
</div>
</details>
<!-- Hidden stubs so tools.js refs don't crash on this page -->
<div class="is-hidden" id="languageControl" aria-hidden="true">
<input type="radio" name="language" value="en" checked>