diff --git a/api/transcribe.php b/api/transcribe.php index d9f3664..202bfd1 100644 --- a/api/transcribe.php +++ b/api/transcribe.php @@ -27,6 +27,11 @@ $task = ($_POST['task'] ?? 'transcribe') === 'translate' ? 'translate' : $vadFilter = !empty($_POST['vad_filter']) && $_POST['vad_filter'] !== '0'; $initPrompt = substr(trim((string)($_POST['initial_prompt'] ?? '')), 0, 500); +$allowedPostModels = ['', 'gpt-4o-mini', 'gpt-4o']; +$postModel = in_array($_POST['post_model'] ?? '', $allowedPostModels, true) + ? (string)($_POST['post_model'] ?? '') + : ''; + // ── Validate upload ─────────────────────────────────────────────────────────── if (empty($_FILES['audio']) || $_FILES['audio']['error'] !== UPLOAD_ERR_OK) { @@ -114,6 +119,17 @@ if ($timeOffset > 0.0 && !empty($result['segments'])) { unset($seg); } +// ── Optional GPT cleanup pass ───────────────────────────────────────────────── + +$cleanedBy = null; +if ($postModel !== '' && !empty($result['text'])) { + $cleaned = dbnCleanupTranscript($result['text'], $language, $initPrompt, $postModel); + if ($cleaned !== null) { + $result['text'] = $cleaned; + $cleanedBy = $postModel; + } +} + // ── Speaker role labelling (diarize + multiple speakers only) ───────────────── $segments = $result['segments'] ?? []; @@ -126,7 +142,8 @@ if ($numDetected < 2 && $segments) { $speakerRoles = null; if ($diarize && $numDetected > 1 && $segments) { - $speakerRoles = dbnLabelSpeakerRoles($segments); + $labelDeployment = $postModel ?: 'gpt-4o-mini'; + $speakerRoles = dbnLabelSpeakerRoles($segments, $labelDeployment); } // ── Friendly engine label ───────────────────────────────────────────────────── @@ -161,6 +178,7 @@ dbnToolsRespond([ 'model' => $engineLabel, 'engine' => $engineUsed, 'latency_ms' => $latencyMs, + 'cleaned_by' => $cleanedBy, ]); @@ -313,7 +331,7 @@ function transcribeViaWhisperGpu(array $file, string $language, bool $diarize, i } -function dbnLabelSpeakerRoles(array $segments): array +function dbnLabelSpeakerRoles(array $segments, string $deployment = 'gpt-4o-mini'): array { $sample = array_slice( array_values(array_filter($segments, fn($s) => isset($s['speaker']))), @@ -322,16 +340,11 @@ function dbnLabelSpeakerRoles(array $segments): array if (!$sample) return []; $lines = array_map(fn($s) => "[{$s['speaker']}] " . trim((string)($s['text'] ?? '')), $sample); - $azure = new DbnAzureOpenAiGateway(); - $system = 'You are analyzing a legal proceeding transcript. ' - . 'Based on the first segments, identify the role of each speaker. ' - . 'Common roles in Norwegian legal proceedings: dommer (judge), advokat (lawyer), ' - . 'forelder (parent), barn (child), sakkyndig (expert witness), ' - . 'saksbehandler (caseworker), tolk (interpreter), vitne (witness), ' - . 'prosessfullmektig (counsel). ' + $azure = (new DbnAzureOpenAiGateway())->withDeployment($deployment); + $system = 'Label speakers in this Norwegian legal transcript. ' . 'Return ONLY valid JSON: {"SPEAKER_00":"dommer","SPEAKER_01":"forelder"}. ' - . 'Use Norwegian role names. Use "ukjent" if role cannot be determined. ' - . 'Only include speakers present in the input.'; + . 'Norwegian role names only — dommer, advokat, forelder, barn, sakkyndig, ' + . 'saksbehandler, tolk, vitne, prosessfullmektig. Use "ukjent" if unclear.'; try { $text = $azure->chatText([ @@ -345,3 +358,36 @@ function dbnLabelSpeakerRoles(array $segments): array return []; } } + + +function dbnCleanupTranscript(string $text, string $language, string $vocabulary, string $deployment): ?string +{ + $langName = match($language) { + 'no', 'nb', 'nn' => 'Norwegian', + 'en' => 'English', + 'pl' => 'Polish', + 'uk' => 'Ukrainian', + 'sv' => 'Swedish', + 'da' => 'Danish', + 'de' => 'German', + 'fr' => 'French', + default => 'Norwegian', + }; + $vocabHint = $vocabulary !== '' ? " Domain terms to preserve correctly: {$vocabulary}." : ''; + $system = "Fix transcription errors in this {$langName} text.{$vocabHint} " + . "Correct mishearing errors, run-on sentences, and punctuation. " + . "Preserve all meaning and the original language exactly. " + . "Return only the corrected transcript text, no commentary."; + + try { + $azure = (new DbnAzureOpenAiGateway())->withDeployment($deployment); + $result = $azure->chatText( + [['role' => 'system', 'content' => $system], + ['role' => 'user', 'content' => $text]], + ['temperature' => 0.1, 'max_tokens' => 4096] + ); + return ($result !== '' && $result !== null) ? $result : null; + } catch (Throwable) { + return null; + } +} diff --git a/assets/js/tools.js b/assets/js/tools.js index d680339..88e7db7 100644 --- a/assets/js/tools.js +++ b/assets/js/tools.js @@ -441,6 +441,20 @@ const TRANSCRIBE_I18N = { traceProcessingDetail: () => 'Processing audio. Large files may take 1–3 minutes.', traceStillLabel: (clip) => `${clip} — still processing…`, traceStillDetail: (e) => { const m = Math.floor(e / 60), s = e % 60; return m > 0 ? `${m}m ${s}s elapsed — working through the audio.` : `${e}s elapsed — processing.`; }, + advancedOptions: 'Advanced options', + task: 'Task', + taskTranscribe: 'Transcribe', + taskTranslate: 'Translate to English', + vadFilter: 'VAD filter', + vadFilterLabel: 'Remove silence / noise', + vadFilterHint: 'Improves accuracy on recordings with long pauses.', + whisperModel: 'Whisper model', + whisperModelHint: 'Used when Azure/GCP unavailable. large-v3 is the default.', + postModel: 'AI cleanup', + postModelNone: 'None', + postModelMini: 'GPT-4o Mini', + postModelFull: 'GPT-4o', + postModelHint: 'Fixes errors, punctuation, and domain terms after transcription.', }, no: { transcribeLang: 'Språk i lydfil', @@ -481,6 +495,20 @@ const TRANSCRIBE_I18N = { traceProcessingLabel: (clip) => `${clip} — transkriberer`, traceProcessingDetail: () => 'Behandler lyden. Store filer tar 1–3 minutter.', traceStillLabel: (clip) => `${clip} — behandler fortsatt…`, traceStillDetail: (e) => { const m = Math.floor(e / 60), s = e % 60; return m > 0 ? `${m} min ${s}s gått — jobber gjennom lyden.` : `${e}s gått — behandler.`; }, + advancedOptions: 'Avanserte valg', + task: 'Oppgave', + taskTranscribe: 'Transkriber', + taskTranslate: 'Oversett til engelsk', + vadFilter: 'VAD-filter', + vadFilterLabel: 'Fjern stillhet / støy', + vadFilterHint: 'Forbedrer nøyaktigheten ved opptak med lange pauser.', + whisperModel: 'Whisper-modell', + whisperModelHint: 'Brukes når Azure/GCP ikke er tilgjengelig. large-v3 er standard.', + postModel: 'AI-opprydding', + postModelNone: 'Ingen', + postModelMini: 'GPT-4o Mini', + postModelFull: 'GPT-4o', + postModelHint: 'Retter feil, tegnsetting og fagtermer etter transkripsjon.', }, uk: { transcribeLang: 'Мова аудіо', @@ -521,6 +549,20 @@ const TRANSCRIBE_I18N = { traceProcessingLabel: (clip) => `${clip} — транскрибування`, traceProcessingDetail: () => 'Обробка аудіо. Великі файли займають 1–3 хвилини.', traceStillLabel: (clip) => `${clip} — ще обробляється…`, traceStillDetail: (e) => { const m = Math.floor(e / 60), s = e % 60; return m > 0 ? `Минуло ${m} хв ${s} с — обробка.` : `Минуло ${e} с — обробка.`; }, + advancedOptions: 'Розширені параметри', + task: 'Завдання', + taskTranscribe: 'Транскрибувати', + taskTranslate: 'Перекласти на англійську', + vadFilter: 'VAD-фільтр', + vadFilterLabel: 'Видалити тишу / шум', + vadFilterHint: 'Покращує точність для записів з довгими паузами.', + whisperModel: 'Модель Whisper', + whisperModelHint: 'Використовується, якщо Azure/GCP недоступні. large-v3 за замовчуванням.', + postModel: 'AI-очищення', + postModelNone: 'Без', + postModelMini: 'GPT-4o Mini', + postModelFull: 'GPT-4o', + postModelHint: 'Виправляє помилки, пунктуацію та терміни після транскрипції.', }, pl: { transcribeLang: 'Język audio', @@ -561,6 +603,20 @@ const TRANSCRIBE_I18N = { traceProcessingLabel: (clip) => `${clip} — transkrybowanie`, traceProcessingDetail: () => 'Przetwarzanie audio. Duże pliki zajmują 1–3 minuty.', traceStillLabel: (clip) => `${clip} — nadal przetwarza…`, traceStillDetail: (e) => { const m = Math.floor(e / 60), s = e % 60; return m > 0 ? `Minęło ${m} min ${s} s — przetwarzanie audio.` : `Minęło ${e} s — przetwarzanie.`; }, + advancedOptions: 'Opcje zaawansowane', + task: 'Zadanie', + taskTranscribe: 'Transkrypcja', + taskTranslate: 'Przetłumacz na angielski', + vadFilter: 'Filtr VAD', + vadFilterLabel: 'Usuń ciszę / szum', + vadFilterHint: 'Poprawia dokładność nagrań z długimi przerwami.', + whisperModel: 'Model Whisper', + whisperModelHint: 'Używany gdy Azure/GCP niedostępne. large-v3 jest domyślny.', + postModel: 'Korekta AI', + postModelNone: 'Brak', + postModelMini: 'GPT-4o Mini', + postModelFull: 'GPT-4o', + postModelHint: 'Poprawia błędy, interpunkcję i terminy po transkrypcji.', }, }; @@ -1515,6 +1571,10 @@ async function runTranscribe() { formData.append('time_offset', String(cumulativeOffset)); if (vadFilter) formData.append('vad_filter', '1'); if (initPrompt) formData.append('initial_prompt', initPrompt); + const whisperModel = document.getElementById('whisperModelSelect')?.value; + if (whisperModel) formData.append('model', whisperModel); + const postModel = document.querySelector('input[name="post_model"]:checked')?.value; + if (postModel) formData.append('post_model', postModel); if (diarize) { formData.append('diarize', '1'); if (numSpeakers >= 2) formData.append('num_speakers', String(numSpeakers)); @@ -1650,6 +1710,7 @@ function renderTranscriptResults(data) { if (data.language) traceMeta.push({ label: `Language: ${data.language}`, detail: '', status: 'complete' }); if (data.num_speakers > 1) traceMeta.push({ label: `Speakers detected: ${data.num_speakers}`, detail: Object.entries(speakerRoles).map(([id, r]) => `${id}: ${r}`).join(', ') || '', status: 'complete' }); if (data.model) traceMeta.push({ label: data.model, detail: '', status: 'complete' }); + if (data.cleaned_by) traceMeta.push({ label: `Cleaned by ${data.cleaned_by}`, detail: '', status: 'complete' }); renderTrace(traceMeta.length ? traceMeta : [{ label: 'Transcribed', detail: '', status: 'complete' }]); } diff --git a/includes/BvjAnalyzerAgent.php b/includes/BvjAnalyzerAgent.php index d74b53d..c73643e 100644 --- a/includes/BvjAnalyzerAgent.php +++ b/includes/BvjAnalyzerAgent.php @@ -493,7 +493,7 @@ PROMPT; private function extractParties(string $docText, string $language): array { $locale = dbnToolsLanguageName($language); - $excerpt = mb_substr($docText, 0, 12000, 'UTF-8'); + $excerpt = mb_substr($docText, 0, 20000, 'UTF-8'); $prompt = <<azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], - ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 1500, 'timeout' => 40]); + ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 2000, 'timeout' => 45]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && is_array($json['parties'] ?? null)) { - return array_slice($json['parties'], 0, 20); + return array_slice($json['parties'], 0, 25); } // Fallback: model returned an array at root level instead of {parties:[...]} if (is_array($json) && isset($json[0]['name'])) { - return array_slice($json, 0, 20); + return array_slice($json, 0, 25); } error_log('BVJ extractParties unexpected structure: ' . substr($raw, 0, 300)); } catch (Throwable $e) { @@ -541,7 +542,7 @@ PROMPT; private function extractTimeline(string $docText, string $language): array { $locale = dbnToolsLanguageName($language); - $excerpt = mb_substr($docText, 0, 12000, 'UTF-8'); + $excerpt = mb_substr($docText, 0, 20000, 'UTF-8'); $prompt = <<azure->chatText([ ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], ['role' => 'user', 'content' => $prompt], - ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 3000, 'timeout' => 45]); + ], ['json' => true, 'temperature' => 0.05, 'max_tokens' => 4000, 'timeout' => 55]); $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && is_array($json['events'] ?? null)) { - return array_slice($json['events'], 0, 30); + return array_slice($json['events'], 0, 40); } } catch (Throwable $e) { error_log('BVJ extractTimeline failed: ' . $e->getMessage()); @@ -600,52 +611,84 @@ PROMPT; int $count, string $language ): array { - $locale = dbnToolsLanguageName($language); - $docType = $docMeta['doc_type'] ?? 'BVJ document'; - $roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party'; + $locale = dbnToolsLanguageName($language); + $docType = $docMeta['doc_type'] ?? 'BVJ document'; + $docDate = $docMeta['doc_date'] ?? 'unknown date'; + $authority = $docMeta['issuing_authority'] ?? 'the municipality'; + $roleStr = $advocateRole !== '' ? $advocateRole : 'the affected party'; - // Summarise the top events to give the model context + // Summarise high-significance events first, then others + $highEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high')); + $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high')); + $topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 12); $eventSummary = ''; - $highEvents = array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high'); - $topEvents = array_slice(array_merge(array_values($highEvents), - array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high'))), 0, 8); foreach ($topEvents as $ev) { - $eventSummary .= sprintf("- %s: %s (%s)\n", $ev['date'] ?? '?', $ev['action'] ?? '', $ev['actor'] ?? ''); + $sig = ($ev['significance'] ?? 'low') === 'high' ? '[HIGH] ' : ''; + $eventSummary .= sprintf("- %s %s%s (%s)\n", + $ev['date'] ?? '?', $sig, $ev['action'] ?? '', $ev['actor'] ?? ''); } // Summarise parties $partyList = ''; - foreach (array_slice($parties, 0, 8) as $p) { - $partyList .= sprintf("- %s (%s)\n", $p['name'] ?? '', $p['role'] ?? ''); + foreach (array_slice($parties, 0, 10) as $p) { + $org = !empty($p['organization']) ? ' at ' . $p['organization'] : ''; + $partyList .= sprintf("- %s (%s%s)\n", $p['name'] ?? '?', $p['role'] ?? '?', $org); } + $angleGuidance = match (true) { + $count >= 5 => << << << $p) { + foreach (array_slice($parties, 0, 12) as $i => $p) { $org = $p['organization'] ? ' (' . $p['organization'] . ')' : ''; $rel = $p['relationship_to_child'] ? ' — rel: ' . $p['relationship_to_child'] : ''; $partiesSummary .= sprintf("%d. %s — %s%s%s\n", $i + 1, $p['name'] ?? '', $p['role'] ?? '', $org, $rel); } - // Build timeline summary (top 15 most significant events) + // Build timeline summary (top 20 most significant events) $highEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') === 'high')); $otherEvents = array_values(array_filter($timelineEvents, fn($e) => ($e['significance'] ?? '') !== 'high')); - $topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 15); + $topEvents = array_slice(array_merge($highEvents, $otherEvents), 0, 20); $timelineSummary = ''; foreach ($topEvents as $ev) { $time = $ev['time_of_day'] ? ' kl.' . $ev['time_of_day'] : ''; @@ -783,14 +826,17 @@ PROMPT; ? "\n== ADDITIONAL CONTEXT FROM ADVOCATE ==\n{$additionalNotes}\n" : ''; - $docExcerpt = mb_substr($docText, 0, 3000, 'UTF-8'); + $docExcerpt = mb_substr($docText, 0, 8000, 'UTF-8'); $prompt = <<", "procedural_red_flags": [ { "description": "Concise description of the potential procedural violation", - "legal_basis": "Statute or ECHR article potentially violated, e.g. Barnevernloven §6-1, ECHR Art.8", - "severity": "high", + "legal_basis": "Statute or ECHR article from a corpus source — e.g. Barnevernloven §4-2 [3]", + "severity": "high|medium|low", "source_refs": ["[n]", "[DOC]"], - "what_to_check": "Specific document text or action requiring legal verification" + "what_to_check": "Exact document text or action to verify with a lawyer" } ], - "client_strengths": ["3-6 items anchored with [n] or [DOC]"], - "opposing_weaknesses": ["2-5 vulnerabilities in BVV or opposing party position — omit if unsupported by sources"], - "what_we_found": "2-sentence plain-language summary of the most critical finding", - "what_remains_uncertain": ["3-5 specific gaps — missing information, unclear authority, conflicting sources"], - "next_practical_step": "The single most important concrete legal action for {$roleStr}" + "client_strengths": ["3-6 items, each ending with [n] or [DOC]"], + "opposing_weaknesses": ["2-5 documented vulnerabilities in BVV or opposing position — OMIT if not supported by at least one [n]"], + "what_we_found": "2-sentence plain-language summary of the single most critical finding", + "what_remains_uncertain": ["3-5 specific information gaps or legal questions that need clarification"], + "next_practical_step": "The single most important concrete legal action for {$roleStr} to take within the next 7 days" } Rules: -- Every factual claim in advocacy_brief must end with [n] or [DOC]. -- procedural_red_flags must be grounded in documented BVV actions — no speculation. -- severity: high = likely violation of a codified right; medium = procedural irregularity; low = best-practice gap. -- If no corpus source supports a claimed weakness, omit it from opposing_weaknesses. -- Cite statute sections and ECHR articles as they appear in the corpus excerpts. +- severity: high = likely violation of a codified statutory right or ECHR guarantee; medium = procedural irregularity; low = best-practice gap only. +- procedural_red_flags must be grounded in documented BVV actions visible in [DOC] or the timeline. +- If fewer than 2 corpus sources support opposing_weaknesses, return an empty array. - Respond in {$locale}. PROMPT; - $sysPrompt = 'You return valid JSON only. No markdown fences.'; + $sysPrompt = 'You return valid JSON only. No markdown fences. Every legal citation must come from the provided corpus sources, not from training memory.'; $messages = [ ['role' => 'system', 'content' => $sysPrompt], ['role' => 'user', 'content' => $prompt], ]; - $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3000, 'timeout' => 200]; + $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 4500, 'timeout' => 240]; $deployLabel = match ($engine) { 'gpu' => 'GPU (cuttlefish)', diff --git a/includes/DeepResearchAgent.php b/includes/DeepResearchAgent.php index e00896e..55572dc 100644 --- a/includes/DeepResearchAgent.php +++ b/includes/DeepResearchAgent.php @@ -91,7 +91,7 @@ final class DbnDeepResearchAgent // STEP 2: Query expansion $emitRunning('expansion', 'Query expansion', 'Generating sub-questions…'); $stepStart = microtime(true); - $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language, $advocateRole); + $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $interpretation['key_signals'], $controls['sub_q_count'], $language, $advocateRole); $this->stepTimings['expansion'] = $this->elapsedMs($stepStart); $subQuestions = $expansion['questions']; $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete'; @@ -323,7 +323,8 @@ final class DbnDeepResearchAgent $controls['temperature'], $advocateRole, $priorContext, - $branchNotes + $branchNotes, + $interpretation['key_signals'] ?? [] ); $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart); $emitStep( @@ -406,7 +407,7 @@ final class DbnDeepResearchAgent 'chunk_limit' => max(4, min(10, (int)($controls['chunk_limit'] ?? 6))), 'similarity_threshold' => max(0.2, min(0.6, (float)($controls['similarity_threshold'] ?? 0.30))), 'reranker_top_k' => max(8, min(14, (int)($controls['reranker_top_k'] ?? 12))), - 'temperature' => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.15))), + 'temperature' => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.10))), ]; } @@ -472,7 +473,7 @@ Input: In {$locale}, produce JSON with: { - "brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)", + "brief": "1-3 sentence description of what the user is trying to research (≤ 300 chars)", "key_signals": ["short keywords or terms that should drive retrieval"] } PROMPT; @@ -483,20 +484,21 @@ PROMPT; if ($language === 'no' || $advocateRole !== '') { $resp = dbnToolsCallGpuLlm([$sysMsg, $userMsg], [ 'model' => 'dbn-legal-agent', 'json' => true, - 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 40, + 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 40, ]); $raw = (string)($resp['choices'][0]['message']['content'] ?? ''); } else { $raw = $this->azure->chatText([$sysMsg, $userMsg], - ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]); + ['json' => true, 'temperature' => 0.1, 'max_tokens' => 500, 'timeout' => 30]); } $json = $this->azure->decodeJsonObject($raw); if (is_array($json) && !empty($json['brief'])) { - $signals = $json['key_signals'] ?? []; - $signalText = is_array($signals) ? implode(', ', array_slice($signals, 0, 6)) : ''; + $signals = is_array($json['key_signals'] ?? null) ? array_slice($json['key_signals'], 0, 8) : []; + $signalText = $signals ? implode(', ', $signals) : ''; return [ - 'brief' => (string)$json['brief'], - 'detail' => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''), + 'brief' => (string)$json['brief'], + 'key_signals' => $signals, + 'detail' => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''), ]; } } catch (Throwable $e) { @@ -504,14 +506,18 @@ PROMPT; } return [ - 'brief' => '', - 'detail' => 'Interpretation step skipped — proceeding with raw seed input.', + 'brief' => '', + 'key_signals' => [], + 'detail' => 'Interpretation step skipped — proceeding with raw seed input.', ]; } - private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language, string $advocateRole = ''): array + private function expandQueries(string $seedDescription, string $brief, array $keySignals, int $targetCount, string $language, string $advocateRole = ''): array { $locale = dbnToolsLanguageName($language); + $anchorsLine = !empty($keySignals) + ? "\nKey retrieval anchors (incorporate these terms into your sub-questions where relevant):\n" . implode(', ', $keySignals) . "\n" + : ''; if ($advocateRole !== '') { $prompt = << 'uploaded: ' . $entry['meta']['filename'], 'section' => null, 'package_or_corpus' => 'Your upload', - 'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 620), + 'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 950), 'chunk_text' => $entry['meta']['text'], 'similarity' => round($sim, 4), 'reranker_score' => null, @@ -709,7 +718,7 @@ PROMPT; 'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'), 'section' => $chunk['section_title'] ?? null, 'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'), - 'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620), + 'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 950), 'chunk_text' => (string)($chunk['content'] ?? ''), 'similarity' => $similarity, 'reranker_score' => $rerankerScore, @@ -940,7 +949,8 @@ PROMPT; float $temperature, string $advocateRole = '', ?array $priorContext = null, - string $branchNotes = '' + string $branchNotes = '', + array $keySignals = [] ): array { $locale = dbnToolsLanguageName($language); @@ -1014,41 +1024,49 @@ PROMPT; ? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.' : '250-450 words, 2-3 short paragraphs. Note when evidence is thin.'; + $keySignalsLine = !empty($keySignals) + ? "\nKey retrieval signals (statutory/factual terms that drove corpus search — ground your brief in these where sources permit):\n" . implode(', ', $keySignals) . "\n" + : ''; + if ($advocateRole !== '') { $prompt = << Høyesterett decision > ECHR Grand Chamber > ECHR regular chamber > lower courts > Bufdir guidance. +- Citation self-check: before writing each [n] marker, confirm that source [n] exists in the list and its excerpt actually supports the specific claim being made. - When multiple sources support the same point, cite all of them (e.g. `[2,4]`). -- `opposing_weaknesses` must be omitted or empty when no retrieved source actually supports the identified weakness. +- `opposing_weaknesses`: OMIT this field by default. Populate it only when ≥2 retrieved sources explicitly support the identified weakness. Do not speculate or infer weaknesses from thin evidence. +- `brief_markdown` must be {$lengthGuidance} Structure it as: (1) {$advocateRole}'s core legal position, (2) Strongest supporting arguments with [n] citations, (3) Procedural rights and obligations {$advocateRole} should assert, (4) Opposing weaknesses — only if `opposing_weaknesses` is non-empty. End with a one-line caveat that this is legal preparation support, not final legal advice. +- `client_strengths`: 3-6 items, each must include at least one [n] citation. +- `what_remains_uncertain`: 3-5 honest gaps where evidence is insufficient or law is unclear. - Respond in {$locale}. - Output valid JSON only — no markdown fences around the JSON object itself. + +Return JSON: +{ + "brief_markdown": "", + "client_strengths": [""], + "opposing_weaknesses": [""], + "what_we_found": "<2-sentence summary of the most relevant retrieved authority for {$advocateRole}>", + "what_remains_uncertain": [""], + "next_practical_step": "" +} PROMPT; } else { $prompt = << 'system', 'content' => 'You return valid JSON only. No markdown fences.'], + ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences. Every legal claim must be supported by a source from the numbered list. Do not invent statute sections, case names, paragraph numbers, or dates. If no source supports a point, omit it entirely.'], ['role' => 'user', 'content' => $prompt], ]; - $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3200, 'timeout' => 180]; + $synthTemp = ($advocateRole !== '') ? min($temperature, 0.20) : $temperature; + $opts = ['json' => true, 'temperature' => $synthTemp, 'max_tokens' => 4000, 'timeout' => 180]; try { if ($engine === 'dbn_legal') { diff --git a/transcribe.php b/transcribe.php index 0480b75..c4756ae 100644 --- a/transcribe.php +++ b/transcribe.php @@ -48,6 +48,43 @@ require_once __DIR__ . '/includes/layout.php';

Helps Whisper recognise technical terms. Not included in the transcript.

+
+ Advanced options + +
+ Task + + +
+ +
+ VAD filter + + Improves accuracy on recordings with long pauses. +
+ +
+ Whisper model + + Used when Azure/GCP unavailable. large-v3 is the default. +
+ +
+ AI cleanup + + + + Fixes errors, punctuation, and domain terms after transcription. +
+
+