Files
daveadmin b84827ecea Add Transcribe docs (about/guide/tech) + refresh Redact docs
- New: transcribe-about.php, transcribe-guide.php, transcribe-tech.php
  with full en/no/uk/pl translations (3-engine cascade, diarization, vocab)
- New: translations/transcribe-about|guide|tech.php (4-lang strings)
- New: scripts/translate-pages.php (Azure gpt-4o CLI translation helper)
- Add korr-doc-links nav to transcribe.php
- Refresh redact-about|guide|tech.php — point to assets/images/redact/
- Fix all "never written to disk" wording in redact translations
- Add Min Sak/corpus save workflow to redact guide and tech privacy section
- redact.php upload hint: correct in-memory wording

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 10:28:06 +02:00

194 lines
7.1 KiB
PHP

<?php
/**
* translate-pages.php
*
* CLI utility: translates the 'en' entries in a translations/*.php file
* into Norwegian (no), Ukrainian (uk), and Polish (pl) using Azure OpenAI gpt-4o.
*
* Usage:
* php scripts/translate-pages.php translations/transcribe-about.php
*
* The file is updated in-place — existing no/uk/pl keys are replaced.
*
* Config (env vars or constants below):
* AZURE_OPENAI_KEY — Azure OpenAI API key
* AZURE_OPENAI_ENDPOINT — e.g. https://exos-openai-test.openai.azure.com
* AZURE_OPENAI_DEPLOY — deployment name, default gpt-4o
* AZURE_OPENAI_VERSION — API version, default 2024-02-01
*/
declare(strict_types=1);

// ── Config ──────────────────────────────────────────────────────────────────
// The API key is taken from the environment only. A credential embedded in a
// tracked source file is readable by everyone with repository access.
// NOTE(review): the key that was previously committed here as a fallback
// should be treated as compromised and rotated.
$apiKey    = getenv('AZURE_OPENAI_KEY') ?: '';
$endpoint  = rtrim(getenv('AZURE_OPENAI_ENDPOINT') ?: 'https://exos-openai-test.openai.azure.com', '/');
$deploy    = getenv('AZURE_OPENAI_DEPLOY') ?: 'gpt-4o';
$apiVer    = getenv('AZURE_OPENAI_VERSION') ?: '2024-02-01';
$batchSize = 20; // keys per API call

// Language code => description that is interpolated into the model prompt.
$targetLangs = [
    'no' => 'Norwegian Bokmål (nn is Nynorsk; this is bokmål). Legal and formal register. Preserve Norwegian legal terminology.',
    'uk' => 'Ukrainian. Formal register suitable for legal documents. Use standard Ukrainian orthography.',
    'pl' => 'Polish. Formal register suitable for legal documents.',
];

// ── Entry point ──────────────────────────────────────────────────────────────
// Refuse to run under a web SAPI: the script rewrites files on disk.
if (PHP_SAPI !== 'cli') {
    http_response_code(403);
    exit('CLI only');
}

if ($argc < 2) {
    fwrite(STDERR, "Usage: php scripts/translate-pages.php translations/<file>.php\n");
    exit(1);
}

if ($apiKey === '') {
    fwrite(STDERR, "AZURE_OPENAI_KEY is not set. Export it before running this script.\n");
    exit(1);
}

$filePath = realpath($argv[1]);
if (!$filePath || !is_file($filePath)) {
    fwrite(STDERR, "File not found: {$argv[1]}\n");
    exit(1);
}

echo "Loading: $filePath\n";
$allLangs = require $filePath;

// 'en' must itself be an array: translateAll() takes an array parameter and
// would raise a TypeError under strict_types otherwise.
if (!is_array($allLangs) || !isset($allLangs['en']) || !is_array($allLangs['en'])) {
    fwrite(STDERR, "File must return an array with an 'en' key.\n");
    exit(1);
}

$englishStrings = $allLangs['en'];
echo 'English keys: ' . count($englishStrings) . "\n";

// ── Translate each target language ──────────────────────────────────────────
// Existing entries for a target language are replaced wholesale.
foreach ($targetLangs as $langCode => $langDesc) {
    echo "\nTranslating → $langCode ($langDesc)...\n";
    $translated = translateAll($englishStrings, $langCode, $langDesc, $endpoint, $deploy, $apiVer, $apiKey, $batchSize);
    $allLangs[$langCode] = $translated;
    echo " Done. Keys translated: " . count($translated) . "\n";
}

// ── Write back ───────────────────────────────────────────────────────────────
$phpCode = "<?php\nreturn " . varExportShort($allLangs) . ";\n";
if (file_put_contents($filePath, $phpCode) === false) {
    // Do not report success when nothing was written.
    fwrite(STDERR, "Failed to write: $filePath\n");
    exit(1);
}
echo "\nWritten: $filePath\n";
// ── Functions ────────────────────────────────────────────────────────────────
/**
 * Translates every entry of $strings into one target language, in batches.
 *
 * The result always has exactly the keys of $strings, in the same order:
 * - keys the model did not return (or returned as a non-string for a string
 *   source) keep the English text and a warning is written to STDERR;
 * - keys the model returned that were not requested are discarded.
 *
 * @param array  $strings   key => English source text
 * @param string $langCode  target language code, used in log output
 * @param string $langDesc  target language description passed to the model
 * @param string $endpoint  Azure OpenAI endpoint base URL
 * @param string $deploy    deployment name
 * @param string $apiVer    API version string
 * @param string $apiKey    API key
 * @param int    $batchSize maximum number of keys per API call (values < 1 are treated as 1)
 *
 * @return array key => translated text
 */
function translateAll(
    array $strings,
    string $langCode,
    string $langDesc,
    string $endpoint,
    string $deploy,
    string $apiVer,
    string $apiKey,
    int $batchSize
): array {
    // preserve_keys keeps each chunk as a key => text map ready to send.
    $batches = array_chunk($strings, max(1, $batchSize), true);
    $total   = count($batches);
    $result  = [];

    foreach ($batches as $i => $batchStrings) {
        $batchNum = $i + 1;
        echo " Batch $batchNum/$total (" . count($batchStrings) . " keys)...\n";

        $translated = callAzureOpenAI($batchStrings, $langCode, $langDesc, $endpoint, $deploy, $apiVer, $apiKey);

        // Assign per requested key instead of array_merge(): array_merge
        // renumbers integer-like keys, and the model's reply is not trusted
        // to contain exactly the keys that were sent.
        foreach ($batchStrings as $key => $source) {
            $candidate = $translated[$key] ?? null;
            if ($candidate === null || (is_string($source) && !is_string($candidate))) {
                fwrite(STDERR, " Warning: no usable '$langCode' translation for key '$key'; keeping English text.\n");
                $candidate = $source;
            }
            $result[$key] = $candidate;
        }
    }

    return $result;
}
/**
 * Sends one batch of strings to an Azure OpenAI chat-completions deployment
 * and returns the JSON object the model replied with, decoded to an array.
 *
 * An empty batch returns [] without contacting the API. On any encoding,
 * transport, HTTP, or parse failure a message is written to STDERR and the
 * process exits with status 1.
 *
 * @param array  $strings  key => English source text
 * @param string $langCode target language code (not used in the request itself)
 * @param string $langDesc target language description interpolated into the prompt
 * @param string $endpoint endpoint base URL without trailing slash
 * @param string $deploy   deployment name
 * @param string $apiVer   API version string
 * @param string $apiKey   API key sent in the "api-key" header
 *
 * @return array decoded model reply; its keys are not validated against $strings here
 */
function callAzureOpenAI(
    array $strings,
    string $langCode,
    string $langDesc,
    string $endpoint,
    string $deploy,
    string $apiVer,
    string $apiKey
): array {
    // Nothing to translate: skip the round trip.
    if ($strings === []) {
        return [];
    }

    $systemPrompt = <<<PROMPT
    You are a professional legal translator specialising in Scandinavian and Eastern European languages.
    Translate the following PHP string values from English to {$langDesc}.
    Rules:
    - Preserve all HTML tags exactly (<strong>, <em>, <code>, <br> etc.)
    - Preserve all bracketed tokens exactly: [FATHER], [PERSON], [SOCIAL WORKER], [ORG] etc.
    - Preserve all arrow characters (→, ←, →) and typographic symbols
    - Preserve placeholder values like "Ola Nordmann", "00:01:24" unchanged
    - Keep URLs, file extensions (.docx, .php), and technical terms (PDF, DOCX, GPT-4o, Whisper, pyannote) unchanged
    - Translate UI labels, descriptions, and body text naturally into {$langDesc}
    - Return ONLY a valid JSON object with the same keys as input — no commentary, no markdown fences
    PROMPT;

    // json_encode() returns false on e.g. malformed UTF-8; fail loudly rather
    // than sending "false" to the API.
    $userContent = json_encode($strings, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
    if ($userContent === false) {
        fwrite(STDERR, 'Failed to encode source strings as JSON: ' . json_last_error_msg() . "\n");
        exit(1);
    }

    $payload = json_encode([
        'messages' => [
            ['role' => 'system', 'content' => $systemPrompt],
            ['role' => 'user', 'content' => $userContent],
        ],
        'temperature' => 0.2,
        'response_format' => ['type' => 'json_object'],
    ]);
    if ($payload === false) {
        fwrite(STDERR, 'Failed to encode request payload as JSON: ' . json_last_error_msg() . "\n");
        exit(1);
    }

    // Encode the values that come from configuration before placing them in the URL.
    $url = "{$endpoint}/openai/deployments/" . rawurlencode($deploy)
        . '/chat/completions?api-version=' . rawurlencode($apiVer);

    $ch = curl_init($url);
    if ($ch === false) {
        fwrite(STDERR, "Failed to initialise cURL for $url\n");
        exit(1);
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/json',
            "api-key: {$apiKey}",
        ],
        CURLOPT_TIMEOUT => 120,
    ]);

    $response  = curl_exec($ch);
    $httpCode  = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch); // must be read before the handle is closed
    curl_close($ch);

    // Transport failure (DNS, TLS, timeout): there is no HTTP status to show,
    // so report the cURL error text instead.
    if ($response === false) {
        fwrite(STDERR, "API request failed: $curlError\n");
        exit(1);
    }
    if ($httpCode !== 200) {
        fwrite(STDERR, "API error HTTP $httpCode: $response\n");
        exit(1);
    }

    $data    = json_decode($response, true);
    $content = $data['choices'][0]['message']['content'] ?? '';
    // json_decode() requires a string under strict_types.
    $translated = is_string($content) ? json_decode($content, true) : null;
    if (!is_array($translated)) {
        $shown = is_string($content) ? $content : var_export($content, true);
        fwrite(STDERR, "Failed to parse JSON from model response:\n$shown\n");
        exit(1);
    }

    return $translated;
}
/**
 * Renders a value as PHP source code, like var_export() but with list
 * arrays written without their numeric keys and one indent character per
 * nesting level.
 *
 * The output is a valid PHP expression for scalars, null, and arrays of
 * those, including empty arrays.
 *
 * @param mixed $data   value to render
 * @param int   $indent current nesting depth (callers normally omit this)
 *
 * @return string PHP source for $data
 */
function varExportShort(mixed $data, int $indent = 0): string
{
    if (!is_array($data)) {
        return var_export($data, true);
    }

    // An empty array must be handled separately: joining zero lines and then
    // appending the trailing comma would emit "array (\n,\n)", which is a
    // PHP syntax error and would corrupt the generated file.
    if ($data === []) {
        return 'array ()';
    }

    $pad  = str_repeat(' ', $indent);
    $pad1 = str_repeat(' ', $indent + 1);

    // Lists (keys 0..n-1 in order) are written without explicit keys.
    $isIndexed = array_keys($data) === range(0, count($data) - 1);

    $lines = [];
    foreach ($data as $key => $value) {
        $keyStr  = $isIndexed ? '' : var_export($key, true) . ' => ';
        $lines[] = $pad1 . $keyStr . varExportShort($value, $indent + 1);
    }

    return "array (\n" . implode(",\n", $lines) . ",\n{$pad})";
}