b84827ecea
- New: transcribe-about.php, transcribe-guide.php, transcribe-tech.php with full en/no/uk/pl translations (3-engine cascade, diarization, vocab) - New: translations/transcribe-about|guide|tech.php (4-lang strings) - New: scripts/translate-pages.php (Azure gpt-4o CLI translation helper) - Add korr-doc-links nav to transcribe.php - Refresh redact-about|guide|tech.php — point to assets/images/redact/ - Fix all "never written to disk" wording in redact translations - Add Min Sak/corpus save workflow to redact guide and tech privacy section - redact.php upload hint: correct in-memory wording Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
194 lines
7.1 KiB
PHP
194 lines
7.1 KiB
PHP
<?php
|
|
/**
|
|
* translate-pages.php
|
|
*
|
|
* CLI utility: translates the 'en' entries in a translations/*.php file
|
|
* into Norwegian (no), Ukrainian (uk), and Polish (pl) using Azure OpenAI gpt-4o.
|
|
*
|
|
* Usage:
|
|
* php scripts/translate-pages.php translations/transcribe-about.php
|
|
*
|
|
* The file is updated in-place — existing no/uk/pl keys are replaced.
|
|
*
|
|
* Config (env vars or constants below):
|
|
* AZURE_OPENAI_KEY — Azure OpenAI API key
|
|
* AZURE_OPENAI_ENDPOINT — e.g. https://exos-openai-test.openai.azure.com
|
|
* AZURE_OPENAI_DEPLOY — deployment name, default gpt-4o
|
|
* AZURE_OPENAI_VERSION — API version, default 2024-02-01
|
|
*/
|
|
|
|
declare(strict_types=1);
|
|
|
|
// ── Config ──────────────────────────────────────────────────────────────────
// The API key is a secret: it is read from the environment only and has no
// in-source default, so no credential ever lives in version control.
$apiKey = getenv('AZURE_OPENAI_KEY') ?: '';
$endpoint = rtrim(getenv('AZURE_OPENAI_ENDPOINT') ?: 'https://exos-openai-test.openai.azure.com', '/');
$deploy = getenv('AZURE_OPENAI_DEPLOY') ?: 'gpt-4o';
$apiVer = getenv('AZURE_OPENAI_VERSION') ?: '2024-02-01';
$batchSize = 20; // keys per API call

// Target language code => description that is interpolated into the system prompt.
$targetLangs = [
    'no' => 'Norwegian Bokmål (nn is Nynorsk; this is bokmål). Legal and formal register. Preserve Norwegian legal terminology.',
    'uk' => 'Ukrainian. Formal register suitable for legal documents. Use standard Ukrainian orthography.',
    'pl' => 'Polish. Formal register suitable for legal documents.',
];

// ── Entry point ──────────────────────────────────────────────────────────────
// Refuse to run under a web SAPI: the script rewrites a file on disk and calls a paid API.
if (PHP_SAPI !== 'cli') {
    http_response_code(403);
    exit('CLI only');
}

if ($argc < 2) {
    fwrite(STDERR, "Usage: php scripts/translate-pages.php translations/<file>.php\n");
    exit(1);
}

if ($apiKey === '') {
    fwrite(STDERR, "AZURE_OPENAI_KEY is not set. Export it in the environment before running.\n");
    exit(1);
}

$filePath = realpath($argv[1]);
if ($filePath === false || !is_file($filePath)) {
    fwrite(STDERR, "File not found: {$argv[1]}\n");
    exit(1);
}

echo "Loading: $filePath\n";
$allLangs = require $filePath;

// The 'en' entry must itself be an array; count() below would otherwise throw a TypeError.
if (!is_array($allLangs) || !isset($allLangs['en']) || !is_array($allLangs['en'])) {
    fwrite(STDERR, "File must return an array whose 'en' entry is an array of strings.\n");
    exit(1);
}

$englishStrings = $allLangs['en'];
echo 'English keys: ' . count($englishStrings) . "\n";

// ── Translate each target language ──────────────────────────────────────────
// Any existing no/uk/pl entries are replaced wholesale by the fresh translation.
foreach ($targetLangs as $langCode => $langDesc) {
    echo "\nTranslating → $langCode ($langDesc)...\n";
    $translated = translateAll($englishStrings, $langCode, $langDesc, $endpoint, $deploy, $apiVer, $apiKey, $batchSize);
    $allLangs[$langCode] = $translated;
    echo " Done. Keys translated: " . count($translated) . "\n";
}

// ── Write back ───────────────────────────────────────────────────────────────
// Only reached when every language succeeded (the API helper exits on failure),
// so the source file is never overwritten with a partial result.
$phpCode = "<?php\nreturn " . varExportShort($allLangs) . ";\n";
if (file_put_contents($filePath, $phpCode) === false) {
    fwrite(STDERR, "Failed to write: $filePath\n");
    exit(1);
}
echo "\nWritten: $filePath\n";
|
|
|
|
// ── Functions ────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Translates all English strings into one target language, batch by batch.
 *
 * The input is split into chunks of $batchSize entries and each chunk is sent
 * to callAzureOpenAI(). The result is assembled from the *requested* keys, in
 * their original order: keys the model invented are discarded, and a key the
 * model omitted keeps its English text (with a warning on STDERR) so the
 * written translation file never ends up with holes.
 *
 * @param array  $strings   English strings keyed by translation key.
 * @param string $langCode  Target language code (e.g. "no"); used in progress/warning output.
 * @param string $langDesc  Target language description forwarded to the prompt.
 * @param string $endpoint  Azure OpenAI endpoint base URL.
 * @param string $deploy    Deployment name.
 * @param string $apiVer    API version string.
 * @param string $apiKey    API key.
 * @param int    $batchSize Entries per API call; array_chunk() throws a ValueError if it is < 1.
 *
 * @return array Translations with exactly the keys of $strings, in the same order.
 */
function translateAll(
    array $strings,
    string $langCode,
    string $langDesc,
    string $endpoint,
    string $deploy,
    string $apiVer,
    string $apiKey,
    int $batchSize
): array {
    // preserve_keys=true keeps each translation key attached to its string.
    $batches = array_chunk($strings, $batchSize, true);
    $total = count($batches);
    $result = [];

    foreach ($batches as $i => $batchStrings) {
        $batchNum = $i + 1;
        echo " Batch $batchNum/$total (" . count($batchStrings) . " keys)...\n";

        $translated = callAzureOpenAI($batchStrings, $langCode, $langDesc, $endpoint, $deploy, $apiVer, $apiKey);

        // Assign key by key instead of array_merge(): array_merge renumbers
        // integer-like keys and would let unexpected model keys leak through.
        foreach ($batchStrings as $key => $english) {
            if (array_key_exists($key, $translated)) {
                $result[$key] = $translated[$key];
                continue;
            }

            fwrite(STDERR, " Warning: no $langCode translation returned for '$key'; keeping English text.\n");
            $result[$key] = $english;
        }
    }

    return $result;
}
|
|
|
|
/**
 * Sends one batch of strings to an Azure OpenAI chat-completions deployment
 * and returns the decoded JSON object the model produced.
 *
 * The strings are sent as a pretty-printed JSON object in the user message;
 * the system prompt instructs the model to return a JSON object with the same
 * keys. Every failure (encoding, transport, non-200 status, unparseable model
 * output) is reported on STDERR and terminates the script with exit code 1.
 *
 * @param array  $strings  Strings to translate, keyed by translation key.
 * @param string $langCode Target language code. Not used by the request itself;
 *                         kept so the signature stays stable for callers.
 * @param string $langDesc Target language description interpolated into the prompt.
 * @param string $endpoint Endpoint base URL without a trailing slash.
 * @param string $deploy   Deployment name.
 * @param string $apiVer   API version query parameter.
 * @param string $apiKey   Value for the "api-key" request header.
 *
 * @return array Decoded model output; an empty array when $strings is empty.
 */
function callAzureOpenAI(
    array $strings,
    string $langCode,
    string $langDesc,
    string $endpoint,
    string $deploy,
    string $apiVer,
    string $apiKey
): array {
    // Nothing to translate: skip the round-trip. An empty PHP array would also
    // be encoded as the JSON list "[]" rather than an object.
    if ($strings === []) {
        return [];
    }

    $systemPrompt = <<<PROMPT
        You are a professional legal translator specialising in Scandinavian and Eastern European languages.

        Translate the following PHP string values from English to {$langDesc}.

        Rules:
        - Preserve all HTML tags exactly (<strong>, <em>, <code>, <br> etc.)
        - Preserve all bracketed tokens exactly: [FATHER], [PERSON], [SOCIAL WORKER], [ORG] etc.
        - Preserve all arrow characters (→, ←, →) and typographic symbols
        - Preserve placeholder values like "Ola Nordmann", "00:01:24" unchanged
        - Keep URLs, file extensions (.docx, .php), and technical terms (PDF, DOCX, GPT-4o, Whisper, pyannote) unchanged
        - Translate UI labels, descriptions, and body text naturally into {$langDesc}
        - Return ONLY a valid JSON object with the same keys as input — no commentary, no markdown fences
        PROMPT;

    $userContent = json_encode($strings, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
    $payload = $userContent === false ? false : json_encode([
        'messages' => [
            ['role' => 'system', 'content' => $systemPrompt],
            ['role' => 'user', 'content' => $userContent],
        ],
        'temperature' => 0.2,
        'response_format' => ['type' => 'json_object'],
    ]);

    // json_encode() returns false on e.g. invalid UTF-8; never post that.
    if ($payload === false) {
        fwrite(STDERR, 'Failed to encode request as JSON: ' . json_last_error_msg() . "\n");
        exit(1);
    }

    $url = "{$endpoint}/openai/deployments/{$deploy}/chat/completions?api-version={$apiVer}";

    $ch = curl_init($url);
    if ($ch === false) {
        fwrite(STDERR, "Failed to initialise cURL for $url\n");
        exit(1);
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/json',
            "api-key: {$apiKey}",
        ],
        CURLOPT_TIMEOUT => 120,
    ]);

    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch);
    // No curl_close(): it is a no-op on PHP 8.0+, which this file already requires.

    // Transport failure: there is no body, so report cURL's own diagnostic.
    if ($response === false) {
        fwrite(STDERR, "API request failed: $curlError\n");
        exit(1);
    }

    if ($httpCode !== 200) {
        fwrite(STDERR, "API error HTTP $httpCode: $response\n");
        exit(1);
    }

    $data = json_decode($response, true);
    $content = $data['choices'][0]['message']['content'] ?? null;
    // The message content is itself a JSON document; guard against a missing
    // or non-string content before decoding it.
    $translated = is_string($content) ? json_decode($content, true) : null;

    if (!is_array($translated)) {
        $shown = is_string($content) ? $content : $response;
        fwrite(STDERR, "Failed to parse JSON from model response:\n$shown\n");
        exit(1);
    }

    return $translated;
}
|
|
|
|
/**
 * Renders a value as PHP source, like var_export() but with list arrays
 * written without their numeric keys and a one-space-per-level indent.
 *
 * Non-array values are delegated to var_export(). An empty array is rendered
 * as "array ()"; without that special case the generic path would produce
 * "array (\n,\n)", which is a parse error in the generated file.
 *
 * @param mixed $data   Value to render.
 * @param int   $indent Current nesting depth (number of indent units).
 *
 * @return string PHP source code that evaluates to $data.
 */
function varExportShort(mixed $data, int $indent = 0): string
{
    if (!is_array($data)) {
        return var_export($data, true);
    }

    if ($data === []) {
        return 'array ()';
    }

    $pad = str_repeat(' ', $indent);
    $pad1 = str_repeat(' ', $indent + 1);

    // A list (keys 0..n-1 in order) is emitted without explicit keys.
    $isIndexed = array_keys($data) === range(0, count($data) - 1);
    $lines = [];

    foreach ($data as $key => $value) {
        $keyStr = $isIndexed ? '' : var_export($key, true) . ' => ';
        $lines[] = $pad1 . $keyStr . varExportShort($value, $indent + 1);
    }

    return "array (\n" . implode(",\n", $lines) . ",\n{$pad})";
}
|