Add Transcribe docs (about/guide/tech) + refresh Redact docs
- New: transcribe-about.php, transcribe-guide.php, transcribe-tech.php with full en/no/uk/pl translations (3-engine cascade, diarization, vocab) - New: translations/transcribe-about|guide|tech.php (4-lang strings) - New: scripts/translate-pages.php (Azure gpt-4o CLI translation helper) - Add korr-doc-links nav to transcribe.php - Refresh redact-about|guide|tech.php — point to assets/images/redact/ - Fix all "never written to disk" wording in redact translations - Add Min Sak/corpus save workflow to redact guide and tech privacy section - redact.php upload hint: correct in-memory wording Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
<?php
|
||||
/**
|
||||
* translate-pages.php
|
||||
*
|
||||
* CLI utility: translates the 'en' entries in a translations/*.php file
|
||||
* into Norwegian (no), Ukrainian (uk), and Polish (pl) using Azure OpenAI gpt-4o.
|
||||
*
|
||||
* Usage:
|
||||
* php scripts/translate-pages.php translations/transcribe-about.php
|
||||
*
|
||||
* The file is updated in-place — existing no/uk/pl keys are replaced.
|
||||
*
|
||||
* Config (env vars or constants below):
|
||||
* AZURE_OPENAI_KEY — Azure OpenAI API key
|
||||
* AZURE_OPENAI_ENDPOINT — e.g. https://exos-openai-test.openai.azure.com
|
||||
* AZURE_OPENAI_DEPLOY — deployment name, default gpt-4o
|
||||
* AZURE_OPENAI_VERSION — API version, default 2024-02-01
|
||||
*/
|
||||
|
||||
declare(strict_types=1);

// ── Config ──────────────────────────────────────────────────────────────────
// The API key is a secret: it is read from the environment only and must never
// be committed to the repository. The other settings are not secret and keep
// their built-in defaults when the corresponding env var is unset or empty.
// NOTE(review): any key that was ever committed in this file should be rotated.
$apiKey    = getenv('AZURE_OPENAI_KEY') ?: '';
$endpoint  = rtrim(getenv('AZURE_OPENAI_ENDPOINT') ?: 'https://exos-openai-test.openai.azure.com', '/');
$deploy    = getenv('AZURE_OPENAI_DEPLOY') ?: 'gpt-4o';
$apiVer    = getenv('AZURE_OPENAI_VERSION') ?: '2024-02-01';
$batchSize = 20; // keys per API call

// Target language code => description that is interpolated into the model prompt.
$targetLangs = [
    'no' => 'Norwegian Bokmål (nn is Nynorsk; this is bokmål). Legal and formal register. Preserve Norwegian legal terminology.',
    'uk' => 'Ukrainian. Formal register suitable for legal documents. Use standard Ukrainian orthography.',
    'pl' => 'Polish. Formal register suitable for legal documents.',
];

// ── Entry point ──────────────────────────────────────────────────────────────
// Refuse to run under a web SAPI; this check comes first because the STDERR
// constant used below is only defined by the CLI SAPI.
if (PHP_SAPI !== 'cli') {
    http_response_code(403);
    exit('CLI only');
}

if ($argc < 2) {
    fwrite(STDERR, "Usage: php scripts/translate-pages.php translations/<file>.php\n");
    exit(1);
}

// Fail before doing any work if the secret is missing, instead of sending
// unauthenticated requests.
if ($apiKey === '') {
    fwrite(STDERR, "AZURE_OPENAI_KEY is not set. Export it in the environment before running this script.\n");
    exit(1);
}

$filePath = realpath($argv[1]);
if (!$filePath || !is_file($filePath)) {
    fwrite(STDERR, "File not found: {$argv[1]}\n");
    exit(1);
}

echo "Loading: $filePath\n";
// The translations file is a PHP file that returns [langCode => [key => string]].
$allLangs = require $filePath;

if (!is_array($allLangs) || !isset($allLangs['en'])) {
    fwrite(STDERR, "File must return an array with an 'en' key.\n");
    exit(1);
}

// translateAll() is typed `array`; reject a scalar 'en' entry with a readable
// message rather than an uncaught TypeError.
if (!is_array($allLangs['en'])) {
    fwrite(STDERR, "The 'en' entry must be an array of strings.\n");
    exit(1);
}

$englishStrings = $allLangs['en'];
echo 'English keys: ' . count($englishStrings) . "\n";

// ── Translate each target language ──────────────────────────────────────────
// Existing entries for each target language are replaced wholesale.
foreach ($targetLangs as $langCode => $langDesc) {
    echo "\nTranslating → $langCode ($langDesc)...\n";
    $translated = translateAll($englishStrings, $langCode, $langDesc, $endpoint, $deploy, $apiVer, $apiKey, $batchSize);
    $allLangs[$langCode] = $translated;
    echo " Done. Keys translated: " . count($translated) . "\n";
}

// ── Write back ───────────────────────────────────────────────────────────────
$phpCode = "<?php\nreturn " . varExportShort($allLangs) . ";\n";

// Only report success when the write actually succeeded.
if (file_put_contents($filePath, $phpCode) === false) {
    fwrite(STDERR, "Failed to write: $filePath\n");
    exit(1);
}

echo "\nWritten: $filePath\n";
|
||||
|
||||
// ── Functions ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Translates every entry of $strings into one target language, in batches.
 *
 * $strings is split into chunks of $batchSize entries (keys preserved) and each
 * chunk is sent to callAzureOpenAI(). The returned map has exactly the keys of
 * $strings, in the same order:
 *  - keys the model returned but that were not requested are discarded;
 *  - keys the model omitted (or returned as null) keep the English source
 *    value, and a warning is written to STDERR.
 *
 * Results are assigned by key rather than combined with array_merge(), which
 * would renumber integer-like keys.
 *
 * @param array  $strings   English source strings, keyed by translation key.
 * @param string $langCode  Target language code (e.g. 'no'); forwarded to callAzureOpenAI().
 * @param string $langDesc  Description of the target language; forwarded to callAzureOpenAI().
 * @param string $endpoint  Azure OpenAI endpoint base URL; forwarded to callAzureOpenAI().
 * @param string $deploy    Deployment name; forwarded to callAzureOpenAI().
 * @param string $apiVer    API version; forwarded to callAzureOpenAI().
 * @param string $apiKey    API key; forwarded to callAzureOpenAI().
 * @param int    $batchSize Number of keys per API call; must be at least 1 (array_chunk() requirement).
 *
 * @return array Translated strings keyed like $strings.
 */
function translateAll(
    array $strings,
    string $langCode,
    string $langDesc,
    string $endpoint,
    string $deploy,
    string $apiVer,
    string $apiKey,
    int $batchSize
): array {
    // preserve_keys = true keeps the original translation keys inside each chunk.
    $batches = array_chunk($strings, $batchSize, true);
    $total   = count($batches);
    $result  = [];

    foreach ($batches as $i => $batchStrings) {
        $batchNum = $i + 1;
        echo " Batch $batchNum/$total (" . count($batchStrings) . " keys)...\n";

        $translated = callAzureOpenAI($batchStrings, $langCode, $langDesc, $endpoint, $deploy, $apiVer, $apiKey);

        // Walk the requested keys, not the returned ones, so the output always
        // mirrors the English key set and order.
        foreach ($batchStrings as $key => $english) {
            if (isset($translated[$key])) {
                $result[$key] = $translated[$key];
                continue;
            }

            fwrite(STDERR, " Warning: no '$langCode' translation returned for key '$key'; keeping English text.\n");
            $result[$key] = $english;
        }
    }

    return $result;
}
|
||||
|
||||
/**
 * Sends one batch of English strings to an Azure OpenAI chat-completions
 * deployment and returns the JSON object decoded from the model's reply.
 *
 * The batch is sent as a pretty-printed JSON object in the user message; the
 * system prompt instructs the model to answer with a JSON object using the
 * same keys. The returned keys are NOT validated here.
 *
 * On any failure (request cannot be encoded, cURL cannot be initialised,
 * transport error, non-200 status, reply that is not a JSON object/array) a
 * message is written to STDERR and the script exits with status 1.
 *
 * @param array  $strings  English strings for this batch, keyed by translation key.
 * @param string $langCode Target language code; not used by this function (the prompt uses $langDesc).
 * @param string $langDesc Description of the target language, interpolated into the system prompt.
 * @param string $endpoint Azure OpenAI endpoint base URL without a trailing slash.
 * @param string $deploy   Deployment name used in the request path.
 * @param string $apiVer   Value for the `api-version` query parameter.
 * @param string $apiKey   Value for the `api-key` request header.
 *
 * @return array Decoded JSON from the model's message content.
 */
function callAzureOpenAI(
    array $strings,
    string $langCode,
    string $langDesc,
    string $endpoint,
    string $deploy,
    string $apiVer,
    string $apiKey
): array {
    $systemPrompt = <<<PROMPT
    You are a professional legal translator specialising in Scandinavian and Eastern European languages.

    Translate the following PHP string values from English to {$langDesc}.

    Rules:
    - Preserve all HTML tags exactly (<strong>, <em>, <code>, <br> etc.)
    - Preserve all bracketed tokens exactly: [FATHER], [PERSON], [SOCIAL WORKER], [ORG] etc.
    - Preserve all arrow characters (→, ←, →) and typographic symbols
    - Preserve placeholder values like "Ola Nordmann", "00:01:24" unchanged
    - Keep URLs, file extensions (.docx, .php), and technical terms (PDF, DOCX, GPT-4o, Whisper, pyannote) unchanged
    - Translate UI labels, descriptions, and body text naturally into {$langDesc}
    - Return ONLY a valid JSON object with the same keys as input — no commentary, no markdown fences
    PROMPT;

    // json_encode() returns false on e.g. malformed UTF-8; catch that here
    // instead of posting an empty body.
    $userContent = json_encode($strings, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
    if ($userContent === false) {
        fwrite(STDERR, "Failed to encode strings as JSON: " . json_last_error_msg() . "\n");
        exit(1);
    }

    $payload = json_encode([
        'messages' => [
            ['role' => 'system', 'content' => $systemPrompt],
            ['role' => 'user', 'content' => $userContent],
        ],
        'temperature' => 0.2,
        'response_format' => ['type' => 'json_object'],
    ]);
    if ($payload === false) {
        fwrite(STDERR, "Failed to encode request payload as JSON: " . json_last_error_msg() . "\n");
        exit(1);
    }

    $url = "{$endpoint}/openai/deployments/{$deploy}/chat/completions?api-version={$apiVer}";

    $ch = curl_init($url);
    if ($ch === false) {
        fwrite(STDERR, "Failed to initialise cURL for $url\n");
        exit(1);
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/json',
            "api-key: {$apiKey}",
        ],
        CURLOPT_TIMEOUT => 120,
    ]);

    $response  = curl_exec($ch);
    $httpCode  = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch); // must be read before the handle is closed
    curl_close($ch);

    // Transport-level failure (DNS, TLS, timeout, ...): there is no HTTP body,
    // so report the cURL error text instead.
    if ($response === false) {
        fwrite(STDERR, "Request failed: $curlError\n");
        exit(1);
    }

    if ($httpCode !== 200) {
        fwrite(STDERR, "API error HTTP $httpCode: $response\n");
        exit(1);
    }

    $data    = json_decode($response, true);
    $content = $data['choices'][0]['message']['content'] ?? '';

    // Under strict_types json_decode() only accepts a string; treat any other
    // shape of `content` as an unparsable reply.
    $translated = is_string($content) ? json_decode($content, true) : null;

    if (!is_array($translated)) {
        $shown = is_string($content) ? $content : $response;
        fwrite(STDERR, "Failed to parse JSON from model response:\n$shown\n");
        exit(1);
    }

    return $translated;
}
|
||||
|
||||
/**
 * Renders a value as PHP source code, similar to var_export() but omitting the
 * keys of sequential (0..n-1) lists.
 *
 * Non-array values are delegated to var_export(). Arrays are rendered
 * recursively as `array ( ... )` with one entry per line, a trailing comma
 * after every entry, and one extra space of indentation per nesting level.
 * An empty array is rendered as `array (` + newline + `)`; joining zero
 * entries would otherwise leave a lone comma, which is a PHP syntax error.
 *
 * @param mixed $data   Value to render.
 * @param int   $indent Current nesting depth (number of indentation units).
 *
 * @return string PHP expression that evaluates to $data.
 */
function varExportShort(mixed $data, int $indent = 0): string
{
    if (!is_array($data)) {
        return var_export($data, true);
    }

    $pad = str_repeat(' ', $indent);

    // No entries: emit an empty literal rather than "array (\n,\n)".
    if ($data === []) {
        return "array (\n{$pad})";
    }

    $pad1 = str_repeat(' ', $indent + 1);

    // A list has exactly the keys 0..n-1 in order; its keys are left implicit.
    $isIndexed = array_keys($data) === range(0, count($data) - 1);
    $lines = [];

    foreach ($data as $key => $value) {
        $keyStr = $isIndexed ? '' : var_export($key, true) . ' => ';
        $lines[] = $pad1 . $keyStr . varExportShort($value, $indent + 1);
    }

    return "array (\n" . implode(",\n", $lines) . ",\n{$pad})";
}
|
||||
Reference in New Issue
Block a user