endpoint = rtrim($endpoint ?? ($cfg['endpoint'] ?? ''), '/'); $this->key = $key ?? ($cfg['key'] ?? ''); if ($this->endpoint === '' || $this->key === '') { throw new RuntimeException('AzureDocIntelligence: endpoint or key not configured.'); } }

    /**
     * Load the service endpoint and key.
     *
     * Reads /etc/bnl/azure.php when that file is readable (the file is
     * expected to return an array with DOC_INTELLIGENCE_ENDPOINT and
     * DOC_INTELLIGENCE_KEY). Otherwise falls back to the
     * AZURE_DOC_INTELLIGENCE_ENDPOINT / AZURE_DOC_INTELLIGENCE_KEY
     * environment variables. Missing values come back as empty strings.
     *
     * @return array{endpoint: string, key: string}
     */
    private static function loadConfig(): array
    {
        $path = '/etc/bnl/azure.php';

        if (is_readable($path)) {
            $cfg = require $path;
            // A config file that does not return an array is treated as
            // "no values set" instead of being indexed blindly.
            if (!is_array($cfg)) {
                $cfg = [];
            }

            return [
                'endpoint' => (string)($cfg['DOC_INTELLIGENCE_ENDPOINT'] ?? ''),
                'key' => (string)($cfg['DOC_INTELLIGENCE_KEY'] ?? ''),
            ];
        }

        // getenv() returns false for unset variables, hence ?: rather than ??.
        return [
            'endpoint' => (string)(getenv('AZURE_DOC_INTELLIGENCE_ENDPOINT') ?: ''),
            'key' => (string)(getenv('AZURE_DOC_INTELLIGENCE_KEY') ?: ''),
        ];
    }

    /**
     * OCR a local PDF file using the prebuilt-read model.
     *
     * Posts the file to the analyze endpoint, then polls the URL from the
     * Operation-Location response header until the operation reports
     * 'succeeded' or 'failed', or until the poll timeout elapses.
     *
     * @param string $localPath          Path to a readable local PDF file.
     * @param int    $pollTimeoutSeconds Maximum number of seconds to keep polling.
     *
     * @return array{content: string, pages: array, languages: array, page_count: int}
     *
     * @throws InvalidArgumentException If $localPath is not readable.
     * @throws RuntimeException If the file cannot be read, the submit or a poll
     *                          request fails, the service reports failure, or
     *                          polling times out.
     */
    public function readPdf(string $localPath, int $pollTimeoutSeconds = 120): array
    {
        if (!is_readable($localPath)) {
            throw new InvalidArgumentException("Unreadable file: {$localPath}");
        }

        $body = file_get_contents($localPath);
        if ($body === false) {
            // is_readable() can pass while the actual read still fails.
            throw new RuntimeException("DocIntelligence: could not read file: {$localPath}");
        }

        $url = $this->endpoint . '/documentintelligence/documentModels/prebuilt-read:analyze?api-version=2024-11-30';

        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => [
                'Content-Type: application/pdf',
                'Ocp-Apim-Subscription-Key: ' . $this->key,
            ],
            CURLOPT_RETURNTRANSFER => true,
            // Headers are needed to read Operation-Location from the response.
            CURLOPT_HEADER => true,
            CURLOPT_TIMEOUT => 60,
        ]);
        $response = curl_exec($ch);
        $curlError = curl_error($ch);
        $headerSize = (int)curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $status = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        curl_close($ch);

        if ($status !== 202 || !is_string($response)) {
            // Surface the transport error; otherwise a network failure
            // shows up only as an uninformative "HTTP 0".
            $detail = $curlError !== '' ? " ({$curlError})" : '';
            throw new RuntimeException("DocIntelligence analyze failed: HTTP {$status}{$detail}");
        }

        $headers = substr($response, 0, $headerSize);
        if (!preg_match('/Operation-Location:\s*(.+?)\r?\n/i', $headers, $m)) {
            throw new RuntimeException('DocIntelligence: missing Operation-Location header.');
        }
        $pollUrl = trim($m[1]);

        $deadline = time() + $pollTimeoutSeconds;
        while (time() < $deadline) {
            // Wait 1.5 s before every poll, including the first one.
            usleep(1_500_000);

            $pollCh = curl_init();
            curl_setopt_array($pollCh, [
                CURLOPT_URL => $pollUrl,
                CURLOPT_HTTPHEADER => ['Ocp-Apim-Subscription-Key: ' . $this->key],
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT => 30,
            ]);
            $pollResp = curl_exec($pollCh);
            $pollError = curl_error($pollCh);
            $pollStatus = (int)curl_getinfo($pollCh, CURLINFO_RESPONSE_CODE);
            curl_close($pollCh);

            if ($pollStatus !== 200 || !is_string($pollResp)) {
                $detail = $pollError !== '' ? " ({$pollError})" : '';
                throw new RuntimeException("DocIntelligence poll failed: HTTP {$pollStatus}{$detail}");
            }

            $data = json_decode($pollResp, true);
            if (!is_array($data)) {
                // A 200 response that is not a JSON object can never reach a
                // terminal state; fail now instead of polling until timeout.
                throw new RuntimeException('DocIntelligence poll failed: response is not valid JSON.');
            }

            $state = $data['status'] ?? '';
            if (!is_string($state)) {
                $state = '';
            }

            if ($state === 'succeeded') {
                $result = $data['analyzeResult'] ?? [];
                if (!is_array($result)) {
                    $result = [];
                }
                // Normalise to arrays so count() cannot raise a TypeError
                // on an unexpected payload shape.
                $pages = is_array($result['pages'] ?? null) ? $result['pages'] : [];
                $languages = is_array($result['languages'] ?? null) ? $result['languages'] : [];
                $content = $result['content'] ?? '';

                return [
                    'content' => is_scalar($content) ? (string)$content : '',
                    'pages' => $pages,
                    'languages' => $languages,
                    'page_count' => count($pages),
                ];
            }

            if ($state === 'failed') {
                $err = $data['error']['message'] ?? 'unknown';
                if (!is_string($err)) {
                    $err = 'unknown';
                }
                throw new RuntimeException("DocIntelligence analysis failed: {$err}");
            }

            // 'running' or 'notStarted' — continue polling
        }

        throw new RuntimeException("DocIntelligence poll timeout after {$pollTimeoutSeconds}s.");
    }
}