'The file exceeds the allowed size limit.', UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.', UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.', default => 'File upload failed.', }; dbnToolsError($msg, 422, 'upload_error'); } $originalName = basename((string)($file['name'] ?? '')); $tmpPath = (string)($file['tmp_name'] ?? ''); $size = (int)($file['size'] ?? 0); if (!is_uploaded_file($tmpPath)) { dbnToolsError('Invalid file upload.', 400, 'invalid_upload'); } if ($size === 0) { dbnToolsError('The uploaded file is empty.', 422, 'file_empty'); } if ($size > EXTRACT_MAX_BYTES) { dbnToolsError('File exceeds the 4 MB limit.', 413, 'file_too_large'); } $ext = strtolower(pathinfo($originalName, PATHINFO_EXTENSION)); if (!in_array($ext, EXTRACT_ALLOWED_EXTS, true)) { dbnToolsError('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type'); } $text = match ($ext) { 'txt' => extractTxt($tmpPath), 'pdf' => extractPdf($tmpPath), 'docx' => extractDocx($tmpPath), }; $text = trim($text); if ($text === '') { dbnToolsError('No text could be extracted from this file.', 422, 'no_text'); } $truncated = false; if (mb_strlen($text, 'UTF-8') > EXTRACT_TEXT_LIMIT) { $text = mb_substr($text, 0, EXTRACT_TEXT_LIMIT, 'UTF-8'); $truncated = true; } dbnToolsRespond([ 'ok' => true, 'text' => $text, 'filename' => $originalName, 'chars' => mb_strlen($text, 'UTF-8'), 'truncated' => $truncated, ]); } catch (DbnToolsHttpException $e) { dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra); } catch (Throwable $e) { error_log('DBN extract error: ' . 
$e->getMessage());
    dbnToolsError('Text extraction failed.', 500, 'extract_error');
}

/**
 * Reads a plain-text upload and returns its content as UTF-8.
 *
 * The source encoding is auto-detected by mbstring among UTF-8, ISO-8859-1
 * and Windows-1252. A leading UTF-8 byte-order mark is removed so that it
 * neither shows up in the extracted text nor counts as a character.
 *
 * @param string $path Path of the file to read.
 *
 * @return string The file content converted to UTF-8.
 *
 * @throws DbnToolsHttpException 500/read_error when the file cannot be read.
 */
function extractTxt(string $path): string
{
    $content = file_get_contents($path);
    if ($content === false) {
        throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error');
    }

    // trim() does not strip U+FEFF, so drop the BOM explicitly.
    if (str_starts_with($content, "\xEF\xBB\xBF")) {
        $content = substr($content, 3);
    }

    return mb_convert_encoding($content, 'UTF-8', 'UTF-8, ISO-8859-1, Windows-1252');
}

/**
 * Extracts the text layer of a PDF by running the `pdftotext` command-line tool.
 *
 * The output encoding is requested explicitly because the default differs
 * between pdftotext builds, while the extracted text is handled as UTF-8.
 *
 * @param string $path Path of the PDF file; shell-escaped before use.
 *
 * @return string Raw text written by pdftotext to stdout.
 *
 * @throws DbnToolsHttpException 422/pdf_extract_failed when the command yields
 *                               no output or only whitespace.
 */
function extractPdf(string $path): string
{
    $command = 'pdftotext -enc UTF-8 ' . escapeshellarg($path) . ' - 2>/dev/null';
    $output = shell_exec($command);

    // shell_exec() returns null/false on failure and also when nothing was printed.
    if (!is_string($output) || trim($output) === '') {
        throw new DbnToolsHttpException(
            'PDF text extraction failed. The file may be image-only or encrypted.',
            422,
            'pdf_extract_failed'
        );
    }

    return $output;
}

/**
 * Extracts the text of a .docx file, one output line per Word paragraph.
 *
 * Reads word/document.xml from the archive and concatenates, per <w:p>, the
 * text of its <w:t> elements. Run-level <w:tab> becomes a tab and <w:br>/<w:cr>
 * become a newline so that words on either side are not fused together.
 * Content of paragraphs nested inside another paragraph (e.g. text boxes) is
 * emitted only for the nested paragraph, not repeated for the outer one.
 *
 * XML that cannot be parsed yields an empty string.
 *
 * @param string $path Path of the .docx file.
 *
 * @return string Paragraph texts joined with "\n" (may be empty).
 *
 * @throws DbnToolsHttpException 422 when the archive cannot be opened
 *                               (docx_open_failed), has no or an empty
 *                               word/document.xml (docx_no_content), or that
 *                               entry is unreasonably large (docx_too_large).
 */
function extractDocx(string $path): string
{
    $entryName = 'word/document.xml';
    $wordNs = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
    // Cap on the *uncompressed* XML: a small upload can inflate enormously
    // (zip bomb) and would otherwise exhaust memory while being read and parsed.
    $maxXmlBytes = 16 * 1024 * 1024;

    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed');
    }

    try {
        $entry = $zip->statName($entryName);
        if ($entry !== false && $entry['size'] > $maxXmlBytes) {
            throw new DbnToolsHttpException(
                'The .docx document is too large to process.',
                422,
                'docx_too_large'
            );
        }
        $xml = $entry === false ? false : $zip->getFromName($entryName);
    } finally {
        // Release the archive handle on every path, including the throw above.
        $zip->close();
    }

    // loadXML() rejects an empty string with a ValueError, so treat it as "no content".
    if ($xml === false || $xml === '') {
        throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
    }

    // Silence libxml warnings for this parse only, then restore the previous
    // process-wide setting instead of leaving it switched on.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        $doc = new DOMDocument();
        // LIBXML_NONET: never fetch network resources for untrusted XML.
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    if ($loaded === false) {
        return '';
    }

    $xpath = new DOMXPath($doc);
    $xpath->registerNamespace('w', $wordNs);

    // Text plus run-level tabs/breaks, in document order. The parent::w:r test
    // keeps tab-stop definitions (<w:tabs>/<w:tab> in paragraph properties) out.
    $contentQuery = './/*[self::w:t or ((self::w:tab or self::w:br or self::w:cr) and parent::w:r)]';

    // True when $para is the closest enclosing <w:p> of $node.
    $belongsTo = static function (DOMNode $node, DOMNode $para) use ($wordNs): bool {
        for ($ancestor = $node->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode) {
            if ($ancestor->localName === 'p' && $ancestor->namespaceURI === $wordNs) {
                return $ancestor->isSameNode($para);
            }
        }

        return false;
    };

    $paragraphs = [];
    foreach ($xpath->query('//w:p') as $para) {
        $line = '';
        foreach ($xpath->query($contentQuery, $para) as $node) {
            if (!$belongsTo($node, $para)) {
                // Owned by a nested paragraph, which gets its own output line.
                continue;
            }

            $line .= match ($node->localName) {
                't' => $node->textContent,
                'tab' => "\t",
                default => "\n", // w:br and w:cr
            };
        }
        $paragraphs[] = $line;
    }

    return implode("\n", $paragraphs);
}