bbe5307c03
api/extract.php — new endpoint accepting .pdf/.docx/.txt up to 4 MB; pdftotext for PDFs, ZipArchive+DOMXPath for DOCX, mb_convert_encoding for TXT; truncates to 32 000 chars to stay within redact limit. index.php — drop/browse upload zone above the textarea, visible only in Redact mode. tools.js — setupUpload(), handleFileUpload(), resetUpload(); drag-and-drop and file picker both call the extract endpoint then populate the textarea. tools.css — upload zone, drag-over, file-info, clear button styles. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
138 lines
4.3 KiB
PHP
138 lines
4.3 KiB
PHP
<?php
|
|
declare(strict_types=1);
|
|
|
|
require_once __DIR__ . '/../includes/bootstrap.php';
|
|
|
|
// Request guards. Both helpers are defined outside this file.
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Largest accepted upload, in bytes (4 MiB).
const EXTRACT_MAX_BYTES = 4 * 1024 * 1024;

// Maximum number of UTF-8 characters returned to the client; longer text is cut.
const EXTRACT_TEXT_LIMIT = 32000;

// Lower-case file extensions this endpoint knows how to extract text from.
const EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];

try {
    // --- 1. Make sure a single "file" upload arrived intact -----------------
    $hasUpload = !empty($_FILES['file']) && is_array($_FILES['file']);
    if (!$hasUpload) {
        dbnToolsError('No file was uploaded.', 422, 'missing_file');
    }

    $upload = $_FILES['file'];
    $uploadStatus = (int)($upload['error'] ?? UPLOAD_ERR_NO_FILE);

    if ($uploadStatus !== UPLOAD_ERR_OK) {
        // Known PHP upload error codes get a specific message; the rest share a generic one.
        $knownFailures = [
            UPLOAD_ERR_INI_SIZE   => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_FORM_SIZE  => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.',
            UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.',
        ];
        $failureMessage = $knownFailures[$uploadStatus] ?? 'File upload failed.';
        dbnToolsError($failureMessage, 422, 'upload_error');
    }

    // --- 2. Validate the upload's path, size and extension ------------------
    // basename() drops any directory components the client put in the name.
    $originalName = basename((string)($upload['name'] ?? ''));
    $tmpPath = (string)($upload['tmp_name'] ?? '');
    $byteCount = (int)($upload['size'] ?? 0);

    if (!is_uploaded_file($tmpPath)) {
        dbnToolsError('Invalid file upload.', 400, 'invalid_upload');
    }
    if ($byteCount === 0) {
        dbnToolsError('The uploaded file is empty.', 422, 'file_empty');
    }
    if ($byteCount > EXTRACT_MAX_BYTES) {
        dbnToolsError('File exceeds the 4 MB limit.', 413, 'file_too_large');
    }

    $ext = strtolower(pathinfo($originalName, PATHINFO_EXTENSION));
    $isSupported = in_array($ext, EXTRACT_ALLOWED_EXTS, true);
    if (!$isSupported) {
        dbnToolsError('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type');
    }

    // --- 3. Extract, trim and cap the text ----------------------------------
    // The arms mirror EXTRACT_ALLOWED_EXTS, so no default arm is provided.
    $text = trim(match ($ext) {
        'txt' => extractTxt($tmpPath),
        'pdf' => extractPdf($tmpPath),
        'docx' => extractDocx($tmpPath),
    });

    if ($text === '') {
        dbnToolsError('No text could be extracted from this file.', 422, 'no_text');
    }

    // Lengths are measured in UTF-8 characters, not bytes.
    $truncated = mb_strlen($text, 'UTF-8') > EXTRACT_TEXT_LIMIT;
    if ($truncated) {
        $text = mb_substr($text, 0, EXTRACT_TEXT_LIMIT, 'UTF-8');
    }

    // --- 4. Respond ----------------------------------------------------------
    $payload = [
        'ok' => true,
        'text' => $text,
        'filename' => $originalName,
        'chars' => mb_strlen($text, 'UTF-8'),
        'truncated' => $truncated,
    ];
    dbnToolsRespond($payload);
} catch (DbnToolsHttpException $e) {
    // Failures raised by the extractors carry their own status, code and extras.
    dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
} catch (Throwable $e) {
    // Anything unexpected is logged server-side and reported generically.
    error_log('DBN extract error: ' . $e->getMessage());
    dbnToolsError('Text extraction failed.', 500, 'extract_error');
}
|
|
|
|
/**
 * Read a plain-text upload and return its contents as UTF-8.
 *
 * Decoding is deterministic rather than heuristic:
 *  - a UTF-16 byte-order mark selects UTF-16LE / UTF-16BE;
 *  - a UTF-8 byte-order mark is stripped so it never reaches the output;
 *  - bytes that already validate as UTF-8 are returned unchanged;
 *  - anything else is decoded as Windows-1252, which matches ISO-8859-1 for
 *    every printable character and additionally maps 0x80-0x9F to the
 *    curly quotes / dashes commonly found in Windows text files.
 *
 * @param string $path Filesystem path of the file to read.
 *
 * @return string The file contents as UTF-8, without a leading byte-order mark.
 *
 * @throws DbnToolsHttpException Status 500, code "read_error", when the file cannot be read.
 */
function extractTxt(string $path): string
{
    $content = file_get_contents($path);
    if ($content === false) {
        throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error');
    }

    // UTF-16 with a BOM: decode explicitly, otherwise every second byte
    // would surface as a NUL character in the result.
    if (str_starts_with($content, "\xFF\xFE")) {
        return mb_convert_encoding(substr($content, 2), 'UTF-8', 'UTF-16LE');
    }
    if (str_starts_with($content, "\xFE\xFF")) {
        return mb_convert_encoding(substr($content, 2), 'UTF-8', 'UTF-16BE');
    }

    // UTF-8 BOM: drop it; trim() does not treat U+FEFF as whitespace.
    if (str_starts_with($content, "\xEF\xBB\xBF")) {
        $content = substr($content, 3);
    }

    // Valid UTF-8 needs no conversion at all.
    if (mb_check_encoding($content, 'UTF-8')) {
        return $content;
    }

    // Legacy single-byte text.
    return mb_convert_encoding($content, 'UTF-8', 'Windows-1252');
}
|
|
|
|
/**
 * Extract the text layer of a PDF by running the `pdftotext` command-line tool.
 *
 * pdftotext separates pages with form-feed characters (see its -nopgbrk
 * option). PHP's default trim() does not strip "\f", so an image-only PDF,
 * whose output consists solely of form feeds, would otherwise look non-empty.
 * Form feeds are therefore converted to newlines before the emptiness check,
 * which also keeps them out of the returned text.
 *
 * @param string $path Filesystem path of the PDF to convert.
 *
 * @return string The extracted text, with page breaks represented as newlines.
 *
 * @throws DbnToolsHttpException Status 422, code "pdf_extract_failed", when the
 *                               command yields no output or only whitespace.
 */
function extractPdf(string $path): string
{
    // "-enc UTF-8" pins the output encoding: the caller measures the text as
    // UTF-8, and not every pdftotext build defaults to it.
    // "-" writes the text to stdout; stderr is discarded.
    $cmd = 'pdftotext -enc UTF-8 ' . escapeshellarg($path) . ' - 2>/dev/null';
    $output = shell_exec($cmd);

    // shell_exec() yields null/false on failure or when nothing was printed.
    $text = is_string($output) ? str_replace("\f", "\n", $output) : '';

    if (trim($text) === '') {
        throw new DbnToolsHttpException(
            'PDF text extraction failed. The file may be image-only or encrypted.',
            422,
            'pdf_extract_failed'
        );
    }

    return $text;
}
|
|
|
|
/**
 * Extract the visible text of a .docx file.
 *
 * Opens the file as a ZIP archive, reads word/document.xml and walks every
 * WordprocessingML paragraph (w:p). Within a paragraph, text runs (w:t) are
 * concatenated, tab characters (w:tab inside a run) become "\t" and line
 * breaks (w:br / w:cr) become "\n". Paragraphs are joined with "\n".
 *
 * @param string $path Filesystem path of the .docx file.
 *
 * @return string The document text; an empty string when the XML cannot be
 *                parsed or contains no paragraphs.
 *
 * @throws DbnToolsHttpException Status 422 with code "docx_open_failed" when the
 *                               archive cannot be opened, or "docx_no_content"
 *                               when word/document.xml is missing or empty.
 */
function extractDocx(string $path): string
{
    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed');
    }

    $xml = $zip->getFromName('word/document.xml');
    $zip->close();

    // An empty entry is treated like a missing one: DOMDocument::loadXML()
    // rejects an empty string (ValueError on PHP 8), which would otherwise
    // surface as a generic server error.
    if ($xml === false || $xml === '') {
        throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
    }

    $doc = new DOMDocument();

    // Collect libxml parse errors silently, then restore the previous
    // setting so this function does not change global libxml state.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        // LIBXML_NONET: the XML comes from an untrusted upload, so forbid
        // network access while parsing.
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    if ($loaded === false) {
        // Unparseable XML yields no text; the caller reports empty results.
        return '';
    }

    $xpath = new DOMXPath($doc);
    $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

    // Text plus in-run whitespace elements, returned in document order.
    // w:tab is only honoured directly inside a run, because the same element
    // name is also used for tab-stop definitions in paragraph properties.
    $inlineQuery = './/*[self::w:t or self::w:br or self::w:cr or (self::w:tab and parent::w:r)]';

    $paragraphs = [];
    foreach ($xpath->query('//w:p') as $para) {
        $line = '';
        foreach ($xpath->query($inlineQuery, $para) as $node) {
            $line .= match ($node->localName) {
                'tab' => "\t",
                'br', 'cr' => "\n",
                default => $node->textContent,
            };
        }
        $paragraphs[] = $line;
    }

    return implode("\n", $paragraphs);
}
|