dobetternorge-tools/api/extract.php

<?php
declare(strict_types=1);

require_once __DIR__ . '/../includes/bootstrap.php';

// Endpoint guards: the helpers are defined in the bootstrap include.
// NOTE(review): presumably they reject non-POST / unauthenticated requests
// and stop the script — confirm against includes/bootstrap.php.
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Largest accepted upload, in bytes (4 * 1024 * 1024).
const EXTRACT_MAX_BYTES = 4 * 1024 * 1024;
// Maximum number of characters (not bytes) of extracted text returned.
const EXTRACT_TEXT_LIMIT = 32000;
// Lower-case file extensions this endpoint knows how to extract.
const EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];

try {
    // --- 1. Make sure a "file" upload entry is present at all ---------------
    $upload = $_FILES['file'] ?? null;
    if (!is_array($upload) || $upload === []) {
        dbnToolsError('No file was uploaded.', 422, 'missing_file');
    }

    // --- 2. Translate PHP's upload status into a client-facing message -----
    $uploadStatus = (int)($upload['error'] ?? UPLOAD_ERR_NO_FILE);
    if ($uploadStatus !== UPLOAD_ERR_OK) {
        $knownFailures = [
            UPLOAD_ERR_INI_SIZE   => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_FORM_SIZE  => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.',
            UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.',
        ];
        // Any status not listed above gets the generic message.
        dbnToolsError($knownFailures[$uploadStatus] ?? 'File upload failed.', 422, 'upload_error');
    }

    // --- 3. Validate the upload itself --------------------------------------
    // basename() drops any directory components the client put in the name.
    $clientName = basename((string)($upload['name'] ?? ''));
    $storedAt   = (string)($upload['tmp_name'] ?? '');
    $byteCount  = (int)($upload['size'] ?? 0);

    if (!is_uploaded_file($storedAt)) {
        dbnToolsError('Invalid file upload.', 400, 'invalid_upload');
    }
    if ($byteCount === 0) {
        dbnToolsError('The uploaded file is empty.', 422, 'file_empty');
    }
    if ($byteCount > EXTRACT_MAX_BYTES) {
        dbnToolsError('File exceeds the 4 MB limit.', 413, 'file_too_large');
    }

    // The extractor is chosen purely from the client-supplied extension.
    $extension = strtolower(pathinfo($clientName, PATHINFO_EXTENSION));
    if (!in_array($extension, EXTRACT_ALLOWED_EXTS, true)) {
        dbnToolsError('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type');
    }

    // --- 4. Extract, trim and reject empty results --------------------------
    $extracted = trim(match ($extension) {
        'txt'  => extractTxt($storedAt),
        'pdf'  => extractPdf($storedAt),
        'docx' => extractDocx($storedAt),
    });
    if ($extracted === '') {
        dbnToolsError('No text could be extracted from this file.', 422, 'no_text');
    }

    // --- 5. Cap the response at EXTRACT_TEXT_LIMIT characters ---------------
    $wasCut = mb_strlen($extracted, 'UTF-8') > EXTRACT_TEXT_LIMIT;
    if ($wasCut) {
        $extracted = mb_substr($extracted, 0, EXTRACT_TEXT_LIMIT, 'UTF-8');
    }

    // "chars" reports the length of the text actually returned (post-truncation).
    dbnToolsRespond([
        'ok'        => true,
        'text'      => $extracted,
        'filename'  => $clientName,
        'chars'     => mb_strlen($extracted, 'UTF-8'),
        'truncated' => $wasCut,
    ]);
} catch (DbnToolsHttpException $httpFailure) {
    // Extractor failures already carry their own status, code and extras.
    dbnToolsError($httpFailure->getMessage(), $httpFailure->status, $httpFailure->errorCode, $httpFailure->extra);
} catch (Throwable $unexpected) {
    // Log the detail server-side; the client only sees a generic 500.
    error_log('DBN extract error: ' . $unexpected->getMessage());
    dbnToolsError('Text extraction failed.', 500, 'extract_error');
}

/**
 * Reads a plain-text file and returns its contents as UTF-8.
 *
 * The source encoding is resolved deterministically, in this order:
 *  1. A leading UTF-8 byte-order mark is stripped (the remainder is still
 *     validated in step 3).
 *  2. A leading UTF-16 LE/BE byte-order mark selects the matching decoder.
 *  3. Content that is valid UTF-8 is returned unchanged.
 *  4. Anything else is decoded as Windows-1252. That code page agrees with
 *     ISO-8859-1 for bytes 0xA0-0xFF and additionally maps 0x80-0x9F to
 *     printable characters (curly quotes, dashes, the euro sign), so legacy
 *     Western European files of either flavour come out readable.
 *
 * @param string $path Filesystem path of the file to read.
 *
 * @return string The file contents encoded as UTF-8, without a byte-order mark.
 *
 * @throws DbnToolsHttpException Status 500, code "read_error", when the file cannot be read.
 */
function extractTxt(string $path): string
{
    $content = file_get_contents($path);
    if ($content === false) {
        throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error');
    }

    // Byte-order marks are encoding metadata, not text: honour them, then drop them.
    if (str_starts_with($content, "\xEF\xBB\xBF")) {
        $content = substr($content, 3);
    } elseif (str_starts_with($content, "\xFF\xFE")) {
        return mb_convert_encoding(substr($content, 2), 'UTF-8', 'UTF-16LE');
    } elseif (str_starts_with($content, "\xFE\xFF")) {
        return mb_convert_encoding(substr($content, 2), 'UTF-8', 'UTF-16BE');
    }

    // Strict validation, independent of the mbstring.strict_detection ini setting.
    if (mb_check_encoding($content, 'UTF-8')) {
        return $content;
    }

    // Single-byte fallback; see step 4 of the docblock for why Windows-1252.
    return mb_convert_encoding($content, 'UTF-8', 'Windows-1252');
}

/**
 * Extracts text from a PDF by invoking the external `pdftotext` program.
 *
 * The path is shell-escaped before being placed on the command line, the
 * output target is given as "-", and stderr is redirected to /dev/null.
 *
 * @param string $path Filesystem path of the PDF to read.
 *
 * @return string The raw (untrimmed) output captured from the command.
 *
 * @throws DbnToolsHttpException Status 422, code "pdf_extract_failed", when the
 *                               command yields no output or only whitespace.
 */
function extractPdf(string $path): string
{
    $command = sprintf('pdftotext %s - 2>/dev/null', escapeshellarg($path));
    $stdout  = shell_exec($command);

    // shell_exec() yields null/false instead of a string when nothing usable came back.
    $hasText = is_string($stdout) && trim($stdout) !== '';
    if ($hasText) {
        return $stdout;
    }

    throw new DbnToolsHttpException(
        'PDF text extraction failed. The file may be image-only or encrypted.',
        422,
        'pdf_extract_failed'
    );
}

/**
 * Extracts the visible text of a .docx file's main document part.
 *
 * The file is opened as a ZIP archive, `word/document.xml` is parsed, and
 * every `w:p` (paragraph) element becomes one line of output. Within a
 * paragraph, `w:t` text is concatenated in document order; `w:tab` inside a
 * run becomes a tab character and `w:br` / `w:cr` inside a run become a
 * newline, so words separated only by those elements are not fused together.
 *
 * @param string $path Filesystem path of the .docx file.
 *
 * @return string Paragraph texts joined with "\n". An empty string is returned
 *                when the document XML cannot be parsed or has no paragraphs.
 *
 * @throws DbnToolsHttpException Status 422, code "docx_open_failed", when the file is not a readable ZIP.
 * @throws DbnToolsHttpException Status 422, code "docx_no_content", when `word/document.xml`
 *                               is missing or empty.
 */
function extractDocx(string $path): string
{
    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed');
    }

    $xml = $zip->getFromName('word/document.xml');
    $zip->close();

    // An empty part is treated like a missing one: DOMDocument::loadXML() throws
    // a ValueError on an empty string, which would otherwise surface as a 500.
    if ($xml === false || $xml === '') {
        throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
    }

    $doc = new DOMDocument();

    // Silence libxml warnings for this parse only, then put the previous
    // error-handling mode back so the setting does not leak to later code.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        // LIBXML_NONET: the XML comes from an upload, so forbid network access while parsing.
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    if ($loaded === false) {
        // Unparseable XML yields no text.
        return '';
    }

    $xpath = new DOMXPath($doc);
    $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

    $paragraphs = [];
    foreach ($xpath->query('//w:p') as $para) {
        $pieces = [];
        // Tabs and breaks are matched only as children of a run (w:r) so that
        // tab-stop definitions under w:pPr/w:tabs are not mistaken for tab characters.
        // A union query returns its nodes in document order.
        foreach ($xpath->query('.//w:t | .//w:r/w:tab | .//w:r/w:br | .//w:r/w:cr', $para) as $node) {
            $pieces[] = match ($node->localName) {
                'tab'      => "\t",
                'br', 'cr' => "\n",
                default    => $node->textContent,
            };
        }
        $paragraphs[] = implode('', $pieces);
    }

    return implode("\n", $paragraphs);
}