Add chunked timeline routing
This commit is contained in:
+277
-3
@@ -7,6 +7,7 @@ require_once __DIR__ . '/AzureOpenAiGateway.php';
|
||||
final class DbnLegalToolsService
|
||||
{
|
||||
private const MAX_PASTE_CHARS = 128000;
|
||||
private const MAX_TIMELINE_CHARS = 600000;
|
||||
|
||||
private DbnAzureOpenAiGateway $azure;
|
||||
|
||||
@@ -353,7 +354,7 @@ PROMPT;
|
||||
string $userNotes = '',
|
||||
?callable $onProgress = null
|
||||
): array {
|
||||
$text = $this->requirePasteText($text);
|
||||
$text = $this->requirePasteText($text, self::MAX_TIMELINE_CHARS);
|
||||
$engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini';
|
||||
$focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all';
|
||||
|
||||
@@ -382,6 +383,23 @@ PROMPT;
|
||||
? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---"
|
||||
: '';
|
||||
|
||||
$charCount = mb_strlen($text, 'UTF-8');
|
||||
$singlePassLimit = $this->timelineSinglePassLimit($engine);
|
||||
if ($charCount > $singlePassLimit) {
|
||||
return $this->timelineChunked(
|
||||
$text,
|
||||
$language,
|
||||
$engine,
|
||||
$focus,
|
||||
$confidenceFilter,
|
||||
$includeRelative,
|
||||
$includeBackground,
|
||||
$userNotes,
|
||||
$onProgress,
|
||||
$inputDateHintCount
|
||||
);
|
||||
}
|
||||
|
||||
$prompt = <<<PROMPT
|
||||
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
|
||||
|
||||
@@ -589,6 +607,261 @@ PROMPT;
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Build a timeline for text that is too long for a single pass.
 *
 * The text is split into overlapping chunks, each chunk is run through
 * timeline() on its own, and the per-chunk events are merged, de-duplicated
 * and sorted. A chunk whose extraction throws is counted as a failure and
 * handed to the deterministic fallback extractor instead of aborting the run.
 *
 * @param string        $text              Full pasted text.
 * @param string        $language          Echoed in the result and used for the disclaimer.
 * @param string        $engine            'nova_lite', 'azure_mini' or 'azure_full'.
 * @param string        $focus             'all', 'deadlines', 'hearings' or 'cps'.
 * @param string        $confidenceFilter  'high_medium' drops low-confidence events after merging.
 * @param bool          $includeRelative   When false, only events with date_type 'absolute' are kept.
 * @param bool          $includeBackground Forwarded unchanged to timeline() for every chunk.
 * @param string        $userNotes         Forwarded unchanged to timeline() for every chunk.
 * @param callable|null $onProgress        Optional progress callback receiving a status string.
 * @param int           $inputDateHintCount Echoed in trace_metadata.
 *
 * @return array Result payload with the merged events, trace and trace metadata.
 */
private function timelineChunked(
    string $text,
    string $language,
    string $engine,
    string $focus,
    string $confidenceFilter,
    bool $includeRelative,
    bool $includeBackground,
    string $userNotes,
    ?callable $onProgress,
    int $inputDateHintCount
): array {
    $deployment = match ($engine) {
        'azure_full' => 'gpt-4o',
        'nova_lite' => 'nova-lite',
        default => 'gpt-4o-mini',
    };
    $pieces = $this->timelineTextChunks($text, $this->timelineChunkSize($engine), 900);
    $pieceTotal = count($pieces);
    $collected = [];
    $failedPieces = 0;
    $fallbackUsed = false;

    if ($onProgress) {
        $onProgress('Splitting timeline into ' . $pieceTotal . " chunk(s)\u{2026}");
    }

    foreach ($pieces as $position => $piece) {
        $ordinal = $position + 1;
        $body = trim((string)$piece['text']);
        // Chunks shorter than 20 characters are skipped entirely.
        if (mb_strlen($body, 'UTF-8') < 20) {
            continue;
        }

        if ($onProgress) {
            $onProgress("Extracting timeline chunk {$ordinal}/{$pieceTotal}\u{2026}");
        }

        try {
            // No progress callback is passed down for the per-chunk run.
            $result = $this->timeline(
                $body,
                $language,
                $engine,
                $focus,
                $confidenceFilter,
                $includeRelative,
                $includeBackground,
                $userNotes,
                null
            );
            $found = is_array($result['events'] ?? null) ? $result['events'] : [];
            if (!empty($result['trace_metadata']['used_fallback_extractor'])) {
                $fallbackUsed = true;
            }
        } catch (DbnToolsHttpException $e) {
            $failedPieces++;
            // Only try the fallback extractor when the chunk has date hints at all.
            $found = $this->timelineDateHintCount($body) > 0
                ? $this->fallbackTimelineEvents($body)
                : [];
            if ($found) {
                $fallbackUsed = true;
            } elseif ($e->status >= 500) {
                error_log('timeline chunk failed: ' . $e->errorCode . ' ' . $e->getMessage());
            }
        } catch (Throwable $e) {
            $failedPieces++;
            $found = $this->fallbackTimelineEvents($body);
            if ($found) {
                $fallbackUsed = true;
            }
            error_log('timeline chunk throwable: ' . $e->getMessage());
        }

        foreach ($found as $event) {
            if (is_array($event)) {
                // Tag every event with the chunk it came from and that chunk's start offset.
                $collected[] = array_replace($event, [
                    'chunk_index' => $ordinal,
                    'source_position' => (int)$piece['start'],
                ]);
            }
        }
    }

    $collected = $this->mergeTimelineEvents($collected);
    if ($confidenceFilter === 'high_medium') {
        $collected = array_values(array_filter(
            $collected,
            static fn ($item) => ($item['confidence'] ?? 'low') !== 'low'
        ));
    }
    if (!$includeRelative) {
        $collected = array_values(array_filter(
            $collected,
            static fn ($item) => ($item['date_type'] ?? 'absolute') === 'absolute'
        ));
    }

    $focusLabel = [
        'deadlines' => 'legal deadlines',
        'hearings' => 'court hearings',
        'cps' => 'CPS milestones',
    ][$focus] ?? 'all events';

    // Date range for the summary, taken from events with a full ISO date only.
    $isoDates = [];
    foreach ($collected as $item) {
        $candidate = (string)($item['date'] ?? '');
        if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $candidate)) {
            $isoDates[] = $candidate;
        }
    }
    sort($isoDates);
    $range = '';
    if ($isoDates) {
        $range = ' from ' . $isoDates[0] . ' to ' . $isoDates[array_key_last($isoDates)];
    }

    $actorNames = array_map(static fn ($item) => (string)($item['actor'] ?? ''), $collected);
    $namedActors = array_filter($actorNames, static fn ($name) => $name !== '' && $name !== 'unknown');
    $actors = array_values(array_unique($namedActors));

    $eventTotal = count($collected);
    $summary = $eventTotal . " event(s) extracted from {$pieceTotal} chunk(s){$range}.";
    if ($actors) {
        $summary .= ' Main actors: ' . implode(', ', array_slice($actors, 0, 8)) . '.';
    }

    $uncertain = [];
    if ($failedPieces > 0) {
        $uncertain[] = "{$failedPieces} chunk(s) needed fallback extraction or could not be fully parsed; review medium-confidence entries against the source.";
    }
    if ($fallbackUsed) {
        $uncertain[] = 'Deterministic fallback extraction was used for at least one chunk.';
    }

    $nextStep = 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.';

    $trace = [
        $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$deployment}. Chunked timeline mode; without saving the text or output.", 'complete'),
        $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text split into local chunks.', 'complete'),
        $this->trace('Evidence found', $eventTotal . " event(s) identified across {$pieceTotal} chunk(s).", $eventTotal ? 'complete' : 'warning'),
        $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text. Overlapping chunks were de-duplicated.', 'complete'),
        $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), $uncertain ? 'warning' : 'complete'),
        $this->trace('Next practical step', $nextStep, 'complete'),
    ];

    return [
        'tool' => 'timeline',
        'language' => $language,
        'what_we_found' => $summary,
        'events' => $collected,
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; split into local timeline chunks; not stored.']],
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => $nextStep,
        'trace' => $trace,
        'trace_metadata' => [
            // NOTE(review): 'chunk_count' carries the merged event count, while
            // 'source_count' and 'timeline_chunk_count' carry the number of text chunks.
            'chunk_count' => $eventTotal,
            'source_count' => $pieceTotal,
            'deployment' => $deployment,
            'input_date_hint_count' => $inputDateHintCount,
            'used_fallback_extractor' => $fallbackUsed,
            'chunked_timeline' => true,
            'timeline_chunk_count' => $pieceTotal,
            'chunk_failures' => $failedPieces,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
|
||||
|
||||
/**
 * Largest input, in characters, that is processed in one pass for the
 * given engine; longer input is routed to the chunked path by the caller.
 * Engines other than 'nova_lite' and 'azure_mini' get the largest limit.
 */
private function timelineSinglePassLimit(string $engine): int
{
    $limits = [
        'nova_lite' => 25000,
        'azure_mini' => 55000,
    ];

    return $limits[$engine] ?? 128000;
}
|
||||
|
||||
/**
 * Target chunk length, in characters, used when a timeline input is split
 * for the given engine. Unlisted engines get the largest chunk size.
 */
private function timelineChunkSize(string $engine): int
{
    $sizes = [
        'nova_lite' => 10000,
        'azure_mini' => 16000,
    ];

    return $sizes[$engine] ?? 30000;
}
|
||||
|
||||
/**
 * Split text into overlapping chunks, preferring to cut at a paragraph
 * break ("\n\n") and otherwise at a line break ("\n").
 *
 * All positions and sizes are counted in UTF-8 characters, not bytes.
 * A boundary is only used when it lies far enough into the window:
 * a paragraph break before 55% of the chunk size is replaced by the last
 * line break, and any break at or before 45% is ignored in favour of a
 * hard cut at the chunk size.
 *
 * @param string $text      Text to split.
 * @param int    $chunkSize Target chunk length; values below 1 are treated as 1.
 * @param int    $overlap   Characters repeated at the start of the next chunk;
 *                          negative values are treated as 0.
 *
 * @return array<int, array{start: int, text: string}> Chunks in document order.
 *         'start' is the offset of the untrimmed chunk; 'text' is trimmed, and
 *         chunks that are empty after trimming are omitted.
 */
private function timelineTextChunks(string $text, int $chunkSize, int $overlap): array
{
    // A non-positive chunk size would never advance $start (endless loop), and a
    // negative overlap would jump past $end and silently drop text.
    $chunkSize = max(1, $chunkSize);
    $overlap = max(0, $overlap);

    $len = mb_strlen($text, 'UTF-8');
    $chunks = [];
    $start = 0;
    while ($start < $len) {
        $targetEnd = min($len, $start + $chunkSize);
        $window = mb_substr($text, $start, $targetEnd - $start, 'UTF-8');
        $end = $targetEnd;
        // Only look for a softer boundary when this is not the final chunk.
        if ($targetEnd < $len) {
            $breakAt = mb_strrpos($window, "\n\n", 0, 'UTF-8');
            if ($breakAt === false || $breakAt < (int)($chunkSize * 0.55)) {
                $breakAt = mb_strrpos($window, "\n", 0, 'UTF-8');
            }
            if ($breakAt !== false && $breakAt > (int)($chunkSize * 0.45)) {
                $end = $start + $breakAt;
            }
        }

        $chunkText = trim(mb_substr($text, $start, max(1, $end - $start), 'UTF-8'));
        if ($chunkText !== '') {
            $chunks[] = ['start' => $start, 'text' => $chunkText];
        }
        if ($end >= $len) {
            break;
        }

        // Step back by the overlap, but always move forward: if the overlap
        // would not advance past the current start, continue from $end.
        $nextStart = max(0, $end - $overlap);
        if ($nextStart <= $start) {
            $nextStart = $end;
        }
        $start = $nextStart;
    }

    return $chunks;
}
|
||||
|
||||
/**
 * De-duplicate timeline events and return them in chronological order.
 *
 * Events with the same signature (see timelineEventSignature()) are collapsed
 * into one. The higher-confidence copy wins; on a tie the first one seen is
 * kept. The source excerpt of the discarded copy is appended to the kept
 * excerpt with ' / ' while the kept excerpt is shorter than 260 characters.
 *
 * Events are sorted by ISO date (YYYY-MM-DD), then by time; events whose date
 * is not a full ISO date are placed after all dated events.
 *
 * @param array $events Raw events; entries that are not arrays are ignored.
 *
 * @return array Merged events as a list, in sorted order.
 */
private function mergeTimelineEvents(array $events): array
{
    $merged = [];
    foreach ($events as $event) {
        if (!is_array($event)) {
            continue;
        }
        $key = $this->timelineEventSignature($event);
        if (!isset($merged[$key])) {
            $merged[$key] = $event;
            continue;
        }

        $existing = $merged[$key];
        $additionalExcerpt = (string)($event['source_excerpt'] ?? '');
        $incomingRank = $this->timelineConfidenceRank((string)($event['confidence'] ?? 'medium'));
        $existingRank = $this->timelineConfidenceRank((string)($existing['confidence'] ?? 'medium'));
        if ($incomingRank > $existingRank) {
            // The new copy replaces the old one; the old excerpt becomes the add-on.
            $merged[$key] = $event;
            $additionalExcerpt = (string)($existing['source_excerpt'] ?? '');
        }

        $keptExcerpt = (string)($merged[$key]['source_excerpt'] ?? '');
        if (
            $additionalExcerpt !== ''
            && $keptExcerpt !== ''
            // Skip excerpts that are already part of the kept text; otherwise a
            // third duplicate would produce "A / B / B".
            && !str_contains($keptExcerpt, $additionalExcerpt)
            && mb_strlen($keptExcerpt, 'UTF-8') < 260
        ) {
            $merged[$key]['source_excerpt'] = $keptExcerpt . ' / ' . $additionalExcerpt;
        }
    }

    $events = array_values($merged);
    usort($events, static function (array $a, array $b): int {
        $ad = (string)($a['date'] ?? '');
        $bd = (string)($b['date'] ?? '');
        // '9999-99-99' sorts after every real ISO date, pushing undated events last.
        $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99';
        $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99';
        $cmp = strcmp($ai, $bi);
        if ($cmp !== 0) {
            return $cmp;
        }

        return strcmp((string)($a['time'] ?? ''), (string)($b['time'] ?? ''));
    });

    return $events;
}
|
||||
|
||||
/**
 * Build the de-duplication key for a timeline event.
 *
 * The key joins the lower-cased, trimmed date, time and actor (actor defaults
 * to 'unknown') with the first 96 characters of the event text, after every
 * run of characters that are neither letters nor digits has been reduced to
 * a single space.
 */
private function timelineEventSignature(array $event): string
{
    $normalise = static fn ($value): string => mb_strtolower(trim((string)$value), 'UTF-8');

    $parts = [
        $normalise($event['date'] ?? ''),
        $normalise($event['time'] ?? ''),
        $normalise($event['actor'] ?? 'unknown'),
    ];

    $description = $normalise($event['event'] ?? '');
    $description = (string)preg_replace('/[^\p{L}\p{N}]+/u', ' ', $description);
    $description = trim((string)preg_replace('/\s+/u', ' ', $description));
    $parts[] = mb_substr($description, 0, 96, 'UTF-8');

    return implode('|', $parts);
}
|
||||
|
||||
/**
 * Map a confidence label to a comparable rank: 'high' is 3, 'medium' is 2,
 * and every other value is 1.
 */
private function timelineConfidenceRank(string $confidence): int
{
    $ranks = [
        'high' => 3,
        'medium' => 2,
    ];

    return $ranks[$confidence] ?? 1;
}
|
||||
|
||||
private function timelineDateHintCount(string $text): int
|
||||
{
|
||||
preg_match_all('/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/u', $text, $numeric);
|
||||
@@ -1106,13 +1379,14 @@ PROMPT;
|
||||
return array_slice(array_values(array_unique($terms)), 0, 6);
|
||||
}
|
||||
|
||||
private function requirePasteText(string $text): string
|
||||
private function requirePasteText(string $text, ?int $maxChars = null): string
|
||||
{
|
||||
$text = trim($text);
|
||||
if (mb_strlen($text, 'UTF-8') < 20) {
|
||||
dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short');
|
||||
}
|
||||
if (mb_strlen($text, 'UTF-8') > self::MAX_PASTE_CHARS) {
|
||||
$maxChars ??= self::MAX_PASTE_CHARS;
|
||||
if (mb_strlen($text, 'UTF-8') > $maxChars) {
|
||||
dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long');
|
||||
}
|
||||
return $text;
|
||||
|
||||
+108
-8
@@ -1,6 +1,7 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/bootstrap.php';
|
||||
require_once __DIR__ . '/FreeTier.php';
|
||||
|
||||
/**
|
||||
@@ -14,6 +15,9 @@ final class ToolModels
|
||||
public const TIMELINE_QUICK_CHAR_LIMIT = 25000;
|
||||
public const TIMELINE_STANDARD_CHAR_LIMIT = 55000;
|
||||
public const TIMELINE_DEEP_CHAR_LIMIT = 128000;
|
||||
public const TIMELINE_QUICK_MAX_CHARS = 100000;
|
||||
public const TIMELINE_STANDARD_MAX_CHARS = 300000;
|
||||
public const TIMELINE_DEEP_MAX_CHARS = 600000;
|
||||
|
||||
public static function engineForUser(int $userId, string $requestedEngine): string
|
||||
{
|
||||
@@ -38,22 +42,32 @@ final class ToolModels
|
||||
$tierEngine = self::engineForUser($userId, $requestedEngine);
|
||||
$charCount = mb_strlen($text, 'UTF-8');
|
||||
|
||||
if ($charCount > self::TIMELINE_DEEP_CHAR_LIMIT) {
|
||||
if ($charCount > self::TIMELINE_DEEP_MAX_CHARS) {
|
||||
throw new DbnToolsHttpException(
|
||||
'This timeline input is too large after selected documents or My Case context were added. Split the file or use fewer selected documents.',
|
||||
413,
|
||||
'timeline_input_too_large',
|
||||
['input_char_count' => $charCount, 'max_chars' => self::TIMELINE_DEEP_CHAR_LIMIT]
|
||||
['input_char_count' => $charCount, 'max_chars' => self::TIMELINE_DEEP_MAX_CHARS]
|
||||
);
|
||||
}
|
||||
|
||||
$effectiveEngine = $tierEngine;
|
||||
if ($charCount > self::TIMELINE_STANDARD_CHAR_LIMIT) {
|
||||
$effectiveEngine = 'azure_full';
|
||||
} elseif ($charCount > self::TIMELINE_QUICK_CHAR_LIMIT && $effectiveEngine === 'nova_lite') {
|
||||
$effectiveEngine = 'azure_mini';
|
||||
if ($charCount > self::timelineEngineMaxChars($effectiveEngine)) {
|
||||
$effectiveEngine = $charCount <= self::TIMELINE_STANDARD_MAX_CHARS ? 'azure_mini' : 'azure_full';
|
||||
} elseif ($charCount > self::TIMELINE_STANDARD_CHAR_LIMIT && $effectiveEngine === 'nova_lite') {
|
||||
$effectiveEngine = $charCount <= self::TIMELINE_QUICK_MAX_CHARS ? 'nova_lite' : 'azure_mini';
|
||||
}
|
||||
|
||||
if ($charCount > self::timelineEngineMaxChars($effectiveEngine)) {
|
||||
$effectiveEngine = 'azure_full';
|
||||
}
|
||||
|
||||
$credits = self::timelineCreditsForSize($effectiveEngine, $charCount);
|
||||
$baseCredits = self::timelineAdvertisedCredits($requestedEngine);
|
||||
$requiresConfirmation = $credits > $baseCredits
|
||||
|| self::timelineEngineRank($effectiveEngine) > self::timelineEngineRank($requestedEngine);
|
||||
$chunked = $charCount > self::timelineEngineLimit($effectiveEngine);
|
||||
|
||||
return [
|
||||
'requested_engine' => $requestedEngine,
|
||||
'tier_engine' => $tierEngine,
|
||||
@@ -61,13 +75,48 @@ final class ToolModels
|
||||
'auto_upgraded_engine' => $effectiveEngine !== $tierEngine,
|
||||
'input_char_count' => $charCount,
|
||||
'engine_limit_chars' => self::timelineEngineLimit($effectiveEngine),
|
||||
'credits' => self::timelineCredits($effectiveEngine),
|
||||
'max_char_limit' => self::timelineEngineMaxChars($effectiveEngine),
|
||||
'chunked_timeline' => $chunked,
|
||||
'timeline_chunk_count' => $chunked ? (int)ceil($charCount / self::timelineChunkSize($effectiveEngine)) : 1,
|
||||
'estimated_credits' => $credits,
|
||||
'credits' => $credits,
|
||||
'base_credits' => $baseCredits,
|
||||
'requires_confirmation' => $requiresConfirmation,
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Ensure the caller has accepted the quoted engine and credits for a route
 * that requires confirmation.
 *
 * Returns silently when the route needs no confirmation, or when the input
 * carries 'accepted_timeline_quote' together with 'accepted_credits' and
 * 'accepted_effective_engine' values matching the route.
 *
 * @param array $route Route description; reads 'requires_confirmation', 'credits',
 *                     'effective_engine', 'input_char_count' and 'timeline_chunk_count'.
 * @param array $input Request input carrying the acceptance fields.
 *
 * @throws DbnToolsHttpException 409 'timeline_quote_required', with the quote
 *                               attached, when the quote has not been accepted.
 */
public static function assertTimelineQuoteAccepted(array $route, array $input): void
{
    if (empty($route['requires_confirmation'])) {
        return;
    }

    // Each check is evaluated only when the previous one passed.
    $acknowledged = !empty($input['accepted_timeline_quote']);
    $creditsMatch = $acknowledged
        && (int)($input['accepted_credits'] ?? 0) === (int)$route['credits'];
    $engineMatches = $creditsMatch
        && (string)($input['accepted_effective_engine'] ?? '') === (string)$route['effective_engine'];
    if ($engineMatches) {
        return;
    }

    $engineLabel = self::timelineEngineLabel((string)$route['effective_engine']);
    $quoteMessage = 'Timeline will use ' . $engineLabel . ' for '
        . number_format((int)$route['input_char_count'])
        . ' characters across about ' . (int)$route['timeline_chunk_count']
        . ' chunk(s), costing ' . (int)$route['credits'] . ' credit(s).';
    $quote = array_merge($route, [
        'effective_engine_label' => $engineLabel,
        'message' => $quoteMessage,
    ]);

    throw new DbnToolsHttpException(
        'This timeline is larger than the selected engine can handle at the advertised price. Confirm the quoted engine and credits before running.',
        409,
        'timeline_quote_required',
        ['timeline_quote' => $quote]
    );
}
|
||||
|
||||
/**
 * Advertised credit cost of a timeline run for the given engine.
 *
 * Delegates to timelineAdvertisedCredits(). The earlier inline copy of the
 * same rule made this call an unreachable second return and was removed.
 */
public static function timelineCredits(string $engine): int
{
    return self::timelineAdvertisedCredits($engine);
}
|
||||
|
||||
public static function timelineEngineLimit(string $engine): int
|
||||
@@ -78,4 +127,55 @@ final class ToolModels
|
||||
default => self::TIMELINE_DEEP_CHAR_LIMIT,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Chunk length, in characters, assumed for the given engine when estimating
 * how many chunks a timeline input will be split into. Unlisted engines get
 * the largest chunk size.
 */
public static function timelineChunkSize(string $engine): int
{
    $sizes = [
        'nova_lite' => 10000,
        'azure_mini' => 16000,
    ];

    return $sizes[$engine] ?? 30000;
}
|
||||
|
||||
/**
 * Maximum total input size, in characters, that the given engine accepts
 * for a timeline run. Unlisted engines get the deep limit.
 */
public static function timelineEngineMaxChars(string $engine): int
{
    if ($engine === 'nova_lite') {
        return self::TIMELINE_QUICK_MAX_CHARS;
    }
    if ($engine === 'azure_mini') {
        return self::TIMELINE_STANDARD_MAX_CHARS;
    }

    return self::TIMELINE_DEEP_MAX_CHARS;
}
|
||||
|
||||
/**
 * Credit cost of a timeline run for the given engine and input size.
 *
 * Input within the engine's single-pass limit costs the base price; larger
 * input moves to a higher price band. Unlisted engines are priced like the
 * deep engine.
 */
public static function timelineCreditsForSize(string $engine, int $charCount): int
{
    if ($engine === 'nova_lite') {
        return $charCount <= self::TIMELINE_QUICK_CHAR_LIMIT ? 1 : 2;
    }

    if ($engine === 'azure_mini') {
        if ($charCount <= self::TIMELINE_STANDARD_CHAR_LIMIT) {
            return 1;
        }

        return $charCount <= 180000 ? 2 : 3;
    }

    if ($charCount <= self::TIMELINE_DEEP_CHAR_LIMIT) {
        return 2;
    }

    return $charCount <= 350000 ? 4 : 6;
}
|
||||
|
||||
/**
 * Base credit price shown for an engine before any size surcharge:
 * 2 for 'azure_full', 1 for every other engine.
 */
public static function timelineAdvertisedCredits(string $engine): int
{
    if ($engine === 'azure_full') {
        return 2;
    }

    return 1;
}
|
||||
|
||||
/**
 * User-facing name for an engine: 'Quick' for 'nova_lite', 'Deep' for
 * 'azure_full', and 'Standard' for anything else.
 */
public static function timelineEngineLabel(string $engine): string
{
    $labels = [
        'nova_lite' => 'Quick',
        'azure_full' => 'Deep',
    ];

    return $labels[$engine] ?? 'Standard';
}
|
||||
|
||||
/**
 * Ordering of engines from smallest to largest, used to detect upgrades:
 * 'nova_lite' is 1, 'azure_mini' is 2, 'azure_full' is 3, unknown engines are 0.
 */
private static function timelineEngineRank(string $engine): int
{
    $ranks = [
        'nova_lite' => 1,
        'azure_mini' => 2,
        'azure_full' => 3,
    ];

    return $ranks[$engine] ?? 0;
}
|
||||
}
|
||||
|
||||
@@ -890,9 +890,10 @@ function dbnToolsExcerpt(string $text, int $limit = 520): string
|
||||
|
||||
const DBN_TOOLS_EXTRACT_MAX_BYTES = 8 * 1024 * 1024;
|
||||
const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000;
|
||||
const DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT = 600000;
|
||||
const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
|
||||
|
||||
function dbnToolsExtractUploadedFile(array $file): array
|
||||
function dbnToolsExtractUploadedFile(array $file, int $textLimit = DBN_TOOLS_EXTRACT_TEXT_LIMIT): array
|
||||
{
|
||||
$errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE);
|
||||
if ($errCode !== UPLOAD_ERR_OK) {
|
||||
@@ -936,8 +937,9 @@ function dbnToolsExtractUploadedFile(array $file): array
|
||||
}
|
||||
|
||||
$truncated = false;
|
||||
if (mb_strlen($text, 'UTF-8') > DBN_TOOLS_EXTRACT_TEXT_LIMIT) {
|
||||
$text = mb_substr($text, 0, DBN_TOOLS_EXTRACT_TEXT_LIMIT, 'UTF-8');
|
||||
$textLimit = max(1000, min($textLimit, DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT));
|
||||
if (mb_strlen($text, 'UTF-8') > $textLimit) {
|
||||
$text = mb_substr($text, 0, $textLimit, 'UTF-8');
|
||||
$truncated = true;
|
||||
}
|
||||
|
||||
@@ -947,6 +949,7 @@ function dbnToolsExtractUploadedFile(array $file): array
|
||||
'filename' => $originalName,
|
||||
'chars' => mb_strlen($text, 'UTF-8'),
|
||||
'truncated' => $truncated,
|
||||
'limit' => $textLimit,
|
||||
];
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user