95685862ab
- Extract limit raised from 32K to 128K chars per file (long legal docs now fit) - Redact API body/text limits raised (400KB / 128K chars) to match - Upload zone accepts multiple files (up to 5); extracted text concatenated with doc separator and combined before redaction; shows per-file char counts - LLM redact pass now infers contextual person roles (FATHER, MOTHER, CHILD, ATTORNEY, JUDGE, etc.) instead of generic [PERSON] for all names; same individual gets consistent tag throughout the document - Tag validation widened to allow any [A-Za-z0-9_- ] pattern (not just the five hardcoded tags), supporting contextual and alias tags - Alias UI added to Redact mode: user maps real names to bracketed aliases (e.g. "David Jr" -> [Junior]); aliases injected into LLM system prompt as override instructions; max 20 aliases, 100 chars each - max_tokens raised from 2000 to 4000; timeout from 60s to 90s for larger docs Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
885 lines
42 KiB
PHP
885 lines
42 KiB
PHP
<?php
|
|
declare(strict_types=1);
|
|
|
|
require_once __DIR__ . '/bootstrap.php';
|
|
require_once __DIR__ . '/AzureOpenAiGateway.php';
|
|
|
|
final class DbnLegalToolsService
|
|
{
|
|
private const MAX_PASTE_CHARS = 32000;
|
|
|
|
private DbnAzureOpenAiGateway $azure;
|
|
|
|
/**
 * Wire the service to an Azure OpenAI gateway.
 *
 * @param DbnAzureOpenAiGateway|null $azure Gateway to use; a default-constructed
 *                                          gateway is created when none is supplied.
 */
public function __construct(?DbnAzureOpenAiGateway $azure = null)
{
    if ($azure === null) {
        $azure = new DbnAzureOpenAiGateway();
    }

    $this->azure = $azure;
}
|
|
|
|
/**
 * Keyword-search the client's private corpus plus the family-legal package.
 *
 * Retrieval goes through ClientRagPipeline in keyword mode. If the pipeline
 * throws, or returns nothing, a direct SQL keyword fallback is used. The SQL
 * fallback runs at most once per call.
 *
 * @param string      $query        Search text; trimmed, must be at least 3 characters.
 * @param string      $language     Echoed in the result and passed to dbnToolsDisclaimer().
 * @param int         $limit        Maximum hits; clamped to the range 1..10.
 * @param string      $temporalMode 'legal_conservative' enables reranking through
 *                                  LegalTemporalLayer when that file exists; any
 *                                  other value is treated as 'disabled'.
 * @param string|null $asOfDate     Passed through to the temporal reranker unchanged.
 *
 * @return array Result payload with 'hits', 'evidence_trail', 'trace',
 *               'trace_metadata' and the user-facing summary strings.
 */
public function search(
    string $query,
    string $language = 'en',
    int $limit = 6,
    string $temporalMode = 'disabled',
    ?string $asOfDate = null
): array {
    $query = trim($query);
    if (mb_strlen($query, 'UTF-8') < 3) {
        dbnToolsAbort('Search query must be at least 3 characters.', 422, 'query_too_short');
    }
    $limit = max(1, min(10, $limit));
    $temporalMode = in_array($temporalMode, ['legal_conservative', 'disabled'], true) ? $temporalMode : 'disabled';

    $trace = [
        $this->trace('Query interpretation', 'Searching Do Better Norge private corpus plus the subscribed family-legal package.', 'complete'),
        $this->trace('Search tools used', 'ClientRagPipeline::searchAll with keyword mode, private corpus enabled, shared package filter set to family-legal.', 'running'),
    ];

    $client = dbnToolsRequireClient();
    $clientId = (int)$client['id'];
    $package = $this->requireFamilyPackage($clientId);

    $chunks = [];
    $retrievalNote = 'ClientRagPipeline keyword retrieval';
    // Tracks whether the SQL fallback has already been executed, so an empty
    // fallback result is not re-queried a second time below.
    $fallbackAlreadyRan = false;

    try {
        dbnToolsBootCaveau();
        $gatewayUrl = 'http://10.0.1.10:4000';
        try {
            $config = getConfig();
            $configured = trim((string)($config['ai_gateway']['url'] ?? ''));
            if ($configured !== '') {
                $gatewayUrl = $configured;
            }
        } catch (Throwable) {
            // Retrieval still works in keyword mode without gateway config.
        }

        $rag = new ClientRagPipeline($clientId, $gatewayUrl, 30);
        $chunks = $rag->searchAll($query, $limit, null, [
            'search_private' => true,
            'search_shared' => true,
            'package_ids' => [(int)$package['id']],
            'chunk_limit' => $limit,
            'search_method' => 'keyword',
            'min_private' => 0,
            'include_beta_website' => true,
        ]);

        // Apply temporal reranking after retrieval (optional)
        if ($temporalMode === 'legal_conservative' && !empty($chunks)) {
            $temporalLayerPath = __DIR__ . '/../../ai-portal/platform/includes/LegalTemporalLayer.php';
            if (file_exists($temporalLayerPath)) {
                require_once $temporalLayerPath;
                $layer = new LegalTemporalLayer(['temporal_mode' => $temporalMode]);
                $chunks = $layer->rerank($chunks, $query, $asOfDate);
            }
        }
    } catch (Throwable) {
        // The exception is deliberately not logged here: the trace text below
        // promises the query is not stored, and the message could contain it.
        $retrievalNote = 'SQL keyword fallback after ClientRagPipeline error';
        $trace[] = $this->trace('Search fallback', 'Pipeline retrieval failed; using direct SQL keyword fallback without storing the query.', 'warning');
        $chunks = $this->fallbackKeywordSearch($clientId, $package, $query, $limit);
        $fallbackAlreadyRan = true;
    }

    if (!$chunks && !$fallbackAlreadyRan) {
        $fallback = $this->fallbackKeywordSearch($clientId, $package, $query, $limit);
        if ($fallback) {
            $chunks = $fallback;
            $retrievalNote = 'SQL keyword fallback';
        }
    }

    // Only the first $limit chunks are surfaced; slice once and reuse.
    $topChunks = array_slice($chunks, 0, $limit);

    // Document-level summaries are looked up for non-private chunks only.
    $sharedDocIds = [];
    foreach ($topChunks as $chunk) {
        if (($chunk['source_type'] ?? '') !== 'private' && isset($chunk['document_id'])) {
            $sharedDocIds[(int)$chunk['document_id']] = true;
        }
    }
    $docSummaries = $sharedDocIds ? $this->fetchDocSummaries(array_keys($sharedDocIds)) : [];

    $hits = array_map(
        fn(array $chunk): array => $this->sourceFromChunk(
            $chunk,
            ($chunk['source_type'] ?? '') !== 'private'
                ? ($docSummaries[(int)($chunk['document_id'] ?? 0)] ?? null)
                : null
        ),
        $topChunks
    );
    $confidence = $this->citationConfidence($hits);
    $hitCount = count($hits);

    // Replace the provisional "running" entry written at the top.
    $trace[1] = $this->trace('Search tools used', $retrievalNote . '; returned ' . $hitCount . ' source hit(s).', 'complete');
    $trace[] = $this->trace('Evidence found', $hitCount ? 'Retrieved source excerpts for review.' : 'No matching source excerpts were found.', $hitCount ? 'complete' : 'warning');
    $trace[] = $this->trace('Citation confidence', ucfirst($confidence) . ' confidence based on source count and retrieval scores.', $confidence === 'low' ? 'warning' : 'complete');

    return [
        'tool' => 'search',
        'language' => $language,
        'what_we_found' => $hitCount ? 'Found source excerpts from the legal corpus.' : 'No matching source excerpts were found.',
        'hits' => $hits,
        'evidence_trail' => $hits,
        'what_remains_uncertain' => $hitCount ? 'Search results still need human review for legal relevance and currentness.' : 'The corpus may not contain enough evidence for this query.',
        'next_practical_step' => $hitCount ? 'Open the strongest sources and confirm the cited sections before relying on them.' : 'Try a narrower query with statutory terms, party names, or dates.',
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($chunks),
            'source_count' => $hitCount,
            'deployment' => null,
            'citation_confidence' => $confidence,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
|
|
|
|
/**
 * Answer a question from retrieved evidence only.
 *
 * Runs search() for up to 7 hits. With no hits, a fixed "not enough source
 * support" answer is returned and the model is not called. Otherwise the hits
 * are formatted into a prompt and sent through the Azure gateway; a reply that
 * cannot be decoded as a JSON object is returned as a plain-text answer.
 *
 * @param string $question User question, also used verbatim as the search query.
 * @param string $language 'no' selects Norwegian wording; anything else English.
 *
 * @return array Result payload ('answer', 'sources', 'citation_notes', 'trace', ...).
 */
public function ask(string $question, string $language = 'en'): array
{
    $search = $this->search($question, $language, 7);
    $hits = $search['hits'];
    $trace = $search['trace'];
    $isNorwegian = $language === 'no';

    if (!$hits) {
        $trace[] = $this->trace('Synthesis', 'Skipped answer synthesis because no evidence was found.', 'warning');

        if ($isNorwegian) {
            $noEvidenceAnswer = 'Jeg fant ikke nok kildestøtte i familie-rettskorpuset til å svare sikkert.';
        } else {
            $noEvidenceAnswer = 'I did not find enough source support in the family-law corpus to answer safely.';
        }

        return [
            'tool' => 'ask',
            'language' => $language,
            'answer' => $noEvidenceAnswer,
            'what_we_found' => $search['what_we_found'],
            'evidence_trail' => [],
            'what_remains_uncertain' => $search['what_remains_uncertain'],
            'next_practical_step' => $search['next_practical_step'],
            'trace' => $trace,
            'trace_metadata' => [
                'chunk_count' => 0,
                'source_count' => 0,
                'deployment' => null,
                'citation_confidence' => 'low',
            ],
            'disclaimer' => dbnToolsDisclaimer($language),
        ];
    }

    $this->azure->requireChat();

    $context = $this->buildEvidenceContext($hits);
    $locale = $isNorwegian ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
        Question:
        {$question}

        Evidence excerpts:
        {$context}

        Return JSON only with these keys:
        {
        "answer": "short direct answer in {$locale}",
        "what_we_found": "plain-language summary of the supported finding",
        "evidence_trail": [{"title":"source title","why_it_matters":"one sentence","citation":"visible source title or section"}],
        "what_remains_uncertain": ["specific gaps or caveats"],
        "next_practical_step": "one concrete next action"
        }
        PROMPT;

    $messages = [
        ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language)],
        ['role' => 'user', 'content' => $prompt],
    ];
    $options = [
        'json' => true,
        'temperature' => 0.15,
        'max_tokens' => 1300,
    ];
    $raw = $this->azure->chatText($messages, $options);

    $json = $this->azure->decodeJsonObject($raw);
    if (!$json) {
        // Undecodable reply: surface the raw text and flag the format problem.
        $json = [
            'answer' => $raw,
            'what_we_found' => 'Azure returned a plain-text answer based on the retrieved excerpts.',
            'evidence_trail' => [],
            'what_remains_uncertain' => ['The response format could not be validated as structured JSON.'],
            'next_practical_step' => 'Review the source excerpts manually before relying on the answer.',
        ];
    }

    $uncertain = $json['what_remains_uncertain'] ?? [];
    $hitCount = count($hits);

    $trace[] = $this->trace('Synthesis', 'Azure OpenAI generated an answer using only the retrieved source excerpts.', 'complete');
    $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), 'complete');
    $trace[] = $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the evidence trail.'), 'complete');

    return [
        'tool' => 'ask',
        'language' => $language,
        'answer' => (string)($json['answer'] ?? ''),
        'what_we_found' => (string)($json['what_we_found'] ?? ''),
        'evidence_trail' => $hits,
        'citation_notes' => $this->normalizeEvidenceTrail($json['evidence_trail'] ?? [], $hits),
        'sources' => $hits,
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => (string)($json['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => $hitCount,
            'source_count' => $hitCount,
            'deployment' => $this->azure->chatDeployment(),
            'citation_confidence' => $search['trace_metadata']['citation_confidence'] ?? 'medium',
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
|
|
|
|
/**
 * Summarize user-pasted text into structured fields.
 *
 * The text is validated by requirePasteText(), embedded in a prompt and sent
 * through runJsonTool() with a 1300-token budget. No corpus search is done.
 *
 * @param string $text     Pasted text to summarize.
 * @param string $language 'no' asks for a Norwegian summary; anything else English.
 *
 * @return array Result payload ('what_we_found', 'key_facts', 'dates', 'parties', ...).
 */
public function summarize(string $text, string $language = 'en'): array
{
    $text = $this->requirePasteText($text);
    $this->azure->requireChat();

    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
        Summarize this pasted case-preparation text in {$locale}. Do not invent missing facts.

        Pasted text:
        {$text}

        Return JSON only:
        {
        "what_we_found": "plain-language summary",
        "key_facts": ["fact"],
        "dates": ["date or unknown"],
        "parties": ["party or role"],
        "legal_references_detected": ["reference"],
        "what_remains_uncertain": ["uncertainty"],
        "next_practical_step": "one concrete next action"
        }
        PROMPT;

    $json = $this->runJsonTool($prompt, $language, 1300);
    $uncertain = $json['what_remains_uncertain'] ?? [];

    $trace = [];
    $trace[] = $this->trace('Query interpretation', 'Summarize pasted text without saving the text or output.', 'complete');
    $trace[] = $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete');
    $trace[] = $this->trace('Evidence found', 'Evidence trail is limited to the pasted text supplied in this request.', 'complete');
    $trace[] = $this->trace('Citation confidence', 'Medium confidence for factual extraction; no external legal source verification was performed.', 'warning');
    $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), 'complete');
    $trace[] = $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Review the summary against the original text.'), 'complete');

    $result = [
        'tool' => 'summarize',
        'language' => $language,
        'what_we_found' => (string)($json['what_we_found'] ?? ''),
    ];
    // List-valued fields are passed through from the model, defaulting to [].
    foreach (['key_facts', 'dates', 'parties', 'legal_references_detected'] as $field) {
        $result[$field] = $json[$field] ?? [];
    }
    $result['evidence_trail'] = [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']];
    $result['what_remains_uncertain'] = $uncertain;
    $result['next_practical_step'] = (string)($json['next_practical_step'] ?? '');
    $result['trace'] = $trace;
    $result['trace_metadata'] = [
        'chunk_count' => 1,
        'source_count' => 1,
        'deployment' => $this->azure->chatDeployment(),
    ];
    $result['disclaimer'] = dbnToolsDisclaimer($language);

    return $result;
}
|
|
|
|
/**
 * Extract a chronological list of events from user-pasted text.
 *
 * The text is validated by requirePasteText(), embedded in a prompt and sent
 * through runJsonTool() with a 1600-token budget. A non-array 'events' value
 * in the reply is treated as an empty list.
 *
 * @param string $text     Pasted text to analyse.
 * @param string $language 'no' asks for Norwegian output; anything else English.
 *
 * @return array Result payload ('events', 'evidence_trail', 'trace', ...).
 */
public function timeline(string $text, string $language = 'en'): array
{
    $text = $this->requirePasteText($text);
    $this->azure->requireChat();

    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
        Build a chronological timeline from this pasted text in {$locale}. Keep uncertain dates explicit.

        Pasted text:
        {$text}

        Return JSON only:
        {
        "what_we_found": "short overview",
        "events": [{"date":"YYYY-MM-DD, month/year, or unknown","actor":"actor or unknown","event":"event","source_excerpt":"short excerpt","confidence":"high|medium|low"}],
        "evidence_trail": [{"title":"Pasted text","excerpt":"short relevant excerpt"}],
        "what_remains_uncertain": ["uncertainty"],
        "next_practical_step": "one concrete next action"
        }
        PROMPT;

    $json = $this->runJsonTool($prompt, $language, 1600);

    $events = [];
    if (is_array($json['events'] ?? null)) {
        $events = $json['events'];
    }
    $eventCount = count($events);
    $uncertain = $json['what_remains_uncertain'] ?? [];

    $trace = [];
    $trace[] = $this->trace('Query interpretation', 'Extract dated events from pasted text without saving the text or output.', 'complete');
    $trace[] = $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text.', 'complete');
    $trace[] = $this->trace('Evidence found', $eventCount . ' event(s) identified.', $eventCount ? 'complete' : 'warning');
    $trace[] = $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text.', 'complete');
    $trace[] = $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), 'complete');
    $trace[] = $this->trace('Next practical step', (string)($json['next_practical_step'] ?? 'Verify dates against original documents.'), 'complete');

    // Used only when the model reply carries no evidence_trail key.
    $defaultTrail = [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']];

    return [
        'tool' => 'timeline',
        'language' => $language,
        'what_we_found' => (string)($json['what_we_found'] ?? ''),
        'events' => $events,
        'evidence_trail' => $json['evidence_trail'] ?? $defaultTrail,
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => (string)($json['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => $eventCount,
            'source_count' => 1,
            'deployment' => $this->azure->chatDeployment(),
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
|
|
|
|
/**
 * Redact personal data from pasted text in two passes.
 *
 * Pass 1 applies the deterministic regex pack for the chosen region. Pass 2
 * asks llmRedactionPass() for additional entities and replaces each reported
 * original string with its bracketed tag. Pass 2 is skipped (with a warning
 * trace entry) when the LLM result is marked 'skipped' or carries an 'error'.
 *
 * Pass-2 entities are applied longest-original-first, so that a short name
 * ("David") cannot break up a longer one ("David Jr") before the longer one is
 * replaced. Counts from both passes are summed per category.
 *
 * @param string $text     Pasted text to redact.
 * @param string $mode     'strict' or 'standard'; anything else becomes 'standard'.
 * @param string $region   'nordic', 'european', 'echr' or 'global'; anything else
 *                         becomes 'nordic'.
 * @param string $language Forwarded to the LLM pass.
 * @param array  $aliases  Forwarded to the LLM pass unchanged.
 *
 * @return array Result payload ('redacted_text', 'entity_counts', 'trace', ...).
 */
public function redact(string $text, string $mode = 'standard', string $region = 'nordic', string $language = 'en', array $aliases = []): array
{
    $text = $this->requirePasteText($text);
    $mode = $mode === 'strict' ? 'strict' : 'standard';
    $region = in_array($region, ['nordic', 'european', 'echr', 'global'], true) ? $region : 'nordic';

    // Pass 1 — deterministic regex
    [$preRedacted, $pass1Counts] = $this->deterministicRedaction($text, $mode, $region);
    $pass1Total = array_sum($pass1Counts);
    $pass1Hits = array_filter($pass1Counts, fn($v): bool => $v > 0);
    $pass1Detail = $pass1Total
        ? implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass1Hits), $pass1Hits))
        : 'none detected';

    $trace = [
        $this->trace('Query interpretation', "Redact PII from pasted text. Region: {$region}. Mode: {$mode}.", 'complete'),
        $this->trace('Pass 1 — Deterministic patterns', "Applied {$region} pattern pack. {$pass1Detail}.", $pass1Total > 0 ? 'complete' : 'warning'),
    ];

    // Pass 2 — LLM semantic scan
    $finalRedacted = $preRedacted;
    $pass2Counts = [];
    $llmDeployment = null;

    $llmResult = $this->llmRedactionPass($preRedacted, $language, $aliases);

    if (!empty($llmResult['skipped'])) {
        $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped: ' . ($llmResult['reason'] ?? 'Azure not configured') . '.', 'warning');
    } elseif (!empty($llmResult['error'])) {
        $trace[] = $this->trace('Pass 2 — LLM semantic scan', 'Skipped due to error: ' . dbnToolsExcerpt($llmResult['error'], 100) . '.', 'warning');
    } else {
        $llmDeployment = $llmResult['deployment'] ?? null;

        // Keep only well-formed entity records; tolerate a non-array payload.
        $entities = $llmResult['entities'] ?? [];
        $entities = is_array($entities) ? array_values(array_filter($entities, 'is_array')) : [];

        // Longest original first. usort() is stable on PHP 8, so entities of
        // equal length keep the order the model returned them in.
        $originalLength = static fn(array $entity): int => is_string($entity['original'] ?? null)
            ? mb_strlen($entity['original'], 'UTF-8')
            : 0;
        usort($entities, static fn(array $a, array $b): int => $originalLength($b) <=> $originalLength($a));

        $applied = 0;
        foreach ($entities as $entity) {
            $original = (string)($entity['original'] ?? '');
            $type = (string)($entity['type'] ?? 'other');
            $tag = (string)($entity['tag'] ?? '[IDENTIFIER]');

            // Skip empty originals and anything that is already a bracketed tag.
            if ($original === '' || str_starts_with($original, '[')) {
                continue;
            }
            // Only accept tags of the form [letters, digits, _, -, space].
            if (!preg_match('/^\[[A-Za-z0-9_\- ]+\]$/', $tag)) {
                $tag = '[IDENTIFIER]';
            }
            if (str_contains($finalRedacted, $original)) {
                $finalRedacted = str_replace($original, $tag, $finalRedacted);
                // Counted once per entity, not once per occurrence.
                $pass2Counts[$type] = ($pass2Counts[$type] ?? 0) + 1;
                $applied++;
            }
        }

        $pass2Detail = $applied > 0
            ? "{$applied} additional: " . implode(', ', array_map(fn($k, $v) => "{$k}: {$v}", array_keys($pass2Counts), $pass2Counts))
            : 'no additional entities found';

        $trace[] = $this->trace('Pass 2 — LLM semantic scan', "Azure reviewed pre-redacted text for names, orgs, and places. {$pass2Detail}.", 'complete');
    }

    // Sum per category: array_merge() would let a pass-2 count overwrite the
    // pass-1 count whenever both passes report the same category key.
    $allCounts = $pass1Counts;
    foreach ($pass2Counts as $category => $count) {
        $allCounts[$category] = ($allCounts[$category] ?? 0) + $count;
    }
    $categories = array_keys(array_filter($allCounts, fn($v): bool => $v > 0));

    $trace[] = $this->trace('Uncertainty / missing evidence', 'Human review recommended for contextual identification and unusual formatting.', 'warning');
    $trace[] = $this->trace('Next practical step', 'Review the output and rerun in strict mode if the text will be shared broadly.', 'complete');

    return [
        'tool' => 'redact',
        'mode' => $mode,
        'region' => $region,
        'what_we_found' => "Applied {$region} pattern pack" . ($llmDeployment ? ' and LLM semantic scan' : '') . '.',
        'redacted_text' => $finalRedacted,
        'detected_entity_categories' => $categories,
        'entity_counts' => $allCounts,
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; not stored.']],
        'what_remains_uncertain' => ['Human review is still recommended for contextual identification.'],
        'next_practical_step' => 'Review the output and rerun in strict mode if the text will be shared broadly.',
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => 1,
            'source_count' => 1,
            'deployment' => $llmDeployment,
        ],
        'disclaimer' => 'Privacy support tool. Review before disclosure.',
    ];
}
|
|
|
|
/**
 * Load the 'family-legal' package and verify the client may use it.
 *
 * Calls dbnToolsAbort() with HTTP 503 when the package is missing or inactive,
 * or when the client has no active subscription to it.
 *
 * @param int $clientId Client whose subscription is checked.
 *
 * @return array The package row as returned by dbnToolsFetchPackage().
 */
private function requireFamilyPackage(int $clientId): array
{
    $package = dbnToolsFetchPackage('family-legal');

    $isUsable = $package && !empty($package['is_active']);
    if (!$isUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $packageId = (int)$package['id'];
    $isSubscribed = dbnToolsHasActiveSubscription($clientId, $packageId);
    if (!$isSubscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $package;
}
|
|
|
|
/**
 * Send a prompt through the Azure gateway and require a JSON object back.
 *
 * The request uses the shared legal system prompt, JSON mode and a
 * temperature of 0.1. If the reply cannot be decoded into a non-empty JSON
 * object, dbnToolsAbort() is called with HTTP 502.
 *
 * @param string $prompt    User-role message content.
 * @param string $language  Forwarded to legalJsonSystemPrompt().
 * @param int    $maxTokens Completion token budget.
 *
 * @return array Decoded JSON object.
 */
private function runJsonTool(string $prompt, string $language, int $maxTokens): array
{
    $messages = [
        ['role' => 'system', 'content' => $this->legalJsonSystemPrompt($language)],
        ['role' => 'user', 'content' => $prompt],
    ];
    $options = [
        'json' => true,
        'temperature' => 0.1,
        'max_tokens' => $maxTokens,
    ];

    $decoded = $this->azure->decodeJsonObject($this->azure->chatText($messages, $options));
    if ($decoded) {
        return $decoded;
    }

    dbnToolsAbort('Azure OpenAI did not return valid structured JSON.', 502, 'azure_invalid_json');

    return $decoded;
}
|
|
|
|
/**
 * Build the system prompt shared by all JSON-returning legal tools.
 *
 * @param string $language 'no' instructs the model to respond in Norwegian;
 *                         any other value selects English.
 *
 * @return string Multi-line guardrail prompt.
 */
private function legalJsonSystemPrompt(string $language): string
{
    $locale = 'English';
    if ($language === 'no') {
        $locale = 'Norwegian';
    }

    $systemPrompt = <<<PROMPT
        You are Do Better Norge Legal Tools in a source-grounded legal preparation workflow.
        Use the DBN legal guardrails:
        - Answer only from provided source excerpts or pasted text.
        - Treat your role as legal information and issue-spotting, not final legal advice.
        - Never invent statutes, paragraph numbers, case names, citations, parties, dates, or sources.
        - If evidence is insufficient, say so plainly.
        - Respond in {$locale}.
        - Return valid JSON only. No markdown fences.
        PROMPT;

    return $systemPrompt;
}
|
|
|
|
/**
 * Render search hits as a numbered plain-text block for the LLM prompt.
 *
 * Each hit contributes a title line, an optional section line, a
 * corpus/package line and an excerpt line; all lines are joined with "\n".
 *
 * @param array $hits Hit arrays (reads 'title', 'section', 'package_or_corpus', 'excerpt').
 *
 * @return string Formatted evidence text; empty string when there are no hits.
 */
private function buildEvidenceContext(array $hits): string
{
    $blocks = [];

    foreach ($hits as $position => $hit) {
        $number = $position + 1;

        $entry = ['[' . $number . '] Title: ' . ($hit['title'] ?? 'Untitled')];
        if (!empty($hit['section'])) {
            $entry[] = "Section: " . $hit['section'];
        }
        $entry[] = "Corpus/package: " . ($hit['package_or_corpus'] ?? 'unknown');
        $entry[] = "Excerpt: " . ($hit['excerpt'] ?? '');

        $blocks[] = implode("\n", $entry);
    }

    return implode("\n", $blocks);
}
|
|
|
|
/**
 * Produce citation notes from the model's evidence trail.
 *
 * A non-empty array trail is returned with non-array entries removed and the
 * list reindexed. Otherwise notes are synthesised from the first four hits,
 * using each hit's title as the citation and a 180-character excerpt as the
 * explanation.
 *
 * @param mixed $trail Value of the model's 'evidence_trail' field.
 * @param array $hits  Search hits (each must carry 'title' and 'excerpt').
 *
 * @return array List of citation-note arrays.
 */
private function normalizeEvidenceTrail(mixed $trail, array $hits): array
{
    if (is_array($trail) && $trail) {
        return array_values(array_filter($trail, 'is_array'));
    }

    $noteFromHit = function (array $hit): array {
        return [
            'title' => $hit['title'],
            'citation' => $hit['title'],
            'why_it_matters' => dbnToolsExcerpt($hit['excerpt'], 180),
        ];
    };

    return array_map($noteFromHit, array_slice($hits, 0, 4));
}
|
|
|
|
/**
 * Convert a retrieved chunk row into the public "source hit" shape.
 *
 * The 'excerpt' field carries the document summary when one is supplied and
 * the 620-character chunk excerpt otherwise; 'chunk_text' always carries the
 * chunk excerpt. Optional metadata and temporal annotation fields are copied
 * through, defaulting to null when absent from the chunk.
 *
 * @param array       $chunk      Chunk row from the pipeline or the SQL fallback.
 * @param string|null $docSummary Document-level summary, if available.
 *
 * @return array Source hit.
 */
private function sourceFromChunk(array $chunk, ?string $docSummary = null): array
{
    $rawExcerpt = dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620);

    $score = null;
    if (isset($chunk['similarity'])) {
        $score = round((float)$chunk['similarity'], 4);
    }

    $source = [
        'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
        'excerpt' => $docSummary ?? $rawExcerpt,
        'chunk_text' => $rawExcerpt,
        'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Norge'),
        'score' => $score,
        'document_id' => isset($chunk['document_id']) ? (int)$chunk['document_id'] : null,
        'chunk_id' => isset($chunk['id']) ? (int)$chunk['id'] : null,
        'section' => $chunk['section_title'] ?? null,
    ];

    // Fields copied under the same key. The temporal_* / valid_* /
    // currentness fields are temporal annotations (present when
    // temporal_mode = 'legal_conservative').
    $passThrough = [
        'authority_type',
        'jurisdiction',
        'temporal_state',
        'temporal_kind',
        'temporal_reason',
        'currentness_warning',
        'valid_from',
        'valid_until',
        'is_current_version',
    ];
    foreach ($passThrough as $field) {
        $source[$field] = $chunk[$field] ?? null;
    }

    return $source;
}
|
|
|
|
/**
 * Look up non-empty document summaries for the given document ids.
 *
 * Best-effort: any error while querying yields an empty result rather than
 * propagating.
 *
 * @param array $docIds Document ids to look up.
 *
 * @return array Map of document_id => summary.
 */
private function fetchDocSummaries(array $docIds): array
{
    if ($docIds === []) {
        return [];
    }

    try {
        $ids = array_values($docIds);
        $placeholders = implode(',', array_fill(0, count($ids), '?'));

        $stmt = dbnToolsRagDb()->prepare(
            "SELECT document_id, summary FROM doc_summaries
            WHERE document_id IN ({$placeholders}) AND summary != ''"
        );
        $stmt->execute($ids);
        $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

        return array_column($rows, 'summary', 'document_id');
    } catch (Throwable) {
        return [];
    }
}
|
|
|
|
/**
 * Grade a hit list as 'low', 'medium' or 'high' confidence.
 *
 * No hits is 'low'. At least three hits with a best numeric score of 0.35 or
 * more is 'high'. Everything else is 'medium'.
 *
 * @param array $hits Source hits (reads the optional 'score' of each).
 *
 * @return string 'low', 'medium' or 'high'.
 */
private function citationConfidence(array $hits): string
{
    if (!$hits) {
        return 'low';
    }

    // Collect only usable numeric scores; hits without one are ignored.
    $scores = [];
    foreach ($hits as $hit) {
        $candidate = $hit['score'] ?? null;
        if (is_numeric($candidate)) {
            $scores[] = $candidate;
        }
    }
    $best = $scores ? max($scores) : 0;

    $hasEnoughHits = count($hits) >= 3;

    return ($hasEnoughHits && $best >= 0.35) ? 'high' : 'medium';
}
|
|
|
|
/**
 * Direct SQL keyword search used when the RAG pipeline is unavailable or empty.
 *
 * Private-corpus rows come first, followed by shared-package rows. Each source
 * is queried independently; a failure in one is logged and does not prevent
 * the other from contributing.
 *
 * @param int    $clientId Client whose private corpus is searched.
 * @param array  $package  Package row used to scope the shared search.
 * @param string $query    Raw search text.
 * @param int    $limit    Maximum number of rows returned.
 *
 * @return array Chunk rows, at most $limit.
 */
private function fallbackKeywordSearch(int $clientId, array $package, string $query, int $limit): array
{
    $privateRows = [];
    try {
        $privateRows = array_merge([], $this->fallbackPrivateSearch($clientId, $query, $limit));
    } catch (Throwable $e) {
        error_log('DBN tools private fallback failed: ' . $e->getMessage());
    }

    $combined = $privateRows;
    try {
        // Always request at least one shared row, even if private rows already fill the limit.
        $remaining = max(1, $limit - count($privateRows));
        $combined = array_merge($privateRows, $this->fallbackSharedSearch($package, $query, $remaining));
    } catch (Throwable $e) {
        error_log('DBN tools shared fallback failed: ' . $e->getMessage());
    }

    return array_slice($combined, 0, $limit);
}
|
|
|
|
/**
 * LIKE-based keyword search over the client's private chunks.
 *
 * A row matches when any search term occurs in the chunk content or in the
 * document title. Every returned row is tagged with a fixed similarity of
 * 0.25 and marked as coming from the private corpus.
 *
 * Each term is bound through two distinct named placeholders (one per
 * column). PDO only allows the same named marker to appear more than once in
 * a statement when prepares are emulated, so reusing one marker for both
 * columns would fail on connections with native prepares.
 *
 * @param int    $clientId Owner of the private corpus.
 * @param string $query    Raw search text; tokenised by searchTerms().
 * @param int    $limit    Maximum number of rows.
 *
 * @return array Chunk rows; empty when the query yields no usable terms.
 */
private function fallbackPrivateSearch(int $clientId, string $query, int $limit): array
{
    $db = dbnToolsDb();
    $terms = $this->searchTerms($query);
    if (!$terms) {
        return [];
    }

    $clauses = [];
    $params = [':client_id' => $clientId];
    foreach ($terms as $i => $term) {
        $contentKey = ':content' . $i;
        $titleKey = ':title' . $i;
        $clauses[] = "(cc.content LIKE {$contentKey} OR cd.title LIKE {$titleKey})";
        // Terms contain only letters and digits (see searchTerms()), so no
        // LIKE wildcard escaping is needed here.
        $params[$contentKey] = '%' . $term . '%';
        $params[$titleKey] = '%' . $term . '%';
    }

    $sql = 'SELECT cc.id, cc.document_id, cc.content, cd.title AS document_title, cd.category
        FROM client_chunks cc
        JOIN client_documents cd ON cc.document_id = cd.id
        WHERE cc.client_id = :client_id AND cd.status = "ready" AND (' . implode(' OR ', $clauses) . ')
        LIMIT ' . (int)$limit;

    $stmt = $db->prepare($sql);
    $stmt->execute($params);
    $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

    // Decorate by value; avoids leaving a dangling by-reference loop variable.
    return array_map(
        static function (array $row): array {
            $row['similarity'] = 0.25;
            $row['source_name'] = 'Do Better Norge private corpus';
            $row['source_type'] = 'private';
            return $row;
        },
        $rows
    );
}
|
|
|
|
/**
 * LIKE-based keyword search over the shared corpus, scoped to one package.
 *
 * Rows are restricted to ready documents and, when the package defines them,
 * to its corpus id, category filter and language filter (both filters are
 * JSON-encoded lists on the package row). A row matches when any search term
 * occurs in the chunk content or document title. Every returned row is tagged
 * with a fixed similarity of 0.2 and source type 'package'.
 *
 * @param array  $package Package row (reads 'corpus_id', 'category_filter',
 *                        'language_filter', 'name').
 * @param string $query   Raw search text; tokenised by searchTerms().
 * @param int    $limit   Maximum number of rows.
 *
 * @return array Chunk rows; empty when the query yields no usable terms.
 */
private function fallbackSharedSearch(array $package, string $query, int $limit): array
{
    $ragDb = dbnToolsRagDb();
    $terms = $this->searchTerms($query);
    if (!$terms) {
        return [];
    }

    $where = ['d.status = "ready"'];
    $params = [];

    if (!empty($package['corpus_id'])) {
        $where[] = 'd.corpus_id = ?';
        $params[] = (int)$package['corpus_id'];
    }

    // Category first, then language: positional parameter order must follow
    // the order in which the IN (...) clauses are appended.
    foreach (['category' => 'category_filter', 'language' => 'language_filter'] as $column => $filterKey) {
        $allowed = json_decode((string)($package[$filterKey] ?? '[]'), true) ?: [];
        if ($allowed) {
            $where[] = 'd.' . $column . ' IN (' . implode(',', array_fill(0, count($allowed), '?')) . ')';
            $params = array_merge($params, $allowed);
        }
    }

    $termClauses = [];
    foreach ($terms as $term) {
        $like = '%' . $term . '%';
        $termClauses[] = '(c.content LIKE ? OR d.title LIKE ?)';
        // One binding per column.
        array_push($params, $like, $like);
    }
    $where[] = '(' . implode(' OR ', $termClauses) . ')';

    $sql = 'SELECT c.id, c.document_id, c.content, c.section_title, d.title AS document_title,
        d.category, d.language
        FROM chunks c
        JOIN documents d ON c.document_id = d.id
        WHERE ' . implode(' AND ', $where) . '
        LIMIT ' . (int)$limit;

    $stmt = $ragDb->prepare($sql);
    $stmt->execute($params);

    $sourceName = (string)($package['name'] ?? 'family-legal');
    $tagged = [];
    foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $index => $row) {
        $row['similarity'] = 0.2;
        $row['source_name'] = $sourceName;
        $row['source_type'] = 'package';
        $tagged[$index] = $row;
    }

    return $tagged;
}
|
|
|
|
/**
 * Tokenise a query into at most six distinct lowercase search terms.
 *
 * The query is lowercased and split on anything that is not a Unicode letter
 * or digit. Tokens shorter than three characters and a small list of English
 * and Norwegian stop words are dropped; duplicates keep their first position.
 *
 * @param string $query Raw search text.
 *
 * @return array List of terms in order of first appearance.
 */
private function searchTerms(string $query): array
{
    $stopWords = ['the', 'and', 'for', 'with', 'that', 'this', 'hva', 'har', 'kan', 'jeg', 'som', 'det', 'med', 'til', 'og'];

    $tokens = preg_split('/[^\p{L}\p{N}]+/u', mb_strtolower($query, 'UTF-8')) ?: [];

    $isUseful = static function ($token) use ($stopWords): bool {
        return mb_strlen($token, 'UTF-8') >= 3 && !in_array($token, $stopWords, true);
    };

    $distinct = array_values(array_unique(array_filter($tokens, $isUseful)));

    return array_slice($distinct, 0, 6);
}
|
|
|
|
/**
 * Trim pasted text and enforce its length bounds.
 *
 * Calls dbnToolsAbort() with HTTP 422 when the trimmed text has fewer than 20
 * characters or more than self::MAX_PASTE_CHARS characters.
 *
 * NOTE(review): MAX_PASTE_CHARS is 32000 in this class; confirm it still
 * matches the text limits enforced by the callers of redact()/summarize()/
 * timeline().
 *
 * @param string $text Raw pasted text.
 *
 * @return string The trimmed text.
 */
private function requirePasteText(string $text): string
{
    $text = trim($text);
    $length = mb_strlen($text, 'UTF-8');

    if ($length < 20) {
        dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short');
    }

    if ($length > self::MAX_PASTE_CHARS) {
        dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long');
    }

    return $text;
}
|
|
|
|
/**
 * Pass-1 redaction: replace pattern-detectable identifiers with tokens.
 *
 * Steps, in order:
 *  1. every rule of the region's pattern pack (see getPatternPack());
 *  2. "Label: value" names after role labels such as Barn/Child/Mother, where
 *     the label is kept and the value becomes [PERSON];
 *  3. child-identifier phrases such as "barnet heter X" / "child named X",
 *     where the whole phrase becomes [CHILD_IDENTIFIER];
 *  4. in strict mode only, any two adjacent capitalised words become [PERSON].
 *
 * In step 3 the keywords are matched case-insensitively but the name itself
 * must start with an uppercase letter. Applying a global /i flag would let
 * the [A-ZÆØÅ] class match lowercase letters too, turning ordinary phrases
 * such as "child lives" into [CHILD_IDENTIFIER].
 *
 * @param string $text   Text to redact.
 * @param string $mode   'strict' enables step 4.
 * @param string $region Pattern pack selector passed to getPatternPack().
 *
 * @return array Two-element list: [redacted text, map of category => match count].
 */
private function deterministicRedaction(string $text, string $mode, string $region = 'nordic'): array
{
    $counts = [];

    // Replace every match of $pattern with $token and tally it under $type.
    // On a PCRE error (null result) the text is left unchanged.
    $replace = function (string $pattern, string $type, string $token) use (&$text, &$counts): void {
        $text = preg_replace_callback($pattern, function () use (&$counts, $type, $token): string {
            $counts[$type] = ($counts[$type] ?? 0) + 1;
            return $token;
        }, $text) ?? $text;
    };

    foreach ($this->getPatternPack($region) as $entry) {
        $replace($entry['pattern'], $entry['type'], $entry['replacement']);
    }

    // Structured role-label names (Barn: X, Mother: X, etc.) — universal.
    // The label itself is preserved; only the value after the colon is replaced.
    $text = preg_replace_callback(
        '/\b(Barn|Child|Navn|Name|Mor|Far|Mother|Father|Sønn|Datter)\s*:\s*([^\r\n,.;]+)/iu',
        function (array $m) use (&$counts): string {
            $counts['person_or_child_name'] = ($counts['person_or_child_name'] ?? 0) + 1;
            return $m[1] . ': [PERSON]';
        },
        $text
    ) ?? $text;

    // Child-identifier phrases ("barnet heter X", "child named X") — universal.
    // Case-insensitivity is scoped to the keyword groups via (?i:...); the
    // name must begin with a capital letter.
    $replace(
        '/\b(?i:barnet|child|sønn|son|datter|daughter)\s+(?i:heter|named|called)?\s*([A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,})\b/u',
        'person_or_child_name',
        '[CHILD_IDENTIFIER]'
    );

    if ($mode === 'strict') {
        $replace('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\s+[A-ZÆØÅ][\p{L}æøåÆØÅ\-]{2,}\b/u', 'person_or_child_name', '[PERSON]');
    }

    return [$text, $counts];
}
|
|
|
|
/**
 * Return the ordered regex rules for a redaction region.
 *
 * Packs are cumulative: 'european' is the nordic rules followed by the
 * European additions, 'echr' appends judgment-specific rules to that, and any
 * other region value yields the full 'global' pack. Rules are applied in list
 * order by the caller, so order is significant.
 *
 * NOTE(review): the 8-digit phone rule allows '.', '-' and whitespace between
 * digits, so it also matches dates written as dd.mm.yyyy or yyyy-mm-dd —
 * confirm whether labelling those as [PHONE] is intended.
 *
 * @param string $region 'nordic', 'european', 'echr', or anything else for global.
 *
 * @return array List of rules, each ['pattern' => ..., 'replacement' => ..., 'type' => ...].
 */
private function getPatternPack(string $region): array
{
    $rule = static fn(string $pattern, string $replacement, string $type): array => [
        'pattern' => $pattern,
        'replacement' => $replacement,
        'type' => $type,
    ];

    $nordic = [
        // E-mail address
        $rule('/\b[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}\b/i', '[EMAIL]', 'email'),
        // 11-digit number, optionally split 6+5 by whitespace or a hyphen
        $rule('/(?<!\d)(?:\d{6}[\s\-]?\d{5}|\d{11})(?!\d)/u', '[FNR]', 'fødselsnummer'),
        // 8 digits with optional separators, optional +47 prefix
        $rule('/(?<!\d)(?:\+47[\s.\-]?)?(?:\d[\s.\-]?){8}(?!\d)/u', '[PHONE]', 'phone'),
        // Street name + Norwegian street-type keyword + house number
        $rule('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass)\s+\d+[A-Za-z]?\b/iu', '[ADDRESS]', 'address'),
    ];

    $european = [
        ...$nordic,
        // Swedish personnummer short (YYMMDD-XXXX / YYMMDD+XXXX)
        $rule('/(?<!\d)\d{6}[-+]\d{4}(?!\d)/u', '[SE_PERSONNUMMER]', 'se_personnummer'),
        // Swedish personnummer full (YYYYMMDD-XXXX)
        $rule('/(?<!\d)\d{8}[-+]\d{4}(?!\d)/u', '[SE_PERSONNUMMER]', 'se_personnummer'),
        // UK National Insurance number shape: two letters, six digits, suffix letter A-D
        $rule('/\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b/i', '[UK_NI]', 'uk_ni'),
        // French INSEE (15 digits, not overlapping with 11-digit FNR due to lookarounds)
        $rule('/(?<!\d)\d{15}(?!\d)/u', '[FR_INSEE]', 'fr_insee'),
        // IBAN: 2 letters + 2 digits + 4 alphanumerics + 7 digits + up to 16 more alphanumerics
        $rule('/\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}[A-Z0-9]{0,16}\b/i', '[IBAN]', 'iban'),
        // European phone (international prefix for major EU/EEA country codes)
        $rule('/(?<!\d)\+(?:44|46|45|358|33|49|34|39|31|32|41|43|30|351|353|48|36|420|421|372|371|370|386|385|356|357|40|359|352)[\s.\-]?(?:\d[\s.\-]?){7,12}(?!\d)/u', '[PHONE]', 'phone'),
        // Street address expanded to European street-type keywords
        $rule('/\b[A-ZÆØÅ][\p{L}æøåÆØÅ\.\- ]{2,40}\s+(?:gate|gata|vei|veien|plass|street|road|avenue|ave|rue|straße|strasse|straat|gade|calle|via|gatan|vägen)\s+\d+[A-Za-z]?\b/iu', '[ADDRESS]', 'address'),
    ];

    $echr = [
        ...$european,
        // ECHR application number (requires "Application no." or "App. No." prefix to avoid matching dates/pages)
        $rule('/\b(?:Application|App\.?)\s+(?:no\.?|nr\.?|#)\s*\d{3,6}\s*\/\s*\d{2,4}\b/i', '[ECHR_APP_NO]', 'echr_app_no'),
        // Date of birth stated in judgment context ("born [on] 12 March 1980")
        $rule('/\bborn\s+(?:on\s+)?\d{1,2}[.\s]+(?:January|February|March|April|May|June|July|August|September|October|November|December|januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)[,\s]+\d{4}\b/iu', '[DOB]', 'date_of_birth'),
        // "f." followed by a four-digit year
        $rule('/\bf\.\s*\d{4}\b/iu', '[DOB]', 'date_of_birth'),
        // National ID label patterns in multiple languages
        $rule('/\b(?:personal\s+number|numéro\s+national|Personalausweis|personnummer|fødselsnummer|henkilötunnus|CPR-nummer)\s*:\s*[\w\s\-]+/iu', '[NAT_ID]', 'nat_id'),
    ];

    $global = [
        ...$echr,
        // US Social Security Number
        $rule('/(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/u', '[SSN]', 'ssn'),
        // Document number in context (passport no., ID No., document no.)
        $rule('/\b(?:passport\s+(?:no\.?|number)|ID\s+(?:no\.?|number)|document\s+(?:no\.?|number))\s*[:\#]?\s*([A-Z0-9]{6,12})\b/iu', '[DOC_NO]', 'doc_no'),
    ];

    return match ($region) {
        'nordic' => $nordic,
        'european' => $european,
        'echr' => $echr,
        default => $global,
    };
}
|
|
|
|
/**
 * Second redaction pass: ask the Azure chat deployment to list identifiers that
 * the mechanical pattern pass left behind (names, organisations, places, ...).
 *
 * The method only collects the model's proposed redactions; it does not apply
 * them to the text and does not validate the returned entries.
 *
 * @param string $preRedacted Text in which mechanical identifiers are already replaced by [TAGS].
 * @param string $language    When 'no', a hint about Norwegian / mixed-language content is added to the prompt.
 * @param array  $aliases     List of ['original' => real name, 'alias' => tag label] pairs that the
 *                            model is told to use instead of inferring a role. Entries that are not
 *                            arrays, or whose fields are empty after cleaning, are ignored.
 * @return array One of:
 *               - ['skipped' => true, 'reason' => string] when the chat configuration is incomplete;
 *               - ['skipped' => false, 'entities' => array, 'deployment' => value of chatDeployment()] on success;
 *               - ['skipped' => false, 'entities' => [], 'error' => string] on a malformed reply or any thrown error.
 */
private function llmRedactionPass(string $preRedacted, string $language = 'en', array $aliases = []): array
{
    $missing = $this->azure->missingChatConfig();
    if ($missing) {
        return ['skipped' => true, 'reason' => 'Azure chat not configured (' . implode(', ', $missing) . ')'];
    }

    $languageNote = $language === 'no' ? "\n • The document may contain Norwegian or mixed-language content." : '';

    // Trim, cap at 100 characters and neutralise characters that could break out of the
    // quoted alias line in the prompt. The cap is multibyte-aware: a byte-wise substr()
    // could cut a UTF-8 character (e.g. "ø") in half and leave malformed text in the prompt.
    $cleanAliasField = static function (mixed $value): string {
        $capped = mb_substr(trim((string) $value), 0, 100, 'UTF-8');

        return str_replace(["\n", "\r", '`', '"', '{', '}'], ' ', $capped);
    };

    $aliasLines = [];
    foreach ($aliases as $entry) {
        if (!is_array($entry)) {
            continue;
        }
        $orig = $cleanAliasField($entry['original'] ?? '');
        $lbl = $cleanAliasField($entry['alias'] ?? '');
        if ($orig !== '' && $lbl !== '') {
            $aliasLines[] = " \"{$orig}\" → [{$lbl}]";
        }
    }

    $aliasBlock = '';
    if ($aliasLines) {
        $aliasBlock = "\n\nALIAS OVERRIDES — use these exact replacement tags for these specific names instead of inferring a role:\n" . implode("\n", $aliasLines);
    }

    $system = <<<PROMPT
    You are a privacy redaction assistant for legal documents (ECHR judgements, Norwegian family law cases, EU child welfare documents). The text below has already had mechanical identifiers replaced with placeholder tags in [BRACKETS].

    Your task: find any remaining identifiable information — person names, organisation names, specific places at city level or below, dates of birth, and identifying descriptions.

    STEP 1 — For person names: identify each individual and infer their role or relationship from context.
    Assign each person a consistent contextual tag used for every occurrence of their name:
    • Family roles: FATHER, MOTHER, CHILD, CHILD_1, CHILD_2, GRANDPARENT, SIBLING
    • Professional roles: ATTORNEY, JUDGE, CASEWORKER, EXPERT_WITNESS
    • Generic fallback: PERSON_1, PERSON_2 (use only when role cannot be determined)
    The same individual MUST receive the same tag every time they appear.{$aliasBlock}

    Return ONLY a valid JSON object:
    {"redactions":[{"original":"exact text as it appears","type":"person_name","tag":"[FATHER]"}]}

    Allowed types and their tag format:
    person_name → contextual role tag e.g. [FATHER], [CHILD_1], [ATTORNEY] (or alias tag if provided above)
    org → [ORG]
    place → [PLACE]
    date_of_birth → [DOB]
    other → [IDENTIFIER]

    Rules:
    • Include only text that appears verbatim in the input. Do not invent or paraphrase.
    • The same person MUST get the same tag every time they appear.
    • If nothing needs redacting, return {"redactions":[]}.
    • Do not redact text already inside [BRACKETS].
    • Legal citations, statute names, article numbers, and institution names (e.g. "the European Court of Human Rights", "Barnevernloven § 4-12") are NOT PII.
    • Short common words, conjunctions, and prepositions are NOT PII.{$languageNote}
    PROMPT;

    try {
        $response = $this->azure->chat([
            ['role' => 'system', 'content' => $system],
            ['role' => 'user', 'content' => $preRedacted],
        ], [
            'temperature' => 0.1,
            'max_tokens' => 4000,
            'json' => true,
            'timeout' => 90, // presumably seconds — confirm against the gateway
        ]);

        $content = (string)($response['choices'][0]['message']['content'] ?? '');
        $json = $this->azure->decodeJsonObject($content);

        if (!is_array($json) || !array_key_exists('redactions', $json)) {
            return ['skipped' => false, 'entities' => [], 'error' => 'LLM returned unexpected JSON structure'];
        }

        return [
            'skipped' => false,
            // A non-list "redactions" value is treated as "nothing to redact".
            'entities' => is_array($json['redactions']) ? $json['redactions'] : [],
            'deployment' => $this->azure->chatDeployment(),
        ];
    } catch (Throwable $e) {
        // Best effort: a failed LLM pass is logged and reported, never fatal to the caller.
        error_log('DBN tools LLM redaction pass failed: ' . $e->getMessage());

        return ['skipped' => false, 'entities' => [], 'error' => $e->getMessage()];
    }
}
|
|
|
|
/**
 * Turn a tool-supplied "uncertainty" value into a short display string.
 *
 * Arrays are flattened (including nested arrays) and their scalar, null and
 * Stringable leaves are joined with single spaces; other leaves are dropped.
 * Passing nested arrays through strval() would instead raise "Array to string
 * conversion" warnings and put the literal word "Array" into the summary.
 * A non-array value that cannot be cast to string is treated as empty.
 *
 * @param mixed $uncertainty String, list of strings, or any other decoded value.
 * @return string The text passed through dbnToolsExcerpt($text, 220), or a fixed
 *                fallback sentence when nothing usable was supplied.
 */
private function uncertaintySummary(mixed $uncertainty): string
{
    $parts = [];
    $collect = static function (mixed $leaf) use (&$parts): void {
        if ($leaf === null || is_scalar($leaf) || $leaf instanceof Stringable) {
            $parts[] = (string) $leaf;
        }
    };

    if (is_array($uncertainty)) {
        array_walk_recursive($uncertainty, $collect);
    } else {
        $collect($uncertainty);
    }

    $summary = trim(implode(' ', $parts));
    if ($summary === '') {
        return 'No additional uncertainty was supplied by the tool.';
    }

    return dbnToolsExcerpt($summary, 220);
}
|
|
|
|
/**
 * Package one trace step as a map with the keys 'label', 'detail' and 'status'.
 *
 * @param string $label  Short name of the step.
 * @param string $detail Description of what the step did.
 * @param string $status Step state; defaults to 'complete'.
 * @return array{label: string, detail: string, status: string}
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    // compact() keeps the listed order, so the keys come out as label, detail, status.
    return compact('label', 'detail', 'status');
}
|
|
}
|