308 lines
11 KiB
Python
308 lines
11 KiB
Python
from __future__ import annotations
|
|
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
from canonical_layer.store import CanonicalStore
|
|
from config.settings import OneCSettings, load_settings
|
|
|
|
|
|
# Maps each known anomaly ``signal_type`` to the risk domain it is aggregated
# under. Declared domain-first so the membership of every domain is visible at
# a glance; the comprehension flattens it into a signal -> domain lookup.
# Signal types that are not listed here are treated as "miscellaneous" by the
# code that consumes this mapping.
SIGNAL_TO_DOMAIN: dict[str, str] = {
    signal_type: domain
    for domain, signal_types in (
        (
            "operational_freshness",
            ("stale_refresh", "missing_refresh_baseline", "no_canonical_data"),
        ),
        ("suspicious_link_hub", ("high_link_degree",)),
        ("structural_drift", ("entity_count_drift",)),
        ("data_quality", ("empty_display_share_high",)),
    )
    for signal_type in signal_types
}
|
|
|
|
# ``pattern_key`` written for each risk domain's aggregated pattern.
# Domains missing from this table are reported as "miscellaneous_risk" by the
# lookup in RiskService._build_domain_pattern.
DOMAIN_PATTERN_KEY: dict[str, str] = {
    "operational_freshness": "operational_freshness_risk",
    "suspicious_link_hub": "suspicious_link_hub_risk",
    "structural_drift": "structural_drift_risk",
    "data_quality": "data_quality_risk",
    "miscellaneous": "miscellaneous_risk",
}
|
|
|
|
# Relative weight of each domain in the global risk score.
# The global score is a weighted average divided by the sum of the weights of
# the domains actually present, so these values do not need to add up to 1.
# The "miscellaneous" weight doubles as the fallback for unknown domains.
DOMAIN_WEIGHTS: dict[str, float] = {
    "operational_freshness": 0.35,
    "suspicious_link_hub": 0.25,
    "structural_drift": 0.25,
    "data_quality": 0.15,
    "miscellaneous": 0.20,
}
|
|
|
|
# Base score contributed by an anomaly's declared severity label.
# Keys are compared against the lower-cased label; labels that are not listed
# fall back to 0.50 in RiskService._signal_score.
SEVERITY_BASE: dict[str, float] = {
    "low": 0.25,
    "medium": 0.50,
    "high": 0.80,
    "critical": 0.95,
}
|
|
|
|
|
|
@dataclass
class RiskEngineResult:
    """Summary of one risk-engine run, as returned by ``run_risk_engine``."""

    # Identifier of the risk run this result describes.
    run_id: str
    # "success" or "failed" (the two values the engine sets).
    status: str
    # Feature run whose anomalies were scanned; None when none was selected.
    source_feature_run_id: str | None
    # Number of anomaly signals that were read for this run.
    active_anomalies_scanned: int
    # Number of risk patterns persisted for this run.
    patterns_written: int
    # Aggregated global risk score.
    global_score: float
    # Text of the exception for a failed run, otherwise None.
    error_message: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, keyed by field name.

        Keys appear in field declaration order.
        """
        exported_fields = (
            "run_id",
            "status",
            "source_feature_run_id",
            "active_anomalies_scanned",
            "patterns_written",
            "global_score",
            "error_message",
        )
        return {name: getattr(self, name) for name in exported_fields}
|
|
|
|
|
|
class RiskService:
    """Turn the anomaly signals of a feature run into persisted risk patterns.

    One engine run groups anomalies by risk domain, scores every domain,
    combines the domain scores into a single global score and writes one
    pattern per domain plus one ``global_risk_summary`` pattern to the store.
    """

    def __init__(self, *, settings: OneCSettings, store: CanonicalStore) -> None:
        """Keep the collaborators and call ``store.ensure_created()``.

        Args:
            settings: Provides ``risk_high_threshold``, ``risk_medium_threshold``
                and ``risk_anomaly_scan_limit``.
            store: Persistence layer used for anomalies, risk runs and patterns.
        """
        self.settings = settings
        self.store = store
        self.store.ensure_created()

    @classmethod
    def build(cls) -> "RiskService":
        """Create a service from ``load_settings()`` and its ``canonical_db_url``."""
        settings = load_settings()
        store = CanonicalStore(settings.canonical_db_url)
        return cls(settings=settings, store=store)

    @staticmethod
    def _coerce_score(value: Any) -> float:
        """Convert a raw score to ``float``, treating unusable values as 0.0.

        ``None``, values ``float()`` rejects and NaN all become 0.0, the same
        value a missing ``score`` key gets. Without this a single malformed
        anomaly would raise and fail the whole run.
        """
        if value is None:
            return 0.0
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float that differs from itself; it would otherwise
        # make the sort order of the examples unpredictable.
        if number != number:
            return 0.0
        return number

    def _severity_from_score(self, score: float) -> str:
        """Map a score to "high", "medium" or "low" using the settings thresholds.

        "critical" is never produced here.
        """
        if score >= self.settings.risk_high_threshold:
            return "high"
        if score >= self.settings.risk_medium_threshold:
            return "medium"
        return "low"

    @staticmethod
    def _normalize_score(raw_score: float) -> float:
        """Squash a raw anomaly score into ``[0.0, 1.0]``.

        Negative values clamp to 0.0, values up to 1.0 are kept as they are,
        values of 3.0 and above saturate at 1.0 and anything in between is
        divided by 3.0.
        """
        safe = max(0.0, raw_score)
        if safe <= 1.0:
            return safe
        if safe >= 3.0:
            return 1.0
        # NOTE(review): this mapping is not monotonic - 1.0 stays 1.0 while
        # 1.5 becomes 0.5. Confirm the intended input scale before changing it.
        return safe / 3.0

    def _signal_score(self, anomaly: dict[str, Any]) -> float:
        """Score one anomaly: 45% severity base plus 55% normalised raw score.

        A missing severity counts as "medium"; an unknown label uses 0.50.
        The result is capped at 1.0.
        """
        severity = str(anomaly.get("severity", "medium")).lower()
        base = SEVERITY_BASE.get(severity, 0.50)
        raw = self._normalize_score(self._coerce_score(anomaly.get("score")))
        return min(1.0, (0.45 * base) + (0.55 * raw))

    def _build_domain_pattern(self, domain: str, anomalies: list[dict[str, Any]]) -> dict[str, Any]:
        """Build the risk pattern for one domain from its anomalies.

        Args:
            domain: Risk domain name; stored as the pattern's ``scope_id``.
            anomalies: Anomaly signal dicts assigned to that domain.

        Returns:
            A pattern dict with ``pattern_key``, ``severity``, ``scope``,
            ``scope_id``, ``score``, ``confidence`` and ``details``.
        """
        signal_scores = [self._signal_score(item) for item in anomalies]
        average_score = (sum(signal_scores) / len(signal_scores)) if signal_scores else 0.0
        max_score = max(signal_scores) if signal_scores else 0.0

        # Each anomaly after the first adds 0.03, up to 0.30 in total.
        density_bonus = min(0.30, max(0, len(anomalies) - 1) * 0.03)
        domain_score = min(1.0, (0.60 * max_score) + (0.40 * average_score) + density_bonus)
        # Confidence grows by 0.05 per anomaly from 0.55 and tops out at 0.95.
        confidence = min(1.0, 0.55 + min(0.40, len(anomalies) * 0.05))

        by_signal_type: dict[str, int] = defaultdict(int)
        for item in anomalies:
            by_signal_type[str(item.get("signal_type", "unknown"))] += 1

        # Examples are ranked by the raw score, not by the blended signal score.
        top_examples = sorted(
            anomalies,
            key=lambda item: self._coerce_score(item.get("score")),
            reverse=True,
        )[:5]
        examples_payload = [
            {
                "signal_type": item.get("signal_type"),
                "severity": item.get("severity"),
                "scope": item.get("scope"),
                "scope_id": item.get("scope_id"),
                "score": item.get("score"),
                "details": item.get("details", {}),
            }
            for item in top_examples
        ]

        return {
            "pattern_key": DOMAIN_PATTERN_KEY.get(domain, "miscellaneous_risk"),
            "severity": self._severity_from_score(domain_score),
            "scope": "domain",
            "scope_id": domain,
            "score": round(domain_score, 6),
            "confidence": round(confidence, 6),
            "details": {
                "anomalies_count": len(anomalies),
                "signal_types": dict(sorted(by_signal_type.items())),
                "top_examples": examples_payload,
            },
        }

    def _compute_global_score(self, domain_patterns: list[dict[str, Any]]) -> float:
        """Return the weighted average of the domain scores, capped at 1.0.

        Weights come from ``DOMAIN_WEIGHTS``; unknown domains use the
        "miscellaneous" weight. With no patterns, or a non-positive total
        weight, the result is the floor value 0.05.
        """
        if not domain_patterns:
            return 0.05

        weighted_sum = 0.0
        weight_total = 0.0
        for item in domain_patterns:
            domain = str(item.get("scope_id", "miscellaneous"))
            weight = DOMAIN_WEIGHTS.get(domain, DOMAIN_WEIGHTS["miscellaneous"])
            weighted_sum += weight * self._coerce_score(item.get("score"))
            weight_total += weight

        if weight_total <= 0.0:
            return 0.05
        return min(1.0, weighted_sum / weight_total)

    @staticmethod
    def _missing_feature_run_pattern() -> dict[str, Any]:
        """Return the fixed high-severity pattern used when no feature run exists."""
        return {
            "pattern_key": DOMAIN_PATTERN_KEY["operational_freshness"],
            "severity": "high",
            "scope": "domain",
            "scope_id": "operational_freshness",
            "score": 0.90,
            "confidence": 0.95,
            "details": {
                "anomalies_count": 0,
                "signal_types": {},
                "top_examples": [],
                "reason": "No successful feature run found",
            },
        }

    def _build_global_pattern(
        self,
        *,
        domain_patterns: list[dict[str, Any]],
        global_score: float,
        source_feature_run_id: str | None,
        anomalies_scanned: int,
    ) -> dict[str, Any]:
        """Build the ``global_risk_summary`` pattern, domains listed by score descending."""
        ranked = sorted(
            domain_patterns,
            key=lambda item: self._coerce_score(item.get("score")),
            reverse=True,
        )
        return {
            "pattern_key": "global_risk_summary",
            "severity": self._severity_from_score(global_score),
            "scope": "global",
            "scope_id": "",
            "score": round(global_score, 6),
            "confidence": round(min(1.0, 0.60 + 0.05 * len(domain_patterns)), 6),
            "details": {
                "source_feature_run_id": source_feature_run_id,
                "domain_scores": [
                    {
                        "domain": item.get("scope_id"),
                        "score": item.get("score"),
                        "severity": item.get("severity"),
                    }
                    for item in ranked
                ],
                "anomalies_scanned": anomalies_scanned,
            },
        }

    def run_risk_engine(
        self,
        *,
        source_feature_run_id: str | None = None,
        anomaly_limit: int | None = None,
    ) -> RiskEngineResult:
        """Run the risk engine once and persist the resulting patterns.

        Args:
            source_feature_run_id: Feature run to read anomalies from. When
                empty, the store's latest successful feature run is used.
            anomaly_limit: Maximum number of anomalies to read. A falsy value
                (``None`` or 0) selects ``settings.risk_anomaly_scan_limit``.

        Returns:
            A ``RiskEngineResult`` with status "success", or "failed" with
            ``error_message`` set when scoring or persistence raised. Errors
            raised while resolving the feature run or starting the risk run
            are not caught and propagate to the caller.
        """
        selected_feature_run_id = source_feature_run_id
        if not selected_feature_run_id:
            latest = self.store.latest_successful_feature_run()
            latest_run_id = latest.get("run_id") if latest is not None else None
            # Only stringify a real id: str(None) would give the truthy text
            # "None" and be mistaken for an existing feature run.
            if latest_run_id is not None:
                selected_feature_run_id = str(latest_run_id)

        run_id = self.store.start_risk_run(source_feature_run_id=selected_feature_run_id)
        safe_limit = anomaly_limit or self.settings.risk_anomaly_scan_limit

        try:
            if selected_feature_run_id:
                # Despite the result field name, the query is not restricted
                # to active anomalies (active_only=False).
                anomalies = self.store.list_anomaly_signals(
                    limit=safe_limit,
                    active_only=False,
                    run_id=selected_feature_run_id,
                )
            else:
                anomalies = []

            grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
            for anomaly in anomalies:
                signal_type = str(anomaly.get("signal_type", "unknown"))
                grouped[SIGNAL_TO_DOMAIN.get(signal_type, "miscellaneous")].append(anomaly)

            domain_patterns: list[dict[str, Any]] = [
                self._build_domain_pattern(domain, domain_anomalies)
                for domain, domain_anomalies in grouped.items()
            ]

            # The absence of any feature run is itself reported as a
            # freshness risk.
            if not selected_feature_run_id:
                domain_patterns.append(self._missing_feature_run_pattern())

            global_score = self._compute_global_score(domain_patterns)
            global_pattern = self._build_global_pattern(
                domain_patterns=domain_patterns,
                global_score=global_score,
                source_feature_run_id=selected_feature_run_id,
                anomalies_scanned=len(anomalies),
            )

            patterns_written = self.store.replace_risk_patterns(
                run_id=run_id,
                patterns=[global_pattern, *domain_patterns],
            )
            self.store.finish_risk_run(
                run_id=run_id,
                status="success",
                patterns_written=patterns_written,
                global_score=global_score,
                details={
                    "source_feature_run_id": selected_feature_run_id,
                    "anomalies_scanned": len(anomalies),
                    "domains_total": len(domain_patterns),
                },
            )

            return RiskEngineResult(
                run_id=run_id,
                status="success",
                source_feature_run_id=selected_feature_run_id,
                active_anomalies_scanned=len(anomalies),
                patterns_written=patterns_written,
                global_score=round(global_score, 6),
            )
        except Exception as exc:
            # Deliberate catch-all at the run boundary: the run has already
            # been started, so it is closed as "failed" instead of being left
            # open.
            self.store.finish_risk_run(
                run_id=run_id,
                status="failed",
                patterns_written=0,
                global_score=0.0,
                details={},
                error_message=str(exc),
            )
            return RiskEngineResult(
                run_id=run_id,
                status="failed",
                source_feature_run_id=selected_feature_run_id,
                active_anomalies_scanned=0,
                patterns_written=0,
                global_score=0.0,
                error_message=str(exc),
            )

    def list_recent_runs(self, limit: int = 20) -> list[dict[str, Any]]:
        """Return up to ``limit`` risk runs from ``store.list_recent_risk_runs``."""
        return self.store.list_recent_risk_runs(limit=limit)

    def list_patterns(
        self,
        *,
        limit: int = 200,
        severity: str | None = None,
        active_only: bool = True,
        run_id: str | None = None,
        pattern_key: str | None = None,
        scope: str | None = None,
    ) -> list[dict[str, Any]]:
        """Return risk patterns from the store; every filter is passed through unchanged."""
        return self.store.list_risk_patterns(
            limit=limit,
            severity=severity,
            active_only=active_only,
            run_id=run_id,
            pattern_key=pattern_key,
            scope=scope,
        )

    def stats(self) -> dict[str, Any]:
        """Return the result of ``store.risk_store_stats()``."""
        return self.store.risk_store_stats()
|