297 lines
10 KiB
Python
297 lines
10 KiB
Python
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timedelta
import json
import re
from typing import Any

from canonical_layer.mappers import map_records
from canonical_layer.store import CanonicalStore
from config.client import ODataClient, extract_entity_sets
from config.settings import LOGS_DIR, OneCSettings, load_settings
|
|
|
|
|
|
# Captures the millisecond count of an OData-style "/Date(<millis>)/" literal.
ODATA_DATE_RE = re.compile(r"/Date\((?P<millis>-?\d+)")
# Refresh modes accepted by RefreshService.run_refresh.
REFRESH_MODES = {"historical", "incremental", "targeted"}

# Reference instant for "/Date(<millis>)/" literals (milliseconds since the Unix epoch).
_EPOCH = datetime(1970, 1, 1)
# Record fields probed, in this order, for the timestamp used by date-range filtering.
_DATE_FIELDS = ("Date", "Дата", "Period", "Период", "PostedAt", "posted_at")


def _to_naive_utc(value: datetime) -> datetime:
    """Return *value* as a naive datetime whose wall-clock reading is UTC.

    Naive inputs are returned unchanged. Aware inputs are shifted by their UTC
    offset and stripped of ``tzinfo`` so that every datetime handled by this
    module can be compared without raising ``TypeError``.
    """
    offset = value.utcoffset()
    if offset is None:
        return value
    return (value - offset).replace(tzinfo=None)


def _parse_dt(raw: str | None) -> datetime | None:
    """Parse a date string into a naive datetime, or return ``None``.

    Accepted inputs are "/Date(<millis>)/" literals and anything
    ``datetime.fromisoformat`` understands (a trailing ``Z`` is treated as
    ``+00:00``). Values carrying a UTC offset are converted to UTC and returned
    without ``tzinfo``; values without an offset are returned as written.

    Args:
        raw: Text to parse; ``None`` and blank strings yield ``None``.

    Returns:
        The parsed naive datetime, or ``None`` when *raw* is missing, blank,
        unparseable, or outside the range ``datetime`` can represent.
    """
    if raw is None:
        return None
    text = raw.strip()
    if not text:
        return None

    match = ODATA_DATE_RE.search(text)
    if match:
        try:
            # Epoch arithmetic (instead of datetime.fromtimestamp) keeps the
            # result independent of the machine's time zone and also works for
            # negative, i.e. pre-1970, values on every platform.
            return _EPOCH + timedelta(milliseconds=int(match.group("millis")))
        except (OverflowError, ValueError):
            return None

    candidates = [text]
    if text.endswith("Z"):
        # Older fromisoformat() implementations reject the "Z" suffix.
        candidates.insert(0, f"{text[:-1]}+00:00")
    for candidate in candidates:
        try:
            parsed = datetime.fromisoformat(candidate)
        except ValueError:
            continue
        return _to_naive_utc(parsed)
    return None


def _in_date_range(record: dict[str, Any], date_from: datetime | None, date_to: datetime | None) -> bool:
    """Tell whether *record* falls inside the inclusive ``[date_from, date_to]`` window.

    The record's timestamp is the first parseable value among the fields listed
    in ``_DATE_FIELDS``. Records without any parseable timestamp are kept, as
    are all records when no bound is given.

    Args:
        record: Raw record to test.
        date_from: Inclusive lower bound, or ``None`` for no lower bound.
        date_to: Inclusive upper bound, or ``None`` for no upper bound.

    Returns:
        ``False`` only when the record has a timestamp that lies outside the
        window; ``True`` otherwise.
    """
    if date_from is None and date_to is None:
        return True

    record_dt: datetime | None = None
    for field in _DATE_FIELDS:
        value = record.get(field)
        if value is None:
            continue
        record_dt = _parse_dt(str(value))
        if record_dt is not None:
            break

    if record_dt is None:
        # Undated records cannot be judged against the window, so they pass.
        return True

    # Bounds may arrive timezone-aware; normalise them the same way as the
    # record timestamp so the comparisons below never mix naive and aware values.
    record_dt = _to_naive_utc(record_dt)
    if date_from is not None and record_dt < _to_naive_utc(date_from):
        return False
    if date_to is not None and record_dt > _to_naive_utc(date_to):
        return False
    return True
|
|
|
|
|
|
def _unique_preserve_order(items: list[str]) -> list[str]:
    """Return the distinct, non-empty, stripped items in first-seen order."""
    stripped = (entry.strip() for entry in items)
    # dict keys keep insertion order, so fromkeys() de-duplicates without reordering.
    return list(dict.fromkeys(value for value in stripped if value))
|
|
|
|
|
|
@dataclass
class RefreshResult:
    """Outcome summary of a single refresh run."""

    # Identifier of the run and how it ended.
    run_id: str
    mode: str
    status: str
    # Entity sets the run covered, split by outcome; each failed entry is a
    # dict with "entity_set" and "error" keys.
    requested_entity_sets: list[str]
    successful_entity_sets: list[str]
    failed_entity_sets: list[dict[str, str]]
    # Filters the run was started with, as supplied by the caller.
    date_from: str | None
    date_to: str | None
    target_id: str | None
    limit_per_set: int
    # Counters accumulated over the run.
    records_read: int
    entities_written: int
    links_written: int
    checkpoints_updated: int

    def to_dict(self) -> dict[str, Any]:
        """Return the summary as a plain dict keyed by field name.

        Values are handed over as-is, so list values are the same objects the
        instance holds rather than copies.
        """
        exported = (
            "run_id",
            "mode",
            "status",
            "requested_entity_sets",
            "successful_entity_sets",
            "failed_entity_sets",
            "date_from",
            "date_to",
            "target_id",
            "limit_per_set",
            "records_read",
            "entities_written",
            "links_written",
            "checkpoints_updated",
        )
        return {key: getattr(self, key) for key in exported}
|
|
|
|
|
|
class RefreshService:
    """Reads entity sets through the OData client and upserts them into the canonical store.

    Each call to ``run_refresh`` is tracked in the store as a refresh run that
    is opened with ``start_refresh_run`` and always closed with
    ``finish_refresh_run``.
    """

    def __init__(self, *, settings: OneCSettings, client: ODataClient, store: CanonicalStore) -> None:
        """Keep the collaborators and make sure the store is created."""
        self.settings = settings
        self.client = client
        self.store = store
        self.store.ensure_created()

    @classmethod
    def build(cls) -> "RefreshService":
        """Create a service wired from ``load_settings()``."""
        settings = load_settings()
        client = ODataClient(settings)
        store = CanonicalStore(settings.canonical_db_url)
        return cls(settings=settings, client=client, store=store)

    def _load_entity_sets(self) -> list[dict[str, str]]:
        """Return entity-set descriptors, preferring files cached under ``LOGS_DIR``.

        Lookup order: ``entity_sets.json``, then ``metadata.xml``, then a fresh
        metadata download (which is written to ``metadata.xml`` for next time).
        """
        entity_sets_file = LOGS_DIR / "entity_sets.json"
        if entity_sets_file.exists():
            try:
                payload = json.loads(entity_sets_file.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # An unreadable or corrupt cache must not block the refresh;
                # fall through to the metadata sources below.
                payload = None
            entity_sets = payload.get("entity_sets", []) if isinstance(payload, dict) else None
            if isinstance(entity_sets, list):
                return [item for item in entity_sets if isinstance(item, dict)]

        metadata_file = LOGS_DIR / "metadata.xml"
        if metadata_file.exists():
            return extract_entity_sets(metadata_file.read_text(encoding="utf-8"))

        metadata_xml = self.client.fetch_metadata()
        metadata_file.parent.mkdir(parents=True, exist_ok=True)
        metadata_file.write_text(metadata_xml, encoding="utf-8")
        return extract_entity_sets(metadata_xml)

    def _resolve_entity_sets(
        self,
        *,
        requested_entity_sets: list[str] | None,
        entity_keywords: list[str] | None,
    ) -> list[str]:
        """Decide which entity sets a refresh should cover.

        Explicitly requested sets win. Otherwise the known entity-set names are
        narrowed by case-insensitive keyword match, using *entity_keywords* or
        the defaults from settings. When there are no keywords, or none of them
        matches, every known name is returned.

        Returns:
            De-duplicated entity-set names in first-seen order (possibly empty).
        """
        if requested_entity_sets:
            return _unique_preserve_order(requested_entity_sets)

        # De-duplicate once so every return path below yields unique names.
        names = _unique_preserve_order(
            [str(item["name"]) for item in self._load_entity_sets() if item.get("name") is not None]
        )
        if not names:
            return []

        keywords_source = entity_keywords or list(self.settings.refresh_default_entity_keywords)
        keywords = [keyword.strip().lower() for keyword in keywords_source if keyword.strip()]
        if not keywords:
            return names

        matched = [name for name in names if any(keyword in name.lower() for keyword in keywords)]
        return matched or names

    @staticmethod
    def _parse_bound(label: str, raw: str | None) -> datetime | None:
        """Parse one date bound, rejecting text that is present but unparseable."""
        parsed = _parse_dt(raw)
        if parsed is None and raw is not None and raw.strip():
            raise ValueError(f"Unparseable {label}: {raw!r}")
        return parsed

    @staticmethod
    def _filter_records(
        records: list[dict[str, Any]],
        date_from: datetime | None,
        date_to: datetime | None,
        lowered_target: str | None,
    ) -> list[dict[str, Any]]:
        """Apply the date window and, when given, the targeted substring filter."""
        kept = records
        if date_from is not None or date_to is not None:
            kept = [row for row in kept if _in_date_range(row, date_from, date_to)]
        if lowered_target:
            # The target is searched for in the JSON text of the whole record;
            # default=str keeps values JSON cannot encode from aborting the run.
            kept = [
                row
                for row in kept
                if lowered_target in json.dumps(row, ensure_ascii=False, default=str).lower()
            ]
        return kept

    @staticmethod
    def _summarize_status(
        successful_sets: list[str], failed_sets: list[dict[str, str]]
    ) -> tuple[str, str | None]:
        """Derive the run status and, for a fully failed run, its error message."""
        if not failed_sets:
            return "success", None
        if successful_sets:
            return "partial_success", None
        # Only the first three failures are quoted in the message.
        message = "; ".join(f"{item['entity_set']}: {item['error']}" for item in failed_sets[:3])
        return "failed", message

    def run_refresh(
        self,
        *,
        mode: str,
        date_from: str | None = None,
        date_to: str | None = None,
        target_id: str | None = None,
        limit_per_set: int | None = None,
        requested_entity_sets: list[str] | None = None,
        entity_keywords: list[str] | None = None,
    ) -> RefreshResult:
        """Run one refresh and return its summary.

        Args:
            mode: One of ``REFRESH_MODES`` (case and surrounding blanks ignored).
            date_from: Optional inclusive lower bound for record timestamps.
            date_to: Optional inclusive upper bound for record timestamps.
            target_id: In "targeted" mode, keep only records whose JSON text
                contains this value (case-insensitive).
            limit_per_set: Maximum records read per entity set; ``None`` or 0
                falls back to the default from settings.
            requested_entity_sets: Explicit entity sets to refresh.
            entity_keywords: Keywords used to pick entity sets when none are
                requested explicitly.

        Returns:
            A ``RefreshResult`` describing what was read and written.

        Raises:
            ValueError: Unsupported *mode*, or a date bound that cannot be parsed.
            RuntimeError: No entity sets could be resolved.
            Exception: Any error from mapping, upserting or checkpointing is
                re-raised after the run has been recorded as "failed". Read
                errors are not raised; they are collected per entity set.
        """
        normalized_mode = mode.strip().lower()
        if normalized_mode not in REFRESH_MODES:
            raise ValueError(f"Unsupported refresh mode: {mode}")

        # Validate the filters before a run record is opened, so that bad input
        # neither leaves a dangling run nor silently disables the date filter.
        parsed_date_from = self._parse_bound("date_from", date_from)
        parsed_date_to = self._parse_bound("date_to", date_to)
        safe_target = target_id.strip() if target_id else None
        lowered_target = safe_target.lower() if normalized_mode == "targeted" and safe_target else None

        resolved_limit = limit_per_set or self.settings.refresh_default_limit_per_set
        resolved_sets = self._resolve_entity_sets(
            requested_entity_sets=requested_entity_sets,
            entity_keywords=entity_keywords,
        )
        if not resolved_sets:
            raise RuntimeError("No entity sets resolved for refresh")

        run_id = self.store.start_refresh_run(
            mode=normalized_mode,
            requested_entity_sets=resolved_sets,
            date_from=date_from,
            date_to=date_to,
            limit_per_set=resolved_limit,
        )

        successful_sets: list[str] = []
        failed_sets: list[dict[str, str]] = []
        per_set_stats: list[dict[str, Any]] = []
        records_read = 0
        entities_written = 0
        links_written = 0

        try:
            for entity_set in resolved_sets:
                try:
                    records = self.client.read_entity_set_records(entity_set, top=resolved_limit)
                except Exception as exc:
                    # A set that cannot be read is reported, not fatal.
                    failed_sets.append({"entity_set": entity_set, "error": str(exc)})
                    continue

                records_read += len(records)
                filtered_records = self._filter_records(
                    records, parsed_date_from, parsed_date_to, lowered_target
                )

                mapped = map_records(entity_set, filtered_records)
                entity_delta, link_delta = self.store.upsert_entities(run_id=run_id, entities=mapped)
                entities_written += entity_delta
                links_written += link_delta
                successful_sets.append(entity_set)
                per_set_stats.append(
                    {
                        "entity_set": entity_set,
                        "records_read": len(records),
                        "records_after_filters": len(filtered_records),
                        "entities_written": entity_delta,
                        "links_written": link_delta,
                    }
                )

            checkpoints_updated = self.store.update_checkpoints(
                run_id=run_id,
                entity_sets=successful_sets,
                date_from=date_from,
                date_to=date_to,
            )
        except Exception as exc:
            # Close the run before propagating so it is not left unfinished.
            self.store.finish_refresh_run(
                run_id=run_id,
                status="failed",
                records_read=records_read,
                entities_written=entities_written,
                links_written=links_written,
                checkpoints_updated=0,
                details={"per_set": per_set_stats, "failed_sets": failed_sets},
                error_message=str(exc),
            )
            raise

        status, error_message = self._summarize_status(successful_sets, failed_sets)
        self.store.finish_refresh_run(
            run_id=run_id,
            status=status,
            records_read=records_read,
            entities_written=entities_written,
            links_written=links_written,
            checkpoints_updated=checkpoints_updated,
            details={"per_set": per_set_stats, "failed_sets": failed_sets},
            error_message=error_message,
        )

        return RefreshResult(
            run_id=run_id,
            mode=normalized_mode,
            status=status,
            requested_entity_sets=resolved_sets,
            successful_entity_sets=successful_sets,
            failed_entity_sets=failed_sets,
            date_from=date_from,
            date_to=date_to,
            target_id=safe_target,
            limit_per_set=resolved_limit,
            records_read=records_read,
            entities_written=entities_written,
            links_written=links_written,
            checkpoints_updated=checkpoints_updated,
        )

    def list_recent_runs(self, limit: int = 20) -> list[dict[str, Any]]:
        """Return up to *limit* recent refresh runs as reported by the store."""
        return self.store.list_recent_runs(limit=limit)

    def store_stats(self) -> dict[str, Any]:
        """Return the store's statistics unchanged."""
        return self.store.store_stats()
|