from __future__ import annotations from dataclasses import dataclass from datetime import datetime import json import re from typing import Any from canonical_layer.mappers import map_records from canonical_layer.store import CanonicalStore from config.client import ODataClient, extract_entity_sets from config.settings import LOGS_DIR, OneCSettings, load_settings ODATA_DATE_RE = re.compile(r"/Date\((?P-?\d+)") REFRESH_MODES = {"historical", "incremental", "targeted"} def _parse_dt(raw: str | None) -> datetime | None: if raw is None: return None text = raw.strip() if not text: return None match = ODATA_DATE_RE.search(text) if match: millis = int(match.group("millis")) return datetime.fromtimestamp(millis / 1000) for candidate in (text.replace("Z", "+00:00"), text): try: return datetime.fromisoformat(candidate) except ValueError: continue return None def _in_date_range(record: dict[str, Any], date_from: datetime | None, date_to: datetime | None) -> bool: if date_from is None and date_to is None: return True date_candidates = ("Date", "Дата", "Period", "Период", "PostedAt", "posted_at") record_dt: datetime | None = None for field in date_candidates: value = record.get(field) if value is None: continue record_dt = _parse_dt(str(value)) if record_dt is not None: break if record_dt is None: return True if date_from and record_dt < date_from: return False if date_to and record_dt > date_to: return False return True def _unique_preserve_order(items: list[str]) -> list[str]: result: list[str] = [] seen: set[str] = set() for item in items: cleaned = item.strip() if not cleaned or cleaned in seen: continue seen.add(cleaned) result.append(cleaned) return result @dataclass class RefreshResult: run_id: str mode: str status: str requested_entity_sets: list[str] successful_entity_sets: list[str] failed_entity_sets: list[dict[str, str]] date_from: str | None date_to: str | None target_id: str | None limit_per_set: int records_read: int entities_written: int links_written: int checkpoints_updated: int def to_dict(self) -> dict[str, Any]: return { "run_id": self.run_id, "mode": self.mode, "status": self.status, "requested_entity_sets": self.requested_entity_sets, "successful_entity_sets": self.successful_entity_sets, "failed_entity_sets": self.failed_entity_sets, "date_from": self.date_from, "date_to": self.date_to, "target_id": self.target_id, "limit_per_set": self.limit_per_set, "records_read": self.records_read, "entities_written": self.entities_written, "links_written": self.links_written, "checkpoints_updated": self.checkpoints_updated, } class RefreshService: def __init__(self, *, settings: OneCSettings, client: ODataClient, store: CanonicalStore) -> None: self.settings = settings self.client = client self.store = store self.store.ensure_created() @classmethod def build(cls) -> "RefreshService": settings = load_settings() client = ODataClient(settings) store = CanonicalStore(settings.canonical_db_url) return cls(settings=settings, client=client, store=store) def _load_entity_sets(self) -> list[dict[str, str]]: entity_sets_file = LOGS_DIR / "entity_sets.json" if entity_sets_file.exists(): payload = json.loads(entity_sets_file.read_text(encoding="utf-8")) entity_sets = payload.get("entity_sets", []) if isinstance(entity_sets, list): return [item for item in entity_sets if isinstance(item, dict)] metadata_file = LOGS_DIR / "metadata.xml" if metadata_file.exists(): metadata_xml = metadata_file.read_text(encoding="utf-8") return extract_entity_sets(metadata_xml) metadata_xml = self.client.fetch_metadata() metadata_file.parent.mkdir(parents=True, exist_ok=True) metadata_file.write_text(metadata_xml, encoding="utf-8") return extract_entity_sets(metadata_xml) def _resolve_entity_sets( self, *, requested_entity_sets: list[str] | None, entity_keywords: list[str] | None, ) -> list[str]: if requested_entity_sets: return _unique_preserve_order(requested_entity_sets) entity_sets = self._load_entity_sets() names = [str(item.get("name", "")).strip() for item in entity_sets if str(item.get("name", "")).strip()] if not names: return [] keywords_source = entity_keywords or list(self.settings.refresh_default_entity_keywords) keywords = [keyword.strip().lower() for keyword in keywords_source if keyword.strip()] if not keywords: return names matched = [name for name in names if any(keyword in name.lower() for keyword in keywords)] if matched: return _unique_preserve_order(matched) return names def run_refresh( self, *, mode: str, date_from: str | None = None, date_to: str | None = None, target_id: str | None = None, limit_per_set: int | None = None, requested_entity_sets: list[str] | None = None, entity_keywords: list[str] | None = None, ) -> RefreshResult: normalized_mode = mode.strip().lower() if normalized_mode not in REFRESH_MODES: raise ValueError(f"Unsupported refresh mode: {mode}") resolved_limit = limit_per_set or self.settings.refresh_default_limit_per_set resolved_sets = self._resolve_entity_sets( requested_entity_sets=requested_entity_sets, entity_keywords=entity_keywords, ) if not resolved_sets: raise RuntimeError("No entity sets resolved for refresh") run_id = self.store.start_refresh_run( mode=normalized_mode, requested_entity_sets=resolved_sets, date_from=date_from, date_to=date_to, limit_per_set=resolved_limit, ) parsed_date_from = _parse_dt(date_from) parsed_date_to = _parse_dt(date_to) safe_target = target_id.strip() if target_id else None successful_sets: list[str] = [] failed_sets: list[dict[str, str]] = [] records_read = 0 entities_written = 0 links_written = 0 per_set_stats: list[dict[str, Any]] = [] for entity_set in resolved_sets: try: records = self.client.read_entity_set_records(entity_set, top=resolved_limit) except Exception as exc: failed_sets.append({"entity_set": entity_set, "error": str(exc)}) continue records_read += len(records) filtered_records = records if parsed_date_from or parsed_date_to: filtered_records = [ row for row in filtered_records if _in_date_range(row, parsed_date_from, parsed_date_to) ] if normalized_mode == "targeted" and safe_target: lowered_target = safe_target.lower() filtered_records = [ row for row in filtered_records if lowered_target in json.dumps(row, ensure_ascii=False).lower() ] mapped = map_records(entity_set, filtered_records) entity_delta, link_delta = self.store.upsert_entities(run_id=run_id, entities=mapped) entities_written += entity_delta links_written += link_delta successful_sets.append(entity_set) per_set_stats.append( { "entity_set": entity_set, "records_read": len(records), "records_after_filters": len(filtered_records), "entities_written": entity_delta, "links_written": link_delta, } ) checkpoints_updated = self.store.update_checkpoints( run_id=run_id, entity_sets=successful_sets, date_from=date_from, date_to=date_to, ) status = "success" error_message: str | None = None if successful_sets and failed_sets: status = "partial_success" elif failed_sets and not successful_sets: status = "failed" error_message = "; ".join(f"{item['entity_set']}: {item['error']}" for item in failed_sets[:3]) details = { "per_set": per_set_stats, "failed_sets": failed_sets, } self.store.finish_refresh_run( run_id=run_id, status=status, records_read=records_read, entities_written=entities_written, links_written=links_written, checkpoints_updated=checkpoints_updated, details=details, error_message=error_message, ) return RefreshResult( run_id=run_id, mode=normalized_mode, status=status, requested_entity_sets=resolved_sets, successful_entity_sets=successful_sets, failed_entity_sets=failed_sets, date_from=date_from, date_to=date_to, target_id=safe_target, limit_per_set=resolved_limit, records_read=records_read, entities_written=entities_written, links_written=links_written, checkpoints_updated=checkpoints_updated, ) def list_recent_runs(self, limit: int = 20) -> list[dict[str, Any]]: return self.store.list_recent_runs(limit=limit) def store_stats(self) -> dict[str, Any]: return self.store.store_stats()