NODEDC_1C/canonical_layer/refresh.py

297 lines
10 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
import json
import re
from typing import Any
from canonical_layer.mappers import map_records
from canonical_layer.store import CanonicalStore
from config.client import ODataClient, extract_entity_sets
from config.settings import LOGS_DIR, OneCSettings, load_settings
ODATA_DATE_RE = re.compile(r"/Date\((?P<millis>-?\d+)")
REFRESH_MODES = {"historical", "incremental", "targeted"}
def _parse_dt(raw: str | None) -> datetime | None:
if raw is None:
return None
text = raw.strip()
if not text:
return None
match = ODATA_DATE_RE.search(text)
if match:
millis = int(match.group("millis"))
return datetime.fromtimestamp(millis / 1000)
for candidate in (text.replace("Z", "+00:00"), text):
try:
return datetime.fromisoformat(candidate)
except ValueError:
continue
return None
def _in_date_range(record: dict[str, Any], date_from: datetime | None, date_to: datetime | None) -> bool:
if date_from is None and date_to is None:
return True
date_candidates = ("Date", "Дата", "Period", "Период", "PostedAt", "posted_at")
record_dt: datetime | None = None
for field in date_candidates:
value = record.get(field)
if value is None:
continue
record_dt = _parse_dt(str(value))
if record_dt is not None:
break
if record_dt is None:
return True
if date_from and record_dt < date_from:
return False
if date_to and record_dt > date_to:
return False
return True
def _unique_preserve_order(items: list[str]) -> list[str]:
result: list[str] = []
seen: set[str] = set()
for item in items:
cleaned = item.strip()
if not cleaned or cleaned in seen:
continue
seen.add(cleaned)
result.append(cleaned)
return result
@dataclass
class RefreshResult:
    """Summary of one refresh run as returned by ``RefreshService.run_refresh``."""

    run_id: str
    mode: str
    status: str
    requested_entity_sets: list[str]
    successful_entity_sets: list[str]
    # One ``{"entity_set": ..., "error": ...}`` entry per entity set that failed.
    failed_entity_sets: list[dict[str, str]]
    date_from: str | None
    date_to: str | None
    target_id: str | None
    limit_per_set: int
    records_read: int
    entities_written: int
    links_written: int
    checkpoints_updated: int

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, in declaration order.

        Values are passed through as-is (no copying), so list fields in the
        result are the same objects held by this instance.
        """
        exported = (
            "run_id",
            "mode",
            "status",
            "requested_entity_sets",
            "successful_entity_sets",
            "failed_entity_sets",
            "date_from",
            "date_to",
            "target_id",
            "limit_per_set",
            "records_read",
            "entities_written",
            "links_written",
            "checkpoints_updated",
        )
        return {key: getattr(self, key) for key in exported}
class RefreshService:
    """Read records from OData entity sets and write them to the canonical store.

    The settings, OData client and store are injected through the constructor;
    :meth:`build` wires them up from ``load_settings()``.
    """

    def __init__(self, *, settings: OneCSettings, client: ODataClient, store: CanonicalStore) -> None:
        """Keep the collaborators and call ``store.ensure_created()``."""
        self.settings = settings
        self.client = client
        self.store = store
        self.store.ensure_created()

    @classmethod
    def build(cls) -> "RefreshService":
        """Create a service from ``load_settings()`` with a new client and store."""
        settings = load_settings()
        client = ODataClient(settings)
        store = CanonicalStore(settings.canonical_db_url)
        return cls(settings=settings, client=client, store=store)

    def _load_entity_sets(self) -> list[dict[str, str]]:
        """Return entity-set descriptors, preferring files under ``LOGS_DIR``.

        Lookup order:

        1. ``entity_sets.json``: its ``"entity_sets"`` list (dict items only).
        2. ``metadata.xml``: parsed with ``extract_entity_sets``.
        3. Metadata fetched through the client, saved as ``metadata.xml`` and
           then parsed.

        A JSON file whose root is not an object, or whose ``"entity_sets"`` is
        not a list, is treated as a cache miss and the lookup continues with
        the metadata sources.
        """
        entity_sets_file = LOGS_DIR / "entity_sets.json"
        if entity_sets_file.exists():
            payload = json.loads(entity_sets_file.read_text(encoding="utf-8"))
            # Guard against a non-object JSON root, which has no ``.get``.
            entity_sets = payload.get("entity_sets", []) if isinstance(payload, dict) else None
            if isinstance(entity_sets, list):
                return [item for item in entity_sets if isinstance(item, dict)]

        metadata_file = LOGS_DIR / "metadata.xml"
        if metadata_file.exists():
            metadata_xml = metadata_file.read_text(encoding="utf-8")
            return extract_entity_sets(metadata_xml)

        metadata_xml = self.client.fetch_metadata()
        metadata_file.parent.mkdir(parents=True, exist_ok=True)
        metadata_file.write_text(metadata_xml, encoding="utf-8")
        return extract_entity_sets(metadata_xml)

    def _resolve_entity_sets(
        self,
        *,
        requested_entity_sets: list[str] | None,
        entity_keywords: list[str] | None,
    ) -> list[str]:
        """Decide which entity sets a refresh should read.

        Explicitly requested sets win and are returned de-duplicated. Otherwise
        the known entity-set names are filtered by case-insensitive substring
        match against ``entity_keywords`` (or the default keywords from
        settings). When there are no usable keywords, or nothing matches, all
        known names are returned.
        """
        if requested_entity_sets:
            return _unique_preserve_order(requested_entity_sets)

        entity_sets = self._load_entity_sets()
        names = [
            name
            for name in (str(item.get("name", "")).strip() for item in entity_sets)
            if name
        ]
        if not names:
            return []

        keywords_source = entity_keywords or list(self.settings.refresh_default_entity_keywords)
        keywords = [keyword.strip().lower() for keyword in keywords_source if keyword.strip()]
        if not keywords:
            return names

        matched = [name for name in names if any(keyword in name.lower() for keyword in keywords)]
        if matched:
            return _unique_preserve_order(matched)
        # No keyword matched anything: fall back to every known entity set.
        return names

    def run_refresh(
        self,
        *,
        mode: str,
        date_from: str | None = None,
        date_to: str | None = None,
        target_id: str | None = None,
        limit_per_set: int | None = None,
        requested_entity_sets: list[str] | None = None,
        entity_keywords: list[str] | None = None,
    ) -> RefreshResult:
        """Run one refresh over the resolved entity sets and record it in the store.

        For every entity set the records are read through the client, optionally
        filtered by date window and (in ``targeted`` mode) by ``target_id``,
        mapped with ``map_records`` and upserted into the store. A failure while
        *reading* an entity set is recorded and the run continues with the next
        set. Any other exception marks the run as ``failed`` in the store
        (best effort) and is re-raised unchanged.

        Args:
            mode: One of ``REFRESH_MODES``; surrounding whitespace and case are ignored.
            date_from: Lower date bound as text; ignored if it cannot be parsed.
            date_to: Upper date bound as text; ignored if it cannot be parsed.
            target_id: In ``targeted`` mode, keep only records whose JSON dump
                contains this text (case-insensitive).
            limit_per_set: Maximum records to read per entity set; a falsy value
                selects ``settings.refresh_default_limit_per_set``.
            requested_entity_sets: Explicit entity sets to refresh.
            entity_keywords: Keywords used to pick entity sets when none are requested.

        Returns:
            A :class:`RefreshResult` with status ``success``, ``partial_success``
            or ``failed`` and the accumulated counters.

        Raises:
            ValueError: If ``mode`` is not supported.
            RuntimeError: If no entity sets could be resolved.
        """
        normalized_mode = mode.strip().lower()
        if normalized_mode not in REFRESH_MODES:
            raise ValueError(f"Unsupported refresh mode: {mode}")

        resolved_limit = limit_per_set or self.settings.refresh_default_limit_per_set
        resolved_sets = self._resolve_entity_sets(
            requested_entity_sets=requested_entity_sets,
            entity_keywords=entity_keywords,
        )
        if not resolved_sets:
            raise RuntimeError("No entity sets resolved for refresh")

        # Pure input normalisation happens before the run record is opened, so
        # a bad argument cannot leave a started-but-unfinished run behind.
        parsed_date_from = _parse_dt(date_from)
        parsed_date_to = _parse_dt(date_to)
        safe_target = target_id.strip() if target_id else None
        # Loop-invariant: the target text only applies in "targeted" mode.
        lowered_target = safe_target.lower() if normalized_mode == "targeted" and safe_target else None

        run_id = self.store.start_refresh_run(
            mode=normalized_mode,
            requested_entity_sets=resolved_sets,
            date_from=date_from,
            date_to=date_to,
            limit_per_set=resolved_limit,
        )

        successful_sets: list[str] = []
        failed_sets: list[dict[str, str]] = []
        per_set_stats: list[dict[str, Any]] = []
        records_read = 0
        entities_written = 0
        links_written = 0

        try:
            for entity_set in resolved_sets:
                try:
                    records = self.client.read_entity_set_records(entity_set, top=resolved_limit)
                except Exception as exc:
                    # A read failure affects only this entity set.
                    failed_sets.append({"entity_set": entity_set, "error": str(exc)})
                    continue

                records_read += len(records)
                filtered_records = records
                if parsed_date_from or parsed_date_to:
                    filtered_records = [
                        row for row in filtered_records if _in_date_range(row, parsed_date_from, parsed_date_to)
                    ]
                if lowered_target:
                    filtered_records = [
                        row
                        for row in filtered_records
                        if lowered_target in json.dumps(row, ensure_ascii=False).lower()
                    ]

                mapped = map_records(entity_set, filtered_records)
                entity_delta, link_delta = self.store.upsert_entities(run_id=run_id, entities=mapped)
                entities_written += entity_delta
                links_written += link_delta
                successful_sets.append(entity_set)
                per_set_stats.append(
                    {
                        "entity_set": entity_set,
                        "records_read": len(records),
                        "records_after_filters": len(filtered_records),
                        "entities_written": entity_delta,
                        "links_written": link_delta,
                    }
                )

            checkpoints_updated = self.store.update_checkpoints(
                run_id=run_id,
                entity_sets=successful_sets,
                date_from=date_from,
                date_to=date_to,
            )
        except Exception as exc:
            # Close the run record so it is not left unfinished, then surface
            # the original error to the caller.
            try:
                self.store.finish_refresh_run(
                    run_id=run_id,
                    status="failed",
                    records_read=records_read,
                    entities_written=entities_written,
                    links_written=links_written,
                    checkpoints_updated=0,
                    details={"per_set": per_set_stats, "failed_sets": failed_sets},
                    error_message=str(exc),
                )
            except Exception:
                # Best effort only: the original exception is the one that
                # matters to the caller and must not be replaced.
                pass
            raise

        status = "success"
        error_message: str | None = None
        if successful_sets and failed_sets:
            status = "partial_success"
        elif failed_sets and not successful_sets:
            status = "failed"
            # Keep the stored message short: at most the first three failures.
            error_message = "; ".join(f"{item['entity_set']}: {item['error']}" for item in failed_sets[:3])

        details = {
            "per_set": per_set_stats,
            "failed_sets": failed_sets,
        }
        self.store.finish_refresh_run(
            run_id=run_id,
            status=status,
            records_read=records_read,
            entities_written=entities_written,
            links_written=links_written,
            checkpoints_updated=checkpoints_updated,
            details=details,
            error_message=error_message,
        )
        return RefreshResult(
            run_id=run_id,
            mode=normalized_mode,
            status=status,
            requested_entity_sets=resolved_sets,
            successful_entity_sets=successful_sets,
            failed_entity_sets=failed_sets,
            date_from=date_from,
            date_to=date_to,
            target_id=safe_target,
            limit_per_set=resolved_limit,
            records_read=records_read,
            entities_written=entities_written,
            links_written=links_written,
            checkpoints_updated=checkpoints_updated,
        )

    def list_recent_runs(self, limit: int = 20) -> list[dict[str, Any]]:
        """Return the store's most recent refresh runs, at most ``limit`` of them."""
        return self.store.list_recent_runs(limit=limit)

    def store_stats(self) -> dict[str, Any]:
        """Return the statistics reported by the store."""
        return self.store.store_stats()