"""Export the "2020 export" architecture package for manual analysis.

Source file: NODEDC_1C/scripts/export_arch_2020_package.py

Reads the June 2020 pre-report snapshot from ``logs/`` and writes Markdown
summaries and JSON sample files into ``docs/ARCH/2020экспорт``.

Note: string literals in this file intentionally contain non-ASCII (Cyrillic)
characters; some viewers flag them as ambiguous Unicode.
"""

from __future__ import annotations
import json
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
import sys
from typing import Any
# Parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Make PROJECT_ROOT importable before the project import below
# (presumably canonical_layer lives directly under it — confirm).
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from canonical_layer.mappers import canonical_relation_rule_catalog
# Prefer the semantic_v2 snapshot; switch to the plain snapshot name when the
# semantic_v2 file does not exist (the fallback itself is not checked).
SNAPSHOT_PATH = PROJECT_ROOT / "logs" / "pre_report_snapshot_2020_2020-06_semantic_v2.json"
if not SNAPSHOT_PATH.exists():
    SNAPSHOT_PATH = PROJECT_ROOT / "logs" / "pre_report_snapshot_2020_2020-06.json"
# Directory that receives every file written by main().
OUTPUT_DIR = PROJECT_ROOT / "docs" / "ARCH" / "2020экспорт"
# Canonical class names rendered as a bullet list in the ontology report.
# The labels returned by classify_entity_set() are drawn from this same set.
CANONICAL_CLASSES = [
    "CanonicalEntity",
    "Organization",
    "Counterparty",
    "Contract",
    "Account",
    "Subconto",
    "ResponsiblePerson",
    "Currency",
    "Warehouse",
    "CashflowArticle",
    "Department",
    "Individual",
    "Item",
    "BankAccount",
    "Document",
    "InvoiceDocument",
    "Posting",
    "RegisterMovement",
    "RegisterRecord",
    "Period",
]
def load_snapshot(path: Path) -> dict[str, Any]:
    """Parse the UTF-8 encoded JSON file at *path* and return its content.

    Raises whatever opening the file or ``json`` parsing raises (for example
    ``FileNotFoundError`` or ``json.JSONDecodeError``).
    """
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
def low(value: Any) -> str:
    """Return *value* as trimmed, lower-cased text for comparisons.

    Any falsy value (``None``, ``""``, ``0``, empty containers) yields ``""``.
    """
    if not value:
        return ""
    text = str(value)
    return text.strip().lower()
def has_any_token(text: str, tokens: list[str]) -> bool:
    """Tell whether any of *tokens* is a substring of the normalised *text*.

    *text* is normalised with ``low``; the tokens are used as given, so they
    must already be lower-case to match.
    """
    haystack = low(text)
    for needle in tokens:
        if needle in haystack:
            return True
    return False
def short_record(record: dict[str, Any], *, include_links: bool = True) -> dict[str, Any]:
    """Project *record* onto the fields used in the exported samples.

    The result always has ``source_entity``, ``source_id``, ``display_name``
    (``None`` when absent) and ``attributes`` (``{}`` when absent). ``links``
    (``[]`` when absent) is added only when *include_links* is true.
    """
    summary: dict[str, Any] = {
        field: record.get(field) for field in ("source_entity", "source_id", "display_name")
    }
    summary["attributes"] = record.get("attributes", {})
    if not include_links:
        return summary
    summary["links"] = record.get("links", [])
    return summary
def _md_cell(value: Any) -> str:
    """Render one table cell as single-line text with literal pipes escaped."""
    text = str(value)
    # A raw "|" would open an extra column and a line break would end the row,
    # so escape the former and flatten the latter.
    text = text.replace("|", "\\|")
    return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")


def to_md_table(headers: list[str], rows: list[list[Any]]) -> str:
    """Build a Markdown pipe table.

    Args:
        headers: Column titles; one ``---`` separator cell is emitted per title.
        rows: Table rows; every cell is converted with ``str``.

    Returns:
        The header line, the separator line and one line per row, joined with
        ``"\\n"`` (no trailing newline). Pipes inside cells are escaped as
        ``\\|`` and line breaks inside cells become spaces, so a cell can never
        break the table layout.
    """
    header_cells = [_md_cell(title) for title in headers]
    lines = [
        "| " + " | ".join(header_cells) + " |",
        "| " + " | ".join("---" for _ in header_cells) + " |",
    ]
    lines.extend("| " + " | ".join(_md_cell(cell) for cell in row) + " |" for row in rows)
    return "\n".join(lines)
# Ordered classification rules: (label, token groups). A rule matches when every
# group has at least one token occurring in the normalised entity-set name.
# Order is significant: e.g. "счетфактур" must be tried before the shorter
# "счет", and the cashflow-article token contains "движ".
# NOTE(review): "account" is also a substring of names such as
# "AccountingRegister_…", so those resolve to Account before the register rule
# is reached — confirm this is intended.
_ENTITY_SET_RULES = (
    ("InvoiceDocument", (("счетфактур", "invoice"),)),
    ("Document", (("документ", "document"),)),
    ("Counterparty", (("контраг", "counterparty"),)),
    ("Contract", (("договор", "contract"),)),
    ("BankAccount", (("банковск",), ("счет",))),
    ("Currency", (("валют", "currency"),)),
    ("Warehouse", (("склад", "warehouse"),)),
    ("Department", (("подраздел", "department"),)),
    ("Individual", (("физлиц", "individual"),)),
    ("Item", (("номенклатур", "item", "product"),)),
    ("ResponsiblePerson", (("ответствен", "employee", "user"),)),
    ("CashflowArticle", (("статьядвиженияденежныхсредств", "cashflow"),)),
    ("Account", (("счет", "account"),)),
    ("Subconto", (("субконто", "subconto"),)),
    ("RegisterMovement", (("движ", "movement"),)),
    ("Posting", (("провод", "posting"),)),
    ("RegisterRecord", (("регистр", "register"),)),
    ("Period", (("период", "period"),)),
    ("Organization", (("организ", "organization"),)),
)


def classify_entity_set(entity_set: str) -> str:
    """Map an entity-set name to a canonical class label by substring matching.

    The name is normalised with ``low`` and checked against the ordered rules;
    the first matching rule wins. Names matching no rule are classified as
    ``"CanonicalEntity"``.
    """
    name = low(entity_set)
    for label, token_groups in _ENTITY_SET_RULES:
        if all(any(token in name for token in group) for group in token_groups):
            return label
    return "CanonicalEntity"
def build_problem_fragment(items: list[dict[str, Any]], *, limit: int = 80) -> list[dict[str, Any]]:
    """Collect records that show at least one data-quality problem.

    Flags, in the order they are reported:
      * ``source_id_unknown`` - normalised ``source_id`` is empty or one of
        ``unknown`` / ``none`` / ``null``;
      * ``unknown_link_targets`` - some link has an empty or ``unknown``
        ``target_entity``;
      * ``zero_guid_present`` - an attribute value is the all-zero GUID;
      * ``navigation_links_present`` - an attribute key ends with
        ``@navigationLinkUrl``.

    The last two are evaluated only when ``attributes`` is a dict.

    Returns:
        The first *limit* problem records (slice semantics), each consisting of
        ``problem_flags``, ``unknown_link_count`` and the ``short_record``
        fields including links.
    """
    placeholder_ids = {"unknown", "", "none", "null"}
    untyped_targets = {"unknown", ""}
    zero_guid = "00000000-0000-0000-0000-000000000000"

    collected: list[dict[str, Any]] = []
    for record in items:
        attributes = record.get("attributes", {})
        record_links = record.get("links", [])
        normalized_id = low(record.get("source_id"))

        untyped_links = []
        for link in record_links:
            if low(link.get("target_entity")) in untyped_targets:
                untyped_links.append(link)

        found: list[str] = []
        if normalized_id in placeholder_ids:
            found.append("source_id_unknown")
        if untyped_links:
            found.append("unknown_link_targets")
        if isinstance(attributes, dict):
            if zero_guid in map(low, attributes.values()):
                found.append("zero_guid_present")
            if any(name.endswith("@navigationLinkUrl") for name in attributes):
                found.append("navigation_links_present")

        if not found:
            continue
        entry: dict[str, Any] = {
            "problem_flags": found,
            "unknown_link_count": len(untyped_links),
        }
        entry.update(short_record(record, include_links=True))
        collected.append(entry)
    return collected[:limit]
def filter_samples(items: list[dict[str, Any]], predicate) -> list[dict[str, Any]]:
    """Return the ``short_record`` form (links included) of every item accepted by *predicate*.

    *predicate* is called with each raw item; input order is preserved.
    """
    selected: list[dict[str, Any]] = []
    for record in items:
        if predicate(record):
            selected.append(short_record(record, include_links=True))
    return selected
def write_json(path: Path, payload: Any) -> None:
    """Serialise *payload* as 2-space indented JSON and write it to *path* as UTF-8.

    Non-ASCII characters are written as-is rather than ``\\u`` escaped. The
    payload is serialised completely before the file is opened.
    """
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(rendered)
def write_text(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8, replacing any existing content."""
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)
def main() -> int:
    """Generate the export package in ``OUTPUT_DIR`` and print a JSON summary.

    Steps:
      1. Load the snapshot from ``SNAPSHOT_PATH`` and compute entity-set and
         link statistics.
      2. Write ``01_ontology_mapping_layer.md`` and
         ``02_canonical_relation_rules.md``.
      3. Write the problem fragment (``03_…json``) and the sample files
         (``04_…json`` – ``09_…json``).
      4. Write ``00_manifest.md`` and print a summary listing every file that
         is present in ``OUTPUT_DIR``.

    Returns:
        ``0`` once all files are written. Errors (missing snapshot, invalid
        JSON, I/O failures) propagate as exceptions.
    """
    snapshot = load_snapshot(SNAPSHOT_PATH)
    items: list[dict[str, Any]] = snapshot.get("items", [])
    records_per_set: dict[str, int] = snapshot.get("records_per_entity_set", {})
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    entity_set_classification = {
        entity_set: classify_entity_set(entity_set) for entity_set in sorted(records_per_set.keys())
    }
    class_distribution = Counter(entity_set_classification.values())

    link_target_distribution: Counter[str] = Counter()
    relation_distribution: Counter[str] = Counter()
    unknown_relations = 0
    total_relations = 0
    unknown_source_ids = 0
    for row in items:
        if low(row.get("source_id")) in {"unknown", "", "none", "null"}:
            unknown_source_ids += 1
        for link in row.get("links", []):
            raw_target = link.get("target_entity")
            # Use the same notion of "unknown" as build_problem_fragment:
            # a missing, None, blank or "unknown" target all count as Unknown.
            # Otherwise this report and the problem fragment would disagree.
            is_unknown_target = low(raw_target) in {"unknown", ""}
            target_entity = "Unknown" if is_unknown_target else str(raw_target)
            relation = str(link.get("relation", "reference"))
            link_target_distribution[target_entity] += 1
            relation_distribution[relation] += 1
            total_relations += 1
            if is_unknown_target:
                unknown_relations += 1

    ontology_md = f"""# Текущая онтология / mapping-слой
Дата экспорта: {datetime.now(timezone.utc).isoformat()}
Источник snapshot: `{SNAPSHOT_PATH}`
## Что считается сущностями сейчас
Базовая модель (canonical classes):
{chr(10).join(f"- `{name}`" for name in CANONICAL_CLASSES)}
## Срез июня 2020: покрытие сущностей
- Отобранный период: `{snapshot.get("selected_window_key")}`
- Диапазон: `{snapshot.get("selected_window_start")} -> {snapshot.get("selected_window_end_exclusive")}`
- Записей в slice: `{snapshot.get("records_exported_total")}`
- Связей в slice: `{snapshot.get("links_exported_total")}`
- Entity sets: `{len(records_per_set)}`
- Записей с `source_id=unknown`: `{unknown_source_ids}`
### Распределение entity sets по canonical-классам
{to_md_table(["Canonical class", "Entity set count"], [[k, v] for k, v in sorted(class_distribution.items())])}
### Топ target_entity в links
{to_md_table(["target_entity", "count"], [[k, v] for k, v in link_target_distribution.most_common(15)])}
### Топ relation в links
{to_md_table(["relation", "count"], [[k, v] for k, v in relation_distribution.most_common(20)])}
### Качество типизации связей
- Всего связей: `{total_relations}`
- Связей с `target_entity=Unknown`: `{unknown_relations}`
- Доля unknown: `{round((unknown_relations / total_relations * 100.0), 2) if total_relations else 0.0}%`
"""
    write_text(OUTPUT_DIR / "01_ontology_mapping_layer.md", ontology_md)

    relation_rows = [
        [row["context"], row["role"], row["relation"]] for row in canonical_relation_rule_catalog()
    ]
    relation_rules_md = f"""# Текущие canonical relation rules
Источник: `canonical_layer/mappers.py`
## Текущий каталог semantic relations
{to_md_table(["Context", "Field role", "Relation"], relation_rows)}
## Базовые правила извлечения ссылок
1. Поле попадает в link, если это `_Key`, `*ref`, GUID или semantic-поле (например `Recorder`, `СчетФактура`).
2. `*_Type` используется как приоритетная подсказка типа target-сущности.
3. Нулевые GUID (`00000000-...`) отфильтровываются из canonical links.
4. Если `source_id` отсутствует, строится составной `cmp:<sha1>` ключ.
"""
    write_text(OUTPUT_DIR / "02_canonical_relation_rules.md", relation_rules_md)

    problem_fragment = build_problem_fragment(items, limit=80)
    write_json(
        OUTPUT_DIR / "03_snapshot_fragment_problem_cases.json",
        {
            "slice_window_key": snapshot.get("selected_window_key"),
            "notes": [
                "Фрагмент отобран по признакам: unknown source_id, unknown link targets, zero GUID, navigationLink присутствует.",
                "Это не весь snapshot, а проблемный срез для диагностики.",
            ],
            "records_total": len(problem_fragment),
            "records": problem_fragment,
        },
    )
    write_json(
        OUTPUT_DIR / "04_samples_SpisanieSRaschetnogoScheta.json",
        {
            "selector": "source_entity contains 'СписаниеСРасчетногоСчета' OR latin fallback",
            "records": filter_samples(
                items,
                lambda row: has_any_token(row.get("source_entity", ""), ["списаниесрасчетногосчета", "spisanie"]),
            )[:40],
        },
    )
    write_json(
        OUTPUT_DIR / "05_samples_RealizaciyaTovarovUslug.json",
        {
            "selector": "source_entity contains 'РеализацияТоваровУслуг' OR latin fallback",
            "records": filter_samples(
                items,
                lambda row: has_any_token(row.get("source_entity", ""), ["реализациятоваровуслуг", "realiz"]),
            )[:40],
        },
    )
    write_json(
        OUTPUT_DIR / "06_samples_PostuplenieTovarovUslug.json",
        {
            "selector": "source_entity contains 'ПоступлениеТоваровУслуг' OR latin fallback",
            "records": filter_samples(
                items,
                lambda row: has_any_token(row.get("source_entity", ""), ["поступлениетоваровуслуг", "postupl"]),
            )[:40],
        },
    )
    write_json(
        OUTPUT_DIR / "07_samples_DocumentJournals.json",
        {
            "selector": "source_entity startswith DocumentJournal_",
            "records": filter_samples(
                items,
                lambda row: str(row.get("source_entity", "")).startswith("DocumentJournal_"),
            )[:80],
        },
    )
    write_json(
        OUTPUT_DIR / "08_samples_NDS_registers.json",
        {
            "selector": "source_entity startswith AccumulationRegister_ and contains НДС",
            "records": filter_samples(
                items,
                lambda row: str(row.get("source_entity", "")).startswith("AccumulationRegister_")
                and "ндс" in low(row.get("source_entity", "")),
            )[:80],
        },
    )

    # Normalised attribute names treated as "key fields". Shared by the sample
    # selector and the occurrence statistics so the two cannot drift apart.
    key_field_names = frozenset(
        {
            "recorder",
            "ref",
            "ref_key",
            "поставщик_key",
            "покупатель_key",
            "ответственный_key",
        }
    )

    def key_fields_predicate(row: dict[str, Any]) -> bool:
        """Accept records whose attributes contain at least one key field."""
        attrs = row.get("attributes", {})
        if not isinstance(attrs, dict):
            return False
        return any(low(key) in key_field_names for key in attrs)

    key_field_records = filter_samples(items, key_fields_predicate)[:140]
    write_json(
        OUTPUT_DIR / "09_samples_key_fields_Recorder_Ref_Supplier_Buyer_Responsible.json",
        {
            "selector": "records where attributes contain any of Recorder, Ref/Ref_Key, Поставщик_Key, Покупатель_Key, Ответственный_Key",
            "records_total": len(key_field_records),
            "records": key_field_records,
        },
    )

    # Count occurrences under the attribute's original spelling, matching
    # case-insensitively.
    key_stats: Counter[str] = Counter()
    for row in items:
        attrs = row.get("attributes", {})
        if not isinstance(attrs, dict):
            continue
        for key in attrs:
            if low(key) in key_field_names:
                key_stats[key] += 1

    manifest_md = f"""# 2020 экспорт: состав выгрузки
Папка собрана автоматически для ручного анализа текущего состояния.
## Файлы
1. `01_ontology_mapping_layer.md` — текущая онтология/мэппинг и метрики среза.
2. `02_canonical_relation_rules.md` — правила построения canonical relations.
3. `03_snapshot_fragment_problem_cases.json` — проблемный фрагмент snapshot июня 2020.
4. `04_samples_SpisanieSRaschetnogoScheta.json` — реальные записи по `СписаниеСРасчетногоСчета`.
5. `05_samples_RealizaciyaTovarovUslug.json` — реальные записи по `РеализацияТоваровУслуг`.
6. `06_samples_PostuplenieTovarovUslug.json` — реальные записи по `ПоступлениеТоваровУслуг`.
7. `07_samples_DocumentJournals.json` — реальные записи по журналам документов.
8. `08_samples_NDS_registers.json` — реальные записи по НДС-регистрам.
9. `09_samples_key_fields_Recorder_Ref_Supplier_Buyer_Responsible.json` — записи с ключевыми полями.
## Ключевые поля: фактическая встречаемость в snapshot
{to_md_table(["field", "count"], [[k, v] for k, v in key_stats.most_common()] or [["(не найдено)", 0]])}
"""
    write_text(OUTPUT_DIR / "00_manifest.md", manifest_md)

    summary = {
        "status": "success",
        "output_dir": str(OUTPUT_DIR),
        "snapshot_path": str(SNAPSHOT_PATH),
        # Lists every file currently in OUTPUT_DIR, not only those written above.
        "files": sorted(path.name for path in OUTPUT_DIR.iterdir() if path.is_file()),
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
# Run the export only when executed as a script; the value returned by main()
# is passed to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())