from __future__ import annotations

from datetime import date, datetime, timezone

from parser.normalizer import build_chunks
from shared import ORM


def parse_iso_date(value: str | None) -> date | None:
    """Convert an ISO-format date string into a ``date``.

    Falsy input (``None`` or an empty string) yields ``None``. Any other
    value is handed to ``date.fromisoformat``, which raises ``ValueError``
    for text it cannot parse.
    """
    return date.fromisoformat(value) if value else None


def _build_source_payload(document: dict) -> dict:
    """Assemble the law-source record passed to ``orm.upsert_law_source``.

    Most fields are copied straight from ``document``; the three date fields
    go through ``parse_iso_date``. ``jurisdiction``,
    ``official_publication_number`` and ``is_active`` are fixed values, and
    ``loaded_at`` is the current timezone-aware UTC time.

    Raises:
        KeyError: if ``document`` lacks one of the keys read here.
        ValueError: if a non-empty date field is not a valid ISO date.
    """
    return {
        "title": document["title"],
        "source_type": document["source_type"],
        "jurisdiction": "RU",
        "law_type": document["law_type"],
        "document_number": document["document_number"],
        "adoption_date": parse_iso_date(document["adoption_date"]),
        "publication_date": parse_iso_date(document["publication_date"]),
        "effective_date": parse_iso_date(document["effective_date"]),
        "source_url": document["source_url"],
        "official_publication_number": None,
        "version_hash": document["version_hash"],
        "is_active": True,
        "loaded_at": datetime.now(timezone.utc),
    }


async def ingest_documents(
    orm: ORM | None, normalized_documents: list[dict], dry_run: bool = False
) -> list[dict]:
    """Upsert each normalized document as a law source and store its chunks.

    For every document the source payload and the chunk list are built
    first, so malformed input fails even in dry-run mode. In dry-run mode
    nothing is written and only a summary entry is recorded. Otherwise the
    source is upserted, and its chunks are written only when the source was
    just created or currently has zero stored chunks.

    Args:
        orm: Persistence layer; may be ``None`` only when ``dry_run`` is true.
        normalized_documents: Documents to ingest, as dicts.
        dry_run: When true, skip all ORM calls.

    Returns:
        One dict per document with ``document_key``, ``status``
        (``"dry-run"``, ``"updated"`` or ``"skipped"``) and ``chunk_count``;
        non-dry-run entries also carry ``source_id``.

    Raises:
        RuntimeError: if a document is processed with ``dry_run`` false and
            ``orm`` is ``None``.
    """
    report = []

    for doc in normalized_documents:
        payload = _build_source_payload(doc)
        chunk_list = build_chunks(doc)

        if dry_run:
            report.append(
                {
                    "document_key": doc["key"],
                    "status": "dry-run",
                    "chunk_count": len(chunk_list),
                }
            )
            continue

        # Checked per document, so an empty input list never raises.
        if orm is None:
            raise RuntimeError("ORM instance is required when dry_run is disabled")

        source, created = await orm.upsert_law_source(payload)

        if created:
            needs_chunks = True
        else:
            # A pre-existing source only gets chunks if it has none stored.
            stored_count = await orm.get_chunks_count_by_source(source.id)
            needs_chunks = stored_count == 0

        if needs_chunks:
            await orm.replace_law_chunks(source.id, chunk_list)
            status = "updated"
        else:
            status = "skipped"

        report.append(
            {
                "document_key": doc["key"],
                "status": status,
                "chunk_count": len(chunk_list),
                "source_id": source.id,
            }
        )

    return report