69 lines
2.2 KiB
Python
69 lines
2.2 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from datetime import date, datetime, timezone
|
||
|
|
|
||
|
|
from parser.normalizer import build_chunks
|
||
|
|
from shared import ORM
|
||
|
|
|
||
|
|
|
||
|
|
def parse_iso_date(value: str | None) -> date | None:
    """Convert an ISO-formatted date string into a ``date``.

    Args:
        value: Text accepted by ``date.fromisoformat`` (e.g. ``"2024-01-15"``),
            or a falsy value (``None`` / empty string) when the date is absent.

    Returns:
        The parsed ``date``, or ``None`` when *value* is falsy.

    Raises:
        ValueError: Propagated from ``date.fromisoformat`` when *value* is a
            non-empty string that is not a valid ISO date.
    """
    # Falsy covers both None and "", so empty strings never reach the parser.
    return date.fromisoformat(value) if value else None
|
||
|
|
|
||
|
|
|
||
|
|
async def ingest_documents(
    orm: ORM | None, normalized_documents: list[dict], dry_run: bool = False
) -> list[dict]:
    """Persist normalized documents and their chunks, one result per document.

    For each document a law-source payload is assembled and its chunks are
    produced with ``build_chunks``. In dry-run mode nothing is written and the
    ORM is never touched. Otherwise the source is upserted and its chunks are
    written only when the source was newly created or currently has no stored
    chunks.

    Args:
        orm: ORM used for persistence; may be ``None`` only when *dry_run* is
            true.
        normalized_documents: Document dicts. Each must provide the keys read
            below (``title``, ``source_type``, ``law_type``,
            ``document_number``, ``adoption_date``, ``publication_date``,
            ``effective_date``, ``source_url``, ``version_hash``, ``key``).
        dry_run: When true, only report what would be ingested.

    Returns:
        A list of dicts with ``document_key``, ``status`` and ``chunk_count``.
        ``status`` is ``"dry-run"``, ``"updated"`` or ``"skipped"``; entries
        from a real run also carry ``source_id``.

    Raises:
        RuntimeError: If *orm* is ``None`` while *dry_run* is false and there
            is at least one document to process.
    """
    outcomes = []

    for doc in normalized_documents:
        # Assembled before the dry-run check, so missing keys and malformed
        # dates surface in dry-run mode as well.
        law_source_fields = {
            "title": doc["title"],
            "source_type": doc["source_type"],
            "jurisdiction": "RU",
            "law_type": doc["law_type"],
            "document_number": doc["document_number"],
            "adoption_date": parse_iso_date(doc["adoption_date"]),
            "publication_date": parse_iso_date(doc["publication_date"]),
            "effective_date": parse_iso_date(doc["effective_date"]),
            "source_url": doc["source_url"],
            "official_publication_number": None,
            "version_hash": doc["version_hash"],
            "is_active": True,
            "loaded_at": datetime.now(timezone.utc),
        }
        doc_chunks = build_chunks(doc)

        if dry_run:
            preview = {
                "document_key": doc["key"],
                "status": "dry-run",
                "chunk_count": len(doc_chunks),
            }
            outcomes.append(preview)
            continue

        if orm is None:
            raise RuntimeError("ORM instance is required when dry_run is disabled")

        source, created = await orm.upsert_law_source(law_source_fields)

        if created:
            # A freshly created source always receives its chunks.
            write_chunks = created
        else:
            # An existing source is only (re)filled when it has no chunks yet.
            stored_chunks = await orm.get_chunks_count_by_source(source.id)
            write_chunks = stored_chunks == 0

        if write_chunks:
            await orm.replace_law_chunks(source.id, doc_chunks)

        record = {
            "document_key": doc["key"],
            "status": "updated" if write_chunks else "skipped",
            "chunk_count": len(doc_chunks),
            "source_id": source.id,
        }
        outcomes.append(record)

    return outcomes
|