# Small text/file helpers: JSON file I/O, hashing, whitespace normalisation,
# slugs, URL resolution, Russian date parsing and paragraph chunking.
from __future__ import annotations

import hashlib
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import urljoin

# Runs of spaces/tabs (newlines are deliberately not included).
WHITESPACE_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines, i.e. more than one blank line.
NEWLINES_RE = re.compile(r"\n{3,}")

# Spaces directly around a newline.
_NEWLINE_PADDING_RE = re.compile(r" *\n *")
# Any run of characters that is not an ASCII letter or digit.
_NON_SLUG_RE = re.compile(r"[^a-zA-Z0-9]+")
# "<day> <month word> <4-digit year>" in lower-cased text.
_RUSSIAN_DATE_RE = re.compile(r"(\d{1,2})\s+([а-я]+)\s+(\d{4})")

# Separator placed between paragraphs inside a chunk.
_PARAGRAPH_SEPARATOR = "\n\n"

# Genitive-case Russian month names (as written in dates) -> month number.
RUSSIAN_MONTHS = {
    "января": 1,
    "февраля": 2,
    "марта": 3,
    "апреля": 4,
    "мая": 5,
    "июня": 6,
    "июля": 7,
    "августа": 8,
    "сентября": 9,
    "октября": 10,
    "ноября": 11,
    "декабря": 12,
}


def ensure_dir(path: Path) -> Path:
    """Create *path* (and any missing parents) if needed and return it."""
    path.mkdir(parents=True, exist_ok=True)
    return path


def read_json(path: Path, default: Any = None) -> Any:
    """Return the parsed JSON content of *path*, or *default* if it is missing.

    The file is read as UTF-8. Malformed JSON raises ``json.JSONDecodeError``;
    only a non-existent path falls back to *default*.
    """
    if not path.exists():
        return default
    return json.loads(path.read_text(encoding="utf-8"))


def write_json(path: Path, payload: Any) -> None:
    """Serialise *payload* as indented UTF-8 JSON into *path*.

    The parent directory is created first. Non-ASCII characters are written
    as-is (``ensure_ascii=False``) and key order is preserved.
    """
    ensure_dir(path.parent)
    path.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=False),
        encoding="utf-8",
    )


def sha256_text(value: str) -> str:
    """Return the hex SHA-256 digest of *value* encoded as UTF-8."""
    return hashlib.sha256(value.encode("utf-8")).hexdigest()


def normalize_text(value: str) -> str:
    """Collapse whitespace in *value* while keeping paragraph breaks.

    Non-breaking spaces become regular spaces, runs of spaces/tabs collapse to
    a single space, spaces around newlines are removed, three or more newlines
    collapse to exactly two, and the result is stripped.
    """
    cleaned = value.replace("\xa0", " ")
    cleaned = WHITESPACE_RE.sub(" ", cleaned)
    cleaned = _NEWLINE_PADDING_RE.sub("\n", cleaned)
    cleaned = NEWLINES_RE.sub("\n\n", cleaned)
    return cleaned.strip()


def slugify(value: str) -> str:
    """Return a lower-case, hyphen-separated ASCII slug for *value*.

    Every run of characters other than ASCII letters and digits becomes a
    single hyphen, so non-ASCII text (for example Cyrillic) is dropped
    entirely. When nothing remains the fallback ``"document"`` is returned.
    """
    slug = _NON_SLUG_RE.sub("-", value.lower()).strip("-")
    return slug or "document"


def to_absolute_url(url: str, base_url: str) -> str:
    """Resolve *url* against *base_url* using ``urllib.parse.urljoin``."""
    return urljoin(base_url, url)


def parse_russian_date(value: str) -> str | None:
    """Extract the first ``<day> <month name> <year>`` date from *value*.

    Matching is case-insensitive and the month must be one of the keys of
    ``RUSSIAN_MONTHS``.

    Returns:
        The date in ISO format (``YYYY-MM-DD``), or ``None`` when no such
        pattern is present, the month word is unknown, or the numbers do not
        form a real calendar date (e.g. "31 февраля 2020").
    """
    match = _RUSSIAN_DATE_RE.search(value.lower())
    if not match:
        return None

    day_text, month_name, year_text = match.groups()
    month = RUSSIAN_MONTHS.get(month_name)
    if month is None:
        return None

    try:
        parsed = datetime(int(year_text), month, int(day_text))
    except ValueError:
        # Day/year out of range for the calendar: treat as "no date" instead
        # of crashing, consistent with the ``str | None`` return contract.
        return None
    return parsed.date().isoformat()


def chunk_paragraphs(
    paragraphs: list[str], max_chars: int = 4500, overlap_paragraphs: int = 1
) -> list[str]:
    """Group *paragraphs* into chunks of roughly at most *max_chars* characters.

    Paragraphs are joined with a blank line. A chunk is closed as soon as
    adding the next paragraph (plus one separator) would exceed *max_chars*.
    The following chunk then starts with up to *overlap_paragraphs* trailing
    paragraphs of the previous one, as long as they still leave room for the
    next paragraph; overlap that does not fit is dropped, oldest first.

    A single paragraph longer than *max_chars* is never split; it is emitted
    as its own oversized chunk.

    Args:
        paragraphs: Paragraph texts in document order.
        max_chars: Soft upper bound for the length of each chunk.
        overlap_paragraphs: Number of trailing paragraphs to repeat at the
            start of the next chunk. Zero or negative disables overlap.

    Returns:
        The non-empty chunks, each stripped of surrounding whitespace.
    """
    if not paragraphs:
        return []

    sep_len = len(_PARAGRAPH_SEPARATOR)
    chunks: list[str] = []
    current: list[str] = []
    # Every paragraph is charged its length plus one separator.
    current_len = 0

    for paragraph in paragraphs:
        paragraph_len = len(paragraph)

        if current and current_len + paragraph_len + sep_len > max_chars:
            chunks.append(_PARAGRAPH_SEPARATOR.join(current).strip())

            # Seed the next chunk with the tail of the one just closed.
            current = current[-overlap_paragraphs:] if overlap_paragraphs > 0 else []
            current_len = sum(len(item) + sep_len for item in current)

            # The overlap only repeats text that was already emitted, so it is
            # safe to shed it when it would push the new chunk over the limit.
            while current and current_len + paragraph_len + sep_len > max_chars:
                current_len -= len(current.pop(0)) + sep_len

        current.append(paragraph)
        current_len += paragraph_len + sep_len

    if current:
        chunks.append(_PARAGRAPH_SEPARATOR.join(current).strip())

    return [chunk for chunk in chunks if chunk]