first commit

2026-05-25 01:12:43 +03:00
commit bfc22efe24
83 changed files with 8903 additions and 0 deletions
@@ -0,0 +1,108 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import urljoin
+
+
+# Runs of spaces and tabs (newlines excluded); normalize_text collapses each
+# run into a single space.
+WHITESPACE_RE = re.compile(r"[ \t]+")
+# Three or more consecutive newlines; normalize_text shrinks them to one
+# blank line.
+NEWLINES_RE = re.compile(r"\n{3,}")
+
+# Lowercase Russian month names in the genitive case, as written inside dates
+# such as "5 мая 2024", mapped to month numbers. Used by parse_russian_date.
+RUSSIAN_MONTHS = {
+    "января": 1,
+    "февраля": 2,
+    "марта": 3,
+    "апреля": 4,
+    "мая": 5,
+    "июня": 6,
+    "июля": 7,
+    "августа": 8,
+    "сентября": 9,
+    "октября": 10,
+    "ноября": 11,
+    "декабря": 12,
+}
+
+
+def ensure_dir(path: Path) -> Path:
+    """Create ``path`` as a directory if it does not exist yet.
+
+    Missing parent directories are created as well, and an already existing
+    directory is not treated as an error, so the call can be repeated.
+
+    Returns:
+        The same ``path`` object that was passed in.
+    """
+    path.mkdir(exist_ok=True, parents=True)
+    return path
+
+
+def read_json(path: Path, default: Any = None) -> Any:
+    """Load and parse a UTF-8 JSON file, falling back to ``default``.
+
+    Args:
+        path: Location of the JSON file.
+        default: Value returned when the file does not exist.
+
+    Returns:
+        The parsed JSON document, or ``default`` when ``path`` is missing.
+
+    Raises:
+        json.JSONDecodeError: If the file exists but is not valid JSON.
+    """
+    # Read first and handle absence afterwards: a separate exists() check
+    # leaves a window in which the file can vanish before it is opened.
+    try:
+        raw = path.read_text(encoding="utf-8")
+    except (FileNotFoundError, NotADirectoryError):
+        # NotADirectoryError covers a path whose parent is a regular file,
+        # which the previous exists() check also reported as "missing".
+        return default
+    return json.loads(raw)
+
+
+def write_json(path: Path, payload: Any) -> None:
+    """Write ``payload`` to ``path`` as indented UTF-8 JSON.
+
+    The parent directory is created first. Non-ASCII characters are stored
+    verbatim instead of being escaped, output is indented by two spaces, and
+    dictionary keys are not sorted.
+    """
+    ensure_dir(path.parent)
+    # The document is fully serialized before the file is opened, so a
+    # payload that cannot be encoded does not truncate an existing file.
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=False)
+    path.write_text(serialized, encoding="utf-8")
+
+
+def sha256_text(value: str) -> str:
+    """Return the SHA-256 digest of ``value`` as a hexadecimal string.
+
+    The text is encoded as UTF-8 before it is hashed.
+    """
+    hasher = hashlib.sha256()
+    hasher.update(value.encode("utf-8"))
+    return hasher.hexdigest()
+
+
+def normalize_text(value: str) -> str:
+    """Normalize spacing and blank lines in ``value``.
+
+    Steps, in order: non-breaking spaces become regular spaces, runs of
+    spaces and tabs collapse into one space, spaces around line breaks are
+    removed, three or more consecutive newlines shrink to two, and leading
+    and trailing whitespace is stripped.
+    """
+    without_nbsp = value.replace("\xa0", " ")
+    single_spaced = WHITESPACE_RE.sub(" ", without_nbsp)
+    # Tabs were already turned into spaces above, so matching plain spaces
+    # is enough to trim both sides of every line break.
+    trimmed_lines = re.sub(r" *\n *", "\n", single_spaced)
+    return NEWLINES_RE.sub("\n\n", trimmed_lines).strip()
+
+
+def slugify(value: str) -> str:
+    """Turn ``value`` into a lowercase, hyphen-separated ASCII slug.
+
+    Every run of characters other than ASCII letters and digits becomes a
+    single hyphen, and hyphens at either end are removed. Text with no ASCII
+    letters or digits at all (for example a purely Cyrillic title) yields the
+    fallback slug ``"document"``.
+    """
+    lowered = value.lower()
+    hyphenated = re.sub(r"[^a-zA-Z0-9]+", "-", lowered)
+    trimmed = hyphenated.strip("-")
+    if not trimmed:
+        return "document"
+    return trimmed
+
+
+def to_absolute_url(url: str, base_url: str) -> str:
+    """Resolve ``url`` against ``base_url`` and return the combined URL.
+
+    Resolution is delegated to ``urllib.parse.urljoin``.
+    """
+    # urljoin takes the base first, the reverse of this function's signature.
+    resolved = urljoin(base_url, url)
+    return resolved
+
+
+def parse_russian_date(value: str) -> str | None:
+    """Extract the first ``<day> <month name> <year>`` date from Russian text.
+
+    Matching is case-insensitive. The month word must be one of the keys of
+    ``RUSSIAN_MONTHS`` (for example ``"5 мая 2024"``).
+
+    Args:
+        value: Text that may contain a date written out in Russian.
+
+    Returns:
+        The date in ISO format (``YYYY-MM-DD``), or ``None`` when no such
+        pattern is found, the word is not a known month name, or the numbers
+        do not form a real calendar date (for example ``"31 февраля 2024"``).
+    """
+    match = re.search(r"(\d{1,2})\s+([а-я]+)\s+(\d{4})", value.lower())
+    if match is None:
+        return None
+
+    day_text, month_name, year_text = match.groups()
+    month = RUSSIAN_MONTHS.get(month_name)
+    if month is None:
+        return None
+
+    try:
+        parsed = datetime(int(year_text), month, int(day_text))
+    except ValueError:
+        # Out-of-range day or year (day 0, "31 февраля", year 0000): report
+        # "no date" like any other unparseable input instead of raising.
+        return None
+    return parsed.date().isoformat()
+
+
+def chunk_paragraphs(
+    paragraphs: list[str], max_chars: int = 4500, overlap_paragraphs: int = 1
+) -> list[str]:
+    """Group paragraphs into chunks that stay within ``max_chars``.
+
+    Paragraphs are joined with a blank line. When the next paragraph would
+    push the running chunk past the budget, the chunk is closed and up to
+    ``overlap_paragraphs`` of its trailing paragraphs are repeated at the
+    start of the next chunk.
+
+    The overlap is best-effort: carried paragraphs are dropped, oldest first,
+    until they fit next to the incoming paragraph. This keeps chunks within
+    the budget even when ``overlap_paragraphs`` is large or paragraphs are
+    longer than half of ``max_chars``. A single paragraph longer than
+    ``max_chars`` is never split and forms an oversized chunk of its own.
+
+    Args:
+        paragraphs: Paragraph texts in document order.
+        max_chars: Size budget per chunk; every paragraph is counted with two
+            extra characters for its separator.
+        overlap_paragraphs: Maximum number of trailing paragraphs repeated in
+            the following chunk. Zero or a negative value disables overlap.
+
+    Returns:
+        Stripped, non-empty chunk strings in document order.
+    """
+    if not paragraphs:
+        return []
+
+    # A negative count would otherwise slice from the wrong end of the chunk.
+    overlap = max(overlap_paragraphs, 0)
+
+    chunks: list[str] = []
+    current: list[str] = []
+    # Budgeted size of ``current``: each paragraph counts its length plus 2
+    # for the "\n\n" separator (slightly conservative for the last one).
+    current_len = 0
+
+    for paragraph in paragraphs:
+        cost = len(paragraph) + 2
+        if current and current_len + cost > max_chars:
+            chunks.append("\n\n".join(current).strip())
+
+            # Index where the overlap window starts inside the closed chunk.
+            start = max(len(current) - overlap, 0)
+            carried_len = sum(len(item) + 2 for item in current[start:])
+            # Shrink the window from its oldest side until the incoming
+            # paragraph fits. The closed chunk is therefore never carried
+            # over in full, so chunks cannot keep growing.
+            while start < len(current) and carried_len + cost > max_chars:
+                carried_len -= len(current[start]) + 2
+                start += 1
+            current = current[start:]
+            current_len = carried_len
+
+        current.append(paragraph)
+        current_len += cost
+
+    if current:
+        chunks.append("\n\n".join(current).strip())
+
+    # Whitespace-only paragraphs can leave an empty chunk after stripping.
+    return [chunk for chunk in chunks if chunk]