Files
2026-05-25 01:12:43 +03:00

109 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import hashlib
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import urljoin
# Runs of spaces and tabs (newlines are deliberately not included).
WHITESPACE_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines, i.e. more than one blank line.
NEWLINES_RE = re.compile(r"\n{3,}")

# Russian month names in the genitive case (as written in dates),
# listed in calendar order so the position gives the month number.
_RUSSIAN_MONTH_NAMES = (
    "января",
    "февраля",
    "марта",
    "апреля",
    "мая",
    "июня",
    "июля",
    "августа",
    "сентября",
    "октября",
    "ноября",
    "декабря",
)
# Lower-case month name -> month number (1-12).
RUSSIAN_MONTHS = {
    name: number for number, name in enumerate(_RUSSIAN_MONTH_NAMES, start=1)
}
def ensure_dir(path: Path) -> Path:
    """Create *path* as a directory and hand the same path back.

    Missing parent directories are created as well, and an already existing
    directory is not treated as an error.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
def read_json(path: Path, default: Any = None) -> Any:
    """Load and return the JSON document stored at *path*.

    Args:
        path: File to read; its contents are decoded as UTF-8.
        default: Value returned when the file does not exist.

    Returns:
        The parsed JSON value, or *default* if there is no file at *path*.

    Raises:
        json.JSONDecodeError: If the file exists but does not hold valid JSON.
    """
    # Read first and deal with absence afterwards: a separate exists() check
    # leaves a window in which the file can disappear before it is read.
    try:
        raw = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # NotADirectoryError covers a path whose parent component is a regular
        # file, which exists() also reports as "not there".
        return default
    return json.loads(raw)
def write_json(path: Path, payload: Any) -> None:
    """Serialize *payload* to JSON and store it at *path*.

    The parent directory is created through ``ensure_dir`` before writing.
    The output is UTF-8 text with non-ASCII characters written literally,
    two-space indentation, and keys kept in their existing order.
    """
    ensure_dir(path.parent)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=False)
    path.write_text(serialized, encoding="utf-8")
def sha256_text(value: str) -> str:
    """Return the hexadecimal SHA-256 digest of *value* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(value.encode("utf-8"))
    return hasher.hexdigest()
def normalize_text(value: str) -> str:
    """Tidy the whitespace in *value* and return the result.

    Steps, in order: non-breaking spaces become ordinary spaces; runs of
    spaces/tabs collapse to a single space; spaces touching a newline are
    removed; three or more consecutive newlines shrink to two; leading and
    trailing whitespace is stripped.
    """
    text = WHITESPACE_RE.sub(" ", value.replace("\xa0", " "))
    # Must run after the collapse above so tabs next to a newline are
    # already plain spaces.
    text = re.sub(r" *\n *", "\n", text)
    return NEWLINES_RE.sub("\n\n", text).strip()
def slugify(value: str) -> str:
    """Turn *value* into a lower-case, dash-separated ASCII slug.

    Every run of characters other than ASCII letters and digits becomes a
    single dash, and dashes at either end are dropped. If nothing remains,
    the fallback ``"document"`` is returned.
    """
    dashed = re.sub(r"[^a-zA-Z0-9]+", "-", value.lower())
    trimmed = dashed.strip("-")
    if not trimmed:
        return "document"
    return trimmed
def to_absolute_url(url: str, base_url: str) -> str:
    """Resolve *url* against *base_url* using ``urllib.parse.urljoin``."""
    resolved = urljoin(base=base_url, url=url)
    return resolved
def parse_russian_date(value: str) -> str | None:
    """Extract the first "<day> <month name> <year>" date found in *value*.

    The text is lower-cased before matching. The month name is looked up in
    ``RUSSIAN_MONTHS``.

    Args:
        value: Text that may contain a date such as "5 мая 2024".

    Returns:
        The date in ISO format (``YYYY-MM-DD``), or ``None`` when no date
        pattern is found, the month name is unknown, or the day/month/year
        combination is not a real calendar date.
    """
    match = re.search(r"(\d{1,2})\s+([а-я]+)\s+(\d{4})", value.lower())
    if match is None:
        return None

    day_text, month_name, year_text = match.groups()
    month = RUSSIAN_MONTHS.get(month_name)
    if month is None:
        return None

    try:
        parsed = datetime(int(year_text), month, int(day_text))
    except ValueError:
        # The pattern accepts impossible dates (day 0, "31 февраля",
        # year 0000); report them as unparseable like every other bad input
        # instead of letting the constructor's error escape.
        return None
    return parsed.date().isoformat()
def chunk_paragraphs(
    paragraphs: list[str], max_chars: int = 4500, overlap_paragraphs: int = 1
) -> list[str]:
    """Group *paragraphs* into chunks joined by blank lines.

    Paragraphs are accumulated until adding the next one would push the
    running size (each paragraph counted as its length plus two for the
    separator) past *max_chars*. The accumulated chunk is then emitted and the
    next chunk starts with the last *overlap_paragraphs* paragraphs of the
    previous one, followed by the new paragraph. A paragraph is never split,
    so a chunk can be longer than *max_chars*.

    Args:
        paragraphs: Paragraph texts in document order.
        max_chars: Size budget used to decide when to start a new chunk.
        overlap_paragraphs: How many trailing paragraphs to repeat at the
            start of the next chunk; ``0`` disables the overlap.

    Returns:
        The stripped chunk strings, with empty chunks left out.
    """
    if not paragraphs:
        return []

    separator = "\n\n"
    separator_len = len(separator)
    finished: list[str] = []
    window: list[str] = []
    window_size = 0

    for text in paragraphs:
        cost = len(text) + separator_len
        if window and window_size + cost > max_chars:
            finished.append(separator.join(window).strip())
            # Carry the tail of the emitted chunk over as context.
            window = window[-overlap_paragraphs:] if overlap_paragraphs else []
            window_size = sum(len(kept) + separator_len for kept in window)
        window.append(text)
        window_size += cost

    if window:
        finished.append(separator.join(window).strip())
    return [piece for piece in finished if piece]