109 lines
2.7 KiB
Python
109 lines
2.7 KiB
Python
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import hashlib
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
from datetime import datetime
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Any
|
|||
|
|
from urllib.parse import urljoin
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Runs of horizontal whitespace (spaces and tabs); newlines are deliberately excluded.
WHITESPACE_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines, i.e. more than one blank line in a row.
NEWLINES_RE = re.compile(r"\n{3,}")

# Russian month names in the genitive case (the form used in written dates),
# mapped to their 1-based month number. Built from the calendar-ordered tuple
# so the numbering cannot drift from the order of the names.
RUSSIAN_MONTHS = {
    name: number
    for number, name in enumerate(
        (
            "января",
            "февраля",
            "марта",
            "апреля",
            "мая",
            "июня",
            "июля",
            "августа",
            "сентября",
            "октября",
            "ноября",
            "декабря",
        ),
        start=1,
    )
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def ensure_dir(path: Path) -> Path:
    """Make sure ``path`` exists as a directory and hand it back.

    Missing parent directories are created as well, and an already existing
    directory is not an error, so the call can be repeated safely.

    Args:
        path: Directory to create.

    Returns:
        The same ``path`` object that was passed in.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
|
|||
|
|
|
|||
|
|
|
|||
|
|
def read_json(path: Path, default: Any = None) -> Any:
    """Load and decode a UTF-8 JSON file, or return ``default`` if it is absent.

    The file is read directly instead of being checked with ``exists()``
    first, so a file that disappears between the check and the read can no
    longer turn a "missing file" into an unexpected exception.

    Args:
        path: Location of the JSON file.
        default: Value returned when the file does not exist.

    Returns:
        The decoded JSON value, or ``default`` when there is no file at
        ``path``.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        raw = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # NotADirectoryError covers a path whose parent component is a regular
        # file; ``Path.exists()`` reports such a path as missing too.
        return default
    return json.loads(raw)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def write_json(path: Path, payload: Any) -> None:
    """Serialize ``payload`` to ``path`` as human-readable UTF-8 JSON.

    The parent directory is created first if needed. Output is indented by
    two spaces, keeps non-ASCII characters as-is and preserves key order.

    Args:
        path: Destination file; overwritten if it already exists.
        payload: JSON-serializable value to store.
    """
    ensure_dir(path.parent)
    serialized = json.dumps(
        payload,
        indent=2,
        sort_keys=False,
        ensure_ascii=False,
    )
    path.write_text(serialized, encoding="utf-8")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def sha256_text(value: str) -> str:
    """Return the SHA-256 digest of ``value`` as a lowercase hex string.

    The text is encoded as UTF-8 before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(value.encode("utf-8"))
    return hasher.hexdigest()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def normalize_text(value: str) -> str:
    """Collapse redundant whitespace in ``value`` while keeping paragraph breaks.

    Steps, in order:
      1. non-breaking spaces become ordinary spaces;
      2. runs of spaces/tabs shrink to a single space;
      3. spaces hugging a newline on either side are removed;
      4. three or more newlines in a row shrink to exactly two;
      5. leading and trailing whitespace is stripped.
    """
    without_nbsp = value.replace("\xa0", " ")
    single_spaced = WHITESPACE_RE.sub(" ", without_nbsp)
    trimmed_lines = re.sub(r" *\n *", "\n", single_spaced)
    return NEWLINES_RE.sub("\n\n", trimmed_lines).strip()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def slugify(value: str) -> str:
    """Turn ``value`` into a lowercase, dash-separated ASCII slug.

    Every run of characters other than ASCII letters and digits becomes a
    single ``-``; leading and trailing dashes are dropped. If nothing is left
    (for example, the input has no ASCII letters or digits), the fallback
    slug ``"document"`` is returned.
    """
    lowered = value.lower()
    dashed = re.sub(r"[^a-zA-Z0-9]+", "-", lowered)
    trimmed = dashed.strip("-")
    if not trimmed:
        return "document"
    return trimmed
|
|||
|
|
|
|||
|
|
|
|||
|
|
def to_absolute_url(url: str, base_url: str) -> str:
    """Resolve ``url`` against ``base_url`` and return the combined URL.

    Resolution is delegated to ``urllib.parse.urljoin``, so an already
    absolute ``url`` is returned as-is and a relative one is interpreted
    relative to ``base_url``.
    """
    resolved = urljoin(base=base_url, url=url)
    return resolved
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_russian_date(value: str) -> str | None:
    """Extract the first ``<day> <month name> <year>`` date from Russian text.

    The text is lowercased and searched for a one- or two-digit day, a
    lowercase Cyrillic word and a four-digit year separated by whitespace.
    The word is looked up in ``RUSSIAN_MONTHS``.

    Args:
        value: Free-form text that may contain a date such as
            ``"12 марта 2021"``.

    Returns:
        The date in ISO format (``YYYY-MM-DD``), or ``None`` when no such
        pattern is found, the word is not a known month name, or the
        numbers do not form a real calendar date (e.g. ``31 февраля``).
    """
    match = re.search(r"(\d{1,2})\s+([а-я]+)\s+(\d{4})", value.lower())
    if not match:
        return None

    day_text, month_name, year_text = match.groups()
    month = RUSSIAN_MONTHS.get(month_name)
    if month is None:
        return None

    try:
        parsed = datetime(int(year_text), month, int(day_text))
    except ValueError:
        # The pattern matched but the values are out of range (day 0 or 31
        # in a short month, year 0000, ...): report "no date" the same way
        # as every other unparseable input instead of raising.
        return None
    return parsed.date().isoformat()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def chunk_paragraphs(
    paragraphs: list[str], max_chars: int = 4500, overlap_paragraphs: int = 1
) -> list[str]:
    """Group paragraphs into chunks of at most ``max_chars`` characters.

    Paragraphs are joined with a blank line (``"\\n\\n"``). When the next
    paragraph would push the current chunk over the budget, the chunk is
    emitted and the last ``overlap_paragraphs`` paragraphs are carried into
    the next chunk to provide context.

    Carried paragraphs are dropped, oldest first, whenever keeping them
    together with the next paragraph would exceed ``max_chars``. Without this
    trimming, large paragraphs produce chunks well over the budget, and an
    overlap at least as large as the chunk makes every chunk contain all the
    previous ones.

    Args:
        paragraphs: Paragraph texts in document order.
        max_chars: Character budget per chunk. A single paragraph longer than
            the budget is still emitted as its own oversized chunk, since
            paragraphs are never split.
        overlap_paragraphs: How many trailing paragraphs of a finished chunk
            to repeat at the start of the next one. Values below zero are
            treated as zero.

    Returns:
        The non-empty chunks, each stripped of surrounding whitespace.
    """
    if not paragraphs:
        return []

    separator = "\n\n"
    separator_len = len(separator)
    overlap = max(overlap_paragraphs, 0)

    chunks: list[str] = []
    current: list[str] = []
    # Budgeted size of ``current``: every paragraph is counted together with
    # one separator, which slightly over-estimates the joined length.
    current_len = 0

    for paragraph in paragraphs:
        cost = len(paragraph) + separator_len
        if current and current_len + cost > max_chars:
            chunks.append(separator.join(current).strip())

            carried = current[-overlap:] if overlap else []
            carried_len = sum(len(item) + separator_len for item in carried)
            # Overlap is only context: never let it crowd out the budget
            # needed by the paragraph that actually has to go in.
            while carried and carried_len + cost > max_chars:
                carried_len -= len(carried[0]) + separator_len
                carried = carried[1:]
            current, current_len = carried, carried_len

        current.append(paragraph)
        current_len += cost

    if current:
        chunks.append(separator.join(current).strip())

    return [chunk for chunk in chunks if chunk]
|