Files
LawBot/parser/normalizer.py
T
2026-05-25 01:12:43 +03:00

167 lines
6.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
from pathlib import Path
from bs4 import BeautifulSoup
from parser.config import NORMALIZED_ROOT
from parser.utils import (
chunk_paragraphs,
ensure_dir,
normalize_text,
parse_russian_date,
read_json,
write_json,
)
# Document-number pattern: a Latin "N", optional whitespace, then a captured
# run of digits, Cyrillic letters, hyphens and uppercase Latin letters.
# NOTE(review): the explicit "ФКЗ" appears to be already covered by the "А-Я"
# range, and the class contains visually ambiguous Cyrillic/Latin letters —
# confirm each character is the intended script before editing.
DOCUMENT_NUMBER_RE = re.compile(r"N\s*([0-9А-Яа-я\-ФКЗA-Z]+)")
def parse_root_metadata(root_html: str) -> dict:
    """Extract document-level metadata from the HTML of a document root page.

    The title text comes from ``.document-page__title h1`` and the body text
    from ``.document-page__content``. The document number and adoption date
    are searched in the body text; the title text is used for them only when
    the body text is empty. The effective date is searched in the title text.

    Returns:
        A dict with the keys ``title``, ``version_note``, ``document_number``,
        ``adoption_date``, ``effective_date`` and ``publication_date``
        (``publication_date`` is always ``None``).
    """
    soup = BeautifulSoup(root_html, "html.parser")

    def _text_of(selector: str) -> str:
        # Newline-joined, stripped text of the first match, or "" if absent.
        node = soup.select_one(selector)
        if not node:
            return ""
        return node.get_text("\n", strip=True)

    title_text = _text_of(".document-page__title h1")
    lookup_text = _text_of(".document-page__content") or title_text

    number_found = DOCUMENT_NUMBER_RE.search(lookup_text)
    document_number = number_found.group(1) if number_found else None

    adoption_date = parse_russian_date(lookup_text)

    effective_date = None
    effective_found = re.search(r"вступ\.\s*в силу с\s*(\d{2}\.\d{2}\.\d{4})", title_text)
    if effective_found:
        # "DD.MM.YYYY" -> "YYYY-MM-DD"
        effective_date = "-".join(reversed(effective_found.group(1).split(".")))

    # First non-blank title line is the title; the rest form the version note.
    title_lines = [piece for piece in map(str.strip, title_text.splitlines()) if piece]
    if title_lines:
        title, remainder = title_lines[0], title_lines[1:]
    else:
        title, remainder = title_text.strip(), []

    return {
        "title": title,
        "version_note": "\n".join(remainder).strip() or None,
        "document_number": document_number,
        "adoption_date": adoption_date,
        "effective_date": effective_date,
        "publication_date": None,
    }
def parse_article_page(article_html: str) -> list[str]:
    """Return the non-empty, normalized paragraph texts of an article page.

    Only ``<p>`` elements inside ``.document-page__content`` are considered.
    Inserts, edit marks, notes, title blocks and the first ``h1`` are removed
    from the content before paragraphs are collected. An empty list is
    returned when the content container is missing.
    """
    removable_selectors = (
        ".document__insert",
        ".document__edit",
        ".document-page__notes",
        ".document-page__title-link",
        ".document-page__title",
    )

    content = BeautifulSoup(article_html, "html.parser").select_one(".document-page__content")
    if content is None:
        return []

    # Each selector is re-queried after the previous removals, so nodes that
    # sat inside an already-removed subtree are not touched again.
    for css in removable_selectors:
        for unwanted in content.select(css):
            unwanted.decompose()

    first_heading = content.select_one("h1")
    if first_heading is not None:
        first_heading.decompose()

    normalized_texts = (
        normalize_text(paragraph.get_text("\n", strip=True))
        for paragraph in content.select("p")
    )
    return [text for text in normalized_texts if text]
def normalize_document(raw_payload: dict) -> dict:
    """Build the normalized representation of a downloaded document.

    Reads the root HTML file and every article HTML file referenced by
    ``raw_payload`` (relative to ``raw_payload["raw_dir"]``, articles under the
    ``articles`` subdirectory), parses them, and combines the parsed metadata
    with the fields carried over from the payload.

    Returns:
        A dict describing the document, with an ``articles`` list holding one
        entry per article (its reference fields, joined ``text`` and the list
        of ``paragraphs``).
    """
    base_dir = Path(raw_payload["raw_dir"])
    metadata = parse_root_metadata(
        (base_dir / raw_payload["root_file"]).read_text(encoding="utf-8")
    )

    required_fields = ("article_number", "article_title", "article_url")
    optional_fields = ("section_title", "chapter_title", "part_title")

    articles = []
    for ref in raw_payload["articles"]:
        page_html = (base_dir / "articles" / ref["file_name"]).read_text(encoding="utf-8")
        paragraphs = parse_article_page(page_html)

        entry = {name: ref[name] for name in required_fields}
        entry.update({name: ref.get(name) for name in optional_fields})
        entry["breadcrumb"] = ref.get("breadcrumb", [])
        entry["text"] = "\n\n".join(paragraphs).strip()
        entry["paragraphs"] = paragraphs
        articles.append(entry)

    metadata_fields = (
        "document_number",
        "adoption_date",
        "publication_date",
        "effective_date",
        "version_note",
    )
    payload_fields = (
        "version_hash",
        "law_type",
        "consultant_category",
        "source_short_name",
    )
    return {
        "key": raw_payload["key"],
        "title": metadata["title"],
        "source_url": raw_payload["source_url"],
        "source_type": "consultant_document",
        **{name: metadata[name] for name in metadata_fields},
        **{name: raw_payload[name] for name in payload_fields},
        "articles": articles,
    }
def build_chunks(normalized_document: dict) -> list[dict]:
    """Split every article of a normalized document into chunk records.

    Each article's paragraphs (or, when it has none, its full text) are passed
    to ``chunk_paragraphs``. If that yields nothing but the article has text,
    the whole text becomes a single chunk. ``chunk_index`` runs continuously
    across all articles of the document.

    Returns:
        A list of dicts with ``chunk_index``, ``article_number``,
        ``article_title``, ``chunk_text`` and a ``metadata`` dict.
    """
    chunks: list[dict] = []
    for article in normalized_document["articles"]:
        pieces = article["paragraphs"]
        if not pieces:
            pieces = [article["text"]] if article["text"] else []

        text_chunks = chunk_paragraphs(pieces)
        if not text_chunks and article["text"]:
            # Fall back to the whole article text as one chunk.
            text_chunks = [article["text"]]

        for position, chunk_text in enumerate(text_chunks, start=len(chunks)):
            # The metadata dict is built per chunk so records never share it.
            chunk_metadata = {
                "source_title": normalized_document["title"],
                "source_short_name": normalized_document["source_short_name"],
                "consultant_category": normalized_document["consultant_category"],
                "chapter_title": article.get("chapter_title"),
                "section_title": article.get("section_title"),
                "article_number": article["article_number"],
                "article_title": article["article_title"],
                "document_url": article["article_url"],
                "breadcrumb": article.get("breadcrumb", []),
                "version_hash": normalized_document["version_hash"],
            }
            chunks.append(
                {
                    "chunk_index": position,
                    "article_number": article["article_number"],
                    "article_title": article["article_title"],
                    "chunk_text": chunk_text,
                    "metadata": chunk_metadata,
                }
            )
    return chunks
def write_normalized_document(normalized_document: dict, dry_run: bool = False) -> Path:
    """Write a normalized document to ``NORMALIZED_ROOT/<key>.json``.

    Args:
        normalized_document: The document dict; its ``key`` names the file.
        dry_run: When true, nothing is created or written.

    Returns:
        The output path, whether or not the file was written.
    """
    target = NORMALIZED_ROOT / f"{normalized_document['key']}.json"
    if dry_run:
        return target
    ensure_dir(target.parent)
    write_json(target, normalized_document)
    return target
def load_normalized_document(document_key: str) -> dict | None:
    """Read ``NORMALIZED_ROOT/<document_key>.json`` via ``read_json``.

    ``None`` is passed to ``read_json`` as its ``default``.
    """
    source_path = NORMALIZED_ROOT / f"{document_key}.json"
    return read_json(source_path, default=None)