from __future__ import annotations

import re
from pathlib import Path

from bs4 import BeautifulSoup

from parser.config import NORMALIZED_ROOT
from parser.utils import (
    chunk_paragraphs,
    ensure_dir,
    normalize_text,
    parse_russian_date,
    read_json,
    write_json,
)

# Captures the token that follows a Latin "N" marker (e.g. the "197-ФЗ" in "N 197-ФЗ").
DOCUMENT_NUMBER_RE = re.compile(r"N\s*([0-9А-Яа-я\-ФКЗA-Z]+)")

# Captures the DD.MM.YYYY date of an "in force since" note inside the page title.
_EFFECTIVE_DATE_RE = re.compile(r"вступ\.\s*в силу с\s*(\d{2}\.\d{2}\.\d{4})")

# Sub-trees removed from an article's content node before paragraphs are read.
# Order is kept as in the original implementation.
_ARTICLE_NOISE_SELECTORS = (
    ".document__insert",
    ".document__edit",
    ".document-page__notes",
    ".document-page__title-link",
    ".document-page__title",
)


def parse_root_metadata(root_html: str) -> dict:
    """Extract document-level metadata from the root page HTML.

    The title is read from ``.document-page__title h1`` and the body text from
    ``.document-page__content``. The document number and adoption date are
    looked up in the body text first; if the body text is empty, or contains
    no number/date, the title text is tried instead.

    Args:
        root_html: HTML markup of the document's root page.

    Returns:
        A dict with the keys ``title``, ``version_note``, ``document_number``,
        ``adoption_date``, ``effective_date`` and ``publication_date``.
        ``publication_date`` is always ``None``. ``effective_date`` is a
        ``YYYY-MM-DD`` string or ``None``. ``adoption_date`` is whatever
        ``parse_russian_date`` returns (its format is defined by that helper).
    """
    soup = BeautifulSoup(root_html, "html.parser")

    title_node = soup.select_one(".document-page__title h1")
    title_text = title_node.get_text("\n", strip=True) if title_node else ""

    content_node = soup.select_one(".document-page__content")
    first_text = content_node.get_text("\n", strip=True) if content_node else ""

    primary_text = first_text or title_text
    # A retry on the title only makes sense when the primary lookup used the
    # body text and there is a non-empty title that has not been examined yet.
    can_retry_on_title = bool(first_text and title_text)

    number_match = DOCUMENT_NUMBER_RE.search(primary_text)
    if number_match is None and can_retry_on_title:
        number_match = DOCUMENT_NUMBER_RE.search(title_text)
    document_number = number_match.group(1) if number_match else None

    adoption_date = parse_russian_date(primary_text)
    if not adoption_date and can_retry_on_title:
        # Keep the original (falsy) result if the title yields nothing either.
        adoption_date = parse_russian_date(title_text) or adoption_date

    effective_date = None
    effective_match = _EFFECTIVE_DATE_RE.search(title_text)
    if effective_match:
        # Re-order DD.MM.YYYY into YYYY-MM-DD.
        day, month, year = effective_match.group(1).split(".")
        effective_date = f"{year}-{month}-{day}"

    # First non-blank title line is the title; the remaining ones form the note.
    lines = [line.strip() for line in title_text.splitlines() if line.strip()]

    return {
        "title": lines[0] if lines else title_text.strip(),
        "version_note": "\n".join(lines[1:]).strip() or None,
        "document_number": document_number,
        "adoption_date": adoption_date,
        "effective_date": effective_date,
        "publication_date": None,
    }


def parse_article_page(article_html: str) -> list[str]:
    """Return the non-empty, normalized paragraph texts of an article page.

    Only ``<p>`` elements inside ``.document-page__content`` are considered,
    after inserts, edit marks, notes, title blocks and the first ``<h1>`` have
    been removed from that node.

    Args:
        article_html: HTML markup of a single article page.

    Returns:
        Paragraph texts in document order; an empty list when the page has no
        ``.document-page__content`` node.
    """
    soup = BeautifulSoup(article_html, "html.parser")
    content = soup.select_one(".document-page__content")
    if content is None:
        return []

    for selector in _ARTICLE_NOISE_SELECTORS:
        for node in content.select(selector):
            node.decompose()

    heading = content.select_one("h1")
    if heading is not None:
        heading.decompose()

    paragraphs: list[str] = []
    for node in content.select("p"):
        text = normalize_text(node.get_text("\n", strip=True))
        if text:
            paragraphs.append(text)
    return paragraphs


def normalize_document(raw_payload: dict) -> dict:
    """Build the normalized representation of a downloaded document.

    Reads the root HTML file and every article HTML file referenced by
    ``raw_payload`` (all decoded as UTF-8), parses them, and merges the result
    with the descriptive fields already present in the payload.

    Args:
        raw_payload: Mapping that must provide ``raw_dir``, ``root_file``,
            ``articles``, ``key``, ``source_url``, ``version_hash``,
            ``law_type``, ``consultant_category`` and ``source_short_name``.
            Each entry of ``articles`` must provide ``file_name``,
            ``article_number``, ``article_title`` and ``article_url``; the
            ``section_title``, ``chapter_title``, ``part_title`` and
            ``breadcrumb`` entries are optional.

    Returns:
        The normalized document dict, including an ``articles`` list in the
        same order as ``raw_payload["articles"]``.

    Raises:
        KeyError: If a required key is missing from the payload.
        OSError: If one of the referenced HTML files cannot be read.
    """
    raw_dir = Path(raw_payload["raw_dir"])
    root_html = (raw_dir / raw_payload["root_file"]).read_text(encoding="utf-8")
    metadata = parse_root_metadata(root_html)

    articles = []
    for article_ref in raw_payload["articles"]:
        article_path = raw_dir / "articles" / article_ref["file_name"]
        article_html = article_path.read_text(encoding="utf-8")
        paragraphs = parse_article_page(article_html)
        article_text = "\n\n".join(paragraphs).strip()
        articles.append(
            {
                "article_number": article_ref["article_number"],
                "article_title": article_ref["article_title"],
                "article_url": article_ref["article_url"],
                "section_title": article_ref.get("section_title"),
                "chapter_title": article_ref.get("chapter_title"),
                "part_title": article_ref.get("part_title"),
                "breadcrumb": article_ref.get("breadcrumb", []),
                "text": article_text,
                "paragraphs": paragraphs,
            }
        )

    normalized = {
        "key": raw_payload["key"],
        "title": metadata["title"],
        "source_url": raw_payload["source_url"],
        "source_type": "consultant_document",
        "document_number": metadata["document_number"],
        "adoption_date": metadata["adoption_date"],
        "publication_date": metadata["publication_date"],
        "effective_date": metadata["effective_date"],
        "version_note": metadata["version_note"],
        "version_hash": raw_payload["version_hash"],
        "law_type": raw_payload["law_type"],
        "consultant_category": raw_payload["consultant_category"],
        "source_short_name": raw_payload["source_short_name"],
        "articles": articles,
    }
    return normalized


def build_chunks(normalized_document: dict) -> list[dict]:
    """Split every article of a normalized document into chunk records.

    For each article the paragraph list (or, when it is empty, the article
    text as a single paragraph) is passed to ``chunk_paragraphs``. If that
    yields nothing but the article has text, the whole text becomes one chunk.

    Args:
        normalized_document: A document dict shaped like the return value of
            ``normalize_document``.

    Returns:
        Chunk dicts in article order. ``chunk_index`` counts across the whole
        document, not per article.
    """
    chunks: list[dict] = []
    for article in normalized_document["articles"]:
        paragraphs = article["paragraphs"] or (
            [article["text"]] if article["text"] else []
        )
        text_chunks = chunk_paragraphs(paragraphs)
        if not text_chunks and article["text"]:
            text_chunks = [article["text"]]

        for chunk_text in text_chunks:
            chunks.append(
                {
                    # Running index: number of chunks emitted so far.
                    "chunk_index": len(chunks),
                    "article_number": article["article_number"],
                    "article_title": article["article_title"],
                    "chunk_text": chunk_text,
                    "metadata": {
                        "source_title": normalized_document["title"],
                        "source_short_name": normalized_document["source_short_name"],
                        "consultant_category": normalized_document["consultant_category"],
                        "chapter_title": article.get("chapter_title"),
                        "section_title": article.get("section_title"),
                        "article_number": article["article_number"],
                        "article_title": article["article_title"],
                        "document_url": article["article_url"],
                        "breadcrumb": article.get("breadcrumb", []),
                        "version_hash": normalized_document["version_hash"],
                    },
                }
            )
    return chunks


def write_normalized_document(normalized_document: dict, dry_run: bool = False) -> Path:
    """Write a normalized document to ``NORMALIZED_ROOT/<key>.json``.

    Args:
        normalized_document: Document dict; its ``key`` entry names the file.
        dry_run: When true, nothing is created or written.

    Returns:
        The target path, whether or not the file was actually written.
    """
    output_path = NORMALIZED_ROOT / f"{normalized_document['key']}.json"
    if not dry_run:
        ensure_dir(output_path.parent)
        write_json(output_path, normalized_document)
    return output_path


def load_normalized_document(document_key: str) -> dict | None:
    """Load ``NORMALIZED_ROOT/<document_key>.json`` via ``read_json``.

    ``default=None`` is passed to ``read_json``; presumably that is what is
    returned when the file is missing -- confirm against ``read_json``.
    """
    return read_json(NORMALIZED_ROOT / f"{document_key}.json", default=None)