first commit

2026-05-25 01:12:43 +03:00
commit bfc22efe24
83 changed files with 8903 additions and 0 deletions
@@ -0,0 +1,166 @@
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from parser.config import NORMALIZED_ROOT
+from parser.utils import (
+    chunk_paragraphs,
+    ensure_dir,
+    normalize_text,
+    parse_russian_date,
+    read_json,
+    write_json,
+)
+
+
+DOCUMENT_NUMBER_RE = re.compile(r"N\s*([0-9А-Яа-я\-ФКЗA-Z]+)")  # "N", optional whitespace, then capture a run of digits/Cyrillic/"-"/A-Z
+
+
+def parse_root_metadata(root_html: str) -> dict:
+    soup = BeautifulSoup(root_html, "html.parser")
+    title_node = soup.select_one(".document-page__title h1")
+    title_text = title_node.get_text("\n", strip=True) if title_node else ""
+    content_node = soup.select_one(".document-page__content")
+    first_text = content_node.get_text("\n", strip=True) if content_node else ""
+
+    document_number = None
+    number_match = DOCUMENT_NUMBER_RE.search(first_text or title_text)
+    if number_match:
+        document_number = number_match.group(1)
+
+    adoption_date = parse_russian_date(first_text or title_text)
+    effective_date = None
+    effective_match = re.search(r"вступ\.\s*в силу с\s*(\d{2}\.\d{2}\.\d{4})", title_text)
+    if effective_match:
+        day, month, year = effective_match.group(1).split(".")
+        effective_date = f"{year}-{month}-{day}"
+
+    lines = [line.strip() for line in title_text.splitlines() if line.strip()]
+    return {
+        "title": lines[0] if lines else title_text.strip(),
+        "version_note": "\n".join(lines[1:]).strip() or None,
+        "document_number": document_number,
+        "adoption_date": adoption_date,
+        "effective_date": effective_date,
+        "publication_date": None,
+    }
+
+
+def parse_article_page(article_html: str) -> list[str]:
+    soup = BeautifulSoup(article_html, "html.parser")
+    content = soup.select_one(".document-page__content")
+    if content is None:
+        return []
+
+    for selector in [
+        ".document__insert",
+        ".document__edit",
+        ".document-page__notes",
+        ".document-page__title-link",
+        ".document-page__title",
+    ]:
+        for node in content.select(selector):
+            node.decompose()
+
+    heading = content.select_one("h1")
+    if heading is not None:
+        heading.decompose()
+
+    paragraphs: list[str] = []
+    for node in content.select("p"):
+        text = normalize_text(node.get_text("\n", strip=True))
+        if not text:
+            continue
+        paragraphs.append(text)
+
+    return paragraphs
+
+
+def normalize_document(raw_payload: dict) -> dict:
+    raw_dir = Path(raw_payload["raw_dir"])
+    root_html = (raw_dir / raw_payload["root_file"]).read_text(encoding="utf-8")
+    metadata = parse_root_metadata(root_html)
+
+    articles = []
+    for article_ref in raw_payload["articles"]:
+        article_html = (raw_dir / "articles" / article_ref["file_name"]).read_text(encoding="utf-8")
+        paragraphs = parse_article_page(article_html)
+        article_text = "\n\n".join(paragraphs).strip()
+        articles.append(
+            {
+                "article_number": article_ref["article_number"],
+                "article_title": article_ref["article_title"],
+                "article_url": article_ref["article_url"],
+                "section_title": article_ref.get("section_title"),
+                "chapter_title": article_ref.get("chapter_title"),
+                "part_title": article_ref.get("part_title"),
+                "breadcrumb": article_ref.get("breadcrumb", []),
+                "text": article_text,
+                "paragraphs": paragraphs,
+            }
+        )
+
+    normalized = {
+        "key": raw_payload["key"],
+        "title": metadata["title"],
+        "source_url": raw_payload["source_url"],
+        "source_type": "consultant_document",
+        "document_number": metadata["document_number"],
+        "adoption_date": metadata["adoption_date"],
+        "publication_date": metadata["publication_date"],
+        "effective_date": metadata["effective_date"],
+        "version_note": metadata["version_note"],
+        "version_hash": raw_payload["version_hash"],
+        "law_type": raw_payload["law_type"],
+        "consultant_category": raw_payload["consultant_category"],
+        "source_short_name": raw_payload["source_short_name"],
+        "articles": articles,
+    }
+    return normalized
+
+
+def build_chunks(normalized_document: dict) -> list[dict]:
+    chunks: list[dict] = []
+    for article in normalized_document["articles"]:
+        paragraphs = article["paragraphs"] or ([article["text"]] if article["text"] else [])
+        text_chunks = chunk_paragraphs(paragraphs)
+        if not text_chunks and article["text"]:
+            text_chunks = [article["text"]]
+
+        for chunk_text in text_chunks:
+            chunks.append(
+                {
+                    "chunk_index": len(chunks),
+                    "article_number": article["article_number"],
+                    "article_title": article["article_title"],
+                    "chunk_text": chunk_text,
+                    "metadata": {
+                        "source_title": normalized_document["title"],
+                        "source_short_name": normalized_document["source_short_name"],
+                        "consultant_category": normalized_document["consultant_category"],
+                        "chapter_title": article.get("chapter_title"),
+                        "section_title": article.get("section_title"),
+                        "article_number": article["article_number"],
+                        "article_title": article["article_title"],
+                        "document_url": article["article_url"],
+                        "breadcrumb": article.get("breadcrumb", []),
+                        "version_hash": normalized_document["version_hash"],
+                    },
+                }
+            )
+    return chunks
+
+
+def write_normalized_document(normalized_document: dict, dry_run: bool = False) -> Path:
+    output_path = NORMALIZED_ROOT / f"{normalized_document['key']}.json"
+    if not dry_run:
+        ensure_dir(output_path.parent)
+        write_json(output_path, normalized_document)
+    return output_path
+
+
+def load_normalized_document(document_key: str) -> dict | None:
+    return read_json(NORMALIZED_ROOT / f"{document_key}.json", default=None)