167 lines
6.1 KiB
Python
167 lines
6.1 KiB
Python
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import re
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
from bs4 import BeautifulSoup
|
|||
|
|
|
|||
|
|
from parser.config import NORMALIZED_ROOT
|
|||
|
|
from parser.utils import (
|
|||
|
|
chunk_paragraphs,
|
|||
|
|
ensure_dir,
|
|||
|
|
normalize_text,
|
|||
|
|
parse_russian_date,
|
|||
|
|
read_json,
|
|||
|
|
write_json,
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Pattern for a document number: a literal Latin "N", optional whitespace, then
# a captured run of digits, Cyrillic letters in the А-Я/а-я ranges, hyphens and
# uppercase Latin letters. The explicit "ФКЗ" inside the class is already
# covered by the А-Я range. Group 1 holds the number without the "N" prefix.
DOCUMENT_NUMBER_RE = re.compile(r"N\s*([0-9А-Яа-я\-ФКЗA-Z]+)")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _node_text(node) -> str:
    """Return the newline-separated, stripped text of *node*, or "" when it is falsy."""
    if node:
        return node.get_text("\n", strip=True)
    return ""


def parse_root_metadata(root_html: str) -> dict:
    """Extract document-level metadata from the HTML of a document's root page.

    The title is read from ``.document-page__title h1`` and the body text from
    ``.document-page__content``. The document number and the adoption date are
    looked up in the body text, falling back to the title text when the body
    text is empty. The effective date is looked up in the title text only.

    Args:
        root_html: Raw HTML of the root page.

    Returns:
        A dict with the keys ``title`` (first non-empty title line),
        ``version_note`` (remaining title lines joined by newlines, or
        ``None``), ``document_number`` (regex capture or ``None``),
        ``adoption_date`` (whatever ``parse_russian_date`` returns),
        ``effective_date`` (``"YYYY-MM-DD"`` string or ``None``) and
        ``publication_date`` (always ``None``).
    """
    soup = BeautifulSoup(root_html, "html.parser")
    title_text = _node_text(soup.select_one(".document-page__title h1"))
    first_text = _node_text(soup.select_one(".document-page__content"))

    # The body text takes priority; the title is used only when the body is empty.
    search_text = first_text or title_text

    number_match = DOCUMENT_NUMBER_RE.search(search_text)
    document_number = number_match.group(1) if number_match else None

    adoption_date = parse_russian_date(search_text)

    effective_match = re.search(r"вступ\.\s*в силу с\s*(\d{2}\.\d{2}\.\d{4})", title_text)
    if effective_match:
        # Reorder the matched "DD.MM.YYYY" into "YYYY-MM-DD".
        effective_date = "-".join(reversed(effective_match.group(1).split(".")))
    else:
        effective_date = None

    # Non-empty title lines: the first one is the title, the rest the version note.
    lines = [stripped for stripped in map(str.strip, title_text.splitlines()) if stripped]
    title = lines[0] if lines else title_text.strip()
    version_note = "\n".join(lines[1:]).strip() or None

    return {
        "title": title,
        "version_note": version_note,
        "document_number": document_number,
        "adoption_date": adoption_date,
        "effective_date": effective_date,
        "publication_date": None,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_article_page(article_html: str) -> list[str]:
    """Return the non-empty, normalized paragraph texts of an article page.

    Only the ``.document-page__content`` container is considered. Inserts,
    edit marks, notes, title blocks and the first ``h1`` are removed from it
    before the ``<p>`` elements are read.

    Args:
        article_html: Raw HTML of a single article page.

    Returns:
        One string per ``<p>`` element whose text is non-empty after
        ``normalize_text``; an empty list when the content container is absent.
    """
    soup = BeautifulSoup(article_html, "html.parser")
    content = soup.select_one(".document-page__content")
    if content is None:
        return []

    # Elements that must not contribute to the article text.
    noise_selectors = (
        ".document__insert",
        ".document__edit",
        ".document-page__notes",
        ".document-page__title-link",
        ".document-page__title",
    )
    for selector in noise_selectors:
        for unwanted in content.select(selector):
            unwanted.decompose()

    # Drop the first remaining heading, if any.
    heading = content.select_one("h1")
    if heading is not None:
        heading.decompose()

    normalized = (
        normalize_text(paragraph.get_text("\n", strip=True))
        for paragraph in content.select("p")
    )
    return [text for text in normalized if text]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _build_article_record(articles_dir: Path, article_ref: dict) -> dict:
    """Read one article file from *articles_dir* and build its normalized record."""
    article_html = (articles_dir / article_ref["file_name"]).read_text(encoding="utf-8")
    paragraphs = parse_article_page(article_html)
    return {
        "article_number": article_ref["article_number"],
        "article_title": article_ref["article_title"],
        "article_url": article_ref["article_url"],
        "section_title": article_ref.get("section_title"),
        "chapter_title": article_ref.get("chapter_title"),
        "part_title": article_ref.get("part_title"),
        "breadcrumb": article_ref.get("breadcrumb", []),
        # Full text is the paragraphs joined by blank lines.
        "text": "\n\n".join(paragraphs).strip(),
        "paragraphs": paragraphs,
    }


def normalize_document(raw_payload: dict) -> dict:
    """Turn a raw download payload into a normalized document dict.

    The root page (``raw_payload["root_file"]``) and every article file listed
    in ``raw_payload["articles"]`` are read as UTF-8 from
    ``raw_payload["raw_dir"]`` (articles from its ``articles`` subdirectory).

    Args:
        raw_payload: Payload describing the raw files; the keys read here are
            ``raw_dir``, ``root_file``, ``articles``, ``key``, ``source_url``,
            ``version_hash``, ``law_type``, ``consultant_category`` and
            ``source_short_name``.

    Returns:
        A dict combining metadata parsed from the root page, values copied
        from the payload, a fixed ``source_type`` of
        ``"consultant_document"`` and the list of article records.
    """
    base_dir = Path(raw_payload["raw_dir"])
    root_path = base_dir / raw_payload["root_file"]
    metadata = parse_root_metadata(root_path.read_text(encoding="utf-8"))

    articles_dir = base_dir / "articles"
    articles = [
        _build_article_record(articles_dir, article_ref)
        for article_ref in raw_payload["articles"]
    ]

    return {
        "key": raw_payload["key"],
        "title": metadata["title"],
        "source_url": raw_payload["source_url"],
        "source_type": "consultant_document",
        "document_number": metadata["document_number"],
        "adoption_date": metadata["adoption_date"],
        "publication_date": metadata["publication_date"],
        "effective_date": metadata["effective_date"],
        "version_note": metadata["version_note"],
        "version_hash": raw_payload["version_hash"],
        "law_type": raw_payload["law_type"],
        "consultant_category": raw_payload["consultant_category"],
        "source_short_name": raw_payload["source_short_name"],
        "articles": articles,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _chunk_metadata(normalized_document: dict, article: dict) -> dict:
    """Build a fresh metadata dict describing the document and article of a chunk."""
    return {
        "source_title": normalized_document["title"],
        "source_short_name": normalized_document["source_short_name"],
        "consultant_category": normalized_document["consultant_category"],
        "chapter_title": article.get("chapter_title"),
        "section_title": article.get("section_title"),
        "article_number": article["article_number"],
        "article_title": article["article_title"],
        "document_url": article["article_url"],
        "breadcrumb": article.get("breadcrumb", []),
        "version_hash": normalized_document["version_hash"],
    }


def build_chunks(normalized_document: dict) -> list[dict]:
    """Split every article of a normalized document into chunk records.

    Each article's paragraphs are passed to ``chunk_paragraphs``. When the
    article has no paragraphs, or chunking yields nothing, the whole article
    text is used as a single piece, provided it is non-empty.

    Args:
        normalized_document: Document dict with an ``articles`` list plus the
            ``title``, ``source_short_name``, ``consultant_category`` and
            ``version_hash`` keys.

    Returns:
        A list of chunk dicts; ``chunk_index`` numbers the chunks
        consecutively across the whole document, starting at 0.
    """
    chunks: list[dict] = []
    for article in normalized_document["articles"]:
        paragraphs = article["paragraphs"]
        if not paragraphs:
            # No paragraphs: fall back to the full text when there is any.
            paragraphs = [article["text"]] if article["text"] else []

        pieces = chunk_paragraphs(paragraphs)
        if not pieces and article["text"]:
            pieces = [article["text"]]

        for piece in pieces:
            record = {
                # Index is global across the document, not per article.
                "chunk_index": len(chunks),
                "article_number": article["article_number"],
                "article_title": article["article_title"],
                "chunk_text": piece,
                "metadata": _chunk_metadata(normalized_document, article),
            }
            chunks.append(record)
    return chunks
|
|||
|
|
|
|||
|
|
|
|||
|
|
def write_normalized_document(normalized_document: dict, dry_run: bool = False) -> Path:
    """Write a normalized document as ``<key>.json`` under ``NORMALIZED_ROOT``.

    Args:
        normalized_document: Document dict; its ``key`` names the output file.
        dry_run: When true, nothing is created or written.

    Returns:
        The output path, whether or not the file was actually written.
    """
    target = NORMALIZED_ROOT / f"{normalized_document['key']}.json"
    if dry_run:
        return target

    ensure_dir(target.parent)
    write_json(target, normalized_document)
    return target
|
|||
|
|
|
|||
|
|
|
|||
|
|
def load_normalized_document(document_key: str) -> dict | None:
    """Load ``<document_key>.json`` from ``NORMALIZED_ROOT`` via ``read_json``.

    Args:
        document_key: Key of the document, used as the file stem.

    Returns:
        The result of ``read_json`` for that path, called with ``default=None``.
    """
    document_path = NORMALIZED_ROOT / f"{document_key}.json"
    return read_json(document_path, default=None)
|