first commit
This commit is contained in:
@@ -0,0 +1 @@
|
||||
"""CLI parser package for LawBot ingestion."""
|
||||
@@ -0,0 +1,5 @@
|
||||
# Package entry point: delegates to the CLI's main() when this module is
# executed as a script (presumably via `python -m parser` — the CLI's own
# error messages reference that invocation).
from parser.cli import main


if __name__ == "__main__":
    main()
|
||||
+135
@@ -0,0 +1,135 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from parser.discovery import discover_documents, build_session
|
||||
from parser.fetcher import fetch_documents, load_manifest, load_raw_index
|
||||
from parser.ingest import ingest_documents
|
||||
from parser.normalizer import load_normalized_document, normalize_document, write_normalized_document
|
||||
from shared import ORM
|
||||
|
||||
|
||||
def parse_categories(value: str | None) -> set[str] | None:
    """Convert a comma-separated ``--categories`` value into a set of keys.

    Returns ``None`` when ``value`` is ``None`` or empty, meaning "no category
    filter". Otherwise returns the whitespace-stripped, non-blank items; the
    set can be empty when the value contains only separators.
    """
    if not value:
        return None
    stripped_items = (piece.strip() for piece in value.split(","))
    return {piece for piece in stripped_items if piece}
|
||||
|
||||
|
||||
def select_documents(categories: set[str] | None, limit: int | None) -> list[dict]:
    """Return manifest documents, optionally filtered and truncated.

    The manifest comes from ``load_manifest()``. When ``categories`` is
    non-empty only documents whose ``category_key`` is in it are kept; when
    ``limit`` is not ``None`` the result is cut to its first ``limit`` items.
    """
    selected = load_manifest()["documents"]
    if categories:
        selected = [entry for entry in selected if entry["category_key"] in categories]
    return selected if limit is None else selected[:limit]
|
||||
|
||||
|
||||
def run_discover(_: argparse.Namespace) -> None:
    """Handle the ``discover`` command: build the manifest and report it.

    The parsed arguments are ignored; discovery takes no options.
    """
    session = build_session()
    manifest = discover_documents(session)
    print(f"discovered {len(manifest['documents'])} documents from {manifest['source_page']}")
|
||||
|
||||
|
||||
def run_fetch(args: argparse.Namespace) -> None:
    """Handle the ``fetch`` command: download the selected documents.

    Reads ``categories``, ``limit``, ``force`` and ``dry_run`` from ``args``
    and prints how many payloads ``fetch_documents`` returned.
    """
    wanted_categories = parse_categories(args.categories)
    documents = select_documents(wanted_categories, args.limit)
    payloads = fetch_documents(documents, force=args.force, dry_run=args.dry_run)
    print(f"fetched {len(payloads)} documents")
|
||||
|
||||
|
||||
def run_normalize(args: argparse.Namespace) -> None:
    """Handle the ``normalize`` command for the selected documents.

    Each selected document must already have an entry in the raw index;
    otherwise ``FileNotFoundError`` is raised and processing stops. The
    normalized result is written unless ``args.dry_run`` is set.
    """
    documents = select_documents(parse_categories(args.categories), args.limit)
    raw_index = load_raw_index()

    normalized_count = 0
    # enumerate(start=1) leaves the running total in normalized_count.
    for normalized_count, document in enumerate(documents, start=1):
        raw_payload = raw_index.get("documents", {}).get(document["key"])
        if raw_payload is None:
            raise FileNotFoundError(
                f"raw payload for {document['key']} not found; run `python -m parser fetch` first"
            )
        write_normalized_document(normalize_document(raw_payload), dry_run=args.dry_run)

    print(f"normalized {normalized_count} documents")
|
||||
|
||||
|
||||
async def _run_ingest_async(args: argparse.Namespace) -> None:
    """Load normalized documents and pass them to ``ingest_documents``.

    Raises ``FileNotFoundError`` if any selected document has no normalized
    file. An ``ORM`` is created (and its schema initialised) only when
    ``args.dry_run`` is false, and it is always closed afterwards.
    """
    documents = select_documents(parse_categories(args.categories), args.limit)

    normalized_documents = []
    for document in documents:
        normalized = load_normalized_document(document["key"])
        if normalized is None:
            raise FileNotFoundError(
                f"normalized payload for {document['key']} not found; run `python -m parser normalize` first"
            )
        normalized_documents.append(normalized)

    # Dry runs never touch the database, so no ORM is built for them.
    orm = None if args.dry_run else ORM()
    try:
        if orm is not None:
            await orm.init_schema()
        results = await ingest_documents(orm, normalized_documents, dry_run=args.dry_run)
    finally:
        if orm is not None:
            await orm.close()

    print(f"ingested {len(results)} documents")
|
||||
|
||||
|
||||
def run_ingest(args: argparse.Namespace) -> None:
    """Handle the ``ingest`` command by running the async ingest to completion."""
    ingest_coroutine = _run_ingest_async(args)
    asyncio.run(ingest_coroutine)
|
||||
|
||||
|
||||
def run_pipeline(args: argparse.Namespace) -> None:
    """Handle the ``run`` command: discover, fetch, normalize, ingest.

    With ``args.dry_run`` set the pipeline stops after the fetch preview.
    Discovery runs in both modes.
    """
    run_discover(args)
    run_fetch(args)

    if args.dry_run:
        print("dry-run stopped after fetch preview; raw files and DB were not changed")
        return

    run_normalize(args)
    run_ingest(args)
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with one sub-command per pipeline stage.

    Every sub-command stores its handler in ``func``. All of them except
    ``discover`` accept ``--categories``, ``--force``, ``--limit`` and
    ``--dry-run``.
    """
    parser = argparse.ArgumentParser(description="Consultant ingestion pipeline for LawBot.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # (command name, handler, accepts the shared selection flags)
    commands = (
        ("discover", run_discover, False),
        ("fetch", run_fetch, True),
        ("normalize", run_normalize, True),
        ("ingest", run_ingest, True),
        ("run", run_pipeline, True),
    )

    for command_name, handler, takes_common_flags in commands:
        command_parser = subparsers.add_parser(command_name)
        if takes_common_flags:
            command_parser.add_argument("--categories", default=None)
            command_parser.add_argument("--force", action="store_true")
            command_parser.add_argument("--limit", type=int, default=None)
            command_parser.add_argument("--dry-run", action="store_true")
        command_parser.set_defaults(func=handler)

    return parser
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and dispatch to the chosen sub-command handler."""
    namespace = build_parser().parse_args()
    namespace.func(namespace)
|
||||
@@ -0,0 +1,136 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Filesystem layout. Everything lives under <BASE_DIR>/volumes/parser, where
# BASE_DIR is the directory two levels above this file.
BASE_DIR = Path(__file__).resolve().parent.parent
VOLUMES_DIR = BASE_DIR.joinpath("volumes", "parser")
RAW_ROOT = VOLUMES_DIR.joinpath("raw", "consultant")
NORMALIZED_ROOT = VOLUMES_DIR.joinpath("normalized")
STATE_DIR = VOLUMES_DIR.joinpath("state")
MANIFEST_PATH = STATE_DIR.joinpath("manifest.json")
RAW_INDEX_PATH = STATE_DIR.joinpath("raw_index.json")

# Network settings.
POPULAR_URL = "https://www.consultant.ru/popular/"
REQUEST_TIMEOUT = 30  # passed as the `timeout` argument of HTTP requests
MAX_RETRIES = 3  # attempts per URL
MAX_WORKERS = 2  # thread-pool size for article downloads
USER_AGENT = (
    "LawBotParser/1.0 (+https://local.lawbot; "
    "purpose=legal-rag-ingestion; contact=local-dev)"
)
|
||||
|
||||
# Field order of every TARGET_DOCUMENTS entry; rows below follow this order.
_DOCUMENT_FIELDS = (
    "key",
    "category_key",
    "consultant_category",
    "law_type",
    "source_short_name",
    "source_url",
)

# Category title shared by the four parts of the Civil Code.
_CIVIL_CATEGORY = "Гражданское право, гражданское законодательство РФ"

_DOCUMENT_ROWS = (
    (
        "constitution",
        "constitution",
        "Конституция РФ",
        "constitutional",
        "Конституция РФ",
        "https://www.consultant.ru/document/cons_doc_LAW_28399/",
    ),
    (
        "civil_code_part_1",
        "civil",
        _CIVIL_CATEGORY,
        "civil",
        "ГК РФ ч. 1",
        "https://www.consultant.ru/document/cons_doc_LAW_5142/",
    ),
    (
        "civil_code_part_2",
        "civil",
        _CIVIL_CATEGORY,
        "civil",
        "ГК РФ ч. 2",
        "https://www.consultant.ru/document/cons_doc_LAW_508506/",
    ),
    (
        "civil_code_part_3",
        "civil",
        _CIVIL_CATEGORY,
        "civil",
        "ГК РФ ч. 3",
        "https://www.consultant.ru/document/cons_doc_LAW_482694/",
    ),
    (
        "civil_code_part_4",
        "civil",
        _CIVIL_CATEGORY,
        "civil",
        "ГК РФ ч. 4",
        "https://www.consultant.ru/document/cons_doc_LAW_509417/",
    ),
    (
        "civil_procedure_code",
        "civil_procedure",
        "Гражданское процессуальное право, гражданский процесс",
        "procedural",
        "ГПК РФ",
        "https://www.consultant.ru/document/cons_doc_LAW_39570/",
    ),
    (
        "housing_code",
        "housing",
        "Жилищное право, жилищное законодательство РФ",
        "housing",
        "ЖК РФ",
        "https://www.consultant.ru/document/cons_doc_LAW_51057/",
    ),
    (
        "family_code",
        "family",
        "Семейное право, семейное законодательство РФ",
        "family",
        "СК РФ",
        "https://www.consultant.ru/document/cons_doc_LAW_8982/",
    ),
    (
        "labor_code",
        "labor",
        "Трудовое право, трудовое законодательство РФ",
        "labor",
        "ТК РФ",
        "https://www.consultant.ru/document/cons_doc_LAW_34683/",
    ),
    (
        "consumer_protection_law",
        "consumer",
        "Законодательство РФ о правах потребителя",
        "consumer",
        "ЗОЗПП",
        "https://www.consultant.ru/document/cons_doc_LAW_305/",
    ),
    (
        "enforcement_law",
        "enforcement",
        "Законодательство об исполнительном производстве",
        "enforcement",
        "ФЗ об исполнительном производстве",
        "https://www.consultant.ru/document/cons_doc_LAW_71450/",
    ),
    (
        "mortgage_law",
        "mortgage",
        "Законодательство об ипотеке",
        "mortgage",
        "ФЗ об ипотеке",
        "https://www.consultant.ru/document/cons_doc_LAW_19396/",
    ),
    (
        "administrative_code",
        "administrative",
        "Административное право, административное законодательство РФ",
        "administrative",
        "КоАП РФ",
        "https://www.consultant.ru/document/cons_doc_LAW_34661/",
    ),
    (
        "criminal_code",
        "criminal",
        "Уголовное право, уголовное законодательство РФ",
        "criminal",
        "УК РФ",
        "https://www.consultant.ru/document/cons_doc_LAW_10699/",
    ),
)

# The documents the pipeline handles: one dict per document, keys in
# _DOCUMENT_FIELDS order.
TARGET_DOCUMENTS = [dict(zip(_DOCUMENT_FIELDS, row)) for row in _DOCUMENT_ROWS]
|
||||
@@ -0,0 +1,60 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from parser.config import MANIFEST_PATH, POPULAR_URL, TARGET_DOCUMENTS, USER_AGENT
|
||||
from parser.utils import to_absolute_url, write_json
|
||||
|
||||
|
||||
def discover_documents(session: requests.Session) -> dict:
    """Build the document manifest, write it to ``MANIFEST_PATH``, return it.

    The popular-documents page is downloaded only to record, per category,
    whether the category title appears there as link text. The document list
    itself always comes from ``TARGET_DOCUMENTS``.

    Args:
        session: Session used for the single GET of ``POPULAR_URL``.

    Returns:
        A dict with ``generated_at`` (UTC ISO timestamp), ``source_page``,
        ``categories`` (sorted by title) and ``documents``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
    """
    # Imported here so the shared timeout is used without touching the
    # module's import block; the fetcher applies the same setting.
    from parser.config import REQUEST_TIMEOUT

    response = session.get(POPULAR_URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # Only the link texts are needed for the membership check below.
    anchor_titles = {anchor.get_text(" ", strip=True) for anchor in soup.select("a[href]")}

    grouped: dict[str, list[dict]] = defaultdict(list)
    for document in TARGET_DOCUMENTS:
        grouped[document["category_key"]].append(document)

    categories = []
    for category_key, docs in grouped.items():
        # All documents of a category carry the same title; use the first.
        category_title = docs[0]["consultant_category"]
        categories.append(
            {
                "key": category_key,
                "title": category_title,
                "found_on_popular_page": category_title in anchor_titles,
                "documents": [
                    {
                        "key": doc["key"],
                        "source_url": doc["source_url"],
                        "law_type": doc["law_type"],
                        "source_short_name": doc["source_short_name"],
                    }
                    for doc in docs
                ],
            }
        )

    manifest = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "source_page": POPULAR_URL,
        "categories": sorted(categories, key=lambda item: item["title"]),
        "documents": TARGET_DOCUMENTS,
    }
    write_json(MANIFEST_PATH, manifest)
    return manifest
|
||||
|
||||
|
||||
def build_session() -> requests.Session:
    """Create a ``requests`` session that sends the parser's User-Agent."""
    http_session = requests.Session()
    http_session.headers["User-Agent"] = USER_AGENT
    return http_session
|
||||
@@ -0,0 +1,171 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from parser.config import (
|
||||
MANIFEST_PATH,
|
||||
MAX_RETRIES,
|
||||
MAX_WORKERS,
|
||||
RAW_INDEX_PATH,
|
||||
RAW_ROOT,
|
||||
REQUEST_TIMEOUT,
|
||||
)
|
||||
from parser.discovery import build_session, discover_documents
|
||||
from parser.utils import ensure_dir, read_json, sha256_text, to_absolute_url, write_json
|
||||
|
||||
|
||||
def fetch_with_retry(session: requests.Session, url: str) -> str:
    """GET ``url`` and return the response body, retrying on request errors.

    Up to ``MAX_RETRIES`` attempts are made. After a failed attempt number
    ``k`` (except the last) the function sleeps ``k`` seconds before trying
    again.

    Args:
        session: Session used for the request.
        url: Address to download.

    Returns:
        The decoded response text.

    Raises:
        ValueError: If ``MAX_RETRIES`` is configured below 1.
        requests.RequestException: The error of the final attempt (includes
            ``HTTPError`` raised for error status codes).
    """
    if MAX_RETRIES < 1:
        # Without this guard the loop would not run and there would be no
        # error to re-raise.
        raise ValueError("MAX_RETRIES must be at least 1")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:  # pragma: no cover - network branch
            # Only request failures are retried; other exceptions indicate a
            # bug and propagate immediately.
            if attempt == MAX_RETRIES:
                raise
            time.sleep(attempt)
        else:
            return response.text

    raise RuntimeError("unreachable: retry loop ended without a result")  # pragma: no cover
|
||||
|
||||
|
||||
def extract_toc_articles(root_html: str, source_url: str) -> list[dict]:
    """Collect article entries from a document page's table of contents.

    The TOC is the list matched by ``.document-page__toc > ul``. An entry
    whose link text starts with "Статья " is recorded as an article. Any
    other entry is treated as a heading, and its sub-list is the ``<ul>``
    that directly follows its ``<li>``.

    Args:
        root_html: HTML of the document's root page.
        source_url: URL of that page, used to resolve relative links.

    Returns:
        Article dicts in TOC order with keys ``article_number``,
        ``article_title``, ``article_url``, ``section_title``,
        ``chapter_title``, ``part_title`` and ``breadcrumb``. Empty when the
        page has no TOC list.
    """
    soup = BeautifulSoup(root_html, "html.parser")
    toc = soup.select_one(".document-page__toc > ul")
    if toc is None:
        return []

    articles: list[dict] = []

    def nearest_heading(stack: list[str], prefix: str) -> str | None:
        # Innermost enclosing heading whose title starts with `prefix`.
        return next((value for value in reversed(stack) if value.startswith(prefix)), None)

    def walk_list(node: Tag, stack: list[str]) -> None:
        items = [child for child in node.children if isinstance(child, Tag) and child.name == "li"]
        for li in items:
            anchor = li.find("a", href=True, recursive=False)
            if anchor is None:
                continue

            title = anchor.get_text(" ", strip=True)

            if title.startswith("Статья "):
                article_number, _, article_title = title.partition(". ")
                articles.append(
                    {
                        "article_number": article_number.replace("Статья", "").strip(),
                        # Fall back to the full link text when there is no
                        # ". " separating number and title.
                        "article_title": article_title.strip() or title,
                        "article_url": to_absolute_url(anchor["href"], source_url),
                        "section_title": nearest_heading(stack, "Раздел"),
                        "chapter_title": nearest_heading(stack, "Глава"),
                        "part_title": nearest_heading(stack, "Часть"),
                        "breadcrumb": [*stack, title],
                    }
                )
                continue

            # Descend only into a <ul> that immediately follows this heading.
            # Taking any later <ul> would let a heading without its own
            # sub-list adopt the next heading's list and emit its articles
            # twice.
            following = next(
                (sibling for sibling in li.next_siblings if isinstance(sibling, Tag)),
                None,
            )
            if following is not None and following.name == "ul":
                walk_list(following, [*stack, title])

    walk_list(toc, [])
    return articles
|
||||
|
||||
|
||||
def _fetch_article(session: requests.Session, article: dict) -> tuple[dict, str]:
    """Download one article page and return it paired with its TOC entry."""
    article_html = fetch_with_retry(session, article["article_url"])
    return article, article_html
|
||||
|
||||
|
||||
def load_manifest() -> dict:
    """Return the stored manifest, running discovery when none is stored."""
    stored = read_json(MANIFEST_PATH)
    if stored is not None:
        return stored
    return discover_documents(build_session())
|
||||
|
||||
|
||||
def load_raw_index() -> dict:
    """Read the raw index, or return an empty one when the file is absent."""
    empty_index = {"documents": {}}
    return read_json(RAW_INDEX_PATH, default=empty_index)
|
||||
|
||||
|
||||
def save_raw_index(index_payload: dict) -> None:
    """Write ``index_payload`` to the raw index file as JSON."""
    write_json(RAW_INDEX_PATH, index_payload)
|
||||
|
||||
|
||||
def fetch_documents(
    selected_documents: list[dict], force: bool = False, dry_run: bool = False
) -> list[dict]:
    """Download root and article pages for each document and index them.

    A document is skipped, and its previous index entry reused, when
    ``force`` is false, the root page hash is unchanged and the previously
    recorded raw directory still exists. Otherwise the TOC is parsed, every
    article page is downloaded through a thread pool, and (unless
    ``dry_run``) the pages, a ``sidecar.json`` and the raw index are written.

    Args:
        selected_documents: Manifest entries; ``key`` and ``source_url`` are
            read from each.
        force: Re-download even when the root page hash is unchanged.
        dry_run: Download but write nothing to disk.

    Returns:
        One payload dict per input document, in input order.
    """
    session = build_session()
    raw_index = load_raw_index()
    raw_index.setdefault("documents", {})
    fetched_payloads: list[dict] = []

    for document in selected_documents:
        root_html = fetch_with_retry(session, document["source_url"])
        root_hash = sha256_text(root_html)
        previous = raw_index["documents"].get(document["key"])

        if (
            not force
            and previous
            and previous.get("root_hash") == root_hash
            # .get(): an index entry without raw_dir forces a re-fetch
            # instead of raising KeyError.
            and previous.get("raw_dir")
            and Path(previous["raw_dir"]).exists()
        ):
            fetched_payloads.append(previous)
            continue

        toc_articles = extract_toc_articles(root_html, document["source_url"])
        timestamp = datetime.now(timezone.utc)
        raw_dir = RAW_ROOT / timestamp.date().isoformat() / document["key"]
        article_dir = raw_dir / "articles"

        if not dry_run:
            ensure_dir(article_dir)
            (raw_dir / "root.html").write_text(root_html, encoding="utf-8")

        # Downloads finish in arbitrary order; store each page at its TOC
        # position so everything derived below is order-stable.
        article_htmls = [None] * len(toc_articles)
        # NOTE(review): one Session is shared by all worker threads — confirm
        # that is acceptable for the requests version in use.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(_fetch_article, session, article): index
                for index, article in enumerate(toc_articles)
            }
            for future in as_completed(futures):
                _, article_html = future.result()
                article_htmls[futures[future]] = article_html

        article_payloads: list[dict] = []
        article_hash_parts: list[str] = []
        for index, (article, article_html) in enumerate(zip(toc_articles, article_htmls)):
            article_hash = sha256_text(article_html)
            # Collected in TOC order so version_hash depends only on content,
            # not on which download happened to complete first.
            article_hash_parts.append(article_hash)
            article_payload = {
                **article,
                "file_name": f"{index:04d}.html",
                "sha256": article_hash,
            }
            article_payloads.append(article_payload)
            if not dry_run:
                (article_dir / article_payload["file_name"]).write_text(article_html, encoding="utf-8")

        payload = {
            **document,
            "fetched_at": timestamp.isoformat(),
            "raw_dir": str(raw_dir),
            "root_file": "root.html",
            "root_hash": root_hash,
            "version_hash": sha256_text(root_hash + "".join(article_hash_parts)),
            "article_count": len(article_payloads),
            "articles": article_payloads,
        }

        if not dry_run:
            write_json(raw_dir / "sidecar.json", payload)
            raw_index["documents"][document["key"]] = payload
            # Saved after every document so completed work survives a later
            # failure.
            save_raw_index(raw_index)

        fetched_payloads.append(payload)

    return fetched_payloads
|
||||
@@ -0,0 +1,68 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date, datetime, timezone
|
||||
|
||||
from parser.normalizer import build_chunks
|
||||
from shared import ORM
|
||||
|
||||
|
||||
def parse_iso_date(value: str | None) -> date | None:
    """Parse an ISO ``YYYY-MM-DD`` string; ``None`` or empty gives ``None``.

    Raises ``ValueError`` (from ``date.fromisoformat``) for malformed input.
    """
    return date.fromisoformat(value) if value else None
|
||||
|
||||
|
||||
async def ingest_documents(
    orm: ORM | None, normalized_documents: list[dict], dry_run: bool = False
) -> list[dict]:
    """Upsert each normalized document as a law source and store its chunks.

    In dry-run mode nothing is sent to ``orm``; only chunk counts are
    reported. Otherwise chunks are written when the source was newly created
    or currently has no chunks, and the document is reported as "skipped" in
    every other case.

    Raises:
        RuntimeError: If ``orm`` is ``None`` while ``dry_run`` is false and
            there is at least one document to process.
    """
    results = []

    for document in normalized_documents:
        source_payload = {
            "title": document["title"],
            "source_type": document["source_type"],
            "jurisdiction": "RU",
            "law_type": document["law_type"],
            "document_number": document["document_number"],
            "adoption_date": parse_iso_date(document["adoption_date"]),
            "publication_date": parse_iso_date(document["publication_date"]),
            "effective_date": parse_iso_date(document["effective_date"]),
            "source_url": document["source_url"],
            "official_publication_number": None,
            "version_hash": document["version_hash"],
            "is_active": True,
            "loaded_at": datetime.now(timezone.utc),
        }
        chunks = build_chunks(document)
        chunk_total = len(chunks)

        if dry_run:
            preview = {
                "document_key": document["key"],
                "status": "dry-run",
                "chunk_count": chunk_total,
            }
            results.append(preview)
            continue

        if orm is None:
            raise RuntimeError("ORM instance is required when dry_run is disabled")

        source, created = await orm.upsert_law_source(source_payload)
        # An existing source is refilled only when it has no chunks at all.
        should_replace = created or (await orm.get_chunks_count_by_source(source.id)) == 0
        if should_replace:
            await orm.replace_law_chunks(source.id, chunks)

        outcome = {
            "document_key": document["key"],
            "status": "updated" if should_replace else "skipped",
            "chunk_count": chunk_total,
            "source_id": source.id,
        }
        results.append(outcome)

    return results
|
||||
@@ -0,0 +1,166 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from parser.config import NORMALIZED_ROOT
|
||||
from parser.utils import (
|
||||
chunk_paragraphs,
|
||||
ensure_dir,
|
||||
normalize_text,
|
||||
parse_russian_date,
|
||||
read_json,
|
||||
write_json,
|
||||
)
|
||||
|
||||
|
||||
DOCUMENT_NUMBER_RE = re.compile(r"N\s*([0-9А-Яа-я\-ФКЗA-Z]+)")
|
||||
|
||||
|
||||
def parse_root_metadata(root_html: str) -> dict:
    """Extract title, number and dates from a document's root page.

    The document number and adoption date are searched in the content block,
    or in the title block when the content block is empty or missing. The
    effective date is searched in the title block only.

    Returns a dict with ``title``, ``version_note``, ``document_number``,
    ``adoption_date``, ``effective_date`` and ``publication_date`` (always
    ``None``).
    """
    soup = BeautifulSoup(root_html, "html.parser")

    def text_of(selector: str) -> str:
        # Newline-separated text of the first match, "" when nothing matches.
        node = soup.select_one(selector)
        return node.get_text("\n", strip=True) if node else ""

    title_text = text_of(".document-page__title h1")
    first_text = text_of(".document-page__content")
    search_text = first_text or title_text

    number_match = DOCUMENT_NUMBER_RE.search(search_text)
    document_number = number_match.group(1) if number_match else None

    adoption_date = parse_russian_date(search_text)

    effective_date = None
    effective_match = re.search(r"вступ\.\s*в силу с\s*(\d{2}\.\d{2}\.\d{4})", title_text)
    if effective_match:
        # DD.MM.YYYY -> YYYY-MM-DD
        effective_date = "-".join(reversed(effective_match.group(1).split(".")))

    lines = [stripped for stripped in (raw.strip() for raw in title_text.splitlines()) if stripped]
    title = lines[0] if lines else title_text.strip()
    version_note = "\n".join(lines[1:]).strip() or None

    return {
        "title": title,
        "version_note": version_note,
        "document_number": document_number,
        "adoption_date": adoption_date,
        "effective_date": effective_date,
        "publication_date": None,
    }
|
||||
|
||||
|
||||
def parse_article_page(article_html: str) -> list[str]:
    """Return the normalized, non-empty paragraph texts of an article page.

    Works on the ``.document-page__content`` block after removing inserts,
    edit notes, notes, title elements and the first ``<h1>``. Returns an
    empty list when the page has no content block.
    """
    soup = BeautifulSoup(article_html, "html.parser")
    content = soup.select_one(".document-page__content")
    if content is None:
        return []

    removable_selectors = (
        ".document__insert",
        ".document__edit",
        ".document-page__notes",
        ".document-page__title-link",
        ".document-page__title",
    )
    for selector in removable_selectors:
        for unwanted in content.select(selector):
            unwanted.decompose()

    heading = content.select_one("h1")
    if heading is not None:
        heading.decompose()

    normalized = (
        normalize_text(paragraph.get_text("\n", strip=True))
        for paragraph in content.select("p")
    )
    return [text for text in normalized if text]
|
||||
|
||||
|
||||
def normalize_document(raw_payload: dict) -> dict:
    """Turn a fetched document (raw index entry) into its normalized form.

    Reads the root page and every article file under
    ``raw_payload["raw_dir"]``, extracts metadata and paragraph texts, and
    returns one dict combining them with the identifying fields copied from
    ``raw_payload``.
    """
    raw_dir = Path(raw_payload["raw_dir"])
    root_html = (raw_dir / raw_payload["root_file"]).read_text(encoding="utf-8")
    metadata = parse_root_metadata(root_html)

    def load_article(article_ref: dict) -> dict:
        # Parse one stored article page and merge it with its TOC entry.
        article_path = raw_dir / "articles" / article_ref["file_name"]
        paragraphs = parse_article_page(article_path.read_text(encoding="utf-8"))
        return {
            "article_number": article_ref["article_number"],
            "article_title": article_ref["article_title"],
            "article_url": article_ref["article_url"],
            "section_title": article_ref.get("section_title"),
            "chapter_title": article_ref.get("chapter_title"),
            "part_title": article_ref.get("part_title"),
            "breadcrumb": article_ref.get("breadcrumb", []),
            "text": "\n\n".join(paragraphs).strip(),
            "paragraphs": paragraphs,
        }

    articles = [load_article(article_ref) for article_ref in raw_payload["articles"]]

    return {
        "key": raw_payload["key"],
        "title": metadata["title"],
        "source_url": raw_payload["source_url"],
        "source_type": "consultant_document",
        "document_number": metadata["document_number"],
        "adoption_date": metadata["adoption_date"],
        "publication_date": metadata["publication_date"],
        "effective_date": metadata["effective_date"],
        "version_note": metadata["version_note"],
        "version_hash": raw_payload["version_hash"],
        "law_type": raw_payload["law_type"],
        "consultant_category": raw_payload["consultant_category"],
        "source_short_name": raw_payload["source_short_name"],
        "articles": articles,
    }
|
||||
|
||||
|
||||
def build_chunks(normalized_document: dict) -> list[dict]:
    """Split every article of a normalized document into chunk records.

    Each article's paragraphs (or its whole text when it has no paragraph
    list) go through ``chunk_paragraphs``. ``chunk_index`` numbers the chunks
    consecutively across the whole document, starting at 0.
    """
    chunks: list[dict] = []

    for article in normalized_document["articles"]:
        whole_text = article["text"]
        whole_text_fallback = [whole_text] if whole_text else []
        paragraphs = article["paragraphs"] or whole_text_fallback
        # If chunking yields nothing, keep the article text as a single chunk.
        text_chunks = chunk_paragraphs(paragraphs) or whole_text_fallback

        for chunk_text in text_chunks:
            metadata = {
                "source_title": normalized_document["title"],
                "source_short_name": normalized_document["source_short_name"],
                "consultant_category": normalized_document["consultant_category"],
                "chapter_title": article.get("chapter_title"),
                "section_title": article.get("section_title"),
                "article_number": article["article_number"],
                "article_title": article["article_title"],
                "document_url": article["article_url"],
                "breadcrumb": article.get("breadcrumb", []),
                "version_hash": normalized_document["version_hash"],
            }
            chunks.append(
                {
                    "chunk_index": len(chunks),
                    "article_number": article["article_number"],
                    "article_title": article["article_title"],
                    "chunk_text": chunk_text,
                    "metadata": metadata,
                }
            )

    return chunks
|
||||
|
||||
|
||||
def write_normalized_document(normalized_document: dict, dry_run: bool = False) -> Path:
    """Write a normalized document to ``NORMALIZED_ROOT/<key>.json``.

    Returns the target path in every case; with ``dry_run`` nothing is
    written.
    """
    output_path = NORMALIZED_ROOT / f"{normalized_document['key']}.json"
    if dry_run:
        return output_path

    ensure_dir(output_path.parent)
    write_json(output_path, normalized_document)
    return output_path
|
||||
|
||||
|
||||
def load_normalized_document(document_key: str) -> dict | None:
    """Read ``NORMALIZED_ROOT/<document_key>.json``; ``None`` if it is absent."""
    document_path = NORMALIZED_ROOT / f"{document_key}.json"
    return read_json(document_path, default=None)
|
||||
+108
@@ -0,0 +1,108 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin
|
||||
|
||||
|
||||
# Runs of spaces/tabs, and runs of three or more newlines.
WHITESPACE_RE = re.compile(r"[ \t]+")
NEWLINES_RE = re.compile(r"\n{3,}")

# Russian month names in the genitive form used in written dates
# ("30 ноября 1994"), mapped to month numbers 1-12.
RUSSIAN_MONTHS = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "января",
            "февраля",
            "марта",
            "апреля",
            "мая",
            "июня",
            "июля",
            "августа",
            "сентября",
            "октября",
            "ноября",
            "декабря",
        ),
        start=1,
    )
}
|
||||
|
||||
|
||||
def ensure_dir(path: Path) -> Path:
    """Create ``path`` (and missing parents) as a directory and return it.

    An already existing directory is not an error.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
|
||||
|
||||
|
||||
def read_json(path: Path, default: Any = None) -> Any:
    """Load UTF-8 JSON from ``path``; return ``default`` if the path is absent."""
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Any) -> None:
    """Serialize ``payload`` to ``path`` as indented UTF-8 JSON.

    The parent directory is created first. Non-ASCII characters are written
    as-is and key order is preserved.
    """
    ensure_dir(path.parent)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=False)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def sha256_text(value: str) -> str:
    """Return the hex SHA-256 digest of ``value`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(value.encode("utf-8"))
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def normalize_text(value: str) -> str:
    """Tidy whitespace in extracted text.

    Non-breaking spaces become plain spaces, runs of spaces/tabs collapse to
    one space, spaces around newlines are dropped, three or more consecutive
    newlines shrink to two, and the result is stripped.
    """
    without_nbsp = value.replace("\xa0", " ")
    single_spaced = WHITESPACE_RE.sub(" ", without_nbsp)
    trimmed_lines = re.sub(r" *\n *", "\n", single_spaced)
    return NEWLINES_RE.sub("\n\n", trimmed_lines).strip()
|
||||
|
||||
|
||||
def slugify(value: str) -> str:
    """Build a lowercase ASCII slug from ``value``.

    Every run of characters other than ASCII letters and digits becomes one
    hyphen and edge hyphens are removed. When nothing is left the slug is
    "document".
    """
    lowered = value.lower()
    hyphenated = re.sub(r"[^a-zA-Z0-9]+", "-", lowered)
    return hyphenated.strip("-") or "document"
|
||||
|
||||
|
||||
def to_absolute_url(url: str, base_url: str) -> str:
    """Resolve ``url`` against ``base_url`` using ``urllib.parse.urljoin``."""
    absolute = urljoin(base_url, url)
    return absolute
|
||||
|
||||
|
||||
def parse_russian_date(value: str) -> str | None:
    """Find the first written Russian date in ``value`` as an ISO string.

    A date looks like "<day> <month name> <year>", e.g. "30 ноября 1994",
    with the month name taken from ``RUSSIAN_MONTHS``; matching is
    case-insensitive. Candidates whose middle word is not a month name, or
    whose numbers do not form a real calendar date, are skipped and the
    search continues with the next candidate.

    Returns:
        ``YYYY-MM-DD`` for the first valid date, or ``None`` if there is none.
    """
    for match in re.finditer(r"(\d{1,2})\s+([а-я]+)\s+(\d{4})", value.lower()):
        month = RUSSIAN_MONTHS.get(match.group(2))
        if month is None:
            # "<number> <word> <year>" where the word is not a month name.
            continue

        day = int(match.group(1))
        year = int(match.group(3))
        try:
            return datetime(year, month, day).date().isoformat()
        except ValueError:
            # Impossible date such as day 31 in a 30-day month; keep looking.
            continue

    return None
|
||||
|
||||
|
||||
def chunk_paragraphs(
    paragraphs: list[str], max_chars: int = 4500, overlap_paragraphs: int = 1
) -> list[str]:
    """Group paragraphs into chunks of roughly at most ``max_chars`` characters.

    Paragraphs are joined with a blank line. When the next paragraph would
    push the current chunk past ``max_chars`` the chunk is closed, and up to
    ``overlap_paragraphs`` of its last paragraphs are repeated at the start
    of the next chunk — but only as many as still leave room for the new
    paragraph. A single paragraph longer than ``max_chars`` is kept whole as
    its own chunk.

    Args:
        paragraphs: Paragraph texts in reading order.
        max_chars: Size budget per chunk; each paragraph is counted with two
            extra characters for its separator.
        overlap_paragraphs: Maximum number of trailing paragraphs carried into
            the next chunk; zero or less disables the overlap.

    Returns:
        The non-empty chunk strings in order.
    """
    if not paragraphs:
        return []

    separator_len = 2  # len("\n\n")
    chunks: list[str] = []
    current: list[str] = []
    current_len = 0

    for paragraph in paragraphs:
        needed = len(paragraph) + separator_len
        if current and current_len + needed > max_chars:
            chunks.append("\n\n".join(current).strip())
            current = current[-overlap_paragraphs:] if overlap_paragraphs > 0 else []
            current_len = sum(len(item) + separator_len for item in current)
            # Drop the oldest carried-over paragraphs until the new one fits.
            # Without this the overlap alone could push every following chunk
            # past max_chars and make a chunk a pure prefix of its successor.
            while current and current_len + needed > max_chars:
                current_len -= len(current[0]) + separator_len
                del current[0]

        current.append(paragraph)
        current_len += needed

    if current:
        chunks.append("\n\n".join(current).strip())

    return [chunk for chunk in chunks if chunk]
|
||||
Reference in New Issue
Block a user