Files

61 lines
1.9 KiB
Python
Raw Permalink Normal View History

2026-05-25 01:12:43 +03:00
from __future__ import annotations
from collections import defaultdict
from datetime import datetime, timezone
import requests
from bs4 import BeautifulSoup
from parser.config import MANIFEST_PATH, POPULAR_URL, TARGET_DOCUMENTS, USER_AGENT
from parser.utils import to_absolute_url, write_json
def discover_documents(session: requests.Session) -> dict:
    """Build the document manifest, write it to disk, and return it.

    Downloads ``POPULAR_URL`` through *session*, indexes every ``<a href>``
    on that page by its visible text, groups ``TARGET_DOCUMENTS`` by their
    ``category_key`` and records, per category, whether the category title
    occurs verbatim as link text on the page.

    Args:
        session: Session used for the single GET request (30 s timeout).

    Returns:
        The manifest dict with the keys ``generated_at`` (UTC timestamp in
        ISO format), ``source_page``, ``categories`` (ordered by title) and
        ``documents``. The same dict is handed to ``write_json`` together
        with ``MANIFEST_PATH`` before returning.

    Raises:
        requests.HTTPError: Via ``raise_for_status`` when the page request
            returns an error status.
    """
    page = session.get(POPULAR_URL, timeout=30)
    page.raise_for_status()
    markup = BeautifulSoup(page.text, "html.parser")

    # Link text -> absolute URL; when two links share a text the later wins.
    links_by_text = {}
    for link in markup.select("a[href]"):
        link_text = link.get_text(" ", strip=True)
        links_by_text[link_text] = to_absolute_url(link.get("href", ""), POPULAR_URL)

    # Bucket the configured documents by category, keeping first-seen order.
    by_category: dict[str, list[dict]] = {}
    for target in TARGET_DOCUMENTS:
        by_category.setdefault(target["category_key"], []).append(target)

    summary_fields = ("key", "source_url", "law_type", "source_short_name")
    entries = []
    for key, members in by_category.items():
        # The first document of a group supplies the category title.
        title = members[0]["consultant_category"]
        entry = {
            "key": key,
            "title": title,
            "found_on_popular_page": title in links_by_text,
        }
        entry["documents"] = [
            {field: member[field] for field in summary_fields}
            for member in members
        ]
        entries.append(entry)

    manifest = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "source_page": POPULAR_URL,
    }
    manifest["categories"] = sorted(entries, key=lambda entry: entry["title"])
    manifest["documents"] = TARGET_DOCUMENTS

    write_json(MANIFEST_PATH, manifest)
    return manifest
def build_session() -> requests.Session:
    """Return a new ``requests.Session`` whose User-Agent header is ``USER_AGENT``."""
    http = requests.Session()
    http.headers["User-Agent"] = USER_AGENT
    return http