"""Build and persist the manifest of target documents."""

from __future__ import annotations

from collections import defaultdict
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup

from parser.config import MANIFEST_PATH, POPULAR_URL, TARGET_DOCUMENTS, USER_AGENT
from parser.utils import to_absolute_url, write_json

# Fields copied, in this order, from a TARGET_DOCUMENTS entry into the
# per-category document listing of the manifest.
_DOCUMENT_FIELDS = ("key", "source_url", "law_type", "source_short_name")


def _document_summary(doc: dict) -> dict:
    """Return the subset of *doc* that is listed under its category."""
    return {field: doc[field] for field in _DOCUMENT_FIELDS}


def discover_documents(session: requests.Session) -> dict:
    """Assemble the document manifest, store it and return it.

    The page at ``POPULAR_URL`` is downloaded through *session* and its
    ``<a href>`` elements are indexed by link text. The entries of
    ``TARGET_DOCUMENTS`` are grouped by ``category_key``; every group becomes
    one category record that notes whether a link whose text equals the
    category title exists on the downloaded page.

    Args:
        session: Session used for the single GET request (30 second timeout).

    Returns:
        The manifest dict with the keys ``generated_at`` (UTC timestamp in
        ISO format), ``source_page``, ``categories`` (ordered by title) and
        ``documents``. The same dict is passed to ``write_json`` together
        with ``MANIFEST_PATH`` before it is returned.

    Raises:
        Whatever ``session.get`` or ``response.raise_for_status()`` raise for
        a failed request, and ``KeyError`` when a target document lacks one of
        the fields read here.
    """
    page = session.get(POPULAR_URL, timeout=30)
    page.raise_for_status()
    markup = BeautifulSoup(page.text, "html.parser")

    # Link text -> absolute URL. When several links share the same text the
    # last one on the page wins.
    links_by_text = {}
    for link in markup.select("a[href]"):
        label = link.get_text(" ", strip=True)
        links_by_text[label] = to_absolute_url(link.get("href", ""), POPULAR_URL)

    # Bucket the configured documents per category, keeping their order.
    docs_by_category: dict[str, list[dict]] = defaultdict(list)
    for target in TARGET_DOCUMENTS:
        docs_by_category[target["category_key"]].append(target)

    category_records = []
    for key, members in docs_by_category.items():
        # The first document of a group supplies the category title.
        title = members[0]["consultant_category"]
        record = {
            "key": key,
            "title": title,
            # Exact match against the stripped link text only.
            "found_on_popular_page": title in links_by_text,
            "documents": [_document_summary(member) for member in members],
        }
        category_records.append(record)

    manifest = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "source_page": POPULAR_URL,
        "categories": category_records,
        "documents": TARGET_DOCUMENTS,
    }
    # list.sort is stable, so categories with equal titles keep their
    # grouping order.
    category_records.sort(key=lambda record: record["title"])

    write_json(MANIFEST_PATH, manifest)
    return manifest


def build_session() -> requests.Session:
    """Return a new ``requests.Session`` whose User-Agent is ``USER_AGENT``."""
    client = requests.Session()
    client.headers["User-Agent"] = USER_AGENT
    return client