61 lines
1.9 KiB
Python
61 lines
1.9 KiB
Python
from __future__ import annotations
|
|
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from parser.config import MANIFEST_PATH, POPULAR_URL, TARGET_DOCUMENTS, USER_AGENT
|
|
from parser.utils import to_absolute_url, write_json
|
|
|
|
|
|
def discover_documents(session: requests.Session) -> dict:
    """Build the document manifest, persist it, and return it.

    Downloads ``POPULAR_URL`` through *session*, collects the text of every
    ``<a href>`` element on that page, groups ``TARGET_DOCUMENTS`` by their
    ``category_key`` and records, per category, whether the category title
    appears verbatim among the collected link texts.

    Args:
        session: Session used to perform the single GET request.

    Returns:
        The manifest dictionary that was passed to ``write_json`` with
        ``MANIFEST_PATH``. It holds the generation timestamp (UTC, ISO 8601),
        the source page URL, the categories sorted by title, and
        ``TARGET_DOCUMENTS`` itself.

    Raises:
        Whatever ``session.get`` or ``response.raise_for_status`` raise when
        the request fails or returns an error status.
    """
    page = session.get(POPULAR_URL, timeout=30)
    page.raise_for_status()

    markup = BeautifulSoup(page.text, "html.parser")

    # Link text -> absolute URL. When several links share the same text the
    # last one wins.
    # NOTE(review): only the keys are consulted below (membership test); the
    # resolved URLs are currently unused.
    anchors: dict[str, str] = {}
    for link in markup.select("a[href]"):
        label = link.get_text(" ", strip=True)
        anchors[label] = to_absolute_url(link.get("href", ""), POPULAR_URL)

    # Bucket the configured documents by category, keeping first-seen order.
    by_category: dict[str, list[dict]] = defaultdict(list)
    for entry in TARGET_DOCUMENTS:
        by_category[entry["category_key"]].append(entry)

    # Fields copied from each configured document into its category listing.
    exported_fields = ("key", "source_url", "law_type", "source_short_name")

    categories = []
    for key, members in by_category.items():
        # The title is taken from the first document of the category.
        title = members[0]["consultant_category"]
        listed = title in anchors
        summaries = [
            {field: member[field] for field in exported_fields}
            for member in members
        ]
        categories.append(
            {
                "key": key,
                "title": title,
                "found_on_popular_page": listed,
                "documents": summaries,
            }
        )

    generated_at = datetime.now(timezone.utc).isoformat()
    # list.sort is stable, so categories with equal titles keep their order.
    categories.sort(key=lambda category: category["title"])

    manifest = {
        "generated_at": generated_at,
        "source_page": POPULAR_URL,
        "categories": categories,
        "documents": TARGET_DOCUMENTS,
    }
    write_json(MANIFEST_PATH, manifest)
    return manifest
|
|
|
|
|
|
def build_session() -> requests.Session:
    """Create a ``requests.Session`` whose ``User-Agent`` header is ``USER_AGENT``.

    Returns:
        A new session; every request made through it carries the configured
        ``User-Agent`` header unless the caller overrides it.
    """
    http = requests.Session()
    http.headers["User-Agent"] = USER_AGENT
    return http
|