first commit
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from parser.config import MANIFEST_PATH, POPULAR_URL, TARGET_DOCUMENTS, USER_AGENT
|
||||
from parser.utils import to_absolute_url, write_json
|
||||
|
||||
|
||||
def discover_documents(session: requests.Session) -> dict:
    """Fetch the popular-documents page and write the discovery manifest.

    Downloads ``POPULAR_URL`` through *session*, collects the text of every
    link on that page, groups ``TARGET_DOCUMENTS`` by ``category_key`` and
    records, for each category, whether its title occurs as a link text on
    the page. The resulting manifest is written to ``MANIFEST_PATH`` via
    ``write_json`` and also returned.

    Args:
        session: Session used for the single GET request (30 second timeout).

    Returns:
        The manifest dict with the keys ``generated_at`` (timezone-aware UTC
        timestamp in ISO format), ``source_page``, ``categories`` (sorted by
        title) and ``documents`` (``TARGET_DOCUMENTS`` as configured).

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error response.
    """
    page = session.get(POPULAR_URL, timeout=30)
    page.raise_for_status()

    markup = BeautifulSoup(page.text, "html.parser")

    # Link text -> absolute URL. When several links share the same text the
    # last one on the page wins.
    # NOTE(review): only the keys are consulted below; the resolved URLs are
    # not read anywhere in this function.
    anchors = {}
    for link in markup.select("a[href]"):
        label = link.get_text(" ", strip=True)
        anchors[label] = to_absolute_url(link.get("href", ""), POPULAR_URL)

    # Bucket the configured documents by category, keeping first-seen order.
    by_category: dict[str, list[dict]] = defaultdict(list)
    for entry in TARGET_DOCUMENTS:
        by_category[entry["category_key"]].append(entry)

    # Fields copied from each configured document into its category listing.
    exported_fields = ("key", "source_url", "law_type", "source_short_name")

    categories = []
    for category_key, members in by_category.items():
        # The first document of a group supplies the category's title.
        title = members[0]["consultant_category"]
        listing = [
            {field: member[field] for field in exported_fields}
            for member in members
        ]
        summary = {
            "key": category_key,
            "title": title,
            "found_on_popular_page": title in anchors,
            "documents": listing,
        }
        categories.append(summary)

    manifest = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "source_page": POPULAR_URL,
        "categories": sorted(categories, key=lambda summary: summary["title"]),
        "documents": TARGET_DOCUMENTS,
    }
    write_json(MANIFEST_PATH, manifest)
    return manifest
|
||||
|
||||
|
||||
def build_session() -> requests.Session:
    """Return a new ``requests.Session`` whose User-Agent is ``USER_AGENT``."""
    http = requests.Session()
    # Set on the session so every request made through it carries the header.
    http.headers["User-Agent"] = USER_AGENT
    return http
|
||||
Reference in New Issue
Block a user