first commit
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
GORICH_SITE_URL=https://gorych34.ru/
|
||||
MENU_OUTPUT_PATH=/data/menu/gorich_menu.json
|
||||
REQUEST_TIMEOUT_SECONDS=20
|
||||
SCRAPE_ON_STARTUP=true
|
||||
@@ -0,0 +1,13 @@
|
||||
# Container image for the menu scraper API (a FastAPI app served by uvicorn).
FROM python:3.12-slim

# Send Python output straight to the container log instead of buffering it.
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Copy only the dependency list first so the install layer is reused
# when just the application code changes.
COPY menu_scraper/requirements.txt /app/requirements.txt

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

COPY menu_scraper/app /app/app

# Port uvicorn listens on (see CMD below).
EXPOSE 8010

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8010"]
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class Settings:
|
||||
site_url: str = os.getenv("GORICH_SITE_URL", "https://gorych34.ru/")
|
||||
output_path: str = os.getenv("MENU_OUTPUT_PATH", "/data/menu/gorich_menu.json")
|
||||
request_timeout: float = float(os.getenv("REQUEST_TIMEOUT_SECONDS", "20"))
|
||||
scrape_on_startup: bool = os.getenv("SCRAPE_ON_STARTUP", "true").lower() == "true"
|
||||
|
||||
|
||||
settings = Settings()
|
||||
@@ -0,0 +1,63 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
|
||||
from .config import settings
|
||||
from .models import MenuSnapshot
|
||||
from .scraper import GorichMenuScraper
|
||||
|
||||
|
||||
# Location of the persisted snapshot that the read endpoints load.
output_path = Path(settings.output_path)

# Single scraper instance shared by the startup hook and the /scrape endpoint.
scraper = GorichMenuScraper()
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(_: FastAPI):
    """Run the optional startup scrape, then hand control to the application.

    A failed startup scrape is logged and does not abort startup: the service
    can still serve a snapshot already on disk, and /scrape can be called to
    retry later.
    """
    import logging

    if settings.scrape_on_startup:
        try:
            await scraper.scrape_and_save()
        except Exception:
            # Top-level boundary: record the failure with its traceback and
            # keep the API available instead of crashing the whole process.
            logging.getLogger(__name__).exception(
                "Startup menu scrape failed; continuing with any snapshot already on disk"
            )
    yield
|
||||
|
||||
|
||||
# ASGI application object; `lifespan` performs the optional startup scrape.
app = FastAPI(title="Gorich Menu Scraper", version="1.0.0", lifespan=lifespan)
|
||||
|
||||
|
||||
def load_snapshot_from_disk() -> MenuSnapshot:
    """Read the saved menu snapshot from ``output_path`` and validate it.

    Returns:
        The snapshot parsed into a ``MenuSnapshot``.

    Raises:
        HTTPException: 404 when the snapshot file does not exist, 500 when the
            file exists but is not a valid snapshot.
    """
    # Read directly instead of checking exists() first: the file may vanish
    # between the check and the read, which would surface as an unhandled error.
    try:
        raw = output_path.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise HTTPException(status_code=404, detail="Menu snapshot not found") from exc

    try:
        return MenuSnapshot.model_validate(json.loads(raw))
    except ValueError as exc:
        # json.JSONDecodeError and pydantic's ValidationError both derive from ValueError.
        raise HTTPException(status_code=500, detail="Menu snapshot is corrupted") from exc
|
||||
|
||||
|
||||
@app.get("/health")
async def health() -> dict[str, str]:
    """Liveness probe: always answers with a fixed "ok" status."""
    payload = {"status": "ok"}
    return payload
|
||||
|
||||
|
||||
@app.post("/scrape", response_model=MenuSnapshot)
async def scrape_menu() -> MenuSnapshot:
    """Run a scrape now, persist the result and return the new snapshot."""
    fresh_snapshot = await scraper.scrape_and_save()
    return fresh_snapshot
|
||||
|
||||
|
||||
@app.get("/items", response_model=MenuSnapshot)
async def get_items() -> MenuSnapshot:
    """Return the whole snapshot as it is currently stored on disk."""
    stored = load_snapshot_from_disk()
    return stored
|
||||
|
||||
|
||||
@app.get("/items/{item_id}")
async def get_item(item_id: str) -> dict[str, object]:
    """Return the stored menu item whose ``item_id`` equals the path parameter.

    Raises:
        HTTPException: 404 when no item in the stored snapshot has that id.
    """
    stored = load_snapshot_from_disk()
    # First match wins, in snapshot order.
    wanted = next((entry for entry in stored.items if entry.item_id == item_id), None)
    if wanted is None:
        raise HTTPException(status_code=404, detail="Menu item not found")
    return wanted.model_dump(mode="json")
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class MenuItem(BaseModel):
    """A single entry of the scraped menu."""

    # Identity and classification.
    item_id: str
    name: str
    category: str

    # Human-readable text and the ingredient list derived from it.
    description: str
    ingredients: list[str]

    # Numeric price (when known) next to the label shown to users.
    price: int | None = Field(default=None)
    price_label: str

    # Size string such as a diameter, weight or volume, when one was found.
    size: str | None = Field(default=None)

    # Links back to the source site.
    photo_url: str
    source_url: str

    # Moment the item was scraped.
    scraped_at: datetime

    # Free-form extra data kept alongside the item.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class MenuSnapshot(BaseModel):
    """The result of one scrape: where it came from, when, and the items found."""

    # Page the snapshot was built from.
    source_url: str

    # Moment the scrape took place.
    scraped_at: datetime

    # Number of entries in ``items``.
    total_items: int

    # The menu entries themselves.
    items: list[MenuItem]
|
||||
|
||||
@@ -0,0 +1,309 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .config import settings
|
||||
from .models import MenuItem, MenuSnapshot
|
||||
|
||||
|
||||
# JavaScript call prefixes after which the page embeds the shop data; they are
# tried in this order.
SHOP_PAYLOAD_MARKERS = ("MsJsShop.init(", "MsJsPublishedManager.addJsData(")

# A number followed by one of the units "см", "г" or "мл", optionally repeated
# with "/" separators (for example "30 см / 500 г"). Case-insensitive.
SIZE_PATTERN = re.compile(
    r"(\d+\s*(?:см|г|мл)(?:\s*/\s*\d+\s*(?:см|г|мл))*)",
    flags=re.IGNORECASE,
)
|
||||
|
||||
|
||||
def normalize_spaces(value: str) -> str:
    """Collapse whitespace runs (non-breaking spaces included) to single spaces and trim."""
    words = value.replace("\xa0", " ").split()
    return " ".join(words)
|
||||
|
||||
|
||||
def compact_text(value: str) -> str:
    """Return *value* lower-cased with all whitespace removed."""
    without_nbsp = value.replace("\xa0", " ")
    squeezed = re.sub(r"\s+", "", without_nbsp)
    return squeezed.lower()
|
||||
|
||||
|
||||
def parse_price(price_label: str) -> int | None:
    """Turn a price label into an integer.

    Returns ``None`` when the label contains the word "бесплатно" or holds no
    digits at all; otherwise every digit in the label is concatenated and
    converted to ``int``.
    """
    label = normalize_spaces(price_label).lower()
    if "бесплатно" in label:
        return None

    only_digits = re.sub(r"[^\d]", "", label)
    if not only_digits:
        return None
    return int(only_digits)
|
||||
|
||||
|
||||
def parse_ingredients(description: str) -> list[str]:
    """Split a description into its comma-separated parts.

    A leading "состав:" prefix (any letter case) is dropped before splitting.
    Empty parts are discarded; an empty description yields an empty list.
    """
    text = normalize_spaces(description)
    if not text:
        return []

    if text.lower().startswith("состав:"):
        # Keep only what follows the first colon.
        text = text.split(":", 1)[1].strip()

    pieces = (chunk.strip() for chunk in text.split(","))
    return [piece for piece in pieces if piece]
|
||||
|
||||
|
||||
def extract_size(*values: str) -> str | None:
    """Return the first size found in *values*, with all whitespace removed.

    The values are searched in order with ``SIZE_PATTERN``; the first match
    wins. Returns ``None`` when no value contains a size.
    """
    for value in values:
        match = SIZE_PATTERN.search(value)
        if match:
            # The pattern allows any whitespace (tabs, non-breaking spaces, ...)
            # between number and unit, so strip all of it, not only " ".
            return re.sub(r"\s+", "", match.group(1))
    return None
|
||||
|
||||
|
||||
def is_size_only_line(value: str) -> bool:
    """Tell whether *value* consists of nothing but a size (ignoring case and whitespace)."""
    found = extract_size(value)
    if found is None:
        return False
    return compact_text(found) == compact_text(value)
|
||||
|
||||
|
||||
def extract_first_json_object(html: str, marker: str) -> dict[str, object]:
    """Decode the first JSON object that follows *marker* in *html*.

    Args:
        html: Page source to search.
        marker: Text that precedes the JSON payload, e.g. a JavaScript call prefix.

    Returns:
        The decoded object that starts at the first ``{`` after the marker.
        Anything following the object is ignored.

    Raises:
        ValueError: if the marker is absent, no ``{`` follows it, or the text
            starting at that ``{`` is not a valid JSON object.
    """
    marker_index = html.find(marker)
    if marker_index == -1:
        raise ValueError(f"{marker} payload not found in page")

    object_start = html.find("{", marker_index)
    if object_start == -1:
        raise ValueError("Shop payload start not found")

    # raw_decode parses exactly one JSON document and stops at its end, which
    # replaces a manual brace/string scanner for locating the object's extent.
    try:
        payload, _ = json.JSONDecoder().raw_decode(html[object_start:])
    except json.JSONDecodeError as exc:
        raise ValueError(f"{marker} payload is not valid JSON: {exc}") from exc
    return payload
|
||||
|
||||
|
||||
def find_shop_container(payload: object) -> dict[str, object] | None:
|
||||
if isinstance(payload, dict):
|
||||
shop = payload.get("shop")
|
||||
if isinstance(shop, dict) and isinstance(shop.get("products"), list):
|
||||
return payload
|
||||
|
||||
ds_shop = payload.get("dsShop")
|
||||
if isinstance(ds_shop, dict) and isinstance(ds_shop.get("data"), list):
|
||||
return {
|
||||
"shop": {
|
||||
"products": ds_shop.get("data", []),
|
||||
"settings": ds_shop.get("settings", {}),
|
||||
}
|
||||
}
|
||||
|
||||
for value in payload.values():
|
||||
found = find_shop_container(value)
|
||||
if found:
|
||||
return found
|
||||
|
||||
if isinstance(payload, list):
|
||||
for value in payload:
|
||||
found = find_shop_container(value)
|
||||
if found:
|
||||
return found
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_shop_payload(html: str) -> dict[str, object]:
    """Locate the shop container embedded in *html*.

    Each marker in ``SHOP_PAYLOAD_MARKERS`` is tried in turn; the first one
    whose JSON payload contains a shop container decides the result.

    Raises:
        ValueError: when no marker yields a shop container. The message joins
            the reason collected for every marker.
    """
    failures: list[str] = []

    for marker in SHOP_PAYLOAD_MARKERS:
        try:
            decoded = extract_first_json_object(html, marker)
        except ValueError as exc:
            failures.append(str(exc))
        else:
            container = find_shop_container(decoded)
            if container is not None:
                return container
            failures.append(f"{marker} found, but shop container is missing")

    raise ValueError("; ".join(failures) or "Shop payload not found in page")
|
||||
|
||||
|
||||
def html_fragment_to_lines(fragment: str) -> list[str]:
    """Convert an HTML fragment into its non-empty, whitespace-normalized text lines."""
    if not fragment:
        return []

    plain_text = BeautifulSoup(fragment, "html.parser").get_text("\n", strip=True)
    normalized = (normalize_spaces(raw_line) for raw_line in plain_text.splitlines())
    return [line for line in normalized if line]
|
||||
|
||||
|
||||
class GorichMenuScraper:
    """Download the configured site page and turn its embedded shop payload into a menu snapshot.

    The site URL, the snapshot file location and the HTTP timeout are taken
    from the module-level ``settings`` object when the instance is created.
    """

    # Sort position used for categories whose "pos" is missing or unusable,
    # so that they are ordered after properly positioned categories.
    _FALLBACK_CATEGORY_POS = 10_000

    def __init__(self) -> None:
        self.site_url = settings.site_url
        self.output_path = Path(settings.output_path)
        self.timeout = settings.request_timeout

    async def fetch_html(self) -> str:
        """Fetch ``site_url`` and return the response body as text.

        Propagates whatever the HTTP client raises for transport problems and,
        through ``raise_for_status()``, for error responses.
        """
        async with self._build_client() as client:
            response = await client.get(self.site_url)
            response.raise_for_status()
            return response.text

    def _build_client(self) -> httpx.AsyncClient:
        """Create the HTTP client used for one fetch (browser-like UA, redirects followed)."""
        return httpx.AsyncClient(
            headers={"User-Agent": "Mozilla/5.0"},
            follow_redirects=True,
            timeout=self.timeout,
        )

    def parse_menu(self, html: str) -> MenuSnapshot:
        """Build a ``MenuSnapshot`` from the shop payload embedded in *html*.

        Products that are not dictionaries, are marked invisible, or lack an id
        or a name are skipped.

        Raises:
            ValueError: when the payload cannot be found or its ``shop``,
                ``settings``, ``categories`` or ``products`` parts do not have
                the expected container types.
        """
        payload = extract_shop_payload(html)
        shop = payload.get("shop") or {}
        if not isinstance(shop, dict):
            raise ValueError("Shop payload has unexpected format")

        shop_settings = shop.get("settings") or {}
        if not isinstance(shop_settings, dict):
            # Fail with the same error type as the other format checks instead
            # of an AttributeError on the .get() below.
            raise ValueError("Shop settings have unexpected format")

        categories = shop_settings.get("categories") or []
        products = shop.get("products") or []
        if not isinstance(categories, list) or not isinstance(products, list):
            raise ValueError("Shop categories or products have unexpected format")

        # Later entries with the same id replace earlier ones.
        category_by_id: dict[int, dict[str, object]] = {
            category["id"]: category
            for category in categories
            if isinstance(category, dict) and isinstance(category.get("id"), int)
        }

        # One timestamp is shared by the snapshot and all of its items.
        scraped_at = datetime.now(timezone.utc)
        items: list[MenuItem] = []
        for product in products:
            if not isinstance(product, dict):
                continue
            item = self._build_item(product, category_by_id, scraped_at)
            if item is not None:
                items.append(item)

        return MenuSnapshot(
            source_url=self.site_url,
            scraped_at=scraped_at,
            total_items=len(items),
            items=items,
        )

    def _build_item(
        self,
        product: dict[str, object],
        category_by_id: dict[int, dict[str, object]],
        scraped_at: datetime,
    ) -> MenuItem | None:
        """Convert one raw product dictionary into a ``MenuItem``; ``None`` means "skip it"."""
        if not product.get("is_visible", True):
            return None

        product_id = product.get("id")
        # `or ""` keeps an explicit null name from turning into the text "None".
        name = normalize_spaces(str(product.get("name") or ""))
        if not product_id or not name:
            return None

        raw_description = str(product.get("short_description", "") or "")
        description_lines = html_fragment_to_lines(raw_description)
        size = extract_size(name, *description_lines)
        description = " ".join(
            line for line in description_lines if not is_size_only_line(line)
        ).strip()
        if not description and description_lines:
            # Every line was a bare size: keep the text rather than return nothing.
            description = " ".join(description_lines).strip()

        sorted_category_ids = self._sorted_category_ids(product, category_by_id)
        category_name = "прочее"
        primary_category_id: int | None = None
        if sorted_category_ids:
            primary_category_id = sorted_category_ids[0]
            primary_category = category_by_id.get(primary_category_id, {})
            category_name = (
                normalize_spaces(str(primary_category.get("name") or "прочее")).lower()
                or "прочее"
            )

        numeric_price = self._coerce_price(product.get("price"))
        currency = normalize_spaces(str(product.get("currency", "руб.") or "руб."))
        price_label = (
            f"{numeric_price} {currency}" if numeric_price is not None else "Цена не указана"
        )

        description_url = str(product.get("description_url", "") or "")
        source_url = urljoin(self.site_url, description_url) if description_url else self.site_url

        return MenuItem(
            item_id=str(product_id),
            name=name,
            category=category_name,
            description=description,
            ingredients=parse_ingredients(description),
            # Use the number itself; re-parsing it out of the label could pick
            # up digits from the currency text or lose a sign.
            price=numeric_price,
            price_label=price_label,
            size=size,
            photo_url=self._first_image_url(product),
            source_url=source_url,
            scraped_at=scraped_at,
            metadata={
                "category_id": primary_category_id,
                "category_ids": sorted_category_ids,
                "raw_short_description": raw_description,
                "amount": product.get("amount"),
                "sku": product.get("sku"),
            },
        )

    def _sorted_category_ids(
        self,
        product: dict[str, object],
        category_by_id: dict[int, dict[str, object]],
    ) -> list[int]:
        """Return the product's integer category ids ordered by category position."""
        raw_ids = product.get("category_list") or []
        if not isinstance(raw_ids, (list, tuple)):
            return []
        return sorted(
            (category_id for category_id in raw_ids if isinstance(category_id, int)),
            key=lambda category_id: self._category_position(category_by_id.get(category_id)),
        )

    @classmethod
    def _category_position(cls, category: object) -> int:
        """Return a category's "pos" as an int, or the fallback when it is absent or unusable."""
        raw_pos = category.get("pos") if isinstance(category, dict) else None
        try:
            return int(raw_pos)
        except (TypeError, ValueError, OverflowError):
            return cls._FALLBACK_CATEGORY_POS

    def _first_image_url(self, product: dict[str, object]) -> str:
        """Return the absolute URL of the product's first image that has one, else ""."""
        image_list = product.get("image_list", [])
        if not isinstance(image_list, list):
            return ""
        for image in image_list:
            if not isinstance(image, dict):
                continue
            raw_url = str(image.get("url", "") or "")
            if raw_url:
                return urljoin(self.site_url, raw_url)
        return ""

    @staticmethod
    def _coerce_price(value: object) -> int | None:
        """Return *value* as a whole-number price, or ``None`` when it is not one.

        Accepts integers, floats with no fractional part and strings made only
        of ASCII digits. Booleans are rejected even though ``bool`` is a
        subclass of ``int``.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return value
        if isinstance(value, float) and value.is_integer():
            return int(value)
        if isinstance(value, str):
            text = value.strip()
            if text.isascii() and text.isdigit():
                return int(text)
        return None

    def save_snapshot(self, snapshot: MenuSnapshot) -> None:
        """Write *snapshot* to ``output_path`` as indented UTF-8 JSON.

        The data is written to a sibling temporary file which is then renamed
        over the target, so a reader never observes a partially written file.
        Missing parent directories are created.
        """
        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(snapshot.model_dump(mode="json"), ensure_ascii=False, indent=2)
        # Same directory as the target, so the rename stays on one filesystem.
        temp_path = self.output_path.with_name(f"{self.output_path.name}.tmp")
        temp_path.write_text(serialized, encoding="utf-8")
        temp_path.replace(self.output_path)

    async def scrape_and_save(self) -> MenuSnapshot:
        """Fetch the page, parse it, persist the snapshot and return it."""
        html = await self.fetch_html()
        snapshot = self.parse_menu(html)
        self.save_snapshot(snapshot)
        return snapshot
|
||||
@@ -0,0 +1,6 @@
|
||||
beautifulsoup4==4.12.3
|
||||
fastapi==0.115.12
|
||||
httpx==0.28.1
|
||||
pydantic==2.11.4
|
||||
uvicorn==0.34.2
|
||||
|
||||
Reference in New Issue
Block a user