first commit

2026-05-12 23:37:04 +03:00
commit aff0bc2990
67 changed files with 3984 additions and 0 deletions
@@ -0,0 +1,4 @@
+GORICH_SITE_URL=https://gorych34.ru/
+MENU_OUTPUT_PATH=/data/menu/gorich_menu.json
+REQUEST_TIMEOUT_SECONDS=20
+SCRAPE_ON_STARTUP=true
@@ -0,0 +1,13 @@
+# Runtime image for the menu scraper service.
+FROM python:3.12-slim
+
+WORKDIR /app
+
+# The dependency list is copied on its own, before the application code.
+# NOTE(review): presumably so the pip layer stays cached when only app code changes - confirm.
+COPY menu_scraper/requirements.txt /app/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+COPY menu_scraper/app /app/app
+
+# Serve the `app` object from module `app.main` on all interfaces, port 8010.
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8010"]
+
@@ -0,0 +1 @@
+
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+
+
+@dataclass(slots=True)
+class Settings:
+    site_url: str = os.getenv("GORICH_SITE_URL", "https://gorych34.ru/")
+    output_path: str = os.getenv("MENU_OUTPUT_PATH", "/data/menu/gorich_menu.json")
+    request_timeout: float = float(os.getenv("REQUEST_TIMEOUT_SECONDS", "20"))
+    scrape_on_startup: bool = os.getenv("SCRAPE_ON_STARTUP", "true").lower() == "true"
+
+
+settings = Settings()
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import json
+from contextlib import asynccontextmanager
+from pathlib import Path
+
+from fastapi import FastAPI, HTTPException
+
+from .config import settings
+from .models import MenuSnapshot
+from .scraper import GorichMenuScraper
+
+
+# Module-level scraper instance used by the startup hook and the /scrape endpoint.
+scraper = GorichMenuScraper()
+# Location of the JSON snapshot file that the read endpoints load.
+output_path = Path(settings.output_path)
+
+
+@asynccontextmanager
+async def lifespan(_: FastAPI):
+    if settings.scrape_on_startup:
+        await scraper.scrape_and_save()
+    yield
+
+
+# ASGI application object; `lifespan` runs the optional startup scrape.
+app = FastAPI(
+    title="Gorich Menu Scraper",
+    version="1.0.0",
+    lifespan=lifespan,
+)
+
+
+def load_snapshot_from_disk() -> MenuSnapshot:
+    if not output_path.exists():
+        raise HTTPException(status_code=404, detail="Menu snapshot not found")
+
+    data = json.loads(output_path.read_text(encoding="utf-8"))
+    return MenuSnapshot.model_validate(data)
+
+
+@app.get("/health")
+async def health() -> dict[str, str]:
+    return {"status": "ok"}
+
+
+@app.post("/scrape", response_model=MenuSnapshot)
+async def scrape_menu() -> MenuSnapshot:
+    return await scraper.scrape_and_save()
+
+
+@app.get("/items", response_model=MenuSnapshot)
+async def get_items() -> MenuSnapshot:
+    return load_snapshot_from_disk()
+
+
+@app.get("/items/{item_id}")
+async def get_item(item_id: str) -> dict[str, object]:
+    snapshot = load_snapshot_from_disk()
+    for item in snapshot.items:
+        if item.item_id == item_id:
+            return item.model_dump(mode="json")
+
+    raise HTTPException(status_code=404, detail="Menu item not found")
+
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+# One menu position as returned by the API and stored in the snapshot file.
+# (Documented with comments rather than a docstring so the generated schema
+# description stays unchanged.)
+class MenuItem(BaseModel):
+    # Identifier of the item; the /items/{item_id} endpoint matches on it.
+    item_id: str
+    name: str
+    category: str
+    # Human-readable description text.
+    description: str
+    # Individual ingredient names.
+    ingredients: list[str]
+    # Numeric price; None when no price is known.
+    price: int | None = None
+    # Price as display text.
+    price_label: str
+    # Size/weight/volume text; None when not detected.
+    size: str | None = None
+    photo_url: str
+    source_url: str
+    # Moment the item was scraped.
+    scraped_at: datetime
+    # Free-form extra fields; defaults to an empty dict.
+    metadata: dict[str, Any] = Field(default_factory=dict)
+
+
+# Result of one scrape: the items plus where and when they were collected.
+class MenuSnapshot(BaseModel):
+    source_url: str
+    scraped_at: datetime
+    # Number of entries in `items` (stored explicitly, not derived by the model).
+    total_items: int
+    items: list[MenuItem]
+
@@ -0,0 +1,309 @@
+from __future__ import annotations
+
+import json
+import re
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.parse import urljoin
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .config import settings
+from .models import MenuItem, MenuSnapshot
+
+
+# Call prefixes after which the embedded shop JSON is looked for in the page
+# source; they are tried in this order.
+SHOP_PAYLOAD_MARKERS = (
+    "MsJsShop.init(",
+    "MsJsPublishedManager.addJsData(",
+)
+# Matches a size such as "30 см" or "250 мл", or a slash-separated series
+# like "30 см / 40 см" (units: см, г, мл; case-insensitive).
+# NOTE(review): nothing anchors the end of the unit, so text like "5 грибов"
+# also matches (as "5 г") - confirm whether a word boundary is wanted.
+SIZE_PATTERN = re.compile(
+    r"(\d+\s*(?:см|г|мл)(?:\s*/\s*\d+\s*(?:см|г|мл))*)",
+    re.IGNORECASE,
+)
+
+
+def normalize_spaces(value: str) -> str:
+    return " ".join(value.replace("\xa0", " ").split())
+
+
+def compact_text(value: str) -> str:
+    return re.sub(r"\s+", "", value.replace("\xa0", " ")).lower()
+
+
+def parse_price(price_label: str) -> int | None:
+    cleaned = normalize_spaces(price_label).lower()
+    if "бесплатно" in cleaned:
+        return None
+
+    digits = re.sub(r"[^\d]", "", cleaned)
+    return int(digits) if digits else None
+
+
+def parse_ingredients(description: str) -> list[str]:
+    cleaned = normalize_spaces(description)
+    if not cleaned:
+        return []
+
+    lower_cleaned = cleaned.lower()
+    if lower_cleaned.startswith("состав:"):
+        cleaned = cleaned.split(":", 1)[1].strip()
+
+    return [part.strip() for part in cleaned.split(",") if part.strip()]
+
+
+def extract_size(*values: str) -> str | None:
+    for value in values:
+        match = SIZE_PATTERN.search(value)
+        if match:
+            return match.group(1).replace(" ", "")
+    return None
+
+
+def is_size_only_line(value: str) -> bool:
+    size = extract_size(value)
+    return size is not None and compact_text(value) == compact_text(size)
+
+
+def extract_first_json_object(html: str, marker: str) -> dict[str, object]:
+    marker_index = html.find(marker)
+    if marker_index == -1:
+        raise ValueError(f"{marker} payload not found in page")
+
+    object_start = html.find("{", marker_index)
+    if object_start == -1:
+        raise ValueError("Shop payload start not found")
+
+    depth = 0
+    in_string = False
+    escaped = False
+    object_end = None
+
+    for index in range(object_start, len(html)):
+        char = html[index]
+
+        if in_string:
+            if escaped:
+                escaped = False
+            elif char == "\\":
+                escaped = True
+            elif char == '"':
+                in_string = False
+            continue
+
+        if char == '"':
+            in_string = True
+        elif char == "{":
+            depth += 1
+        elif char == "}":
+            depth -= 1
+            if depth == 0:
+                object_end = index + 1
+                break
+
+    if object_end is None:
+        raise ValueError("Shop payload end not found")
+
+    return json.loads(html[object_start:object_end])
+
+
+def find_shop_container(payload: object) -> dict[str, object] | None:
+    if isinstance(payload, dict):
+        shop = payload.get("shop")
+        if isinstance(shop, dict) and isinstance(shop.get("products"), list):
+            return payload
+
+        ds_shop = payload.get("dsShop")
+        if isinstance(ds_shop, dict) and isinstance(ds_shop.get("data"), list):
+            return {
+                "shop": {
+                    "products": ds_shop.get("data", []),
+                    "settings": ds_shop.get("settings", {}),
+                }
+            }
+
+        for value in payload.values():
+            found = find_shop_container(value)
+            if found:
+                return found
+
+    if isinstance(payload, list):
+        for value in payload:
+            found = find_shop_container(value)
+            if found:
+                return found
+
+    return None
+
+
+def extract_shop_payload(html: str) -> dict[str, object]:
+    errors: list[str] = []
+    for marker in SHOP_PAYLOAD_MARKERS:
+        try:
+            payload = extract_first_json_object(html, marker)
+        except ValueError as exc:
+            errors.append(str(exc))
+            continue
+
+        shop_container = find_shop_container(payload)
+        if shop_container is not None:
+            return shop_container
+
+        errors.append(f"{marker} found, but shop container is missing")
+
+    raise ValueError("; ".join(errors) or "Shop payload not found in page")
+
+
+def html_fragment_to_lines(fragment: str) -> list[str]:
+    if not fragment:
+        return []
+
+    soup = BeautifulSoup(fragment, "html.parser")
+    return [
+        normalize_spaces(line)
+        for line in soup.get_text("\n", strip=True).splitlines()
+        if normalize_spaces(line)
+    ]
+
+
+class GorichMenuScraper:
+    def __init__(self) -> None:
+        self.site_url = settings.site_url
+        self.output_path = Path(settings.output_path)
+        self.timeout = settings.request_timeout
+
+    async def fetch_html(self) -> str:
+        async with self._build_client() as client:
+            response = await client.get(self.site_url)
+            response.raise_for_status()
+            return response.text
+
+    def _build_client(self) -> httpx.AsyncClient:
+        return httpx.AsyncClient(
+            headers={"User-Agent": "Mozilla/5.0"},
+            follow_redirects=True,
+            timeout=self.timeout,
+        )
+
+    def parse_menu(self, html: str) -> MenuSnapshot:
+        payload = extract_shop_payload(html)
+        shop = payload.get("shop") or {}
+        if not isinstance(shop, dict):
+            raise ValueError("Shop payload has unexpected format")
+
+        shop_settings = shop.get("settings") or {}
+        categories = shop_settings.get("categories") or []
+        products = shop.get("products") or []
+        if not isinstance(categories, list) or not isinstance(products, list):
+            raise ValueError("Shop categories or products have unexpected format")
+
+        category_by_id: dict[int, dict[str, object]] = {}
+        for category in categories:
+            if not isinstance(category, dict):
+                continue
+            category_id = category.get("id")
+            if isinstance(category_id, int):
+                category_by_id[category_id] = category
+
+        scraped_at = datetime.now(timezone.utc)
+        items: list[MenuItem] = []
+
+        for product in products:
+            if not isinstance(product, dict):
+                continue
+            if not product.get("is_visible", True):
+                continue
+
+            product_id = product.get("id")
+            name = normalize_spaces(str(product.get("name", "")))
+            if not product_id or not name:
+                continue
+
+            raw_description = str(product.get("short_description", "") or "")
+            description_lines = html_fragment_to_lines(raw_description)
+            size = extract_size(name, *description_lines)
+            description_parts = [line for line in description_lines if not is_size_only_line(line)]
+            description = " ".join(description_parts).strip()
+            if not description and description_lines:
+                description = " ".join(description_lines).strip()
+
+            raw_category_ids = [
+                category_id
+                for category_id in product.get("category_list", [])
+                if isinstance(category_id, int)
+            ]
+            sorted_category_ids = sorted(
+                raw_category_ids,
+                key=lambda category_id: int(category_by_id.get(category_id, {}).get("pos", 10_000)),
+            )
+            category_name = "прочее"
+            primary_category_id: int | None = None
+            if sorted_category_ids:
+                primary_category_id = sorted_category_ids[0]
+                category_name = normalize_spaces(
+                    str(category_by_id.get(primary_category_id, {}).get("name", "прочее"))
+                ).lower()
+
+            image_url = ""
+            image_list = product.get("image_list", [])
+            if isinstance(image_list, list):
+                for image in image_list:
+                    if not isinstance(image, dict):
+                        continue
+                    raw_url = str(image.get("url", "") or "")
+                    if raw_url:
+                        image_url = urljoin(self.site_url, raw_url)
+                        break
+
+            price = product.get("price")
+            numeric_price = int(price) if isinstance(price, int) else None
+            currency = normalize_spaces(str(product.get("currency", "руб.") or "руб."))
+            price_label = (
+                f"{numeric_price} {currency}" if numeric_price is not None else "Цена не указана"
+            )
+
+            description_url = str(product.get("description_url", "") or "")
+            source_url = urljoin(self.site_url, description_url) if description_url else self.site_url
+
+            items.append(
+                MenuItem(
+                    item_id=str(product_id),
+                    name=name,
+                    category=category_name,
+                    description=description,
+                    ingredients=parse_ingredients(description),
+                    price=parse_price(price_label),
+                    price_label=price_label,
+                    size=size,
+                    photo_url=image_url,
+                    source_url=source_url,
+                    scraped_at=scraped_at,
+                    metadata={
+                        "category_id": primary_category_id,
+                        "category_ids": sorted_category_ids,
+                        "raw_short_description": raw_description,
+                        "amount": product.get("amount"),
+                        "sku": product.get("sku"),
+                    },
+                )
+            )
+
+        return MenuSnapshot(
+            source_url=self.site_url,
+            scraped_at=scraped_at,
+            total_items=len(items),
+            items=items,
+        )
+
+    def save_snapshot(self, snapshot: MenuSnapshot) -> None:
+        self.output_path.parent.mkdir(parents=True, exist_ok=True)
+        self.output_path.write_text(
+            json.dumps(snapshot.model_dump(mode="json"), ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+    async def scrape_and_save(self) -> MenuSnapshot:
+        html = await self.fetch_html()
+        snapshot = self.parse_menu(html)
+        self.save_snapshot(snapshot)
+        return snapshot
@@ -0,0 +1,6 @@
+beautifulsoup4==4.12.3
+fastapi==0.115.12
+httpx==0.28.1
+pydantic==2.11.4
+uvicorn==0.34.2
+