# GorichBot/rag_api/app/menu_catalog.py

from __future__ import annotations

import json
import re
from pathlib import Path

from .config import settings
from .models import MenuItem, MenuSnapshot


# Lower-cased Russian filler words (question words, pronouns, "recommend",
# "tasty", ...) that tokenize() drops because they carry no search signal.
QUERY_STOPWORDS = {
    "что",
    "у",
    "вас",
    "есть",
    "из",
    "как",
    "ли",
    "мне",
    "могу",
    "хочу",
    "надо",
    "для",
    "под",
    "про",
    "или",
    "это",
    "эта",
    "этот",
    "какой",
    "какая",
    "какие",
    "посоветуй",
    "посоветуйте",
    "подбери",
    "подобрать",
    "вкусную",
    "вкусный",
    "вкусное",
}

# Runs of Latin letters, Cyrillic letters and digits.  "ё"/"Ё" are listed
# explicitly: their code points lie outside the contiguous а-я / А-Я ranges,
# so without them a word such as "копчёный" would be split in two.
_TOKEN_PATTERN = re.compile(r"[a-zA-Zа-яА-ЯёЁ0-9]+")


def tokenize(value: str) -> list[str]:
    """Split *value* into lower-cased search tokens.

    The text is lower-cased and cut into alphanumeric runs (Latin, Cyrillic
    including "ё", digits).  A run is kept only when it is not listed in
    ``QUERY_STOPWORDS`` and is either longer than two characters or consists
    solely of digits, so short numbers such as sizes survive while short
    particles do not.

    Returns:
        The surviving tokens in their original order; duplicates are kept.
    """
    return [
        token
        for token in _TOKEN_PATTERN.findall(value.lower())
        if token not in QUERY_STOPWORDS and (len(token) > 2 or token.isdigit())
    ]


# Query word -> menu vocabulary that should be boosted when that word appears
# in a customer query.  Every grammatical form of a word shares the same hint
# list, so the table is written once per group and expanded per word form;
# each key receives its own list object.
QUERY_HINTS: dict[str, list[str]] = {
    word_form: list(hints)
    for word_forms, hints in (
        (
            ("шаурма", "шаурмы", "шаверма", "шавуха"),
            ("шаурма", "классика"),
        ),
        (
            ("острый", "острая", "острое", "острого", "пикантный"),
            ("халапеньо", "шрирача", "том", "ям"),
        ),
        (
            ("сыр", "сыром", "сыра", "сырный", "сырная"),
            ("сыр", "моцарелла", "пармезан", "крем", "чиз"),
        ),
        (
            ("рыбный", "рыбная"),
            ("лосось",),
        ),
        (
            ("мясной", "мясная"),
            ("свинина", "курица", "ростбиф", "колбаски", "пепперони"),
        ),
    )
    for word_form in word_forms
}

# Alternative spellings / word forms of a category name, mapped to the
# canonical category they are replaced with before category filtering.
CATEGORY_ALIASES = dict.fromkeys(("шаурмы", "шаверма", "шавуха"), "шаурма")


class MenuCatalog:
    """Read-only view over the menu snapshot stored as a JSON file.

    The snapshot is read from ``self.snapshot_path`` each time a method needs
    it; nothing is cached on the instance.  Every accessor returns an empty
    result when the snapshot file does not exist.
    """

    def __init__(self) -> None:
        self.snapshot_path = Path(settings.menu_snapshot_path)

    def exists(self) -> bool:
        """Return True when the snapshot file is present on disk."""
        return self.snapshot_path.exists()

    def load_snapshot(self) -> MenuSnapshot:
        """Read the snapshot file as UTF-8 JSON and validate it as a ``MenuSnapshot``."""
        data = json.loads(self.snapshot_path.read_text(encoding="utf-8"))
        return MenuSnapshot.model_validate(data)

    def menu_documents(self) -> list[tuple[MenuItem, str]]:
        """Return every menu item paired with its searchable text.

        The text joins name, category, description, comma-separated
        ingredients, size (empty string when missing) and price label with
        ``" | "``.  Returns an empty list when the snapshot file is missing.
        """
        if not self.exists():
            return []

        return [
            (
                item,
                " | ".join(
                    [
                        item.name,
                        item.category,
                        item.description,
                        ", ".join(item.ingredients),
                        item.size or "",
                        item.price_label,
                    ]
                ),
            )
            for item in self.load_snapshot().items
        ]

    def items_map(self) -> dict[str, MenuItem]:
        """Return the snapshot items keyed by ``item_id`` (empty when no snapshot)."""
        if not self.exists():
            return {}

        return {item.item_id: item for item in self.load_snapshot().items}

    def search(
        self,
        query: str = "",
        max_price: int | None = None,
        category: str | None = None,
        must_include: list[str] | None = None,
        must_not_include: list[str] | None = None,
        limit: int = 5,
        candidate_ids: list[str] | None = None,
        semantic_ranks: dict[str, int] | None = None,
    ) -> list[dict[str, object]]:
        """Filter and rank menu items, returning the best matches as plain dicts.

        Args:
            query: Free-text request; it is tokenized and each token is
                matched as a substring of the item's searchable text.
            max_price: When given, items priced above it, and items without
                a price, are excluded.
            category: When given, only items whose category equals it
                (case-insensitively, after ``CATEGORY_ALIASES``) are kept.
            must_include: Substrings that must all occur in the item text.
            must_not_include: Substrings none of which may occur in the text.
            limit: Maximum number of results; a negative value yields no
                results.
            candidate_ids: When non-empty, restricts the search to these item
                ids.  An empty or missing list applies no restriction.
            semantic_ranks: Optional ``item_id -> rank`` mapping; an item with
                rank ``r`` gains ``max(0, 20 - r)`` points.

        Returns:
            Up to ``limit`` dicts with the item's fields plus ``"score"``,
            ordered by score (highest first), then price (lowest first, a
            missing price counting as 0), then name in reverse alphabetical
            order.  Items that pass the filters are returned even when their
            score is 0.
        """
        if not self.exists():
            return []

        must_include = [value.lower() for value in (must_include or [])]
        must_not_include = [value.lower() for value in (must_not_include or [])]
        query_tokens = tokenize(query)

        normalized_category = category.lower() if category else None
        if normalized_category is not None:
            normalized_category = CATEGORY_ALIASES.get(
                normalized_category, normalized_category
            )

        # Duplicates are kept on purpose: a hint reached through several query
        # tokens is counted once per occurrence, as before.
        hint_tokens: list[str] = [
            hint for token in query_tokens for hint in QUERY_HINTS.get(token, ())
        ]
        candidate_set = set(candidate_ids or [])
        semantic_ranks = semantic_ranks or {}

        # Every item that survives the must_include filter contains all of the
        # required substrings, so the +4-per-substring bonus is the same for
        # each of them and can be computed once.
        must_include_bonus = 4 * len(must_include)
        # Pure "browse a category" request: no query words, no required words.
        category_only_browse = (
            bool(category) and not query_tokens and not must_include
        )

        scored_items: list[tuple[int, MenuItem]] = []
        for item, text in self.menu_documents():
            if candidate_set and item.item_id not in candidate_set:
                continue

            category_lower = item.category.lower()
            if normalized_category and category_lower != normalized_category:
                continue
            # With a price cap, unpriced items are excluded as well.
            if max_price is not None and (
                item.price is None or item.price > max_price
            ):
                continue

            lowered = text.lower()
            if any(value not in lowered for value in must_include):
                continue
            if any(value in lowered for value in must_not_include):
                continue

            name_lower = item.name.lower()
            score = must_include_bonus

            for token in query_tokens:
                if token in lowered:
                    score += 3
                if token in name_lower:
                    score += 5

            for token in hint_tokens:
                if token in lowered:
                    score += 6
                if token == category_lower:
                    score += 8

            if item.item_id in semantic_ranks:
                score += max(0, 20 - semantic_ranks[item.item_id])

            if category_only_browse:
                score += 1

            scored_items.append((score, item))

        # reverse=True applies to the whole key: score descending, negated
        # price descending (i.e. cheaper first), name descending.
        scored_items.sort(
            key=lambda row: (
                row[0],
                -(row[1].price or 0),
                row[1].name,
            ),
            reverse=True,
        )

        # A negative limit used as a slice bound would mean "all but the last
        # N" matches; treat it as "no results" instead.
        if limit is not None and limit < 0:
            limit = 0

        return [
            {
                "item_id": item.item_id,
                "name": item.name,
                "category": item.category,
                "description": item.description,
                "ingredients": item.ingredients,
                "price": item.price,
                "price_label": item.price_label,
                "size": item.size,
                "photo_url": item.photo_url,
                "source_url": item.source_url,
                "score": score,
            }
            for score, item in scored_items[:limit]
        ]