from __future__ import annotations import re from datetime import datetime, timezone import httpx from bs4 import BeautifulSoup from .config import settings from .models import KnowledgeDocument MAP_PATTERN = re.compile( r"yandexMaps\.addMap\('([^']+)'\s*,\s*'([^']+)'\s*,\s*'([^']+)'\)" ) def normalize_spaces(value: str) -> str: return " ".join(value.replace("\xa0", " ").split()) def deduplicate_preserving_order(values: list[str]) -> list[str]: seen: set[str] = set() result: list[str] = [] for value in values: if value and value not in seen: seen.add(value) result.append(value) return result def is_meaningful_value(value: str) -> bool: return any(char.isalnum() for char in value) class SiteKnowledgeScraper: ABOUT_MARKER = "ТЕРРИТОРИЯ БЫСТРОГО ПИТАНИЯ В ВОЛГОГРАДЕ" MENU_MARKER = "МЕНЮ" DELIVERY_MARKER = "ДОСТАВКА" CONTACT_MARKER = "КОНТАКТЫ" CONTACT_END_MARKERS = ("Закрыть", "OK") def __init__(self) -> None: self.site_url = settings.site_url self.timeout = settings.request_timeout async def fetch_homepage(self) -> str: async with httpx.AsyncClient( headers={"User-Agent": "Mozilla/5.0"}, follow_redirects=True, timeout=self.timeout, ) as client: response = await client.get(self.site_url) response.raise_for_status() return response.text def visible_strings(self, soup: BeautifulSoup) -> list[str]: return [ normalized for text in soup.stripped_strings for normalized in [normalize_spaces(text)] if normalized and is_meaningful_value(normalized) ] def find_marker(self, values: list[str], marker: str, start: int = 0) -> int | None: for index in range(start, len(values)): if values[index] == marker: return index return None def find_last_marker(self, values: list[str], marker: str, start: int = 0) -> int | None: for index in range(len(values) - 1, start - 1, -1): if values[index] == marker: return index return None def slice_between_markers( self, values: list[str], start_marker: str, end_markers: tuple[str, ...], start_at: int = 0, ) -> list[str]: start_index = self.find_marker(values, start_marker, start_at) if start_index is None: return [] end_index = len(values) for marker in end_markers: marker_index = self.find_marker(values, marker, start_index + 1) if marker_index is not None: end_index = min(end_index, marker_index) return values[start_index:end_index] def extract_social_links(self, soup: BeautifulSoup) -> list[str]: links: list[str] = [] for node in soup.select("[data-page-link]"): href = node.get("data-page-link") label = normalize_spaces(node.get_text(" ", strip=True)) if not href: continue if label: links.append(f"{label}: {href}") else: links.append(str(href)) return deduplicate_preserving_order(links) def extract_map_coordinates(self, html: str) -> str | None: match = MAP_PATTERN.search(html) if not match: return None latitude = normalize_spaces(match.group(2)) longitude = normalize_spaces(match.group(3)) return f"{latitude}, {longitude}" def parse_homepage(self, html: str) -> list[KnowledgeDocument]: soup = BeautifulSoup(html, "html.parser") strings = self.visible_strings(soup) documents: list[KnowledgeDocument] = [] scraped_at = datetime.now(timezone.utc) meta_description = soup.select_one('meta[name="description"]') if meta_description and meta_description.get("content"): documents.append( KnowledgeDocument( doc_id="site-meta-description", title="Краткое описание заведения", text=normalize_spaces(meta_description["content"]), source_type="about", source_url=self.site_url, metadata={"scraped_at": scraped_at.isoformat()}, ) ) about_section = self.slice_between_markers( strings, self.ABOUT_MARKER, (self.MENU_MARKER,), ) if about_section: documents.append( KnowledgeDocument( doc_id="site-about", title=about_section[0], text="\n".join(deduplicate_preserving_order(about_section[1:])), source_type="about", source_url=self.site_url, metadata={"scraped_at": scraped_at.isoformat()}, ) ) social_links = self.extract_social_links(soup) if social_links: documents.append( KnowledgeDocument( doc_id="site-links", title="Соцсети и внешние площадки", text="\n".join(social_links), source_type="links", source_url=self.site_url, metadata={"scraped_at": scraped_at.isoformat()}, ) ) menu_index = self.find_marker(strings, self.MENU_MARKER) delivery_start = self.find_last_marker( strings, self.DELIVERY_MARKER, start=(menu_index + 1) if menu_index is not None else 0, ) contact_start = self.find_last_marker( strings, self.CONTACT_MARKER, start=(delivery_start + 1) if delivery_start is not None else 0, ) delivery_section = ( strings[delivery_start:contact_start] if delivery_start is not None and contact_start is not None and contact_start > delivery_start else [] ) if delivery_section: documents.append( KnowledgeDocument( doc_id="site-delivery", title=delivery_section[0], text="\n".join(deduplicate_preserving_order(delivery_section[1:])), source_type="delivery", source_url=self.site_url, metadata={"scraped_at": scraped_at.isoformat()}, ) ) auth_index = len(strings) if contact_start is not None: for marker in self.CONTACT_END_MARKERS: marker_index = self.find_marker(strings, marker, contact_start + 1) if marker_index is not None: auth_index = min(auth_index, marker_index) contact_section = ( strings[contact_start:auth_index] if contact_start is not None and auth_index > contact_start else [] ) if contact_section: metadata = {"scraped_at": scraped_at.isoformat()} coordinates = self.extract_map_coordinates(html) if coordinates: metadata["map_coordinates"] = coordinates documents.append( KnowledgeDocument( doc_id="site-contact", title=contact_section[0], text="\n".join(deduplicate_preserving_order(contact_section[1:])), source_type="contact", source_url=self.site_url, metadata=metadata, ) ) return documents async def scrape(self) -> list[KnowledgeDocument]: html = await self.fetch_homepage() return self.parse_homepage(html)