first commit

This commit is contained in:
2026-05-25 01:12:43 +03:00
commit bfc22efe24
83 changed files with 8903 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
"""CLI parser package for LawBot ingestion."""
+5
View File
@@ -0,0 +1,5 @@
from parser.cli import main
# Entry point for ``python -m parser``: hand control to the CLI.
if __name__ == "__main__":
    main()
+135
View File
@@ -0,0 +1,135 @@
from __future__ import annotations
import argparse
import asyncio
from parser.discovery import discover_documents, build_session
from parser.fetcher import fetch_documents, load_manifest, load_raw_index
from parser.ingest import ingest_documents
from parser.normalizer import load_normalized_document, normalize_document, write_normalized_document
from shared import ORM
def parse_categories(value: str | None) -> set[str] | None:
    """Turn a comma-separated ``--categories`` value into a set of keys.

    Returns ``None`` when *value* is ``None`` or empty. Whitespace around each
    name is trimmed and blank entries are dropped, so a value made only of
    commas and spaces yields an empty set.
    """
    if not value:
        return None
    names: set[str] = set()
    for piece in value.split(","):
        trimmed = piece.strip()
        if trimmed:
            names.add(trimmed)
    return names
def select_documents(categories: set[str] | None, limit: int | None) -> list[dict]:
    """Pick the manifest documents that match the CLI filters.

    The manifest comes from ``load_manifest()``. When *categories* is
    non-empty, only documents whose ``category_key`` is in it are kept. When
    *limit* is not ``None`` the result is truncated with a ``[:limit]`` slice.
    """
    selected = load_manifest()["documents"]
    if categories:
        selected = [entry for entry in selected if entry["category_key"] in categories]
    return selected if limit is None else selected[:limit]
def run_discover(_: argparse.Namespace) -> None:
    """Handle ``discover``: run discovery on a fresh session and report the result.

    The parsed arguments are accepted for handler uniformity but not used.
    """
    session = build_session()
    manifest = discover_documents(session)
    total = len(manifest["documents"])
    print(f"discovered {total} documents from {manifest['source_page']}")
def run_fetch(args: argparse.Namespace) -> None:
    """Handle ``fetch``: download the selected documents and print how many were returned.

    Reads ``categories``, ``limit``, ``force`` and ``dry_run`` from *args*.
    """
    category_filter = parse_categories(args.categories)
    documents = select_documents(category_filter, args.limit)
    payloads = fetch_documents(documents, force=args.force, dry_run=args.dry_run)
    print(f"fetched {len(payloads)} documents")
def run_normalize(args: argparse.Namespace) -> None:
    """Handle ``normalize``: convert fetched raw payloads into normalized documents.

    Every selected document must already have an entry in the raw index.

    Raises:
        FileNotFoundError: If a selected document has no raw payload yet.
    """
    documents = select_documents(parse_categories(args.categories), args.limit)
    raw_index = load_raw_index()

    for entry in documents:
        raw_payload = raw_index.get("documents", {}).get(entry["key"])
        if raw_payload is None:
            missing_key = entry["key"]
            raise FileNotFoundError(
                f"raw payload for {missing_key} not found; run `python -m parser fetch` first"
            )
        write_normalized_document(normalize_document(raw_payload), dry_run=args.dry_run)

    # The loop either handles every document or raises, so the count is the list length.
    print(f"normalized {len(documents)} documents")
async def _run_ingest_async(args: argparse.Namespace) -> None:
    """Load the normalized documents for the selection and ingest them.

    In dry-run mode no ``ORM`` is created and ``ingest_documents`` receives
    ``None`` in its place. Otherwise the ORM's schema is initialised first and
    the ORM is always closed afterwards, even when ingestion fails.

    Raises:
        FileNotFoundError: If a selected document has not been normalized yet.
    """
    documents = select_documents(parse_categories(args.categories), args.limit)

    loaded = []
    for entry in documents:
        payload = load_normalized_document(entry["key"])
        if payload is None:
            raise FileNotFoundError(
                f"normalized payload for {entry['key']} not found; run `python -m parser normalize` first"
            )
        loaded.append(payload)

    orm = None if args.dry_run else ORM()
    try:
        if orm is not None:
            await orm.init_schema()
        results = await ingest_documents(orm, loaded, dry_run=args.dry_run)
    finally:
        if orm is not None:
            await orm.close()

    print(f"ingested {len(results)} documents")
def run_ingest(args: argparse.Namespace) -> None:
    """Handle ``ingest``: run the async ingestion routine to completion."""
    pending = _run_ingest_async(args)
    asyncio.run(pending)
def run_pipeline(args: argparse.Namespace) -> None:
    """Handle ``run``: discover, fetch, normalize and ingest in sequence.

    With ``--dry-run`` the pipeline stops after the fetch step and prints a
    notice instead of normalizing and ingesting.
    """
    # Discovery and fetch happen in both modes; only the later stages are skipped.
    run_discover(args)
    run_fetch(args)
    if args.dry_run:
        print("dry-run stopped after fetch preview; raw files and DB were not changed")
        return
    run_normalize(args)
    run_ingest(args)
def build_parser() -> argparse.ArgumentParser:
    """Create the CLI parser with its five sub-commands.

    ``discover`` takes no options. ``fetch``, ``normalize``, ``ingest`` and
    ``run`` share ``--categories``, ``--force``, ``--limit`` and ``--dry-run``.
    Each sub-command stores its handler in the ``func`` default.
    """
    root = argparse.ArgumentParser(description="Consultant ingestion pipeline for LawBot.")
    commands = root.add_subparsers(dest="command", required=True)

    # (sub-command name, handler, whether it takes the shared selection flags)
    command_table = (
        ("discover", run_discover, False),
        ("fetch", run_fetch, True),
        ("normalize", run_normalize, True),
        ("ingest", run_ingest, True),
        ("run", run_pipeline, True),
    )
    for name, handler, takes_common_flags in command_table:
        sub = commands.add_parser(name)
        if takes_common_flags:
            sub.add_argument("--categories", default=None)
            sub.add_argument("--force", action="store_true")
            sub.add_argument("--limit", type=int, default=None)
            sub.add_argument("--dry-run", action="store_true")
        sub.set_defaults(func=handler)
    return root
def main() -> None:
    """Parse the command line and dispatch to the selected sub-command handler."""
    arguments = build_parser().parse_args()
    handler = arguments.func
    handler(arguments)
+136
View File
@@ -0,0 +1,136 @@
from __future__ import annotations
from pathlib import Path
# Parent of the directory that contains this module.
BASE_DIR = Path(__file__).resolve().parent.parent
# Root of everything the parser stores on disk.
VOLUMES_DIR = BASE_DIR / "volumes" / "parser"
# Downloaded HTML for consultant documents.
RAW_ROOT = VOLUMES_DIR / "raw" / "consultant"
# Normalized JSON documents.
NORMALIZED_ROOT = VOLUMES_DIR / "normalized"
# Bookkeeping files: the discovery manifest and the raw-fetch index.
STATE_DIR = VOLUMES_DIR / "state"
MANIFEST_PATH = STATE_DIR / "manifest.json"
RAW_INDEX_PATH = STATE_DIR / "raw_index.json"
# Consultant page listing popular documents.
POPULAR_URL = "https://www.consultant.ru/popular/"
# HTTP settings: request timeout, attempts per URL, concurrent download workers.
REQUEST_TIMEOUT = 30
MAX_RETRIES = 3
MAX_WORKERS = 2
# User-Agent header value identifying this parser.
USER_AGENT = (
    "LawBotParser/1.0 (+https://local.lawbot; "
    "purpose=legal-rag-ingestion; contact=local-dev)"
)
# Fixed catalogue of the documents the pipeline handles. Each entry carries:
#   key                 - unique identifier of the document
#   category_key        - short category identifier (several documents may share one)
#   consultant_category - category title in Russian
#   law_type            - branch-of-law label
#   source_short_name   - short Russian name of the document
#   source_url          - URL of the document's root page on consultant.ru
TARGET_DOCUMENTS = [
    {
        "key": "constitution",
        "category_key": "constitution",
        "consultant_category": "Конституция РФ",
        "law_type": "constitutional",
        "source_short_name": "Конституция РФ",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_28399/",
    },
    {
        "key": "civil_code_part_1",
        "category_key": "civil",
        "consultant_category": "Гражданское право, гражданское законодательство РФ",
        "law_type": "civil",
        "source_short_name": "ГК РФ ч. 1",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_5142/",
    },
    {
        "key": "civil_code_part_2",
        "category_key": "civil",
        "consultant_category": "Гражданское право, гражданское законодательство РФ",
        "law_type": "civil",
        "source_short_name": "ГК РФ ч. 2",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_508506/",
    },
    {
        "key": "civil_code_part_3",
        "category_key": "civil",
        "consultant_category": "Гражданское право, гражданское законодательство РФ",
        "law_type": "civil",
        "source_short_name": "ГК РФ ч. 3",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_482694/",
    },
    {
        "key": "civil_code_part_4",
        "category_key": "civil",
        "consultant_category": "Гражданское право, гражданское законодательство РФ",
        "law_type": "civil",
        "source_short_name": "ГК РФ ч. 4",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_509417/",
    },
    {
        "key": "civil_procedure_code",
        "category_key": "civil_procedure",
        "consultant_category": "Гражданское процессуальное право, гражданский процесс",
        "law_type": "procedural",
        "source_short_name": "ГПК РФ",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_39570/",
    },
    {
        "key": "housing_code",
        "category_key": "housing",
        "consultant_category": "Жилищное право, жилищное законодательство РФ",
        "law_type": "housing",
        "source_short_name": "ЖК РФ",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_51057/",
    },
    {
        "key": "family_code",
        "category_key": "family",
        "consultant_category": "Семейное право, семейное законодательство РФ",
        "law_type": "family",
        "source_short_name": "СК РФ",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_8982/",
    },
    {
        "key": "labor_code",
        "category_key": "labor",
        "consultant_category": "Трудовое право, трудовое законодательство РФ",
        "law_type": "labor",
        "source_short_name": "ТК РФ",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_34683/",
    },
    {
        "key": "consumer_protection_law",
        "category_key": "consumer",
        "consultant_category": "Законодательство РФ о правах потребителя",
        "law_type": "consumer",
        "source_short_name": "ЗОЗПП",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_305/",
    },
    {
        "key": "enforcement_law",
        "category_key": "enforcement",
        "consultant_category": "Законодательство об исполнительном производстве",
        "law_type": "enforcement",
        "source_short_name": "ФЗ об исполнительном производстве",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_71450/",
    },
    {
        "key": "mortgage_law",
        "category_key": "mortgage",
        "consultant_category": "Законодательство об ипотеке",
        "law_type": "mortgage",
        "source_short_name": "ФЗ об ипотеке",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_19396/",
    },
    {
        "key": "administrative_code",
        "category_key": "administrative",
        "consultant_category": "Административное право, административное законодательство РФ",
        "law_type": "administrative",
        "source_short_name": "КоАП РФ",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_34661/",
    },
    {
        "key": "criminal_code",
        "category_key": "criminal",
        "consultant_category": "Уголовное право, уголовное законодательство РФ",
        "law_type": "criminal",
        "source_short_name": "УК РФ",
        "source_url": "https://www.consultant.ru/document/cons_doc_LAW_10699/",
    },
]
+60
View File
@@ -0,0 +1,60 @@
from __future__ import annotations
from collections import defaultdict
from datetime import datetime, timezone
import requests
from bs4 import BeautifulSoup
from parser.config import MANIFEST_PATH, POPULAR_URL, TARGET_DOCUMENTS, USER_AGENT
from parser.utils import to_absolute_url, write_json
def discover_documents(session: requests.Session) -> dict:
    """Build the manifest of target documents and save it to ``MANIFEST_PATH``.

    The popular-documents page is downloaded only to record, per category,
    whether a link with the category's exact title text appears on it. The
    document list itself always comes from ``TARGET_DOCUMENTS``.

    Args:
        session: Session used to download the popular-documents page.

    Returns:
        The manifest dict with ``generated_at``, ``source_page``,
        ``categories`` (sorted by title) and ``documents``.

    Raises:
        requests.HTTPError: If the popular page responds with an error status.
    """
    response = session.get(POPULAR_URL, timeout=30)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Link text -> absolute URL; only the texts are consulted below.
    link_by_text = {}
    for link in page.select("a[href]"):
        label = link.get_text(" ", strip=True)
        link_by_text[label] = to_absolute_url(link.get("href", ""), POPULAR_URL)

    docs_by_category: dict[str, list[dict]] = defaultdict(list)
    for target in TARGET_DOCUMENTS:
        docs_by_category[target["category_key"]].append(target)

    def summarize(category_key, members):
        # The first document of a category supplies the category title.
        title = members[0]["consultant_category"]
        return {
            "key": category_key,
            "title": title,
            "found_on_popular_page": title in link_by_text,
            "documents": [
                {
                    field: member[field]
                    for field in ("key", "source_url", "law_type", "source_short_name")
                }
                for member in members
            ],
        }

    categories = [summarize(key, members) for key, members in docs_by_category.items()]
    categories.sort(key=lambda item: item["title"])

    manifest = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "source_page": POPULAR_URL,
        "categories": categories,
        "documents": TARGET_DOCUMENTS,
    }
    write_json(MANIFEST_PATH, manifest)
    return manifest
def build_session() -> requests.Session:
    """Create a ``requests`` session that sends the parser's ``User-Agent`` header."""
    session = requests.Session()
    session.headers["User-Agent"] = USER_AGENT
    return session
+171
View File
@@ -0,0 +1,171 @@
from __future__ import annotations
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
import requests
from bs4 import BeautifulSoup, Tag
from parser.config import (
MANIFEST_PATH,
MAX_RETRIES,
MAX_WORKERS,
RAW_INDEX_PATH,
RAW_ROOT,
REQUEST_TIMEOUT,
)
from parser.discovery import build_session, discover_documents
from parser.utils import ensure_dir, read_json, sha256_text, to_absolute_url, write_json
def fetch_with_retry(session: requests.Session, url: str) -> str:
    """GET *url* and return the response text, retrying on any failure.

    Up to ``MAX_RETRIES`` attempts are made. After a failed attempt that is
    not the last one, the function sleeps for as many seconds as the attempt
    number. When every attempt fails, the last exception is re-raised.
    """
    failure: Exception | None = None
    attempt = 1
    while attempt <= MAX_RETRIES:
        try:
            reply = session.get(url, timeout=REQUEST_TIMEOUT)
            reply.raise_for_status()
            return reply.text
        except Exception as exc:  # pragma: no cover - network branch
            failure = exc
            if attempt != MAX_RETRIES:
                # Back-off grows with the attempt number: 1s, 2s, ...
                time.sleep(attempt)
        attempt += 1
    raise failure  # type: ignore[misc]
def extract_toc_articles(root_html: str, source_url: str) -> list[dict]:
    """Collect article entries from the table of contents of a root page.

    The walk starts at ``.document-page__toc > ul``. An ``<li>`` whose direct
    link text starts with "Статья " becomes an article record. Any other
    linked ``<li>`` is treated as a heading, and the walk continues into the
    next sibling ``<ul>`` of that ``<li>`` with the heading pushed on the
    breadcrumb trail.

    Returns:
        Article dicts in document order, or an empty list when the page has
        no table of contents.
    """
    toc = BeautifulSoup(root_html, "html.parser").select_one(".document-page__toc > ul")
    if toc is None:
        return []

    collected: list[dict] = []

    def nearest(trail: list[str], prefix: str) -> str | None:
        # Innermost enclosing heading whose text starts with *prefix*.
        for heading in reversed(trail):
            if heading.startswith(prefix):
                return heading
        return None

    def visit(listing: Tag, trail: list[str]) -> None:
        entries = [
            child for child in listing.children if isinstance(child, Tag) and child.name == "li"
        ]
        for entry in entries:
            link = entry.find("a", href=True, recursive=False)
            if link is None:
                continue
            label = link.get_text(" ", strip=True)
            target = to_absolute_url(link["href"], source_url)
            nested = entry.find_next_sibling("ul")

            if not label.startswith("Статья "):
                if nested is not None:
                    visit(nested, [*trail, label])
                continue

            # "Статья 12. Title" -> number part before the first ". ", title after it.
            number_part, _, title_part = label.partition(". ")
            collected.append(
                {
                    "article_number": number_part.replace("Статья", "").strip(),
                    "article_title": title_part.strip() or label,
                    "article_url": target,
                    "section_title": nearest(trail, "Раздел"),
                    "chapter_title": nearest(trail, "Глава"),
                    "part_title": nearest(trail, "Часть"),
                    "breadcrumb": [*trail, label],
                }
            )

    visit(toc, [])
    return collected
def _fetch_article(session: requests.Session, article: dict) -> tuple[dict, str]:
    """Download one article page and return it paired with its TOC record."""
    article_html = fetch_with_retry(session, article["article_url"])
    return article, article_html
def load_manifest() -> dict:
    """Return the stored manifest, running discovery when none is stored."""
    stored = read_json(MANIFEST_PATH)
    return stored if stored is not None else discover_documents(build_session())
def load_raw_index() -> dict:
    """Read the raw index, falling back to an index with no documents."""
    empty_index = {"documents": {}}
    return read_json(RAW_INDEX_PATH, default=empty_index)
def save_raw_index(index_payload: dict) -> None:
    """Write *index_payload* to ``RAW_INDEX_PATH`` as JSON."""
    destination = RAW_INDEX_PATH
    write_json(destination, index_payload)
def fetch_documents(
    selected_documents: list[dict], force: bool = False, dry_run: bool = False
) -> list[dict]:
    """Download the root page and every article page of the selected documents.

    For each document the root page is fetched and hashed. Unless *force* is
    set, a document whose stored ``root_hash`` matches and whose ``raw_dir``
    still exists is reused from the raw index without refetching its articles.
    Otherwise every article listed in the root page's table of contents is
    downloaded with up to ``MAX_WORKERS`` threads. Unless *dry_run* is set,
    the HTML files, a ``sidecar.json`` and the updated raw index are written.

    Args:
        selected_documents: Manifest entries; each needs ``key`` and
            ``source_url``.
        force: Refetch even when the stored root hash is unchanged.
        dry_run: Perform the downloads but write nothing to disk.

    Returns:
        One payload dict per input document, in input order.

    Raises:
        Exception: Whatever ``fetch_with_retry`` raises once its retries are
            exhausted.
    """
    session = build_session()
    raw_index = load_raw_index()
    raw_index.setdefault("documents", {})
    fetched_payloads: list[dict] = []

    for document in selected_documents:
        root_html = fetch_with_retry(session, document["source_url"])
        root_hash = sha256_text(root_html)
        previous = raw_index["documents"].get(document["key"])
        if (
            not force
            and previous
            and previous.get("root_hash") == root_hash
            and Path(previous["raw_dir"]).exists()
        ):
            fetched_payloads.append(previous)
            continue

        toc_articles = extract_toc_articles(root_html, document["source_url"])
        timestamp = datetime.now(timezone.utc)
        raw_dir = RAW_ROOT / timestamp.date().isoformat() / document["key"]
        article_dir = raw_dir / "articles"

        if not dry_run:
            ensure_dir(article_dir)
            (raw_dir / "root.html").write_text(root_html, encoding="utf-8")

        # Downloads finish in arbitrary order; slot each result by its TOC index.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(_fetch_article, session, article): index
                for index, article in enumerate(toc_articles)
            }
            article_results = [None] * len(futures)
            for future in as_completed(futures):
                article_results[futures[future]] = future.result()

        # Hash and store in TOC order so that identical content always yields
        # the same version_hash, regardless of which download finished first.
        article_payloads: list[dict] = []
        article_hash_parts: list[str] = []
        for index, (article, article_html) in enumerate(article_results):
            article_hash = sha256_text(article_html)
            article_hash_parts.append(article_hash)
            article_payload = {
                **article,
                "file_name": f"{index:04d}.html",
                "sha256": article_hash,
            }
            article_payloads.append(article_payload)
            if not dry_run:
                (article_dir / article_payload["file_name"]).write_text(
                    article_html, encoding="utf-8"
                )

        payload = {
            **document,
            "fetched_at": timestamp.isoformat(),
            "raw_dir": str(raw_dir),
            "root_file": "root.html",
            "root_hash": root_hash,
            "version_hash": sha256_text(root_hash + "".join(article_hash_parts)),
            "article_count": len(article_payloads),
            "articles": article_payloads,
        }

        if not dry_run:
            write_json(raw_dir / "sidecar.json", payload)
            raw_index["documents"][document["key"]] = payload
            # Saved after every document so earlier results survive a later failure.
            save_raw_index(raw_index)

        fetched_payloads.append(payload)

    return fetched_payloads
+68
View File
@@ -0,0 +1,68 @@
from __future__ import annotations
from datetime import date, datetime, timezone
from parser.normalizer import build_chunks
from shared import ORM
def parse_iso_date(value: str | None) -> date | None:
    """Convert an ISO-format date string to a ``date``; ``None`` or empty input gives ``None``."""
    return date.fromisoformat(value) if value else None
async def ingest_documents(
    orm: ORM | None, normalized_documents: list[dict], dry_run: bool = False
) -> list[dict]:
    """Upsert each normalized document as a law source and store its chunks.

    Chunks are written only when the source was newly created or currently
    has no chunks; otherwise the document is reported as ``"skipped"``. In
    dry-run mode nothing touches the ORM and only the chunk count is reported.

    Args:
        orm: ORM instance; may be ``None`` only when *dry_run* is true.
        normalized_documents: Documents as produced by the normalize step.
        dry_run: Report what would be ingested without writing anything.

    Returns:
        One result dict per document with ``document_key``, ``status`` and
        ``chunk_count`` (plus ``source_id`` outside dry-run mode).

    Raises:
        RuntimeError: If *orm* is ``None`` while *dry_run* is false.
    """
    outcomes = []
    for doc in normalized_documents:
        source_fields = {
            "title": doc["title"],
            "source_type": doc["source_type"],
            "jurisdiction": "RU",
            "law_type": doc["law_type"],
            "document_number": doc["document_number"],
            "adoption_date": parse_iso_date(doc["adoption_date"]),
            "publication_date": parse_iso_date(doc["publication_date"]),
            "effective_date": parse_iso_date(doc["effective_date"]),
            "source_url": doc["source_url"],
            "official_publication_number": None,
            "version_hash": doc["version_hash"],
            "is_active": True,
            "loaded_at": datetime.now(timezone.utc),
        }
        chunks = build_chunks(doc)

        if dry_run:
            outcomes.append(
                {
                    "document_key": doc["key"],
                    "status": "dry-run",
                    "chunk_count": len(chunks),
                }
            )
            continue

        if orm is None:
            raise RuntimeError("ORM instance is required when dry_run is disabled")

        source, created = await orm.upsert_law_source(source_fields)
        if created:
            needs_chunks = True
        else:
            # An existing source gets chunks only if it has none yet.
            needs_chunks = await orm.get_chunks_count_by_source(source.id) == 0

        if needs_chunks:
            await orm.replace_law_chunks(source.id, chunks)
            status = "updated"
        else:
            status = "skipped"

        outcomes.append(
            {
                "document_key": doc["key"],
                "status": status,
                "chunk_count": len(chunks),
                "source_id": source.id,
            }
        )
    return outcomes
+166
View File
@@ -0,0 +1,166 @@
from __future__ import annotations
import re
from pathlib import Path
from bs4 import BeautifulSoup
from parser.config import NORMALIZED_ROOT
from parser.utils import (
chunk_paragraphs,
ensure_dir,
normalize_text,
parse_russian_date,
read_json,
write_json,
)
# Captures the run of digits, letters and hyphens that follows an "N" marker.
DOCUMENT_NUMBER_RE = re.compile(r"N\s*([0-9А-Яа-я\-ФКЗA-Z]+)")


def parse_root_metadata(root_html: str) -> dict:
    """Extract title, number and dates from a document's root page.

    The document number and adoption date are searched in the content text,
    or in the title text when the content is empty. The effective date is
    read from a "вступ. в силу с DD.MM.YYYY" note in the title.
    ``publication_date`` is always ``None``.
    """
    soup = BeautifulSoup(root_html, "html.parser")

    def text_of(selector: str) -> str:
        # Text of the first node matching *selector*, one line per text fragment.
        node = soup.select_one(selector)
        return node.get_text("\n", strip=True) if node else ""

    title_text = text_of(".document-page__title h1")
    first_text = text_of(".document-page__content")
    header_text = first_text or title_text

    number_match = DOCUMENT_NUMBER_RE.search(header_text)
    document_number = number_match.group(1) if number_match else None
    adoption_date = parse_russian_date(header_text)

    effective_match = re.search(r"вступ\.\s*в силу с\s*(\d{2}\.\d{2}\.\d{4})", title_text)
    if effective_match:
        # DD.MM.YYYY -> YYYY-MM-DD
        effective_date = "-".join(reversed(effective_match.group(1).split(".")))
    else:
        effective_date = None

    lines = [stripped for stripped in map(str.strip, title_text.splitlines()) if stripped]
    return {
        "title": lines[0] if lines else title_text.strip(),
        "version_note": "\n".join(lines[1:]).strip() or None,
        "document_number": document_number,
        "adoption_date": adoption_date,
        "effective_date": effective_date,
        "publication_date": None,
    }
def parse_article_page(article_html: str) -> list[str]:
    """Return the non-empty, normalized paragraph texts of an article page.

    Inserts, edit notes, page notes, title blocks and the first ``<h1>`` are
    removed from ``.document-page__content`` before its ``<p>`` elements are
    read. A page without that container yields an empty list.
    """
    content = BeautifulSoup(article_html, "html.parser").select_one(".document-page__content")
    if content is None:
        return []

    noise_selectors = (
        ".document__insert",
        ".document__edit",
        ".document-page__notes",
        ".document-page__title-link",
        ".document-page__title",
    )
    for selector in noise_selectors:
        for unwanted in content.select(selector):
            unwanted.decompose()

    heading = content.select_one("h1")
    if heading is not None:
        heading.decompose()

    texts = (normalize_text(node.get_text("\n", strip=True)) for node in content.select("p"))
    return [text for text in texts if text]
def normalize_document(raw_payload: dict) -> dict:
    """Turn a fetched raw payload into a normalized document dict.

    Reads the root page and every article file from the payload's ``raw_dir``,
    takes document-level metadata from the root page and the paragraph text
    from each article page.
    """
    raw_dir = Path(raw_payload["raw_dir"])
    metadata = parse_root_metadata(
        (raw_dir / raw_payload["root_file"]).read_text(encoding="utf-8")
    )

    def load_article(ref: dict) -> dict:
        # Parse one stored article page and merge it with its TOC record.
        page_html = (raw_dir / "articles" / ref["file_name"]).read_text(encoding="utf-8")
        paragraphs = parse_article_page(page_html)
        return {
            "article_number": ref["article_number"],
            "article_title": ref["article_title"],
            "article_url": ref["article_url"],
            "section_title": ref.get("section_title"),
            "chapter_title": ref.get("chapter_title"),
            "part_title": ref.get("part_title"),
            "breadcrumb": ref.get("breadcrumb", []),
            "text": "\n\n".join(paragraphs).strip(),
            "paragraphs": paragraphs,
        }

    articles = [load_article(ref) for ref in raw_payload["articles"]]

    return {
        "key": raw_payload["key"],
        "title": metadata["title"],
        "source_url": raw_payload["source_url"],
        "source_type": "consultant_document",
        "document_number": metadata["document_number"],
        "adoption_date": metadata["adoption_date"],
        "publication_date": metadata["publication_date"],
        "effective_date": metadata["effective_date"],
        "version_note": metadata["version_note"],
        "version_hash": raw_payload["version_hash"],
        "law_type": raw_payload["law_type"],
        "consultant_category": raw_payload["consultant_category"],
        "source_short_name": raw_payload["source_short_name"],
        "articles": articles,
    }
def build_chunks(normalized_document: dict) -> list[dict]:
    """Split every article of a normalized document into chunk records.

    Each article's paragraphs (or its whole text when it has no paragraphs)
    go through ``chunk_paragraphs``. ``chunk_index`` counts chunks across the
    whole document, starting at 0.
    """
    chunks: list[dict] = []
    next_index = 0

    for article in normalized_document["articles"]:
        paragraphs = article["paragraphs"]
        if not paragraphs:
            paragraphs = [article["text"]] if article["text"] else []

        pieces = chunk_paragraphs(paragraphs)
        if not pieces and article["text"]:
            # Chunking produced nothing usable; keep the article text as one chunk.
            pieces = [article["text"]]

        for piece in pieces:
            metadata = {
                "source_title": normalized_document["title"],
                "source_short_name": normalized_document["source_short_name"],
                "consultant_category": normalized_document["consultant_category"],
                "chapter_title": article.get("chapter_title"),
                "section_title": article.get("section_title"),
                "article_number": article["article_number"],
                "article_title": article["article_title"],
                "document_url": article["article_url"],
                "breadcrumb": article.get("breadcrumb", []),
                "version_hash": normalized_document["version_hash"],
            }
            record = {
                "chunk_index": next_index,
                "article_number": article["article_number"],
                "article_title": article["article_title"],
                "chunk_text": piece,
                "metadata": metadata,
            }
            chunks.append(record)
            next_index += 1

    return chunks
def write_normalized_document(normalized_document: dict, dry_run: bool = False) -> Path:
    """Save a normalized document as ``<key>.json`` under ``NORMALIZED_ROOT``.

    Returns the target path. With *dry_run* set, nothing is written.
    """
    target = NORMALIZED_ROOT / f"{normalized_document['key']}.json"
    if dry_run:
        return target
    ensure_dir(target.parent)
    write_json(target, normalized_document)
    return target
def load_normalized_document(document_key: str) -> dict | None:
    """Read ``<document_key>.json`` from ``NORMALIZED_ROOT``; ``None`` when the file is absent."""
    source = NORMALIZED_ROOT / f"{document_key}.json"
    return read_json(source, default=None)
+108
View File
@@ -0,0 +1,108 @@
from __future__ import annotations
import hashlib
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import urljoin
# One or more consecutive spaces or tabs.
WHITESPACE_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines.
NEWLINES_RE = re.compile(r"\n{3,}")
# Russian month names in the genitive case (the form written in dates),
# mapped to month numbers.
RUSSIAN_MONTHS = {
    "января": 1,
    "февраля": 2,
    "марта": 3,
    "апреля": 4,
    "мая": 5,
    "июня": 6,
    "июля": 7,
    "августа": 8,
    "сентября": 9,
    "октября": 10,
    "ноября": 11,
    "декабря": 12,
}
def ensure_dir(path: Path) -> Path:
    """Create *path* and any missing parents (no error if it exists) and return it."""
    Path.mkdir(path, parents=True, exist_ok=True)
    return path
def read_json(path: Path, default: Any = None) -> Any:
    """Load UTF-8 JSON from *path*, or return *default* when the path does not exist."""
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
def write_json(path: Path, payload: Any) -> None:
    """Serialize *payload* as indented, non-ASCII-preserving JSON and write it to *path*.

    The parent directory is created first when it is missing.
    """
    ensure_dir(path.parent)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=False)
    path.write_text(serialized, encoding="utf-8")
def sha256_text(value: str) -> str:
    """Return the hexadecimal SHA-256 digest of *value* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(value.encode("utf-8"))
    return hasher.hexdigest()
def normalize_text(value: str) -> str:
    """Tidy whitespace in extracted text.

    Non-breaking spaces become ordinary spaces, runs of spaces/tabs collapse
    to one space, spaces around newlines are dropped, three or more newlines
    shrink to two, and the result is stripped.
    """
    text = WHITESPACE_RE.sub(" ", value.replace("\xa0", " "))
    text = re.sub(r" *\n *", "\n", text)
    return NEWLINES_RE.sub("\n\n", text).strip()
def slugify(value: str) -> str:
    """Lower-case *value* and join its ASCII alphanumeric runs with hyphens.

    Returns ``"document"`` when nothing alphanumeric remains.
    """
    lowered = value.lower()
    hyphenated = re.sub(r"[^a-zA-Z0-9]+", "-", lowered)
    trimmed = hyphenated.strip("-")
    if trimmed:
        return trimmed
    return "document"
def to_absolute_url(url: str, base_url: str) -> str:
    """Resolve *url* against *base_url* with ``urllib.parse.urljoin``."""
    resolved = urljoin(base_url, url)
    return resolved
def parse_russian_date(value: str) -> str | None:
    """Find the first valid ``<day> <month name> <year>`` date in *value*.

    The text is lower-cased and scanned for "<1-2 digits> <word> <4 digits>".
    A candidate is skipped when its word is not a key of ``RUSSIAN_MONTHS`` or
    when the numbers do not form a real calendar date, and scanning continues
    with the next candidate.

    Args:
        value: Free text that may contain a date such as "30 ноября 1994".

    Returns:
        The date as an ISO ``YYYY-MM-DD`` string, or ``None`` when no valid
        date is found.
    """
    for match in re.finditer(r"(\d{1,2})\s+([а-я]+)\s+(\d{4})", value.lower()):
        month = RUSSIAN_MONTHS.get(match.group(2))
        if month is None:
            # Right shape, but the word is not a month name (e.g. "2 статьи 1234").
            continue
        try:
            parsed = datetime(int(match.group(3)), month, int(match.group(1)))
        except ValueError:
            # Impossible calendar date, such as day 0 or 31 February.
            continue
        return parsed.date().isoformat()
    return None
def chunk_paragraphs(
    paragraphs: list[str], max_chars: int = 4500, overlap_paragraphs: int = 1
) -> list[str]:
    """Group paragraphs into chunks of roughly at most *max_chars* characters.

    Paragraphs are joined with a blank line. When the next paragraph would
    push the current chunk over *max_chars*, the chunk is closed and the next
    one starts with the last *overlap_paragraphs* paragraphs of the closed
    chunk. The overlap is carried only if it still fits together with the
    incoming paragraph, so the overlap alone never causes an oversized chunk.
    A single paragraph longer than *max_chars* is emitted as its own chunk.

    Args:
        paragraphs: Paragraph texts in reading order.
        max_chars: Soft upper bound on chunk length.
        overlap_paragraphs: Trailing paragraphs to repeat at the start of the
            next chunk; values below 1 disable the overlap.

    Returns:
        The non-empty chunk texts, in order.
    """
    if not paragraphs:
        return []

    separator_len = 2  # length of the "\n\n" joiner, counted once per paragraph
    overlap = max(overlap_paragraphs, 0)

    chunks: list[str] = []
    current: list[str] = []
    current_len = 0

    for paragraph in paragraphs:
        paragraph_len = len(paragraph)
        if current and current_len + paragraph_len + separator_len > max_chars:
            chunks.append("\n\n".join(current).strip())
            carried = current[-overlap:] if overlap else []
            carried_len = sum(len(item) + separator_len for item in carried)
            if carried_len + paragraph_len + separator_len > max_chars:
                # The overlap would overflow the new chunk straight away; start clean.
                carried, carried_len = [], 0
            current, current_len = carried, carried_len
        current.append(paragraph)
        current_len += paragraph_len + separator_len

    if current:
        chunks.append("\n\n".join(current).strip())
    return [chunk for chunk in chunks if chunk]