#!/bin/bash
set -ex

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
source ../../env.sh
cd "$SCRIPT_DIR"

# Ensure a clean environment
rm -rf yew_docs output

# 1. Install dependencies
uv pip install --upgrade scrapy markdownify

# 2. Create Scrapy project
scrapy startproject yew_docs
cd yew_docs

# 3. Update settings to ignore robots.txt
echo -e "\nROBOTSTXT_OBEY = False" >> yew_docs/settings.py

# 4. Create spider with link filters
cat << 'EOF' > yew_docs/spiders/yew_docs_spider.py
import os
import scrapy
from urllib.parse import urlparse, urljoin
import markdownify


class YewDocsSpider(scrapy.Spider):
    name = "yew_docs"
    allowed_domains = ["yew.rs"]
    start_urls = ["https://yew.rs/docs/getting-started/introduction"]

    def parse(self, response):
        # Extract title
        title = response.css("title::text").get() or "Page"

        # Extract main content
        main = response.css("main").get()
        if not main:
            self.logger.warning(f"No main content at {response.url}")
            return

        # Convert to Markdown
        md = markdownify.markdownify(main, heading_style="ATX")

        # Construct a clean file path mirroring the URL path under ./output/
        parsed = urlparse(response.url)
        path = parsed.path.lstrip("/")
        if path.endswith("/") or path == "":
            path += "index"
        filepath = os.path.join("output", f"{path}.md")
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {title.strip()}\n\n{md}")

        # Follow only clean internal links under /docs/, skipping versioned,
        # localized, and overly deep paths. Fragments are stripped before
        # yielding, so anchor links collapse to their page URL and Scrapy's
        # duplicate filter drops repeats.
        for href in response.css("a::attr(href)").getall():
            link = urljoin(response.url, href)
            parsed = urlparse(link)
            path = parsed.path
            if parsed.netloc == "yew.rs" and path.startswith("/docs/"):
                if (
                    "/docs/0." in path
                    or "/docs/next" in path
                    or "/docs/en" in path
                    or path.count("/") > 5
                ):
                    continue
                yield scrapy.Request(link.split("#")[0], callback=self.parse)
EOF

# 5. Run the spider
scrapy crawl yew_docs
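
# 6. Optional sanity check (a minimal sketch, not part of the crawl itself):
# it assumes the spider wrote its Markdown files under the project's ./output/
# directory, as constructed in parse() above, and reports how many pages were
# exported so an empty or partial crawl is easy to spot.
md_count="$(find output -type f -name '*.md' 2>/dev/null | wc -l)"
echo "Exported ${md_count} Markdown pages to $(pwd)/output"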