herolib_python/examples/scrapper/yew_docs.sh
2025-08-08 09:38:37 +02:00

87 lines
2.3 KiB
Bash
Executable File

#!/bin/bash
# Scrape the Yew documentation (yew.rs/docs) into Markdown files using a
# generated Scrapy project. Requires: uv (Python package manager); network.
set -ex

# Resolve this script's directory and load the project environment.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
source ../../env.sh
# env.sh may change the working directory; return to the script dir.
cd "$SCRIPT_DIR"

# Start from a clean slate: drop any previous project and scraped output.
rm -rf yew_docs output

# 1. Install dependencies (the original ran this install twice; once suffices).
uv pip install --upgrade scrapy markdownify

# 2. Create the Scrapy project and work inside it.
scrapy startproject yew_docs
cd yew_docs

# 3. Ignore robots.txt so the docs pages are crawlable.
#    (No export directory is configured here — the spider writes files itself.)
echo -e "\nROBOTSTXT_OBEY = False" >> yew_docs/settings.py
# 4. Write the spider: crawls pages under /docs/ on yew.rs and saves each page
#    as Markdown under ./output/ (relative to the current working directory,
#    i.e. the yew_docs project dir when run from this script).
mkdir -p yew_docs/spiders  # idempotent; scrapy startproject normally creates it
cat << 'EOF' > yew_docs/spiders/yew_docs_spider.py
import os
import scrapy
from urllib.parse import urlparse, urljoin
import markdownify


class YewDocsSpider(scrapy.Spider):
    """Crawl the Yew docs and save each page as a Markdown file."""

    name = "yew_docs"
    allowed_domains = ["yew.rs"]
    start_urls = ["https://yew.rs/docs/getting-started/introduction"]

    def parse(self, response):
        # Page title becomes the top-level Markdown heading.
        title = response.css("title::text").get() or "Page"

        # Only the <main> element holds docs content; skip pages without it.
        main = response.css("main").get()
        if not main:
            self.logger.warning(f"No main content at {response.url}")
            return

        # Convert the HTML fragment to Markdown (ATX '#'-style headings).
        md = markdownify.markdownify(main, heading_style="ATX")

        # Map the URL path to a clean file path under output/.
        parsed = urlparse(response.url)
        path = parsed.path.lstrip("/")
        if path.endswith("/") or path == "":
            path += "index"
        filepath = os.path.join("output", f"{path}.md")
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {title.strip()}\n\n{md}")

        # Follow only clean internal links under /docs/.
        for href in response.css("a::attr(href)").getall():
            link = urljoin(response.url, href)
            parsed = urlparse(link)
            path = parsed.path
            if parsed.netloc == "yew.rs" and path.startswith("/docs/"):
                # Skip versioned/localized/overly-deep pages and links that
                # only point at a fragment of a page.
                if (
                    "/docs/0." in path or
                    "/docs/next" in path or
                    "/docs/en" in path or
                    # BUG FIX: was '"#" in parsed.fragment' — urlparse never
                    # includes '#' in .fragment, so the check never fired.
                    parsed.fragment or
                    path.count("/") > 5
                ):
                    continue
                yield scrapy.Request(link.split("#")[0], callback=self.parse)
EOF
# 5. Run the spider. The cwd is the yew_docs project dir at this point, so the
#    Markdown files land in yew_docs/output/ relative to the script.
#    NOTE(review): the top-level 'rm -rf yew_docs output' suggests output/ was
#    once expected next to the script — confirm which location is intended.
scrapy crawl yew_docs