#!/bin/bash

set -ex

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

source ../../env.sh

# env.sh may change the working directory, so return to the script directory
cd "$SCRIPT_DIR"

# Ensure a clean environment
rm -rf yew_docs output

# 1. Install required packages
uv pip install --upgrade scrapy markdownify

# 2. Create a Scrapy project
scrapy startproject yew_docs
cd yew_docs

# 3. Update settings to ignore robots.txt
echo -e "\nROBOTSTXT_OBEY = False" >> yew_docs/settings.py
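
# (Optional) Other settings can be appended the same way; the value below is an
# illustrative assumption, not part of the original pipeline -- e.g. a polite
# download delay to avoid hammering yew.rs:
# echo -e "\nDOWNLOAD_DELAY = 0.5" >> yew_docs/settings.py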

# 4. Create the spider with link filters
cat << 'EOF' > yew_docs/spiders/yew_docs_spider.py
import os
from urllib.parse import urlparse, urljoin

import markdownify
import scrapy


class YewDocsSpider(scrapy.Spider):
    name = "yew_docs"
    allowed_domains = ["yew.rs"]
    start_urls = ["https://yew.rs/docs/getting-started/introduction"]

    def parse(self, response):
        # Extract the page title
        title = response.css("title::text").get() or "Page"

        # Extract the main content block
        main = response.css("main").get()
        if not main:
            self.logger.warning(f"No main content at {response.url}")
            return

        # Convert the HTML to Markdown ("ATX" emits #-style headings)
        md = markdownify.markdownify(main, heading_style="ATX")

        # Construct a clean file path from the URL
        parsed = urlparse(response.url)
        path = parsed.path.lstrip("/")
        if path.endswith("/") or path == "":
            path += "index"
        filepath = os.path.join("output", f"{path}.md")

        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {title.strip()}\n\n{md}")

        # Follow only clean internal links under /docs/
        for href in response.css("a::attr(href)").getall():
            link = urljoin(response.url, href)
            parsed = urlparse(link)
            path = parsed.path

            if parsed.netloc == "yew.rs" and path.startswith("/docs/"):
                # Skip versioned/localized copies, fragment links, and
                # overly deep paths. Note: urlparse strips the "#", so the
                # fragment must be checked for truthiness, not for "#".
                if (
                    "/docs/0." in path or
                    "/docs/next" in path or
                    "/docs/en" in path or
                    parsed.fragment or
                    path.count("/") > 5
                ):
                    continue
                yield scrapy.Request(link.split("#")[0], callback=self.parse)
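
# (Optional) A minimal sketch of running the spider without the scrapy CLI,
# via Scrapy's CrawlerProcess API -- not part of the original script:
# if __name__ == "__main__":
#     from scrapy.crawler import CrawlerProcess
#     process = CrawlerProcess()
#     process.crawl(YewDocsSpider)
#     process.start()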
EOF
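
# Note: the spider writes paths relative to the crawl's working directory,
# so the Markdown tree ends up in yew_docs/output/.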

# 5. Run the spider
scrapy crawl yew_docs
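
# 6. (Optional) Sanity-check the export -- illustrative commands, not part of
# the original pipeline:
# find output -name '*.md' | wc -l        # count exported pages
# find output -name '*.md' | head -n 3    # show a few file paths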