...
@@ -1,86 +0,0 @@
#!/bin/bash

set -ex

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

source ../../env.sh

# env.sh may change the working directory, so return to the script directory
cd "$SCRIPT_DIR"

# 1. Install dependencies
uv pip install --upgrade scrapy markdownify

# Ensure a clean environment from previous runs
rm -rf yew_docs output

# 2. Create Scrapy project
scrapy startproject yew_docs
cd yew_docs
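
# After `cd yew_docs`, the standard layout generated by scrapy startproject is:
#   scrapy.cfg            - project config
#   yew_docs/settings.py  - project settings (edited below)
#   yew_docs/spiders/     - spider modules (spider created below)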

# 3. Update settings to ignore robots.txt
echo -e "\nROBOTSTXT_OBEY = False" >> yew_docs/settings.py
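
# Optional tweak (a suggestion, not part of the original script): throttle the
# crawl to be polite to yew.rs; both are standard Scrapy settings.
echo -e "DOWNLOAD_DELAY = 0.5\nAUTOTHROTTLE_ENABLED = True" >> yew_docs/settings.py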

# 4. Create the spider with link filters
cat << 'EOF' > yew_docs/spiders/yew_docs_spider.py
import os
import scrapy
from urllib.parse import urlparse, urljoin
import markdownify

class YewDocsSpider(scrapy.Spider):
    name = "yew_docs"
    allowed_domains = ["yew.rs"]
    start_urls = ["https://yew.rs/docs/getting-started/introduction"]

    def parse(self, response):
        # Extract title
        title = response.css("title::text").get() or "Page"

        # Extract main content
        main = response.css("main").get()
        if not main:
            self.logger.warning(f"No main content at {response.url}")
            return

        # Convert to Markdown
        md = markdownify.markdownify(main, heading_style="ATX")

        # Construct clean file path mirroring the URL structure
        parsed = urlparse(response.url)
        path = parsed.path.lstrip("/")
        if path.endswith("/") or path == "":
            path += "index"
        filepath = os.path.join("output", f"{path}.md")

        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {title.strip()}\n\n{md}")

        # Follow only clean internal links under /docs/
        for href in response.css("a::attr(href)").getall():
            link = urljoin(response.url, href)
            parsed = urlparse(link)
            path = parsed.path

            if parsed.netloc == "yew.rs" and path.startswith("/docs/"):
                if (
                    "/docs/0." in path or    # versioned doc snapshots
                    "/docs/next" in path or  # unreleased docs
                    "/docs/en" in path or    # locale-prefixed duplicates
                    parsed.fragment or       # in-page anchor links
                    path.count("/") > 5      # overly deep paths
                ):
                    continue
                yield scrapy.Request(link.split("#")[0], callback=self.parse)
EOF
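
# Note: the spider writes Markdown files directly to output/, so no Scrapy
# FEEDS export setting is needed for this crawl.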

# 5. Run the spider
scrapy crawl yew_docs
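
# Sanity check (an addition, not in the original): count the exported pages.
find output -type f -name '*.md' | wc -l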