...
@@ -1,86 +0,0 @@
#!/bin/bash

set -ex

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

source ../../env.sh

# env.sh may change the working directory, so return to the script directory
cd "$SCRIPT_DIR"

# 1. Install dependencies
uv pip install --upgrade scrapy markdownify

# Ensure a clean environment from previous runs
rm -rf yew_docs output

# 2. Create Scrapy project
scrapy startproject yew_docs
cd yew_docs
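
# After `cd yew_docs`, the standard layout generated by scrapy startproject is:
#   scrapy.cfg            - project config
#   yew_docs/settings.py  - project settings (edited below)
#   yew_docs/spiders/     - spider modules (spider created below)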

# 3. Update settings to ignore robots.txt
echo -e "\nROBOTSTXT_OBEY = False" >> yew_docs/settings.py
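
# Optional tweak (a suggestion, not part of the original script): throttle the
# crawl to be polite to yew.rs; both are standard Scrapy settings.
echo -e "DOWNLOAD_DELAY = 0.5\nAUTOTHROTTLE_ENABLED = True" >> yew_docs/settings.py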

# 4. Create the spider with link filters
cat << 'EOF' > yew_docs/spiders/yew_docs_spider.py
import os
import scrapy
from urllib.parse import urlparse, urljoin
import markdownify

class YewDocsSpider(scrapy.Spider):
    name = "yew_docs"
    allowed_domains = ["yew.rs"]
    start_urls = ["https://yew.rs/docs/getting-started/introduction"]

    def parse(self, response):
        # Extract title
        title = response.css("title::text").get() or "Page"

        # Extract main content
        main = response.css("main").get()
        if not main:
            self.logger.warning(f"No main content at {response.url}")
            return

        # Convert to Markdown
        md = markdownify.markdownify(main, heading_style="ATX")

        # Construct clean file path mirroring the URL structure
        parsed = urlparse(response.url)
        path = parsed.path.lstrip("/")
        if path.endswith("/") or path == "":
            path += "index"
        filepath = os.path.join("output", f"{path}.md")

        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {title.strip()}\n\n{md}")

        # Follow only clean internal links under /docs/
        for href in response.css("a::attr(href)").getall():
            link = urljoin(response.url, href)
            parsed = urlparse(link)
            path = parsed.path

            if parsed.netloc == "yew.rs" and path.startswith("/docs/"):
                if (
                    "/docs/0." in path or    # versioned doc snapshots
                    "/docs/next" in path or  # unreleased docs
                    "/docs/en" in path or    # locale-prefixed duplicates
                    parsed.fragment or       # in-page anchor links
                    path.count("/") > 5      # overly deep paths
                ):
                    continue
                yield scrapy.Request(link.split("#")[0], callback=self.parse)
EOF
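
# Note: the spider writes Markdown files directly to output/, so no Scrapy
# FEEDS export setting is needed for this crawl.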

# 5. Run the spider
scrapy crawl yew_docs
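
# Sanity check (an addition, not in the original): count the exported pages.
find output -type f -name '*.md' | wc -l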