commit 7fabb4163a
parent 4bd960ed05
Date: 2025-08-05 15:15:36 +02:00
192 changed files with 14901 additions and 0 deletions


@@ -0,0 +1,467 @@
import http.server
import json
import logging
import multiprocessing
import os
import queue # For queue.Empty exception
import socketserver
import sys
import tempfile
import time
import requests # For checking server readiness
# Adjust the Python path to include the parent directory (project root)
# so that 'lib.downloader' can be imported.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from lib.downloader import download_site
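# download_site is expected to return a summary dict containing at least
# "errors", "successful_downloads", "total_processed_urls", and
# "state_file_path" (these keys are checked against in main() below).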
# Configure logging for the example script
logger = logging.getLogger(__name__)
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# This function needs to be at the top level for multiprocessing to find it.
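# (With the 'spawn' start method, child processes re-import this module and
# look the target up by name, so nested functions or lambdas would not work.)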
def run_download_test_process(test_name, downloader_kwargs, result_queue):
"""
Wrapper to run download_site in a separate process and put summary in a queue.
"""
logger.info(f"--- Running Test in subprocess: {test_name} ---")
summary = None
try:
summary = download_site(**downloader_kwargs)
logger.info(f"Test {test_name} completed in subprocess.")
except Exception as e:
logger.error(f"Error in test {test_name} (subprocess): {e}", exc_info=True)
# summary will remain None or be an incomplete one if error is after its creation
finally:
        result_queue.put({"test_name": test_name, "summary": summary})
def create_temp_site_files(base_dir):
"""Creates the dummy HTML files in a 'test_site' subdirectory of base_dir."""
site_dir = os.path.join(base_dir, "test_site")
os.makedirs(os.path.join(site_dir, "sub"), exist_ok=True)
with open(os.path.join(site_dir, "index.html"), "w") as f:
f.write(
'<h1>Index</h1><a href="page1.html">Page 1</a> <a href="sub/page2.html">Page 2</a> <a href="ignored_page.html">Ignored</a> <a href="nonexistent.html">Non Existent</a>'
)
with open(os.path.join(site_dir, "page1.html"), "w") as f:
f.write('<h1>Page 1</h1><a href="index.html">Index</a>')
with open(os.path.join(site_dir, "sub", "page2.html"), "w") as f:
f.write(
'<h1>Page 2</h1><a href="../index.html">Index Back</a> <a href="http://neverssl.com">External</a>'
)
with open(os.path.join(site_dir, "ignored_page.html"), "w") as f:
f.write("<h1>Ignored Page</h1>")
logger.info(f"Created dummy site files in {site_dir}")
return site_dir
# Top-level target function for the HTTP server process
def _http_server_target_function(directory, host, port):
import functools
# Use functools.partial to set the 'directory' argument for SimpleHTTPRequestHandler
# This ensures the server serves files from the specified 'directory'.
Handler = functools.partial(
http.server.SimpleHTTPRequestHandler, directory=directory
)
try:
with socketserver.TCPServer((host, port), Handler) as httpd:
logger.info(
f"HTTP server process (PID {os.getpid()}) started on {host}:{port}, serving {directory}"
)
httpd.serve_forever()
except Exception as e:
logger.error(
f"HTTP server process (PID {os.getpid()}) failed: {e}", exc_info=True
)
raise
def start_http_server_process(directory, host, port):
"""Starts a simple HTTP server in a separate process."""
server_process = multiprocessing.Process(
target=_http_server_target_function,
        args=(directory, host, port),
daemon=True,
)
server_process.start()
logger.info(
f"HTTP server process (PID: {server_process.pid}) initiated for {directory} on {host}:{port}"
)
return server_process
def find_free_port():
"""Finds an available port on the local machine."""
with socketserver.TCPServer(
("localhost", 0), http.server.BaseHTTPRequestHandler
) as s:
return s.server_address[1]
def check_server_ready(url, retries=10, delay=0.5):
"""Checks if the server is responding to requests."""
for i in range(retries):
try:
response = requests.get(url, timeout=1)
if response.status_code == 200:
logger.info(f"Server is ready at {url}")
return True
except requests.ConnectionError:
logger.debug(
f"Server not ready yet at {url}, attempt {i + 1}/{retries}. Retrying in {delay}s..."
)
except requests.Timeout:
logger.debug(
f"Server timed out at {url}, attempt {i + 1}/{retries}. Retrying in {delay}s..."
)
time.sleep(delay)
logger.error(f"Server failed to start at {url} after {retries} retries.")
return False
def main():
# Using TemporaryDirectory for automatic cleanup
with tempfile.TemporaryDirectory(prefix="downloader_test_") as temp_base_dir:
logger.info(f"Created temporary base directory: {temp_base_dir}")
# 1. Create the dummy website files
site_root_path = create_temp_site_files(
temp_base_dir
) # This is /tmp/xxxx/test_site
# 2. Start the HTTP server
host = "localhost"
port = find_free_port()
server_process = start_http_server_process(site_root_path, host, port)
test_url_base = f"http://{host}:{port}/" # Server serves from site_root_path, so URLs are relative to that
# 3. Check if server is ready
# We check the index.html which is at the root of what's being served
if not check_server_ready(test_url_base + "index.html"):
logger.error("Test server failed to become ready. Aborting tests.")
if server_process.is_alive():
server_process.terminate()
server_process.join(timeout=5)
return
# 4. Define test parameters
# Destination for downloaded content will also be inside the temp_base_dir
download_destination_root = os.path.join(temp_base_dir, "downloaded_content")
os.makedirs(download_destination_root, exist_ok=True)
tests_params_config = [
(
"1: Basic recursive download (depth 2)",
{
"start_url": test_url_base + "index.html",
"dest_dir": os.path.join(download_destination_root, "test1"),
"recursive": True,
"follow_links": True,
"depth_limit": 2,
"max_age_hours": 0,
},
),
(
"2: With ignore_paths and max_age (reuse test1 dir)",
{
"start_url": test_url_base + "index.html",
"dest_dir": os.path.join(
download_destination_root, "test1"
), # Use same dest
"recursive": True,
"follow_links": True,
"depth_limit": 2,
"ignore_paths": ["ignored_page.html"],
"max_age_hours": 1, # Should skip files from test1 if downloaded recently
},
),
(
"3: Non-recursive (single page)",
{
"start_url": test_url_base + "page1.html",
"dest_dir": os.path.join(download_destination_root, "test3"),
"recursive": False, # Effectively depth_limit 0 for the spider
"max_age_hours": 0,
},
),
(
"4: Depth limit 0 (only start_url)",
{
"start_url": test_url_base + "index.html",
"dest_dir": os.path.join(download_destination_root, "test4_depth0"),
"recursive": True, # 'recursive' flag enables depth control
"follow_links": True,
"depth_limit": 0, # Spider should only download index.html
"max_age_hours": 0,
},
),
(
"5: Depth limit 1",
{
"start_url": test_url_base + "index.html",
"dest_dir": os.path.join(download_destination_root, "test5_depth1"),
"recursive": True,
"follow_links": True,
"depth_limit": 1, # index.html and its direct links
"max_age_hours": 0,
},
),
]
        # 5. Run each test in its own subprocess, sequentially (start, then join).
        # Tests must run in order because test 2 re-uses test 1's destination
        # directory and state file: its max_age skip check only makes sense if
        # test 1 has already completed.
        results_queue = multiprocessing.Queue()
        for test_name, downloader_kwargs in tests_params_config:
            # Ensure dest_dir exists for each test before starting
            os.makedirs(downloader_kwargs["dest_dir"], exist_ok=True)
            p = multiprocessing.Process(
                target=run_download_test_process,
                args=(test_name, downloader_kwargs, results_queue),
            )
            p.start()
            p.join()
# Collect and print results
logger.info("\n--- All Test Processes Completed. Results: ---")
all_tests_passed = True
results_collected = 0
failed_tests_details = [] # Store details of failed tests
# ANSI escape codes for colors
RED = "\033[91m"
GREEN = "\033[92m"
RESET = "\033[0m"
while results_collected < len(tests_params_config):
current_test_passed = True
failure_reason = ""
try:
result = results_queue.get(timeout=10) # Timeout to avoid hanging
results_collected += 1
test_name = result["test_name"]
summary = result["summary"]
print(f"\nResult for Test: {test_name}")
if summary:
print(f" Summary: {json.dumps(summary, indent=2)}")
# Basic check: if errors array in summary is empty, consider it a pass for now
if summary.get("errors") and len(summary.get("errors")) > 0:
failure_reason = (
f"Reported errors in summary: {summary.get('errors')}"
)
logger.error(f" Test '{test_name}' {failure_reason}")
current_test_passed = False
                    elif summary.get("successful_downloads", 0) == 0:
                        # Zero successes can be legitimate: test 2 may skip
                        # everything via max_age, and test 4 only processes the
                        # start URL, so this alone does not fail a test. The
                        # specific state-file checks below are authoritative.
                        pass
# Specific checks for state and re-download
if test_name.startswith("1:"): # After Test 1
state_file = summary.get("state_file_path")
if state_file and os.path.exists(state_file):
with open(state_file, "r") as f:
state = json.load(f)
expected_success_files = [
test_url_base + "index.html",
test_url_base + "page1.html",
test_url_base + "sub/page2.html",
]
actual_success_count = 0
for url, data in state.items():
if (
url in expected_success_files
and data.get("status") == "success"
):
actual_success_count += 1
if actual_success_count >= 3:
logger.info(
f" Test 1: State file check PASSED for key successful files."
)
else:
failure_reason = f"State file check FAILED. Expected ~3 successes, got {actual_success_count}. State: {state}"
logger.error(f" Test 1: {failure_reason}")
current_test_passed = False
else:
failure_reason = (
"State file not found or summary incomplete."
)
logger.error(f" Test 1: {failure_reason}")
current_test_passed = False
elif test_name.startswith(
"2:"
): # After Test 2 (re-run on test1 dir)
state_file = summary.get("state_file_path")
if state_file and os.path.exists(state_file):
with open(state_file, "r") as f:
state = json.load(f)
skipped_count = 0
main_files_to_check_skip = [
test_url_base + "index.html",
test_url_base + "page1.html",
test_url_base + "sub/page2.html",
]
for url_to_check in main_files_to_check_skip:
if (
url_to_check in state
and state[url_to_check].get("status")
== "skipped_max_age"
):
skipped_count += 1
if skipped_count >= 3:
logger.info(
f" Test 2: Re-download check (skipped_max_age) PASSED for key files."
)
else:
failure_reason = f"Re-download check FAILED. Expected ~3 skips, got {skipped_count}. State: {state}"
logger.error(f" Test 2: {failure_reason}")
current_test_passed = False
if (
test_url_base + "ignored_page.html" in state
and state[test_url_base + "ignored_page.html"].get(
"status"
)
== "success"
):
ignore_fail_reason = "ignored_page.html was downloaded, but should have been ignored."
logger.error(f" Test 2: {ignore_fail_reason}")
if not failure_reason:
failure_reason = ignore_fail_reason
else:
failure_reason += f"; {ignore_fail_reason}"
current_test_passed = False
else:
failure_reason = (
"State file not found or summary incomplete."
)
logger.error(f" Test 2: {failure_reason}")
current_test_passed = False
elif test_name.startswith("4:"): # Depth 0
state_file = summary.get("state_file_path")
if state_file and os.path.exists(state_file):
with open(state_file, "r") as f:
state = json.load(f)
if (
len(state) == 1
and (test_url_base + "index.html") in state
and (
state[test_url_base + "index.html"].get("status")
== "success"
# Allow "failed" for depth 0 if the single URL itself failed,
# as the test is about *not* crawling further.
or state[test_url_base + "index.html"].get("status")
== "failed"
)
):
logger.info(
f" Test 4: Depth 0 check PASSED (1 item in state)."
)
else:
failure_reason = f"Depth 0 check FAILED. Expected 1 item processed, got {len(state)}. State: {state}"
logger.error(f" Test 4: {failure_reason}")
current_test_passed = False
else:
failure_reason = (
"State file not found or summary incomplete."
)
logger.error(f" Test 4: {failure_reason}")
current_test_passed = False
else:
failure_reason = (
"Did not return a summary (likely failed hard in subprocess)."
)
logger.error(f" Test '{test_name}' {failure_reason}")
current_test_passed = False
            except queue.Empty:
                test_name = f"Unknown Test (result {results_collected + 1} of {len(tests_params_config)})"
                failure_reason = "Queue was empty after waiting; a subprocess may have died without returning a result."
                logger.error(failure_reason)
                current_test_passed = False
                # Count this attempt so the collection loop still terminates
                # even when a result never arrives.
                results_collected += 1
if not current_test_passed:
all_tests_passed = False
failed_tests_details.append(
{"name": test_name, "reason": failure_reason}
)
# 6. Terminate the server
logger.info("Terminating HTTP server process...")
if server_process.is_alive():
server_process.terminate()
server_process.join(timeout=5) # Wait for it to terminate
if server_process.is_alive():
logger.warning(
"Server process did not terminate gracefully, attempting to kill."
)
server_process.kill() # Force kill if terminate didn't work
server_process.join(timeout=5)
if server_process.is_alive():
logger.error("SERVER PROCESS COULD NOT BE STOPPED.")
else:
logger.info("HTTP server process stopped.")
if failed_tests_details:
logger.error(f"\n--- {RED}Summary of Failed Tests{RESET} ---")
for failed_test in failed_tests_details:
logger.error(f"{RED} Test: {failed_test['name']}{RESET}")
logger.error(f"{RED} Reason: {failed_test['reason']}{RESET}")
logger.error(f"\n{RED}Some downloader tests FAILED.{RESET}")
sys.exit(1) # Exit with error code if tests failed
else:
logger.info(
f"\n{GREEN}All downloader tests PASSED (based on implemented checks).{RESET}"
)
        # Note: temp_base_dir and its contents are removed automatically when
        # this TemporaryDirectory block exits.
        logger.info(
            f"Temporary base directory {temp_base_dir} will be removed on exit."
        )
if __name__ == "__main__":
    # Guarding the entry point is required for multiprocessing under the
    # 'spawn' start method (the default on Windows, and on macOS since
    # Python 3.8), because child processes re-import this module.
    multiprocessing.freeze_support()  # Needed when frozen into an executable (e.g. PyInstaller)
main()


@@ -0,0 +1,134 @@
#!/usr/bin/env python3
import os

import scrapy
from IPython import embed
from openai import OpenAI
from scrapy.crawler import CrawlerProcess
# LM Studio's local server does not validate API keys, but the OpenAI client
# requires a non-empty value, so any placeholder string works.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="YOUR_ANY_API_KEY")
class ThreeFoldDocsSpider(scrapy.Spider):
name = "threefold_docs"
start_urls = ["https://threefold.info/tech/docs/"]
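    # Note: parse() writes to a single fixed filename; with one start URL and
    # no link following, exactly one page is saved.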
    def parse(self, response):
        # Extract the main content block; this is None if the page has no <main>
        content = response.css("main").get()
        if content is None:
            self.log(f"No <main> element found at {response.url}")
            return
        # Convert the HTML to Markdown using the model served by LM Studio
        markdown_content = convert_html_to_markdown_with_lmstudio(content)
        # Save the content
        if markdown_content:
            # Strip a leading Markdown code-block fence if the model added one
            markdown_content = markdown_content.lstrip()
            if markdown_content.startswith("```markdown"):
                markdown_content = markdown_content[len("```markdown") :].lstrip("\n")
            elif markdown_content.startswith("```"):
                markdown_content = markdown_content[len("```") :].lstrip("\n")
            with open("threefold_docs.md", "w", encoding="utf-8") as f:
                f.write(markdown_content)
            self.log("Saved content to threefold_docs.md")
        else:
            self.log(f"Could not convert HTML to Markdown for {response.url}")
def convert_html_to_markdown_with_lmstudio(html_content):
"""Convert HTML to Markdown using LMStudio with jinaai.readerlm-v2"""
try:
# Use the OpenAI-compatible API provided by LMStudio
response = client.chat.completions.create(
model="jinaai/ReaderLM-v2", # Assuming this is the correct model ID
messages=[
{
"role": "system",
"content": "You are a helpful assistant that converts HTML to Markdown.",
},
{
"role": "user",
"content": f"Please convert the following HTML to Markdown:\n\n{html_content}",
},
],
stream=False, # Set to True if streaming is desired
)
return response.choices[0].message.content
except Exception as e:
print(f"Error converting HTML to Markdown with LMStudio: {e}")
return None
def scrape_threefold_docs():
"""Run the Scrapy spider to scrape ThreeFold docs"""
process = CrawlerProcess(
{
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"LOG_LEVEL": "INFO",
}
)
process.crawl(ThreeFoldDocsSpider)
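    # start() blocks until the crawl finishes; note that a CrawlerProcess can
    # only be started once per Python process, since Scrapy's Twisted reactor
    # is not restartable.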
process.start()
return "threefold_docs.md"
# Note: LM Studio's local server exposes an OpenAI-compatible API. Model
# downloading and loading are handled in the LM Studio application itself, not
# through this client, so ensure the embedding model used below (default
# "jinaai/jina-embeddings-v2-base-en") is already downloaded and loaded in
# LM Studio before running this script.
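# A minimal readiness check, assuming LM Studio serves the standard
# OpenAI-compatible GET /v1/models listing. The IDs it returns are whatever
# LM Studio assigned, which may differ from the Hugging Face repo names above.
def list_loaded_models():
    """Return the model IDs currently served by LM Studio, or [] on error."""
    try:
        return [m.id for m in client.models.list().data]
    except Exception as e:
        print(f"Could not list models from LM Studio: {e}")
        return []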
def get_embedding_with_lmstudio(text, model_name="jinaai/jina-embeddings-v2-base-en"):
"""Get embedding for text using LM Studio with the specified model."""
# Ensure the "jinaai/jina-embeddings-v3" model is downloaded and loaded in your LM Studio application before running this script.
try:
# Use the OpenAI-compatible API for embeddings
response = client.embeddings.create(model=model_name, input=[text])
return response.data[0].embedding
except Exception as e:
print(f"Error getting embedding with LMStudio: {e}")
print("Please ensure LM Studio is running and the specified model is loaded.")
return None
def main():
model_to_use = "jinaai/jina-embeddings-v2-base-en"
markdown_file = scrape_threefold_docs()
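    # Drop into an interactive IPython shell to inspect the scraped Markdown
    # before embedding; exit the shell to continue.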
embed()
if os.path.exists(markdown_file):
with open(markdown_file, "r", encoding="utf-8") as f:
content = f.read()
# Example usage of the embedding function
embedding = get_embedding_with_lmstudio(content, model_to_use)
if embedding:
print(
f"Successfully generated embedding (first 10 dimensions): {embedding[:10]}..."
)
else:
print("Failed to generate embedding.")
# Model unloading should be done manually in LM Studio.
if __name__ == "__main__":
main()