#!/usr/bin/env python3
# herolib_python/examples/downloader/threefold_scraper.py

import argparse
import json
import os
import tempfile

import html2text
import lmstudio as lms
import requests
import scrapy
from IPython import embed
from openai import OpenAI
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request

# Shared OpenAI-compatible client pointed at a local server on port 1234
# (LM Studio, per the helpers below). The client library expects an api_key
# argument, so a placeholder is supplied; LM Studio may not strictly need a
# real key -- replace it if your server does.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="YOUR_ANY_API_KEY")


class ThreeFoldDocsSpider(scrapy.Spider):
    """Spider that saves the ThreeFold tech docs start page as Markdown.

    The ``<main>`` element of the response is converted to Markdown by
    ``convert_html_to_markdown_with_lmstudio`` and written to
    ``threefold_docs.md`` in the current working directory.
    """

    name = "threefold_docs"
    start_urls = ["https://threefold.info/tech/docs/"]

    def parse(self, response):
        """Convert the page's ``<main>`` element to Markdown and save it.

        Args:
            response: The Scrapy response for one of ``start_urls``.
        """
        # ``.get()`` yields None when the selector matches nothing; bail out
        # instead of asking the model to convert the text "None".
        content = response.css("main").get()
        if content is None:
            self.log(f"No <main> element found in {response.url}")
            return

        markdown_content = convert_html_to_markdown_with_lmstudio(content)
        if not markdown_content:
            self.log(f"Could not convert HTML to Markdown for {response.url}")
            return

        markdown_content = self._strip_code_fence(markdown_content)

        with open("threefold_docs.md", "w", encoding="utf-8") as f:
            f.write(markdown_content)

        self.log("Saved content to threefold_docs.md")

    @staticmethod
    def _strip_code_fence(text):
        """Remove a Markdown code fence that wraps the whole model reply.

        Leading whitespace is always stripped. If the reply then starts with
        a fence, the entire opening fence line is dropped (whatever info
        string it carries, e.g. ``markdown`` or ``md``) together with the
        matching closing fence at the end, if present.

        Args:
            text: Raw Markdown text returned by the model.

        Returns:
            The text without the wrapping fence.
        """
        text = text.lstrip()
        if not text.startswith("```"):
            return text

        # Drop the whole first line rather than a fixed number of characters,
        # so unexpected info strings or line endings leave no residue.
        _, _, body = text.partition("\n")

        # Only look for a closing fence when an opening one was removed, so a
        # document that merely ends with a code block keeps its fence.
        trimmed = body.rstrip()
        if trimmed.endswith("```"):
            body = trimmed[: -len("```")].rstrip() + "\n"
        return body


def convert_html_to_markdown_with_lmstudio(html_content):
    """Convert HTML to Markdown using LMStudio with jinaai.readerlm-v2.

    Args:
        html_content: HTML markup to convert.

    Returns:
        The Markdown text produced by the model, or None when there is
        nothing to convert or the request fails.
    """
    # Nothing to convert: avoid sending an empty (or literal "None") prompt
    # to the model and getting made-up Markdown back.
    if not html_content:
        return None

    try:
        # Use the OpenAI-compatible API provided by LMStudio
        response = client.chat.completions.create(
            model="jinaai/ReaderLM-v2",  # Assuming this is the correct model ID
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that converts HTML to Markdown.",
                },
                {
                    "role": "user",
                    "content": f"Please convert the following HTML to Markdown:\n\n{html_content}",
                },
            ],
            stream=False,  # Set to True if streaming is desired
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort conversion: report the failure and let the caller
        # decide what to do with a missing result.
        print(f"Error converting HTML to Markdown with LMStudio: {e}")
        return None


def scrape_threefold_docs():
    """Crawl the ThreeFold docs with ``ThreeFoldDocsSpider``.

    Returns:
        The fixed output file name, ``"threefold_docs.md"``. The name is
        returned whether or not the spider actually wrote the file.
    """
    crawler_settings = dict(
        USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        LOG_LEVEL="INFO",
    )

    runner = CrawlerProcess(crawler_settings)
    runner.crawl(ThreeFoldDocsSpider)
    runner.start()

    output_name = "threefold_docs.md"
    return output_name


# Note: This script talks to LM Studio through its OpenAI-compatible local server.
# That client does not download or load models; this is typically done manually in
# the LM Studio application or through its local server API.
# Ensure the "jinaai/jina-embeddings-v2-base-en" model (or whichever Jina embeddings
# model you pass as `model_name`, e.g. a v3 model if available) is downloaded and
# loaded in your LM Studio application before running this script.


def get_embedding_with_lmstudio(text, model_name="jinaai/jina-embeddings-v2-base-en"):
    """Get embedding for text using LM Studio with the specified model.

    The model named by ``model_name`` must already be downloaded and loaded
    in LM Studio.

    Args:
        text: The text to embed.
        model_name: Identifier of the embeddings model to request.

    Returns:
        The embedding of ``text`` as returned by the server, or None when
        ``text`` is empty or the request fails.
    """
    # An empty input cannot be embedded; return early instead of making a
    # request that fails and prints a misleading "is LM Studio running" hint.
    if not text:
        return None

    try:
        # Use the OpenAI-compatible API for embeddings
        response = client.embeddings.create(model=model_name, input=[text])
        return response.data[0].embedding
    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print(f"Error getting embedding with LMStudio: {e}")
        print("Please ensure LM Studio is running and the specified model is loaded.")
        return None


def main():
    """Scrape the ThreeFold docs and print a preview of their embedding.

    Runs the spider, reads the Markdown file it produced, requests an
    embedding for the whole document and prints its first 10 dimensions.
    Prints a message and returns early when no Markdown file exists.
    """
    model_to_use = "jinaai/jina-embeddings-v2-base-en"

    markdown_file = scrape_threefold_docs()

    # Guard clause: without scraped output there is nothing to embed. Say so
    # explicitly rather than exiting silently.
    if not os.path.exists(markdown_file):
        print(f"No scraped output found at {markdown_file}; nothing to embed.")
        return

    with open(markdown_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Example usage of the embedding function
    embedding = get_embedding_with_lmstudio(content, model_to_use)
    if embedding:
        print(
            f"Successfully generated embedding (first 10 dimensions): {embedding[:10]}..."
        )
    else:
        print("Failed to generate embedding.")

    # Model unloading should be done manually in LM Studio.

# Run the scrape-and-embed example only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()