herolib_python/examples/downloader/threefold_scraper.py
2025-08-05 15:15:36 +02:00

135 lines
4.8 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import json
import os
import tempfile
import html2text
import lmstudio as lms
import requests
import scrapy
from IPython import embed
from openai import OpenAI
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
# Shared OpenAI-compatible client aimed at the local server at
# http://localhost:1234/v1 (the LM Studio endpoint this script relies on).
# An api_key is usually required by the client, but for LM Studio it might not
# be strictly necessary, so a placeholder value is passed here.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="YOUR_ANY_API_KEY")
class ThreeFoldDocsSpider(scrapy.Spider):
    """Spider that saves the ThreeFold tech docs page as a Markdown file.

    For every response, the ``<main>`` element is extracted, converted to
    Markdown with ``convert_html_to_markdown_with_lmstudio`` and written to
    ``output_file`` (relative to the current working directory).
    """

    name = "threefold_docs"
    start_urls = ["https://threefold.info/tech/docs/"]
    # Destination of the converted Markdown; overwritten on each parsed response.
    output_file = "threefold_docs.md"

    @staticmethod
    def _strip_code_fence(markdown):
        """Return ``markdown`` without a code fence wrapped around it.

        Leading whitespace is always removed. If the text then starts with a
        fence (three backticks, with or without an info string such as
        ``markdown``), the whole opening fence line is dropped, and a
        trailing closing fence is dropped as well.
        """
        text = markdown.lstrip()
        if not text.startswith("```"):
            return text
        # Drop the entire opening fence line instead of a fixed character
        # count, so any info string ("markdown", "md", ...) and any line
        # ending are handled the same way.
        _, _, body = text.partition("\n")
        trimmed = body.rstrip()
        if trimmed.endswith("```"):
            inner = trimmed[:-3].rstrip()
            body = f"{inner}\n" if inner else ""
        return body

    def parse(self, response):
        """Convert the page's ``<main>`` element to Markdown and save it.

        Args:
            response: The Scrapy response for one of ``start_urls``.

        Nothing is written when the page has no ``<main>`` element or when
        the conversion yields no content; a warning is logged instead.
        """
        # Extract the main content; ``get()`` yields None when nothing matches.
        content = response.css("main").get()
        if content is None:
            self.logger.warning("No <main> element found at %s", response.url)
            return

        # Convert HTML to markdown using LMStudio
        markdown_content = convert_html_to_markdown_with_lmstudio(content)
        if markdown_content:
            markdown_content = self._strip_code_fence(markdown_content)

        if not markdown_content:
            self.logger.warning(
                "Could not convert HTML to Markdown for %s", response.url
            )
            return

        with open(self.output_file, "w", encoding="utf-8") as f:
            f.write(markdown_content)
        # ``Spider.log`` defaults to DEBUG, which an INFO log level hides, so
        # the spider's logger is used with an explicit level instead.
        self.logger.info("Saved content to %s", self.output_file)
def convert_html_to_markdown_with_lmstudio(html_content):
    """Convert HTML to Markdown using LMStudio with jinaai/ReaderLM-v2.

    The request goes through the module-level OpenAI-compatible ``client``.

    Args:
        html_content: HTML markup to convert. ``None`` or an empty string is
            treated as "nothing to convert".

    Returns:
        The Markdown text from the first completion choice, or ``None`` when
        there was no input or the request failed for any reason.
    """
    # Without this guard a missing element (None) would be interpolated into
    # the prompt as the literal text "None" and sent to the model.
    if not html_content:
        return None

    try:
        # Use the OpenAI-compatible API provided by LMStudio
        response = client.chat.completions.create(
            model="jinaai/ReaderLM-v2",  # Assuming this is the correct model ID
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that converts HTML to Markdown.",
                },
                {
                    "role": "user",
                    "content": f"Please convert the following HTML to Markdown:\n\n{html_content}",
                },
            ],
            stream=False,  # Set to True if streaming is desired
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as "conversion
        # failed", so the error is reported here rather than propagated.
        print(f"Error converting HTML to Markdown with LMStudio: {e}")
        return None
def scrape_threefold_docs():
    """Crawl the ThreeFold docs with ``ThreeFoldDocsSpider``.

    A ``CrawlerProcess`` is configured with a desktop-browser user agent and
    INFO-level logging, the spider is scheduled, and the process is started.

    Returns:
        The string ``"threefold_docs.md"``, the file name the spider writes to.
    """
    crawler_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "LOG_LEVEL": "INFO",
    }
    crawler = CrawlerProcess(crawler_settings)
    crawler.crawl(ThreeFoldDocsSpider)
    crawler.start()
    return "threefold_docs.md"
# Note: this script talks to LM Studio through the OpenAI-compatible client defined above.
# Model downloading and loading are not done by this script; they are typically handled
# manually in the LM Studio application or through its local server API.
# Ensure the "jinaai/jina-embeddings-v2-base-en" model (or the desired Jina embeddings v3 model if available)
# is downloaded and loaded in your LM Studio application before running this script.
def get_embedding_with_lmstudio(text, model_name="jinaai/jina-embeddings-v2-base-en"):
    """Get embedding for text using LM Studio with the specified model.

    The request goes through the module-level OpenAI-compatible ``client``.
    The model named by ``model_name`` must already be downloaded and loaded
    in the LM Studio application before this is called.

    Args:
        text: The text to embed. ``None`` or an empty string is rejected
            without contacting the server.
        model_name: Identifier of the embedding model to request.

    Returns:
        The embedding of ``text`` as returned by the endpoint, or ``None``
        when there was no input or the request failed.
    """
    # There is nothing meaningful to embed in empty input, so skip the request.
    if not text:
        print("Error getting embedding with LMStudio: no text to embed")
        return None

    try:
        # Use the OpenAI-compatible API for embeddings
        response = client.embeddings.create(model=model_name, input=[text])
        return response.data[0].embedding
    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as failure.
        print(f"Error getting embedding with LMStudio: {e}")
        print("Please ensure LM Studio is running and the specified model is loaded.")
        return None
def main():
    """Scrape the ThreeFold docs and embed the resulting Markdown file.

    Runs ``scrape_threefold_docs``, reads the file it reports, requests an
    embedding for the whole content with ``get_embedding_with_lmstudio`` and
    prints either the first ten dimensions or a failure message.
    """
    model_to_use = "jinaai/jina-embeddings-v2-base-en"
    markdown_file = scrape_threefold_docs()

    # The scrape only produces the file when conversion succeeded, so a
    # missing file is reported instead of being skipped silently.
    if not os.path.exists(markdown_file):
        print(f"No Markdown output found at {markdown_file}; nothing to embed.")
        return

    with open(markdown_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Example usage of the embedding function
    embedding = get_embedding_with_lmstudio(content, model_to_use)
    if embedding:
        print(
            f"Successfully generated embedding (first 10 dimensions): {embedding[:10]}..."
        )
    else:
        print("Failed to generate embedding.")
    # Model unloading should be done manually in LM Studio.
# Run the scrape-and-embed example only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()