135 lines
4.8 KiB
Python
Executable File
135 lines
4.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import tempfile
|
|
|
|
import html2text
|
|
import lmstudio as lms
|
|
import requests
|
|
import scrapy
|
|
from IPython import embed
|
|
from openai import OpenAI
|
|
from scrapy.crawler import CrawlerProcess
|
|
from scrapy.http import Request
|
|
|
|
# LM Studio is reached through an OpenAI-compatible HTTP API on localhost.
# api_key is usually required by the client, but for LM Studio it might not be
# strictly necessary, so a placeholder value is passed.
LMSTUDIO_BASE_URL = "http://localhost:1234/v1"
LMSTUDIO_API_KEY = "YOUR_ANY_API_KEY"  # Replace with your actual key if needed

client = OpenAI(base_url=LMSTUDIO_BASE_URL, api_key=LMSTUDIO_API_KEY)
|
|
|
|
|
|
class ThreeFoldDocsSpider(scrapy.Spider):
    """Scrape the ThreeFold tech docs page and save it as a Markdown file.

    The ``<main>`` element of each start URL is converted to Markdown by
    ``convert_html_to_markdown_with_lmstudio`` and written to ``output_file``.
    """

    name = "threefold_docs"
    start_urls = ["https://threefold.info/tech/docs/"]
    # Destination of the converted Markdown, relative to the working directory.
    output_file = "threefold_docs.md"

    @staticmethod
    def _strip_code_fence(text):
        """Return *text* without leading whitespace or a wrapping code fence.

        Language models often wrap their answer in a fenced block such as
        ```` ```markdown ... ``` ````. The whole opening fence line is dropped
        (whatever its info string) and, only when an opening fence was found,
        a trailing closing fence is dropped as well.
        """
        stripped = text.lstrip()
        if not stripped.startswith("```"):
            return stripped

        _fence_line, newline, body = stripped.partition("\n")
        if not newline:
            # A lone fence line wraps nothing; leave the text untouched.
            return stripped

        trimmed = body.rstrip()
        if trimmed.endswith("```"):
            body = trimmed[:-3].rstrip() + "\n"
        return body

    def parse(self, response):
        """Convert the page's ``<main>`` element to Markdown and save it.

        Args:
            response: The Scrapy response for one of ``start_urls``.
        """
        # .get() returns None when the selector matches nothing; do not send
        # the literal text "None" to the model in that case.
        content = response.css("main").get()
        if content is None:
            self.logger.warning("No <main> element found at %s", response.url)
            return

        markdown_content = convert_html_to_markdown_with_lmstudio(content)
        if not markdown_content:
            self.logger.warning(
                "Could not convert HTML to Markdown for %s", response.url
            )
            return

        markdown_content = self._strip_code_fence(markdown_content)
        with open(self.output_file, "w", encoding="utf-8") as f:
            f.write(markdown_content)

        # Spider.log() defaults to DEBUG, which a LOG_LEVEL of INFO hides;
        # log through the spider's logger at an explicit level instead.
        self.logger.info("Saved content to %s", self.output_file)
|
|
|
|
|
|
def convert_html_to_markdown_with_lmstudio(html_content, model_name="jinaai/ReaderLM-v2"):
    """Convert HTML to Markdown using LM Studio's OpenAI-compatible chat API.

    Args:
        html_content: The HTML markup to convert.
        model_name: Model identifier to request from LM Studio. The default
            is assumed to be the correct ID for ReaderLM-v2 -- confirm it
            against the model list shown in LM Studio.

    Returns:
        The model's reply text, or ``None`` when the input is empty or the
        request fails (the error is printed, not raised).
    """
    # Nothing to convert: avoid sending an empty prompt (or the text "None")
    # to the model.
    if not html_content:
        return None

    try:
        # Use the OpenAI-compatible API provided by LMStudio
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that converts HTML to Markdown.",
                },
                {
                    "role": "user",
                    "content": f"Please convert the following HTML to Markdown:\n\n{html_content}",
                },
            ],
            stream=False,  # Set to True if streaming is desired
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller decide.
        print(f"Error converting HTML to Markdown with LMStudio: {e}")
        return None
|
|
|
|
|
|
def scrape_threefold_docs():
    """Crawl the ThreeFold docs with Scrapy and return the output file name.

    Returns:
        The name of the Markdown file the spider is expected to write. The
        name is returned whether or not the crawl produced the file.
    """
    crawler_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "LOG_LEVEL": "INFO",
    }
    output_name = "threefold_docs.md"

    runner = CrawlerProcess(crawler_settings)
    runner.crawl(ThreeFoldDocsSpider)
    runner.start()

    return output_name
|
|
|
|
|
|
# Note: This script talks to LM Studio through the OpenAI-compatible API of its
# local server (see `client` above). Model downloading and loading are not
# automated here; they are typically handled manually in the LM Studio
# application or through its local server API.
# Ensure the "jinaai/jina-embeddings-v2-base-en" model (or the desired Jina embeddings v3 model, if available)
# is downloaded and loaded in your LM Studio application before running this script.
|
|
|
|
|
|
def get_embedding_with_lmstudio(text, model_name="jinaai/jina-embeddings-v2-base-en"):
    """Return the embedding of *text* from LM Studio, or ``None`` on failure.

    The model named by ``model_name`` must already be downloaded and loaded
    in the LM Studio application before this is called.

    Args:
        text: The text to embed; sent as a single-item input batch.
        model_name: Embedding model identifier to request.

    Returns:
        The embedding of the first (only) result, or ``None`` if the request
        raised; in that case the error and a hint are printed.
    """
    try:
        # Embeddings endpoint of the OpenAI-compatible API.
        result = client.embeddings.create(model=model_name, input=[text])
        first_item = result.data[0]
        return first_item.embedding
    except Exception as exc:
        print(
            f"Error getting embedding with LMStudio: {exc}",
            "Please ensure LM Studio is running and the specified model is loaded.",
            sep="\n",
        )
        return None
|
|
|
|
|
|
def main():
    """Scrape the ThreeFold docs and print a preview of their embedding.

    Runs the crawl, reads the resulting Markdown file, requests an embedding
    of its full text from LM Studio, and prints the first ten dimensions.
    Prints a message instead when the file is missing or embedding fails.
    """
    model_to_use = "jinaai/jina-embeddings-v2-base-en"

    markdown_file = scrape_threefold_docs()

    # The crawl may finish without writing anything (e.g. conversion failed).
    if not os.path.exists(markdown_file):
        print(f"No Markdown output found at {markdown_file}; nothing to embed.")
        return

    with open(markdown_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Example usage of the embedding function
    embedding = get_embedding_with_lmstudio(content, model_to_use)
    if embedding:
        print(
            f"Successfully generated embedding (first 10 dimensions): {embedding[:10]}..."
        )
    else:
        print("Failed to generate embedding.")

    # Model unloading should be done manually in LM Studio.
|
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly, not when
# this module is imported.
if __name__ == "__main__":
    main()
|