...
examples/downloader/downloader_example.py (new file, 467 lines)
@@ -0,0 +1,467 @@
import http.server
import json
import logging
import multiprocessing
import os
import queue  # For queue.Empty exception
import shutil  # For removing temp dir if TemporaryDirectory context manager not used for whole scope
import socketserver
import sys
import tempfile
import time

import requests  # For checking server readiness

# Adjust the Python path to include the parent directory (project root)
# so that 'lib.downloader' can be imported.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from lib.downloader import STATE_FILE_NAME, download_site

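# For orientation, a minimal direct call might look like the sketch below. The
# keyword names are taken from the test configurations later in this file; the
# authoritative signature of download_site lives in lib.downloader, and the URL
# and directory shown here are illustrative only.
#
#   summary = download_site(
#       start_url="http://localhost:8000/index.html",
#       dest_dir="/tmp/mirror",
#       recursive=True,
#       follow_links=True,
#       depth_limit=2,
#       max_age_hours=0,
#       ignore_paths=["ignored_page.html"],
#   )
#
# The returned summary is a dict; this script inspects keys such as "errors",
# "successful_downloads", "total_processed_urls", and "state_file_path".
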
# Configure logging for the example script
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


# This function needs to be at the top level for multiprocessing to find it.
def run_download_test_process(test_name, downloader_kwargs, queue):
    """
    Wrapper to run download_site in a separate process and put its summary in a queue.
    """
    logger.info(f"--- Running Test in subprocess: {test_name} ---")
    summary = None
    try:
        summary = download_site(**downloader_kwargs)
        logger.info(f"Test {test_name} completed in subprocess.")
    except Exception as e:
        logger.error(f"Error in test {test_name} (subprocess): {e}", exc_info=True)
        # summary will remain None, or be incomplete, if the error occurred after its creation
    finally:
        queue.put({"test_name": test_name, "summary": summary})


def create_temp_site_files(base_dir):
    """Creates the dummy HTML files in a 'test_site' subdirectory of base_dir."""
    site_dir = os.path.join(base_dir, "test_site")
    os.makedirs(os.path.join(site_dir, "sub"), exist_ok=True)

    with open(os.path.join(site_dir, "index.html"), "w") as f:
        f.write(
            '<h1>Index</h1><a href="page1.html">Page 1</a> <a href="sub/page2.html">Page 2</a> <a href="ignored_page.html">Ignored</a> <a href="nonexistent.html">Non Existent</a>'
        )
    with open(os.path.join(site_dir, "page1.html"), "w") as f:
        f.write('<h1>Page 1</h1><a href="index.html">Index</a>')
    with open(os.path.join(site_dir, "sub", "page2.html"), "w") as f:
        f.write(
            '<h1>Page 2</h1><a href="../index.html">Index Back</a> <a href="http://neverssl.com">External</a>'
        )
    with open(os.path.join(site_dir, "ignored_page.html"), "w") as f:
        f.write("<h1>Ignored Page</h1>")
    logger.info(f"Created dummy site files in {site_dir}")
    return site_dir


# Top-level target function for the HTTP server process
def _http_server_target_function(directory, host, port):
    import functools

    # Use functools.partial to set the 'directory' argument for SimpleHTTPRequestHandler.
    # This ensures the server serves files from the specified 'directory'.
    Handler = functools.partial(
        http.server.SimpleHTTPRequestHandler, directory=directory
    )

    try:
        with socketserver.TCPServer((host, port), Handler) as httpd:
            logger.info(
                f"HTTP server process (PID {os.getpid()}) started on {host}:{port}, serving {directory}"
            )
            httpd.serve_forever()
    except Exception as e:
        logger.error(
            f"HTTP server process (PID {os.getpid()}) failed: {e}", exc_info=True
        )
        raise


def start_http_server_process(directory, host, port):
    """Starts a simple HTTP server in a separate process."""
    server_process = multiprocessing.Process(
        target=_http_server_target_function,
        args=(directory, host, port),
        daemon=True,
    )
    server_process.start()
    logger.info(
        f"HTTP server process (PID: {server_process.pid}) initiated for {directory} on {host}:{port}"
    )
    return server_process


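# Note: there is no readiness handshake between the parent and the server process;
# readiness is established by polling the served index page with check_server_ready()
# below. daemon=True also means the server process is killed automatically if the
# parent exits before the explicit terminate/kill cleanup runs.
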
def find_free_port():
    """Finds an available port on the local machine."""
    with socketserver.TCPServer(
        ("localhost", 0), http.server.BaseHTTPRequestHandler
    ) as s:
        return s.server_address[1]


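# find_free_port binds to port 0 so the OS assigns an ephemeral free port, reads it
# back from server_address, and releases the socket when the with-block exits. There
# is a small window in which another process could grab the port before the test
# server rebinds it, which is acceptable for this local example.
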
def check_server_ready(url, retries=10, delay=0.5):
    """Checks if the server is responding to requests."""
    for i in range(retries):
        try:
            response = requests.get(url, timeout=1)
            if response.status_code == 200:
                logger.info(f"Server is ready at {url}")
                return True
        except requests.ConnectionError:
            logger.debug(
                f"Server not ready yet at {url}, attempt {i + 1}/{retries}. Retrying in {delay}s..."
            )
        except requests.Timeout:
            logger.debug(
                f"Server timed out at {url}, attempt {i + 1}/{retries}. Retrying in {delay}s..."
            )
        time.sleep(delay)
    logger.error(f"Server failed to start at {url} after {retries} retries.")
    return False


def main():
    # Using TemporaryDirectory for automatic cleanup
    with tempfile.TemporaryDirectory(prefix="downloader_test_") as temp_base_dir:
        logger.info(f"Created temporary base directory: {temp_base_dir}")

        # 1. Create the dummy website files
        site_root_path = create_temp_site_files(temp_base_dir)  # This is /tmp/xxxx/test_site

        # 2. Start the HTTP server
        host = "localhost"
        port = find_free_port()
        server_process = start_http_server_process(site_root_path, host, port)

        # Server serves from site_root_path, so URLs are relative to that
        test_url_base = f"http://{host}:{port}/"

        # 3. Check if server is ready
        # We check the index.html which is at the root of what's being served
        if not check_server_ready(test_url_base + "index.html"):
            logger.error("Test server failed to become ready. Aborting tests.")
            if server_process.is_alive():
                server_process.terminate()
                server_process.join(timeout=5)
            return

        # 4. Define test parameters
        # Destination for downloaded content will also be inside the temp_base_dir
        download_destination_root = os.path.join(temp_base_dir, "downloaded_content")
        os.makedirs(download_destination_root, exist_ok=True)

        tests_params_config = [
            (
                "1: Basic recursive download (depth 2)",
                {
                    "start_url": test_url_base + "index.html",
                    "dest_dir": os.path.join(download_destination_root, "test1"),
                    "recursive": True,
                    "follow_links": True,
                    "depth_limit": 2,
                    "max_age_hours": 0,
                },
            ),
            (
                "2: With ignore_paths and max_age (reuse test1 dir)",
                {
                    "start_url": test_url_base + "index.html",
                    "dest_dir": os.path.join(download_destination_root, "test1"),  # Use same dest
                    "recursive": True,
                    "follow_links": True,
                    "depth_limit": 2,
                    "ignore_paths": ["ignored_page.html"],
                    "max_age_hours": 1,  # Should skip files from test1 if downloaded recently
                },
            ),
            (
                "3: Non-recursive (single page)",
                {
                    "start_url": test_url_base + "page1.html",
                    "dest_dir": os.path.join(download_destination_root, "test3"),
                    "recursive": False,  # Effectively depth_limit 0 for the spider
                    "max_age_hours": 0,
                },
            ),
            (
                "4: Depth limit 0 (only start_url)",
                {
                    "start_url": test_url_base + "index.html",
                    "dest_dir": os.path.join(download_destination_root, "test4_depth0"),
                    "recursive": True,  # 'recursive' flag enables depth control
                    "follow_links": True,
                    "depth_limit": 0,  # Spider should only download index.html
                    "max_age_hours": 0,
                },
            ),
            (
                "5: Depth limit 1",
                {
                    "start_url": test_url_base + "index.html",
                    "dest_dir": os.path.join(download_destination_root, "test5_depth1"),
                    "recursive": True,
                    "follow_links": True,
                    "depth_limit": 1,  # index.html and its direct links
                    "max_age_hours": 0,
                },
            ),
        ]

        # 5. Run tests using multiprocessing
        # A queue to get results back from subprocesses
        results_queue = multiprocessing.Queue()
        processes = []

        for test_name, downloader_kwargs in tests_params_config:
            # Ensure dest_dir exists for each test before starting
            os.makedirs(downloader_kwargs["dest_dir"], exist_ok=True)

            p = multiprocessing.Process(
                target=run_download_test_process,
                args=(test_name, downloader_kwargs, results_queue),
            )
            processes.append(p)
            p.start()

        # Wait for all processes to complete
        for p in processes:
            p.join()

        # Collect and print results
        logger.info("\n--- All Test Processes Completed. Results: ---")
        all_tests_passed = True
        results_collected = 0
        failed_tests_details = []  # Store details of failed tests

        # ANSI escape codes for colors
        RED = "\033[91m"
        GREEN = "\033[92m"
        RESET = "\033[0m"

        while results_collected < len(tests_params_config):
            current_test_passed = True
            failure_reason = ""
            try:
                result = results_queue.get(timeout=10)  # Timeout to avoid hanging
                results_collected += 1
                test_name = result["test_name"]
                summary = result["summary"]

                print(f"\nResult for Test: {test_name}")
                if summary:
                    print(f"  Summary: {json.dumps(summary, indent=2)}")
                    # Basic check: if the errors array in the summary is empty, consider it a pass for now
                    if summary.get("errors") and len(summary.get("errors")) > 0:
                        failure_reason = (
                            f"Reported errors in summary: {summary.get('errors')}"
                        )
                        logger.error(f"  Test '{test_name}' {failure_reason}")
                        current_test_passed = False
                    elif (
                        summary.get("successful_downloads", 0) == 0
                        and not (
                            # Test 4 might have 0 successful downloads if only start_url was processed
                            test_name.startswith("4:")
                            and summary.get("total_processed_urls", 0) > 0
                        )
                        # Test 2 might have 0 successful downloads if everything was skipped
                        and not test_name.startswith("2:")
                    ):
                        # This condition is a bit loose. The specific checks below are more
                        # important, so we do not mark the test as failed here on its own.
                        pass

                    # Specific checks for state and re-download
                    if test_name.startswith("1:"):  # After Test 1
                        state_file = summary.get("state_file_path")
                        if state_file and os.path.exists(state_file):
                            with open(state_file, "r") as f:
                                state = json.load(f)
                            expected_success_files = [
                                test_url_base + "index.html",
                                test_url_base + "page1.html",
                                test_url_base + "sub/page2.html",
                            ]
                            actual_success_count = 0
                            for url, data in state.items():
                                if (
                                    url in expected_success_files
                                    and data.get("status") == "success"
                                ):
                                    actual_success_count += 1
                            if actual_success_count >= 3:
                                logger.info(
                                    "  Test 1: State file check PASSED for key successful files."
                                )
                            else:
                                failure_reason = f"State file check FAILED. Expected ~3 successes, got {actual_success_count}. State: {state}"
                                logger.error(f"  Test 1: {failure_reason}")
                                current_test_passed = False
                        else:
                            failure_reason = "State file not found or summary incomplete."
                            logger.error(f"  Test 1: {failure_reason}")
                            current_test_passed = False

                    elif test_name.startswith("2:"):  # After Test 2 (re-run on test1 dir)
                        state_file = summary.get("state_file_path")
                        if state_file and os.path.exists(state_file):
                            with open(state_file, "r") as f:
                                state = json.load(f)
                            skipped_count = 0
                            main_files_to_check_skip = [
                                test_url_base + "index.html",
                                test_url_base + "page1.html",
                                test_url_base + "sub/page2.html",
                            ]
                            for url_to_check in main_files_to_check_skip:
                                if (
                                    url_to_check in state
                                    and state[url_to_check].get("status")
                                    == "skipped_max_age"
                                ):
                                    skipped_count += 1

                            if skipped_count >= 3:
                                logger.info(
                                    "  Test 2: Re-download check (skipped_max_age) PASSED for key files."
                                )
                            else:
                                failure_reason = f"Re-download check FAILED. Expected ~3 skips, got {skipped_count}. State: {state}"
                                logger.error(f"  Test 2: {failure_reason}")
                                current_test_passed = False

                            if (
                                test_url_base + "ignored_page.html" in state
                                and state[test_url_base + "ignored_page.html"].get("status")
                                == "success"
                            ):
                                ignore_fail_reason = "ignored_page.html was downloaded, but should have been ignored."
                                logger.error(f"  Test 2: {ignore_fail_reason}")
                                if not failure_reason:
                                    failure_reason = ignore_fail_reason
                                else:
                                    failure_reason += f"; {ignore_fail_reason}"
                                current_test_passed = False
                        else:
                            failure_reason = "State file not found or summary incomplete."
                            logger.error(f"  Test 2: {failure_reason}")
                            current_test_passed = False

                    elif test_name.startswith("4:"):  # Depth 0
                        state_file = summary.get("state_file_path")
                        if state_file and os.path.exists(state_file):
                            with open(state_file, "r") as f:
                                state = json.load(f)
                            if (
                                len(state) == 1
                                and (test_url_base + "index.html") in state
                                and (
                                    state[test_url_base + "index.html"].get("status")
                                    == "success"
                                    # Allow "failed" for depth 0 if the single URL itself failed,
                                    # as the test is about *not* crawling further.
                                    or state[test_url_base + "index.html"].get("status")
                                    == "failed"
                                )
                            ):
                                logger.info(
                                    "  Test 4: Depth 0 check PASSED (1 item in state)."
                                )
                            else:
                                failure_reason = f"Depth 0 check FAILED. Expected 1 item processed, got {len(state)}. State: {state}"
                                logger.error(f"  Test 4: {failure_reason}")
                                current_test_passed = False
                        else:
                            failure_reason = "State file not found or summary incomplete."
                            logger.error(f"  Test 4: {failure_reason}")
                            current_test_passed = False
                else:
                    failure_reason = (
                        "Did not return a summary (likely failed hard in subprocess)."
                    )
                    logger.error(f"  Test '{test_name}' {failure_reason}")
                    current_test_passed = False

            except queue.Empty:  # multiprocessing.Queue raises queue.Empty on timeout
                test_name = f"Unknown Test (result {results_collected + 1} of {len(tests_params_config)})"
                failure_reason = "Queue was empty after waiting; a subprocess might have died without putting a result."
                logger.error(failure_reason)
                current_test_passed = False
                # Do not break here; try to collect other results if any.
                # Count this attempt as processed so the loop cannot spin forever
                # if a subprocess never delivers a result.
                results_collected += 1

            if not current_test_passed:
                all_tests_passed = False
                failed_tests_details.append(
                    {"name": test_name, "reason": failure_reason}
                )

        # 6. Terminate the server
        logger.info("Terminating HTTP server process...")
        if server_process.is_alive():
            server_process.terminate()
            server_process.join(timeout=5)  # Wait for it to terminate
            if server_process.is_alive():
                logger.warning(
                    "Server process did not terminate gracefully, attempting to kill."
                )
                server_process.kill()  # Force kill if terminate didn't work
                server_process.join(timeout=5)

        if server_process.is_alive():
            logger.error("SERVER PROCESS COULD NOT BE STOPPED.")
        else:
            logger.info("HTTP server process stopped.")

        if failed_tests_details:
            logger.error(f"\n--- {RED}Summary of Failed Tests{RESET} ---")
            for failed_test in failed_tests_details:
                logger.error(f"{RED}  Test: {failed_test['name']}{RESET}")
                logger.error(f"{RED}  Reason: {failed_test['reason']}{RESET}")
            logger.error(f"\n{RED}Some downloader tests FAILED.{RESET}")
            sys.exit(1)  # Exit with a non-zero code if tests failed
        else:
            logger.info(
                f"\n{GREEN}All downloader tests PASSED (based on the implemented checks).{RESET}"
            )

    # The TemporaryDirectory temp_base_dir is automatically cleaned up when the
    # with-block above exits.
    logger.info(
        f"Temporary base directory {temp_base_dir} and its contents should now be removed."
    )


if __name__ == "__main__":
    # Protecting the entry point is required for multiprocessing when the start
    # method is 'spawn' (the default on Windows, and on macOS since Python 3.8),
    # because child processes re-import this module.
    multiprocessing.freeze_support()  # Needed for frozen executables (e.g. PyInstaller)
    main()
examples/downloader/threefold_scraper.py (new executable file, 134 lines)
@@ -0,0 +1,134 @@
#!/usr/bin/env python3

import argparse
import json
import os
import tempfile

import html2text
import lmstudio as lms
import requests
import scrapy
from IPython import embed
from openai import OpenAI
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request

# The OpenAI client requires an api_key argument; for LM Studio's local server the
# value is typically not checked, so a placeholder is sufficient.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="YOUR_ANY_API_KEY")


class ThreeFoldDocsSpider(scrapy.Spider):
    name = "threefold_docs"
    start_urls = ["https://threefold.info/tech/docs/"]

    def parse(self, response):
        # Extract the main content
        content = response.css("main").get()

        # Convert HTML to Markdown using LM Studio
        markdown_content = convert_html_to_markdown_with_lmstudio(content)

        # Save the content
        if markdown_content:
            # Remove leading whitespace and a Markdown code block fence if present
            markdown_content = markdown_content.lstrip()
            if markdown_content.startswith("```markdown"):
                markdown_content = markdown_content[len("```markdown\n") :]
            elif markdown_content.startswith("```"):
                markdown_content = markdown_content[len("```\n") :]

            with open("threefold_docs.md", "w", encoding="utf-8") as f:
                f.write(markdown_content)

            self.log("Saved content to threefold_docs.md")
        else:
            self.log(f"Could not convert HTML to Markdown for {response.url}")


def convert_html_to_markdown_with_lmstudio(html_content):
    """Convert HTML to Markdown using LM Studio with jinaai/ReaderLM-v2."""
    try:
        # Use the OpenAI-compatible API provided by LM Studio
        response = client.chat.completions.create(
            model="jinaai/ReaderLM-v2",  # Assuming this is the correct model ID
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that converts HTML to Markdown.",
                },
                {
                    "role": "user",
                    "content": f"Please convert the following HTML to Markdown:\n\n{html_content}",
                },
            ],
            stream=False,  # Set to True if streaming is desired
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error converting HTML to Markdown with LMStudio: {e}")
        return None


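# A possible offline fallback, not wired in here: html2text (imported above) can do
# the HTML-to-Markdown conversion locally without a model, e.g.
#   markdown_content = html2text.html2text(content)
# This is only a sketch of an alternative, not part of the current flow.
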
def scrape_threefold_docs():
    """Run the Scrapy spider to scrape the ThreeFold docs."""
    process = CrawlerProcess(
        {
            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "LOG_LEVEL": "INFO",
        }
    )

    process.crawl(ThreeFoldDocsSpider)
    process.start()

    return "threefold_docs.md"


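# Note: CrawlerProcess.start() blocks until the crawl finishes, and the underlying
# Twisted reactor cannot be restarted within the same process, so
# scrape_threefold_docs() can only be called once per run.
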
# Note: The lmstudio Python library primarily provides an OpenAI-compatible client.
# Model downloading and loading are typically handled manually in the LM Studio
# application or through its local server API, not directly via this client.
# Ensure the "jinaai/jina-embeddings-v2-base-en" model (or another Jina embeddings
# model, if preferred) is downloaded and loaded in LM Studio before running this script.


def get_embedding_with_lmstudio(text, model_name="jinaai/jina-embeddings-v2-base-en"):
    """Get an embedding for text using LM Studio with the specified model."""
    # The requested embeddings model must be downloaded and loaded in LM Studio
    # before this call is made.
    try:
        # Use the OpenAI-compatible API for embeddings
        response = client.embeddings.create(model=model_name, input=[text])
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding with LMStudio: {e}")
        print("Please ensure LM Studio is running and the specified model is loaded.")
        return None


def main():
    model_to_use = "jinaai/jina-embeddings-v2-base-en"

    markdown_file = scrape_threefold_docs()

    # Drop into an interactive IPython shell for inspection; remove this call to
    # run the script non-interactively.
    embed()

    if os.path.exists(markdown_file):
        with open(markdown_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Example usage of the embedding function
        embedding = get_embedding_with_lmstudio(content, model_to_use)
        if embedding:
            print(
                f"Successfully generated embedding (first 10 dimensions): {embedding[:10]}..."
            )
        else:
            print("Failed to generate embedding.")

    # Model unloading should be done manually in LM Studio.


if __name__ == "__main__":
    main()