...
@@ -1,2 +1,4 @@
 # herolib_python

+see also ~/code/git.ourworld.tf/tfgrid_research/tfdev
+has some useful stuff as well

@@ -1,86 +0,0 @@
#!/bin/bash
set -ex

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

source ../../env.sh

cd "$SCRIPT_DIR"

# 1. Install dependencies
uv pip install --upgrade scrapy markdownify

#!/bin/bash

set -e

# Ensure clean environment
rm -rf yew_docs output

# 1. Install required packages
uv pip install --upgrade scrapy markdownify

# 2. Create Scrapy project
scrapy startproject yew_docs
cd yew_docs

# 3. Update settings to ignore robots.txt and set export directory
echo -e "\nROBOTSTXT_OBEY = False" >> yew_docs/settings.py

# 4. Create Spider with filters
cat << 'EOF' > yew_docs/spiders/yew_docs_spider.py
import os
import scrapy
from urllib.parse import urlparse, urljoin
import markdownify

class YewDocsSpider(scrapy.Spider):
    name = "yew_docs"
    allowed_domains = ["yew.rs"]
    start_urls = ["https://yew.rs/docs/getting-started/introduction"]

    def parse(self, response):
        # Extract title
        title = response.css("title::text").get() or "Page"

        # Extract main content
        main = response.css("main").get()
        if not main:
            self.logger.warning(f"No main content at {response.url}")
            return

        # Convert to Markdown
        md = markdownify.markdownify(main, heading_style="ATX")

        # Construct clean file path
        parsed = urlparse(response.url)
        path = parsed.path.lstrip("/")
        if path.endswith("/") or path == "":
            path += "index"
        filepath = os.path.join("output", f"{path}.md")

        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {title.strip()}\n\n{md}")

        # Follow only clean internal links under /docs/
        for href in response.css("a::attr(href)").getall():
            link = urljoin(response.url, href)
            parsed = urlparse(link)
            path = parsed.path

            if parsed.netloc == "yew.rs" and path.startswith("/docs/"):
                if (
                    "/docs/0." in path or
                    "/docs/next" in path or
                    "/docs/en" in path or
                    "#" in parsed.fragment or
                    path.count("/") > 5
                ):
                    continue
                yield scrapy.Request(link.split("#")[0], callback=self.parse)
EOF

# 5. Run the spider
scrapy crawl yew_docs

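Note on the removed scraper above: each crawled page was written to a Markdown file whose path mirrors the URL path under output/. Below is a small standalone sketch of that mapping, extracted from the parse() body for illustration; the helper name url_to_markdown_path is made up here.

import os
from urllib.parse import urlparse

def url_to_markdown_path(url: str, out_dir: str = "output") -> str:
    # Same logic as the removed spider: drop the leading slash, map
    # directory-style URLs to an "index" file, then append ".md".
    path = urlparse(url).path.lstrip("/")
    if path.endswith("/") or path == "":
        path += "index"
    return os.path.join(out_dir, f"{path}.md")

# e.g. https://yew.rs/docs/getting-started/introduction
#   -> output/docs/getting-started/introduction.md
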
herolib.egg-info/PKG-INFO (new file, 12 lines)
@@ -0,0 +1,12 @@
Metadata-Version: 2.4
Name: herolib
Version: 0.1.0
Summary: A Python library for HeroCode
Author-email: Kilo Code <kilo.code@example.com>
Requires-Python: >=3.8
Description-Content-Type: text/markdown

# herolib_python

see also ~/code/git.ourworld.tf/tfgrid_research/tfdev
has some useful stuff as well

herolib.egg-info/SOURCES.txt (new file, 78 lines)
@@ -0,0 +1,78 @@
README.md
pyproject.toml
herolib.egg-info/PKG-INFO
herolib.egg-info/SOURCES.txt
herolib.egg-info/dependency_links.txt
herolib.egg-info/top_level.txt
lib/__init__.py
lib/clients/__init__.py
lib/clients/assemblyai/__init__.py
lib/clients/assemblyai/client.py
lib/clients/stellar/__init__.py
lib/clients/stellar/horizon.py
lib/clients/stellar/model.py
lib/clients/stellar/testnet.py
lib/clients/telegram/__init__.py
lib/clients/telegram/bot.py
lib/clients/telegram/bot_audio.py
lib/clients/telegram/bot_text.py
lib/clients/telegram/errorqueue.py
lib/clients/vimeo/__init__.py
lib/clients/vimeo/client.py
lib/clients/vimeo/model_video.py
lib/clients/whisper/__init__.py
lib/clients/whisper/convert.py
lib/clients/whisper/whisper.py
lib/clients/wireless/__init__.py
lib/clients/wireless/wigle_net.py
lib/core/__init__.py
lib/core/heroscript/__init__.py
lib/core/heroscript/heroaction.py
lib/core/heroscript/heroscripts.py
lib/core/heroscript/mixin.py
lib/core/heroscript/tools.py
lib/core/heroscript/examples/__init__.py
lib/core/heroscript/examples/heroscript_example.py
lib/core/heroscript/examples/heroscript_example2.py
lib/core/heroscript/examples/wiki/__init__.py
lib/core/heroscript/examples/wiki/sub/__init__.py
lib/core/logger/__init__.py
lib/core/logger/factory.py
lib/core/logger/log.py
lib/core/logger/log_test.py
lib/core/logger/model.py
lib/core/logger/search.py
lib/core/loghandler/__init__.py
lib/core/loghandler/mylogging.py
lib/core/pathlib/__init__.py
lib/core/pathlib/pathlib.py
lib/core/texttools/__init__.py
lib/core/texttools/texttools.py
lib/crypt/__init__.py
lib/crypt/box/__init__.py
lib/crypt/box/box.py
lib/crypt/box/box_api.py
lib/data/__init__.py
lib/data/ourtime/__init__.py
lib/data/ourtime/ourtime.py
lib/downloader/__init__.py
lib/downloader/scrape_dynamic/dynamic_crawl.py
lib/downloader/scrape_scapegraph/main.py
lib/downloader/scrape_scapegraph/scrape.py
lib/downloader/scrape_scapegraph/scrape_md.py
lib/downloader/scrape_scapegraph/scrape_search.py
lib/downloader/scrape_scapegraph/scrape_with_local_llm.py
lib/downloader/scrape_scapegraph/scrape_with_local_llm_search.py
lib/tools/__init__.py
lib/tools/extensions.py
lib/tools/gitscanner.py
lib/tools/logger.py
lib/tools/md5.py
lib/tools/ourtime.py
lib/tools/pathtools.py
lib/tools/texttools.py
lib/web/__init__.py
lib/web/doctools/__init__.py
lib/web/doctools/html_replacer.py
lib/web/doctools/md_replacer.py
lib/web/doctools/processor.py

herolib.egg-info/dependency_links.txt (new file: a single blank line)
@@ -0,0 +1 @@

herolib.egg-info/top_level.txt (new file, 1 line)
@@ -0,0 +1 @@
lib

lib/__init__.py (new empty file)
lib/clients/__init__.py (new empty file)
lib/clients/assemblyai/__init__.py (new empty file)
lib/clients/vimeo/__init__.py (new empty file)
lib/clients/wireless/__init__.py (new empty file)
lib/core/__init__.py (new empty file)
lib/core/heroscript/__init__.py (new empty file)
lib/core/heroscript/examples/__init__.py (new empty file)
lib/core/heroscript/examples/wiki/__init__.py (new empty file)
lib/core/heroscript/examples/wiki/sub/__init__.py (new empty file)
lib/core/logger/__pycache__/__init__.py (new empty file)
lib/core/loghandler/__init__.py (new empty file)

lib/core/loghandler/mylogging.py (new file, 214 lines)
@@ -0,0 +1,214 @@
from peewee import *
import time
from datetime import datetime
from typing import Optional, List, Dict, Any, Iterable, Union
import os
import logging
import traceback

# Configure database path
DB_DIR = os.path.expanduser('~/hero/var/logdb/')
DB_FILE = os.path.join(DB_DIR, 'logs.db')

# Create directory if it doesn't exist
os.makedirs(DB_DIR, exist_ok=True)

# Initialize database
database = SqliteDatabase(DB_FILE, pragmas={'journal_mode': 'wal'})

class BaseModel(Model):
    """Base model class for Peewee."""
    class Meta:
        database = database

    def to_dict(self) -> Dict[str, Any]:
        """Convert model instance to dictionary."""
        data = {}
        for field_name in self._meta.fields:
            field_value = getattr(self, field_name)
            if field_name in ('time', 'last_seen') and isinstance(field_value, int):
                # Convert epoch to a readable format for the frontend
                data[field_name] = datetime.fromtimestamp(field_value).strftime('%d-%m %H:%M')
            else:
                data[field_name] = field_value
        return data

class Log(BaseModel):
    """Model for INFO logs."""
    time = IntegerField(default=lambda: int(time.time()), index=True)
    email = CharField(max_length=255, null=True)
    logmsg = TextField()
    level = IntegerField(default=100)
    cat = CharField(max_length=100, index=True, default="general")
    payload = TextField(null=True)
    payload_cat = CharField(max_length=100, null=True)

    class Meta:
        table_name = 'logs'

class Error(BaseModel):
    """Model for ERROR logs."""
    time = IntegerField(default=lambda: int(time.time()), index=True)
    last_seen = IntegerField(default=lambda: int(time.time()), index=True)
    email = CharField(max_length=255, null=True)
    logmsg = TextField()
    stacktrace = TextField(null=True)
    count = IntegerField(default=1)
    cat = CharField(max_length=100, index=True, default="general")
    payload = TextField(null=True)
    payload_cat = CharField(max_length=100, null=True)

    class Meta:
        table_name = 'errors'

def init_db_logging():
    """Create tables if they don't exist."""
    with database:
        database.create_tables([Log, Error], safe=True)

class DatabaseLogHandler(logging.Handler):
    """A logging handler that writes logs to the Peewee database."""
    def emit(self, record):
        stacktrace = None
        if record.exc_info:
            stacktrace = logging.Formatter().formatException(record.exc_info)

        if record.levelno >= logging.ERROR:
            log_error(
                msg=record.getMessage(),
                cat=record.name,
                stacktrace=stacktrace
            )
        else:
            log_info(
                msg=record.getMessage(),
                level=record.levelno,
                cat=record.name
            )

def log_error(msg: str, cat: str = "general", email: Optional[str] = None, stacktrace: Optional[str] = None, payload: Optional[str] = None, payload_cat: Optional[str] = None):
    """Log an ERROR message to the database, handling duplicates."""
    try:
        log_info(msg=msg, cat=cat, email=email, payload=payload, payload_cat=payload_cat)
    except Exception as e:
        pass
    try:
        if not stacktrace:
            # Capture the current stack trace if not provided
            stacktrace = "".join(traceback.format_stack())

        # Filter out irrelevant lines from the stack trace
        if stacktrace:
            lines = stacktrace.split('\n')
            filtered_lines = [
                line for line in lines
                if 'python3.13/logging' not in line and 'src/mylogging.py' not in line
            ]
            stacktrace = '\n'.join(filtered_lines)

        one_day_ago = int(time.time()) - (24 * 3600)

        # Look for a similar error in the last 24 hours from the same user
        existing_error = Error.select().where(
            (Error.logmsg == msg) &
            (Error.email == email) &
            (Error.last_seen >= one_day_ago)
        ).first()

        if existing_error:
            # If found, increment counter and update last_seen
            existing_error.count += 1
            existing_error.last_seen = int(time.time())
            existing_error.stacktrace = stacktrace
            existing_error.save()
            print(existing_error)
        else:
            # Otherwise, create a new error record
            Error.create(
                logmsg=msg,
                cat=cat,
                email=email,
                stacktrace=stacktrace,
                payload=payload,
                payload_cat=payload_cat
            )
            logging.info(f"Successfully logged new error: {msg}")

    except Exception as e:
        logging.error(f"Failed to log error to {DB_FILE}: {e}")

def log_info(msg: str, level: int = 0, cat: str = "general", email: Optional[str] = None, payload: Optional[str] = None, payload_cat: Optional[str] = None):
    """Log an INFO message to the database."""
    try:
        Log.create(logmsg=msg, level=level, cat=cat, email=email, payload=payload, payload_cat=payload_cat)
    except Exception as e:
        print(f"Failed to log info to {DB_FILE}: {e}")

def get_errors(search: Optional[str] = None, cat: Optional[str] = None) -> List[Dict[str, Any]]:
    """Get errors from the database with optional filters. Category search is prefix-based."""
    query = Error.select().order_by(Error.last_seen.desc())
    if search:
        query = query.where(Error.logmsg.contains(search))
    if cat and cat.strip():
        query = query.where(Error.cat.startswith(cat.strip()))
    return [e.to_dict() for e in query]

def get_logs(
    search: Optional[str] = None,
    cat: Optional[str] = None,
    level: Optional[int] = None,
    hours_ago: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """Get logs from the database with optional filters. Category search is prefix-based."""
    query = Log.select().order_by(Log.time.desc())

    if search and search.strip():
        query = query.where(Log.logmsg.contains(search))

    if cat and cat.strip():
        query = query.where(Log.cat.startswith(cat.strip()))

    if level is not None:
        query = query.where(Log.level <= level)

    if hours_ago is not None:
        time_ago = int(time.time()) - (hours_ago * 3600)
        query = query.where(Log.time >= time_ago)

    return [l.to_dict() for l in query]

def get_log_by_id(log_id: int) -> Optional[Dict[str, Any]]:
    """Get a single log by its ID."""
    try:
        log = Log.get_by_id(log_id)
        return log.to_dict()
    except Log.DoesNotExist:
        return None

def delete_logs_older_than(minutes: int):
    """Delete logs older than a specified number of minutes."""
    time_ago = int(time.time()) - (minutes * 60)
    Log.delete().where(Log.time < time_ago).execute()

def delete_errors_older_than(minutes: int):
    """Delete errors older than a specified number of minutes."""
    time_ago = int(time.time()) - (minutes * 60)
    Error.delete().where(Error.time < time_ago).execute()

def get_unique_log_categories() -> List[str]:
    """Get unique log categories from the database."""
    query = (Log
             .select(Log.cat)
             .where(Log.cat.is_null(False))
             .distinct()
             .order_by(Log.cat))
    return [l.cat for l in query]

def get_unique_error_categories() -> List[str]:
    """Get unique error categories from the database."""
    query = (Error
             .select(Error.cat)
             .where(Error.cat.is_null(False))
             .distinct()
             .order_by(Error.cat))
    return [e.cat for e in query]

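A minimal usage sketch for the new loghandler module, assuming peewee is installed and the package is importable as lib.core.loghandler.mylogging (the path matches SOURCES.txt and top_level.txt); the wiring below is illustrative, not part of the commit:

import logging
from lib.core.loghandler.mylogging import (
    DatabaseLogHandler,
    get_errors,
    init_db_logging,
    log_info,
)

init_db_logging()                                     # create the SQLite tables under ~/hero/var/logdb/
logging.getLogger().addHandler(DatabaseLogHandler())  # route stdlib logging into the database

logging.getLogger("payments").error("charge failed")  # ERROR and above land in the 'errors' table
log_info("direct write, bypassing the logging module", cat="payments.checkout")

for err in get_errors(cat="payments"):                # category filters are prefix matches
    print(err["time"], err["count"], err["logmsg"])
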
lib/core/pathlib/__pycache__/__init__.py (new empty file)
lib/crypt/__init__.py (new empty file)
lib/crypt/box/__init__.py (new empty file)
lib/data/__init__.py (new empty file)
lib/data/ourtime/__pycache__/__init__.py (new empty file)
lib/downloader/__init__.py (new empty file)

@@ -1,412 +0,0 @@
import json
import logging
import mimetypes  # Added
import os
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.utils.project import get_project_settings

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

STATE_FILE_NAME = ".download_state.json"


class GenericDownloaderSpider(scrapy.Spider):
    name = "generic_downloader"

    def __init__(
        self,
        start_url,
        dest_dir,
        allowed_domains,
        ignore_paths=None,
        depth_limit=0,
        follow_links=True,
        max_age_hours=0,
        state_data=None,
        *args,
        **kwargs,
    ):
        super(GenericDownloaderSpider, self).__init__(*args, **kwargs)
        self.start_urls = [start_url]
        self.dest_dir = dest_dir
        self.allowed_domains = allowed_domains
        self.ignore_paths = ignore_paths if ignore_paths else []
        self.depth_limit = int(depth_limit)
        self.follow_links = bool(follow_links)
        self.max_age_hours = int(max_age_hours)
        self.state_data = state_data if state_data else {}
        self.link_extractor = LinkExtractor(allow_domains=self.allowed_domains)

        os.makedirs(self.dest_dir, exist_ok=True)
        logger.info(f"Downloader initialized for {start_url}")
        logger.info(f"Destination directory: {self.dest_dir}")
        logger.info(f"Allowed domains: {self.allowed_domains}")
        logger.info(f"Ignore paths: {self.ignore_paths}")
        logger.info(f"Depth limit: {self.depth_limit}")
        logger.info(f"Follow links: {self.follow_links}")
        logger.info(f"Max age (hours): {self.max_age_hours}")

    def _should_ignore(self, url_path):
        for pattern in self.ignore_paths:
            if pattern in url_path:  # Simple substring match for now, can be regex
                return True
        return False

    def _get_file_path(self, response):  # Changed signature to take response
        url = response.url
        parsed_url = urlparse(url)
        original_path = parsed_url.path  # e.g. /foo/bar.html or /foo/ or /

        # Determine base_name and current_ext from original_path
        if original_path.endswith("/"):
            base_name = "index"
            current_ext = ""
            # path_for_dirs is the path part that forms the directory structure
            path_for_dirs = original_path.lstrip("/")
        else:
            path_basename = os.path.basename(original_path)
            if (
                not path_basename and original_path == "/"
            ):  # Root path e.g. http://example.com
                base_name = "index"
                current_ext = ""
            else:  # e.g. /file.txt or /file_no_ext or /.config
                base_name, current_ext = os.path.splitext(path_basename)
                if not base_name and current_ext:  # Hidden file like /.bashrc
                    base_name = current_ext  # Treat .bashrc as base_name
                    current_ext = ""  # No further extension part
            path_for_dirs = os.path.dirname(original_path.lstrip("/"))

        # Try to get extension from Content-Type
        content_type = (
            response.headers.get("Content-Type", b"")
            .decode("utf-8")
            .split(";")[0]
            .strip()
        )
        mime_ext = mimetypes.guess_extension(content_type) if content_type else None

        final_ext = current_ext
        if mime_ext and not current_ext:  # No path extension, use MIME type's
            final_ext = mime_ext
        elif (
            mime_ext
            and current_ext.lower() in [".htm", ".html"]
            and mime_ext
            and mime_ext.lower() not in [".htm", ".html"]
        ):
            # Path had .html/.htm, but MIME type suggests something more specific
            final_ext = mime_ext
            logger.debug(
                f"URL {url}: Path ext {current_ext} overridden by Content-Type ext {mime_ext}."
            )
        elif not final_ext and (
            content_type.startswith("text/")
            or content_type
            in ["application/javascript", "application/json", "application/xml"]
        ):
            # Fallback for common text types if no extension determined yet and no path ext
            if not base_name.endswith(
                (".js", ".css", ".json", ".xml", ".txt")
            ):  # Avoid double .html.html
                final_ext = ".html"

        filename = base_name + final_ext

        # Create path components for the directory structure
        components = []
        if path_for_dirs:
            components.extend(comp for comp in path_for_dirs.split("/") if comp)
        components.append(filename)

        # Sanitize components
        sane_components = []
        for comp_idx, comp_val in enumerate(components):
            # Basic sanitization: replace invalid chars, limit length, avoid '..'
            # Allow '.' for filenames but not as a full component name if it's not the only char
            if comp_val == "..":
                continue  # Skip parent dir references in path construction

            sane_comp = "".join(
                c if c.isalnum() or c in ["-", "_", "."] else "_" for c in comp_val
            )
            sane_comp = sane_comp[:150]  # Limit component length

            if (
                not sane_comp and comp_idx == len(components) - 1
            ):  # last component (filename) became empty
                sane_comp = "downloaded_file" + final_ext  # fallback filename
            elif not sane_comp:
                sane_comp = "_"  # placeholder for empty dir name

            if sane_comp:  # Ensure component is not empty after sanitization
                sane_components.append(sane_comp)

        if not sane_components:  # If all components were sanitized away or skipped
            sane_components = [filename if filename else "unknown_file" + final_ext]

        file_path = os.path.join(self.dest_dir, *sane_components)
        return file_path

    def parse(self, response, depth=0):
        url = response.url
        logger.info(f"Processing URL (depth {depth}): {url}")

        parsed_url = urlparse(url)
        if self._should_ignore(parsed_url.path):
            logger.info(f"Ignoring URL (matches ignore_paths): {url}")
            return

        file_path = self._get_file_path(response)  # Pass response object

        # Check download state and max_age
        if url in self.state_data:
            url_state = self.state_data[url]
            last_download_time_str = url_state.get("timestamp")
            # Consider previous status; only skip if it was a success or another skip
            can_skip_based_on_history = url_state.get("status", "").startswith(
                "success"
            ) or url_state.get("status", "").startswith("skipped")

            if last_download_time_str and can_skip_based_on_history:
                last_download_time = datetime.fromisoformat(last_download_time_str)
                if self.max_age_hours > 0 and (
                    datetime.utcnow() - last_download_time
                ) < timedelta(hours=self.max_age_hours):
                    logger.info(
                        f"Skipping download for {url}, recently processed at {last_download_time_str} with status '{url_state.get('status')}'."
                    )
                    # Update state to reflect this skip check
                    self.state_data[url]["status"] = "skipped_max_age"
                    self.state_data[url]["skipped_timestamp"] = (
                        datetime.utcnow().isoformat()
                    )
                    # Still need to check for links if recursive
                    # Corrected depth condition:
                    # Follow if self.depth_limit is 0 (infinite) OR current depth is less than a positive limit.
                    if self.follow_links and (
                        self.depth_limit == 0 or depth < self.depth_limit
                    ):
                        for link in self.link_extractor.extract_links(response):
                            parsed_link_url = urlparse(link.url)
                            if not self._should_ignore(parsed_link_url.path):
                                yield response.follow(link, callback=self.parse)
                            else:
                                logger.info(
                                    f"Ignoring extracted link (matches ignore_paths): {link.url}"
                                )
                    return

        logger.info(f"Processing and saving {url} to {file_path}")
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        try:
            with open(file_path, "wb") as f:
                f.write(response.body)
            logger.info(f"Successfully saved {url} to {file_path}")
            self.state_data[url] = {
                "timestamp": datetime.utcnow().isoformat(),
                "status": "success",
                "path": file_path,
                "size": len(response.body),
            }
        except Exception as e:
            logger.error(f"Failed to save {url} to {file_path}: {e}")
            self.state_data[url] = {
                "timestamp": datetime.utcnow().isoformat(),
                "status": "failed",
                "error": str(e),
            }
            return  # Do not proceed further if save failed

        # Corrected depth condition for following links:
        # Follow if self.depth_limit is 0 (infinite) OR current depth is less than a positive limit.
        if self.follow_links and (self.depth_limit == 0 or depth < self.depth_limit):
            logger.info(
                f"Following links from {url} at custom depth {depth} (for next level {depth + 1})"
            )
            extracted_links = list(self.link_extractor.extract_links(response))
            if not extracted_links:
                logger.info(f" No links extracted from {url} by LinkExtractor.")
            else:
                logger.info(
                    f" LinkExtractor found {len(extracted_links)} links from {url}: {[l.url for l in extracted_links]}"
                )

            for link_idx, link in enumerate(extracted_links):
                logger.debug(
                    f" Considering link {link_idx + 1}/{len(extracted_links)}: Text='{link.text}', URL='{link.url}'"
                )
                parsed_link_url = urlparse(link.url)
                if self._should_ignore(parsed_link_url.path):
                    logger.info(
                        f" Ignoring extracted link (matches ignore_paths): {link.url}"
                    )
                else:
                    logger.info(
                        f" Yielding request for: {link.url} (to be processed at custom depth {depth + 1})"
                    )
                    yield response.follow(link, callback=self.parse)

    def closed(self, reason):
        logger.info(f"Spider closed: {reason}. Finalizing and saving state.")
        state_file_path = os.path.join(self.dest_dir, STATE_FILE_NAME)
        try:
            # Ensure the directory for the state file exists, though dest_dir should already.
            os.makedirs(os.path.dirname(state_file_path), exist_ok=True)
            with open(state_file_path, "w") as f:
                json.dump(self.state_data, f, indent=4)
            logger.info(
                f"Spider successfully saved state ({len(self.state_data)} items) to {state_file_path}"
            )
        except Exception as e:
            logger.error(
                f"Spider failed to save state to {state_file_path}: {e}", exc_info=True
            )


def download_site(
    start_url,
    dest_dir,
    recursive=True,
    ignore_paths=None,
    depth_limit=0,  # 0 means no limit if recursive is True
    follow_links=True,  # This is somewhat redundant if recursive is True, but good for clarity
    max_age_hours=24,  # Re-download if older than 24 hours
):
    """
    Downloads a website or a single page.

    :param start_url: The URL to start downloading from.
    :param dest_dir: The directory to save downloaded files.
    :param recursive: Whether to download recursively.
    :param ignore_paths: A list of path substrings or regex patterns to ignore.
    :param depth_limit: Maximum depth for recursive downloads (0 for no limit).
    :param follow_links: Whether to follow links on pages (primarily for recursive).
    :param max_age_hours: Max age of a file in hours. If a file was downloaded
                          more recently than this, it won't be re-downloaded.
                          0 means always re-download.
    :return: A dictionary summarizing the download process.
    """
    parsed_url = urlparse(start_url)
    if not parsed_url.scheme or not parsed_url.netloc:
        logger.error(
            f"Invalid start_url: {start_url}. Must be a full URL (e.g., http://example.com)"
        )
        return None

    allowed_domains = [parsed_url.hostname]  # Changed from netloc to hostname

    state_file_path = os.path.join(dest_dir, STATE_FILE_NAME)
    state_data = {}
    if os.path.exists(state_file_path):
        try:
            with open(state_file_path, "r") as f:
                state_data = json.load(f)
            logger.info(f"Loaded download state from {state_file_path}")
        except json.JSONDecodeError:
            logger.warning(
                f"Could not decode JSON from state file {state_file_path}. Starting fresh."
            )
        except Exception as e:
            logger.error(
                f"Error loading state file {state_file_path}: {e}. Starting fresh."
            )

    settings = get_project_settings()
    settings.set("ROBOTSTXT_OBEY", False)  # Explicitly disable robots.txt
    # settings.set('LOG_LEVEL', 'DEBUG') # Optionally enable for more Scrapy internal logs

    effective_scrapy_depth = 0  # Default for non-recursive or depth_limit=0 with recursion (0 means infinite for Scrapy)
    if recursive and int(depth_limit) > 0:
        effective_scrapy_depth = int(depth_limit)
    # If not recursive, effective_scrapy_depth remains 0.
    # If recursive and depth_limit is 0, effective_scrapy_depth remains 0 (infinite).
    settings.set("DEPTH_LIMIT", effective_scrapy_depth)

    logger.info(f"Scrapy DEPTH_LIMIT set to: {effective_scrapy_depth}")
    # Scrapy's DEPTH_PRIORITY and SCHEDULER_DISK_QUEUE might be useful for large crawls
    # For now, keeping it simple.

    process = CrawlerProcess(settings)

    # The spider needs to be instantiated with all its custom args
    # Scrapy's process.crawl can take kwargs which are passed to the spider's __init__
    process.crawl(
        GenericDownloaderSpider,
        start_url=start_url,
        dest_dir=dest_dir,
        allowed_domains=allowed_domains,
        ignore_paths=ignore_paths,
        depth_limit=int(depth_limit)
        if recursive
        else 0,  # Spider handles its own depth based on this
        follow_links=follow_links and recursive,
        max_age_hours=int(max_age_hours),
        state_data=state_data,
    )

    logger.info(f"Starting download process for {start_url}...")
    process.start()  # This will block until the crawl is finished

    # The spider's closed() method is now responsible for writing the final state.
    # Load this definitive state to build the summary.
    final_state_data_for_summary = {}
    if os.path.exists(state_file_path):
        try:
            with open(state_file_path, "r") as f:
                final_state_data_for_summary = json.load(f)
            logger.info(
                f"Loaded final state ({len(final_state_data_for_summary)} items) from {state_file_path} for summary construction."
            )
        except json.JSONDecodeError as e:
            logger.error(
                f"Error decoding JSON from final state file {state_file_path} for summary: {e}. Summary will be based on empty or incomplete state."
            )
        except Exception as e:
            logger.error(
                f"Error loading final state from {state_file_path} for summary: {e}. Summary will be based on empty or incomplete state."
            )
    else:
        logger.warning(
            f"State file {state_file_path} not found after spider closed. Summary will be based on empty state."
        )

    summary = {
        "start_url": start_url,
        "dest_dir": dest_dir,
        "total_processed_urls": len(final_state_data_for_summary),
        "successful_downloads": 0,
        "failed_downloads": 0,
        "skipped_max_age": 0,
        "total_bytes_downloaded": 0,
        "state_file_path": state_file_path,
        "errors": [],
    }

    # Populate summary from the final_state_data_for_summary loaded from the file
    for url_key, data_val in final_state_data_for_summary.items():
        status = data_val.get("status")
        if status == "success":
            summary["successful_downloads"] += 1
            summary["total_bytes_downloaded"] += data_val.get("size", 0)
        elif status == "failed":
            summary["failed_downloads"] += 1
            if "error" in data_val:
                summary["errors"].append(f"URL: {url_key}, Error: {data_val['error']}")
        elif status == "skipped_max_age":
            summary["skipped_max_age"] += 1
    # Any errors during state file loading for summary should also be noted if critical
    # For now, the logs capture it. If final_state_data_for_summary is empty due to load error, summary will reflect that.

    logger.info(f"Download process finished. Summary: {json.dumps(summary, indent=2)}")
    return summary

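For reference, the removed module's single entry point was download_site(), documented in the docstring above; a minimal illustrative call (URL and destination directory are placeholders) would have looked roughly like this:

summary = download_site(
    start_url="https://example.com/docs/",
    dest_dir="/tmp/example_mirror",     # placeholder destination
    recursive=True,
    ignore_paths=["/blog/", "/tags/"],  # substring matches against URL paths
    depth_limit=2,                      # 0 would mean no depth limit
    max_age_hours=24,                   # skip URLs fetched successfully within the last day
)
if summary:
    print(summary["successful_downloads"], summary["total_bytes_downloaded"])
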
lib/downloader/scrape_dynamic (new symbolic link)
@@ -0,0 +1 @@
../../../../tfgrid_research/tfdev/research/scrape_dynamic

lib/downloader/scrape_fast (new symbolic link)
@@ -0,0 +1 @@
../../../../tfgrid_research/tfdev/research/scrape_fast

lib/downloader/scrape_scapegraph (new symbolic link)
@@ -0,0 +1 @@
../../../../tfgrid_research/tfdev/research/scrape_scapegraph

lib/web/__init__.py (new empty file)
lib/web/doctools/__init__.py (new empty file)

@@ -1,15 +1,18 @@
 [project]
-name = "Herolib"
-version = "0.9.0"
-description = "Lib from Hero's project for Actors"
-requires-python = ">=3.13"
-dependencies = [
-    "peewee>=3.17.0",
-    "pygments>=2.16.1",
-    "toml",
-    "requests>=2.31.0",
-    "beautifulsoup4>=4.12.2",
-    "pydantic>=2.8.0",
-    "scrapy==2.13.3",
-    "markdownify=1.1.0"
-]
+name = "herolib"
+version = "0.1.0"
+description = "A Python library for HeroCode"
+authors = [
+    { name = "Kilo Code", email = "kilo.code@example.com" }
+]
+readme = "README.md"
+requires-python = ">=3.8"
+dependencies = []
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["lib*"]

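The new packaging metadata publishes the code as the top-level package lib (per top_level.txt and the setuptools find directive) and declares no runtime dependencies, so libraries such as peewee, used by lib/core/loghandler/mylogging.py, have to be installed separately. A minimal sketch of consuming the installed distribution, assuming a plain pip install of this project:

from importlib.metadata import version

import lib  # packaged via [tool.setuptools.packages.find] with include = ["lib*"]

print(version("herolib"))  # expected: 0.1.0
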