This commit is contained in:
2025-08-20 04:15:43 +02:00
parent 6b9f0cf291
commit e4bb201181
95 changed files with 194 additions and 907 deletions

0
herolib/web/__init__.py Normal file
View File

View File

View File

@@ -0,0 +1,94 @@
from herotools.logger import logger
from bs4 import BeautifulSoup
import re
from typing import Callable
from herotools.texttools import name_fix
# Callable type aliases for the link and content fetching functions.
# LinkFetcher: five str arguments -> str; this is the shape of
# _get_link(language, prefix, site_name, pagename, name).
# NOTE(review): neither alias is referenced by the code visible here.
LinkFetcher = Callable[[str, str, str, str, str], str]
# ContentFetcher: four str arguments -> str; this is the shape of
# _get_content(language, site_name, pagename, name).
ContentFetcher = Callable[[str, str, str, str], str]
# Private functions to be used internally
def _get_link(language: str, prefix: str, site_name: str, pagename: str, name: str) -> str:
    """
    Return the image link for ``name`` (placeholder implementation).

    The result is ``prefix`` immediately followed by
    ``language/site_name/pagename/name.jpg``. ``prefix`` is concatenated
    as-is, so it has to bring its own trailing separator.
    """
    # Replace this with your logic to get the actual link
    language_column = f"{language[:10]:<10}"
    logger.debug(f"_get_link: {language_column} {site_name}:{pagename}:{name}")
    relative_path = f"{language}/{site_name}/{pagename}/{name}.jpg"
    return f"{prefix}{relative_path}"
def _get_content(language: str, site_name: str, pagename: str, name: str) -> str:
    """
    Return the replacement text for ``name`` (placeholder implementation).

    The returned sentence simply echoes the name, page, language and site it
    was asked for.
    """
    # Replace this with your logic to get the actual content
    language_column = f"{language[:10]:<10}"
    logger.debug(f"_get_content: {language_column} {site_name}:{pagename}:{name}")
    replacement = f"Replaced text for {name} on page {pagename} in {language} language on {site_name} site"
    return replacement
def _process_html(language: str, prefix: str, site_name: str, pagename: str, html_content: str) -> str:
    """
    Function to process HTML and replace content based on marker CSS classes.
    This allows us to work with templates and get content based on language to replace in HTML.

    Elements opt in through one of two marker classes:

    * ``!!img:<name>`` - the ``src`` attribute is set to the link returned by
      ``_get_link``. Applies to ``<img>`` elements and to any other element
      that already carries a ``src`` attribute; other elements are left alone.
    * ``!!txt:<name>`` - the element's content is replaced with the text
      returned by ``_get_content``.

    ``<name>`` is everything after the first ``:`` of the class and is
    normalised with ``name_fix`` before use. Markers with an empty name are
    ignored.

    Args:
        language: Language identifier; normalised with ``name_fix``.
        prefix: Base for generated image links; surrounding whitespace is
            stripped and a trailing ``/`` is appended when missing.
        site_name: Site identifier; normalised with ``name_fix``.
        pagename: Page identifier; normalised with ``name_fix``.
        html_content: The HTML source to process.

    Returns:
        The modified document serialised back to a string.
    """
    language = name_fix(language)
    site_name = name_fix(site_name)
    pagename = name_fix(pagename)
    prefix = prefix.strip()
    if not prefix.endswith('/'):
        prefix += '/'
    soup = BeautifulSoup(html_content, 'html.parser')
    # Select every element carrying at least one !!img:/!!txt: marker class
    for element in soup.find_all(class_=re.compile(r'!!(img|txt):(.+)')):
        for cls in element['class']:
            # Split on the FIRST ':' only, so the name is the whole remainder
            # of the class - the same span the regex above captures as (.+).
            # A plain split(':')[1] would cut "a:b" down to "a".
            marker, _, raw_name = cls.partition(':')
            if marker not in ('!!img', '!!txt') or not raw_name:
                # Ordinary CSS class, or a marker without a name
                continue
            name = name_fix(raw_name)
            if marker == '!!img':
                # Get the link to replace the src attribute in !!img: elements
                link = _get_link(language=language, prefix=prefix, site_name=site_name, pagename=pagename, name=name)
                # <img> always gets a src; any other element only when it
                # already has one (e.g. <source>, <video>)
                if element.name == 'img' or 'src' in element.attrs:
                    element['src'] = link
            else:
                # Get the content to replace the text in !!txt: elements
                content = _get_content(language=language, site_name=site_name, pagename=pagename, name=name)
                element.string = content
    # Output the modified HTML
    return str(soup)
# Public function to process the HTML content
def process(language: str, prefix: str, site_name: str, pagename: str, html_content: str) -> str:
    """
    Public entry point for processing HTML and replacing content based on tags.

    Hands all arguments unchanged to the internal ``_process_html`` function
    and returns its result.
    """
    processed = _process_html(language, prefix, site_name, pagename, html_content)
    return processed
# Sample usage with a given language, site name, page name, and HTML content
if __name__ == "__main__":
    # Demo run: feed a small template through process() and print the result.
    html_content = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sample Page</title>
</head>
<body>
<h2 class="mb-6 is-size-1 is-size-3-mobile has-text-weight-bold !!txt:title1">Take care of your performance every day.</h2>
<img class="responsive !!img:logo" src="old-link.jpg" alt="Company Logo">
<p class="content !!txt:description">This is a sample description text.</p>
</body>
</html>
'''
    # Language, site, page and link prefix used for the demo
    demo_settings = {
        "language": "en",
        "site_name": "ExampleSite",
        "pagename": "HomePage",
        "prefix": "http://localhost/images/",
    }
    # Print the modified HTML
    print(process(html_content=html_content, **demo_settings))

View File

@@ -0,0 +1,172 @@
import sys
import os
# Add the parent directory of herotools to the Python module search path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from herotools.logger import logger
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
import re
from enum import Enum
from herotools.texttools import name_fix
from mdformat.renderer import MDRenderer
from urllib.parse import urlparse
class ImageType(Enum):
    """Closed set of image type tags; each member's value is its lower-case name."""

    JPEG = "jpeg"
    PNG = "png"
    GIF = "gif"
    OTHER = "other"
def get_link_page(prefix: str, linkname: str, sitename: str, name: str) -> str:
    """
    Generates a markdown page link based on prefix, sitename and name.

    Args:
        prefix (str): Base URL the link is rooted at. Trailing slashes are
            ignored, so "http://host/pre" and "http://host/pre/" give the
            same result.
        linkname (str): The text shown for the link.
        sitename (str): The name of the site.
        name (str): The name of the page.

    Returns:
        str: The generated link, ``[linkname](prefix/sitename/name)``.
    """
    logger.debug(f"get_link_page: {prefix[:60]:<60} {linkname} {sitename}:{name}")
    # Drop trailing slashes before joining; otherwise a prefix that already
    # ends in "/" would produce ".../pre//site/page".
    base = prefix.rstrip('/')
    return f"[{linkname}]({base}/{sitename}/{name})"
def get_link_image(prefix: str, sitename: str, name: str, image_type: ImageType) -> str:
    """
    Generates a markdown image link based on prefix, sitename and name.

    Args:
        prefix (str): Base URL the image is rooted at. Trailing slashes are
            ignored, so "http://host/pre" and "http://host/pre/" give the
            same result.
        sitename (str): The name of the site.
        name (str): The name of the image.
        image_type (ImageType): The type of the image. Accepted but not used
            by the current implementation.

    Returns:
        str: The generated link, ``![](prefix/sitename/name)`` (empty alt text).
    """
    logger.debug(f"get_link_image: {prefix[:60]:<60} {sitename}:{name}")
    # Drop trailing slashes before joining; otherwise a prefix that already
    # ends in "/" would produce ".../pre//site/image".
    base = prefix.rstrip('/')
    return f"![]({base}/{sitename}/{name})"
def get_include(sitename: str, name: str) -> str:
    """
    Builds the include directive for a page of a site.

    Args:
        sitename (str): The name of the site.
        name (str): The name of the page to include.

    Returns:
        str: The directive text, ``include: sitename/name``.
    """
    logger.debug(f"get_include: {sitename}:{name}")
    directive = f"include: {sitename}/{name}"
    return directive
def replace(prefix: str, markdown: str) -> str:
    """
    Finds all image links and markdown page links in the provided markdown text
    and re-roots them under ``prefix``.

    For every image ``src`` and link ``href`` only the path component of the
    original URL (as parsed by ``urlparse``) is kept and appended to
    ``prefix``; scheme, host, query and fragment are dropped.

    Custom ``!!include page:'...'`` directives are currently NOT rewritten:
    the substitution step at the end of this function is disabled.

    Args:
        prefix (str): Base URL the rewritten links are rooted at. Trailing
            slashes are ignored.
        markdown (str): The markdown content.

    Returns:
        str: The markdown re-rendered with ``MDRenderer``, with updated links.
    """
    # Initialize the Markdown parser
    md = MarkdownIt()
    tokens = md.parse(markdown)
    ast = SyntaxTreeNode(tokens)
    # Dump the tree through the logger rather than printing it to stdout on
    # every call.
    logger.debug(ast.pretty(indent=2, show_text=True))

    def get_new_url(url: str) -> str:
        """Return ``prefix`` joined with the path component of ``url``."""
        logger.debug(f"url: {url}")
        parsed_url = urlparse(url)
        image_path = parsed_url.path
        logger.debug(f"parsed_url: {parsed_url}")
        new_url = f"{prefix.rstrip('/')}/{image_path.strip('/')}"
        logger.debug(f"new_url: {new_url}")
        return new_url

    def process_node(node: SyntaxTreeNode) -> None:
        """Rewrite the URL of ``node`` if it is an image or link, then recurse."""
        # NOTE(review): this relies on SyntaxTreeNode.attrs exposing the attrs
        # dict of the underlying token, so that the change is visible when
        # `tokens` is rendered below - confirm against the markdown-it-py
        # version in use.
        if node.type == 'image':
            # Process image link
            url = node.attrs.get('src', '')
            node.attrs['src'] = get_new_url(url)
        elif node.type == 'link':
            # Process markdown page link
            url = node.attrs.get('href', '')
            node.attrs['href'] = get_new_url(url)
        # Recursively process child nodes
        for child in node.children or []:
            process_node(child)

    def replace_include_directives(match: re.Match) -> str:
        """
        Replaces custom include directives with appropriate links.

        Currently unused; see the disabled substitution below.

        Args:
            match (re.Match): The match object containing the found include directive.

        Returns:
            str: The generated include directive.
        """
        url = match.group(1)
        if ':' in url:
            # "site:some/path/page" -> site name plus the last path segment
            site_name, page = url.split(':', 1)
            page_name = page.split('/')[-1]
        else:
            site_name = ""
            page_name = url
        # Apply the extension to the name that is actually passed on
        if not page_name.endswith('.md'):
            page_name += '.md'
        return get_include(site_name, page_name)

    # Process the root node
    process_node(ast)
    # Convert the tokens back to markdown
    renderer = MDRenderer()
    options = {}
    env = {}
    rendered_markdown = renderer.render(tokens, options, env)
    # Include-directive substitution is disabled; to enable it:
    # include_pattern = re.compile(r"!!include page:'(.*?)'")
    # rendered_markdown = include_pattern.sub(replace_include_directives, rendered_markdown)
    return rendered_markdown
if __name__ == "__main__":
    # Demo input: one image, one page link and several include directives
    text = """
![Image description](https://example.com/image.png)
[Page link](sitename:some/path/to/page.md)
!!include page:'mypage'
!!include page:'mypage.md'
!!include page:'mysite:mypage
!!include page:'mysite:mypage'
!!include page:'mysite:mypage.md'
"""
    # Show the input first, then the rewritten markdown
    print(text)
    print(replace("http://localhost:8080/pre/", text))

View File

@@ -0,0 +1,94 @@
import os
import re
from typing import Callable
from herotools.logger import logger
from herotools.md5 import file_md5
from herotools.texttools import name_fix
def _example_set_file(site_name: str, path: str, md5: str) -> None:
    """Example ``set_file`` callback: only logs the call at debug level."""
    # Placeholder for actual implementation
    site_column = f"{site_name[:20]:<20}"
    logger.debug(f"set_file : site_name={site_column} {path}")
def _example_set_img(site_name: str, path: str, md5: str) -> None:
    """Example ``set_img`` callback: only logs the call at debug level."""
    # Placeholder for actual implementation
    site_column = f"{site_name[:20]:<20}"
    logger.debug(f"set_img : site_name={site_column} {path}")
def _example_set_markdown(
    site_name: str, path: str, md5: str, content: str
) -> None:
    """Example ``set_markdown`` callback: only logs the call at debug level."""
    # Placeholder for actual implementation
    site_column = f"{site_name[:20]:<20}"
    logger.debug(f"set_markdown : site_name={site_column} {path}")
def _example_set_site(site_name: str, path: str) -> None:
    """Example ``set_site`` callback: only logs the call at info level."""
    # Placeholder for actual implementation
    site_column = f"{site_name[:20]:<20}"
    logger.info(f"set_site : site_name={site_column} {path}")
def _site_process_action(
    site_name: str,
    site_path: str,
    set_file: Callable[[str, str, str], None],
    set_img: Callable[[str, str, str], None],
    set_markdown: Callable[[str, str, str, str], None],
) -> None:
    """
    Walk one site directory and hand every file to the matching callback.

    Dispatch per file, by name:

    * ``.collection``, ``.site`` and ``.done`` marker files are skipped.
    * ``*.md`` (case-insensitive) -> ``set_markdown(site_name, rel_path, md5, content)``
      with the file read as UTF-8 text.
    * image extensions (jpg, jpeg, png, gif, bmp, tiff, webp; case-insensitive)
      -> ``set_img(site_name, rel_path, md5)``.
    * anything else -> ``set_file(site_name, rel_path, md5)``.

    ``rel_path`` is the file path relative to ``site_path`` and ``md5`` is
    the value returned by ``file_md5`` for the file.

    Args:
        site_name: Name passed through to every callback.
        site_path: Root directory of the site to walk.
        set_file: Callback for files that are neither markdown nor images.
        set_img: Callback for image files.
        set_markdown: Callback for markdown files.
    """
    logger.debug(f"site process: {site_path[:60]:<60} -> {site_name}")
    marker_files = (".collection", ".site", ".done")
    # Compiled once for the whole walk instead of once per file
    image_pattern = re.compile(
        r"\.(jpg|jpeg|png|gif|bmp|tiff|webp)$", re.IGNORECASE
    )
    for root, _, files in os.walk(site_path):
        for file in files:
            # Skip marker files before doing any work on them: they were
            # previously hashed and then thrown away. `file` is already a
            # bare name, as os.walk yields names without directories.
            if file in marker_files:
                continue
            file_path = os.path.join(root, file)
            file_path_rel = os.path.relpath(file_path, site_path)
            mymd5 = file_md5(file_path)
            if file.lower().endswith(".md"):
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                set_markdown(site_name, file_path_rel, mymd5, content)
            elif image_pattern.search(file):
                set_img(site_name, file_path_rel, mymd5)
            else:
                set_file(site_name, file_path_rel, mymd5)
def process(
    path: str,
    set_site: Callable[[str, str], None],
    set_file: Callable[[str, str, str], None],
    set_img: Callable[[str, str, str], None],
    set_markdown: Callable[[str, str, str, str], None],
) -> None:
    """
    Walk over a directory tree, find the sites in it and process each one.

    A directory counts as a site when it contains a ``.site`` or
    ``.collection`` file. For each site, ``set_site(site_name, directory)``
    is called first, then set_file(), set_img() and set_markdown() are
    applied to its files. The walk does not look for further sites below a
    site directory.
    """
    path = os.path.abspath(os.path.expanduser(path))
    logger.info(f"sites process: {path}")
    site_markers = {".site", ".collection"}
    for root, dirs, files in os.walk(path):
        # Not a site directory: keep descending
        if site_markers.isdisjoint(files):
            continue
        site_name = name_fix(os.path.basename(root))
        set_site(site_name, root)
        _site_process_action(
            site_name, root, set_file, set_img, set_markdown
        )
        # Prevent the os.walk from going deeper into subdirectories
        dirs.clear()
if __name__ == "__main__":
    # Demo run over a local checkout, using the logging-only example callbacks
    mypath = "~/code/git.threefold.info/projectmycelium/info_projectmycelium/collections"
    # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
    process(
        path=mypath,
        set_site=_example_set_site,
        set_file=_example_set_file,
        set_img=_example_set_img,
        set_markdown=_example_set_markdown,
    )