muutils.web.bundle_html
Inline / bundle external assets (CSS, JS, SVG, PNG) into an HTML document.
Default mode uses zero external dependencies and a few well-targeted
regular expressions. If you install beautifulsoup4 you can enable the
far more robust BS4 mode by passing InlineConfig(use_bs4=True).
"""
Inline / bundle external assets (CSS, JS, SVG, PNG) into an HTML document.

Default mode uses **zero external dependencies** and a few well-targeted
regular expressions. If you install *beautifulsoup4* you can enable the
far more robust BS4 mode by passing `InlineConfig(use_bs4=True)`.
"""

from __future__ import annotations

import base64
import mimetypes
import re
import urllib.request
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from typing import Final, Literal

# bs4 import deferred to avoid an unconditional dependency.

# constants
# ---------------------------------------------------------------------

AssetExt = Literal[".css", ".js", ".svg", ".png"]

DEFAULT_ALLOWED_EXTENSIONS: Final[set[AssetExt]] = {".css", ".js", ".svg", ".png"}

DEFAULT_TAG_ATTR: Final[dict[str, str]] = {
    "link": "href",  # <link rel="stylesheet" href="...">
    "script": "src",  # <script src="..."></script>
    "img": "src",  # <img src="...">
    "use": "xlink:href",  # <use xlink:href="sprite.svg#id">
}

MIME_BY_EXT: Final[dict[AssetExt, str]] = {
    ".css": "text/css",
    ".js": "application/javascript",
    ".svg": "image/svg+xml",
    ".png": "image/png",
}

# Configuration
# ---------------------------------------------------------------------


@dataclass
class InlineConfig:
    """High-level configuration for the inliner.

    # Parameters
     - `allowed_extensions : set[AssetExt]`
        Extensions that may be inlined.
     - `tag_attr : dict[str, str]`
        Mapping *tag -> attribute* that holds the asset reference.
     - `max_bytes : int`
        Assets larger than this are ignored.
     - `local : bool`
        Allow local filesystem assets.
     - `remote : bool`
        Allow remote http/https assets.
     - `include_filename_comments : bool`
        Surround every replacement with `<!-- begin '...' -->`
        and `<!-- end '...' -->`.
     - `use_bs4 : bool`
        Parse the document with BeautifulSoup if available.
    """

    allowed_extensions: set[AssetExt] = field(
        default_factory=lambda: set(DEFAULT_ALLOWED_EXTENSIONS)
    )
    tag_attr: dict[str, str] = field(default_factory=lambda: dict(DEFAULT_TAG_ATTR))
    max_bytes: int = 128 * 1024
    local: bool = True
    remote: bool = False
    include_filename_comments: bool = True
    use_bs4: bool = False


# Low-level helpers
# ---------------------------------------------------------------------


def _is_remote(url: str) -> bool:
    """Return *True* if *url* starts with http:// or https://."""
    return url.lower().startswith(("http://", "https://"))


def _fetch_bytes(src: str, base: Path) -> bytes:
    """Fetch *src* (local or remote) and return its raw bytes.

    Remote sources are downloaded with `urllib`; anything else is read from
    the filesystem as `base / src`.
    """
    if _is_remote(src):
        with urllib.request.urlopen(src) as resp:
            return resp.read()
    return (base / src).read_bytes()


def _decode_text(buf: bytes) -> str:
    """Decode *buf* as UTF-8, substituting U+FFFD for undecodable bytes."""
    # "replace" yields exactly the strict result whenever the input is valid
    # UTF-8, so a separate strict attempt is unnecessary.
    return buf.decode("utf-8", "replace")


def _mime_for_ext(ext: str) -> str:
    """Return the MIME type used in data URIs for *ext* (e.g. `".png"`).

    Extensions outside `MIME_BY_EXT` can be enabled through
    `InlineConfig.allowed_extensions`; they are resolved with `mimetypes`
    and fall back to `application/octet-stream` instead of raising `KeyError`.
    """
    known: str | None = MIME_BY_EXT.get(ext)  # type: ignore[call-overload]
    if known is not None:
        return known
    guessed, _encoding = mimetypes.guess_type(f"asset{ext}")
    return guessed or "application/octet-stream"


def _load_asset(
    raw_src: str, base: Path, cfg: InlineConfig
) -> tuple[str, str, bytes] | None:
    """Apply the config filters to *raw_src* and fetch it.

    # Parameters
     - `raw_src : str`
        Attribute value as written in the HTML (may carry `?query`/`#fragment`).
     - `base : Path`
        Directory used to resolve local paths.
     - `cfg : InlineConfig`
        Active configuration.

    # Returns
     - `tuple[str, str, bytes] | None`
        `(clean_src, ext, data)` when the asset should be inlined, otherwise
        `None` (extension not allowed, local/remote disabled, fetch failure,
        or larger than `cfg.max_bytes`).
    """
    # only the path part identifies the file; drop ?query and #fragment
    clean_src: str = re.split(r"[?#]", raw_src, maxsplit=1)[0]
    ext: str = Path(clean_src).suffix.lower()

    if ext not in cfg.allowed_extensions:
        return None
    if _is_remote(clean_src):
        if not cfg.remote:
            return None
    elif not cfg.local:
        return None

    try:
        data: bytes = _fetch_bytes(clean_src, base)
    except Exception as err:
        # best effort: a missing or unreachable asset is left untouched
        warnings.warn(f"skip '{raw_src}': {err}")
        return None

    if len(data) > cfg.max_bytes:
        return None
    return clean_src, ext, data


# Regex-based implementation (no deps)
# ---------------------------------------------------------------------


def _apply_indent(html: str, start: int, replacement: str) -> str:
    """Indent *replacement* to match the line that contains offset *start*.

    The first line is returned unchanged because it is spliced in at *start*,
    directly after the text that already precedes it on that line. Every
    following non-empty line receives the leading whitespace of that line.
    Only whitespace is used as indentation, so markup that precedes the tag
    on the same line is never duplicated into the output.
    """
    line_start: int = html.rfind("\n", 0, start) + 1
    prefix: str = html[line_start:start]
    indent: str = prefix[: len(prefix) - len(prefix.lstrip())]
    lines: list[str] = replacement.splitlines()
    if not lines:
        return replacement
    return "\n".join(
        [lines[0]] + [indent + line if line else line for line in lines[1:]]
    )


def _inline_with_regex(html: str, base: Path, cfg: InlineConfig) -> str:
    """Inline assets using pure-regex parsing (no third-party libs)."""
    tag: str
    attr: str
    for tag, attr in cfg.tag_attr.items():
        # tag/attr may come from user input (CLI), so never treat them as regex
        tag_re: str = re.escape(tag)
        attr_re: str = re.escape(attr)
        pattern: str = (
            rf"<{tag_re}\b[^>]*\s{attr_re}\s*=\s*['\"]([^'\"]+)['\"][^>]*>"
        )
        if tag == "script":
            # only replace external scripts with an empty body
            pattern += r"\s*</script>"

        matches: list[re.Match[str]] = list(re.finditer(pattern, html, re.IGNORECASE))
        m: re.Match[str]
        # walk backwards so earlier match offsets stay valid while we splice
        for m in reversed(matches):
            raw_src: str = m.group(1)  # may contain #fragment
            asset = _load_asset(raw_src, base, cfg)
            if asset is None:
                continue
            clean_src, ext, data = asset

            # build replacement
            replacement: str
            if ext in {".css", ".js"}:
                tag_name: str = "style" if ext == ".css" else "script"
                replacement = f"<{tag_name}>\n{_decode_text(data)}\n</{tag_name}>"
            else:  # binary / image-like assets become data URIs
                b64: str = base64.b64encode(data).decode()
                data_uri: str = f"data:{_mime_for_ext(ext)};base64,{b64}"
                # splice at the captured attribute value rather than using
                # str.replace, which could hit an earlier attribute (e.g. alt)
                # that happens to contain the same text
                whole: str = m.group(0)
                rel_start: int = m.start(1) - m.start()
                rel_end: int = m.end(1) - m.start()
                replacement = whole[:rel_start] + data_uri + whole[rel_end:]

            if cfg.include_filename_comments:
                replacement = f"<!-- begin '{clean_src}' -->\n{replacement}\n<!-- end '{clean_src}' -->"

            replacement = _apply_indent(html, m.start(), replacement)
            html = html[: m.start()] + replacement + html[m.end() :]

    return html


# BeautifulSoup-based implementation (optional)
# ---------------------------------------------------------------------


def _inline_with_bs4(html: str, base: Path, cfg: InlineConfig) -> str:
    """Inline assets using BeautifulSoup when available.

    # Raises
     - `RuntimeError` : if `bs4` cannot be imported.
    """
    try:
        from bs4 import BeautifulSoup, Comment, Tag
    except ModuleNotFoundError as exc:  # pragma: no cover
        raise RuntimeError("BeautifulSoup requested but not installed") from exc

    soup: BeautifulSoup = BeautifulSoup(html, "html.parser")

    tag: Tag  # TYPING: i think soup.find_all() returns a list of Tag objects? mypy thinks it should be PageElement (of which Tag is a subclass)
    # snapshot the result list: the tree is modified while we iterate
    for tag in list(soup.find_all(list(cfg.tag_attr))):  # type: ignore[assignment]
        attr: str = cfg.tag_attr[tag.name]
        # TYPING: error: Incompatible types in assignment (expression has type "str | AttributeValueList | None", variable has type "str | None")  [assignment]
        src_full: str | None = tag.get(attr)  # type: ignore[assignment]
        if not src_full:
            continue

        asset = _load_asset(src_full, base, cfg)
        if asset is None:
            continue
        _clean_src, ext, data = asset

        if cfg.include_filename_comments:
            tag.insert_before(Comment(f" begin '{src_full}' "))
            tag.insert_after(Comment(f" end '{src_full}' "))

        if ext in {".css", ".js"}:
            new_tag: Tag = soup.new_tag("style" if ext == ".css" else "script")
            new_tag.string = _decode_text(data)
            tag.replace_with(new_tag)
        else:  # binary / image-like assets become data URIs
            b64: str = base64.b64encode(data).decode()
            tag[attr] = f"data:{_mime_for_ext(ext)};base64,{b64}"

    return str(soup)


# Public API
# ---------------------------------------------------------------------


def inline_html_assets(
    html: str,
    *,
    base_path: Path,
    config: InlineConfig | None = None,
    prettify: bool = False,  # kept for API compatibility (ignored in regex mode)
) -> str:
    """Inline permitted external assets inside *html*.

    # Parameters
     - `html : str`
        Raw HTML text.
     - `base_path : Path`
        Directory used to resolve relative asset paths.
     - `config : InlineConfig | None`
        Inlining options (see `InlineConfig`).
     - `prettify : bool`
        Pretty-print output (only effective in BS4 mode).

    # Returns
     - `str`
        Modified HTML.
    """
    cfg: InlineConfig = config or InlineConfig()
    if cfg.use_bs4:
        html_out: str = _inline_with_bs4(html, base_path, cfg)
        if prettify:
            # lazy import to avoid unconditional dependency
            from bs4 import BeautifulSoup

            # TYPING: .prettify() returns str if no encoding is set
            html_out = str(BeautifulSoup(html_out, "html.parser").prettify())
    else:
        html_out = _inline_with_regex(html, base_path, cfg)
    return html_out


def inline_html_file(
    html_path: Path,
    output_path: Path,
    base_path: Path | None = None,
    config: InlineConfig | None = None,
    prettify: bool = False,
    *,
    encoding: str = "utf-8",
) -> Path:
    """Read *html_path*, inline its assets, and write the result.

    # Parameters
     - `html_path : Path`
        Source HTML file.
     - `output_path : Path`
        Destination path to write the modified HTML.
     - `base_path : Path | None`
        Directory used to resolve relative asset paths.
        (default: `None` -> use `html_path.parent`)
     - `config : InlineConfig | None`
        Inlining options.
        (default: `None` -> use `InlineConfig()`)
     - `prettify : bool`
        Pretty-print when `use_bs4=True`.
        (default: `False`)
     - `encoding : str`
        Text encoding used to read *html_path* and write *output_path*.
        Text assets are always decoded as UTF-8, so the file I/O defaults to
        UTF-8 as well instead of the platform locale encoding.
        (default: `"utf-8"`)

    # Returns
     - `Path`
        Path actually written.
    """
    if base_path is None:
        base_path = html_path.parent
    html_raw: str = html_path.read_text(encoding=encoding)
    html_new: str = inline_html_assets(
        html_raw,
        base_path=base_path,
        config=config,
        prettify=prettify,
    )
    # a falsy output_path (e.g. None) means "overwrite the input file"
    dest: Path = output_path or html_path
    dest.write_text(html_new, encoding=encoding)
    return dest


# CLI
# ---------------------------------------------------------------------


def _main(argv: list[str] | None = None) -> None:
    """Command-line entry point; *argv* defaults to `sys.argv[1:]`."""
    import argparse

    parser: argparse.ArgumentParser = argparse.ArgumentParser(
        description="Inline / bundle CSS, JS, SVG, PNG assets. "
        "Uses regex parsing by default; pass --bs4 to require BeautifulSoup."
    )
    parser.add_argument("html", type=Path, help="input HTML file")
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="output file",
        required=True,
    )
    parser.add_argument(
        "--source-dir",
        type=Path,
        default=None,
        help="base directory for relative asset paths (defaults to the HTML file's directory)",
    )
    parser.add_argument("--remote", action="store_true", help="allow remote URLs")
    parser.add_argument("--bs4", action="store_true", help="use BeautifulSoup parser")
    parser.add_argument(
        "--prettify",
        action="store_true",
        help="pretty-print with BeautifulSoup (requires --bs4)",
    )
    parser.add_argument(
        "--max-bytes", type=int, default=128 * 1024, help="size limit per asset"
    )
    parser.add_argument(
        "--ext",
        nargs="+",
        default=list(DEFAULT_ALLOWED_EXTENSIONS),
        help="extensions to inline",
    )
    parser.add_argument(
        "--tag-attr",
        type=str,
        default=None,
        help='override tag->attr map. format: "tag1=attr1,tag2=attr2"',
    )
    parser.add_argument("--no-comments", dest="comments", action="store_false")
    args: argparse.Namespace = parser.parse_args(argv)

    tag_attr: dict[str, str]
    if args.tag_attr:
        tag_attr = {}
        for item in args.tag_attr.split(","):
            # partition never raises, so malformed items produce a clean
            # usage error instead of a tuple-unpacking traceback
            tag_name, sep, attr_name = item.partition("=")
            tag_name, attr_name = tag_name.strip(), attr_name.strip()
            if not sep or not tag_name or not attr_name:
                parser.error(f"invalid --tag-attr item {item!r}; expected tag=attr")
            tag_attr[tag_name] = attr_name
    else:
        tag_attr = dict(DEFAULT_TAG_ATTR)

    cfg: InlineConfig = InlineConfig(
        allowed_extensions=set(args.ext),  # type: ignore[arg-type]
        tag_attr=tag_attr,
        max_bytes=args.max_bytes,
        remote=args.remote,
        include_filename_comments=args.comments,
        use_bs4=args.bs4,
    )

    inline_html_file(
        args.html,
        output_path=args.output,
        base_path=args.source_dir,
        config=cfg,
        prettify=args.prettify,
    )


if __name__ == "__main__":
    _main()
AssetExt =
typing.Literal['.css', '.js', '.svg', '.png']
DEFAULT_ALLOWED_EXTENSIONS: Final[set[Literal['.css', '.js', '.svg', '.png']]] =
{'.svg', '.css', '.js', '.png'}
DEFAULT_TAG_ATTR: Final[dict[str, str]] =
{'link': 'href', 'script': 'src', 'img': 'src', 'use': 'xlink:href'}
MIME_BY_EXT: Final[dict[Literal['.css', '.js', '.svg', '.png'], str]] =
{'.css': 'text/css', '.js': 'application/javascript', '.svg': 'image/svg+xml', '.png': 'image/png'}
@dataclass
class
InlineConfig:
@dataclass
class InlineConfig:
    """Options that control which assets are inlined and how.

    # Parameters
     - `allowed_extensions : set[AssetExt]`
        File extensions eligible for inlining.
        (default: a fresh copy of `DEFAULT_ALLOWED_EXTENSIONS`)
     - `tag_attr : dict[str, str]`
        Maps each *tag* name to the *attribute* carrying the asset reference.
        (default: a fresh copy of `DEFAULT_TAG_ATTR`)
     - `max_bytes : int`
        Assets larger than this are ignored.
        (default: `131072`, i.e. 128 KiB)
     - `local : bool`
        Allow local filesystem assets.
        (default: `True`)
     - `remote : bool`
        Allow remote http/https assets.
        (default: `False`)
     - `include_filename_comments : bool`
        Surround every replacement with `<!-- begin '...' -->`
        and `<!-- end '...' -->`.
        (default: `True`)
     - `use_bs4 : bool`
        Parse the document with BeautifulSoup if available.
        (default: `False`)
    """

    # each instance gets its own copy so the module-level defaults are never mutated
    allowed_extensions: set[AssetExt] = field(
        default_factory=lambda: DEFAULT_ALLOWED_EXTENSIONS.copy()
    )
    tag_attr: dict[str, str] = field(
        default_factory=lambda: DEFAULT_TAG_ATTR.copy()
    )
    max_bytes: int = 131072  # 128 KiB
    local: bool = True
    remote: bool = False
    include_filename_comments: bool = True
    use_bs4: bool = False
High-level configuration for the inliner.
Parameters
- allowed_extensions : set[AssetExt] — Extensions that may be inlined.
- tag_attr : dict[str, str] — Mapping tag -> attribute that holds the asset reference.
- max_bytes : int — Assets larger than this are ignored.
- local : bool — Allow local filesystem assets.
- remote : bool — Allow remote http/https assets.
- include_filename_comments : bool — Surround every replacement with <!-- begin '...' --> and <!-- end '...' -->.
- use_bs4 : bool — Parse the document with BeautifulSoup if available.
def
inline_html_assets( html: str, *, base_path: pathlib._local.Path, config: InlineConfig | None = None, prettify: bool = False) -> str:
def inline_html_assets(
    html: str,
    *,
    base_path: Path,
    config: InlineConfig | None = None,
    prettify: bool = False,  # kept for API compatibility (ignored in regex mode)
) -> str:
    """Return *html* with its permitted external assets inlined.

    # Parameters
     - `html : str`
        Raw HTML text.
     - `base_path : Path`
        Directory against which relative asset paths are resolved.
     - `config : InlineConfig | None`
        Inlining options; a default `InlineConfig()` is used when omitted.
     - `prettify : bool`
        Pretty-print the output. Only takes effect when `config.use_bs4`
        is set; the regex path ignores it.

    # Returns
     - `str`
        Modified HTML.
    """
    options: InlineConfig = config if config else InlineConfig()

    # dependency-free path: prettify has no effect here
    if not options.use_bs4:
        return _inline_with_regex(html, base_path, options)

    inlined: str = _inline_with_bs4(html, base_path, options)
    if not prettify:
        return inlined

    # lazy import to avoid unconditional dependency
    from bs4 import BeautifulSoup

    # TYPING: .prettify() returns str if no encoding is set
    reparsed = BeautifulSoup(inlined, "html.parser")
    return str(reparsed.prettify())
Inline permitted external assets inside html.
Parameters
- html : str — Raw HTML text.
- base_path : Path — Directory used to resolve relative asset paths.
- config : InlineConfig | None — Inlining options (see InlineConfig).
- prettify : bool — Pretty-print output (only effective in BS4 mode).
Returns
- str — Modified HTML.
def
inline_html_file( html_path: pathlib._local.Path, output_path: pathlib._local.Path, base_path: pathlib._local.Path | None = None, config: InlineConfig | None = None, prettify: bool = False) -> pathlib._local.Path:
def inline_html_file(
    html_path: Path,
    output_path: Path,
    base_path: Path | None = None,
    config: InlineConfig | None = None,
    prettify: bool = False,
    *,
    encoding: str = "utf-8",
) -> Path:
    """Read *html_path*, inline its assets, and write the result.

    # Parameters
     - `html_path : Path`
        Source HTML file.
     - `output_path : Path`
        Destination path to write the modified HTML.
     - `base_path : Path | None`
        Directory used to resolve relative asset paths.
        (default: `None` -> use `html_path.parent`)
     - `config : InlineConfig | None`
        Inlining options.
        (default: `None` -> use `InlineConfig()`)
     - `prettify : bool`
        Pretty-print when `use_bs4=True`.
        (default: `False`)
     - `encoding : str`
        Text encoding used to read *html_path* and write *output_path*.
        Passed explicitly so the result does not depend on the platform
        locale encoding.
        (default: `"utf-8"`)

    # Returns
     - `Path`
        Path actually written.
    """
    if base_path is None:
        base_path = html_path.parent
    html_raw: str = html_path.read_text(encoding=encoding)
    html_new: str = inline_html_assets(
        html_raw,
        base_path=base_path,
        config=config,
        prettify=prettify,
    )
    # a falsy output_path (e.g. None) means "overwrite the input file"
    dest: Path = output_path or html_path
    dest.write_text(html_new, encoding=encoding)
    return dest
Read html_path, inline its assets, and write the result.
Parameters
- html_path : Path — Source HTML file.
- output_path : Path — Destination path to write the modified HTML.
- base_path : Path | None — Directory used to resolve relative asset paths (defaults to the HTML file's directory). If None, uses the directory of html_path. (default: None -> use html_path.parent)
- config : InlineConfig | None — Inlining options. If None, uses default configuration. (default: None -> use InlineConfig())
- prettify : bool — Pretty-print when use_bs4=True. (default: False)
Returns
- Path — Path actually written.