docs for muutils v0.8.12
View Source on GitHub

muutils.web.bundle_html

Inline / bundle external assets (CSS, JS, SVG, PNG) into an HTML document.

Default mode uses zero external dependencies and a few well-targeted regular expressions. If you install beautifulsoup4 you can enable the far more robust BS4 mode by passing InlineConfig(use_bs4=True).


  1"""
  2Inline / bundle external assets (CSS, JS, SVG, PNG) into an HTML document.
  3
  4Default mode uses **zero external dependencies** and a few well-targeted
  5regular expressions.  If you install *beautifulsoup4* you can enable the
  6far more robust BS4 mode by passing `InlineConfig(use_bs4=True)`.
  7"""
  8
  9from __future__ import annotations
 10
 11import base64
 12import re
 13import urllib.request
 14import warnings
 15from dataclasses import dataclass, field
 16from pathlib import Path
 17from typing import Final, Literal
 18
 19# bs4 import deferred to avoid an unconditional dependency.
 20
 21# constants
 22# ---------------------------------------------------------------------
 23
 24AssetExt = Literal[".css", ".js", ".svg", ".png"]
 25
 26DEFAULT_ALLOWED_EXTENSIONS: Final[set[AssetExt]] = {".css", ".js", ".svg", ".png"}
 27
 28DEFAULT_TAG_ATTR: Final[dict[str, str]] = {
 29    "link": "href",  # <link rel="stylesheet" href="...">
 30    "script": "src",  # <script src="..."></script>
 31    "img": "src",  # <img src="...">
 32    "use": "xlink:href",  # <use xlink:href="sprite.svg#id">
 33}
 34
 35MIME_BY_EXT: Final[dict[AssetExt, str]] = {
 36    ".css": "text/css",
 37    ".js": "application/javascript",
 38    ".svg": "image/svg+xml",
 39    ".png": "image/png",
 40}
 41
 42# Configuration
 43# ---------------------------------------------------------------------
 44
 45
 46@dataclass
 47class InlineConfig:
 48    """High-level configuration for the inliner.
 49
 50    # Parameters
 51    - `allowed_extensions : set[AssetExt]`
 52        Extensions that may be inlined.
 53    - `tag_attr : dict[str, str]`
 54        Mapping *tag -> attribute* that holds the asset reference.
 55    - `max_bytes : int`
 56        Assets larger than this are ignored.
 57    - `local : bool`
 58        Allow local filesystem assets.
 59    - `remote : bool`
 60        Allow remote http/https assets.
 61    - `include_filename_comments : bool`
 62        Surround every replacement with `<!-- begin '...' -->`
 63        and `<!-- end '...' -->`.
 64    - `use_bs4 : bool`
 65        Parse the document with BeautifulSoup if available.
 66    """
 67
 68    allowed_extensions: set[AssetExt] = field(
 69        default_factory=lambda: set(DEFAULT_ALLOWED_EXTENSIONS)
 70    )
 71    tag_attr: dict[str, str] = field(default_factory=lambda: dict(DEFAULT_TAG_ATTR))
 72    max_bytes: int = 128 * 1024
 73    local: bool = True
 74    remote: bool = False
 75    include_filename_comments: bool = True
 76    use_bs4: bool = False
 77
 78
 79# Low-level helpers
 80# ---------------------------------------------------------------------
 81
 82
 83def _is_remote(url: str) -> bool:
 84    """Return *True* if *url* starts with http:// or https://."""
 85    return url.lower().startswith(("http://", "https://"))
 86
 87
 88def _fetch_bytes(src: str, base: Path) -> bytes:
 89    """Fetch *src* (local or remote) and return its raw bytes."""
 90    if _is_remote(src):
 91        with urllib.request.urlopen(src) as resp:
 92            return resp.read()
 93    return (base / src).read_bytes()
 94
 95
 96def _decode_text(buf: bytes) -> str:
 97    """Decode *buf* as UTF-8, falling back to replacement."""
 98    try:
 99        return buf.decode()
100    except UnicodeDecodeError:
101        return buf.decode("utf-8", "replace")
102
103
104# Regex-based implementation (no deps)
105# ---------------------------------------------------------------------
106
107
108def _apply_indent(html: str, start: int, replacement: str) -> str:
109    """Indent *replacement* to match the line that starts at *start*."""
110    line_start: int = html.rfind("\n", 0, start) + 1
111    indent: str = html[line_start:start]
112    return "\n".join(indent + line for line in replacement.splitlines())
113
114
def _inline_with_regex(html: str, base: Path, cfg: InlineConfig) -> str:
    """Inline assets using pure-regex parsing (no third-party libs).

    For every `tag -> attribute` pair in `cfg.tag_attr`, each matching tag
    whose reference passes the extension / local / remote / size filters is
    replaced:

    - `.css` and `.js` assets become a `<style>` / `<script>` element holding
      the decoded file content;
    - every other allowed asset keeps its tag and gets the reference swapped
      for a base64 `data:` URI.

    # Parameters
    - `html : str`
        Raw HTML text.
    - `base : Path`
        Directory used to resolve local asset references.
    - `cfg : InlineConfig`
        Inlining options.

    # Returns
    - `str`
        The HTML with all eligible assets inlined. Assets that cannot be
        fetched, or that have no known MIME type, are skipped with a warning.
    """
    tag: str
    attr: str
    for tag, attr in cfg.tag_attr.items():
        # tag and attribute names come from user configuration, so they are
        # escaped to be matched literally inside the pattern
        tag_re: str = re.escape(tag)
        attr_re: str = re.escape(attr)
        pattern: str = (
            rf"<{tag_re}\b[^>]*\s{attr_re}\s*=\s*['\"]([^'\"]+)['\"][^>]*>"
        )
        if tag.lower() == "script":
            # an external script is only replaced together with its (empty)
            # closing tag, so no dangling `</script>` is left behind
            pattern += r"\s*</script>"

        matches: list[re.Match[str]] = list(re.finditer(pattern, html, re.IGNORECASE))
        m: re.Match[str]
        # work back-to-front so the offsets of earlier matches stay valid
        # while `html` is being rewritten
        for m in reversed(matches):
            raw_src: str = m.group(1)  # may contain ?query or #fragment
            clean_src: str = re.split(r"[?#]", raw_src, maxsplit=1)[0]  # file path only
            ext: str = Path(clean_src).suffix.lower()

            if ext not in cfg.allowed_extensions:
                continue

            is_text: bool = ext in {".css", ".js"}
            if not is_text and ext not in MIME_BY_EXT:
                # `allowed_extensions` is user-configurable and may name an
                # extension without a MIME entry; skip it instead of
                # crashing with a KeyError when building the data URI
                warnings.warn(f"skip '{raw_src}': no MIME type known for '{ext}'")
                continue

            remote: bool = _is_remote(clean_src)
            if remote and not cfg.remote:
                continue
            if not remote and not cfg.local:
                continue

            try:
                data: bytes = _fetch_bytes(clean_src, base)
            except Exception as err:
                # best effort: an unreadable asset leaves its tag untouched
                warnings.warn(f"skip '{raw_src}': {err}")
                continue

            if len(data) > cfg.max_bytes:
                continue

            # build replacement
            replacement: str
            if is_text:
                tag_name: str = "style" if ext == ".css" else "script"
                replacement = f"<{tag_name}>\n{_decode_text(data)}\n</{tag_name}>"
            else:  # binary / image asset -> data URI
                b64: str = base64.b64encode(data).decode()
                # TYPING: checked above, ext is for sure in MIME_BY_EXT
                data_uri: str = f"data:{MIME_BY_EXT[ext]};base64,{b64}"  # type: ignore[index]
                # splice by the captured group's own offsets: a plain
                # str.replace could hit the same text in an earlier attribute
                # (e.g. alt="a.png" src="a.png")
                whole: str = m.group(0)
                rel_start: int = m.start(1) - m.start()
                rel_end: int = m.end(1) - m.start()
                replacement = whole[:rel_start] + data_uri + whole[rel_end:]

            if cfg.include_filename_comments:
                replacement = f"<!-- begin '{clean_src}' -->\n{replacement}\n<!-- end '{clean_src}' -->"

            replacement = _apply_indent(html, m.start(), replacement)
            html = html[: m.start()] + replacement + html[m.end() :]

    return html
171
172
173# BeautifulSoup-based implementation (optional)
174# ---------------------------------------------------------------------
175
176
def _inline_with_bs4(html: str, base: Path, cfg: InlineConfig) -> str:
    """Inline assets using BeautifulSoup.

    Every tag named in `cfg.tag_attr` whose reference passes the extension /
    local / remote / size filters is processed: `.css` / `.js` assets replace
    the tag with a `<style>` / `<script>` element holding the decoded content,
    any other allowed asset has its attribute rewritten to a base64 `data:`
    URI.

    # Parameters
    - `html : str`
        Raw HTML text.
    - `base : Path`
        Directory used to resolve local asset references.
    - `cfg : InlineConfig`
        Inlining options.

    # Returns
    - `str`
        The document serialized by BeautifulSoup after inlining.

    # Raises
    - `RuntimeError`
        If `bs4` cannot be imported.
    """
    try:
        from bs4 import BeautifulSoup, Comment, Tag
    except ModuleNotFoundError as exc:  # pragma: no cover
        raise RuntimeError("BeautifulSoup requested but not installed") from exc

    soup: BeautifulSoup = BeautifulSoup(html, "html.parser")

    tag: Tag  # TYPING: i think soup.find_all() returns a list of Tag objects? mypy thinks it should be PageElement (of which Tag is a subclass)
    # snapshot the result list first: the tree is modified while iterating
    for tag in list(soup.find_all(list(cfg.tag_attr))):  # type: ignore[assignment]
        attr: str = cfg.tag_attr[tag.name]
        # TYPING: error: Incompatible types in assignment (expression has type "str | AttributeValueList | None", variable has type "str | None")  [assignment]
        src_full: str | None = tag.get(attr)  # type: ignore[assignment]
        if not src_full:
            continue

        # drop ?query / #fragment to get the bare file path
        clean_src: str = re.split(r"[?#]", src_full, maxsplit=1)[0]
        ext: str = Path(clean_src).suffix.lower()

        if ext not in cfg.allowed_extensions:
            continue

        is_text: bool = ext in {".css", ".js"}
        if not is_text and ext not in MIME_BY_EXT:
            # `allowed_extensions` is user-configurable and may name an
            # extension without a MIME entry; skip it instead of crashing
            # with a KeyError when building the data URI
            warnings.warn(f"skip '{src_full}': no MIME type known for '{ext}'")
            continue

        remote: bool = _is_remote(clean_src)
        if remote and not cfg.remote:
            continue
        if not remote and not cfg.local:
            continue

        try:
            data: bytes = _fetch_bytes(clean_src, base)
        except Exception as err:
            # best effort: an unreadable asset leaves its tag untouched
            warnings.warn(f"skip '{src_full}': {err}")
            continue

        if len(data) > cfg.max_bytes:
            continue

        # the marker comments go around the original tag; they stay in place
        # whether the tag is then replaced or merely has its attribute changed
        if cfg.include_filename_comments:
            tag.insert_before(Comment(f" begin '{src_full}' "))
            tag.insert_after(Comment(f" end '{src_full}' "))

        if is_text:
            new_tag: Tag = soup.new_tag("style" if ext == ".css" else "script")
            new_tag.string = _decode_text(data)
            tag.replace_with(new_tag)
        else:  # binary / image asset -> data URI
            b64: str = base64.b64encode(data).decode()
            # checked above: ext is in MIME_BY_EXT, so ignore type error
            tag[attr] = f"data:{MIME_BY_EXT[ext]};base64,{b64}"  # type: ignore[index]

    return str(soup)
229
230
231# Public API
232# ---------------------------------------------------------------------
233
234
def inline_html_assets(
    html: str,
    *,
    base_path: Path,
    config: InlineConfig | None = None,
    prettify: bool = False,  # kept for API compatibility (ignored in regex mode)
) -> str:
    """Return *html* with its permitted external assets embedded.

    # Parameters
    - `html : str`
        HTML source text.
    - `base_path : Path`
        Directory against which relative asset paths are resolved.
    - `config : InlineConfig | None`
        Inlining options; a default `InlineConfig()` is used when omitted.
    - `prettify : bool`
        Re-format the result with BeautifulSoup's `prettify()`.
        Has an effect only when `config.use_bs4` is set.

    # Returns
    - `str`
        The rewritten HTML.
    """
    cfg: InlineConfig = config if config else InlineConfig()

    # regex mode: nothing else to do, `prettify` is ignored here
    if not cfg.use_bs4:
        return _inline_with_regex(html, base_path, cfg)

    inlined: str = _inline_with_bs4(html, base_path, cfg)
    if not prettify:
        return inlined

    # lazy import to avoid unconditional dependency
    from bs4 import BeautifulSoup

    # TYPING: .prettify() returns str if no encoding is set
    return str(BeautifulSoup(inlined, "html.parser").prettify())
270
271
def inline_html_file(
    html_path: Path,
    output_path: Path,
    base_path: Path | None = None,
    config: InlineConfig | None = None,
    prettify: bool = False,
) -> Path:
    """Read *html_path*, inline its assets, and write the result.

    Both the input and the output file are handled as UTF-8, independent of
    the platform's locale encoding.

    # Parameters
    - `html_path : Path`
        Source HTML file.
    - `output_path : Path`
        Destination path to write the modified HTML.
        If a falsy value is passed, *html_path* is overwritten instead.
    - `base_path : Path | None`
        Directory used to resolve relative asset paths.
        (default: `None` -> use `html_path.parent`)
    - `config : InlineConfig | None`
        Inlining options.
        (default: `None` -> use `InlineConfig()`)
    - `prettify : bool`
        Pretty-print when `use_bs4=True`.
        (default: `False`)

    # Returns
    - `Path`
        Path actually written.
    """
    if base_path is None:
        base_path = html_path.parent
    # explicit encoding: the locale default (e.g. cp1252 on Windows) would
    # garble UTF-8 documents on read and can fail on write once UTF-8
    # decoded assets have been inlined
    html_raw: str = html_path.read_text(encoding="utf-8")
    html_new: str = inline_html_assets(
        html_raw,
        base_path=base_path,
        config=config,
        prettify=prettify,
    )
    dest: Path = output_path or html_path
    dest.write_text(html_new, encoding="utf-8")
    return dest
314
315
316# CLI
317# ---------------------------------------------------------------------
318
# Command-line entry point: parse the options, build an `InlineConfig` from
# them and run `inline_html_file` on the given input.
if __name__ == "__main__":
    import argparse

    parser: argparse.ArgumentParser = argparse.ArgumentParser(
        description="Inline / bundle CSS, JS, SVG, PNG assets. "
        "Uses regex parsing by default; pass --bs4 to require BeautifulSoup."
    )
    parser.add_argument("html", type=Path, help="input HTML file")
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="output file",
        required=True,
    )
    parser.add_argument(
        "--source-dir",
        type=Path,
        default=None,
        help="base directory for relative asset paths (defaults to the HTML file's directory)",
    )
    parser.add_argument("--remote", action="store_true", help="allow remote URLs")
    parser.add_argument("--bs4", action="store_true", help="use BeautifulSoup parser")
    parser.add_argument(
        "--prettify", action="store_true", help="pretty-print with BeautifulSoup"
    )
    parser.add_argument(
        "--max-bytes", type=int, default=128 * 1024, help="size limit per asset"
    )
    parser.add_argument(
        "--ext",
        nargs="+",
        default=list(DEFAULT_ALLOWED_EXTENSIONS),
        help="extensions to inline",
    )
    parser.add_argument(
        "--tag-attr",
        type=str,
        default=None,
        help='override tag->attr map. format: "tag1=attr1,tag2=attr2"',
    )
    parser.add_argument("--no-comments", dest="comments", action="store_false")
    args: argparse.Namespace = parser.parse_args()

    tag_attr: dict[str, str]
    if args.tag_attr:
        tag_attr = {}
        for item in args.tag_attr.split(","):
            tag_name, sep, attr_name = item.partition("=")
            tag_name, attr_name = tag_name.strip(), attr_name.strip()
            # report malformed entries as a usage error rather than letting a
            # bare tuple-unpacking ValueError escape
            if not sep or not tag_name or not attr_name or "=" in attr_name:
                parser.error(f"invalid --tag-attr entry {item!r}; expected 'tag=attr'")
            tag_attr[tag_name] = attr_name
    else:
        tag_attr = dict(DEFAULT_TAG_ATTR)

    cfg: InlineConfig = InlineConfig(
        allowed_extensions=set(args.ext),  # type: ignore[arg-type]
        tag_attr=tag_attr,
        max_bytes=args.max_bytes,
        remote=args.remote,
        include_filename_comments=args.comments,
        use_bs4=args.bs4,
    )

    inline_html_file(
        args.html,
        output_path=args.output,
        base_path=args.source_dir,
        config=cfg,
        prettify=args.prettify,
    )

AssetExt = typing.Literal['.css', '.js', '.svg', '.png']
DEFAULT_ALLOWED_EXTENSIONS: Final[set[Literal['.css', '.js', '.svg', '.png']]] = {'.svg', '.css', '.js', '.png'}
DEFAULT_TAG_ATTR: Final[dict[str, str]] = {'link': 'href', 'script': 'src', 'img': 'src', 'use': 'xlink:href'}
MIME_BY_EXT: Final[dict[Literal['.css', '.js', '.svg', '.png'], str]] = {'.css': 'text/css', '.js': 'application/javascript', '.svg': 'image/svg+xml', '.png': 'image/png'}
@dataclass
class InlineConfig:
@dataclass
class InlineConfig:
    """Options that control which assets are inlined and how.

    # Parameters
    - `allowed_extensions : set[AssetExt]`
        Only references whose file extension is in this set are inlined.
        (default: a fresh copy of `DEFAULT_ALLOWED_EXTENSIONS`)
    - `tag_attr : dict[str, str]`
        Maps a tag name to the attribute holding its asset reference.
        (default: a fresh copy of `DEFAULT_TAG_ATTR`)
    - `max_bytes : int`
        Size limit per asset; anything larger is left untouched.
        (default: `131072`, i.e. 128 KiB)
    - `local : bool`
        Permit assets read from the local filesystem.
        (default: `True`)
    - `remote : bool`
        Permit assets fetched over http/https.
        (default: `False`)
    - `include_filename_comments : bool`
        Wrap each replacement in `<!-- begin '...' -->` /
        `<!-- end '...' -->` marker comments.
        (default: `True`)
    - `use_bs4 : bool`
        Use the BeautifulSoup-based implementation instead of the regex one.
        (default: `False`)
    """

    # each instance gets its own copy so mutating one config never leaks
    # into the module-level defaults
    allowed_extensions: set[AssetExt] = field(
        default_factory=lambda: {*DEFAULT_ALLOWED_EXTENSIONS}
    )
    tag_attr: dict[str, str] = field(default_factory=lambda: {**DEFAULT_TAG_ATTR})
    max_bytes: int = 131_072  # 128 KiB
    local: bool = True
    remote: bool = False
    include_filename_comments: bool = True
    use_bs4: bool = False

High-level configuration for the inliner.

Parameters

  • allowed_extensions : set[AssetExt] Extensions that may be inlined.
  • tag_attr : dict[str, str] Mapping tag -> attribute that holds the asset reference.
  • max_bytes : int Assets larger than this are ignored.
  • local : bool Allow local filesystem assets.
  • remote : bool Allow remote http/https assets.
  • include_filename_comments : bool Surround every replacement with <!-- begin '...' --> and <!-- end '...' -->.
  • use_bs4 : bool Parse the document with BeautifulSoup instead of regular expressions (raises RuntimeError if beautifulsoup4 is not installed).
InlineConfig( allowed_extensions: set[typing.Literal['.css', '.js', '.svg', '.png']] = <factory>, tag_attr: dict[str, str] = <factory>, max_bytes: int = 131072, local: bool = True, remote: bool = False, include_filename_comments: bool = True, use_bs4: bool = False)
allowed_extensions: set[typing.Literal['.css', '.js', '.svg', '.png']]
tag_attr: dict[str, str]
max_bytes: int = 131072
local: bool = True
remote: bool = False
include_filename_comments: bool = True
use_bs4: bool = False
def inline_html_assets( html: str, *, base_path: pathlib._local.Path, config: InlineConfig | None = None, prettify: bool = False) -> str:
def inline_html_assets(
    html: str,
    *,
    base_path: Path,
    config: InlineConfig | None = None,
    prettify: bool = False,  # kept for API compatibility (ignored in regex mode)
) -> str:
    """Return *html* with its permitted external assets embedded.

    # Parameters
    - `html : str`
        HTML source text.
    - `base_path : Path`
        Directory against which relative asset paths are resolved.
    - `config : InlineConfig | None`
        Inlining options; a default `InlineConfig()` is used when omitted.
    - `prettify : bool`
        Re-format the result with BeautifulSoup's `prettify()`.
        Has an effect only when `config.use_bs4` is set.

    # Returns
    - `str`
        The rewritten HTML.
    """
    cfg: InlineConfig = config if config else InlineConfig()

    # regex mode: nothing else to do, `prettify` is ignored here
    if not cfg.use_bs4:
        return _inline_with_regex(html, base_path, cfg)

    inlined: str = _inline_with_bs4(html, base_path, cfg)
    if not prettify:
        return inlined

    # lazy import to avoid unconditional dependency
    from bs4 import BeautifulSoup

    # TYPING: .prettify() returns str if no encoding is set
    return str(BeautifulSoup(inlined, "html.parser").prettify())

Inline permitted external assets inside html.

Parameters

  • html : str Raw HTML text.
  • base_path : Path Directory used to resolve relative asset paths.
  • config : InlineConfig | None Inlining options (see InlineConfig).
  • prettify : bool Pretty-print output (only effective in BS4 mode).

Returns

  • str Modified HTML.
def inline_html_file( html_path: pathlib._local.Path, output_path: pathlib._local.Path, base_path: pathlib._local.Path | None = None, config: InlineConfig | None = None, prettify: bool = False) -> pathlib._local.Path:
def inline_html_file(
    html_path: Path,
    output_path: Path,
    base_path: Path | None = None,
    config: InlineConfig | None = None,
    prettify: bool = False,
) -> Path:
    """Read *html_path*, inline its assets, and write the result.

    Both the input and the output file are handled as UTF-8, independent of
    the platform's locale encoding.

    # Parameters
    - `html_path : Path`
        Source HTML file.
    - `output_path : Path`
        Destination path to write the modified HTML.
        If a falsy value is passed, *html_path* is overwritten instead.
    - `base_path : Path | None`
        Directory used to resolve relative asset paths.
        (default: `None` -> use `html_path.parent`)
    - `config : InlineConfig | None`
        Inlining options.
        (default: `None` -> use `InlineConfig()`)
    - `prettify : bool`
        Pretty-print when `use_bs4=True`.
        (default: `False`)

    # Returns
    - `Path`
        Path actually written.
    """
    if base_path is None:
        base_path = html_path.parent
    # explicit encoding: the locale default (e.g. cp1252 on Windows) would
    # garble UTF-8 documents on read and can fail on write once UTF-8
    # decoded assets have been inlined
    html_raw: str = html_path.read_text(encoding="utf-8")
    html_new: str = inline_html_assets(
        html_raw,
        base_path=base_path,
        config=config,
        prettify=prettify,
    )
    dest: Path = output_path or html_path
    dest.write_text(html_new, encoding="utf-8")
    return dest

Read html_path, inline its assets, and write the result.

Parameters

  • html_path : Path Source HTML file.
  • output_path : Path Destination path to write the modified HTML.
  • base_path : Path | None Directory used to resolve relative asset paths (defaults to the HTML file's directory). If None, uses the directory of html_path. (default: None -> use html_path.parent)
  • config : InlineConfig | None Inlining options. If None, uses default configuration. (default: None -> use InlineConfig())
  • prettify : bool Pretty-print when use_bs4=True. (default: False)

Returns

  • Path Path actually written.