# Source code for docp.parsers._pptxbaseparser

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides generalised base functionality for
            parsing PPTX documents.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

.. attention::

            This module is *not* designed to be interacted with
            directly, only via the appropriate interface class(es).

            Rather, please create an instance of a PPTX document parsing
            object using the following:

                - :class:`~docp.parsers.pptxparser.PPTXParser`

"""
# pylint: disable=protected-access

import os
from pptx import Presentation
# locals
try:
    from libs.utilities import utilities
    from objects.pptxobject import DocPPTX
except ImportError:
    from .libs.utilities import utilities
    from .objects.pptxobject import DocPPTX


class _PPTXBaseParser:
    """Base class containing generalised PPTX parsing functionality."""

    def __init__(self, path: str):
        """Private base parser class initialiser.

        On construction the document's path attributes are set and the
        document is opened immediately.

        Args:
            path (str): Full path to the document to be parsed.

        Raises:
            TypeError: Raised by :meth:`_open` if the file is not a
                valid PPTX document.

        """
        self._path = path
        self._doc = DocPPTX()
        # Order matters: _open() reads the path attributes which are
        # populated by _set_paths().
        self._set_paths()
        self._open()

    @property
    def doc(self) -> 'DocPPTX':
        """Accessor to the document object."""
        return self._doc

    def _open(self) -> None:
        """Open the PPTX document for reading.

        Before opening the file, a test is performed to ensure the PPTX
        is valid. The file must:

            - exist
            - have a .pptx file extension
            - be a ZIP archive, per the file signature

        The checks are short-circuited in the order listed, so the ZIP
        test is only carried out on a path which exists and carries the
        expected file extension.

        :Other Operations:

            - Store the ``pptx.Presentation`` parser object returned
              from the :func:`pptx.Presentation` instance creation into
              the :attr:`self._doc._parser` attribute.
            - Store the number of pages into the
              :attr:`self._doc._npages` attribute.
            - Store the document's meta data into the
              :attr:`self._doc._meta` attribute.

        Raises:
            TypeError: Raised if the file type criteria above are not
                met.

        """
        doc = self._doc
        fpath = doc._fpath
        # A chain of ``and`` is used (rather than ``all()`` over a
        # pre-built tuple) so that the later tests are skipped as soon
        # as an earlier one fails.
        is_valid = (os.path.exists(fpath)
                    and os.path.splitext(fpath)[1].lower() == '.pptx'
                    and utilities.iszip(fpath))
        if not is_valid:
            msg = f'{doc._fname} is not a valid PPTX file.'
            raise TypeError(msg)
        doc._parser = Presentation(fpath)
        doc._npages = len(doc._parser.slides)
        doc._meta = doc._parser.core_properties

    def _set_paths(self) -> None:
        """Set the document's file path attributes.

        The following attributes are set on the document object:

            - ``_fpath``: The path as returned by
              :func:`os.path.realpath`.
            - ``_fname``: The basename of the path, as provided to the
              initialiser.

        """
        self._doc._fpath = os.path.realpath(self._path)
        self._doc._fname = os.path.basename(self._path)