Source code for docp.objects._pageobject

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the implementation for the
            ``PageObject`` object.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""

from __future__ import annotations


[docs] class PageObject: """This class provides the implementation for the ``PageObject``. For each page in a document, an instance of this class is created, populated and appended into the document's ``pages`` list attribute. Args: content (str, optional): Page content as a single string. Defaults to ''. pageno (int, optional): Page number. Defaults to 0. parser (object, optional): The underlying document parser object. Defaults to None. """ __slots__ = ('_content', '_hastext', '_pageno', '_parser', '_tables') def __init__(self, content: str='', pageno: int=0, parser: object=None): """Page object class initialiser.""" self._content = content self._pageno = pageno self._parser = parser self._hastext = bool(content) self._tables = [] def __repr__(self) -> str: """Formatted representation of this object.""" if self._pageno == 0: return f'<Page: {self._pageno}; <index offset>>' return f'<Page: {self._pageno}; Chars: {len(self._content)}>' def __str__(self) -> str: """Formatted string displayed when printing this object.""" c = self._content[:25].replace('\n', ' ') + ' ...' if self._content else '' fmt = (f'Page no: {self._pageno}; ' f'Content: "{c}"; ' f'Chars: {len(self._content)}; ' f'nTables: {len(self._tables)}; ' f'Parser avail: {bool(self._parser)}') return fmt @property def content(self) -> str: """Accessor to the page's textual content.""" return self._content @content.setter def content(self, value: str) -> None: """Setter for the ``content`` attribute. If the ``value`` argument is populated, the content is set and the ``hastext`` attribute is set to ``True``. """ if value: self._content = value self._hastext = True @property def hastext(self) -> bool: """Flag indicating if the ``content`` attribute is populated.""" return self._hastext @property def pageno(self) -> int: """Accessor to the page number. Note: This is the page number with regard to the page's *sequence in the overall document*. This is *not* guaranteed to be the page's number per the document's page labeling scheme. """ return self._pageno @property def parser(self) -> object: """Accessor to the document parser's internal functionality. Note: The population of this property is determined by the document-type-specific ``docp`` parser. If the underlying parsing library has functionality worth preserving and making available to the user, it is stored to this property. Otherwise, this property will remain as ``None``. """ return self._parser @property def tables(self) -> list: """Accessor to the page's tables, if parsed.""" return self._tables
[docs] def show(self) -> pdfplumber.display.PageImage: # pylint: disable=undefined-variable # noqa """Display the page as an image. Additionally, the return value exposes access to the underlying ``pdfplumber`` debugging visualisation methods such as: - :func:`img.debug_tablefinder` - :func:`img.draw_*` - :func:`img.outline_chars` - :func:`img.outline_words` - :func:`img.reset` - etc. """ return self.parser.to_image()