Source code for docp.parsers._pptxtextparser
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose: This module provides the logic for parsing text from a PPTX
document.
:Platform: Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email: development@s3dev.uk
.. attention::
This module is *not* designed to be interacted with
directly, only via the appropriate interface class(es).
Rather, please create an instance of a PPTX document parsing
object using the following:
- :class:`~docp.parsers.pptxparser.PPTXParser`
"""
from unidecode import unidecode
# locals
try:
from .objects._slideobject import SlideObject
from .objects._textobject import TextObject
from .parsers._pptxbaseparser import _PPTXBaseParser
except ImportError:
from objects._slideobject import SlideObject
from objects._textobject import TextObject
from parsers._pptxbaseparser import _PPTXBaseParser
[docs]
class _PPTXTextParser(_PPTXBaseParser):
"""Private PPTX document text parser intermediate class.
Args:
path (str): Full path to the PPTX document.
:Example:
Extract text from a PPTX file::
>>> from docp import PPTXParser
>>> pptx = PPTXParser(path='/path/to/myfile.pptx')
>>> pptx.extract_text()
# Access the text on slide 1.
>>> pg1 = pptx.doc.slides[1].content
"""
[docs]
def extract_text(self,
*,
remove_newlines: bool=False,
convert_to_ascii: bool=True,
**kwargs) -> None:
"""Extract text from the document.
A list of slides, with extracted content can be accessed using
the :attr:`self.doc.slides` attribute.
Args:
remove_newlines (bool, optional): If True, the newline
characters are replaced with a space. Defaults to False.
convert_to_ascii (bool, optional): When a non-ASCII character
is found, an attempt is made to convert it to an
associated ASCII character. If a character cannot be
converted, it is replaced with a ``'?'``.
Defaults to True.
:Keyword Args:
- None
Returns:
None.
"""
# pylint: disable=unused-argument # **kwargs
# pylint: disable=unnecessary-dunder-call
if len(self.doc.slides) > 1:
# Reinitialise the doc object and reopen the document.
self.__init__(path=self._path)
self._extract_text(remove_newlines=remove_newlines, convert_to_ascii=convert_to_ascii)
[docs]
def _extract_text(self, remove_newlines: bool, convert_to_ascii: bool) -> None:
"""Extract the text from all shapes on all slides.
Args:
remove_newlines (bool): Replace the newline characters with
a space.
convert_to_ascii (bool): Attempt to convert any non-ASCII
characters to their ASCII equivalent.
The text extracted from each slide is stored as a ``TextObject``
which is appended to the slide's ``texts`` attribute.
"""
for idx, slide in enumerate(self.doc.parser.slides, 1):
_slideobj = SlideObject(pageno=idx, parser=slide)
for shape in slide.shapes:
if hasattr(shape, 'text'):
if shape.text:
text = shape.text
if remove_newlines:
text = text.replace('\n', ' ')
if convert_to_ascii:
text = unidecode(string=text,
errors='replace',
replace_str='?')
_textobj = TextObject(content=text)
_slideobj.texts.append(_textobj)
self.doc.slides.append(_slideobj)