Source code for docp.parsers._pdftableparser

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the logic for parsing tables from a PDF
            document.

:Platform:  Linux
:Developer: J Berendt
:Email:     jeremy.berendt@rolls-royce.com

.. attention::

            This module is *not* designed to be interacted with
            directly, only via the appropriate interface class(es).

            Rather, please create an instance of a PDF document parsing
            object using the following:

                - :class:`~docp.parsers.pdfparser.PDFParser`

"""
# pylint: disable=import-error
# pylint: disable=protected-access
# pylint: disable=wrong-import-order

import io
import os
import pandas as pd
import shutil
# locals
from parsers._pdfbaseparser import _PDFBaseParser

# TODO: Move to a config file/class.  (TOML?)
_SETTINGS = {'vertical_strategy': 'lines',
             'horizontal_strategy':'lines',
             'snap_x_tolerance': 12}


[docs] class _PDFTableParser(_PDFBaseParser): """Private PDF document table parser intermediate class. Args: path (str): Full path to the PDF document. :Example: Extract tables from a PDF file:: >>> from docp import PDFParser >>> pdf = PDFParser(path='/path/to/myfile.pdf') >>> pdf.extract_tables() >>> tables = pdf.doc.tables """
[docs] def extract_tables(self, table_settings: dict=None, as_dataframe: bool=False, to_csv: bool=True, verbose: bool=False): """Extract tables from the document. Before a table is extracted, a number of validation tests are performed to verify what has been identified as a 'table' is actually a table which might be useful to the user. Each 'valid' table is written as a CSV file on the user's desktop. Additionally, the extracted table data is stored to the class' :attr:`self.tables` attribute. Args: table_settings (dict, optional): Table settings to be used for the table extraction. Defaults to None, which is replaced by the value in the config. as_dataframe (bool, optional): By default, the extracted tables are returned as a list of (lists of lists), for example: all_tables[table[rows[data]]]. However, if this argument is ``True``, the table data is returned as a list of ``pandas.DataFrame`` objects. In this case, the first row of the table is used as the header, and all remaining rows are treated as data. **Note:** This will *not* work properly for all tables. Defaults to False. to_csv (bool, optional): Dump extracted table data to a CSV file, one per table. Defaults to True. verbose (bool, optional): Display how many tables were extracted, and the path to their location. """ # pylint: disable=invalid-name # pylint: disable=too-many-nested-blocks # pylint: disable=unnecessary-dunder-call if self._doc.tables: # Reinitialise the doc object and reopen the document. self.__init__(path=self._path) c = 0 if to_csv: self._create_table_directory_path() if table_settings is None: table_settings = _SETTINGS for p in self._doc._pdf.pages: tblno = 1 tables = self._filter_tables(tables=p.find_tables(), threshold=5000) for table in tables: pc = p.crop(table.bbox) data = pc.extract_table(table_settings=table_settings) if all(len(row) > 1 for row in data) and len(data) > 1: # Verify no table rows are found in the most common rows (header/footer). if not self._table_header_footer(table=data): if not as_dataframe: self._doc._tables.append(data) if to_csv or as_dataframe: buffer = self._to_buffer(data=data) if to_csv: c += self._to_csv(buffer=buffer, pageno=p.page_number, tableno=tblno) if as_dataframe: self._to_df(buffer=buffer) buffer.close() tblno += 1 if verbose and to_csv: print('', 'Complete.', f'{c} tables were extracted and stored at the path below.', f'Path: {self._tbl_opath}', sep='\n')
[docs] def _create_table_directory_path(self): """Create the output directory for table data. If the directory does not exist, it is created. """ # Defined in parent class. # pylint: disable=attribute-defined-outside-init trans = {32: '_', 45: '_'} path = (os.path.join(os.path.join(os.environ['HOME'], 'Desktop'), 'docutils', 'pdf_tables', (os.path.splitext(os.path.basename(self._path))[0] .lower() .translate(trans)))) self._tbl_opath = path if not os.path.exists(path): os.makedirs(path)
[docs] def _create_table_file_path(self, pageno: int, tblno: int) -> str: """Create the filename for the table. Args: pageno (int): Page from which the table was extracted. tblno (int): Number of the table on the page, starting at 1. Returns: str: Explicit path to the file to be written. """ path = os.path.join(self._tbl_opath, f'pg{str(pageno).zfill(3)}_tb{str(tblno).zfill(3)}.csv') return path
[docs] @staticmethod def _filter_tables(tables: list, threshold: int=5000) -> list: """Remove tables from the passed list which are deemed invalid. Args: tables (list): A list of tables as detected by the :meth:`Page.find_table()` method. threshold (int, optional): Minimum pixel area for a detected table to be returned. Defaults to 5000. :Rationale: An 'invalid' table is determined by the number of pixels which the table covered. Any table which is less than (N) pixels is likely a block of text which has been categorised as a 'table', but is not. Returns: list: A list of tables whose pixel area is greater than ``threshold``. """ # pylint: disable=invalid-name t = [] for table in tables: x0, y0, x1, y1 = table.bbox if (x1-x0) * (y1-y0) > threshold: t.append(table) return t
[docs] def _to_buffer(self, data: list[list]) -> io.StringIO: """Write the table data into a string buffer. Args: data (list[list]): The table data as a list of lists to be written to a buffer. Returns: io.StringIO: A string buffer as an ``io.StringIO`` object. """ b = io.StringIO() for row in data: line = self._prepare_row(row=row) b.write(line) b.write('\n') b.seek(0) return b
[docs] def _to_csv(self, buffer: io.StringIO, pageno: int, tableno: int) -> int: """Write a table (from the buffer) to CSV. Args: buffer (io.StringIO): A pre-processed ``StringIO`` object containing table data to be written. pageno (int): Page number from the ``Page`` object. tableno (int): Number of the table on the page, based at 1. Returns: int: 1 if the file was written, otherwise 0. This is used by the caller to track the number of CSV files written. """ if buffer.seek(0, os.SEEK_END): # Test buffer is populated. path = self._create_table_file_path(pageno=pageno, tblno=tableno) with open(path, 'w', encoding='utf-8') as f: buffer.seek(0) shutil.copyfileobj(buffer, f) return 1 return 0
[docs] def _to_df(self, buffer: io.StringIO): """Write a table (from the buffer) to a DataFrame. Once written, the DataFrame is appended to :attr:`self._doc._tables` list of tables. Args: buffer (io.StringIO): A pre-processed ``StringIO`` object containing table data to be written. """ if buffer.seek(0, os.SEEK_END): buffer.seek(0) self._doc._tables.append(pd.read_csv(buffer))