# Source code for docp.loaders.chromapdfloader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the entry point for loading PDF files
            into a Chroma database.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

:Examples:

    Parse and load a *single* PDF file into a Chroma database
    collection::

        >>> from docp.loaders import ChromaPDFLoader

        >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                collection='spam')
        >>> l.load(path='/path/to/directory/myfile.pdf')


    Parse and load a *directory* of PDF files into a Chroma database
    collection::

        >>> from docp.loaders import ChromaPDFLoader

        >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                collection='spam')
        >>> l.load(path='/path/to/directory', ext='pdf')


    For further example code use, please refer to the
    :class:`ChromaPDFLoader` class docstring.

"""

import os
# locals
try:
    from .libs.utilities import utilities
    from .loaders._chromabasepdfloader import _ChromaBasePDFLoader
except ImportError:
    from libs.utilities import utilities
    from loaders._chromabasepdfloader import _ChromaBasePDFLoader


class ChromaPDFLoader(_ChromaBasePDFLoader):
    """Chroma database PDF-specific document loader.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``dbpath`` parameter is a
            path. Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        load_keywords (bool, optional): Use an LLM to derive keywords
            from the document and load these keywords into the sister
            keywords collection. Defaults to False.
        llm (object, optional): If deriving keywords, this is the LLM
            which will do the derivation. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing** as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Examples:

        Parse and load a *single* PDF file into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory/myfile.pdf')


        Parse and load a *directory* of PDF files into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory', ext='pdf')

    """
    #
    # No __init__ method here to ensure the ultimate base class'
    # signature is used and to save passing loads of stuff around, if we
    # don't have to.
    #

    def load(self,
             path: str,
             *,
             ext: str = '**',
             recursive: bool = True,
             remove_header: bool = True,
             remove_footer: bool = True,
             remove_newlines: bool = True,
             ignore_tags: set | None = None,
             convert_to_ascii: bool = True,
             **unused) -> None:
        """Load a PDF file (or files) into a Chroma database.

        Args:
            path (str): Full path to the file (or *directory*) to be
                parsed and loaded.

                Note: If this is a directory, a specific file extension
                can be passed into the :meth:`load` method using the
                ``ext`` argument.

            ext (str, optional): If the ``path`` argument refers to a
                *directory*, a specific file extension can be specified
                here. For example: ``ext = 'pdf'``.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and
                prefixed with ``*.``. Meaning, if ``'.pdf'`` is passed,
                the characters ``'pdf'`` are parsed and prefixed with
                ``*.`` to create ``'*.pdf'``. However, if
                ``'things.foo'`` is passed, the derived extension will
                be ``'*.thingsfoo'``. Defaults to '**', for a recursive
                search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the PDF
                'marked content' tags which will be ignored. Note that
                the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is
                ignored. Defaults to ``{'Artifact'}``, as these
                generally relate to a header and/or footer. To include
                all tags, (not skip any) pass this argument as ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        :Keyword Args:
            unused (dict): This enables keywords to be passed into a
                loader-agnostic ``.load()`` function without raising an
                ``'unexpected keyword argument'`` ``TypeError``.

        """
        # pylint: disable=unused-argument  # They are 'used' via locals().
        # Prepare the arguments being sent to the doc parser.
        # NOTE: This call must remain *first* in the method so the
        # locals() snapshot contains only the call arguments, and no
        # other local variables.
        kwargs = self._set_kwargs(locals_=locals())
        # Load multi
        if os.path.isdir(path):
            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                self._load(path=f, **kwargs)
        # Load single
        else:
            print(f'Processing: {os.path.basename(path)} ...')
            self._load(path=path, **kwargs)

    @staticmethod
    def _set_kwargs(locals_: dict) -> dict:
        r"""Prepare the arguments which are sent to the doc parser.

        As :func:`locals()` is used to capture the :meth:`load` method's
        arguments for passing into the doc parser, some arguments must
        be removed first.

        Args:
            locals\_ (dict): The return value from a :func:`locals`
                call.

        Returns:
            dict: A *copy* of the provided dictionary with specific
            key/value pairs removed.

        """
        # ^^^ The backslash in locals\_ is required for documentation
        #     to render correctly.
        kwargs = locals_.copy()
        # 'self' and 'path' are passed to the parser explicitly (or not
        # at all), so they must not appear in the forwarded kwargs.
        for k in ['self', 'path']:
            kwargs.pop(k)
        return kwargs