#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose: This module provides the entry point for loading PDF files
into a Chroma database.
:Platform: Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email: development@s3dev.uk
:Comments: n/a
:Examples:
Parse and load a *single* PDF file into a Chroma database
collection::
>>> from docp.loaders import ChromaPDFLoader
>>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
collection='spam')
>>> l.load(path='/path/to/directory/myfile.pdf')
Parse and load a *directory* of PDF files into a Chroma database
collection::
>>> from docp.loaders import ChromaPDFLoader
>>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
collection='spam')
>>> l.load(path='/path/to/directory', ext='pdf')
For further example code use, please refer to the
:class:`ChromaPDFLoader` class docstring.
"""
import os
# locals
try:
from .libs.utilities import utilities
from .loaders._chromabasepdfloader import _ChromaBasePDFLoader
except ImportError:
from libs.utilities import utilities
from loaders._chromabasepdfloader import _ChromaBasePDFLoader
class ChromaPDFLoader(_ChromaBasePDFLoader):
    """Chroma database PDF-specific document loader.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``dbpath`` parameter is a
            path. Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        load_keywords (bool, optional): Use an LLM to derive keywords
            from the document and load these keywords into the sister
            keywords collection. Defaults to False.
        llm (object, optional): If deriving keywords, this is the LLM
            which will do the derivation. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing** as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Examples:

        Parse and load a *single* PDF file into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory/myfile.pdf')


        Parse and load a *directory* of PDF files into a Chroma
        database collection::

            >>> from docp.loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory', ext='pdf')

    """
    #
    # No __init__ method here to ensure the ultimate base class'
    # signature is used and to save passing loads of stuff around, if we
    # don't have to.
    #

    def load(self,
             path: str,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set | None=None,
             convert_to_ascii: bool=True,
             **unused) -> None:
        """Load a PDF file (or files) into a Chroma database.

        Args:
            path (str): Full path to the file (or *directory*) to be
                parsed and loaded. Note: If this is a directory, a
                specific file extension can be passed into the
                :meth:`load` method using the ``ext`` argument.
            ext (str, optional): If the ``path`` argument refers to a
                *directory*, a specific file extension can be specified
                here. For example: ``ext = 'pdf'``.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
                characters ``'pdf'`` are parsed and prefixed with ``*.``
                to create ``'*.pdf'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', for a recursive search.
            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to ``{'Artifact'}``, as these generally
                relate to a header and/or footer. To include all tags,
                (not skip any) pass this argument as ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        :Keyword Args:
            unused (dict): This enables keywords to be passed into a
                loader-agnostic ``.load()`` function without raising an
                'unexpected keyword argument' ``TypeError``.

        """
        # pylint: disable=unused-argument  # They are 'used' via locals().
        # Prepare the arguments being sent to the doc parser.
        kwargs = self._set_kwargs(locals_=locals())
        if os.path.isdir(path):
            # Load multi: parse every matching file in the directory.
            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                self._load(path=f, **kwargs)
        else:
            # Load single: parse only the file provided.
            print(f'Processing: {os.path.basename(path)} ...')
            self._load(path=path, **kwargs)

    @staticmethod
    def _set_kwargs(locals_: dict) -> dict:
        r"""Prepare the arguments which are sent to the doc parser.

        As :func:`locals()` is used to capture the :meth:`load` method's
        arguments for passing into the doc parser, some arguments must
        be removed first.

        Args:
            locals\_ (dict): The return value from a :func:`locals` call.

        Returns:
            dict: A *copy* of the provided dictionary with specific
            key/value pairs removed.

        """
        # ^^^ The backslash in locals\_ is required for documentation to
        # render correctly.
        kwargs = locals_.copy()
        # 'self' and 'path' must not be forwarded to the parser; 'path'
        # is passed explicitly by the caller on each _load() call.
        for k in ('self', 'path'):
            kwargs.pop(k)
        return kwargs