Source code for docp.loaders._chromabasepdfloader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the mid-level functionality to parse
            and store PDF files into a Chroma vector database.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

.. attention::

            This module is *not* designed to be interacted with
            directly, only via the appropriate interface class(es).

            Rather, please create an instance of a Chroma PDF document
            loading object using the following class:

                - :class:`~docp.loaders.chromapdfloader.ChromaPDFLoader`

"""

from langchain.docstore.document import Document
from utils4.user_interface import ui
# locals
try:
    from .loaders._chromabaseloader import _ChromaBaseLoader
    from .parsers.pdfparser import PDFParser
except ImportError:
    from loaders._chromabaseloader import _ChromaBaseLoader
    from parsers.pdfparser import PDFParser


class _ChromaBasePDFLoader(_ChromaBaseLoader):
    """Base class for loading PDF documents into a Chroma vector database.

    This class is a specialised version of the
    :class:`~docp.loaders._chromabaseloader._ChromaBaseLoader` class,
    designed to handle PDF documents.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``dbpath`` parameter is a
            path. Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        load_keywords (bool, optional): Derive keywords from the document
            and load these into the sister keywords collection.
            Defaults to False.
        llm (object, optional): If deriving keywords, this is the LLM
            which will do the derivation. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    """
    # pylint: disable=attribute-defined-outside-init  # These are defined in the base class.

    #
    # No __init__ method here to ensure the ultimate base class'
    # signature is used and to save passing loads of stuff around, if we
    # don't have to.
    #

    def _create_documents(self) -> bool:
        """Convert each extracted page into a ``Document`` object.

        Each page of the parsed document which reports having text is
        wrapped in a ``Document`` whose metadata carries the source
        file's basename and the page number. The resulting object is
        appended to ``self._docs``, unless an equal object is already
        present.

        Returns:
            bool: True if ``self._docs`` contains at least one
            ``Document`` once all pages have been processed. Otherwise
            False, in which case a warning is displayed.

        """
        # The basename does not change per page; look it up once.
        basename = self._p.doc.basename
        for page in self._p.doc.pages:
            if not page.hastext:
                continue
            doc = Document(page_content=page.content,
                           metadata={'source': basename, 'pageno': page.pageno})
            # Prevent duplicates which cause chroma to fall over on load.
            if doc not in self._docs:
                self._docs.append(doc)
        if not self._docs:
            msg = f'{self._PFX_WARN} Text could not be parsed from {basename}.'
            ui.print_warning(msg)
            return False
        return True

    def _parse_text(self, **kwargs) -> bool:
        """Parse text from the document.

        :Keyword Arguments:
            Those to be passed into the text extraction method.

        Returns:
            bool: True if the parser's page list holds at least two
            entries after extraction. Otherwise False, in which case a
            warning is displayed.

        """
        print('- Extracting text ...')
        self._p.extract_text(**kwargs)
        # NOTE(review): the threshold of 2 (rather than 1) presumably
        # allows for a placeholder entry at index 0 of the page list, so
        # that list indexes align with page numbers -- confirm against
        # the parser's document object before changing.
        if len(self._p.doc.pages) < 2:
            ui.print_warning(f'No text extracted from {self._p.doc.basename}')
            return False
        return True

    def _set_parser(self):
        """Set the appropriate document parser.

        Setting the parser creates a parser instance as an attribute of
        this class (``self._p``), built from the file path stored in
        ``self._fpath``. When the parser instance is created, various
        file verification checks are made. For detail, refer to the
        following parser method:

            - :meth:`docp.parsers._pdfbaseparser._PDFBaseParser._open`

        """
        self._p = PDFParser(path=self._fpath)