lightudq.utils

 1import os
 2
 3import pymupdf
 4
 5
 6class UnsupportedFileTypeError(ValueError):
 7    """Raised when the supplied document type is not supported."""
 8
 9
def read_markdown_to_text(file_path: str) -> str:
    """Return the entire content of the file at *file_path*, decoded as UTF-8.

    No Markdown processing is performed; the text is returned as read.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents
13
14
def read_pdf_to_text(file_path: str) -> str:
    """Extract the plain text of every page of a PDF document.

    Args:
        file_path: Path handed to ``pymupdf.open``.

    Returns:
        The ``get_text()`` output of each page, concatenated in page order
        with no separator added. An empty string is returned when the
        document yields no pages.
    """
    # Open via the context manager so the document is closed even when text
    # extraction raises part-way through.
    with pymupdf.open(file_path) as doc:
        # join() consumes every page while the document is still open and
        # avoids repeated string concatenation.
        return "".join(page.get_text() for page in doc)
21
22
def read_document(file_path: str) -> str:
    """Read a document's text, choosing the reader from the file extension.

    The extension is compared case-insensitively. ``.md`` files are delegated
    to ``read_markdown_to_text`` and ``.pdf`` files to ``read_pdf_to_text``;
    ``.txt`` files are read directly.

    Args:
        file_path: Path to the document.

    Returns:
        The text content of the document.

    Raises:
        UnsupportedFileTypeError: If the extension is not ``.md``, ``.txt``
            or ``.pdf``. The exception's argument is the lower-cased
            extension (an empty string when the path has none).
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".md":
        return read_markdown_to_text(file_path)
    if file_extension == ".txt":
        # Decode explicitly as UTF-8 rather than the platform default so the
        # result does not depend on the locale and matches the Markdown path.
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    if file_extension == ".pdf":
        return read_pdf_to_text(file_path)

    # The message on stdout and the exception argument are kept as they were
    # so existing callers observe the same output and ``args``.
    print(f"Unsupported file format: {file_extension}")
    raise UnsupportedFileTypeError(file_extension)
class UnsupportedFileTypeError(ValueError):
    """Signal that a document's file type is not one that can be handled.

    Subclasses ``ValueError``, so handlers written for ``ValueError`` also
    catch it.
    """

Raised when the supplied document type is not supported.

def read_markdown_to_text(file_path: str) -> str:
    """Return the entire content of the file at *file_path*, decoded as UTF-8.

    No Markdown processing is performed; the text is returned as read.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents
def read_pdf_to_text(file_path: str) -> str:
    """Extract the plain text of every page of a PDF document.

    Args:
        file_path: Path handed to ``pymupdf.open``.

    Returns:
        The ``get_text()`` output of each page, concatenated in page order
        with no separator added. An empty string is returned when the
        document yields no pages.
    """
    # Open via the context manager so the document is closed even when text
    # extraction raises part-way through.
    with pymupdf.open(file_path) as doc:
        # join() consumes every page while the document is still open and
        # avoids repeated string concatenation.
        return "".join(page.get_text() for page in doc)
def read_document(file_path: str) -> str:
    """Read a document's text, choosing the reader from the file extension.

    The extension is compared case-insensitively. ``.md`` files are delegated
    to ``read_markdown_to_text`` and ``.pdf`` files to ``read_pdf_to_text``;
    ``.txt`` files are read directly.

    Args:
        file_path: Path to the document.

    Returns:
        The text content of the document.

    Raises:
        UnsupportedFileTypeError: If the extension is not ``.md``, ``.txt``
            or ``.pdf``. The exception's argument is the lower-cased
            extension (an empty string when the path has none).
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".md":
        return read_markdown_to_text(file_path)
    if file_extension == ".txt":
        # Decode explicitly as UTF-8 rather than the platform default so the
        # result does not depend on the locale and matches the Markdown path.
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    if file_extension == ".pdf":
        return read_pdf_to_text(file_path)

    # The message on stdout and the exception argument are kept as they were
    # so existing callers observe the same output and ``args``.
    print(f"Unsupported file format: {file_extension}")
    raise UnsupportedFileTypeError(file_extension)

Read document content based on file extension