lightudq.utils
"""Helpers for reading Markdown, plain-text, and PDF documents into strings."""

import os

import pymupdf


class UnsupportedFileTypeError(ValueError):
    """Raised when the supplied document type is not supported."""


def read_markdown_to_text(file_path: str) -> str:
    """Return the full contents of the file at *file_path*, decoded as UTF-8."""
    with open(file_path, encoding="utf-8") as f:
        return f.read()


def read_pdf_to_text(file_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The result of ``page.get_text()`` for each page, joined with no
        separator.
    """
    doc = pymupdf.open(file_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always close the document, even if text extraction raises.
        doc.close()


def read_document(file_path: str) -> str:
    """Read document content based on file extension.

    The extension is compared case-insensitively. ``.md`` and ``.txt`` files
    are read as UTF-8 text; ``.pdf`` files are converted to text via
    ``read_pdf_to_text``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The document's text content.

    Raises:
        UnsupportedFileTypeError: If the extension is not ``.md``, ``.txt``
            or ``.pdf``. The exception argument is the lower-cased extension.
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".md":
        return read_markdown_to_text(file_path)
    elif file_extension == ".txt":
        # Explicit UTF-8 so plain-text files decode the same way as Markdown
        # files, independent of the platform's default locale encoding.
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    elif file_extension == ".pdf":
        return read_pdf_to_text(file_path)
    else:
        print(f"Unsupported file format: {file_extension}")
        raise UnsupportedFileTypeError(file_extension)
class
UnsupportedFileTypeError(builtins.ValueError):
class UnsupportedFileTypeError(ValueError):
    """Error raised for a document whose file type is not supported."""

    pass
Raised when the supplied document type is not supported.
def
read_markdown_to_text(file_path: str) -> str:
def
read_pdf_to_text(file_path: str) -> str:
def
read_document(file_path: str) -> str:
def read_document(file_path: str) -> str:
    """Read document content based on file extension.

    The extension is compared case-insensitively. ``.md`` files are read via
    ``read_markdown_to_text``, ``.txt`` files are read directly as UTF-8 text,
    and ``.pdf`` files are converted via ``read_pdf_to_text``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The document's text content.

    Raises:
        UnsupportedFileTypeError: If the extension is not ``.md``, ``.txt``
            or ``.pdf``. The exception argument is the lower-cased extension.
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".md":
        return read_markdown_to_text(file_path)
    elif file_extension == ".txt":
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    elif file_extension == ".pdf":
        return read_pdf_to_text(file_path)
    else:
        print(f"Unsupported file format: {file_extension}")
        raise UnsupportedFileTypeError(file_extension)
Read document content based on file extension