"""lightudq.document_quality

LLM-driven document quality checks: profiling (summary, Q&A pairs, word
count), fact-consistency checks, completeness against a reference profile,
PII detection, and user-registered custom metrics.
"""

from pathlib import Path
from typing import Optional, Union

from dotenv import load_dotenv
from pydantic import BaseModel
from pydantic_ai import Agent

from lightudq.prompts import (
    CUSTOM_METRIC_PROMPT,
    FACT_CHECK_PROMPT,
    MISSING_QUESTIONS_PROMPT,
    PII_PRESENCE_CHECK_PROMPT,
    QNA_EXTRACT_PROMPT,
    SUMMARY_PROMPT,
)
from lightudq.schemas import (
    CustomMetric,
    CustomMetricResult,
    DocumentProfile,
    DocumentQualityCheckResult,
    InconsistentFacts,
    MissingQuestions,
    PIIPresence,
    QnAPairs,
)
from lightudq.utils import read_document

# Load provider credentials (e.g. OPENAI_API_KEY) from a local .env, if any.
load_dotenv()


class DuplicateMetricNameError(ValueError):
    """Raised when the same custom metric is registered twice."""


class DocumentQuality:
    """
    Checks the quality of the document
    """

    def __init__(
        self, file_path: str, model_name: str = "openai:gpt-4o", num_questions: int = 5
    ):
        """Initialize the DocumentQuality class.

        Parameters
        ----------
        file_path : str
            The path to the document file to be analyzed.
        model_name : str, optional
            The name of the LLM model to use for analysis, available models:
            https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName.
            The default is 'openai:gpt-4o'.
        num_questions : int, optional
            The number of question-answer pairs to extract from the document,
            by default 5.
        """
        self.file_path = file_path
        self.document = read_document(file_path)
        # Result of the most recent run()/compare(); None until one has run.
        self.output: Optional[DocumentQualityCheckResult] = None
        # Cached document profile; populated lazily by get_document_profile().
        self.profile: Optional[DocumentProfile] = None
        self.llm_client = Agent(model_name)
        self._custom_metrics: list[CustomMetric] = []
        self._num_questions = num_questions

    def add_custom_metric(self, custom_metric: CustomMetric) -> None:
        """Add a custom metric to the DocumentQuality instance.

        Parameters
        ----------
        custom_metric : CustomMetric
            A pydantic model containing the custom metric details.

        Raises
        ------
        DuplicateMetricNameError
            If a metric with the same name is already registered.
        """
        if custom_metric.name in [cm.name for cm in self._custom_metrics]:
            raise DuplicateMetricNameError(
                f"Custom metric {custom_metric.name} already exists."
            )
        self._custom_metrics.append(custom_metric)

    def get_custom_metrics(self) -> list[CustomMetric]:
        """Get the list of custom metrics added to the DocumentQuality instance.

        Returns
        -------
        list[CustomMetric]: A list of custom metrics.
        """
        return self._custom_metrics

    def remove_custom_metric(self, custom_metric_name: str) -> None:
        """Remove a custom metric from the DocumentQuality instance by name.

        Removing an unknown name is a no-op.

        Parameters
        ----------
        custom_metric_name : str
            The name of the custom metric to be removed.
        """
        self._custom_metrics = [
            cm for cm in self._custom_metrics if cm.name != custom_metric_name
        ]

    def run(self) -> DocumentQualityCheckResult:
        """Run the document quality checks and return the results.

        The result is also stored on ``self.output``.

        Returns
        -------
        DocumentQualityCheckResult: A pydantic model containing the results
        of the document quality checks.
        """
        current_profile = self.get_document_profile()
        # Self-consistency: check the document's own extracted answers
        # against the document text.
        inconsistency_metric = self.compute_fact_checks(
            facts=current_profile.qnaPairs.answers
        )
        pii_metric = self.pii_presence_check()
        custom_metric_res = []
        for custom_metric in self._custom_metrics:
            custom_metric_output = self.get_custom_metric(custom_metric)
            custom_metric_res.append(
                CustomMetricResult(name=custom_metric.name, result=custom_metric_output)
            )
        self.output = DocumentQualityCheckResult(
            profile=current_profile,
            inconsistency=inconsistency_metric,
            pii=pii_metric,
            customMetrics=custom_metric_res if custom_metric_res else None,
        )
        return self.output

    def compare(self, reference_profile: DocumentProfile) -> DocumentQualityCheckResult:
        """Compare the document quality against a reference profile.

        The result is also stored on ``self.output``.

        Parameters
        ----------
        reference_profile : DocumentProfile
            The reference profile to compare against.

        Returns
        -------
        DocumentQualityCheckResult: A pydantic model containing the results
        of the document quality checks against the reference profile.
        """
        # Which of the reference questions this document fails to answer.
        incompleteness = self.incompleteness_metric(
            questions=reference_profile.qnaPairs.questions
        )
        if self.profile is None:
            self.profile = self.get_document_profile()
        # Self-consistency against this document's own answers.
        inconsistency = self.compute_fact_checks(facts=self.profile.qnaPairs.answers)
        pii = self.pii_presence_check()
        # Inaccuracy: the reference answers that this document contradicts.
        inaccuracy = self.compute_fact_checks(facts=reference_profile.qnaPairs.answers)

        self.output = DocumentQualityCheckResult(
            profile=self.profile,
            inconsistency=inconsistency,
            incompleteness=incompleteness,
            pii=pii,
            inaccuracy=inaccuracy,
        )
        return self.output

    def get_response_from_llm(
        self, msg: str, output_model: Optional[type[BaseModel]] = None
    ) -> Union[BaseModel, str]:
        """Get a response from the LLM for a given message and output model.

        Parameters
        ----------
        msg : str
            The message to send to the LLM
        output_model : type[BaseModel], optional
            pydantic model to parse the output, by default None
            (the raw string output is returned).

        Returns
        -------
        A pydantic model instance when ``output_model`` is given,
        otherwise the model's plain-text output.
        """
        res = self.llm_client.run_sync(msg, output_type=output_model)
        return res.output

    def extract_qna(self) -> QnAPairs:
        """Extract pairs of questions and answers from the document.

        Returns
        -------
        QnAPairs: a pydantic model containing the list of questions and answers
        """
        prompt = QNA_EXTRACT_PROMPT.format(
            document=self.document,
            output_schema=QnAPairs.model_json_schema(),
            num_questions=self._num_questions,
        )
        resp = self.get_response_from_llm(prompt, QnAPairs)
        return resp

    def compute_fact_checks(self, facts: list[str]) -> InconsistentFacts:
        """Check whether the provided facts are consistent against the document.

        Parameters
        ----------
        facts : list[str]
            The list of facts to check against the document

        Returns
        -------
        InconsistentFacts: a pydantic model containing the inconsistent facts
        and metadata if any
        """
        prompt = FACT_CHECK_PROMPT.format(
            document=self.document,
            output_schema=InconsistentFacts.model_json_schema(),
            facts=facts,
        )
        resp = self.get_response_from_llm(prompt, InconsistentFacts)
        return resp

    def incompleteness_metric(self, questions: list[str]) -> MissingQuestions:
        """Check for questions not answered in the document.

        Parameters
        ----------
        questions : list[str]
            The list of questions to check against the document

        Returns
        -------
        MissingQuestions: a pydantic model containing the list of questions
        not answered in the document
        """
        prompt = MISSING_QUESTIONS_PROMPT.format(
            document=self.document,
            questions=questions,
            output_schema=MissingQuestions.model_json_schema(),
        )
        resp = self.get_response_from_llm(prompt, MissingQuestions)
        return resp

    def pii_presence_check(self) -> PIIPresence:
        """Check for presence of PII in the document.

        Returns
        -------
        PIIPresence: a pydantic model containing the presence of PII in the
        document, metadata if any, and count of PII found
        """
        prompt = PII_PRESENCE_CHECK_PROMPT.format(
            document=self.document, output_schema=PIIPresence.model_json_schema()
        )
        resp = self.get_response_from_llm(prompt, PIIPresence)
        return resp

    def get_word_count(self) -> int:
        """Get the word count of the document.

        Returns
        -------
        int: the number of words in the document
        """
        content = self.document
        words = content.strip().split()
        return len(words)

    def get_doc_summary(self) -> str:
        """Get a summary of the document.

        Returns
        -------
        str: the summary of the document
        """
        prompt = SUMMARY_PROMPT.format(document=self.document)
        resp = self.get_response_from_llm(prompt)
        return resp

    def get_custom_metric(self, custom_metric: CustomMetric) -> BaseModel:
        """Evaluate a custom metric for the document."""
        prompt = CUSTOM_METRIC_PROMPT.format(
            document=self.document,
            output_schema=custom_metric.outputModel.model_json_schema(),
            prompt=custom_metric.prompt,
        )
        resp = self.get_response_from_llm(prompt, custom_metric.outputModel)
        return resp

    def get_document_profile(self) -> DocumentProfile:
        """Get the profile of the document.

        The profile is computed once and cached on ``self.profile``.

        Returns
        -------
        DocumentProfile: a pydantic model containing profile of the document
        """
        if self.profile:
            return self.profile

        qna = self.extract_qna()
        word_count = self.get_word_count()
        summary = self.get_doc_summary()
        path = Path(self.file_path)

        self.profile = DocumentProfile(
            title=path.name,
            wordCount=word_count,
            qnaPairs=qna,
            summary=summary,
            fileType=path.suffix,
            fileSize=path.stat().st_size,
        )
        return self.profile