lightudq.document_quality

import os
from pathlib import Path
from typing import Optional, Union

from dotenv import load_dotenv
from pydantic import BaseModel
from pydantic_ai import Agent

from lightudq.prompts import (
    CUSTOM_METRIC_PROMPT,
    FACT_CHECK_PROMPT,
    MISSING_QUESTIONS_PROMPT,
    PII_PRESENCE_CHECK_PROMPT,
    QNA_EXTRACT_PROMPT,
    SUMMARY_PROMPT,
)
from lightudq.schemas import (
    CustomMetric,
    CustomMetricResult,
    DocumentProfile,
    DocumentQualityCheckResult,
    InconsistentFacts,
    MissingQuestions,
    PIIPresence,
    QnAPairs,
)
from lightudq.utils import read_document

load_dotenv()


class DuplicateMetricNameError(ValueError):
    """Raised when the same custom metric is registered twice."""


class DocumentQuality:
    """Runs LLM-based quality checks on a document."""

    def __init__(
        self, file_path: str, model_name: str = "openai:gpt-4o", num_questions: int = 5
    ):
        """Initialize the DocumentQuality class.

        Parameters
        ----------
        file_path : str
            The path to the document file to be analyzed.
        model_name : str, optional
            The name of the LLM to use for analysis; available models are listed at
            https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName.
            The default is 'openai:gpt-4o'.
        num_questions : int, optional
            The number of question-answer pairs to extract from the document, by default 5.
        """
        self.file_path = file_path
        self.document = read_document(file_path)
        self.output: Optional[DocumentQualityCheckResult] = None
        self.profile: Optional[DocumentProfile] = None
        self.llm_client = Agent(model_name)
        self._custom_metrics: list[CustomMetric] = []
        self._num_questions = num_questions

    def add_custom_metric(self, custom_metric: CustomMetric):
        """Add a custom metric to the DocumentQuality instance.

        Parameters
        ----------
        custom_metric : CustomMetric
            A pydantic model containing the custom metric details.
        """
        if any(cm.name == custom_metric.name for cm in self._custom_metrics):
            raise DuplicateMetricNameError(
                f"Custom metric {custom_metric.name} already exists."
            )
        self._custom_metrics.append(custom_metric)

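    # Example (sketch): a custom metric pairs a prompt with a pydantic output
    # model. The field names name/prompt/outputModel follow their use in
    # get_custom_metric below; ToneResult and dq (a DocumentQuality instance)
    # are hypothetical.
    #
    #     class ToneResult(BaseModel):
    #         tone: str
    #
    #     dq.add_custom_metric(
    #         CustomMetric(
    #             name="tone",
    #             prompt="Describe the tone of the document.",
    #             outputModel=ToneResult,
    #         )
    #     )
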
    def get_custom_metrics(self) -> list[CustomMetric]:
        """Get the list of custom metrics added to the DocumentQuality instance.

        Returns
        -------
        list[CustomMetric]: A list of custom metrics.
        """
        return self._custom_metrics

    def remove_custom_metric(self, custom_metric_name: str):
        """Remove a custom metric from the DocumentQuality instance by name.

        Parameters
        ----------
        custom_metric_name : str
            The name of the custom metric to be removed.
        """
        self._custom_metrics = [
            cm for cm in self._custom_metrics if cm.name != custom_metric_name
        ]

    def run(self) -> DocumentQualityCheckResult:
        """Run the document quality checks and return the results.

        Returns
        -------
        DocumentQualityCheckResult: A pydantic model containing the results of the document quality checks.
        """
        current_profile = self.get_document_profile()
        inconsistency_metric = self.compute_fact_checks(
            facts=current_profile.qnaPairs.answers
        )
        pii_metric = self.pii_presence_check()
        custom_metric_res = []
        for custom_metric in self._custom_metrics:
            custom_metric_output = self.get_custom_metric(custom_metric)
            custom_metric_res.append(
                CustomMetricResult(name=custom_metric.name, result=custom_metric_output)
            )
        return DocumentQualityCheckResult(
            profile=current_profile,
            inconsistency=inconsistency_metric,
            pii=pii_metric,
            customMetrics=custom_metric_res if custom_metric_res else None,
        )

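    # Example (sketch): run all checks on one file; the path, and an
    # OPENAI_API_KEY in the environment (or a .env file, via load_dotenv above)
    # for the default model, are assumptions.
    #
    #     dq = DocumentQuality("docs/report.md")
    #     result = dq.run()
    #     print(result.pii, result.inconsistency)
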
    def compare(self, reference_profile: DocumentProfile) -> DocumentQualityCheckResult:
        """Compare the document quality against a reference profile.

        Parameters
        ----------
        reference_profile : DocumentProfile
            The reference profile to compare against.

        Returns
        -------
        DocumentQualityCheckResult: A pydantic model containing the results of the document quality checks against the reference profile.
        """
        incompleteness = self.incompleteness_metric(
            questions=reference_profile.qnaPairs.questions
        )
        if self.profile is None:
            self.profile = self.get_document_profile()
        inconsistency = self.compute_fact_checks(facts=self.profile.qnaPairs.answers)
        pii = self.pii_presence_check()
        inaccuracy = self.compute_fact_checks(facts=reference_profile.qnaPairs.answers)

        return DocumentQualityCheckResult(
            profile=self.profile,
            inconsistency=inconsistency,
            incompleteness=incompleteness,
            pii=pii,
            inaccuracy=inaccuracy,
        )

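    # Example (sketch): check a revised document against the profile of a
    # trusted earlier version; the file names are hypothetical.
    #
    #     reference = DocumentQuality("report_v1.md").get_document_profile()
    #     drift = DocumentQuality("report_v2.md").compare(reference)
    #     print(drift.incompleteness, drift.inaccuracy)
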
    def get_response_from_llm(
        self, msg: str, output_model: Optional[type[BaseModel]] = None
    ) -> Union[BaseModel, str]:
        """Get a response from the LLM for a given message and output model.

        Parameters
        ----------
        msg : str
            The message to send to the LLM.
        output_model : type[BaseModel], optional
            Pydantic model used to parse the output, by default None.

        Returns
        -------
        Union[BaseModel, str]: an instance of output_model if one is given, otherwise the raw string response.
        """
        res = self.llm_client.run_sync(msg, output_type=output_model)
        return res.output

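    # Example (sketch): any pydantic model can serve as the structured output
    # type; Keywords and dq are hypothetical.
    #
    #     class Keywords(BaseModel):
    #         keywords: list[str]
    #
    #     kw = dq.get_response_from_llm("List five keywords for ...", Keywords)
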
    def extract_qna(self) -> QnAPairs:
        """Extract pairs of questions and answers from the document.

        Returns
        -------
        QnAPairs: a pydantic model containing the list of questions and answers.
        """
        prompt = QNA_EXTRACT_PROMPT.format(
            document=self.document,
            output_schema=QnAPairs.model_json_schema(),
            num_questions=self._num_questions,
        )
        resp = self.get_response_from_llm(prompt, QnAPairs)
        return resp

    def compute_fact_checks(self, facts: list[str]) -> InconsistentFacts:
        """Check whether the provided facts are consistent with the document.

        Parameters
        ----------
        facts : list[str]
            The list of facts to check against the document.

        Returns
        -------
        InconsistentFacts: a pydantic model containing the inconsistent facts and metadata, if any.
        """
        prompt = FACT_CHECK_PROMPT.format(
            document=self.document,
            output_schema=InconsistentFacts.model_json_schema(),
            facts=facts,
        )
        resp = self.get_response_from_llm(prompt, InconsistentFacts)
        return resp

    def incompleteness_metric(self, questions: list[str]) -> MissingQuestions:
        """Check for questions that are not answered in the document.

        Parameters
        ----------
        questions : list[str]
            The list of questions to check against the document.

        Returns
        -------
        MissingQuestions: a pydantic model containing the list of questions not answered in the document.
        """
        prompt = MISSING_QUESTIONS_PROMPT.format(
            document=self.document,
            questions=questions,
            output_schema=MissingQuestions.model_json_schema(),
        )
        resp = self.get_response_from_llm(prompt, MissingQuestions)
        return resp

    def pii_presence_check(self) -> PIIPresence:
        """Check for the presence of PII in the document.

        Returns
        -------
        PIIPresence: a pydantic model containing the presence of PII in the document, metadata if any, and the count of PII found.
        """
        prompt = PII_PRESENCE_CHECK_PROMPT.format(
            document=self.document, output_schema=PIIPresence.model_json_schema()
        )
        resp = self.get_response_from_llm(prompt, PIIPresence)
        return resp

    def get_word_count(self) -> int:
        """Get the word count of the document.

        Returns
        -------
        int: the number of words in the document.
        """
        return len(self.document.split())

    def get_doc_summary(self) -> str:
        """Get a summary of the document.

        Returns
        -------
        str: the summary of the document.
        """
        prompt = SUMMARY_PROMPT.format(document=self.document)
        resp = self.get_response_from_llm(prompt)
        return resp

    def get_custom_metric(self, custom_metric: CustomMetric) -> BaseModel:
        """Evaluate a custom metric against the document.

        Returns
        -------
        BaseModel: an instance of the metric's outputModel.
        """
        prompt = CUSTOM_METRIC_PROMPT.format(
            document=self.document,
            output_schema=custom_metric.outputModel.model_json_schema(),
            prompt=custom_metric.prompt,
        )
        resp = self.get_response_from_llm(prompt, custom_metric.outputModel)
        return resp

    def get_document_profile(self) -> DocumentProfile:
        """Get the profile of the document, computing and caching it on first use.

        Returns
        -------
        DocumentProfile: a pydantic model containing the profile of the document.
        """
        if self.profile is not None:
            return self.profile

        qna = self.extract_qna()
        word_count = self.get_word_count()
        summary = self.get_doc_summary()
        title = os.path.basename(self.file_path)
        file_type = Path(self.file_path).suffix
        size = Path(self.file_path).stat().st_size

        self.profile = DocumentProfile(
            title=title,
            wordCount=word_count,
            qnaPairs=qna,
            summary=summary,
            fileType=file_type,
            fileSize=size,
        )
        return self.profile
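
A profile produced by get_document_profile() can serve as the reference for later compare() calls. A minimal persistence sketch (the file names are hypothetical, and it assumes DocumentProfile is a pydantic model like the other schemas this module imports):

    from lightudq.document_quality import DocumentQuality
    from lightudq.schemas import DocumentProfile

    # Profile the trusted version once and store it as JSON.
    reference = DocumentQuality("handbook_v1.md").get_document_profile()
    with open("handbook_profile.json", "w") as f:
        f.write(reference.model_dump_json())

    # Later: reload the profile and check a revised document against it.
    with open("handbook_profile.json") as f:
        reference = DocumentProfile.model_validate_json(f.read())
    report = DocumentQuality("handbook_v2.md").compare(reference)
    print(report.incompleteness, report.inaccuracy)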