diff --git a/giskard/scanner/calibration/overconfidence_detector.py b/giskard/scanner/calibration/overconfidence_detector.py
index 953312195f..5fefcbef51 100644
--- a/giskard/scanner/calibration/overconfidence_detector.py
+++ b/giskard/scanner/calibration/overconfidence_detector.py
@@ -18,6 +18,10 @@

 @detector(name="overconfidence", tags=["overconfidence", "classification"])
 class OverconfidenceDetector(LossBasedDetector):
+    """
+    You can explicitly run this detector by adding the tag "overconfidence" in the `only` parameter of the scan method.
+    """
+
     def __init__(self, threshold=0.10, p_threshold=None, method="tree", **kwargs):
         self.threshold = threshold
         self.p_threshold = p_threshold
diff --git a/giskard/scanner/calibration/underconfidence_detector.py b/giskard/scanner/calibration/underconfidence_detector.py
index 9ce9f8409f..91a510cc81 100644
--- a/giskard/scanner/calibration/underconfidence_detector.py
+++ b/giskard/scanner/calibration/underconfidence_detector.py
@@ -15,6 +15,10 @@

 @detector(name="underconfidence", tags=["underconfidence", "classification"])
 class UnderconfidenceDetector(LossBasedDetector):
+    """
+    You can explicitly run this detector by adding the tag "underconfidence" in the `only` parameter of the scan method.
+    """
+
     _needs_target = False

     def __init__(self, threshold=0.1, p_threshold=0.95, method="tree", **kwargs):
diff --git a/giskard/scanner/correlation/spurious_correlation_detector.py b/giskard/scanner/correlation/spurious_correlation_detector.py
index ff82279d05..6f3d519c72 100644
--- a/giskard/scanner/correlation/spurious_correlation_detector.py
+++ b/giskard/scanner/correlation/spurious_correlation_detector.py
@@ -15,6 +15,10 @@

 @detector(name="spurious_correlation", tags=["spurious_correlation", "classification"])
 class SpuriousCorrelationDetector(Detector):
+    """
+    You can explicitly run this detector by adding the tag "spurious_correlation" in the `only` parameter of the scan method.
+    """
+
     def __init__(
         self, method: Optional[str] = "theil", threshold: Optional[float] = 0.5, min_slice_size: Optional[float] = None
     ):
diff --git a/giskard/scanner/data_leakage/data_leakage_detector.py b/giskard/scanner/data_leakage/data_leakage_detector.py
index 912cb5a4d1..2bc5b482ce 100644
--- a/giskard/scanner/data_leakage/data_leakage_detector.py
+++ b/giskard/scanner/data_leakage/data_leakage_detector.py
@@ -15,6 +15,10 @@

 @detector(name="data_leakage", tags=["data_leakage", "classification", "regression"])
 class DataLeakageDetector(Detector):
+    """
+    You can explicitly run this detector by adding the tag "data_leakage" in the `only` parameter of the scan method.
+    """
+
     def run(self, model: BaseModel, dataset: Dataset, features: Optional[Sequence[str]] = None):
         logger.info("DataLeakageDetector: Running")
diff --git a/giskard/scanner/llm/llm_basic_sycophancy_detector.py b/giskard/scanner/llm/llm_basic_sycophancy_detector.py
index d9b29a4950..1079f0b59c 100644
--- a/giskard/scanner/llm/llm_basic_sycophancy_detector.py
+++ b/giskard/scanner/llm/llm_basic_sycophancy_detector.py
@@ -21,6 +21,8 @@
 class LLMBasicSycophancyDetector:
     """Detects sycophancy in LLM-based models.

+    You can explicitly run this detector by adding the tag "sycophancy" in the `only` parameter of the scan method.
+
     Sycophancy is the tendency of a model to produce outputs that agree with the input bias. This is often linked to
     model hallucination.
     This detector will probe if the model is affected by this issue by generating adversarial inputs (based on the
     model name & description) and checking that the model outputs are coherent.
diff --git a/giskard/scanner/llm/llm_chars_injection_detector.py b/giskard/scanner/llm/llm_chars_injection_detector.py
index bf4030ef29..255d31c694 100644
--- a/giskard/scanner/llm/llm_chars_injection_detector.py
+++ b/giskard/scanner/llm/llm_chars_injection_detector.py
@@ -18,6 +18,8 @@
 class LLMCharsInjectionDetector(Detector):
     """Detects control character injection vulnerabilities in LLM-based models.

+    You can explicitly run this detector by adding the tag "control_chars_injection" in the `only` parameter of the scan method.
+
     Some LLMs can be manipulated by injecting sequences of special characters in the prompt. These injections can
     cause the model to produce unexpected outputs, or even forget the prompt and produce unrelated outputs.
diff --git a/giskard/scanner/llm/llm_faithfulness_detector.py b/giskard/scanner/llm/llm_faithfulness_detector.py
index 07930a93de..91e8c893ac 100644
--- a/giskard/scanner/llm/llm_faithfulness_detector.py
+++ b/giskard/scanner/llm/llm_faithfulness_detector.py
@@ -25,6 +25,10 @@

 @detector("llm_faithfulness", tags=["faithfulness", "llm", "text_generation"])
 class LLMFaithfulnessDetector(RequirementBasedDetector):
+    """
+    You can explicitly run this detector by adding the tag "faithfulness" in the `only` parameter of the scan method.
+    """
+
     _issue_group = Robustness
     _issue_level = IssueLevel.MAJOR
diff --git a/giskard/scanner/llm/llm_harmful_content_detector.py b/giskard/scanner/llm/llm_harmful_content_detector.py
index b7193c8c1c..5aa350363b 100644
--- a/giskard/scanner/llm/llm_harmful_content_detector.py
+++ b/giskard/scanner/llm/llm_harmful_content_detector.py
@@ -17,6 +17,8 @@
 class LLMHarmfulContentDetector(RequirementBasedDetector):
     """Detects harmful content generation in LLM-based models.

+    You can explicitly run this detector by adding the tag "llm_harmful_content" in the `only` parameter of the scan method.
+
     Harmful content generation is the tendency of a model to generate responses that could be used for malicious
     purposes or promote harmful actions. This detector will probe if the model is affected by this issue by generating
     ad hoc adversarial inputs (based on the model name & description) and checking that the model outputs do not contain
diff --git a/giskard/scanner/llm/llm_implausible_output_detector.py b/giskard/scanner/llm/llm_implausible_output_detector.py
index 49f3b784cd..af3536d3bb 100644
--- a/giskard/scanner/llm/llm_implausible_output_detector.py
+++ b/giskard/scanner/llm/llm_implausible_output_detector.py
@@ -23,6 +23,8 @@
 class LLMImplausibleOutputDetector(Detector):
     """Detects implausible output in LLM-based models.

+    You can explicitly run this detector by adding the tag "implausible_output" in the `only` parameter of the scan method.
+
     The detector will stimulate the model into producing outputs that are implausible or controversial by generating
     a set of ad hoc adversarial inputs. This can be seen as a proxy for hallucination and misinformation detection.
     """
diff --git a/giskard/scanner/llm/llm_information_disclosure_detector.py b/giskard/scanner/llm/llm_information_disclosure_detector.py
index 45b9c4c5cd..0759b97815 100644
--- a/giskard/scanner/llm/llm_information_disclosure_detector.py
+++ b/giskard/scanner/llm/llm_information_disclosure_detector.py
@@ -12,6 +12,8 @@
 class LLMInformationDisclosureDetector(RequirementBasedDetector):
     """Detects sensitive information disclosure in LLM-based models.

+    You can explicitly run this detector by adding the tag "information_disclosure" in the `only` parameter of the scan method.
+
     This detector checks that the model does not divulge or hallucinate sensitive or confidential information in its
     responses. We probe the model by generating ad hoc adversarial inputs and checking that the model outputs do not
     contain information that could be considered sensitive, such as personal identifiable information (PII) or secret
diff --git a/giskard/scanner/llm/llm_output_formatting_detector.py b/giskard/scanner/llm/llm_output_formatting_detector.py
index 68dfd8d89a..4695013f4c 100644
--- a/giskard/scanner/llm/llm_output_formatting_detector.py
+++ b/giskard/scanner/llm/llm_output_formatting_detector.py
@@ -34,6 +34,8 @@
 class LLMOutputFormattingDetector(RequirementBasedDetector):
     """Detects output formatting issues in LLM-based models.

+    You can explicitly run this detector by adding the tag "output_formatting" in the `only` parameter of the scan method.
+
     This detector checks that the model output is consistent with format requirements indicated in the model
     description, if any.
     """
diff --git a/giskard/scanner/llm/llm_prompt_injection_detector.py b/giskard/scanner/llm/llm_prompt_injection_detector.py
index 56bb7eb533..696796ca5b 100644
--- a/giskard/scanner/llm/llm_prompt_injection_detector.py
+++ b/giskard/scanner/llm/llm_prompt_injection_detector.py
@@ -15,6 +15,8 @@
 class LLMPromptInjectionDetector(Detector):
     """Detects prompt injection in LLM-based models.

+    You can explicitly run this detector by adding the tag "jailbreak" in the `only` parameter of the scan method.
+
     Prompt injection is the vulnerability that occurs when an LLM can be manipulated through specially crafted inputs,
     leading to partial or full control over the model behaviour [#]_. This detector will probe if the model is affected
     by this issue by testing it against a set of adversarial inputs comprising a large variety of prompt injection
diff --git a/giskard/scanner/llm/llm_stereotypes_detector.py b/giskard/scanner/llm/llm_stereotypes_detector.py
index b6d4e2439d..99619d931a 100644
--- a/giskard/scanner/llm/llm_stereotypes_detector.py
+++ b/giskard/scanner/llm/llm_stereotypes_detector.py
@@ -20,6 +20,8 @@
 class LLMStereotypesDetector(RequirementBasedDetector):
     """Detects stereotypes and discrimination in LLM-based models.

+    You can explicitly run this detector by adding the tag "llm_stereotypes_detector" in the `only` parameter of the scan method.
+
     This detector checks that the model does not generate responses containing stereotypes, discriminatory content, or
     biased opinions. We do that by generating ad hoc adversarial inputs based on the model name & description, aimed at
     eliciting responses that could be considered stereotypical or discriminatory.
diff --git a/giskard/scanner/performance/performance_bias_detector.py b/giskard/scanner/performance/performance_bias_detector.py
index 9497666538..95ebc9ec24 100644
--- a/giskard/scanner/performance/performance_bias_detector.py
+++ b/giskard/scanner/performance/performance_bias_detector.py
@@ -33,6 +33,8 @@ def __init__(
     ):
         """Performance bias detector.

+        You can explicitly run this detector by adding the tag "performance_bias" in the `only` parameter of the scan method.
+
         Parameters
         ----------
         metrics : Optional[Sequence]
diff --git a/giskard/scanner/robustness/ethical_bias_detector.py b/giskard/scanner/robustness/ethical_bias_detector.py
index 65321bd72f..5c48d3239e 100644
--- a/giskard/scanner/robustness/ethical_bias_detector.py
+++ b/giskard/scanner/robustness/ethical_bias_detector.py
@@ -15,6 +15,8 @@
 class EthicalBiasDetector(BaseTextPerturbationDetector):
     """Detects ethical bias in a model by applying text perturbations to the input data.

+    You can explicitly run this detector by adding the tag "ethical_bias" in the `only` parameter of the scan method.
+
     By default, we perform specific metamorphic testing aimed at detecting bias in the model predictions based on
     transformation of gender, nationality, or religious terms in the textual features.
diff --git a/giskard/scanner/robustness/text_perturbation_detector.py b/giskard/scanner/robustness/text_perturbation_detector.py
index b714852354..7f6fefc113 100644
--- a/giskard/scanner/robustness/text_perturbation_detector.py
+++ b/giskard/scanner/robustness/text_perturbation_detector.py
@@ -19,6 +19,8 @@
 class TextPerturbationDetector(BaseTextPerturbationDetector):
     """Detects robustness problems in a model by applying text perturbations to the textual features.

+    You can explicitly run this detector by adding the tag "text_perturbation" in the `only` parameter of the scan method.
+
     This detector will check invariance of model predictions when the formatting of textual features is altered, e.g.
     transforming to uppercase, lowercase, or title case, or by introducing typos.
     """
diff --git a/giskard/scanner/stochasticity/stochasticity_detector.py b/giskard/scanner/stochasticity/stochasticity_detector.py
index 2d3275528e..88d09e9a81 100644
--- a/giskard/scanner/stochasticity/stochasticity_detector.py
+++ b/giskard/scanner/stochasticity/stochasticity_detector.py
@@ -15,6 +15,8 @@
 class StochasticityDetector(Detector):
     """Detects stochasticity in the model predictions.

+    You can explicitly run this detector by adding the tag "stochasticity" in the `only` parameter of the scan method.
+
     This detector ensures that the model predictions are deterministic, i.e. that the same input always produces the
     same output.
     """
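For reference, the `only` parameter mentioned in these docstrings belongs to the scan method (`giskard.scan`). A minimal usage sketch, assuming `model` and `dataset` are an already wrapped `giskard.Model` and `giskard.Dataset` (these names and the output file name are illustrative, not part of the patch):

import giskard

# Restrict the scan to a single detector by passing its tag to `only`;
# any tag from the docstrings above (e.g. "jailbreak", "ethical_bias") can be used the same way.
report = giskard.scan(model, dataset, only=["overconfidence"])
report.to_html("scan_report.html")  # export the findings for review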