Skip to content

Validation Toolkit

check_binary_values(pred_col, label1=0, label2=1)

Check that values are binary (default: 0 or 1).

Example Use Case

Predictions can only be 0 (no disease present) or 1 (disease present).

PARAMETER DESCRIPTION
pred_col

Dataframe column containing the values to validate.

TYPE: Series

label1

First acceptable binary value.

TYPE: int DEFAULT: 0

label2

Second acceptable binary value.

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION
str

An error message, if any (default is an empty string)

Source code in cnb_tools/validation_toolkit.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def check_binary_values(
    pred_col: Series, label1: int = 0, label2: int = 1
) -> str:
    """Verify that a column holds only the two permitted labels.

    Tip: Example Use Case
      A prediction is either 0 (no disease present) or 1 (disease present).

    Args:
        pred_col: Dataframe column whose values are validated.
        label1: One of the two accepted values.
        label2: The other accepted value.

    Returns:
        An error message naming the column when any value differs from
        both labels, otherwise an empty string.

    """
    accepted = [label1, label2]
    is_accepted = pred_col.isin(accepted)
    if is_accepted.all():
        return ""
    return f"'{pred_col.name}' values should only be {label1} or {label2}."

check_duplicate_keys(pred_col, verbose=False)

Check for duplicate keys.

Example Use Case

There is exactly one prediction for a patient / sample / etc.

PARAMETER DESCRIPTION
pred_col

Dataframe column containing the keys to validate

TYPE: Series

verbose

Include list of affected keys in error message

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
str

An error message, if any (default is an empty string)

Source code in cnb_tools/validation_toolkit.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def check_duplicate_keys(pred_col: Series, verbose: bool = False) -> str:
    """Report keys that occur more than once in a column.

    Tip: Example Use Case
      Each patient / sample / etc. has exactly one prediction.

    Args:
      pred_col: Dataframe column containing the keys to validate
      verbose: Append the list of repeated keys to the error message

    Returns:
       An error message with the number of repeated entries (every
       occurrence after the first counts), or an empty string if all
       keys are unique

    """
    repeat_mask = pred_col.duplicated()
    if not repeat_mask.any():
        return ""

    message = f"Found {repeat_mask.sum()} duplicate ID(s)"
    if verbose:
        repeated_keys = pred_col[repeat_mask].to_list()
        message += f": {repeated_keys}"
    return message

check_missing_keys(gold_col, pred_col, verbose=False)

Check for missing keys.

Example Use Case

There is at least one prediction for every patient / sample / etc.

PARAMETER DESCRIPTION
gold_col

Dataframe column containing the true keys

TYPE: Series

pred_col

Dataframe column containing the keys to validate

TYPE: Series

verbose

Include list of affected keys in error message

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
str

An error message, if any (default is an empty string)

Source code in cnb_tools/validation_toolkit.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def check_missing_keys(
    gold_col: Series, pred_col: Series, verbose: bool = False
) -> str:
    """Check for missing keys.

    Tip: Example Use Case
      There is at least one prediction for every patient / sample / etc.

    Args:
      gold_col: Dataframe column containing the true keys
      pred_col: Dataframe column containing the keys to validate
      verbose: Include list of affected keys in error message

    Returns:
       An error message with the number of entries in `gold_col` that do
       not appear in `pred_col`, or an empty string if none are missing

    """
    error = ""
    missing_ids = gold_col[~gold_col.isin(pred_col)]
    # Test for emptiness rather than `.any()`: `.any()` evaluates the
    # truthiness of the key values themselves, so a missing key of 0, ""
    # or False would otherwise go unreported.
    if not missing_ids.empty:
        error = f"Found {missing_ids.shape[0]} missing ID(s)"

        if verbose:
            error += f": {missing_ids.to_list()}"
    return error

check_nan_values(pred_col)

Check for NaN values.

Example Use Case

Predictions must not be null / None.

PARAMETER DESCRIPTION
pred_col

Dataframe column containing the values to validate

TYPE: Series

RETURNS DESCRIPTION
str

An error message, if any (default is an empty string)

Source code in cnb_tools/validation_toolkit.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def check_nan_values(pred_col: Series) -> str:
    """Report how many entries of a column are NaN.

    Tip: Example Use Case
      Predictions must not be null / None.

    Args:
      pred_col: Dataframe column containing the values to validate

    Returns:
       An error message naming the column and the number of NaN entries,
       or an empty string if there are none

    """
    null_mask = pred_col.isna()
    total_nulls = null_mask.sum()
    if not total_nulls:
        return ""
    return f"'{pred_col.name}' column contains {total_nulls} NaN value(s)."

check_unknown_keys(gold_col, pred_col, verbose=False)

Check for unknown keys.

Example Use Case

There are no predictions without a corresponding ground-truth value.

PARAMETER DESCRIPTION
gold_col

Dataframe column containing the true keys

TYPE: Series

pred_col

Dataframe column containing the keys to validate

TYPE: Series

verbose

Include list of affected keys in error message

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
str

An error message, if any (default is an empty string)

Source code in cnb_tools/validation_toolkit.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def check_unknown_keys(
    gold_col: Series, pred_col: Series, verbose: bool = False
) -> str:
    """Check for unknown keys.

    Tip: Example Use Case
      There are no predictions without a corresponding ground-truth value.

    Args:
      gold_col: Dataframe column containing the true keys
      pred_col: Dataframe column containing the keys to validate
      verbose: Include list of affected keys in error message

    Returns:
       An error message with the number of entries in `pred_col` that do
       not appear in `gold_col`, or an empty string if there are none

    """
    error = ""
    unknown_ids = pred_col[~pred_col.isin(gold_col)]
    # Test for emptiness rather than `.any()`: `.any()` evaluates the
    # truthiness of the key values themselves, so an unknown key of 0, ""
    # or False would otherwise go unreported.
    if not unknown_ids.empty:
        error = f"Found {unknown_ids.shape[0]} unknown ID(s)"

        if verbose:
            error += f": {unknown_ids.to_list()}"
    return error

check_values_range(pred_col, min_val=0, max_val=1)

Check that values are between min and max values, inclusive.

Example Use Case

Predictions must be a probability from 0 (disease not likely) to 1 (disease likely).

PARAMETER DESCRIPTION
pred_col

Dataframe column containing the values to validate

TYPE: Series

min_val

Lower limit of range

TYPE: Union[int, float] DEFAULT: 0

max_val

Upper limit of range

TYPE: Union[int, float] DEFAULT: 1

RETURNS DESCRIPTION
str

An error message, if any (default is an empty string)

Source code in cnb_tools/validation_toolkit.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def check_values_range(
    pred_col: Series,
    min_val: Union[int, float] = 0,
    max_val: Union[int, float] = 1
) -> str:
    """Verify that no value falls below the minimum or above the maximum.

    Both limits are inclusive: values equal to `min_val` or `max_val`
    are accepted.

    Tip: Example Use Case
      Predictions must be a probability from 0 (disease not likely) to 1
      (disease likely).

    Args:
      pred_col: Dataframe column containing the values to validate
      min_val: Smallest accepted value
      max_val: Largest accepted value

    Returns:
       An error message naming the column and the accepted range, or an
       empty string if every value is within range

    """
    # The upper-bound comparison is only evaluated when nothing is below
    # the lower bound.
    if (pred_col < min_val).any():
        out_of_bounds = True
    else:
        out_of_bounds = (pred_col > max_val).any()

    if not out_of_bounds:
        return ""
    return f"'{pred_col.name}' values should be between [{min_val}, {max_val}]."