benchmark

__parse_check_p_n_y(p, n, y)

Parses and checks n, y and p, returns (inferred) n.

Source code in doubtlab/benchmark.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
def __parse_check_p_n_y(p, n, y):
    """
    Parses and checks `n`, `y` and `p`, returns (inferred) `n`.

    Shared validation for `shuffle_labels` and `flip_labels`.

    Arguments:
        p: proportion of labels to alter; when truthy it overrides `n`
        n: explicit number of labels to alter
        y: array of labels, only its length is used to infer `n` from `p`

    Raises:
        ValueError: if `p` is outside its valid range, if neither `n` nor `p`
            was given, or if fewer than 2 labels would be altered.
    """
    if p:
        if p < 0:
            raise ValueError("Probability value `p` must be larger than 0.")
        if p > 1:
            raise ValueError("Probability value `p` must be less than 1.")
        n = round(len(y) * p)
    if n is None:
        # Use an explicit `None` check: a tiny `p` that rounds down to `n = 0`
        # should fall through to the "at least 2 values" error below, not
        # claim that no argument was given. This helper also serves
        # `flip_labels`, so the message no longer hardcodes `shuffle_labels`.
        raise ValueError("Either `n` or `p` must be given.")
    if n <= 1:
        raise ValueError("Must shuffle at least 2 values. Increase `n` or `p`.")
    return n

calculate_precision_recall_at_k(predicate_df, idx_flip, max_k=100, give_random=False, give_ensemble=True)

Calculates precision/recall at k values for flipped label experiments.

Returns a tidy dataframe with the "precision at k" and "recall at k" statistics, ready for plotting.

Parameters:

Name Type Description Default
predicate_df

the dataframe with predicates from ensemble.get_predicates

required
idx_flip

array that indicates if labels are wrong

required
max_k

the maximum value for k to consider

100
give_random

plot the "at k" statistics for the randomly selected lower bound

False
give_ensemble

plot the "at k" statistics from the reason ensemble

True
Source code in doubtlab/benchmark.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def calculate_precision_recall_at_k(
    predicate_df, idx_flip, max_k=100, give_random=False, give_ensemble=True
):
    """
    Calculates precision/recall at `k` values for flipped label experiments.

    Returns a tidy dataframe with columns `k`, `setting`, `variable` and
    `value`, where `variable` is `"recall_at_k"` or `"precision_at_k"` and
    `setting` is `"ensemble"` or `"random"`. Note that `k` runs from 1 up to,
    and excluding, `max_k`.

    Arguments:
        predicate_df: the dataframe with predicates from `ensemble.get_predicates`
        idx_flip: array of 0/1 values that indicates if labels are wrong
        max_k: the (exclusive) maximum value for `k` to consider
        give_random: include the "at k" statistics for the randomly selected lower bound
        give_ensemble: include the "at k" statistics from the reason ensemble
    """
    # Combine the predicates with the flipped-label indicator and sort so that
    # the rows the ensemble doubts the most come first.
    df = predicate_df.assign(
        s=lambda d: d[[c for c in d.columns if "predicate" in c]].sum(axis=1),
        flipped=idx_flip,
    ).sort_values("s", ascending=False)

    # Next we calculate the precision/recall at k values. The totals do not
    # depend on `k`, so compute them once outside the loop.
    total_flipped = df["flipped"].sum()
    base_rate = df["flipped"].mean()
    data = []
    for k in range(1, max_k):
        # `flipped` holds 0/1 values, so a plain sum counts the hits in the top-k.
        hits = df["flipped"][:k].sum()
        data.append(
            {
                "recall_at_k": hits / total_flipped,
                "precision_at_k": hits / k,
                "k": k,
                "setting": "ensemble",
            }
        )
        # A random ranking finds flipped labels at the base rate.
        data.append(
            {
                "recall_at_k": base_rate * k / total_flipped,
                "precision_at_k": base_rate,
                "k": k,
                "setting": "random",
            }
        )
    result = pd.DataFrame(data).melt(["k", "setting"])
    # Give the user the option to only return a subset of the settings.
    if not give_random:
        result = result.loc[lambda d: d["setting"] != "random"]
    if not give_ensemble:
        result = result.loc[lambda d: d["setting"] != "ensemble"]

    # Return the data in a tidy format.
    return result

flip_labels(y, random_seed=42, n=None, p=None)

Flips subset of labels for benchmarking. Recommended for classification.

Either p or n should be given. Returns a (y_out, indicator) tuple.

Parameters:

Name Type Description Default
y

array of labels

required
random_seed

random seed

42
n

number of labels to flip

None
p

percentage of labels to flip

None

Usage:

import numpy as np
from doubtlab.benchmark import flip_labels

# Let's pretend these are the actual labels
y = np.random.randint(0, 3, 10000)

# You now have some shuffled labels and an indicator
y_out, indicator = flip_labels(y, n=100)
Source code in doubtlab/benchmark.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def flip_labels(y, random_seed=42, n=None, p=None):
    """
    Flips subset of labels for benchmarking. Recommended for classification.

    Either `p` or `n` should be given. Returns a `(y_out, indicator)` tuple,
    where `indicator` is an integer array with 1 marking each changed label.

    Arguments:
        y: array of labels
        random_seed: random seed
        n: number of labels to flip
        p: percentage of labels to flip

    Usage:

    ```python
    import numpy as np
    from doubtlab.benchmark import flip_labels

    # Let's pretend these are the actual labels
    y = np.random.randint(0, 3, 10000)

    # You now have some shuffled labels and an indicator
    y_out, indicator = flip_labels(y, n=100)
    ```
    """
    np.random.seed(random_seed)
    y = np.array(y)
    n = __parse_check_p_n_y(p=p, n=n, y=y)

    y_out = y.copy()
    classes = np.unique(y)
    if len(classes) == 1:
        raise ValueError("Need more than 1 class in `y`.")

    # For each selected position, resample uniformly from the *other* classes
    # so that every chosen label is guaranteed to actually change.
    idx = np.random.choice(np.arange(y.shape[0]), size=n, replace=False)
    y_out[idx] = [np.random.choice(classes[classes != _]) for _ in y_out[idx]]
    return y_out, (y != y_out).astype(int)

plot_precision_recall_at_k(predicate_df, idx_flip, max_k=100, give_random=True, give_ensemble=True)

Plots precision/recall at k values for flipped label experiments.

Returns an interactive altair visualisation. Make sure it is installed beforehand.

Parameters:

Name Type Description Default
predicate_df

the dataframe with predicates from ensemble.get_predicates

required
idx_flip

array that indicates if labels are wrong

required
max_k

the maximum value for k to consider

100
give_random

plot the "at k" statistics for the randomly selected lower bound

True
give_ensemble

plot the "at k" statistics from the reason ensemble

True
Source code in doubtlab/benchmark.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def plot_precision_recall_at_k(
    predicate_df, idx_flip, max_k=100, give_random=True, give_ensemble=True
):
    """
    Plots precision/recall at `k` values for flipped label experiments.

    Returns an interactive altair visualisation. Make sure it is installed beforehand.

    Arguments:
        predicate_df: the dataframe with predicates from `ensemble.get_predicates`
        idx_flip: array that indicates if labels are wrong
        max_k: the maximum value for `k` to consider
        give_random: plot the "at k" statistics for the randomly selected lower bound
        give_ensemble: plot the "at k" statistics from the reason ensemble
    """
    # Imported lazily so that altair stays an optional dependency.
    import altair as alt

    # Lift altair's default cap on the number of rows it will render.
    alt.data_transformers.disable_max_rows()

    # Delegate the number crunching to produce a tidy dataframe for plotting.
    plot_df = calculate_precision_recall_at_k(
        predicate_df=predicate_df,
        idx_flip=idx_flip,
        max_k=max_k,
        give_random=give_random,
        give_ensemble=give_ensemble,
    )

    # Build the chart step by step: one line per metric, dashed per setting.
    chart = alt.Chart(plot_df).mark_line()
    encoded = chart.encode(x="k", y="value", color="variable", strokeDash="setting")
    return encoded.interactive()

shuffle_labels(y, random_seed=42, n=None, p=None)

Shuffles subset of labels for benchmarking. Recommended for regression.

Either p or n should be given. Returns a (y_out, indicator) tuple.

Parameters:

Name Type Description Default
y

array of labels

required
random_seed

random seed

42
n

number of labels to flip

None
p

percentage of labels to flip

None

Usage:

import numpy as np
from doubtlab.benchmark import shuffle_labels

# Let's pretend these are the actual labels
y = np.random.normal(0, 1, 10000)

# You now have some shuffled labels and an indicator
y_out, indicator = shuffle_labels(y, n=100)
Source code in doubtlab/benchmark.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def shuffle_labels(y, random_seed=42, n=None, p=None):
    """
    Shuffles subset of labels for benchmarking. Recommended for regression.

    Either `p` or `n` should be given. Returns a `(y_out, indicator)` tuple,
    where `indicator` is an integer array with 1 marking each changed label.

    Arguments:
        y: array of labels
        random_seed: random seed
        n: number of labels to flip
        p: percentage of labels to flip

    Usage:

    ```python
    import numpy as np
    from doubtlab.benchmark import shuffle_labels

    # Let's pretend these are the actual labels
    y = np.random.normal(0, 1, 10000)

    # You now have some shuffled labels and an indicator
    y_out, indicator = shuffle_labels(y, n=100)
    ```
    """
    np.random.seed(random_seed)
    y = np.array(y)
    n = __parse_check_p_n_y(p=p, n=n, y=y)

    y_out = y.copy()
    sample = np.random.choice(np.arange(y.shape[0]), size=n, replace=False)

    # Since `sample` is already in a random order, rotating it by one position
    # and reading the *label values* at the rotated indices gives every sampled
    # position a value taken from another sampled position.
    # BUG FIX: the previous code assigned the rotated indices themselves as
    # labels (`y_out[sample] = np.concatenate(...)`), which replaced regression
    # targets with row indices instead of shuffling the label values.
    y_out[sample] = y[np.concatenate([sample[1:], sample[0:1]])]
    return y_out, (y != y_out).astype(int)