DoubtEnsemble

A pipeline to find bad labels.

Parameters:

Name Type Description Default
reasons

Keyword arguments given as (name, reason) pairs; each reason is a callable invoked as reason(X, y).

required

Usage:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import ProbaReason, WrongPredictionReason

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1_000)
model.fit(X, y)

reasons = {
    "proba": ProbaReason(model=model),
    "wrong_pred": WrongPredictionReason(model=model),
}

doubt = DoubtEnsemble(**reasons)
Source code in doubtlab/ensemble.py
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class DoubtEnsemble:
    """
    Bundles several named reasons into one pipeline for finding bad labels.

    Arguments:
        reasons: keyword arguments given as (name, reason)-pairs; every
            reason is called as `reason(X, y)`

    Usage:

    ```python
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    from doubtlab.ensemble import DoubtEnsemble
    from doubtlab.reason import ProbaReason, WrongPredictionReason

    X, y = load_iris(return_X_y=True)
    model = LogisticRegression(max_iter=1_000)
    model.fit(X, y)

    reasons = {
        "proba": ProbaReason(model=model),
        "wrong_pred": WrongPredictionReason(model=model),
    }

    doubt = DoubtEnsemble(**reasons)
    ```
    """

    def __init__(self, **reasons):
        self.reasons = reasons

    def get_predicates(self, X, y=None):
        """
        Builds a dataframe with one `predicate_<name>` column per reason.

        Rows are ordered by the sum of their predicate values, largest sum
        first, so the frame shows which reasons put a row near the top.

        Arguments:
            X: the `X` data to be processed
            y: the `y` data to be processed

        Usage:

        ```python
        from sklearn.datasets import load_iris
        from sklearn.linear_model import LogisticRegression

        from doubtlab.ensemble import DoubtEnsemble
        from doubtlab.reason import ProbaReason, WrongPredictionReason

        X, y = load_iris(return_X_y=True)
        model = LogisticRegression(max_iter=1_000)
        model.fit(X, y)

        reasons = {
            "proba": ProbaReason(model=model),
            "wrong_pred": WrongPredictionReason(model=model),
        }

        doubt = DoubtEnsemble(**reasons)

        predicates = doubt.get_predicates(X, y)
        ```
        """
        # Evaluate every reason once; the column order follows the order in
        # which the reasons were passed to the constructor.
        columns = {}
        for name, reason in self.reasons.items():
            columns[f"predicate_{name}"] = reason(X, y)
        frame = pd.DataFrame(columns)

        # Rank rows by their total predicate value, highest first.
        row_totals = frame.sum(axis=1)
        ranked_totals = row_totals.sort_values(ascending=False)
        return frame.reindex(ranked_totals.index)

    def get_indices(self, X, y=None):
        """
        Returns the index labels of the rows worth checking again.

        A row is kept when its predicate values sum to more than zero; the
        kept labels are ordered by that sum, largest first.

        Arguments:
            X: the `X` data to be processed
            y: the `y` data to be processed

        Usage:

        ```python
        from sklearn.datasets import load_iris
        from sklearn.linear_model import LogisticRegression

        from doubtlab.ensemble import DoubtEnsemble
        from doubtlab.reason import ProbaReason, WrongPredictionReason

        X, y = load_iris(return_X_y=True)
        model = LogisticRegression(max_iter=1_000)
        model.fit(X, y)

        reasons = {
            "proba": ProbaReason(model=model),
            "wrong_pred": WrongPredictionReason(model=model),
        }

        doubt = DoubtEnsemble(**reasons)

        indices = doubt.get_indices(X, y)
        ```
        """
        frame = self.get_predicates(X, y)

        # Only string-named columns containing "predicate" take part in the score.
        predicates = []
        for column in frame.columns:
            if isinstance(column, str) and "predicate" in column:
                predicates.append(column)

        subset = frame[predicates]
        scored = subset.assign(s=subset.sum(axis=1))
        ranked = scored.sort_values(["s"], ascending=False)
        doubtful = ranked.loc[ranked["s"] > 0]
        return np.array(doubtful.index)

get_indices(X, y=None)

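Calculates the indices worth checking again: rows whose predicate values sum to more than zero, ordered from the largest sum to the smallest.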

Parameters:

Name Type Description Default
X

the X data to be processed

required
y

the y data to be processed

None

Usage:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import ProbaReason, WrongPredictionReason

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1_000)
model.fit(X, y)

reasons = {
    "proba": ProbaReason(model=model),
    "wrong_pred": WrongPredictionReason(model=model),
}

doubt = DoubtEnsemble(**reasons)

indices = doubt.get_indices(X, y)
Source code in doubtlab/ensemble.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def get_indices(self, X, y=None):
    """
    Returns the index labels of the rows worth checking again.

    A row is kept when its predicate values sum to more than zero; the
    kept labels are ordered by that sum, largest first.

    Arguments:
        X: the `X` data to be processed
        y: the `y` data to be processed

    Usage:

    ```python
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    from doubtlab.ensemble import DoubtEnsemble
    from doubtlab.reason import ProbaReason, WrongPredictionReason

    X, y = load_iris(return_X_y=True)
    model = LogisticRegression(max_iter=1_000)
    model.fit(X, y)

    reasons = {
        "proba": ProbaReason(model=model),
        "wrong_pred": WrongPredictionReason(model=model),
    }

    doubt = DoubtEnsemble(**reasons)

    indices = doubt.get_indices(X, y)
    ```
    """
    frame = self.get_predicates(X, y)

    # Only string-named columns containing "predicate" take part in the score.
    predicates = []
    for column in frame.columns:
        if isinstance(column, str) and "predicate" in column:
            predicates.append(column)

    subset = frame[predicates]
    scored = subset.assign(s=subset.sum(axis=1))
    ranked = scored.sort_values(["s"], ascending=False)
    doubtful = ranked.loc[ranked["s"] > 0]
    return np.array(doubtful.index)

get_predicates(X, y=None)

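Returns a dataframe with one predicate column per reason, sorted so that the rows with the largest predicate sum come first; the columns show which reasons contributed to each row's position.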

Parameters:

Name Type Description Default
X

the X data to be processed

required
y

the y data to be processed

None

Usage:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import ProbaReason, WrongPredictionReason

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1_000)
model.fit(X, y)

reasons = {
    "proba": ProbaReason(model=model),
    "wrong_pred": WrongPredictionReason(model=model),
}

doubt = DoubtEnsemble(**reasons)

predicates = doubt.get_predicates(X, y)
Source code in doubtlab/ensemble.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def get_predicates(self, X, y=None):
    """
    Builds a dataframe with one `predicate_<name>` column per reason.

    Rows are ordered by the sum of their predicate values, largest sum
    first, so the frame shows which reasons put a row near the top.

    Arguments:
        X: the `X` data to be processed
        y: the `y` data to be processed

    Usage:

    ```python
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    from doubtlab.ensemble import DoubtEnsemble
    from doubtlab.reason import ProbaReason, WrongPredictionReason

    X, y = load_iris(return_X_y=True)
    model = LogisticRegression(max_iter=1_000)
    model.fit(X, y)

    reasons = {
        "proba": ProbaReason(model=model),
        "wrong_pred": WrongPredictionReason(model=model),
    }

    doubt = DoubtEnsemble(**reasons)

    predicates = doubt.get_predicates(X, y)
    ```
    """
    # Evaluate every reason once; the column order follows the order of
    # `self.reasons`.
    columns = {}
    for name, reason in self.reasons.items():
        columns[f"predicate_{name}"] = reason(X, y)
    frame = pd.DataFrame(columns)

    # Rank rows by their total predicate value, highest first.
    row_totals = frame.sum(axis=1)
    ranked_totals = row_totals.sort_values(ascending=False)
    return frame.reindex(ranked_totals.index)