from hulearn.preprocessing import *

PipeTransformer

This transformer allows you to define a function that will take in data and transform it however you like. You can specify keyword arguments that you can benchmark as well.

Parameters

Name Type Description Default
func the function that will transform the data required
**kwargs extra keyword arguments that will be passed to the function; these can be grid-searched {}

The functions that are passed need to be pickle-able. That means no lambda functions!

Usage:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

from hulearn.datasets import load_titanic
from hulearn.preprocessing import PipeTransformer


def preprocessing(dataf, n_char=True, gender=True):
    """
    Example feature-engineering step for the titanic dataframe.

    Works on a copy of `dataf`, optionally adds a `nchar` column (length of
    `name`) and a `gender` column (1.0 where `sex` equals 'male', else 0.0),
    and finally drops the raw `name` and `sex` columns.
    """
    # Plain column assignment is used on purpose instead of `.assign()` with
    # lambdas: lambdas cannot be pickled, and GridSearchCV requires pickling.
    result = dataf.copy()
    if n_char:
        name_lengths = result['name'].str.len()
        result['nchar'] = name_lengths
    if gender:
        is_male = result['sex'] == 'male'
        result['gender'] = is_male.astype("float")
    return result.drop(columns=["name", "sex"])


# Load the titanic data as a dataframe and split off the `survived` target.
df = load_titanic(as_frame=True)
X, y = df.drop(columns=['survived']), df['survived']

# First step applies `preprocessing` with the given keyword arguments,
# second step is a Gaussian naive Bayes model.
pipe = Pipeline([
    ('prep', PipeTransformer(preprocessing, n_char=True, gender=True)),
    ('mod', GaussianNB())
])

# The keyword arguments handed to PipeTransformer are addressed in the grid
# as `prep__<argument name>`.
params = {
    "prep__n_char": [True, False],
    "prep__gender": [True, False]
}

# Search all combinations with 3-fold cross-validation, then show the
# parameter settings next to their mean test score.
grid = GridSearchCV(pipe, cv=3, param_grid=params).fit(X, y)
pd.DataFrame(grid.cv_results_)[['param_prep__gender', 'param_prep__n_char', 'mean_test_score']]

fit(self, X, y=None)

Show source code in preprocessing/pipetransformer.py
62
63
64
65
66
67
68
69
70
71
72
    def fit(self, X, y=None):
        """
        Fit the transformer.

        Calls the passed function once on `X` (with the stored keyword
        arguments) to confirm it runs without raising, then records the
        number of columns seen so `transform` can validate its input.

        Arguments:
            X: input data exposing a `.shape` attribute
            y: ignored, accepted for scikit-learn compatibility

        Returns:
            the fitted transformer itself
        """
        # Dry run only: the result is discarded, we just want errors to surface here.
        self.func(X, **self.kwargs)
        self.fitted_ = True
        # One-dimensional input has no column axis; store 0 in that case.
        if len(X.shape) == 1:
            self.ncol_ = 0
        else:
            self.ncol_ = X.shape[1]
        return self

Fit the transformer.

This transformer confirms that the passed function runs without error on the train set.

transform(self, X)

Show source code in preprocessing/pipetransformer.py
74
75
76
77
78
79
80
81
82
83
84
    def transform(self, X):
        """
        Transform the data using the passed function.

        Arguments:
            X: input data exposing a `.shape` attribute

        Returns:
            whatever the passed function returns for `X` and the stored keyword arguments

        Raises:
            ValueError: if the number of columns differs from what was seen during `fit`
        """
        check_is_fitted(self, ["fitted_", "ncol_"])
        # Mirror the bookkeeping done in `fit`: 1D input counts as zero columns.
        if len(X.shape) == 1:
            ncol = 0
        else:
            ncol = X.shape[1]
        if self.ncol_ == ncol:
            return self.func(X, **self.kwargs)
        raise ValueError(
            f"Reshape your data, there were {self.ncol_} features during training, now={ncol}."
        )

Transform the data using the passed function.

InteractivePreprocessor

This tool allows you to take a drawn model and use it as a featurizer.

Parameters

Name Type Description Default
json_desc chart data in dictionary form required
refit if True, you no longer need to call .fit(X, y) in order to .transform(X) True

fit(self, X, y=None)

Show source code in preprocessing/interactivepreprocessor.py
75
76
77
78
79
80
81
    def fit(self, X, y=None):
        """
        Fit the preprocessor.

        Mostly a formality: nothing is learned from `X` or `y`. The class
        names are read from the polygons of the first chart in `json_desc`
        and the instance is flagged as fitted.

        Returns:
            the fitted preprocessor itself
        """
        first_chart = self.json_desc[0]
        self.classes_ = list(first_chart["polygons"].keys())
        self.fitted_ = True
        return self

Fit the preprocessor. This is mostly a formality: it only records the class names found in the chart data.

from_json(path, refit=True) (classmethod)

Show source code in preprocessing/interactivepreprocessor.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
    @classmethod
    def from_json(cls, path, refit=True):
        """
        Load the preprocessor from json stored on disk.

        Arguments:
            path: path of the json file
            refit: if `True`, you no longer need to call `.fit(X, y)` in order to `.transform(X)`

        Returns:
            a new instance of the class this method is called on, built from the parsed chart data

        Usage:

        ```python
        from hulearn.preprocessing import InteractivePreprocessor

        InteractivePreprocessor.from_json("path/to/file.json")
        ```
        """
        # JSON is UTF-8 by specification, so don't rely on the platform's locale encoding.
        json_desc = json.loads(pathlib.Path(path).read_text(encoding="utf-8"))
        # Construct through `cls` so that subclasses get an instance of their own type.
        return cls(json_desc=json_desc, refit=refit)

Load the preprocessor from json stored on disk.

Parameters

Name Type Description Default
path path of the json file required
refit if True, you no longer need to call .fit(X, y) in order to .transform(X) True

Usage:

from hulearn.preprocessing import InteractivePreprocessor

InteractivePreprocessor.from_json("path/to/file.json")

pandas_pipe(self, dataf)

Show source code in preprocessing/interactivepreprocessor.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
    def pandas_pipe(self, dataf):
        """
        Use this transformer as part of a `.pipe()` method chain in pandas.

        The transformer is fitted on `dataf`, the resulting features are put in
        columns named after `self.classes_`, and those columns are appended to a
        copy of `dataf`. Both parts get a fresh 0..n-1 index before they are
        joined, so the original index is not kept.

        Usage:

        ```python
        import pandas as pd

        # Load in a dataframe from somewhere
        df = load_data(...)

        # Load in drawn chart data
        from hulearn.preprocessing import InteractivePreprocessor
        tfm = InteractivePreprocessor.from_json("path/file.json")

        # This adds new columns to the dataframe
        df.pipe(tfm.pandas_pipe)
        ```
        """
        # Fit first: `classes_` is only read after `fit` has run.
        features = self.fit(dataf).transform(dataf)
        feature_frame = pd.DataFrame(features, columns=self.classes_)
        original = dataf.copy().reset_index(drop=True)
        appended = feature_frame.reset_index(drop=True)
        return pd.concat([original, appended], axis=1)

Use this transformer as part of a .pipe() method chain in pandas.

Usage:

import numpy as np
import pandas as pd

# Load in a dataframe from somewhere
df = load_data(...)

# Load in drawn chart data
from hulearn.preprocessing import InteractivePreprocessor
tfm = InteractivePreprocessor.from_json("path/file.json")

# This adds new columns to the dataframe
df.pipe(tfm.pandas_pipe)

transform(self, X)

Show source code in preprocessing/interactivepreprocessor.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
    def transform(self, X):
        """
        Apply the counting/binning based on the drawings.

        Arguments:
            X: a pandas DataFrame, or an iterable of rows (e.g. a 2D numpy array);
               for non-DataFrame input the values of each row are keyed by position

        Returns:
            a numpy array built from one list per row of `X`, holding the hit
            count for every class in `self.classes_` (in that order)

        Usage:

        ```python
        from hulearn.preprocessing import InteractivePreprocessor
        tfm = InteractivePreprocessor(chart_data)
        X, y = load_data(...)

        # This doesn't do anything. But scikit-learn demands it.
        tfm.fit(X, y)

        # This counts, per class, the hits based on your drawings.
        tfm.transform(X)
        ```
        """
        # Because we're not doing anything during training, for convenience this
        # method can formally "fit" during the transform call. This is a scikit-learn
        # anti-pattern so we allow you to turn this off.
        # `getattr` with a default: `fitted_` is assigned in `fit`, so reading it
        # directly on an instance where it was never set would raise AttributeError
        # instead of triggering the promised automatic fit.
        if self.refit and not getattr(self, "fitted_", False):
            self.fit(X)
        check_is_fitted(self, ["classes_", "fitted_"])
        if isinstance(X, pd.DataFrame):
            # Each row becomes a {column name: value} mapping.
            hits = [
                self._count_hits(self.poly_data, row.to_dict())
                for _, row in X.iterrows()
            ]
        else:
            # No column names available: key each value by its position in the row.
            hits = [
                self._count_hits(self.poly_data, dict(enumerate(row))) for row in X
            ]
        # Order the counts per row by `classes_` so the columns are stable.
        return np.array([[h[c] for c in self.classes_] for h in hits])

Apply the counting/binning based on the drawings.

Usage:

from hulearn.preprocessing import InteractivePreprocessor
clf = InteractivePreprocessor(clf_data)
X, y = load_data(...)

# This doesn't do anything. But scikit-learn demands it.
clf.fit(X, y)

# This makes predictions, based on your drawn model.
clf.transform(X)