Module brent.sklearn
The sklearn module contains objects that can be used in scikit-learn pipelines.
In particular it offers a classifier as well as an imputer.
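For example, the classifier can be dropped straight into a scikit-learn Pipeline. The sketch below is not part of this module: the DAG construction (DAG(df).add_edge(...)) is an assumption based on the brent README and may need adapting, while the rest only uses the API documented on this page.

# Minimal sketch: BrentClassifier inside a scikit-learn Pipeline.
# The DAG construction below is an assumption (chainable add_edge as in the
# brent README); adapt it to however your DAG is actually built.
import pandas as pd
from sklearn.pipeline import Pipeline

from brent.graph import DAG
from brent.sklearn import BrentClassifier

df = pd.DataFrame({
    "smoking": [0, 1, 1, 0, 1, 0],
    "tar":     [0, 1, 1, 0, 0, 0],
    "cancer":  [0, 1, 0, 0, 1, 0],
})

dag = DAG(df).add_edge("smoking", "tar").add_edge("tar", "cancer")  # assumed API

pipe = Pipeline([("clf", BrentClassifier(dag=dag, to_predict="cancer"))])
pipe.fit(df, df["cancer"])   # fit only validates that the DAG nodes appear as columns
print(pipe.predict(df))      # one integer class label per row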
Source code
"""
The `sklearn` module contains objects that be used in scikit-learn pipelines.
In particulate it offers a classifier as well as an imputer.
"""
from brent.graph import DAG
from brent.query import Query
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
class BrentClassifier(BaseEstimator, ClassifierMixin):
"""
A classifier that allows you to define your own model via a DAG.
"""
def __init__(self, dag: DAG, to_predict: str):
"""
Construct an estimator based on a DAG. You need to specify the DAG as well as
the column name that requires prediction.
## Inputs
- **dag**: DAG object that describes the dag
- **to_predict**: the column to predict
## Output
A classifier that can be used in scikit-learn pipelines.
"""
if to_predict not in dag.df.columns:
raise ValueError(f"column {to_predict} not found in DAG {dag}")
self.dag = dag
self.to_predict = to_predict
self.to_use = [_ for _ in self.dag.df.columns if _ != self.to_predict]
self.query = None
self.k = self.dag.df[to_predict].nunique()
def _check_dataframe(self, X):
for node in self.dag.nodes:
if node not in X.columns:
raise ValueError(f"column {node} not in dataframe but in DAG")
def fit(self, X: pd.DataFrame, y):
"""
Make the estimator "train". This is a bit verbose since the DAG object
is already pretrained. We mainly check if the supplied dataframe given in `X`
is consistent with the graph.
## Inputs
- **X**: a dataframe to be used
- **y**: ignored but required by the api
## Output
A "trained" classifier that can be used in scikit-learn pipelines.
"""
self._check_dataframe(X)
return self
def predict(self, X):
"""
Predict the class.
## Inputs
- **X**: a dataframe to be used
## Output
A numpy array containing the predicted classes.
"""
return np.argmax(self.predict_proba(X), axis=1)
def predict_proba(self, X):
"""
Predict the probabilities for all classes
## Inputs
- **X**: a dataframe to be used
## Output
A numpy array (num_rows, num_classes) containing the predicted classes.
"""
self._check_dataframe(X)
predictions = np.zeros((X.shape[0], self.k))
for idx, row in X[self.to_use].reset_index(drop=True).iterrows():
query = Query(dag=self.dag, given=row.to_dict())
table = query.infer(give_table=True).sort_values(self.to_predict)[[self.to_predict, 'prob']]
for i, r in table.iterrows():
k = r[self.to_predict]
predictions[idx, int(k)] = r['prob']
return predictions
Classes
class BrentClassifier (ancestors: sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin)
A classifier that allows you to define your own model via a DAG.
Methods
def __init__(self, dag, to_predict)
Construct an estimator based on a DAG. You need to specify the DAG as well as the column name that requires prediction.
Inputs
- dag: DAG object that describes the graphical model
- to_predict: the column to predict
Output
A classifier that can be used in scikit-learn pipelines.
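As an illustration of the check above: constructing the estimator with a target column that does not exist in the DAG's dataframe fails immediately. A small sketch, reusing the dag object from the pipeline example at the top of this page:

clf = BrentClassifier(dag=dag, to_predict="cancer")   # fine: "cancer" is a column of dag.df
try:
    BrentClassifier(dag=dag, to_predict="cancr")      # typo: no such column in dag.df
except ValueError as err:
    print(err)                                        # column cancr not found in DAG ...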
def fit(self, X, y)
Make the estimator "train". This is mostly a formality since the DAG object is already trained. We mainly check that the dataframe supplied in X is consistent with the graph.
Inputs
- X: a dataframe to be used
- y: ignored but required by the API
Output
A "trained" classifier that can be used in scikit-learn pipelines.
def predict(self, X)
Predict the class.
Inputs
- X: a dataframe to be used
Output
A numpy array containing the predicted classes.
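Since predict is an argmax over predict_proba, the result is one integer label per input row. Continuing the sketch above:

labels = clf.predict(df)
print(labels.shape)   # (6,): one predicted class per row of df
print(labels.dtype)   # an integer dtype, since labels are argmax column indices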
def predict_proba(self, X)
Predict the probabilities for all classes.
Inputs
- X: a dataframe to be used
Output
A numpy array of shape (num_rows, num_classes) containing the predicted class probabilities.
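The number of columns equals the number of distinct values of the target column (df[to_predict].nunique()), and the class labels are assumed to be integers 0..k-1 since they are used as column indices. A sketch, continuing the example above:

probs = clf.predict_proba(df)
print(probs.shape)         # (6, 2): one row per observation, one column per class
print(probs.sum(axis=1))   # each row should sum to roughly 1 when infer() returns a full distribution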