brent.sklearn module

The sklearn module contains objects that can be used in scikit-learn pipelines. In particular, it offers a classifier as well as an imputer.

Source code
"""
The `sklearn` module contains objects that can be used in scikit-learn pipelines.
In particular, it offers a classifier as well as an imputer.
"""

from brent.graph import DAG
from brent.query import Query

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin

class BrentClassifier(BaseEstimator, ClassifierMixin):
    """
    A classifier that allows you to define your own model via a DAG.
    """

    def __init__(self, dag: DAG, to_predict: str):
        """
        Construct an estimator based on a DAG. You need to specify the DAG as well as
        the column name that requires prediction.

        ## Inputs

        - **dag**: DAG object that describes the dag
        - **to_predict**: the column to predict

        ## Output

        A classifier that can be used in scikit-learn pipelines.

        ## Raises

        `ValueError` if `to_predict` is not a column of `dag.df`.
        """
        if to_predict not in dag.df.columns:
            raise ValueError(f"column {to_predict} not found in DAG {dag}")
        self.dag = dag
        self.to_predict = to_predict
        # Every column of the DAG's dataframe except the target is handed to the
        # query as evidence at prediction time.
        self.to_use = [col for col in self.dag.df.columns if col != self.to_predict]
        # Not read anywhere in this class; kept so the attribute stays available.
        self.query = None
        # Number of distinct target values; this is the width of `predict_proba`.
        self.k = self.dag.df[to_predict].nunique()

    def _check_dataframe(self, X):
        """
        Raise a `ValueError` for the first node of the DAG that is not a column of `X`.
        """
        for node in self.dag.nodes:
            if node not in X.columns:
                raise ValueError(f"column {node} not in dataframe but in DAG")

    def fit(self, X: pd.DataFrame, y):
        """
        "Train" the estimator. The DAG object is already pretrained, so nothing is
        learned here and neither `X` nor `y` is inspected.

        ## Inputs

        - **X**: a dataframe to be used
        - **y**: ignored but required by the api

        ## Output

        The estimator itself, with `classes_` set, so it can be used in
        scikit-learn pipelines.
        """
        # `predict` returns column indices 0..k-1 of the probability matrix, so
        # those indices are the labels this estimator emits. Exposing them as
        # `classes_` follows the scikit-learn convention for fitted classifiers.
        self.classes_ = np.arange(self.k)
        return self

    def predict(self, X):
        """
        Predict the class.

        ## Inputs

        - **X**: a dataframe to be used

        ## Output

        A numpy array with, for every row, the column index of the highest
        probability returned by `predict_proba`.
        """
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        """
        Predict the probabilities for all classes.

        ## Inputs

        - **X**: a dataframe to be used; it must contain every column in `self.to_use`

        ## Output

        A numpy array (num_rows, num_classes) containing the predicted probabilities.
        Target values that do not show up in an inferred table keep probability 0.
        """
        predictions = np.zeros((X.shape[0], self.k))
        # Resetting the index makes the row labels usable as positions in `predictions`.
        evidence = X[self.to_use].reset_index(drop=True)
        for idx, row in evidence.iterrows():
            query = Query(dag=self.dag, given=row.to_dict())
            table = query.infer(give_table=True).sort_values(self.to_predict)
            # The target value itself is used as the column index, so the target
            # is expected to be encoded as integers 0..k-1.
            for label, prob in zip(table[self.to_predict], table['prob']):
                predictions[idx, int(label)] = prob
        return predictions


class BrentClassifier (ancestors: sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin)

A classifier that allows you to define your own model via a DAG.

Source code
class BrentClassifier(BaseEstimator, ClassifierMixin):
    """
    A classifier that allows you to define your own model via a DAG.
    """

    def __init__(self, dag: DAG, to_predict: str):
        """
        Construct an estimator based on a DAG. You need to specify the DAG as well as
        the column name that requires prediction.

        ## Inputs

        - **dag**: DAG object that describes the dag
        - **to_predict**: the column to predict

        ## Output

        A classifier that can be used in scikit-learn pipelines.

        ## Raises

        `ValueError` if `to_predict` is not a column of `dag.df`.
        """
        if to_predict not in dag.df.columns:
            raise ValueError(f"column {to_predict} not found in DAG {dag}")
        self.dag = dag
        self.to_predict = to_predict
        # Every column of the DAG's dataframe except the target is handed to the
        # query as evidence at prediction time.
        self.to_use = [col for col in self.dag.df.columns if col != self.to_predict]
        # Not read anywhere in this class; kept so the attribute stays available.
        self.query = None
        # Number of distinct target values; this is the width of `predict_proba`.
        self.k = self.dag.df[to_predict].nunique()

    def _check_dataframe(self, X):
        """
        Raise a `ValueError` for the first node of the DAG that is not a column of `X`.
        """
        for node in self.dag.nodes:
            if node not in X.columns:
                raise ValueError(f"column {node} not in dataframe but in DAG")

    def fit(self, X: pd.DataFrame, y):
        """
        "Train" the estimator. The DAG object is already pretrained, so nothing is
        learned here and neither `X` nor `y` is inspected.

        ## Inputs

        - **X**: a dataframe to be used
        - **y**: ignored but required by the api

        ## Output

        The estimator itself, with `classes_` set, so it can be used in
        scikit-learn pipelines.
        """
        # `predict` returns column indices 0..k-1 of the probability matrix, so
        # those indices are the labels this estimator emits. Exposing them as
        # `classes_` follows the scikit-learn convention for fitted classifiers.
        self.classes_ = np.arange(self.k)
        return self

    def predict(self, X):
        """
        Predict the class.

        ## Inputs

        - **X**: a dataframe to be used

        ## Output

        A numpy array with, for every row, the column index of the highest
        probability returned by `predict_proba`.
        """
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        """
        Predict the probabilities for all classes.

        ## Inputs

        - **X**: a dataframe to be used; it must contain every column in `self.to_use`

        ## Output

        A numpy array (num_rows, num_classes) containing the predicted probabilities.
        Target values that do not show up in an inferred table keep probability 0.
        """
        predictions = np.zeros((X.shape[0], self.k))
        # Resetting the index makes the row labels usable as positions in `predictions`.
        evidence = X[self.to_use].reset_index(drop=True)
        for idx, row in evidence.iterrows():
            query = Query(dag=self.dag, given=row.to_dict())
            table = query.infer(give_table=True).sort_values(self.to_predict)
            # The target value itself is used as the column index, so the target
            # is expected to be encoded as integers 0..k-1.
            for label, prob in zip(table[self.to_predict], table['prob']):
                predictions[idx, int(label)] = prob
        return predictions


def __init__(self, dag, to_predict)

Construct an estimator based on a DAG. You need to specify the DAG as well as the column name that requires prediction.


  • dag: DAG object that describes the dag
  • to_predict: the column to predict


A classifier that can be used in scikit-learn pipelines.

Source code
def __init__(self, dag: DAG, to_predict: str):
    """
    Construct an estimator based on a DAG. You need to specify the DAG as well as
    the column name that requires prediction.

    ## Inputs

    - **dag**: DAG object that describes the dag
    - **to_predict**: the column to predict

    ## Output

    A classifier that can be used in scikit-learn pipelines.

    ## Raises

    `ValueError` if `to_predict` is not a column of `dag.df`.
    """
    if to_predict not in dag.df.columns:
        raise ValueError(f"column {to_predict} not found in DAG {dag}")
    self.dag = dag
    self.to_predict = to_predict
    # Every column of the DAG's dataframe except the target is kept as a
    # feature column.
    self.to_use = [col for col in self.dag.df.columns if col != self.to_predict]
    # Initialised to None here; nothing in this constructor fills it in.
    self.query = None
    # Number of distinct values of the target column.
    self.k = self.dag.df[to_predict].nunique()
def fit(self, X, y)

"Train" the estimator. This step is largely a formality since the DAG object is already pretrained. Note that the current implementation does not validate the dataframe given in X against the graph.


  • X: a dataframe to be used
  • y: ignored but required by the api


A "trained" classifier that can be used in scikit-learn pipelines.

Source code
def fit(self, X: pd.DataFrame, y):
    """
    "Train" the estimator. The DAG object is already pretrained, so nothing is
    learned here and neither `X` nor `y` is inspected.

    ## Inputs

    - **X**: a dataframe to be used
    - **y**: ignored but required by the api

    ## Output

    The estimator itself, with `classes_` set, so it can be used in
    scikit-learn pipelines.
    """
    # Labels are exposed as the integers 0..k-1, one per distinct target value
    # counted in `self.k`; scikit-learn expects fitted classifiers to carry
    # their label set in `classes_`.
    self.classes_ = np.arange(self.k)
    return self
def predict(self, X)

Predict the class.


  • X: a dataframe to be used


A numpy array containing the predicted classes.

Source code
def predict(self, X):
    """
    Predict the class.

    ## Inputs

    - **X**: a dataframe to be used

    ## Output

    A numpy array with, for every row, the column index of the highest
    probability returned by `predict_proba`. Ties resolve to the lowest index.
    """
    probabilities = self.predict_proba(X)
    return np.argmax(probabilities, axis=1)
def predict_proba(self, X)

Predict the probabilities for all classes


  • X: a dataframe to be used


A numpy array of shape (num_rows, num_classes) containing the predicted probabilities.

Source code
def predict_proba(self, X):
    """
    Predict the probabilities for all classes.

    ## Inputs

    - **X**: a dataframe to be used; it must contain every column in `self.to_use`

    ## Output

    A numpy array (num_rows, num_classes) containing the predicted probabilities.
    Target values that do not show up in an inferred table keep probability 0.
    """
    predictions = np.zeros((X.shape[0], self.k))
    # Resetting the index makes the row labels usable as positions in `predictions`.
    evidence = X[self.to_use].reset_index(drop=True)
    for idx, row in evidence.iterrows():
        query = Query(dag=self.dag, given=row.to_dict())
        table = query.infer(give_table=True).sort_values(self.to_predict)
        # The target value itself is used as the column index, so the target
        # is expected to be encoded as integers 0..k-1.
        for label, prob in zip(table[self.to_predict], table['prob']):
            predictions[idx, int(label)] = prob
    return predictions