Skip to content

Finetuners

If you're interested in how to use these components, you'll probably want to read this section first.

FeedForwardTuner

Bases: BaseEstimator, TransformerMixin

Create a feed forward model to finetune the embeddings towards a class.

Parameters:

Name Type Description Default
hidden_dim

The size of the hidden layer

50
n_epochs

The number of epochs to run the optimiser for

500
learning_rate

The learning rate of the feed forward model

0.01
batch_size

The number of rows in each training mini-batch

32
Source code in embetter/finetune/_forward.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
class FeedForwardTuner(BaseEstimator, TransformerMixin):
    """
    Create a feed forward model to finetune the embeddings towards a class.

    Arguments:
        hidden_dim: The size of the hidden layer
        n_epochs: The number of epochs to run the optimiser for
        learning_rate: The learning rate of the feed forward model
        batch_size: The number of rows in each training mini-batch
    """

    def __init__(
        self, hidden_dim=50, n_epochs=500, learning_rate=0.01, batch_size=32
    ) -> None:
        self.hidden_dim = hidden_dim
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.label_enc = LabelEncoder()

    def fit(self, X, y):
        """
        Fits the finetuner from scratch.

        Any model, optimiser and class information left over from an earlier
        `fit`/`partial_fit` call is discarded first, so repeated calls do not
        keep training the old network or reuse the old label encoding. Use
        `partial_fit` to continue training an existing model.

        Arguments:
            X: numpy array of embeddings, one row per example
            y: the class label of every row in `X`

        Returns:
            The fitted tuner itself.
        """
        for attr in ("_classes", "_model", "_optimizer", "_criterion"):
            if hasattr(self, attr):
                delattr(self, attr)
        return self.partial_fit(X, y, classes=np.unique(y))

    def partial_fit(self, X, y, classes=None):
        """
        Fits the finetuner using the partial_fit API.

        Arguments:
            X: numpy array of embeddings, one row per example
            y: the class label of every row in `X`
            classes: all class labels that can ever occur; needed on the first call

        Returns:
            The tuner itself.

        Raises:
            ValueError: when `classes` is missing on the first call.
        """
        if not hasattr(self, "_classes"):
            if classes is None:
                raise ValueError("`classes` must be provided for partial_fit")
            self.label_enc.fit(classes)
            # Keep the encoder's own view of the classes instead of asserting
            # that the caller passed them in the encoder's order: an `assert`
            # is stripped under `python -O` and needlessly rejected unsorted
            # input.
            self._classes = self.label_enc.classes_
        # Create a model if it does not exist yet.
        if not hasattr(self, "_model"):
            self._model = FeedForwardModel(
                X.shape[1], self.hidden_dim, len(self._classes)
            )
            self._optimizer = torch.optim.Adam(
                self._model.parameters(), lr=self.learning_rate
            )
            self._criterion = nn.CrossEntropyLoss()

        torch_X = torch.from_numpy(X).detach().float()
        torch_y = torch.from_numpy(self.label_enc.transform(y)).detach()

        dataset = torch.utils.data.TensorDataset(torch_X, torch_y)
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=True
        )

        for _ in range(self.n_epochs):
            for batch_X, batch_y in dataloader:
                self._optimizer.zero_grad()
                out = self._model(batch_X)
                loss = self._criterion(out, batch_y)
                loss.backward()
                self._optimizer.step()

        return self

    def transform(self, X, y=None):
        """Transforms the data according to the sklearn api by using the hidden layer."""
        Xt = torch.from_numpy(X).float().detach()
        # Inference only: no need to record an autograd graph.
        with torch.no_grad():
            return self._model.embed(Xt).detach().numpy()

ContrastiveTuner

Bases: BaseEstimator, TransformerMixin

Run a contrastive network to finetune the embeddings towards a class.

Parameters:

Name Type Description Default
hidden_dim

the dimension of the new learned representation

50
n_neg

number of negative example pairs to sample per positive item

3
epochs

number of epochs to use for training

20
learning_rate

learning rate of the contrastive network

0.001
Source code in embetter/finetune/_contrastive_tuner.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class ContrastiveTuner(BaseEstimator, TransformerMixin):
    """
    Run a contrastive network to finetune the embeddings towards a class.

    Arguments:
        hidden_dim: the dimension of the new learned representation
        n_neg: number of negative example pairs to sample per positive item
        epochs: number of epochs to use for training
        learning_rate: learning rate of the contrastive network
    """

    def __init__(self, hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001) -> None:
        self.learner = ContrastiveLearner(
            shape_out=hidden_dim,
            batch_size=256,
            learning_rate=learning_rate,
            epochs=epochs,
        )
        self.n_neg = n_neg
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.learning_rate = learning_rate

    def _sync_learner(self):
        """Copy the current hyperparameters onto the wrapped learner.

        The learner is built in `__init__`, so values changed afterwards via
        `set_params` (which is what grid search does on a clone) would
        otherwise never reach it.
        """
        self.learner.shape_out = self.hidden_dim
        self.learner.epochs = self.epochs
        self.learner.learning_rate = self.learning_rate

    def fit(self, X, y):
        """Fits the finetuner."""
        return self.partial_fit(X, y, classes=np.unique(y))

    def generate_batch(self, X_torch, y):
        """
        Generate a batch of pytorch pairs used for finetuning.

        Arguments:
            X_torch: tensor of embeddings, one row per example
            y: the label of every row, handed to `generate_pairs_batch`

        Returns:
            A tuple `(X1, X2, labels)`: the rows of `X_torch` selected by each
            pair's `i1` and `i2`, plus the pair labels as a long tensor.
        """
        pairs = generate_pairs_batch(y, n_neg=self.n_neg)
        # NOTE(review): assumes `pair.i1` / `pair.i2` are integer row indices,
        # as the original per-row `X_torch[pair.i1]` lookup implied.
        idx1 = torch.tensor([pair.i1 for pair in pairs], dtype=torch.long)
        idx2 = torch.tensor([pair.i2 for pair in pairs], dtype=torch.long)
        labels = torch.tensor([pair.label for pair in pairs], dtype=torch.long)
        # Select all rows in one indexing call instead of copying row by row.
        return X_torch[idx1], X_torch[idx2], labels

    def partial_fit(self, X, y, classes=None):
        """
        Fits the finetuner using the partial_fit API.

        Arguments:
            X: numpy array of embeddings, one row per example
            y: the label of every row in `X`
            classes: all class labels; needed on the first call

        Returns:
            The tuner itself.

        Raises:
            ValueError: when `classes` is missing on the first call.
        """
        if not hasattr(self, "_classes"):
            if classes is None:
                raise ValueError("`classes` must be provided for partial_fit")
            self._classes = classes

        X_torch = torch.from_numpy(X).detach().float()

        X1, X2, out = self.generate_batch(X_torch, y=y)
        self._sync_learner()
        # TODO: change this, we should just generate numpy internally not cast all over
        self.learner.fit(np.array(X1), np.array(X2), np.array(out))

        return self

    def transform(self, X, y=None):
        """Transforms the data according to the sklearn api by using the hidden layer."""
        return self.learner.transform(X)

ContrastiveLearner

A learner model that can finetune on pairs of data on top of numeric embeddings.

It's similar to the scikit-learn models that you're used to, but it accepts two inputs X1 and X2 and tries to predict if they are similar.

Parameters:

Name Type Description Default
shape_out int

the dimension of the learned output representation

300
batch_size int

the batch size during training

16
epochs int

the number of epochs to use while training

1
learning_rate

the learning rate of the optimiser used during training

2e-05

Usage:

from sentence_transformers import SentenceTransformer
from embetter.finetune import ContrastiveLearner
import random

sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
learner = ContrastiveLearner(shape_out=300)

def sample_generator(examples, n_neg=3):
    # A generator that assumes examples to be a dictionary of the shape
    # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
    # this is typically a function that's very custom to your use-case though
    labels = set()
    for ex in examples:
        for cat in ex['cats'].keys():
            if cat not in labels:
                labels = labels.union([cat])
    for label in labels:
        pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
        neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
        for ex in pos_examples:
            sample = random.choice(pos_examples)
            yield (ex['text'], sample['text'], 1.0)
            for n in range(n_neg):
                sample = random.choice(neg_examples)
                yield (ex['text'], sample['text'], 0.0)

learn_examples = sample_generator(examples, n_neg=3)
texts1, texts2, y = zip(*learn_examples)

# ContrastiveLearner works on numeric embeddings, so encode the texts first
X1 = sent_tfm.encode(list(texts1))
X2 = sent_tfm.encode(list(texts2))

# Learn a new representation
learner.fit(X1, X2, y)

# You now have an updated model that can create more "finetuned" embeddings
learner.transform(X1)
learner.transform(X2)

After a learner is done training it can be used inside of a scikit-learn pipeline as you normally would.

Source code in embetter/finetune/_constrastive_learn.py
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
class ContrastiveLearner:
    """
    A learner model that can finetune on pairs of data on top of numeric embeddings.

    It's similar to the scikit-learn models that you're used to, but it accepts
    two inputs `X1` and `X2` and tries to predict if they are similar.

    Arguments:
        shape_out: the dimension of the learned output representation
        batch_size: the batch size during training
        epochs: the number of epochs to use while training
        learning_rate: the learning rate of the optimiser used during training

    Usage:

    ```python
    from sentence_transformers import SentenceTransformer
    from embetter.finetune import ContrastiveLearner
    import random

    sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
    learner = ContrastiveLearner(shape_out=300)

    def sample_generator(examples, n_neg=3):
        # A generator that assumes examples to be a dictionary of the shape
        # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
        # this is typically a function that's very custom to your use-case though
        labels = set()
        for ex in examples:
            for cat in ex['cats'].keys():
                if cat not in labels:
                    labels = labels.union([cat])
        for label in labels:
            pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
            neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
            for ex in pos_examples:
                sample = random.choice(pos_examples)
                yield (ex['text'], sample['text'], 1.0)
                for n in range(n_neg):
                    sample = random.choice(neg_examples)
                    yield (ex['text'], sample['text'], 0.0)

    learn_examples = sample_generator(examples, n_neg=3)
    texts1, texts2, y = zip(*learn_examples)

    # ContrastiveLearner works on numeric embeddings, so encode the texts first
    X1 = sent_tfm.encode(list(texts1))
    X2 = sent_tfm.encode(list(texts2))

    # Learn a new representation
    learner.fit(X1, X2, y)

    # You now have an updated model that can create more "finetuned" embeddings
    learner.transform(X1)
    learner.transform(X2)
    ```

    After a learner is done training it can be used inside of a scikit-learn pipeline as you normally would.
    """

    def __init__(
        self,
        shape_out: int = 300,
        batch_size: int = 16,
        epochs: int = 1,
        learning_rate=2e-05,
    ):
        self.learning_rate = learning_rate
        self.network_ = None
        self.batch_size = batch_size
        self.epochs = epochs
        self.shape_out = shape_out

    def fit(self, X1, X2, y):
        """
        Train a fresh contrastive network on pairs of embeddings.

        A new network is created on every call, so earlier training is not
        continued.

        Arguments:
            X1: 2D array with the first embedding of every pair
            X2: 2D array with the second embedding of every pair
            y: the target similarity of every pair

        Returns:
            The fitted learner itself.
        """
        # Accept any array-like (e.g. nested lists) as well as numpy arrays.
        X1 = np.asarray(X1)
        X2 = np.asarray(X2)

        self.network_ = ContrastiveNetwork(
            shape_in=X1.shape[1], hidden_dim=self.shape_out
        )
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.network_.parameters(), lr=self.learning_rate)

        X1_torch = torch.from_numpy(X1).detach().float()
        X2_torch = torch.from_numpy(X2).detach().float()
        y_torch = torch.from_numpy(np.array(y)).detach().float()

        dataset = torch.utils.data.TensorDataset(X1_torch, X2_torch, y_torch)
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=True
        )

        for _ in range(self.epochs):  # loop over the dataset multiple times
            for batch_X1, batch_X2, batch_y in dataloader:
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                cos_sim = self.network_(batch_X1, batch_X2)
                loss = criterion(cos_sim, batch_y)
                loss.backward()
                optimizer.step()
        return self

    def transform(self, X, y=None):
        """Encode a single batch of inputs with the trained network."""
        X_torch = torch.from_numpy(np.asarray(X)).detach().float()
        return self.network_.embed(X_torch).detach().numpy()

    def predict(self, X1, X2):
        """Predicts the cosine similarity between the rows of `X1` and `X2`."""
        # `transform` returns numpy arrays; convert them back to tensors first.
        # NOTE(review): assumes `CosineSimilarity` is torch.nn.CosineSimilarity,
        # which only accepts tensors.
        emb1 = torch.as_tensor(self.transform(X1))
        emb2 = torch.as_tensor(self.transform(X2))
        return np.array(CosineSimilarity()(emb1, emb2))

    def to_disk(self, path):
        """
        Save the weights of the finetuned network to `path`.

        Raises:
            RuntimeError: when the learner has not been fitted yet.
        """
        if self.network_ is None:
            raise RuntimeError(
                "ContrastiveLearner must be fitted before calling `to_disk`."
            )
        # This class never sets `sent_tfm`, so the network itself is what gets
        # persisted. NOTE(review): assumes the network is a torch.nn.Module
        # (it exposes `.parameters()` and is callable in `fit`).
        torch.save(self.network_.state_dict(), path)

SbertLearner

A learner model that can finetune on pairs of data that leverages SBERT under the hood.

It's similar to the scikit-learn models that you're used to, but it accepts two inputs X1 and X2 and tries to predict if they are similar.

Parameters:

Name Type Description Default
sent_tfm SentenceTransformer

an instance of a SentenceTransformer that you'd like to finetune

required
batch_size int

the batch size during training

16
epochs int

the number of epochs to use while training

1
warmup_steps int

the number of warmup steps before training

100

Usage:

from sentence_transformers import SentenceTransformer
from embetter.finetune import SbertLearner
import random

sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
learner = SbertLearner(sent_tfm)

def sample_generator(examples, n_neg=3):
    # A generator that assumes examples to be a dictionary of the shape
    # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
    # this is typically a function that's very custom to your use-case though
    labels = set()
    for ex in examples:
        for cat in ex['cats'].keys():
            if cat not in labels:
                labels = labels.union([cat])
    for label in labels:
        pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
        neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
        for ex in pos_examples:
            sample = random.choice(pos_examples)
            yield (ex['text'], sample['text'], 1.0)
            for n in range(n_neg):
                sample = random.choice(neg_examples)
                yield (ex['text'], sample['text'], 0.0)

learn_examples = sample_generator(examples, n_neg=3)
X1, X2, y = zip(*learn_examples)

# Learn a new representation
learner.fit(X1, X2, y)

# You now have an updated model that can create more "finetuned" embeddings
learner.transform(X1)
learner.transform(X2)

After a learner is done training it can be used inside of a scikit-learn pipeline as you normally would.

Source code in embetter/finetune/_sbert_learn.py
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class SbertLearner:
    """
    A learner model that can finetune on pairs of data that leverages SBERT under the hood.

    It's similar to the scikit-learn models that you're used to, but it accepts
    two inputs `X1` and `X2` and tries to predict if they are similar.

    Arguments:
        sent_tfm: an instance of a `SentenceTransformer` that you'd like to finetune
        batch_size: the batch size during training
        epochs: the number of epochs to use while training
        warmup_steps: the number of warmup steps before training

    Usage:

    ```python
    from sentence_transformers import SentenceTransformer
    from embetter.finetune import SbertLearner
    import random

    sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
    learner = SbertLearner(sent_tfm)

    def sample_generator(examples, n_neg=3):
        # A generator that assumes examples to be a dictionary of the shape
        # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
        # this is typically a function that's very custom to your use-case though
        labels = set()
        for ex in examples:
            for cat in ex['cats'].keys():
                if cat not in labels:
                    labels = labels.union([cat])
        for label in labels:
            pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
            neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
            for ex in pos_examples:
                sample = random.choice(pos_examples)
                yield (ex['text'], sample['text'], 1.0)
                for n in range(n_neg):
                    sample = random.choice(neg_examples)
                    yield (ex['text'], sample['text'], 0.0)

    learn_examples = sample_generator(examples, n_neg=3)
    X1, X2, y = zip(*learn_examples)

    # Learn a new representation
    learner.fit(X1, X2, y)

    # You now have an updated model that can create more "finetuned" embeddings
    learner.transform(X1)
    learner.transform(X2)
    ```

    After a learner is done training it can be used inside of a scikit-learn pipeline as you normally would.
    """

    def __init__(
        self,
        sent_tfm: SentenceTransformer,
        batch_size: int = 16,
        epochs: int = 1,
        warmup_steps: int = 100,
    ):
        self.sent_tfm = sent_tfm
        self.batch_size = batch_size
        self.epochs = epochs
        self.warmup_steps = warmup_steps

    def fit(self, X1, X2, y):
        """
        Finetune an Sbert model based on similarities between two sets of texts.

        Arguments:
            X1: the first text of every pair
            X2: the second text of every pair
            y: the target similarity of every pair (cast to float)

        Returns:
            The fitted learner itself.
        """
        train_examples = [
            InputExample(texts=[x1, x2], label=float(lab))
            for x1, x2, lab in zip(X1, X2, y)
        ]
        # Honour the configured batch size instead of a hard-coded 16.
        data_loader = DataLoader(
            train_examples, shuffle=True, batch_size=self.batch_size
        )
        train_loss = losses.CosineSimilarityLoss(self.sent_tfm)
        self.sent_tfm.fit(
            train_objectives=[(data_loader, train_loss)],
            epochs=self.epochs,
            warmup_steps=self.warmup_steps,
        )
        return self

    def transform(self, X, y=None):
        """Encode a single batch of Sbert inputs (usually texts)."""
        return self.sent_tfm.encode(X)

    def predict(self, X1, X2):
        """Predicts the cosine similarity between the rows of `X1` and `X2`."""
        # Imported locally so `torch` is in scope regardless of how the module
        # header imports `DataLoader` / `CosineSimilarity`.
        import torch

        # Convert the encoded embeddings to tensors before comparing them.
        # NOTE(review): assumes `encode` returns numpy arrays and that
        # `CosineSimilarity` is torch.nn.CosineSimilarity (tensors only).
        emb1 = torch.as_tensor(self.transform(X1))
        emb2 = torch.as_tensor(self.transform(X2))
        return np.array(CosineSimilarity(dim=1)(emb1, emb2))

    def to_disk(self, path):
        """Save the finetuned Sbert model."""
        self.sent_tfm.save(path=path)