Finetuners

If you're interested in how to use these components, you'll probably want to read this section first.

FeedForwardTuner

Bases: BaseEstimator, TransformerMixin

Create a feed forward model to finetune the embeddings towards a class.

Parameters:

Name	Description	Default
`hidden_dim`	The size of the hidden layer	`50`
`n_epochs`	The number of epochs to run the optimiser for	`500`
`learning_rate`	The learning rate of the feed forward model	`0.01`

Source code in embetter/finetune/_forward.py

class FeedForwardTuner(BaseEstimator, TransformerMixin):
    """
    Create a feed forward model to finetune the embeddings towards a class.

    Arguments:
        hidden_dim: The size of the hidden layer
        n_epochs: The number of epochs to run the optimiser for
        learning_rate: The learning rate of the feed forward model
    """

    def __init__(
        self, hidden_dim=50, n_epochs=500, learning_rate=0.01, batch_size=32
    ) -> None:
        self.hidden_dim = hidden_dim
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.label_enc = LabelEncoder()

    def fit(self, X, y):
        """Fits the finetuner."""
        return self.partial_fit(X, y, classes=np.unique(y))

    def partial_fit(self, X, y, classes=None):
        """Fits the finetuner using the partial_fit API."""
        if not hasattr(self, "_classes"):
            if classes is None:
                raise ValueError("`classes` must be provided for partial_fit")
            self._classes = classes
            self.label_enc.fit(classes)
            assert (self._classes == self.label_enc.classes_).all()
        # Create a model if it does not exist yet.
        if not hasattr(self, "_model"):
            self._model = FeedForwardModel(
                X.shape[1], self.hidden_dim, len(self._classes)
            )
            self._optimizer = torch.optim.Adam(
                self._model.parameters(), lr=self.learning_rate
            )
            self._criterion = nn.CrossEntropyLoss()

        torch_X = torch.from_numpy(X).detach().float()
        torch_y = torch.from_numpy(self.label_enc.transform(y)).detach()

        dataset = torch.utils.data.TensorDataset(torch_X, torch_y)
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=True
        )

        for _ in range(self.n_epochs):
            for batch_X, batch_y in dataloader:
                self._optimizer.zero_grad()
                out = self._model(batch_X)
                loss = self._criterion(out, batch_y)
                loss.backward()
                self._optimizer.step()

        return self

    def transform(self, X, y=None):
        """Transforms the data according to the sklearn api by using the hidden layer."""
        Xt = torch.from_numpy(X).float().detach()
        return self._model.embed(Xt).detach().numpy()

ContrastiveTuner

Bases: BaseEstimator, TransformerMixin

Run a contrastive network to finetune the embeddings towards a class.

Parameters:

Name	Description	Default
`hidden_dim`	the dimension of the new learned representation	`50`
`n_neg`	number of negative example pairs to sample per positive item	`3`
`n_epochs`	number of epochs to use for training	required
`learning_rate`	learning rate of the contrastive network	`0.001`

Source code in embetter/finetune/_contrastive_tuner.py

class ContrastiveTuner(BaseEstimator, TransformerMixin):
    """
    Run a contrastive network to finetune the embeddings towards a class.

    Arguments:
        hidden_dim: the dimension of the new learned representation
        n_neg: number of negative example pairs to sample per positive item
        n_epochs: number of epochs to use for training
        learning_rate: learning rate of the contrastive network
    """

    def __init__(self, hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001) -> None:
        self.learner = ContrastiveLearner(
            shape_out=hidden_dim,
            batch_size=256,
            learning_rate=learning_rate,
            epochs=epochs,
        )
        self.n_neg = n_neg
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.learning_rate = learning_rate

    def fit(self, X, y):
        """Fits the finetuner."""
        return self.partial_fit(X, y, classes=np.unique(y))

    def generate_batch(self, X_torch, y):
        """Generate a batch of pytorch pairs used for finetuning"""
        pairs = generate_pairs_batch(y, n_neg=self.n_neg)
        X1 = torch.zeros(len(pairs), X_torch.shape[1])
        X2 = torch.zeros(len(pairs), X_torch.shape[1])
        labels = torch.tensor([ex.label for ex in pairs], dtype=torch.long)
        for i, pair in enumerate(pairs):
            X1[i] = X_torch[pair.i1]
            X2[i] = X_torch[pair.i2]
        return X1, X2, labels

    def partial_fit(self, X, y, classes=None):
        """Fits the finetuner using the partial_fit API."""
        if not hasattr(self, "_classes"):
            if classes is None:
                raise ValueError("`classes` must be provided for partial_fit")
            self._classes = classes

        X_torch = torch.from_numpy(X).detach().float()

        X1, X2, out = self.generate_batch(X_torch, y=y)
        # TODO: change this, we should just generate numpy internally not cast all over
        self.learner.fit(np.array(X1), np.array(X2), np.array(out))

        return self

    def transform(self, X, y=None):
        """Transforms the data according to the sklearn api by using the hidden layer."""
        return self.learner.transform(X)

ContrastiveLearner

A learner model that can finetune on pairs of data on top of numeric embeddings.

It's similar to the scikit-learn models that you're used to, but it accepts two inputs X1 and X2 and tries to predict if they are similar.

Parameters:

Name	Type	Description	Default
`sent_tfm`		an instance of a `SentenceTransformer` that you'd like to finetune	required
`batch_size`	`int`	the batch size during training	`16`
`epochs`	`int`	the number of epochs to use while training	`1`
`warmup_steps`		the number of warmup steps before training	required

Usage:

from sentence_transformers import SentenceTransformer
from embetter.finetune import ContrastiveLearner
import random

sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
learner = SbertLearner(sent_tfm)

def sample_generator(examples, n_neg=3):
    # A generator that assumes examples to be a dictionary of the shape
    # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
    # this is typically a function that's very custom to your use-case though
    labels = set()
    for ex in examples:
        for cat in ex['cats'].keys():
            if cat not in labels:
                labels = labels.union([cat])
    for label in labels:
        pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
        neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
        for ex in pos_examples:
            sample = random.choice(pos_examples)
            yield (ex['text'], sample['text'], 1.0)
            for n in range(n_neg):
                sample = random.choice(neg_examples)
                yield (ex['text'], sample['text'], 0.0)

learn_examples = sample_generator(examples, n_neg=3)
X1, X2, y = zip(*learn_examples)

# Learn a new representation
learner.fit(X1, X2, y)

# You now have an updated model that can create more "finetuned" embeddings
learner.transform(X1)
learner.transform(X2)

After a learning is done training it can be used inside of a scikit-learn pipeline as you normally would.

Source code in embetter/finetune/_constrastive_learn.py

class ContrastiveLearner:
    """
    A learner model that can finetune on pairs of data on top of numeric embeddings.

    It's similar to the scikit-learn models that you're used to, but it accepts
    two inputs `X1` and `X2` and tries to predict if they are similar.

    Arguments:
        sent_tfm: an instance of a `SentenceTransformer` that you'd like to finetune
        batch_size: the batch size during training
        epochs: the number of epochs to use while training
        warmup_steps: the number of warmup steps before training

    Usage:

    ```python
    from sentence_transformers import SentenceTransformer
    from embetter.finetune import ContrastiveLearner
    import random

    sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
    learner = SbertLearner(sent_tfm)

    def sample_generator(examples, n_neg=3):
        # A generator that assumes examples to be a dictionary of the shape
        # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
        # this is typically a function that's very custom to your use-case though
        labels = set()
        for ex in examples:
            for cat in ex['cats'].keys():
                if cat not in labels:
                    labels = labels.union([cat])
        for label in labels:
            pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
            neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
            for ex in pos_examples:
                sample = random.choice(pos_examples)
                yield (ex['text'], sample['text'], 1.0)
                for n in range(n_neg):
                    sample = random.choice(neg_examples)
                    yield (ex['text'], sample['text'], 0.0)

    learn_examples = sample_generator(examples, n_neg=3)
    X1, X2, y = zip(*learn_examples)

    # Learn a new representation
    learner.fit(X1, X2, y)

    # You now have an updated model that can create more "finetuned" embeddings
    learner.transform(X1)
    learner.transform(X2)
    ```

    After a learning is done training it can be used inside of a scikit-learn pipeline as you normally would.
    """

    def __init__(
        self,
        shape_out: int = 300,
        batch_size: int = 16,
        epochs: int = 1,
        learning_rate=2e-05,
    ):
        self.learning_rate = learning_rate
        self.network_ = None
        self.batch_size = batch_size
        self.epochs = epochs
        self.shape_out = shape_out

    def fit(self, X1, X2, y):
        """Finetune an Sbert model based on similarities between two sets of texts."""
        self.network_ = ContrastiveNetwork(
            shape_in=X1.shape[1], hidden_dim=self.shape_out
        )
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.network_.parameters(), lr=self.learning_rate)

        X1_torch = torch.from_numpy(X1).detach().float()
        X2_torch = torch.from_numpy(X2).detach().float()
        y_torch = torch.from_numpy(np.array(y)).detach().float()

        dataset = torch.utils.data.TensorDataset(X1_torch, X2_torch, y_torch)
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=True
        )

        for _ in range(self.epochs):  # loop over the dataset multiple times
            for batch_X1, batch_X2, batch_y in dataloader:
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                cos_sim = self.network_(batch_X1, batch_X2)
                loss = criterion(cos_sim, batch_y)
                loss.backward()
                optimizer.step()
        return self

    def transform(self, X, y=None):
        """Encode a single batch of inputs."""
        X_torch = torch.from_numpy(X).detach().float()
        return self.network_.embed(X_torch).detach().numpy()

    def predict(self, X1, X2):
        """Predicts the cosine similarity."""
        emb1 = self.transform(X1)
        emb2 = self.transform(X2)
        return np.array(CosineSimilarity()(emb1, emb2))

    def to_disk(self, path):
        """Save the finetuned Sbert model."""
        self.sent_tfm.save(path=path)

SbertLearner

A learner model that can finetune on pairs of data that leverages SBERT under the hood.

It's similar to the scikit-learn models that you're used to, but it accepts two inputs X1 and X2 and tries to predict if they are similar.

Parameters:

Name	Type	Description	Default
`sent_tfm`	`SentenceTransformer`	an instance of a `SentenceTransformer` that you'd like to finetune	required
`batch_size`	`int`	the batch size during training	`16`
`epochs`	`int`	the number of epochs to use while training	`1`
`warmup_steps`	`int`	the number of warmup steps before training	`100`

Usage:

from sentence_transformers import SentenceTransformer
from embetter.finetune import SbertLearner
import random

sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
learner = SbertLearner(sent_tfm)

def sample_generator(examples, n_neg=3):
    # A generator that assumes examples to be a dictionary of the shape
    # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
    # this is typically a function that's very custom to your use-case though
    labels = set()
    for ex in examples:
        for cat in ex['cats'].keys():
            if cat not in labels:
                labels = labels.union([cat])
    for label in labels:
        pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
        neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
        for ex in pos_examples:
            sample = random.choice(pos_examples)
            yield (ex['text'], sample['text'], 1.0)
            for n in range(n_neg):
                sample = random.choice(neg_examples)
                yield (ex['text'], sample['text'], 0.0)

learn_examples = sample_generator(examples, n_neg=3)
X1, X2, y = zip(*learn_examples)

# Learn a new representation
learner.fit(X1, X2, y)

# You now have an updated model that can create more "finetuned" embeddings
learner.transform(X1)
learner.transform(X2)

After a learning is done training it can be used inside of a scikit-learn pipeline as you normally would.

Source code in embetter/finetune/_sbert_learn.py

class SbertLearner:
    """
    A learner model that can finetune on pairs of data that leverages SBERT under the hood.

    It's similar to the scikit-learn models that you're used to, but it accepts
    two inputs `X1` and `X2` and tries to predict if they are similar.

    Arguments:
        sent_tfm: an instance of a `SentenceTransformer` that you'd like to finetune
        batch_size: the batch size during training
        epochs: the number of epochs to use while training
        warmup_steps: the number of warmup steps before training

    Usage:

    ```python
    from sentence_transformers import SentenceTransformer
    from embetter.finetune import SbertLearner
    import random

    sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
    learner = SbertLearner(sent_tfm)

    def sample_generator(examples, n_neg=3):
        # A generator that assumes examples to be a dictionary of the shape
        # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
        # this is typically a function that's very custom to your use-case though
        labels = set()
        for ex in examples:
            for cat in ex['cats'].keys():
                if cat not in labels:
                    labels = labels.union([cat])
        for label in labels:
            pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
            neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
            for ex in pos_examples:
                sample = random.choice(pos_examples)
                yield (ex['text'], sample['text'], 1.0)
                for n in range(n_neg):
                    sample = random.choice(neg_examples)
                    yield (ex['text'], sample['text'], 0.0)

    learn_examples = sample_generator(examples, n_neg=3)
    X1, X2, y = zip(*learn_examples)

    # Learn a new representation
    learner.fit(X1, X2, y)

    # You now have an updated model that can create more "finetuned" embeddings
    learner.transform(X1)
    learner.transform(X2)
    ```

    After a learning is done training it can be used inside of a scikit-learn pipeline as you normally would.
    """

    def __init__(
        self,
        sent_tfm: SentenceTransformer,
        batch_size: int = 16,
        epochs: int = 1,
        warmup_steps: int = 100,
    ):
        self.sent_tfm = sent_tfm
        self.batch_size = batch_size
        self.epochs = epochs
        self.warmup_steps = warmup_steps

    def fit(self, X1, X2, y):
        """Finetune an Sbert model based on similarities between two sets of texts."""
        train_examples = [
            InputExample(texts=[x1, x2], label=float(lab))
            for x1, x2, lab in zip(X1, X2, y)
        ]
        data_loader = DataLoader(train_examples, shuffle=True, batch_size=16)
        train_loss = losses.CosineSimilarityLoss(self.sent_tfm)
        self.sent_tfm.fit(
            train_objectives=[(data_loader, train_loss)],
            epochs=self.epochs,
            warmup_steps=self.warmup_steps,
        )
        return self

    def transform(self, X, y=None):
        """Encode a single batch of Sbert inputs (usually texts)."""
        return self.sent_tfm.encode(X)

    def predict(self, X1, X2):
        """Predicts the cosine similarity."""
        emb1 = self.transform(X1)
        emb2 = self.transform(X2)
        return np.array(CosineSimilarity(dim=1)(emb1, emb2))

    def to_disk(self, path):
        """Save the finetuned Sbert model."""
        self.sent_tfm.save(path=path)