Text

TextEncoder

Bases: EmbetterBase

Encoder that can numerically encode text using a model from the model2vec library.

The main benefit of this encoder is that it uses distilled word embeddings, which means that they are super fast.

Parameters:

Name	Type	Description	Default
`name`		name of model, see available options, can also pass a model2vec StaticModel object directly	required

The following model names should be supported:

minishlab/potion-base-32M
minishlab/potion-base-8M
minishlab/potion-base-4M
minishlab/potion-base-2M
minishlab/potion-retrieval-32M
minishlab/M2V_multilingual_output

You can find the more options, and information, on the Github repository.

Usage:

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import TextEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    TextEncoder()
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)

Source code in embetter/text/_model2vec.py

class TextEncoder(EmbetterBase):
    """
    Encoder that can numerically encode text using a model from the model2vec library.

    The main benefit of this encoder is that it uses distilled word embeddings, which means that they are super *fast*. 

    Arguments:
        name: name of model, see available options, can also pass a model2vec StaticModel object directly

    The following model names should be supported:

    - `minishlab/potion-base-32M`
    - `minishlab/potion-base-8M`
    - `minishlab/potion-base-4M`
    - `minishlab/potion-base-2M`
    - `minishlab/potion-retrieval-32M`
    - `minishlab/M2V_multilingual_output`

    You can find the more options, and information, on the [Github repository](https://github.com/MinishLab/model2vec?tab=readme-ov-file#model-list).

    **Usage**:

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import TextEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        TextEncoder()
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(
        self, model="minishlab/potion-base-8M"
    ):
        if isinstance(model, str):
            self.model = StaticModel.from_pretrained(model)
        else:
            assert isinstance(model, StaticModel), "model must be a string or a StaticModel from model2vec"
            self.model = model

    def transform(self, X, y=None):
        """Transforms the text into a numeric representation."""
        return self.model.encode(X)

SentenceEncoder

Bases: EmbetterBase

Encoder that can numerically encode sentences.

Parameters:

Name	Description	Default
`name`	name of model, see available options	`'all-MiniLM-L6-v2'`
`device`	manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available	`None`
`quantize`	turns on quantization	`False`
`num_threads`	number of treads for pytorch to use, only affects when device=cpu	`None`

The following model names should be supported:

all-mpnet-base-v2
multi-qa-mpnet-base-dot-v1
all-distilroberta-v1
all-MiniLM-L12-v2
multi-qa-distilbert-cos-v1
all-MiniLM-L6-v2
multi-qa-MiniLM-L6-cos-v1
paraphrase-multilingual-mpnet-base-v2
paraphrase-albert-small-v2
paraphrase-multilingual-MiniLM-L12-v2
paraphrase-MiniLM-L3-v2
distiluse-base-multilingual-cased-v1
distiluse-base-multilingual-cased-v2

You can find the more options, and information, on the sentence-transformers docs page.

Usage:

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import SentenceEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    SentenceEncoder('all-MiniLM-L6-v2')
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)

Source code in embetter/text/_sbert.py

class SentenceEncoder(EmbetterBase):
    """
    Encoder that can numerically encode sentences.

    Arguments:
        name: name of model, see available options
        device: manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available
        quantize: turns on quantization
        num_threads: number of treads for pytorch to use, only affects when device=cpu

    The following model names should be supported:

    - `all-mpnet-base-v2`
    - `multi-qa-mpnet-base-dot-v1`
    - `all-distilroberta-v1`
    - `all-MiniLM-L12-v2`
    - `multi-qa-distilbert-cos-v1`
    - `all-MiniLM-L6-v2`
    - `multi-qa-MiniLM-L6-cos-v1`
    - `paraphrase-multilingual-mpnet-base-v2`
    - `paraphrase-albert-small-v2`
    - `paraphrase-multilingual-MiniLM-L12-v2`
    - `paraphrase-MiniLM-L3-v2`
    - `distiluse-base-multilingual-cased-v1`
    - `distiluse-base-multilingual-cased-v2`

    You can find the more options, and information, on the [sentence-transformers docs page](https://www.sbert.net/docs/pretrained_models.html#model-overview).

    **Usage**:

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import SentenceEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        SentenceEncoder('all-MiniLM-L6-v2')
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(
        self, name="all-MiniLM-L6-v2", device=None, quantize=False, num_threads=None
    ):
        if not device:
            if torch.cuda.is_available():
                device = torch.device("cuda")
            elif torch.backends.mps.is_available():
                device = torch.device("mps")
            else:
                device = torch.device("cpu")
        self.name = name
        self.device = device
        self.tfm = SBERT(name, device=self.device)
        self.num_threads = num_threads
        self.quantize = quantize
        if quantize:
            self.tfm = quantize_dynamic(self.tfm, {Linear})
        if num_threads:
            if self.device.type == "cpu":
                torch.set_num_threads(num_threads)

    def transform(self, X, y=None):
        """Transforms the text into a numeric representation."""
        # Convert pd.Series objects to encode compatable
        if isinstance(X, pd.Series):
            X = X.to_numpy()

        return self.tfm.encode(X)

MatryoshkaEncoder

Encoder that can numerically encode sentences.

This function, which looks like a class, offers a shorthand way to fetch pretrained Matryoshka embeddings. Under the hood it just returns a SentenceEncoder object, but the default name points to a pretrained Matryoshka model.

These embeddings are more flexible in the sense that you can more easily reduce the dimensions without losing as much information. The aforementioned docs give more details

Usage:

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import SentenceEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    MatryoshkaEncoder()
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)

Source code in embetter/text/_sbert.py

def MatryoshkaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs):
    """
    Encoder that can numerically encode sentences.

    This function, which looks like a class, offers a shorthand way to fetch pretrained
    [Matryoshka embeddings](https://www.sbert.net/examples/training/matryoshka/README.html).
    Under the hood it just returns a `SentenceEncoder` object, but the default name points
    to a pretrained Matryoshka model.

    These embeddings are more flexible in the sense that you can more easily reduce the
    dimensions without losing as much information. The aforementioned docs give more details

    **Usage**:

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import SentenceEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        MatryoshkaEncoder()
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """
    return SentenceEncoder(name=name, **kwargs)

LiteDocEncoder

Function that looks like class so that it fits the API.

Parameters:

Name	Type	Description	Default
`path`		path where model is saved	required

This function can be used to load a model that's saved with featherbed_textrepr.

Usage:

You can leverage the multiple backends from keras-core by setting the KERAS_BACKEND environment variable.

from embetter.text import learn_lite_text_embeddings, LiteTextEncoder

learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops")

enc = LiteTextEncoder(path="folder/embeddings.skops")
enc.transform(["encode this examples", "and this one"])

Source code in embetter/text/_lite.py

def LiteTextEncoder(path):
    """
    Function that looks like class so that it fits the API.

    Arguments:
        path: path where model is saved

    This function can be used to load a model that's saved with `featherbed_textrepr`.

    **Usage**:

    You can leverage the multiple backends from keras-core by setting the `KERAS_BACKEND` environment variable.

    ```python
    from embetter.text import learn_lite_text_embeddings, LiteTextEncoder

    learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops")

    enc = LiteTextEncoder(path="folder/embeddings.skops")
    enc.transform(["encode this examples", "and this one"])
    ```
    """
    return load(path, trusted=True)

KerasNLPEncoder

Bases: EmbetterBase

Encoder that can numerically encode sentences.

Parameters:

Name	Description	Default
`name`	name of model, see available options	`'bert_tiny_en_uncased'`
`device`	manually override cpu/gpu device, tries to grab gpu automatically when available	required
`quantize`	turns on quantization	required
`num_threads`	number of treads for pytorch to use, only affects when device=cpu	required

The pre-trained model names that you could use can be found here.

Usage:

You can leverage the multiple backends from keras-core by setting the KERAS_BACKEND environment variable.

import os
# Pick the right setting
os.environ["KERAS_BACKEND"] = "jax"
os.environ["KERAS_BACKEND"] = "torch"
os.environ["KERAS_BACKEND"] = "tensorflow"

Once this is set, the following code will automatically use the right backend.

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import SentenceEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    KerasNLPEncoder()
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)

Source code in embetter/text/_keras.py

class KerasNLPEncoder(EmbetterBase):
    """
    Encoder that can numerically encode sentences.

    Arguments:
        name: name of model, see available options
        device: manually override cpu/gpu device, tries to grab gpu automatically when available
        quantize: turns on quantization
        num_threads: number of treads for pytorch to use, only affects when device=cpu

    The pre-trained model names that you could use can be found [here](https://keras.io/api/keras_nlp/models/).

    **Usage**:

    You can leverage the multiple backends from keras-core by setting the `KERAS_BACKEND` environment variable.

    ```python
    import os
    # Pick the right setting
    os.environ["KERAS_BACKEND"] = "jax"
    os.environ["KERAS_BACKEND"] = "torch"
    os.environ["KERAS_BACKEND"] = "tensorflow"
    ```

    Once this is set, the following code will automatically use the right backend.

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import SentenceEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        KerasNLPEncoder()
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(self, name="bert_tiny_en_uncased"):
        self.name = name
        self.backbone = keras_nlp.models.BertBackbone.from_preset(name)
        self.preprocessor = keras_nlp.models.BertPreprocessor.from_preset(name)

    def transform(self, X, y=None):
        """Transforms the text into a numeric representation."""
        if isinstance(X, pd.Series):
            X = X.to_numpy()
        out = self.backbone(self.preprocessor(X))["pooled_output"]

        # Depending on the backend, return numpy by calling right methods.
        if keras_nlp.src.backend.config.backend() == "torch":
            return out.detach().numpy()
        else:
            return np.asarray(out)

spaCyEncoder

Bases: EmbetterBase

Usage

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import spaCyEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which is then passed to the medium spaCy model.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    spaCyEncoder("en_core_web_md")
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)

Source code in embetter/text/_spacy.py

class spaCyEncoder(EmbetterBase):
    """
    **Usage**

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import spaCyEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which is then passed to the medium spaCy model.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        spaCyEncoder("en_core_web_md")
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(self, nlp: Union[str, Language], agg: str = "base"):
        if isinstance(nlp, str):
            self.nlp = spacy.load(nlp, disable=["ner", "tagger", "parser"])
        elif isinstance(nlp, Language):
            self.nlp = nlp
        else:
            raise ValueError("`nlp` must be `str` or spaCy-language object.")
        self.agg = agg

    def fit(self, X, y=None):
        """No-op. Merely checks for object inputs per sklearn standard."""
        # Scikit-learn also expects this in the `.fit()` command.
        self._check_inputs(X)
        return self

    def _check_inputs(self, X):
        options = ["mean", "max", "both", "base"]
        if self.agg not in options:
            raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")

    def transform(self, X, y=None):
        """Transforms the phrase text into a numeric representation."""
        self._check_inputs(X)
        docs = self.nlp.pipe(X)
        if self.agg == "base":
            return np.array([d.vector for d in docs])
        token_vectors = [np.array([tok.vector for tok in doc]) for doc in docs]
        if self.agg == "mean":
            return np.array([v.mean(axis=0) for v in token_vectors])
        if self.agg == "max":
            return np.array([v.max(axis=0) for v in token_vectors])
        if self.agg == "both":
            mean_arr = np.array([v.mean(axis=0) for v in token_vectors])
            max_arr = np.array([v.max(axis=0) for v in token_vectors])
            return np.concatenate([mean_arr, max_arr], axis=1)

Sense2VecEncoder

Bases: BaseEstimator

Create a Sense2Vec encoder, meant to help when encoding phrases as opposed to sentences.

Parameters:

Name	Type	Description	Default
`path`	`str`	path to downloaded model	required

Usage

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import Sense2VecEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which is then passed to the sense2vec model.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    Sense2VecEncoder("path/to/s2v")
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

Source code in embetter/text/_s2v.py

class Sense2VecEncoder(BaseEstimator):
    """
    Create a [Sense2Vec encoder](https://github.com/explosion/sense2vec), meant to
    help when encoding phrases as opposed to sentences.

    Arguments:
        path: path to downloaded model

    **Usage**

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import Sense2VecEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which is then passed to the sense2vec model.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        Sense2VecEncoder("path/to/s2v")
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
    ```
    """

    def __init__(self, path: str):
        self.path = path
        self.s2v = Sense2Vec().from_disk(self.path)
        self.shape = self.s2v["duck|NOUN"].shape

    def _to_vector(self, text):
        sense = self.s2v.get_best_sense(text)
        if not sense:
            return np.zeros(shape=self.shape)
        return self.s2v[sense]

    def transform(self, X, y=None):
        """Transforms the phrase text into a numeric representation."""
        return np.array([self._to_vector(x) for x in X])

BytePairEncoder

Bases: EmbetterBase

This language represents token-free pre-trained subword embeddings. Originally created by Benjamin Heinzerling and Michael Strube.

These vectors will auto-download by the BPEmb package. You can also specify "multi" to download multi language embeddings. A full list of available languages can be found here. The article that belongs to this work can be found here The availability of vocabulary size as well as dimensionality can be varified on the project website. See here for an example link in English. Please credit the original authors if you use their work.

Parameters:

Name	Type	Description	Default
`lang`	`str`	name of the model to load	required
`vs`	`int`	vocabulary size of the byte pair model	`1000`
`dim`	`int`	the embedding dimensionality	`25`
`agg`	`str`	the aggregation method to reduce many subword vectors into a single one, can be "max", "mean" or "both"	`'mean'`
`cache_dir`	`Path`	The folder in which downloaded BPEmb files will be cached, can overwrite to custom folder.	`None`

Usage

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import BytePairEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then get fed into a small English model
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    BytePairEncoder(lang="en")
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)

Source code in embetter/text/_bpemb.py

class BytePairEncoder(EmbetterBase):
    """
    This language represents token-free pre-trained subword embeddings. Originally created by
    Benjamin Heinzerling and Michael Strube.

    These vectors will auto-download by the [BPEmb package](https://nlp.h-its.org/bpemb/).
    You can also specify "multi" to download multi language embeddings. A full list of available
    languages can be found [here](https://nlp.h-its.org/bpemb). The article that
    belongs to this work can be found [here](http://www.lrec-conf.org/proceedings/lrec2018/pdf/1049.pdf)
    The availability of vocabulary size as well as dimensionality can be varified
    on the project website. See [here](https://nlp.h-its.org/bpemb/en/) for an
    example link in English. Please credit the original authors if you use their work.

    Arguments:
        lang: name of the model to load
        vs: vocabulary size of the byte pair model
        dim: the embedding dimensionality
        agg: the aggregation method to reduce many subword vectors into a single one, can be "max", "mean" or "both"
        cache_dir: The folder in which downloaded BPEmb files will be cached, can overwrite to custom folder.

    **Usage**

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import BytePairEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then get fed into a small English model
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        BytePairEncoder(lang="en")
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(
        self,
        lang: str,
        vs: int = 1000,
        dim: int = 25,
        agg: str = "mean",
        cache_dir: Path = None,
    ):
        self.lang = lang
        self.vs = vs
        self.dim = dim
        self.cache_dir = cache_dir
        self.agg = agg
        if not cache_dir:
            cache_dir = Path.home() / Path(".cache/bpemb")
        self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir)

    def fit(self, X, y=None):
        """No-op. Merely checks for object inputs per sklearn standard."""
        # Scikit-learn also expects this in the `.fit()` command.
        self._check_inputs(X)
        return self

    def _check_inputs(self, X):
        options = ["mean", "max", "both"]
        if self.agg not in options:
            raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")

    def transform(self, X, y=None):
        """Transforms the phrase text into a numeric representation."""
        self._check_inputs(X)
        if self.agg == "mean":
            return np.array([self.module.embed(x).mean(axis=0) for x in X])
        if self.agg == "max":
            return np.array([self.module.embed(x).max(axis=0) for x in X])
        if self.agg == "both":
            mean_arr = np.array([self.module.embed(x).max(axis=0) for x in X])
            max_arr = np.array([self.module.embed(x).max(axis=0) for x in X])
            return np.concatenate([mean_arr, max_arr], axis=1)

GensimEncoder

Bases: EmbetterBase

Encodes text using a static word embedding model. The component uses gensim's default tokenizer.

Parameters:

Name	Type	Description	Default
`model`	`Union[str, Word2Vec, KeyedVectors]`	Model name, path to model on disk, Word2Vec instance or KeyedVectors instance.	`'word2vec-google-news-300'`
`agg`	`Literal['mean', 'max', 'both']`	Way to aggregate the word embeddings in a document. Can either take the maximum, mean or both of them concatenated.	`'mean'`
`deacc`	`bool`	Specifies whether accents should be removed when tokenizing the text.	`False`
`lowercase`	`bool`	Specifies whether the text should be lowercased during tokenization.	`False`

Currently the following models are supported by default

conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200

Usage

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import Word2VecEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which is then passed to a Word2Vec model.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    Word2VecEncoder("glove-wiki-gigaword-50")
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)

Source code in embetter/text/_word2vec.py

class GensimEncoder(EmbetterBase):
    """
    Encodes text using a static word embedding model. The component uses gensim's default tokenizer.

    Arguments:
        model: Model name, path to model on disk, Word2Vec instance or KeyedVectors instance.
        agg: Way to aggregate the word embeddings in a document. Can either take the maximum, mean or both of them concatenated.
        deacc: Specifies whether accents should be removed when tokenizing the text.
        lowercase: Specifies whether the text should be lowercased during tokenization.

    Currently the following models are supported by default:
     - `conceptnet-numberbatch-17-06-300`
     - `word2vec-ruscorpora-300`
     - `word2vec-google-news-300`
     - `glove-wiki-gigaword-50`
     - `glove-wiki-gigaword-100`
     - `glove-wiki-gigaword-200`
     - `glove-wiki-gigaword-300`
     - `glove-twitter-25`
     - `glove-twitter-50`
     - `glove-twitter-100`
     - `glove-twitter-200`

    **Usage**

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import Word2VecEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which is then passed to a Word2Vec model.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        Word2VecEncoder("glove-wiki-gigaword-50")
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(
        self,
        model: Union[str, Word2Vec, KeyedVectors] = "word2vec-google-news-300",
        agg: Literal["mean", "max", "both"] = "mean",
        deacc: bool = False,
        lowercase: bool = False,
    ):
        self.model = model
        if isinstance(model, str):
            if model in downloader.info()["models"]:
                self.keyed_vectors: KeyedVectors = downloader.load(model)  # type: ignore
            else:
                loaded_object = SaveLoad().load(self.model)
                if isinstance(loaded_object, Word2Vec):
                    self.keyed_vectors = loaded_object.wv
                elif isinstance(loaded_object, KeyedVectors):
                    self.keyed_vectors = loaded_object
                else:
                    raise TypeError(
                        "Object loaded from disk is not Word2Vec nor a KeyedVectors instance."
                    )
        elif isinstance(model, Word2Vec):
            self.keyed_vectors: KeyedVectors = model.wv
        elif isinstance(model, KeyedVectors):
            self.keyed_vectors: KeyedVectors = model
        else:
            raise TypeError(
                f"You should pass a model name, keyed vectors or a Word2Vec model to Word2VecEncoder, not {type(model)}"
            )
        self.agg = agg
        self.deacc = deacc
        self.lowercase = lowercase
        self.n_features_out = (
            self.keyed_vectors.vector_size
            if self.agg != "both"
            else self.keyed_vectors.vector_size * 2
        )

    def fit(self, X, y=None):
        """No-op. Merely checks for object inputs per sklearn standard."""
        # Scikit-learn also expects this in the `.fit()` command.
        self._check_inputs(X)
        return self

    def _check_inputs(self, X):
        options = ["mean", "max", "both"]
        if self.agg not in options:
            raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")

    def _tokenize(self, X) -> List[List[int]]:
        token_indices = []
        for text in X:
            tokens = tokenize(text, deacc=self.deacc, lowercase=self.lowercase)
            indices = []
            for token in tokens:
                index = self.keyed_vectors.get_index(token, default=-1)
                if index != -1:
                    indices.append(index)
            token_indices.append(indices)
        return token_indices

    def transform(self, X, y=None):
        """Transforms the phrase text into a numeric representation using word embeddings."""
        self._check_inputs(X)
        tokens = self._tokenize(X)
        embeddings = np.empty((len(X), self.n_features_out))
        for i_doc, token_indices in enumerate(tokens):
            if not len(token_indices):
                embeddings[i_doc, :] = np.nan
            doc_vectors = self.keyed_vectors.vectors[token_indices]
            if self.agg == "mean":
                embeddings[i_doc, :] = np.mean(doc_vectors, axis=0)
            elif self.agg == "max":
                embeddings[i_doc, :] = np.max(doc_vectors, axis=0)
            elif self.agg == "both":
                mean_vector = np.mean(doc_vectors, axis=0)
                max_vector = np.max(doc_vectors, axis=0)
                embeddings[i_doc, :] = np.concatenate((mean_vector, max_vector))
        return embeddings