Skip to content

Text

SentenceEncoder

Bases: EmbetterBase

Encoder that can numerically encode sentences.

Parameters:

Name Type Description Default
name

name of model, see available options

'all-MiniLM-L6-v2'
device

manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available

None
quantize

turns on quantization

False
num_threads

number of threads for pytorch to use, only affects when device=cpu

None

The following model names should be supported:

  • all-mpnet-base-v2
  • multi-qa-mpnet-base-dot-v1
  • all-distilroberta-v1
  • all-MiniLM-L12-v2
  • multi-qa-distilbert-cos-v1
  • all-MiniLM-L6-v2
  • multi-qa-MiniLM-L6-cos-v1
  • paraphrase-multilingual-mpnet-base-v2
  • paraphrase-albert-small-v2
  • paraphrase-multilingual-MiniLM-L12-v2
  • paraphrase-MiniLM-L3-v2
  • distiluse-base-multilingual-cased-v1
  • distiluse-base-multilingual-cased-v2

You can find more options and information on the sentence-transformers docs page.

Usage:

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import SentenceEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    SentenceEncoder('all-MiniLM-L6-v2')
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
Source code in embetter/text/_sbert.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class SentenceEncoder(EmbetterBase):
    """
    Encoder that can numerically encode sentences.

    Arguments:
        name: name of model, see available options
        device: manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available.
            May be given either as a string (e.g. `"cpu"`) or as a `torch.device`.
        quantize: turns on quantization
        num_threads: number of threads for pytorch to use, only affects when device=cpu

    The following model names should be supported:

    - `all-mpnet-base-v2`
    - `multi-qa-mpnet-base-dot-v1`
    - `all-distilroberta-v1`
    - `all-MiniLM-L12-v2`
    - `multi-qa-distilbert-cos-v1`
    - `all-MiniLM-L6-v2`
    - `multi-qa-MiniLM-L6-cos-v1`
    - `paraphrase-multilingual-mpnet-base-v2`
    - `paraphrase-albert-small-v2`
    - `paraphrase-multilingual-MiniLM-L12-v2`
    - `paraphrase-MiniLM-L3-v2`
    - `distiluse-base-multilingual-cased-v1`
    - `distiluse-base-multilingual-cased-v2`

    You can find more options and information on the [sentence-transformers docs page](https://www.sbert.net/docs/pretrained_models.html#model-overview).

    **Usage**:

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import SentenceEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then gets fed into Sentence-Transformers' all-MiniLM-L6-v2.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        SentenceEncoder('all-MiniLM-L6-v2')
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(
        self, name="all-MiniLM-L6-v2", device=None, quantize=False, num_threads=None
    ):
        # No explicit device: prefer CUDA, then Apple MPS, then fall back to CPU.
        if not device:
            if torch.cuda.is_available():
                device = torch.device("cuda")
            elif torch.backends.mps.is_available():
                device = torch.device("mps")
            else:
                device = torch.device("cpu")
        self.name = name
        self.device = device
        self.tfm = SBERT(name, device=self.device)
        self.num_threads = num_threads
        self.quantize = quantize
        if quantize:
            # Dynamic quantization is applied to the Linear layers only.
            self.tfm = quantize_dynamic(self.tfm, {Linear})
        # `device` may have been passed as a plain string such as "cpu", which has
        # no `.type` attribute; normalise through `torch.device` before inspecting it.
        if num_threads and torch.device(self.device).type == "cpu":
            torch.set_num_threads(num_threads)

    def transform(self, X, y=None):
        """
        Transforms the text into a numeric representation.

        Arguments:
            X: the texts to encode; a `pd.Series` is converted to a numpy array first
            y: ignored, only present for scikit-learn API compatibility

        Returns:
            Whatever the underlying model's `.encode(X)` call returns.
        """
        # Convert pd.Series objects to an encode-compatible container
        if isinstance(X, pd.Series):
            X = X.to_numpy()

        return self.tfm.encode(X)

MatryoshkaEncoder

Encoder that can numerically encode sentences.

This function, which looks like a class, offers a shorthand way to fetch pretrained Matryoshka embeddings. Under the hood it just returns a SentenceEncoder object, but the default name points to a pretrained Matryoshka model.

These embeddings are more flexible in the sense that you can more easily reduce the dimensions without losing as much information. The aforementioned docs give more details.

Usage:

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import MatryoshkaEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then gets fed into a pretrained Matryoshka model.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    MatryoshkaEncoder()
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
Source code in embetter/text/_sbert.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def MatryoshkaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs):
    """
    Shorthand for building a sentence encoder backed by a pretrained Matryoshka model.

    Although it is named like a class, this is a plain function. It builds and returns a
    `SentenceEncoder`, the only difference being that the default `name` points to a pretrained
    [Matryoshka embedding](https://www.sbert.net/examples/training/matryoshka/README.html) model.

    These embeddings are more flexible in the sense that you can more easily reduce the
    dimensions without losing as much information. The linked docs give more details.

    Arguments:
        name: name of the model to load, forwarded to `SentenceEncoder`
        **kwargs: any other keyword arguments, forwarded unchanged to `SentenceEncoder`

    **Usage**:

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import MatryoshkaEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then gets fed into a pretrained Matryoshka model.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        MatryoshkaEncoder()
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """
    encoder = SentenceEncoder(name=name, **kwargs)
    return encoder

LiteTextEncoder

Function that looks like class so that it fits the API.

Parameters:

Name Type Description Default
path

path where model is saved

required

This function can be used to load a model that's saved with featherbed_textrepr.

Usage:

You can leverage the multiple backends from keras-core by setting the KERAS_BACKEND environment variable.

from embetter.text import learn_lite_text_embeddings, LiteTextEncoder

learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops")

enc = LiteTextEncoder(path="folder/embeddings.skops")
enc.transform(["encode this examples", "and this one"])
Source code in embetter/text/_lite.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def LiteTextEncoder(path):
    """
    Load a previously saved lite text encoder from disk.

    This is a function that is named like a class so that it fits the API.

    Arguments:
        path: path where model is saved

    This function can be used to load a model that's saved with `featherbed_textrepr`.

    **Usage**:

    You can leverage the multiple backends from keras-core by setting the `KERAS_BACKEND` environment variable.

    ```python
    from embetter.text import learn_lite_text_embeddings, LiteTextEncoder

    learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops")

    enc = LiteTextEncoder(path="folder/embeddings.skops")
    enc.transform(["encode this examples", "and this one"])
    ```
    """
    # NOTE(review): `trusted=True` is passed straight to `load`, so the file is
    # loaded without restricting its contents -- only point this at files you
    # created yourself or otherwise trust.
    encoder = load(path, trusted=True)
    return encoder

KerasNLPEncoder

Bases: EmbetterBase

Encoder that can numerically encode sentences.

Parameters:

Name Type Description Default
name

name of model, see available options

'bert_tiny_en_uncased'
device

manually override cpu/gpu device, tries to grab gpu automatically when available

required
quantize

turns on quantization

required
num_threads

number of threads for pytorch to use, only affects when device=cpu

required

The pre-trained model names that you could use can be found here.

Usage:

You can leverage the multiple backends from keras-core by setting the KERAS_BACKEND environment variable.

import os
# Pick the right setting
os.environ["KERAS_BACKEND"] = "jax"
os.environ["KERAS_BACKEND"] = "torch"
os.environ["KERAS_BACKEND"] = "tensorflow"

Once this is set, the following code will automatically use the right backend.

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import KerasNLPEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    KerasNLPEncoder()
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
Source code in embetter/text/_keras.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class KerasNLPEncoder(EmbetterBase):
    """
    Encoder that can numerically encode sentences.

    Arguments:
        name: name of the BERT preset to load, see available options

    The pre-trained model names that you could use can be found [here](https://keras.io/api/keras_nlp/models/).

    **Usage**:

    You can leverage the multiple backends from keras-core by setting the `KERAS_BACKEND` environment variable.

    ```python
    import os
    # Pick the right setting
    os.environ["KERAS_BACKEND"] = "jax"
    os.environ["KERAS_BACKEND"] = "torch"
    os.environ["KERAS_BACKEND"] = "tensorflow"
    ```

    Once this is set, the following code will automatically use the right backend.

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import KerasNLPEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then gets fed into the KerasNLP BERT model.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        KerasNLPEncoder()
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(self, name="bert_tiny_en_uncased"):
        self.name = name
        # Backbone and preprocessor are both loaded from the same preset name.
        self.backbone = keras_nlp.models.BertBackbone.from_preset(name)
        self.preprocessor = keras_nlp.models.BertPreprocessor.from_preset(name)

    def transform(self, X, y=None):
        """
        Transforms the text into a numeric representation.

        Arguments:
            X: the texts to encode; a `pd.Series` is converted to a numpy array first
            y: ignored, only present for scikit-learn API compatibility

        Returns:
            The backbone's `"pooled_output"` converted to a numpy array.
        """
        if isinstance(X, pd.Series):
            X = X.to_numpy()
        out = self.backbone(self.preprocessor(X))["pooled_output"]

        # Depending on the backend, return numpy by calling right methods.
        if keras_nlp.src.backend.config.backend() == "torch":
            # `.numpy()` only works on CPU tensors; `.cpu()` is a no-op when the
            # tensor already lives on the CPU and copies it over otherwise.
            return out.detach().cpu().numpy()
        else:
            return np.asarray(out)

spaCyEncoder

Bases: EmbetterBase

Usage

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import spaCyEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which is then passed to the medium spaCy model.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    spaCyEncoder("en_core_web_md")
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
Source code in embetter/text/_spacy.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class spaCyEncoder(EmbetterBase):
    """
    Encoder that turns text into vectors by means of a spaCy pipeline.

    Arguments:
        nlp: name of a spaCy model to load, or an already constructed spaCy `Language` object
        agg: how to build the document vector, can be "base" (uses the `.vector` of each doc),
            "mean", "max" or "both" (the latter three aggregate over the token vectors)

    **Usage**

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import spaCyEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which is then passed to the medium spaCy model.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        spaCyEncoder("en_core_web_md")
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(self, nlp: Union[str, Language], agg: str = "base"):
        if isinstance(nlp, Language):
            # A ready-made pipeline is used as-is.
            self.nlp = nlp
        elif isinstance(nlp, str):
            # A model name is loaded with ner, tagger and parser switched off.
            self.nlp = spacy.load(nlp, disable=["ner", "tagger", "parser"])
        else:
            raise ValueError("`nlp` must be `str` or spaCy-language object.")
        self.agg = agg

    def fit(self, X, y=None):
        """No-op apart from validating the `agg` setting; returns the encoder itself."""
        # Scikit-learn expects the validation to happen in `.fit()` as well.
        self._check_inputs(X)
        return self

    def _check_inputs(self, X):
        # Only the `agg` setting is validated here; `X` itself is not inspected.
        allowed = ["mean", "max", "both", "base"]
        if self.agg in allowed:
            return
        raise ValueError(f"The `agg` value must be in {allowed}. Got {self.agg}.")

    def transform(self, X, y=None):
        """Transforms the phrase text into a numeric representation."""
        self._check_inputs(X)
        parsed = self.nlp.pipe(X)
        if self.agg == "base":
            return np.array([doc.vector for doc in parsed])

        # One array per document, stacking the vectors of its tokens.
        per_doc = [np.array([token.vector for token in doc]) for doc in parsed]

        def _mean_rows():
            return np.array([vectors.mean(axis=0) for vectors in per_doc])

        def _max_rows():
            return np.array([vectors.max(axis=0) for vectors in per_doc])

        if self.agg == "mean":
            return _mean_rows()
        if self.agg == "max":
            return _max_rows()
        if self.agg == "both":
            # Mean features first, max features second.
            return np.concatenate([_mean_rows(), _max_rows()], axis=1)

Sense2VecEncoder

Bases: BaseEstimator

Create a Sense2Vec encoder, meant to help when encoding phrases as opposed to sentences.

Parameters:

Name Type Description Default
path str

path to downloaded model

required

Usage

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import Sense2VecEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which is then passed to the sense2vec model.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    Sense2VecEncoder("path/to/s2v")
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
Source code in embetter/text/_s2v.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
class Sense2VecEncoder(BaseEstimator):
    """
    Create a [Sense2Vec encoder](https://github.com/explosion/sense2vec), meant to
    help when encoding phrases as opposed to sentences.

    Arguments:
        path: path to downloaded model

    **Usage**

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import Sense2VecEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which is then passed to the sense2vec model.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        Sense2VecEncoder("path/to/s2v")
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
    ```
    """

    def __init__(self, path: str):
        self.path = path
        self.s2v = Sense2Vec().from_disk(self.path)
        # The vector of a fixed probe key is used to learn the embedding shape,
        # which is needed to build the zero-vector fallback in `_to_vector`.
        self.shape = self.s2v["duck|NOUN"].shape

    def fit(self, X, y=None):
        """No-op that returns the encoder itself, so it can be used as a scikit-learn pipeline step."""
        return self

    def _to_vector(self, text):
        # Look up the best matching sense; fall back to zeros when there is none.
        sense = self.s2v.get_best_sense(text)
        if not sense:
            return np.zeros(shape=self.shape)
        return self.s2v[sense]

    def transform(self, X, y=None):
        """Transforms the phrase text into a numeric representation."""
        return np.array([self._to_vector(x) for x in X])

BytePairEncoder

Bases: EmbetterBase

This encoder represents text using token-free pre-trained subword embeddings. Originally created by Benjamin Heinzerling and Michael Strube.

These vectors will be downloaded automatically by the BPEmb package. You can also specify "multi" to download multi-language embeddings. A full list of available languages can be found here. The article that belongs to this work can be found here. The availability of vocabulary size as well as dimensionality can be verified on the project website. See here for an example link in English. Please credit the original authors if you use their work.

Parameters:

Name Type Description Default
lang str

name of the model to load

required
vs int

vocabulary size of the byte pair model

1000
dim int

the embedding dimensionality

25
agg str

the aggregation method to reduce many subword vectors into a single one, can be "max", "mean" or "both"

'mean'
cache_dir Path

The folder in which downloaded BPEmb files will be cached, can overwrite to custom folder.

None

Usage

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import BytePairEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which then get fed into a small English model
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    BytePairEncoder(lang="en")
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
Source code in embetter/text/_bpemb.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class BytePairEncoder(EmbetterBase):
    """
    This encoder represents text using token-free pre-trained subword embeddings. Originally created by
    Benjamin Heinzerling and Michael Strube.

    These vectors will be downloaded automatically by the [BPEmb package](https://nlp.h-its.org/bpemb/).
    You can also specify "multi" to download multi-language embeddings. A full list of available
    languages can be found [here](https://nlp.h-its.org/bpemb). The article that
    belongs to this work can be found [here](http://www.lrec-conf.org/proceedings/lrec2018/pdf/1049.pdf).
    The availability of vocabulary size as well as dimensionality can be verified
    on the project website. See [here](https://nlp.h-its.org/bpemb/en/) for an
    example link in English. Please credit the original authors if you use their work.

    Arguments:
        lang: name of the model to load
        vs: vocabulary size of the byte pair model
        dim: the embedding dimensionality
        agg: the aggregation method to reduce many subword vectors into a single one, can be "max", "mean" or "both"
        cache_dir: The folder in which downloaded BPEmb files will be cached, can overwrite to custom folder.

    **Usage**

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import BytePairEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which then gets fed into a small English model
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        BytePairEncoder(lang="en")
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(
        self,
        lang: str,
        vs: int = 1000,
        dim: int = 25,
        agg: str = "mean",
        cache_dir: Path = None,
    ):
        self.lang = lang
        self.vs = vs
        self.dim = dim
        # The attribute keeps the value exactly as passed in (possibly None);
        # only the local variable handed to BPEmb receives the default folder.
        self.cache_dir = cache_dir
        self.agg = agg
        if not cache_dir:
            cache_dir = Path.home() / Path(".cache/bpemb")
        self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir)

    def fit(self, X, y=None):
        """No-op. Merely checks for object inputs per sklearn standard."""
        # Scikit-learn also expects this in the `.fit()` command.
        self._check_inputs(X)
        return self

    def _check_inputs(self, X):
        # Only the `agg` setting is validated here; `X` itself is not inspected.
        options = ["mean", "max", "both"]
        if self.agg not in options:
            raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")

    def transform(self, X, y=None):
        """
        Transforms the phrase text into a numeric representation.

        Arguments:
            X: iterable of texts to encode
            y: ignored, only present for scikit-learn API compatibility

        Returns:
            A numpy array with one row per text. With `agg="both"` each row holds the
            mean-aggregated features followed by the max-aggregated features.
        """
        self._check_inputs(X)
        # Embed every text exactly once; the aggregations below reuse the result.
        subword_vectors = [self.module.embed(x) for x in X]
        if self.agg == "mean":
            return np.array([vecs.mean(axis=0) for vecs in subword_vectors])
        if self.agg == "max":
            return np.array([vecs.max(axis=0) for vecs in subword_vectors])
        if self.agg == "both":
            mean_arr = np.array([vecs.mean(axis=0) for vecs in subword_vectors])
            max_arr = np.array([vecs.max(axis=0) for vecs in subword_vectors])
            return np.concatenate([mean_arr, max_arr], axis=1)

GensimEncoder

Bases: EmbetterBase

Encodes text using a static word embedding model. The component uses gensim's default tokenizer.

Parameters:

Name Type Description Default
model Union[str, Word2Vec, KeyedVectors]

Model name, path to model on disk, Word2Vec instance or KeyedVectors instance.

'word2vec-google-news-300'
agg Literal['mean', 'max', 'both']

Way to aggregate the word embeddings in a document. Can either take the maximum, mean or both of them concatenated.

'mean'
deacc bool

Specifies whether accents should be removed when tokenizing the text.

False
lowercase bool

Specifies whether the text should be lowercased during tokenization.

False
Currently the following models are supported by default
  • conceptnet-numberbatch-17-06-300
  • word2vec-ruscorpora-300
  • word2vec-google-news-300
  • glove-wiki-gigaword-50
  • glove-wiki-gigaword-100
  • glove-wiki-gigaword-200
  • glove-wiki-gigaword-300
  • glove-twitter-25
  • glove-twitter-50
  • glove-twitter-100
  • glove-twitter-200

Usage

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from embetter.grab import ColumnGrabber
from embetter.text import GensimEncoder

# Let's suppose this is the input dataframe
dataf = pd.DataFrame({
    "text": ["positive sentiment", "super negative"],
    "label_col": ["pos", "neg"]
})

# This pipeline grabs the `text` column from a dataframe
# which is then passed to a static word embedding model.
text_emb_pipeline = make_pipeline(
    ColumnGrabber("text"),
    GensimEncoder("glove-wiki-gigaword-50")
)
X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

# This pipeline can also be trained to make predictions, using
# the embedded features.
text_clf_pipeline = make_pipeline(
    text_emb_pipeline,
    LogisticRegression()
)

# Prediction example
text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
Source code in embetter/text/_word2vec.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
class GensimEncoder(EmbetterBase):
    """
    Encodes text using a static word embedding model. The component uses gensim's default tokenizer.

    Documents in which no token is found in the model's vocabulary are encoded as a row of NaN values.

    Arguments:
        model: Model name, path to model on disk, Word2Vec instance or KeyedVectors instance.
        agg: Way to aggregate the word embeddings in a document. Can either take the maximum, mean or both of them concatenated.
        deacc: Specifies whether accents should be removed when tokenizing the text.
        lowercase: Specifies whether the text should be lowercased during tokenization.

    Currently the following models are supported by default:
     - `conceptnet-numberbatch-17-06-300`
     - `word2vec-ruscorpora-300`
     - `word2vec-google-news-300`
     - `glove-wiki-gigaword-50`
     - `glove-wiki-gigaword-100`
     - `glove-wiki-gigaword-200`
     - `glove-wiki-gigaword-300`
     - `glove-twitter-25`
     - `glove-twitter-50`
     - `glove-twitter-100`
     - `glove-twitter-200`

    **Usage**

    ```python
    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    from embetter.grab import ColumnGrabber
    from embetter.text import GensimEncoder

    # Let's suppose this is the input dataframe
    dataf = pd.DataFrame({
        "text": ["positive sentiment", "super negative"],
        "label_col": ["pos", "neg"]
    })

    # This pipeline grabs the `text` column from a dataframe
    # which is then passed to a static word embedding model.
    text_emb_pipeline = make_pipeline(
        ColumnGrabber("text"),
        GensimEncoder("glove-wiki-gigaword-50")
    )
    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])

    # This pipeline can also be trained to make predictions, using
    # the embedded features.
    text_clf_pipeline = make_pipeline(
        text_emb_pipeline,
        LogisticRegression()
    )

    # Prediction example
    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    ```
    """

    def __init__(
        self,
        model: Union[str, Word2Vec, KeyedVectors] = "word2vec-google-news-300",
        agg: Literal["mean", "max", "both"] = "mean",
        deacc: bool = False,
        lowercase: bool = False,
    ):
        self.model = model
        if isinstance(model, str):
            # A string is first tried as a downloadable model name and
            # otherwise treated as a path to an object saved on disk.
            if model in downloader.info()["models"]:
                self.keyed_vectors: KeyedVectors = downloader.load(model)  # type: ignore
            else:
                loaded_object = SaveLoad().load(self.model)
                if isinstance(loaded_object, Word2Vec):
                    self.keyed_vectors = loaded_object.wv
                elif isinstance(loaded_object, KeyedVectors):
                    self.keyed_vectors = loaded_object
                else:
                    raise TypeError(
                        "Object loaded from disk is not Word2Vec nor a KeyedVectors instance."
                    )
        elif isinstance(model, Word2Vec):
            self.keyed_vectors: KeyedVectors = model.wv
        elif isinstance(model, KeyedVectors):
            self.keyed_vectors: KeyedVectors = model
        else:
            raise TypeError(
                f"You should pass a model name, keyed vectors or a Word2Vec model to Word2VecEncoder, not {type(model)}"
            )
        self.agg = agg
        self.deacc = deacc
        self.lowercase = lowercase
        # "both" concatenates the mean and max vectors, doubling the output width.
        self.n_features_out = (
            self.keyed_vectors.vector_size
            if self.agg != "both"
            else self.keyed_vectors.vector_size * 2
        )

    def fit(self, X, y=None):
        """No-op. Merely checks for object inputs per sklearn standard."""
        # Scikit-learn also expects this in the `.fit()` command.
        self._check_inputs(X)
        return self

    def _check_inputs(self, X):
        # Only the `agg` setting is validated here; `X` itself is not inspected.
        options = ["mean", "max", "both"]
        if self.agg not in options:
            raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")

    def _tokenize(self, X) -> List[List[int]]:
        # For every text, collect the vocabulary indices of its known tokens;
        # tokens missing from the vocabulary are skipped.
        token_indices = []
        for text in X:
            tokens = tokenize(text, deacc=self.deacc, lowercase=self.lowercase)
            indices = []
            for token in tokens:
                index = self.keyed_vectors.get_index(token, default=-1)
                if index != -1:
                    indices.append(index)
            token_indices.append(indices)
        return token_indices

    def transform(self, X, y=None):
        """
        Transforms the phrase text into a numeric representation using word embeddings.

        Arguments:
            X: sized collection of texts to encode
            y: ignored, only present for scikit-learn API compatibility

        Returns:
            A numpy array of shape `(len(X), n_features_out)`. Rows that belong to
            documents without any known token are filled with NaN.
        """
        self._check_inputs(X)
        tokens = self._tokenize(X)
        embeddings = np.empty((len(X), self.n_features_out))
        for i_doc, token_indices in enumerate(tokens):
            if not len(token_indices):
                # Nothing to aggregate: mark the row as missing and move on.
                # Without skipping, `np.max` would raise on the empty array
                # and `np.mean` would emit a runtime warning.
                embeddings[i_doc, :] = np.nan
                continue
            doc_vectors = self.keyed_vectors.vectors[token_indices]
            if self.agg == "mean":
                embeddings[i_doc, :] = np.mean(doc_vectors, axis=0)
            elif self.agg == "max":
                embeddings[i_doc, :] = np.max(doc_vectors, axis=0)
            elif self.agg == "both":
                mean_vector = np.mean(doc_vectors, axis=0)
                max_vector = np.max(doc_vectors, axis=0)
                embeddings[i_doc, :] = np.concatenate((mean_vector, max_vector))
        return embeddings