pipeline

```python
from tokenwiser.pipeline import *
```

In the pipeline submodule you can find scikit-learn compatible pipelines that extend the standard behavior.
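
The components documented on this page can also be imported explicitly:

```python
from tokenwiser.pipeline import (
    PartialPipeline,        # a Pipeline that can .partial_fit()
    PartialFeatureUnion,    # a FeatureUnion that can .partial_fit()
    TextConcat,             # concatenates the text output of several transformers
    make_partial_pipeline,  # helper that builds a PartialPipeline
    make_concat,            # helper that builds a TextConcat
    make_partial_union,     # helper that builds a PartialFeatureUnion
)
```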

PartialPipeline (Pipeline)

A scikit-learn Pipeline that can also be fitted in batches via .partial_fit(). Every step in the pipeline needs to implement .partial_fit() itself.

Parameters:

| Name  | Type | Description                       | Default  |
|-------|------|-----------------------------------|----------|
| steps |      | a collection of text-transformers | required |

Examples:

```python
from tokenwiser.pipeline import PartialPipeline
from tokenwiser.textprep import HyphenTextPrep, Cleaner

tc = PartialPipeline([('clean', Cleaner()), ('hyp', HyphenTextPrep())])
data = ["dinosaurhead", "another$$ sentence$$"]
results = tc.partial_fit(data).transform(data)
expected = ['di no saur head', 'an other  sen tence']

assert results == expected
```

partial_fit(self, X, y=None, classes=None, **kwargs)

Fits the components, but allows for batches.

Source code in tokenwiser/pipeline/_pipe.py
```python
def partial_fit(self, X, y=None, classes=None, **kwargs):
    """
    Fits the components, but allow for batches.
    """
    for name, step in self.steps:
        if not hasattr(step, "partial_fit"):
            raise ValueError(
                f"Step {name} is a {step} which does not have `.partial_fit` implemented."
            )
    for name, step in self.steps:
        if hasattr(step, "predict"):
            step.partial_fit(X, y, classes=classes, **kwargs)
        else:
            step.partial_fit(X, y)
        if hasattr(step, "transform"):
            X = step.transform(X)
    return self
```
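
The example above passes the whole dataset to .partial_fit() in one call. The method exists so that the pipeline can instead be updated one batch at a time. Below is a minimal sketch of that usage; the Cleaner/HashingVectorizer/SGDClassifier combination and the batch split are illustrative, not prescribed by the library.

```python
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

from tokenwiser.textprep import Cleaner
from tokenwiser.pipeline import PartialPipeline

pipe = PartialPipeline([
    ("clean", Cleaner()),
    ("hash", HashingVectorizer()),
    ("clf", SGDClassifier()),
])

# Two small batches standing in for a stream that does not fit in memory.
batches = [
    (["i really like this post", "this is a bad post"], np.array([1, 0])),
    (["thanks for that comment", "i dislike this article"], np.array([1, 0])),
]

# Pass `classes` on every call so the classifier knows the full label set,
# even when an individual batch does not contain every class.
for X_batch, y_batch in batches:
    pipe.partial_fit(X_batch, y_batch, classes=[0, 1])

preds = pipe.predict(["i really like this post"])
```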

TextConcat (BaseEstimator)

A component like FeatureUnion, except that it concatenates the text output of each transformer.

Parameters:

| Name             | Type | Description                              | Default  |
|------------------|------|------------------------------------------|----------|
| transformer_list |      | list of (name, text-transformer)-tuples  | required |

Examples:

```python
from tokenwiser.textprep import HyphenTextPrep, Cleaner
from tokenwiser.pipeline import TextConcat

tc = TextConcat([("hyp", HyphenTextPrep()), ("clean", Cleaner())])
results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"])
expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence']

assert results == expected
```

partial_fit(self, X, y=None)

Fits the components, but allows for batches.

Source code in tokenwiser/pipeline/_concat.py
```python
def partial_fit(self, X, y=None):
    """
    Fits the components, but allow for batches.
    """
    names = [n for n, t in self.transformer_list]
    if len(names) != len(set(names)):
        raise ValueError("Make sure that the names of each step are unique.")
    return self
```
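
As the source shows, TextConcat keeps no state of its own: .partial_fit() only validates that the step names are unique and then returns the estimator. A minimal sketch of that behavior (the duplicate-name case is purely illustrative):

```python
from tokenwiser.textprep import HyphenTextPrep, Cleaner
from tokenwiser.pipeline import TextConcat

# Unique names: partial_fit is effectively a no-op and returns self.
TextConcat([("hyp", HyphenTextPrep()), ("clean", Cleaner())]).partial_fit(["dinosaurhead"])

# Duplicate names: partial_fit raises a ValueError.
try:
    TextConcat([("prep", HyphenTextPrep()), ("prep", Cleaner())]).partial_fit(["dinosaurhead"])
except ValueError as err:
    print(err)  # Make sure that the names of each step are unique.
```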

PartialFeatureUnion (FeatureUnion)

A PartialFeatureUnion is a FeatureUnion that is also able to .partial_fit().

Parameters:

| Name             | Type | Description                                      | Default  |
|------------------|------|--------------------------------------------------|----------|
| transformer_list |      | a list of transformers to apply and concatenate  | required |

Examples:

```python
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer

from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion

pipe = PartialPipeline([
    ("clean", Cleaner()),
    ("union", PartialFeatureUnion([
        ("full_text_pipe", PartialPipeline([
            ("identity", Identity()),
            ("hash1", HashingVectorizer()),
        ])),
        ("hyphen_pipe", PartialPipeline([
            ("hyphen", HyphenTextPrep()),
            ("hash2", HashingVectorizer()),
        ]))
    ])),
    ("clf", SGDClassifier())
])

X = [
    "i really like this post",
    "thanks for that comment",
    "i enjoy this friendly forum",
    "this is a bad post",
    "i dislike this article",
    "this is not well written"
]

y = np.array([1, 1, 1, 0, 0, 0])

for loop in range(3):
    pipe.partial_fit(X, y, classes=[0, 1])

assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0]))
```

partial_fit(self, X, y=None, classes=None, **kwargs)

Fits the components, but allows for batches.

Source code in tokenwiser/pipeline/_union.py
```python
def partial_fit(self, X, y=None, classes=None, **kwargs):
    """
    Fits the components, but allow for batches.
    """
    for name, step in self.transformer_list:
        if not hasattr(step, "partial_fit"):
            raise ValueError(
                f"Step {name} is a {step} which does not have `.partial_fit` implemented."
            )
    for name, step in self.transformer_list:
        if hasattr(step, "predict"):
            step.partial_fit(X, y, classes=classes, **kwargs)
        else:
            step.partial_fit(X, y)
    return self
```
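
Unlike PartialPipeline.partial_fit, the union does not chain its transformers: every entry in transformer_list receives the same raw X. The sketch below updates a union on its own, batch by batch, reusing the sub-pipelines from the example above; the batch split is illustrative.

```python
from sklearn.feature_extraction.text import HashingVectorizer

from tokenwiser.textprep import Identity, HyphenTextPrep
from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion

union = PartialFeatureUnion([
    ("full_text_pipe", PartialPipeline([
        ("identity", Identity()),
        ("hash1", HashingVectorizer()),
    ])),
    ("hyphen_pipe", PartialPipeline([
        ("hyphen", HyphenTextPrep()),
        ("hash2", HashingVectorizer()),
    ])),
])

# Each batch is passed, unchanged, to both sub-pipelines.
for batch in [["i really like this post"], ["this is a bad post"]]:
    union.partial_fit(batch)

# The union concatenates the hashed features of both branches.
features = union.transform(["i really like this post"])
```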

make_partial_pipeline(*steps)

Utility function to generate a PartialPipeline

Parameters:

| Name  | Type | Description                       | Default |
|-------|------|-----------------------------------|---------|
| steps |      | a collection of text-transformers | ()      |

Examples:

```python
from tokenwiser.pipeline import make_partial_pipeline
from tokenwiser.textprep import HyphenTextPrep, Cleaner

tc = make_partial_pipeline(Cleaner(), HyphenTextPrep())
data = ["dinosaurhead", "another$$ sentence$$"]
results = tc.partial_fit(data).transform(data)
expected = ['di no saur head', 'an other  sen tence']

assert results == expected
```

Source code in tokenwiser/pipeline/_pipe.py
````python
def make_partial_pipeline(*steps):
    """
    Utility function to generate a `PartialPipeline`

    Arguments:
        steps: a collection of text-transformers

    ```python
    from tokenwiser.pipeline import make_partial_pipeline
    from tokenwiser.textprep import HyphenTextPrep, Cleaner

    tc = make_partial_pipeline(Cleaner(), HyphenTextPrep())
    data = ["dinosaurhead", "another$$ sentence$$"]
    results = tc.partial_fit(data).transform(data)
    expected = ['di no saur head', 'an other  sen tence']

    assert results == expected
    ```
    """
    return PartialPipeline(_name_estimators(steps))
````
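
The make_* helpers name the generated steps automatically via scikit-learn's _name_estimators, so you do not choose the step names yourself. A minimal sketch of inspecting those names; the exact lowercase names in the comment follow scikit-learn's naming convention and are an assumption rather than something documented here.

```python
from tokenwiser.textprep import HyphenTextPrep, Cleaner
from tokenwiser.pipeline import make_partial_pipeline

pipe = make_partial_pipeline(Cleaner(), HyphenTextPrep())

# Step names are derived from the lowercased class names,
# e.g. something like ['cleaner', 'hyphentextprep'].
print([name for name, _ in pipe.steps])
```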

make_concat(*steps)

Utility function to generate a TextConcat

Parameters:

| Name  | Type | Description                       | Default |
|-------|------|-----------------------------------|---------|
| steps |      | a collection of text-transformers | ()      |

Examples:

```python
from tokenwiser.textprep import HyphenTextPrep, Cleaner
from tokenwiser.pipeline import make_concat

tc = make_concat(HyphenTextPrep(), Cleaner())
results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"])
expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence']

assert results == expected
```

Source code in tokenwiser/pipeline/_concat.py
````python
def make_concat(*steps):
    """
    Utility function to generate a `TextConcat`

    Arguments:
        steps: a collection of text-transformers

    ```python
    from tokenwiser.textprep import HyphenTextPrep, Cleaner
    from tokenwiser.pipeline import make_concat

    tc = make_concat(HyphenTextPrep(), Cleaner())
    results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"])
    expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence']

    assert results == expected
    ```
    """
    return TextConcat(_name_estimators(steps))
````
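
Because a TextConcat turns text into text and supports .partial_fit(), the result of make_concat can itself act as a preprocessing step inside a PartialPipeline, for instance in front of a vectorizer. The composition below is a sketch under that assumption (it presumes TextConcat exposes .transform() like the other text transformers on this page) rather than an example taken from the library docs.

```python
from sklearn.feature_extraction.text import HashingVectorizer

from tokenwiser.textprep import HyphenTextPrep, Cleaner
from tokenwiser.pipeline import make_concat, make_partial_pipeline

pipe = make_partial_pipeline(
    make_concat(HyphenTextPrep(), Cleaner()),  # hyphenated + cleaned text, concatenated
    HashingVectorizer(),
)

# One pass over a tiny batch: the concatenated text is hashed into features.
features = pipe.partial_fit(["dinosaurhead"]).transform(["dinosaurhead"])
```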

make_partial_union(*transformer_list)

Utility function to generate a PartialFeatureUnion

Parameters:

| Name             | Type | Description                                      | Default |
|------------------|------|--------------------------------------------------|---------|
| transformer_list |      | a list of transformers to apply and concatenate  | ()      |

Examples:

```python
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer

from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
from tokenwiser.pipeline import make_partial_pipeline, make_partial_union

pipe = make_partial_pipeline(
    Cleaner(),
    make_partial_union(
        make_partial_pipeline(Identity(), HashingVectorizer()),
        make_partial_pipeline(HyphenTextPrep(), HashingVectorizer())
    ),
    SGDClassifier()
)

X = [
    "i really like this post",
    "thanks for that comment",
    "i enjoy this friendly forum",
    "this is a bad post",
    "i dislike this article",
    "this is not well written"
]

y = np.array([1, 1, 1, 0, 0, 0])

for loop in range(3):
    pipe.partial_fit(X, y, classes=[0, 1])

assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0]))
```

Source code in tokenwiser/pipeline/_union.py
````python
def make_partial_union(*transformer_list):
    """
    Utility function to generate a `PartialFeatureUnion`

    Arguments:
        transformer_list: a list of transformers to apply and concatenate

    Example:

    ```python
    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.feature_extraction.text import HashingVectorizer

    from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
    from tokenwiser.pipeline import make_partial_pipeline, make_partial_union

    pipe = make_partial_pipeline(
        Cleaner(),
        make_partial_union(
            make_partial_pipeline(Identity(), HashingVectorizer()),
            make_partial_pipeline(HyphenTextPrep(), HashingVectorizer())
        ),
        SGDClassifier()
    )

    X = [
        "i really like this post",
        "thanks for that comment",
        "i enjoy this friendly forum",
        "this is a bad post",
        "i dislike this article",
        "this is not well written"
    ]

    y = np.array([1, 1, 1, 0, 0, 0])

    for loop in range(3):
        pipe.partial_fit(X, y, classes=[0, 1])

    assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0]))
    ```
    """
    return PartialFeatureUnion(_name_estimators(transformer_list))
````