pipeline
¶
from tokenwiser.pipeline import *
In the pipeline
submodule you can find scikit-learn compatible
pipelines that extend the standard behavior.
PartialPipeline (Pipeline)
¶
A Pipeline whose steps can also be fitted in batches via .partial_fit
Parameters:
Name | Type | Description | Default |
---|---|---|---|
steps |
a collection of text-transformers |
required |
from tokenwiser.pipeline import PartialPipeline
from tokenwiser.textprep import HyphenTextPrep, Cleaner
tc = PartialPipeline([('clean', Cleaner()), ('hyp', HyphenTextPrep())])
data = ["dinosaurhead", "another$$ sentence$$"]
results = tc.partial_fit(data).transform(data)
expected = ['di no saur head', 'an other sen tence']
assert results == expected
partial_fit(self, X, y=None, classes=None, **kwargs)
¶
Fits the components, but allows for batches.
Source code in tokenwiser/pipeline/_pipe.py
def partial_fit(self, X, y=None, classes=None, **kwargs):
    """
    Fit every step of the pipeline on a single batch, in order.

    Arguments:
        X: the batch of data; it is re-bound to the output of each step
            that has a `.transform` method before reaching the next step
        y: optional targets, handed to every step's `.partial_fit`
        classes: forwarded only to steps that expose `.predict`
        **kwargs: forwarded only to steps that expose `.predict`

    Returns:
        The pipeline itself, so that calls can be chained.

    Raises:
        ValueError: if any step lacks a `.partial_fit` method.
    """
    # Validate every step up front so that a failure never leaves the
    # pipeline with only some of its steps updated.
    for name, step in self.steps:
        if not hasattr(step, "partial_fit"):
            raise ValueError(
                f"Step {name} is a {step} which does not have `.partial_fit` implemented."
            )

    for _, step in self.steps:
        if hasattr(step, "predict"):
            # Predictor-like steps are the only ones given `classes`/kwargs.
            step.partial_fit(X, y, classes=classes, **kwargs)
        else:
            step.partial_fit(X, y)
        if hasattr(step, "transform"):
            # Downstream steps are fitted on the transformed batch.
            X = step.transform(X)
    return self
TextConcat (BaseEstimator)
¶
A component like FeatureUnion
but this also concatenates the text.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
transformer_list |
list of (name, text-transformer)-tuples |
required |
Examples:
from tokenwiser.textprep import HyphenTextPrep, Cleaner
from tokenwiser.pipeline import TextConcat
tc = TextConcat([("hyp", HyphenTextPrep()), ("clean", Cleaner())])
results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"])
expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence']
assert results == expected
partial_fit(self, X, y=None)
¶
Fits the components, but allows for batches.
Source code in tokenwiser/pipeline/_concat.py
def partial_fit(self, X, y=None):
    """
    Batch-friendly fit: checks that the transformer names are unique.

    As written here, the method does not call `.partial_fit` on the
    individual transformers and does not read `X` or `y`; both are
    accepted so the signature matches the other estimators.

    Arguments:
        X: the batch of data (unused by this method)
        y: optional targets (unused by this method)

    Returns:
        The estimator itself, so that calls can be chained.

    Raises:
        ValueError: if two entries of `transformer_list` share a name.
    """
    names = [name for name, _ in self.transformer_list]
    # A set collapses duplicates, so a shorter set means a repeated name.
    if len(names) != len(set(names)):
        raise ValueError("Make sure that the names of each step are unique.")
    return self
PartialFeatureUnion (FeatureUnion)
¶
A PartialFeatureUnion
is a FeatureUnion
that is also able to .partial_fit.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
transformer_list |
a list of transformers to apply and concatenate |
required |
Examples:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion

# Two text pipelines (raw text and hyphenated text) are hashed separately
# and their features are combined before reaching the classifier.
pipe = PartialPipeline([
    ("clean", Cleaner()),
    ("union", PartialFeatureUnion([
        ("full_text_pipe", PartialPipeline([
            ("identity", Identity()),
            ("hash1", HashingVectorizer()),
        ])),
        ("hyphen_pipe", PartialPipeline([
            ("hyphen", HyphenTextPrep()),
            ("hash2", HashingVectorizer()),
        ])),
    ])),
    ("clf", SGDClassifier()),
])

X = [
    "i really like this post",
    "thanks for that comment",
    "i enjoy this friendly forum",
    "this is a bad post",
    "i dislike this article",
    "this is not well written",
]
y = np.array([1, 1, 1, 0, 0, 0])

# Feed the same batch a few times; `classes` is passed on every call.
for _ in range(3):
    pipe.partial_fit(X, y, classes=[0, 1])

assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0]))
partial_fit(self, X, y=None, classes=None, **kwargs)
¶
Fits the components, but allows for batches.
Source code in tokenwiser/pipeline/_union.py
def partial_fit(self, X, y=None, classes=None, **kwargs):
    """
    Fit every transformer of the union on a single batch.

    Each entry of `transformer_list` is fitted on the same, untransformed
    `X`; nothing is chained from one entry to the next.

    Arguments:
        X: the batch of data handed to every transformer
        y: optional targets, handed to every transformer's `.partial_fit`
        classes: forwarded only to entries that expose `.predict`
        **kwargs: forwarded only to entries that expose `.predict`

    Returns:
        The union itself, so that calls can be chained.

    Raises:
        ValueError: if any entry lacks a `.partial_fit` method.
    """
    # Validate everything first so a failure never leaves the union with
    # only some of its transformers updated.
    for name, step in self.transformer_list:
        if not hasattr(step, "partial_fit"):
            raise ValueError(
                f"Step {name} is a {step} which does not have `.partial_fit` implemented."
            )

    for _, step in self.transformer_list:
        if hasattr(step, "predict"):
            # Predictor-like entries are the only ones given `classes`/kwargs.
            step.partial_fit(X, y, classes=classes, **kwargs)
        else:
            step.partial_fit(X, y)
    return self
make_partial_pipeline(*steps)
¶
Utility function to generate a PartialPipeline
Parameters:
Name | Type | Description | Default |
---|---|---|---|
steps |
a collection of text-transformers |
() |
from tokenwiser.pipeline import make_partial_pipeline
from tokenwiser.textprep import HyphenTextPrep, Cleaner
tc = make_partial_pipeline(Cleaner(), HyphenTextPrep())
data = ["dinosaurhead", "another$$ sentence$$"]
results = tc.partial_fit(data).transform(data)
expected = ['di no saur head', 'an other sen tence']
assert results == expected
Source code in tokenwiser/pipeline/_pipe.py
def make_partial_pipeline(*steps):
    """
    Utility function to generate a `PartialPipeline`

    Arguments:
        steps: a collection of text-transformers

    ```python
    from tokenwiser.pipeline import make_partial_pipeline
    from tokenwiser.textprep import HyphenTextPrep, Cleaner

    tc = make_partial_pipeline(Cleaner(), HyphenTextPrep())
    data = ["dinosaurhead", "another$$ sentence$$"]
    results = tc.partial_fit(data).transform(data)
    expected = ['di no saur head', 'an other sen tence']

    assert results == expected
    ```
    """
    # NOTE(review): `_name_estimators` is presumably the scikit-learn helper
    # that derives a name for each step -- confirm against this module's imports.
    return PartialPipeline(_name_estimators(steps))
make_concat(*steps)
¶
Utility function to generate a TextConcat
Parameters:
Name | Type | Description | Default |
---|---|---|---|
steps |
a collection of text-transformers |
() |
from tokenwiser.textprep import HyphenTextPrep, Cleaner
from tokenwiser.pipeline import make_concat
tc = make_concat(HyphenTextPrep(), Cleaner())
results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"])
expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence']
assert results == expected
Source code in tokenwiser/pipeline/_concat.py
def make_concat(*steps):
    """
    Utility function to generate a `TextConcat`

    Arguments:
        steps: a collection of text-transformers

    ```python
    from tokenwiser.textprep import HyphenTextPrep, Cleaner
    from tokenwiser.pipeline import make_concat

    tc = make_concat(HyphenTextPrep(), Cleaner())
    results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"])
    expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence']

    assert results == expected
    ```
    """
    # NOTE(review): `_name_estimators` is presumably the scikit-learn helper
    # that derives a name for each step -- confirm against this module's imports.
    return TextConcat(_name_estimators(steps))
make_partial_union(*transformer_list)
¶
Utility function to generate a PartialFeatureUnion
Parameters:
Name | Type | Description | Default |
---|---|---|---|
transformer_list |
a list of transformers to apply and concatenate |
() |
Examples:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
from tokenwiser.pipeline import make_partial_pipeline, make_partial_union

# Two text pipelines (raw text and hyphenated text) are hashed separately
# and their features are combined before reaching the classifier.
pipe = make_partial_pipeline(
    Cleaner(),
    make_partial_union(
        make_partial_pipeline(Identity(), HashingVectorizer()),
        make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()),
    ),
    SGDClassifier(),
)

X = [
    "i really like this post",
    "thanks for that comment",
    "i enjoy this friendly forum",
    "this is a bad post",
    "i dislike this article",
    "this is not well written",
]
y = np.array([1, 1, 1, 0, 0, 0])

# Feed the same batch a few times; `classes` is passed on every call.
for _ in range(3):
    pipe.partial_fit(X, y, classes=[0, 1])

assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0]))
Source code in tokenwiser/pipeline/_union.py
def make_partial_union(*transformer_list):
    """
    Utility function to generate a `PartialFeatureUnion`

    Arguments:
        transformer_list: a list of transformers to apply and concatenate

    Example:

    ```python
    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.feature_extraction.text import HashingVectorizer

    from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
    from tokenwiser.pipeline import make_partial_pipeline, make_partial_union

    pipe = make_partial_pipeline(
        Cleaner(),
        make_partial_union(
            make_partial_pipeline(Identity(), HashingVectorizer()),
            make_partial_pipeline(HyphenTextPrep(), HashingVectorizer())
        ),
        SGDClassifier()
    )

    X = [
        "i really like this post",
        "thanks for that comment",
        "i enjoy this friendly forum",
        "this is a bad post",
        "i dislike this article",
        "this is not well written"
    ]

    y = np.array([1, 1, 1, 0, 0, 0])

    for loop in range(3):
        pipe.partial_fit(X, y, classes=[0, 1])

    assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0]))
    ```
    """
    # NOTE(review): `_name_estimators` is presumably the scikit-learn helper
    # that derives a name for each transformer -- confirm against this
    # module's imports.
    return PartialFeatureUnion(_name_estimators(transformer_list))