Skip to content

extension

from tokenwiser.extension import *

In the extension submodule you can find spaCy compatible extensions.

attach_hyphen_extension()

This function will attach an extension ._.hyphen to the Tokens.

import spacy
from tokenwiser.extension import attach_hyphen_extension

nlp = spacy.load("en_core_web_sm")
# Attach the Hyphen extensions.
attach_hyphen_extension()

# Now you can query hyphens on the tokens.
doc = nlp("this is a dinosaurhead")
tok = doc[-1]

assert tok._.hyphen == ["di", "no", "saur", "head"]
Source code in tokenwiser/extension/_extension.py
def attach_hyphen_extension():
    """
    This function will attach an extension `._.hyphen` to the `Token`s.

    ```python
    import spacy
    from tokenwiser.extension import attach_hyphen_extension

    nlp = spacy.load("en_core_web_sm")
    # Attach the Hyphen extensions.
    attach_hyphen_extension()

    # Now you can query hyphens on the tokens.
    doc = nlp("this is a dinosaurhead")
    tok = doc[-1]

    assert tok._.hyphen == ["di", "no", "saur", "head"]
    ```
    """
    Token.set_extension(
        "hyphen",
        getter=lambda t: HyphenTextPrep().encode_single(t.text).split(" "),
        force=True,
    )

sklearn_method(estimator)

A helper to turn a scikit-learn estimator into a spaCy extension.

Just in case you really wanted to do it manually.

import spacy
from spacy.tokens import Doc

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from tokenwiser.extension import sklearn_method

X = [
    "i really like this post",
    "thanks for that comment",
    "i enjoy this friendly forum",
    "this is a bad post",
    "i dislike this article",
    "this is not well written"
]

y = ["pos", "pos", "pos", "neg", "neg", "neg"]

# First we train a (silly) model.
mod = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y)

# This is where we attach the scikit-learn model to spaCy as a method extension.
Doc.set_extension("sillysent_method", method=sklearn_method(mod))
# This is where we attach the scikit-learn model to spaCy as a property extension.
Doc.set_extension("sillysent_prop", getter=sklearn_method(mod))

# Demo
nlp = spacy.load("en_core_web_sm")
doc = nlp("thank you, really nice")

doc._.sillysent_method() # {"neg": 0.4446964938410244, "pos: 0.5553035061589756}
doc._.sillysent_prop # {"neg: 0.4446964938410244, "pos": 0.5553035061589756}
Source code in tokenwiser/extension/_extension.py
def sklearn_method(estimator):
    """
    A helper to turn a scikit-learn estimator into a spaCy extension.

    Just in case you *really* wanted to do it manually.

    ```python
    import spacy
    from spacy.tokens import Doc

    from sklearn.pipeline import make_pipeline
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression

    from tokenwiser.extension import sklearn_method

    X = [
        "i really like this post",
        "thanks for that comment",
        "i enjoy this friendly forum",
        "this is a bad post",
        "i dislike this article",
        "this is not well written"
    ]

    y = ["pos", "pos", "pos", "neg", "neg", "neg"]

    # First we train a (silly) model.
    mod = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y)

    # This is where we attach the scikit-learn model to spaCy as a method extension.
    Doc.set_extension("sillysent_method", method=sklearn_method(mod))
    # This is where we attach the scikit-learn model to spaCy as a property extension.
    Doc.set_extension("sillysent_prop", getter=sklearn_method(mod))

    # Demo
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("thank you, really nice")

    doc._.sillysent_method() # {"neg": 0.4446964938410244, "pos: 0.5553035061589756}
    doc._.sillysent_prop # {"neg: 0.4446964938410244, "pos": 0.5553035061589756}
    ```
    """

    def method(doc):
        proba = estimator.predict_proba([doc.text])[0]
        return {c: p for c, p in zip(estimator.classes_, proba)}

    return method