Skip to content

Commit

Permalink
chore: cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
severinsimmler committed Nov 28, 2020
1 parent 4dae419 commit 1ed9cf7
Show file tree
Hide file tree
Showing 8 changed files with 370 additions and 73 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,19 @@ You can install the latest stable version from [PyPI](https://pypi.org/project/c
$ pip install chaine
```


## Example

```python
>>> import chaine
>>> import datasets
>>> data = datasets.load_dataset("germeval_14")
>>> tokens = data["train"]["tokens"]
>>> labels = data["train"]["ner_tags"]
>>> crf = chaine.train(tokens, labels, max_iterations=100)
>>> sequence = chaine.featurize(["John", "Lennon", "was", "born", "in", "Liverpool"])
>>> crf.predict(sequence)
['B-PER', 'I-PER', 'O', 'O', 'O', 'B-LOC']
```


Expand Down
2 changes: 1 addition & 1 deletion chaine/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from chaine.model import Trainer, CRF
from chaine.core import process
from chaine.core import featurize, train
50 changes: 29 additions & 21 deletions chaine/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,47 @@
This module implements the high-level API
"""

from chaine.data import FeatureMatrix, Features, Sequence, Token
from chaine.typing import List, Optional, Iterable
from chaine.model import Trainer, CRF
from chaine.data import Token, Sequence
from chaine.typing import FeatureGenerator, List


def featurize(tokens: List[str]) -> "FeatureGenerator":
    """Featurize a sequence of tokens

    Parameters
    ----------
    tokens : List[str]
        Sequence of tokens to generate features for

    Yields
    ------
    FeatureGenerator
        One feature set at a time
    """
    # Wrap the plain strings in Token objects so the Sequence can inspect
    # position and casing. Use a new name instead of rebinding the `tokens`
    # parameter (the original shadowed it, changing its type mid-function).
    token_objects = [Token(index, text) for index, text in enumerate(tokens)]

    # Delegate the actual feature extraction to the Sequence
    yield from Sequence(token_objects).featurize()
def train(
    dataset: List[List[str]],
    labels: List[List[str]],
    *,
    model_filepath: str = "model.crf",
    **kwargs,
) -> "CRF":
    """Train a linear-chain conditional random field

    Parameters
    ----------
    dataset : List[List[str]]
        Dataset consisting of sequences of tokens
    labels : List[List[str]]
        Labels corresponding to the dataset
    model_filepath : str
        Where the fitted model is serialized (default: "model.crf");
        previously hard-coded, now overridable without breaking callers
    **kwargs
        Passed through to the Trainer (e.g. max_iterations)

    Returns
    -------
    CRF
        A conditional random field fitted on the dataset
    """
    # One lazy feature generator per sequence in the dataset
    features = [featurize(sequence) for sequence in dataset]

    trainer = Trainer("lbfgs", **kwargs)
    trainer.train(features, labels, model_filepath)

    # Load the serialized model back as a usable tagger
    return CRF(model_filepath)
77 changes: 39 additions & 38 deletions chaine/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,43 +69,44 @@ def __len__(self) -> int:
def __repr__(self) -> str:
    """Developer-facing representation showing the wrapped tokens."""
    return "<Sequence: {}>".format(self.tokens)

def __str__(self) -> str:
    """Space-separated surface form of the sequence."""
    # Coerce each item to str explicitly: self.tokens holds Token objects
    # (they expose .index/.is_upper/... elsewhere in this file), and
    # str.join raises TypeError on non-string items.
    return " ".join(str(token) for token in self.tokens)

class Features:
_feature2index = {}

def index(self, name: str) -> int:
if name not in self._feature2index:
self._feature2index[name] = len(self._feature2index)
return self._feature2index[name]

def vectorize(self, token: Token) -> List[int]:
return [
self.index(feature)
for feature in [
f"word.lower():{token.lower()}",
f"word.is_upper:{token.is_upper}",
f"word.is_title:{token.is_title}",
f"word.is_digit:{token.is_digit}",
]
]


@dataclass
class FeatureMatrix:
sequence: Sequence
features: Features

def __iter__(self) -> FeatureGenerator:
for token in self.sequence:
yield self.features.vectorize(token)

def __repr__(self) -> str:
return f"<FeatureMatrix: {len(self.sequence)} Tokens>"


class Labels:
pass

@property
def indices(self) -> List[int]:
    """Return the index attribute of every token, in sequence order."""
    return list(token.index for token in self.tokens)

class Parameters:
pass
def featurize(self) -> "FeatureGenerator":
    """Generate one feature set per token in the sequence

    Yields
    ------
    FeatureGenerator
        A set of feature strings for each token, including context
        features from the immediate left and right neighbors
    """
    # NOTE(review): the neighbor lookups below assume token.index equals
    # the token's position in self.tokens — confirm in the constructor.
    # Hoist the loop-invariant last index: the original recomputed
    # max(self.indices) on every iteration, making the loop O(n^2).
    last_index = max((token.index for token in self.tokens), default=-1)

    for token in self.tokens:
        # Features of the token itself
        features = {
            "bias=1.0",
            f"token.lower()={token.lower()}",
            f"token.is_upper()={token.is_upper}",
            f"token.is_title()={token.is_title}",
            f"token.is_digit()={token.is_digit}",
        }

        # Context features from the left neighbor, or mark sequence start
        if token.index > 0:
            left_token = self.tokens[token.index - 1]
            features.update(
                {
                    f"-1:token.lower()={left_token.lower()}",
                    f"-1:token.is_title()={left_token.is_title}",
                    f"-1:token.is_upper()={left_token.is_upper}",
                }
            )
        else:
            features.add("BOS=True")

        # Context features from the right neighbor, or mark sequence end
        if token.index < last_index:
            right_token = self.tokens[token.index + 1]
            features.update(
                {
                    f"+1:token.lower()={right_token.lower()}",
                    f"+1:token.is_title()={right_token.is_title}",
                    f"+1:token.is_upper()={right_token.is_upper}",
                }
            )
        else:
            features.add("EOS=True")

        yield features
6 changes: 6 additions & 0 deletions chaine/model.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ cdef class Trainer:
self._c_trainer.select("lbfgs", "crf1d")
self._c_trainer._init_trainer()

def __repr__(self):
    # Concise debug representation of the trainer; presumably self.params
    # exposes the configured hyperparameters — TODO confirm against the
    # rest of the Trainer class.
    return f"<Trainer: {self.params}>"

cdef _on_message(self, string message):
self._message(message)

Expand Down Expand Up @@ -194,6 +197,9 @@ cdef class CRF:
def __init__(self, model_filepath):
self._load(model_filepath)

def __repr__(self):
    # Concise debug representation of the model; presumably self.labels
    # lists the label set of the loaded model — TODO confirm against the
    # rest of the CRF class.
    return f"<CRF: {self.labels}>"

def _load(self, filepath):
self._check_model(filepath)
if not self.c_tagger.open(filepath):
Expand Down
13 changes: 1 addition & 12 deletions chaine/typing.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,4 @@
"""Project-wide type aliases."""

from typing import Generator, List, Optional

# NOTE(review): Sequence.featurize yields *sets* of feature strings, while
# this alias declares List[str] — confirm which container the trainer
# actually expects before tightening the alias.
FeatureGenerator = Generator[List[str], None, None]
TokenGenerator = Generator["Token", None, None]
Loading

0 comments on commit 1ed9cf7

Please sign in to comment.