Skip to content

Commit

Permalink
release: v0.2.0
Browse files Browse the repository at this point in the history
v0.2.0
  • Loading branch information
severinsimmler authored Dec 6, 2020
2 parents 785bf51 + 9b73c81 commit 0be9f0b
Show file tree
Hide file tree
Showing 19 changed files with 1,224 additions and 717 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,5 @@ venv.bak/

.vscode

chaine/model.cpp
chaine/crf.cpp
*.crf
26 changes: 12 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,33 +1,31 @@
# A lightweight Linear-Chain Conditional Random Field
# A Lightweight Conditional Random Field

This is a modern, fast and no-dependency Python library implementing a linear-chain conditional random field for natural language processing tasks like named entity recognition or part-of-speech tagging.


## Installation
This is a modern Python library with no third-party dependencies and a backend written in C. It implements conditional random fields for natural language processing tasks such as named entity recognition and part-of-speech tagging.

You can install the latest stable version from [PyPI](https://pypi.org/project/chaine):

```
$ pip install chaine
```

If you are interested in the theoretical concepts behind conditional random fields, I recommend the introductory paper by [Lafferty et al.](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers).


## Example

```python
>>> import chaine
>>> import datasets
>>> data = datasets.load_dataset("germeval_14")
>>> tokens = data["train"]["tokens"]
>>> labels = data["train"]["ner_tags"]
>>> crf = chaine.train(tokens, labels, max_iterations=100)
>>> sequence = chaine.featurize(["todo", "todo", "todo"])
>>> crf.predict(sequence)
["O", "O", "B-PER"]
>>> sequences = [[["a", "a"], ["b", "b"]]]
>>> labels = [["0", "1"]]
>>> model = chaine.train(sequences, labels)
>>> model.predict(sequences)
[['0', '1']]
```

Check out the introductory [Jupyter notebook](https://github.com/severinsimmler/chaine/blob/master/notebooks/tutorial.ipynb).


## Disclaimer
## Credits

This library makes use of and is partially based on:

Expand Down
8 changes: 4 additions & 4 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


sources = [
"chaine/model.cpp",
"chaine/crf.cpp",
"chaine/trainer_wrapper.cpp",
"chaine/crfsuite/lib/cqdb/src/cqdb.c",
"chaine/crfsuite/lib/cqdb/src/lookup3.c",
Expand All @@ -17,7 +17,7 @@
sources = sorted(sources)


includes = [
include_dirs = [
"chaine/crfsuite/include/",
"chaine/crfsuite/lib/cqdb/include",
"chaine/liblbfgs/include",
Expand All @@ -39,13 +39,13 @@ def c_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):


ext_modules = [
Extension("chaine.model", include_dirs=includes, language="c++", sources=sources)
Extension("chaine.crf", include_dirs=include_dirs, language="c++", sources=sources)
]


def build(setup_kwargs):
# cythonize
command = ["cython", "chaine/model.pyx", "--cplus", "-2", "-I", "chaine"]
command = ["cython", "chaine/crf.pyx", "--cplus", "-2", "-I", "chaine"]
subprocess.check_call(command)

# update setup.py kwargs
Expand Down
4 changes: 2 additions & 2 deletions chaine/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from chaine.model import Trainer, CRF
from chaine.core import featurize, train
from chaine.core import train
from chaine.crf import Model, Trainer
45 changes: 13 additions & 32 deletions chaine/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,47 +5,28 @@
This module implements the high-level API
"""

from chaine.model import Trainer, CRF
from chaine.data import Token, Sequence
from chaine.typing import FeatureGenerator, List
from chaine.crf import Model, Trainer
from chaine.typing import Dataset, Labels


def featurize(tokens: List[str]) -> FeatureGenerator:
"""Featurize a sequence of tokens
def train(dataset: Dataset, labels: Labels, **kwargs) -> Model:
"""Train a conditional random field
Parameters
----------
tokens : List[str]
Sequence of tokens to generate features for
dataset : Dataset
Dataset consisting of sequences of features
labels : Labels
Labels corresponding to each instance in the dataset
Returns
-------
FeatureGenerator
One feature set at a time
"""
tokens = [Token(index, token) for index, token in enumerate(tokens)]
for features in Sequence(tokens).featurize():
yield features


def train(dataset: List[List[str]], labels: List[List[str]], **kwargs) -> CRF:
"""Train a linear-chain conditional random field
Parameters
----------
dataset : List[List[str]]
Dataset consisting of sequences of tokens
labels : List[List[str]]
Labels corresponding to the dataset
Returns
-------
CRF
A conditional random field fitted on the dataset
"""
features = [featurize(sequence) for sequence in dataset]

trainer = Trainer("lbfgs", **kwargs)
trainer.train(features, labels, "model.crf")
# start training
trainer = Trainer(**kwargs)
trainer.train(dataset, labels, "model.crf")

return CRF("model.crf")
# load and return the trained model
return Model("model.crf")
Loading

0 comments on commit 0be9f0b

Please sign in to comment.