release: v0.2.0

v0.2.0
severinsimmler · Dec 6, 2020 · 0be9f0b · 0be9f0b
2 parents 785bf51 + 9b73c81
commit 0be9f0b
Show file tree

Hide file tree

Showing 19 changed files with 1,224 additions and 717 deletions.
diff --git a/.gitignore b/.gitignore
@@ -124,4 +124,5 @@ venv.bak/
 
 .vscode
 
-chaine/model.cpp
+chaine/crf.cpp
+*.crf
diff --git a/README.md b/README.md
@@ -1,33 +1,31 @@
-# A lightweight Linear-Chain Conditional Random Field
+# A Lightweight Conditional Random Field
 
-This is a modern, fast and no-dependency Python library implementing a linear-chain conditional random field for natural language processing tasks like named entity recognition or part-of-speech tagging.
-
-
-## Installation
+This is a modern Python library without any third-party dependencies and with a backend written in C. It implements conditional random fields for natural language processing tasks like named entity recognition or part-of-speech tagging.
 
 You can install the latest stable version from [PyPI](https://pypi.org/project/chaine):
 
 ```
 $ pip install chaine
 ```
 
+If you are interested in the theoretical concepts behind conditional random fields, I recommend the paper by [Lafferty et al.](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers) that introduced them.
+
 
 ## Example
 
 ```python
 >>> import chaine
->>> import datasets
->>> data = datasets.load_dataset("germeval_14")
->>> tokens = data["train"]["tokens"]
->>> labels = data["train"]["ner_tags"]
->>> crf = chaine.train(tokens, labels, max_iterations=100)
->>> sequence = chaine.featurize(["todo", "todo", "todo"])
->>> crf.predict(sequence)
-["O", "O", "B-PER"]
+>>> sequences = [[["a", "a"], ["b", "b"]]]
+>>> labels = [["0", "1"]]
+>>> model = chaine.train(sequences, labels)
+>>> model.predict(sequences)
+[['0', '1']]
 ```
 
+Check out the introductory [Jupyter notebook](https://github.com/severinsimmler/chaine/blob/master/notebooks/tutorial.ipynb).
+
 
-## Disclaimer
+## Credits
 
 This library makes use of and is partially based on:
 

diff --git a/build.py b/build.py
@@ -6,7 +6,7 @@
 
 
 sources = [
-    "chaine/model.cpp",
+    "chaine/crf.cpp",
     "chaine/trainer_wrapper.cpp",
     "chaine/crfsuite/lib/cqdb/src/cqdb.c",
     "chaine/crfsuite/lib/cqdb/src/lookup3.c",
@@ -17,7 +17,7 @@
 sources = sorted(sources)
 
 
-includes = [
+include_dirs = [
     "chaine/crfsuite/include/",
     "chaine/crfsuite/lib/cqdb/include",
     "chaine/liblbfgs/include",
@@ -39,13 +39,13 @@ def c_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
 
 
 ext_modules = [
-    Extension("chaine.model", include_dirs=includes, language="c++", sources=sources)
+    Extension("chaine.crf", include_dirs=include_dirs, language="c++", sources=sources)
 ]
 
 
 def build(setup_kwargs):
     # cythonize
-    command = ["cython", "chaine/model.pyx", "--cplus", "-2", "-I", "chaine"]
+    command = ["cython", "chaine/crf.pyx", "--cplus", "-2", "-I", "chaine"]
     subprocess.check_call(command)
 
     # update setup.py kwargs

diff --git a/chaine/__init__.py b/chaine/__init__.py
@@ -1,2 +1,2 @@
-from chaine.model import Trainer, CRF
-from chaine.core import featurize, train
+from chaine.core import train
+from chaine.crf import Model, Trainer
diff --git a/chaine/core.py b/chaine/core.py
@@ -5,47 +5,28 @@
 This module implements the high-level API
 """
 
-from chaine.model import Trainer, CRF
-from chaine.data import Token, Sequence
-from chaine.typing import FeatureGenerator, List
+from chaine.crf import Model, Trainer
+from chaine.typing import Dataset, Labels
 
 
-def featurize(tokens: List[str]) -> FeatureGenerator:
-    """Featurize a sequence of tokens
+def train(dataset: Dataset, labels: Labels, **kwargs) -> Model:
+    """Train a conditional random field
 
     Parameters
     ----------
-    tokens : List[str]
-        Sequence of tokens to generate features for
+    dataset : Dataset
+        Dataset consisting of sequences of features
+    labels : Labels
+        Labels corresponding to each instance in the dataset
 
-    Returns
-    -------
-    FeatureGenerator
-        One feature set at a time
-    """
-    tokens = [Token(index, token) for index, token in enumerate(tokens)]
-    for features in Sequence(tokens).featurize():
-        yield features
-
-
-def train(dataset: List[List[str]], labels: List[List[str]], **kwargs) -> CRF:
-    """Train a linear-chain conditional random field
-
-    Parameters
-    ----------
-    dataset : List[List[str]]
-        Dataset consisting of sequences of tokens
-    labels : List[List[str]]
-        Labels corresponding to the dataset
-    
     Returns
     -------
     Model
         A conditional random field fitted on the dataset
     """
-    features = [featurize(sequence) for sequence in dataset]
-
-    trainer = Trainer("lbfgs", **kwargs)
-    trainer.train(features, labels, "model.crf")
+    # start training
+    trainer = Trainer(**kwargs)
+    trainer.train(dataset, labels, "model.crf")
 
-    return CRF("model.crf")
+    # load and return the trained model
+    return Model("model.crf")
-Original file line number
+Diff line change
@@ Expand Up / @@ -124,4 +124,5 @@ venv.bak/ @@
     .vscode
-    chaine/model.cpp
+    chaine/crf.cpp
+    *.crf