
Commit

test in CI
gautierdag committed Nov 15, 2023
1 parent a147225 commit 5eab80d
Showing 3 changed files with 58 additions and 18 deletions.
58 changes: 51 additions & 7 deletions .github/workflows/CI.yml
@@ -11,14 +11,58 @@ on:
- main
- master
tags:
- '*'
- "*"
pull_request:
workflow_dispatch:

permissions:
contents: read

jobs:
coverage:
name: Coverage for ${{ matrix.os }}
strategy:
matrix:
os: ["ubuntu"]
runs-on: ${{ matrix.os }}-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.10"
cache: "pip"
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
profile: minimal
components: llvm-tools-preview
- name: Install cargo-llvm-cov
uses: taiki-e/install-action@cargo-llvm-cov
- uses: Swatinem/rust-cache@v1
with:
key: coverage-cargo-${{ matrix.os }}
continue-on-error: true
- name: Setup virtual environment
run: |
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
- name: Run coverage
run: |
source venv/bin/activate
source <(cargo llvm-cov show-env --export-prefix)
export CARGO_TARGET_DIR=$CARGO_LLVM_COV_TARGET_DIR
export CARGO_INCREMENTAL=1
cargo llvm-cov clean --workspace
cargo test
maturin develop
pytest tests --cov=foobar --cov-report xml
cargo llvm-cov --no-run --lcov --output-path coverage.lcov
- uses: codecov/codecov-action@v3
with:
files: coverage.lcov,coverage.xml
name: ${{ matrix.os }}
linux:
runs-on: ubuntu-latest
strategy:
@@ -28,13 +72,13 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
sccache: "true"
manylinux: auto
- name: Upload wheels
uses: actions/upload-artifact@v3
@@ -51,14 +95,14 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
architecture: ${{ matrix.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
sccache: "true"
- name: Upload wheels
uses: actions/upload-artifact@v3
with:
@@ -74,13 +118,13 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
sccache: "true"
- name: Upload wheels
uses: actions/upload-artifact@v3
with:
2 changes: 2 additions & 0 deletions README.md
@@ -1 +1,3 @@
# bpeasy

[![codecov](https://codecov.io/gh/gautierdag/bpeasy/branch/main/graph/badge.svg?token=NWHDJ22L8I)](https://codecov.io/gh/gautierdag/bpeasy)
16 changes: 5 additions & 11 deletions src/lib.rs
@@ -47,6 +47,7 @@ fn get_most_frequent_pair(
let mut pair_freqs: HashMap<(Vec<u8>, Vec<u8>), u128> = HashMap::new();

// Calculate frequencies for each pair of bytes in all sentences and words
// NOTE: Could be parallelized over sentences
for sentence in tokenized_bytes {
for word in sentence.windows(2) {
if word[0].len() + word[1].len() > max_token_length {
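The NOTE added above says the pair count could be parallelized over sentences. A minimal sketch of one way that might look, assuming the rayon crate were added as a dependency (it is not part of this commit): each sentence builds a local frequency map, and the maps are then reduced into one.

// Editorial sketch only, not part of the commit: a possible rayon-based
// parallelization of the pair-frequency count over sentences.
use rayon::prelude::*;
use std::collections::HashMap;

fn count_pairs_parallel(
    tokenized_bytes: &[Vec<Vec<u8>>],
    max_token_length: usize,
) -> HashMap<(Vec<u8>, Vec<u8>), u128> {
    tokenized_bytes
        .par_iter()
        .map(|sentence| {
            // Per-sentence frequency map, filled like the serial loop above.
            let mut local: HashMap<(Vec<u8>, Vec<u8>), u128> = HashMap::new();
            for word in sentence.windows(2) {
                // Skip pairs whose merged token would exceed the length cap.
                if word[0].len() + word[1].len() > max_token_length {
                    continue;
                }
                *local.entry((word[0].clone(), word[1].clone())).or_insert(0) += 1;
            }
            local
        })
        // Fold the per-sentence maps into a single global map.
        .reduce(HashMap::new, |mut acc, local| {
            for (pair, count) in local {
                *acc.entry(pair).or_insert(0) += count;
            }
            acc
        })
}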
@@ -69,6 +70,7 @@

fn merge_frequent_pair(tokenized_bytes: &mut Vec<Vec<Vec<u8>>>, left: Vec<u8>, right: Vec<u8>) {
// Merge the most frequent pair in all sentences and words
// NOTE: Could be parallelized over sentences
for sentence in tokenized_bytes.iter_mut() {
let mut i = 0;
while i < sentence.len() - 1 {
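The body of this merge loop is collapsed in the diff view. As an illustration only (not the committed implementation), merging every occurrence of the chosen (left, right) pair within one sentence could look like the sketch below; the index is not advanced after a merge so the new token can immediately pair with its next neighbour.

// Editorial sketch: in-place merge of one (left, right) pair within a sentence.
fn merge_pair_in_sentence(sentence: &mut Vec<Vec<u8>>, left: &[u8], right: &[u8]) {
    let mut i = 0;
    while i + 1 < sentence.len() {
        if sentence[i].as_slice() == left && sentence[i + 1].as_slice() == right {
            // Concatenate the two tokens and drop the right-hand one.
            let mut merged = sentence[i].clone();
            merged.extend_from_slice(&sentence[i + 1]);
            sentence[i] = merged;
            sentence.remove(i + 1);
            // Do not advance i: the merged token may pair with the next token.
        } else {
            i += 1;
        }
    }
}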
@@ -161,7 +163,6 @@ fn train_bpe(
if text.is_empty() {
continue;
}

let tokens_bytes = tokenize(text, regex);
tokenized_bytes.extend(tokens_bytes);
}
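For orientation, the overall loop that ties these pieces together (count the most frequent pair, record the merged token, apply the merge, repeat until the target vocabulary size) might be outlined as follows. This is an editorial sketch that reuses the two hypothetical helpers sketched earlier, not the committed build_bpe_vocab.

// Editorial outline of a BPE training loop, built on the sketches above.
fn train_outline(
    mut tokenized_bytes: Vec<Vec<Vec<u8>>>,
    max_token_length: usize,
    vocab_size: usize,
) -> std::collections::HashMap<Vec<u8>, u32> {
    let mut vocab = std::collections::HashMap::new();
    // Seed with the 256 single-byte tokens.
    for b in 0u16..256 {
        vocab.insert(vec![b as u8], b as u32);
    }
    while vocab.len() < vocab_size {
        let pair_freqs = count_pairs_parallel(&tokenized_bytes, max_token_length);
        // Stop early if no pair is left to merge.
        let Some(((left, right), _)) = pair_freqs.into_iter().max_by_key(|(_, c)| *c) else {
            break;
        };
        let mut merged = left.clone();
        merged.extend_from_slice(&right);
        let next_id = vocab.len() as u32;
        vocab.insert(merged, next_id);
        for sentence in tokenized_bytes.iter_mut() {
            merge_pair_in_sentence(sentence, &left, &right);
        }
    }
    vocab
}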
@@ -178,9 +179,9 @@ fn train_bpe(
Ok(python_dict_out.into())
}

/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.
/// bpeasy is a bare-bones implementation of byte-pair encoding (BPE) in Rust.
/// It is designed to be used as a Python module and returns a byte-pair vocabulary
/// as a Python dictionary.
#[pymodule]
fn bpeasy(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(train_bpe, m)?)?;
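The new doc comment states that the trained vocabulary comes back as a plain Python dictionary. As a rough sketch of what that conversion can look like with the pre-0.21 PyO3 API used elsewhere in this file (a hypothetical helper, not part of the commit):

// Editorial sketch: converting a Rust vocabulary into a Python dict of bytes -> int.
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyDict};
use std::collections::HashMap;

fn vocab_to_py_dict<'py>(
    py: Python<'py>,
    vocab: &HashMap<Vec<u8>, u32>,
) -> PyResult<&'py PyDict> {
    let dict = PyDict::new(py);
    for (token, id) in vocab {
        // Keys become Python bytes objects, values plain ints.
        dict.set_item(PyBytes::new(py, token), *id)?;
    }
    Ok(dict)
}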
@@ -197,9 +198,6 @@ mod tests {
let text = "a b c";
let regex = r"([^\s]+)|(\s+)";
let tokens = tokenize(text, regex);
// assert no error
// assert!(tokens.is_ok());

assert_eq!(
tokens,
vec![
@@ -216,17 +214,13 @@
fn test_all() {
let text: &str = "\tYou hear £ £ £ here";
let regex = r"([^\s]+)|(\s+)";
// let tokens = tokenize(text, regex);
// println!("{:?}", tokens);
// let tokenized_bytes = convert_to_tokenized_bytes(tokens);
let tokenized_bytes = tokenize(text, regex);
println!("{:?}", tokenized_bytes);

let vocab_size = 10;
let max_token_length = 128;
let bpe_vocab = build_bpe_vocab(tokenized_bytes, max_token_length, vocab_size);
println!("{:?}", bpe_vocab);
// Output or use the encoded text
}
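test_all above only prints its results. A hedged sketch of an assertion-based variant, assuming build_bpe_vocab returns a map keyed by the learned token bytes:

#[test]
fn test_tokens_respect_max_length() {
    // Editorial sketch, mirroring test_all but checking a property instead of printing.
    let text: &str = "\tYou hear £ £ £ here";
    let regex = r"([^\s]+)|(\s+)";
    let tokenized_bytes = tokenize(text, regex);
    let vocab_size = 10;
    let max_token_length = 128;
    let bpe_vocab = build_bpe_vocab(tokenized_bytes, max_token_length, vocab_size);
    // Every learned token should respect the configured length cap.
    for token in bpe_vocab.keys() {
        assert!(token.len() <= max_token_length);
    }
}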

#[test]
