
Commit

test in CI
gautierdag committed Nov 15, 2023
1 parent a147225 commit 5eab80d
Showing 3 changed files with 58 additions and 18 deletions.
58 changes: 51 additions & 7 deletions .github/workflows/CI.yml
@@ -11,14 +11,58 @@ on:
- main
- master
tags:
- '*'
- "*"
pull_request:
workflow_dispatch:

permissions:
contents: read

jobs:
coverage:
name: Coverage for ${{ matrix.os }}
strategy:
matrix:
os: ["ubuntu"]
runs-on: ${{ matrix.os }}-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.10"
cache: "pip"
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
profile: minimal
components: llvm-tools-preview
- name: Install cargo-llvm-cov
uses: taiki-e/install-action@cargo-llvm-cov
- uses: Swatinem/rust-cache@v1
with:
key: coverage-cargo-${{ matrix.os }}
continue-on-error: true
- name: Setup virtual environment
run: |
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
- name: Run coverage
run: |
source venv/bin/activate
source <(cargo llvm-cov show-env --export-prefix)
export CARGO_TARGET_DIR=$CARGO_LLVM_COV_TARGET_DIR
export CARGO_INCREMENTAL=1
cargo llvm-cov clean --workspace
cargo test
maturin develop
pytest tests --cov=foobar --cov-report xml
cargo llvm-cov --no-run --lcov --output-path coverage.lcov
- uses: codecov/codecov-action@v3
with:
files: coverage.lcov,coverage.xml
name: ${{ matrix.os }}
linux:
runs-on: ubuntu-latest
strategy:
@@ -28,13 +72,13 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
sccache: "true"
manylinux: auto
- name: Upload wheels
uses: actions/upload-artifact@v3
@@ -51,14 +95,14 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
architecture: ${{ matrix.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
sccache: "true"
- name: Upload wheels
uses: actions/upload-artifact@v3
with:
@@ -74,13 +118,13 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
sccache: "true"
- name: Upload wheels
uses: actions/upload-artifact@v3
with:
2 changes: 2 additions & 0 deletions README.md
@@ -1 +1,3 @@
# bpeasy

[![codecov](https://codecov.io/gh/gautierdag/bpeasy/branch/main/graph/badge.svg?token=NWHDJ22L8I)](https://codecov.io/gh/gautierdag/bpeasy)
16 changes: 5 additions & 11 deletions src/lib.rs
@@ -47,6 +47,7 @@ fn get_most_frequent_pair(
let mut pair_freqs: HashMap<(Vec<u8>, Vec<u8>), u128> = HashMap::new();

// Calculate frequencies for each pair of bytes in all sentences and words
// NOTE: Could be parallelized over sentences
for sentence in tokenized_bytes {
for word in sentence.windows(2) {
if word[0].len() + word[1].len() > max_token_length {
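The NOTE added above says the pair count could be parallelized over sentences. A minimal sketch of one way that might look, assuming the rayon crate were added as a dependency (it is not part of this commit): each sentence builds a local frequency map, and the maps are then reduced into one.

// Editorial sketch only, not part of the commit: a possible rayon-based
// parallelization of the pair-frequency count over sentences.
use rayon::prelude::*;
use std::collections::HashMap;

fn count_pairs_parallel(
    tokenized_bytes: &[Vec<Vec<u8>>],
    max_token_length: usize,
) -> HashMap<(Vec<u8>, Vec<u8>), u128> {
    tokenized_bytes
        .par_iter()
        .map(|sentence| {
            // Per-sentence frequency map, filled like the serial loop above.
            let mut local: HashMap<(Vec<u8>, Vec<u8>), u128> = HashMap::new();
            for word in sentence.windows(2) {
                // Skip pairs whose merged token would exceed the length cap.
                if word[0].len() + word[1].len() > max_token_length {
                    continue;
                }
                *local.entry((word[0].clone(), word[1].clone())).or_insert(0) += 1;
            }
            local
        })
        // Fold the per-sentence maps into a single global map.
        .reduce(HashMap::new, |mut acc, local| {
            for (pair, count) in local {
                *acc.entry(pair).or_insert(0) += count;
            }
            acc
        })
}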
@@ -69,6 +70,7 @@

fn merge_frequent_pair(tokenized_bytes: &mut Vec<Vec<Vec<u8>>>, left: Vec<u8>, right: Vec<u8>) {
// Merge the most frequent pair in all sentences and words
// NOTE: Could be parallelized over sentences
for sentence in tokenized_bytes.iter_mut() {
let mut i = 0;
while i < sentence.len() - 1 {
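The body of this merge loop is collapsed in the diff view. As an illustration only (not the committed implementation), merging every occurrence of the chosen (left, right) pair within one sentence could look like the sketch below; the index is not advanced after a merge so the new token can immediately pair with its next neighbour.

// Editorial sketch: in-place merge of one (left, right) pair within a sentence.
fn merge_pair_in_sentence(sentence: &mut Vec<Vec<u8>>, left: &[u8], right: &[u8]) {
    let mut i = 0;
    while i + 1 < sentence.len() {
        if sentence[i].as_slice() == left && sentence[i + 1].as_slice() == right {
            // Concatenate the two tokens and drop the right-hand one.
            let mut merged = sentence[i].clone();
            merged.extend_from_slice(&sentence[i + 1]);
            sentence[i] = merged;
            sentence.remove(i + 1);
            // Do not advance i: the merged token may pair with the next token.
        } else {
            i += 1;
        }
    }
}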
@@ -161,7 +163,6 @@ fn train_bpe(
if text.is_empty() {
continue;
}

let tokens_bytes = tokenize(text, regex);
tokenized_bytes.extend(tokens_bytes);
}
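For orientation, the overall loop that ties these pieces together (count the most frequent pair, record the merged token, apply the merge, repeat until the target vocabulary size) might be outlined as follows. This is an editorial sketch that reuses the two hypothetical helpers sketched earlier, not the committed build_bpe_vocab.

// Editorial outline of a BPE training loop, built on the sketches above.
fn train_outline(
    mut tokenized_bytes: Vec<Vec<Vec<u8>>>,
    max_token_length: usize,
    vocab_size: usize,
) -> std::collections::HashMap<Vec<u8>, u32> {
    let mut vocab = std::collections::HashMap::new();
    // Seed with the 256 single-byte tokens.
    for b in 0u16..256 {
        vocab.insert(vec![b as u8], b as u32);
    }
    while vocab.len() < vocab_size {
        let pair_freqs = count_pairs_parallel(&tokenized_bytes, max_token_length);
        // Stop early if no pair is left to merge.
        let Some(((left, right), _)) = pair_freqs.into_iter().max_by_key(|(_, c)| *c) else {
            break;
        };
        let mut merged = left.clone();
        merged.extend_from_slice(&right);
        let next_id = vocab.len() as u32;
        vocab.insert(merged, next_id);
        for sentence in tokenized_bytes.iter_mut() {
            merge_pair_in_sentence(sentence, &left, &right);
        }
    }
    vocab
}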
@@ -178,9 +179,9 @@ fn train_bpe(
Ok(python_dict_out.into())
}

/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.
/// bpeasy is a bare-bones implementation of byte-pair encoding (BPE) in Rust.
/// It is designed to be used as a Python module and returns a byte-pair vocabulary
/// as a Python dictionary.
#[pymodule]
fn bpeasy(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(train_bpe, m)?)?;
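The new doc comment states that the trained vocabulary comes back as a plain Python dictionary. As a rough sketch of what that conversion can look like with the pre-0.21 PyO3 API used elsewhere in this file (a hypothetical helper, not part of the commit):

// Editorial sketch: converting a Rust vocabulary into a Python dict of bytes -> int.
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyDict};
use std::collections::HashMap;

fn vocab_to_py_dict<'py>(
    py: Python<'py>,
    vocab: &HashMap<Vec<u8>, u32>,
) -> PyResult<&'py PyDict> {
    let dict = PyDict::new(py);
    for (token, id) in vocab {
        // Keys become Python bytes objects, values plain ints.
        dict.set_item(PyBytes::new(py, token), *id)?;
    }
    Ok(dict)
}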
@@ -197,9 +198,6 @@ mod tests {
let text = "a b c";
let regex = r"([^\s]+)|(\s+)";
let tokens = tokenize(text, regex);
// assert no error
// assert!(tokens.is_ok());

assert_eq!(
tokens,
vec![
@@ -216,17 +214,13 @@
fn test_all() {
let text: &str = "\tYou hear £ £ £ here";
let regex = r"([^\s]+)|(\s+)";
// let tokens = tokenize(text, regex);
// println!("{:?}", tokens);
// let tokenized_bytes = convert_to_tokenized_bytes(tokens);
let tokenized_bytes = tokenize(text, regex);
println!("{:?}", tokenized_bytes);

let vocab_size = 10;
let max_token_length = 128;
let bpe_vocab = build_bpe_vocab(tokenized_bytes, max_token_length, vocab_size);
println!("{:?}", bpe_vocab);
// Output or use the encoded text
}
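test_all above only prints its results. A hedged sketch of an assertion-based variant, assuming build_bpe_vocab returns a map keyed by the learned token bytes:

#[test]
fn test_tokens_respect_max_length() {
    // Editorial sketch, mirroring test_all but checking a property instead of printing.
    let text: &str = "\tYou hear £ £ £ here";
    let regex = r"([^\s]+)|(\s+)";
    let tokenized_bytes = tokenize(text, regex);
    let vocab_size = 10;
    let max_token_length = 128;
    let bpe_vocab = build_bpe_vocab(tokenized_bytes, max_token_length, vocab_size);
    // Every learned token should respect the configured length cap.
    for token in bpe_vocab.keys() {
        assert!(token.len() <= max_token_length);
    }
}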

#[test]
