-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5e1b8d9
commit 6dac3c7
Showing
10 changed files
with
194 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
import bpeasy as bpeasy | ||
import tokenizers |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .bpeasy import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from typing import Iterator | ||
|
||
# Type stub for `train_bpe`; the implementation is not in this file
# (presumably provided by the compiled `bpeasy` extension -- TODO confirm).
#
# Parameters, as declared by the signature:
#   iterator:         an iterator yielding `str` items.
#   python_regex:     a `str` (presumably a regex pattern -- verify against the implementation).
#   max_token_length: an `int`.
#   vocab_size:       an `int`.
#
# Returns a `dict` mapping `bytes` keys to `int` values.
def train_bpe(
    iterator: Iterator[str],
    python_regex: str,
    max_token_length: int,
    vocab_size: int,
) -> dict[bytes, int]: ...
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import json | ||
|
||
|
||
def load_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is opened explicitly as UTF-8 so that the result does not depend
    on the platform's default text encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialized Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
|
||
|
||
def get |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
// Original source: https://github.com/huggingface/tokenizers/blob/e3bcef288b5309a19556c02a16a9c58a52197b76/bindings/python/src/utils/iterators.rs | ||
use pyo3::prelude::*; | ||
use pyo3::AsPyPointer; | ||
use std::collections::VecDeque; | ||
|
||
/// A buffered iterator that takes care of locking the GIL only when needed.
/// The `PyIterator` provided by PyO3 keeps a Python GIL token all along
/// and thus doesn't allow us to release the GIL to let other threads run.
///
/// This iterator serves two purposes:
/// - First, as opposed to the `pyo3::PyIterator`, it stores a `Py<PyAny>` rather than a
///   GIL-bound reference, so it is meant to be `Send` and easy to parallelize
/// - Second, this lets us release the GIL between two refills of the buffer, allowing other
///   Python threads to work
pub struct PyBufferedIterator<T, F> {
    /// The underlying Python iterator; set to `None` once it has been exhausted.
    iter: Option<Py<PyAny>>,
    /// Applied to every object pulled from `iter`; its output is appended to `buffer`.
    converter: F,
    /// Converted items waiting to be handed out, consumed from the front.
    buffer: VecDeque<PyResult<T>>,
    /// Buffer length at which a refill stops pulling more items from `iter`.
    size: usize,
}
|
||
impl<T, F, I> PyBufferedIterator<T, F> | ||
where | ||
F: Fn(&PyAny) -> I, | ||
I: IntoIterator<Item = PyResult<T>>, | ||
{ | ||
/// Create a new PyBufferedIterator using the provided Python object. | ||
/// This object must implement the Python Iterator Protocol, and an error will | ||
/// be return if the contract is not respected. | ||
/// | ||
/// The `converter` provides a way to convert each item in the iterator into | ||
/// something that doesn't embed a 'py token and thus allows the GIL to be released | ||
/// | ||
/// The `buffer_size` represents the number of items that we buffer before we | ||
/// need to acquire the GIL again. | ||
pub fn new(iter: &PyAny, converter: F, buffer_size: usize) -> PyResult<Self> { | ||
let py = iter.py(); | ||
let iter: Py<PyAny> = unsafe { | ||
py.from_borrowed_ptr_or_err::<PyAny>(pyo3::ffi::PyObject_GetIter(iter.as_ptr()))? | ||
.to_object(py) | ||
}; | ||
|
||
Ok(Self { | ||
iter: Some(iter), | ||
converter, | ||
buffer: VecDeque::with_capacity(buffer_size), | ||
size: buffer_size, | ||
}) | ||
} | ||
|
||
/// Refill the buffer, and set `self.iter` as `None` if nothing more to get | ||
fn refill(&mut self) -> PyResult<()> { | ||
if self.iter.is_none() { | ||
return Ok(()); | ||
} | ||
Python::with_gil(|py| loop { | ||
if self.buffer.len() >= self.size { | ||
return Ok(()); | ||
} | ||
match unsafe { | ||
py.from_owned_ptr_or_opt::<PyAny>(pyo3::ffi::PyIter_Next( | ||
self.iter.as_ref().unwrap().as_ref(py).as_ptr(), | ||
)) | ||
} { | ||
Some(obj) => self.buffer.extend((self.converter)(obj)), | ||
None => { | ||
if PyErr::occurred(py) { | ||
return Err(PyErr::fetch(py)); | ||
} else { | ||
self.iter = None; | ||
} | ||
} | ||
}; | ||
if self.iter.is_none() { | ||
return Ok(()); | ||
} | ||
}) | ||
} | ||
} | ||
|
||
impl<T, F, I> Iterator for PyBufferedIterator<T, F> | ||
where | ||
F: Fn(&PyAny) -> I, | ||
I: IntoIterator<Item = PyResult<T>>, | ||
{ | ||
type Item = PyResult<T>; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
if !self.buffer.is_empty() { | ||
self.buffer.pop_front() | ||
} else if self.iter.is_some() { | ||
if let Err(e) = self.refill() { | ||
return Some(Err(e)); | ||
} | ||
self.next() | ||
} else { | ||
None | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters