Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom ICU build #367

Open
wants to merge 31 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
1dd6b66
Add unicode normalization layer tests
apaniukov Nov 26, 2024
afd4a60
WiP
apaniukov Nov 29, 2024
2f24fec
WiP
apaniukov Dec 20, 2024
08052c2
Switch Casefold and UnicodeNormalization to CharsMap
apaniukov Jan 8, 2025
f6c001b
Add unicode normalization layer tests
apaniukov Nov 26, 2024
472b163
WiP
apaniukov Nov 29, 2024
04fb20c
WiP
apaniukov Dec 20, 2024
ed1203f
Switch Casefold and UnicodeNormalization to CharsMap
apaniukov Jan 8, 2025
012fb8e
Update tests and fix custom charsmap support
apaniukov Jan 9, 2025
e3831ec
Merge remote-tracking branch 'origin/update-normalization' into updat…
apaniukov Jan 9, 2025
8092720
Ruff checks
apaniukov Jan 9, 2025
df34dee
Merge branch 'master' into update-normalization
apaniukov Jan 9, 2025
6a611f3
wip
apaniukov Jan 9, 2025
258f0f4
wip
apaniukov Jan 9, 2025
baf0e70
wip
apaniukov Jan 9, 2025
6177b81
Switch Off FastTokenizer
apaniukov Jan 10, 2025
68b7e4e
Delete torch from dependencies
apaniukov Jan 10, 2025
7244191
Delete FastTokenizer from cmake and readme
apaniukov Jan 10, 2025
082064c
Delete FastTokenizer related patches
apaniukov Jan 10, 2025
7380898
Delete FastTokenizer build form CI
apaniukov Jan 10, 2025
68d0300
Delete FastTokenizer build form CI
apaniukov Jan 10, 2025
fc094a0
Delete FastTokenizer from Cmake
apaniukov Jan 10, 2025
72b0646
Delete FastTokenizer from Cmake
apaniukov Jan 10, 2025
4eb7dd0
use custom icu
mryzhov Jan 13, 2025
9a85097
Merge branch 'master' into icu_build
mryzhov Jan 13, 2025
cafaf03
filter supported targets
mryzhov Jan 13, 2025
deb6873
removed tmp solution
mryzhov Jan 13, 2025
0e13658
brew icu4c
mryzhov Jan 13, 2025
ac21acd
install icu4c
mryzhov Jan 14, 2025
a9c5b38
fixed arch detection
mryzhov Jan 14, 2025
e3eb2fd
fixed win subpath
mryzhov Jan 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 8 additions & 16 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,9 @@ jobs:


openvino_tokenizers_cpack:
name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }})
name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }})
strategy:
matrix:
build_fast_tokenizers: [ON]
build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package
needs: [ openvino_download ]
if: |
Expand Down Expand Up @@ -110,8 +109,7 @@ jobs:
- name: CMake configure - tokenizers
run: |
source ${INSTALL_DIR}/setupvars.sh
cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-S ${{ env.OPENVINO_TOKENIZERS_REPO }} \
-B ${{ env.BUILD_DIR }}

Expand All @@ -138,15 +136,13 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
name: openvino_tokenizers_cpack_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.tar.gz
if-no-files-found: 'error'

openvino_tokenizers_wheel:
name: OpenVINO tokenizers extension (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }})
strategy:
matrix:
build_fast_tokenizers: [ON, OFF]
name: OpenVINO tokenizers extension wheel

needs: [ openvino_download ]
if: |
always() &&
Expand Down Expand Up @@ -188,7 +184,6 @@ jobs:
run: |
python -m pip wheel -v --no-deps --wheel-dir ${BUILD_DIR} \
--config-settings=override=cross.arch="manylinux_2_31_x86_64" \
--config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \
${{ needs.openvino_download.outputs.ov_wheel_source }} \
${OPENVINO_TOKENIZERS_REPO}
env:
Expand All @@ -204,15 +199,12 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }}
name: openvino_tokenizers_wheel
path: ${{ env.BUILD_DIR }}/*.whl
if-no-files-found: 'error'

openvino_tokenizers_tests:
name: OpenVINO tokenizers tests (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }})
strategy:
matrix:
build_fast_tokenizers: [ON, OFF]
name: OpenVINO tokenizers tests
needs: [ openvino_download, openvino_tokenizers_wheel]
if: always() && needs.openvino_tokenizers_wheel.result == 'success'
timeout-minutes: 45
Expand Down Expand Up @@ -242,7 +234,7 @@ jobs:
- name: Download tokenizers package
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }}
name: openvino_tokenizers_wheel
path: ${{ env.INSTALL_DIR }}/ov_tokenizers

- name: Download OpenVINO package
Expand Down
12 changes: 5 additions & 7 deletions .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,9 @@ jobs:
if-no-files-found: 'error'

openvino_tokenizers_cpack:
name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }})
name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }})
strategy:
matrix:
build_fast_tokenizers: [ON]
build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package
needs: [ openvino_download, openvino_build ]
if: |
Expand Down Expand Up @@ -221,13 +220,12 @@ jobs:
# Build
#
- name: Install build dependencies
run: brew install coreutils ninja
run: brew install coreutils ninja icu4c

- name: CMake configure - tokenizers
run: |
source ${INSTALL_DIR}/setupvars.sh
cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-S ${{ env.OPENVINO_TOKENIZERS_REPO }} \
-B ${{ env.BUILD_DIR }}

Expand All @@ -254,7 +252,7 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
name: openvino_tokenizers_cpack_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.tar.gz
if-no-files-found: 'error'

Expand Down Expand Up @@ -314,7 +312,7 @@ jobs:
#

- name: Install build dependencies
run: brew install coreutils ninja
run: brew install coreutils ninja icu4c

#
# Build
Expand Down
8 changes: 3 additions & 5 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@ jobs:
revision: 'latest_available_commit'

openvino_tokenizers_cpack:
name: OpenVINO tokenizers cpack (BUILD_FAST_TOKENIZERS=${{ matrix.build_fast_tokenizers }}, BUILD_TYPE=${{ matrix.build_type }})
name: OpenVINO tokenizers cpack (BUILD_TYPE=${{ matrix.build_type }})
strategy:
matrix:
build_fast_tokenizers: [ON]
build_type: [Release] # TODO: Add Debug build when OV provider is ready or use OV package
needs: [ openvino_download ]
if: |
Expand Down Expand Up @@ -115,8 +114,7 @@ jobs:
shell: pwsh
run: |
${{ env.OV_INSTALL_DIR }}/setupvars.ps1
cmake -DBUILD_FAST_TOKENIZERS="${{ matrix.build_fast_tokenizers }}" `
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} `
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} `
-S ${{ env.OPENVINO_TOKENIZERS_REPO }} `
-B ${{ env.BUILD_DIR }}
env:
Expand Down Expand Up @@ -149,7 +147,7 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
name: openvino_tokenizers_cpack_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.zip
if-no-files-found: 'error'

Expand Down
71 changes: 0 additions & 71 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,77 +150,6 @@ make

After that, you can transfer all binaries from `build/src` to `<openvino_dir>` as described in the C++ installation instruction above.

### Reducing the ICU Data Size

By default, all available ICU locales are supported, which significantly increases the package size. To reduce the size of the ICU libraries included in your final package, follow these steps:

1. **Use the ICU Data Configuration File**:
- This file specifies which features and locales to include in a custom data bundle. You can find more information [here](https://unicode-org.github.io/icu/userguide/icu_data/buildtool.html#icu-data-configuration-file).

2. **Set the ICU Data Filter File as an Environment Variable**:
- **On Unix-like systems (Linux, macOS)**:
Set the `ICU_DATA_FILTER_FILE` environment variable to the path of your configuration file (`filters.json`):

```bash
export ICU_DATA_FILTER_FILE="filters.json"
```

- **On Windows**:
Set the `ICU_DATA_FILTER_FILE` environment variable using the Command Prompt or PowerShell:

**Command Prompt:**
```cmd
set ICU_DATA_FILTER_FILE=filters.json
```

**PowerShell:**
```powershell
$env:ICU_DATA_FILTER_FILE="filters.json"
```

3. **Create a Configuration File**:
- An example configuration file (`filters.json`) might look like this:

```json
{
"localeFilter": {
"filterType": "language",
"includelist": [
"en"
]
}
}
```

4. **Configure OpenVINO Tokenizers**:
- When building OpenVINO tokenizers, set the following CMake option during the project configuration:

```bash
-DBUILD_FAST_TOKENIZERS=ON
```
- Example for a pip installation path:
```bash
ICU_DATA_FILTER_FILE=</path/to/filters.json> pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS=ON
```

By following these instructions, you can effectively reduce the size of the ICU libraries in your final package.

### Build OpenVINO Tokenizers without FastTokenizer Library

If a tokenizer doesn't use `CaseFold`, `UnicodeNormalization` or `Wordpiece` operations, you can drastically reduce package binary size by building OpenVINO Tokenizers without FastTokenizer dependency with this flag:

```bash
-DENABLE_FAST_TOKENIZERS=OFF
```

This option can also help with building for a platform that is not supported by FastTokenizer, for example `Android x86_64`.

Example for a pip installation path:
```bash

pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.ENABLE_FAST_TOKENIZERS=OFF
```

## Usage

:warning: OpenVINO Tokenizers can be inferred on a `CPU` device only.
Expand Down
7 changes: 2 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,14 @@ transformers = [
"transformers[sentencepiece] >= 4.36.0",
"tiktoken"
]
# chatglm2 custom tokenizer file imports torch, have to add torch dependency for tests
torch = [
'torch'
]
dev = [
"ruff",
"bandit",
"pytest",
"pytest_harvest",
"pandas",
"openvino_tokenizers[transformers, torch]"
"jinja2",
"openvino_tokenizers[transformers]"
]
benchmark = [
"pandas",
Expand Down
60 changes: 55 additions & 5 deletions python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,23 @@ class NormalizationStep(BasePipelineStep):
class NormalizeUnicode(NormalizationStep):
normalization_form: str = "NFD"

def __post_init__(self):
if self.normalization_form not in ["NFD", "NFC", "NFKD", "NFKC"]:
raise ValueError(
'[ NormalizeUnicode ] `normalization_form` attribute must be one of ["NFD", "NFC", "NFKD", "NFKC"], '
f"got {self.normalization_form}."
)

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
return (
_get_factory()
.create(
"NormalizeUnicode",
"CharsMapNormalization",
input_nodes,
{"normalization_form": self.normalization_form},
{
"normalization_form": self.normalization_form.lower(),
"remove_extra_whitespaces": False,
},
)
.outputs()
)
Expand All @@ -182,7 +192,22 @@ def __post_init__(self):
)

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
return _get_factory().create("CaseFold", input_nodes, {"encoding": self.encoding}).outputs()
if self.encoding == "":
return _get_factory().create("CaseFold", input_nodes, {"encoding": self.encoding}).outputs()
else:
return (
_get_factory()
.create(
"CharsMapNormalization",
input_nodes,
{
"normalization_form": "identity",
"case_fold": True,
"remove_extra_whitespaces": False,
},
)
.outputs()
)


@dataclass
Expand Down Expand Up @@ -245,15 +270,40 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:

@dataclass
class CharsmapStep(NormalizationStep):
charsmap: bytes
charsmap: Optional[bytes] = None
normalization_form: Optional[str] = None
add_dummy_prefix: bool = False
remove_extra_whitespaces: bool = True
escape_whitespaces: bool = False
case_fold: bool = False
nmt: bool = False

def __post_init__(self):
if self.charsmap is None and self.normalization_form is None:
raise ValueError("[ CharsmapStep ] `charsmap` or `normalization_form` attribute must be set")

@classmethod
def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep":
return cls(charsmap=base64.b64decode(step_json["precompiled_charsmap"]))

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes += make_constant_node(np.frombuffer(self.charsmap, dtype=np.uint8), dtype=Type.u8).outputs()
return _get_factory().create("CharsMapNormalization", input_nodes).outputs()
return (
_get_factory()
.create(
"CharsMapNormalization",
input_nodes,
{
"normalization_form": self.normalization_form or "",
"add_dummy_prefix": self.add_dummy_prefix,
"remove_extra_whitespaces": self.remove_extra_whitespaces,
"escape_whitespaces": self.escape_whitespaces,
"case_fold": self.case_fold,
"nmt": self.nmt,
},
)
.outputs()
)


@dataclass
Expand Down
Loading
Loading