Skip to content

Commit

Permalink
Remove OpenNMT and Tensorflow, Update to Python 3.10 (#583)
Browse files Browse the repository at this point in the history
* Removed OpenNMT and Tensorflow Dependencies

- Removed the memory_growth and eager_execution command-line arguments
- Removed the WER score metric

* Update Python to 3.10 and transformers to 4.46
  • Loading branch information
TaperChipmunk32 authored Nov 11, 2024
1 parent 7ef209b commit 539b213
Show file tree
Hide file tree
Showing 30 changed files with 3,617 additions and 6,104 deletions.
6 changes: 3 additions & 3 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
ARG PYTHON_VERSION=3.8
ARG PYTHON_VERSION=3.10
ARG POETRY_VERSION=1.7.1
FROM ubuntu:20.04
FROM ubuntu:22.04
ARG PYTHON_VERSION
ARG POETRY_VERSION
WORKDIR /app
Expand Down Expand Up @@ -44,5 +44,5 @@ ENV SIL_NLP_CACHE_PROJECT_DIR=/root/.cache/silnlp/projects
# Set environment variables
ENV CLEARML_API_HOST="https://api.sil.hosted.allegro.ai"
ENV SIL_NLP_DATA_PATH=/silnlp
ENV EFLOMAL_PATH=/workspaces/silnlp/.venv/lib/python3.8/site-packages/eflomal/bin
# Must track PYTHON_VERSION (3.10): the venv's lib directory is named after the interpreter's minor version.
ENV EFLOMAL_PATH=/workspaces/silnlp/.venv/lib/python3.10/site-packages/eflomal/bin
CMD ["bash"]
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ARG PYTHON_VERSION=3.8
ARG PYTHON_VERSION=3.10
ARG POETRY_VERSION=1.7.1

FROM python:$PYTHON_VERSION-slim as builder
FROM python:$PYTHON_VERSION-slim AS builder
ARG POETRY_VERSION

ENV POETRY_HOME=/opt/poetry
Expand All @@ -22,9 +22,9 @@ RUN poetry export -E eflomal --without-hashes -f requirements.txt > requirements
COPY . /src
RUN poetry build

FROM ubuntu:20.04
FROM ubuntu:22.04

ARG PYTHON_VERSION=3.8
ARG PYTHON_VERSION=3.10

ENV PIP_DISABLE_PIP_VERSION_CHECK=on
ENV TZ=America/New_York
Expand Down
6,642 changes: 3,545 additions & 3,097 deletions poetry.lock

Large diffs are not rendered by default.

21 changes: 9 additions & 12 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.black]
line-length = 120
target-version = ['py38']
target-version = ['py310']
include = '\.pyi?$'
exclude = '''
/(
Expand Down Expand Up @@ -48,7 +48,7 @@ silnlp-alignment-aggregate-results = "silnlp.alignment.aggregate_results:main"
clowder = "clowder.clowder:main"

[tool.poetry.dependencies]
python = ">=3.8.1,<3.9"
python = ">=3.10,<3.11"
pandas = "^1.0.4"
sentencepiece = "^0.1.97"
nltk = "^3.5"
Expand All @@ -57,13 +57,11 @@ seaborn = "0.11.2"
morfessor = "^2.0.6"
Morfessor-FlatCat = "^1.0.8"
psutil = "^5.7.3"
scikit-learn = "1.1.2"
numpy = "^1.23.1"
OpenNMT-tf = "^2.31.0"
scikit-learn = "^1.1.2"
numpy = "^1.26.0"
lit-nlp = "0.4.1"
tensorflow = "2.7.3"
google-cloud-translate = "^3.0.2"
scipy = "1.8"
scipy = "^1.11.2"
clearml = ">=1.4.1"
XlsxWriter = "^3.2.0"
python-Levenshtein = "^0.20.9"
Expand All @@ -72,18 +70,17 @@ tqdm = "^4.62.2"
s3path = "0.3.4"
sacrebleu = "^2.3.1"
ctranslate2 = "^3.5.1"
tensorflow-addons = "0.17.1"
libclang = "14.0.6"
sil-machine = {extras = ["thot"], version = "^1.1.0"}
sil-machine = {extras = ["thot"], version = "^1.3.0"}
datasets = "^2.7.1"
torch = {version = "2.1.2", source = "torch"}
torch = {version = "^2.4", source = "torch"}
sacremoses = "^0.0.53"
evaluate = "^0.3.0"
python-docx = "^0.8.11"
iso639-lang = "^2.1.0"
eflomal = { version = "^2.0.0", optional = true }
accelerate = "^0.23.0"
transformers = "^4.36.2"
accelerate = "^0.26.0"
transformers = "^4.46"
optimum = "^1.16.0"
google = "^3.0.0"
google-api-python-client = "^2.101.0"
Expand Down
1 change: 1 addition & 0 deletions scripts/bible_alignment/scratchpad_align_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import os

LOGGER = logging.getLogger("silnlp")

Expand Down
2 changes: 1 addition & 1 deletion scripts/clear_clearml_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
task_name="clear_cache",
)
task.set_base_docker(
docker_image="nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04",
docker_image="nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu22.04",
docker_arguments="-v /home/clearml/.clearml/hf-cache:/root/.cache/huggingface",
docker_setup_bash_script=[
"apt install -y python3-venv",
Expand Down
3 changes: 0 additions & 3 deletions scripts/scratchpad_s3.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from silnlp.common.tf_utils import enable_memory_growth
from silnlp.nmt.experiment import SILExperiment
from silnlp.nmt.translate import TranslationTask

enable_memory_growth()

exp = SILExperiment(
name="BT-Swahili/en-swh-1",
make_stats=True, # limited by stats_max_size to process only Bibles
Expand Down
20 changes: 0 additions & 20 deletions silnlp/common/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,12 @@
from typing import List, Optional

import psutil
from opennmt.utils.wer import sentence_wer

from ..common.corpus import write_corpus

METEOR_FULLY_SUPPORTED_LANGS = {"en", "cz", "de", "es", "fr", "ar"}


def compute_wer_score(hyps: List[str], refs: List[List[str]]) -> float:
    """Return the average sentence-level WER of ``hyps``, scaled by 100.

    Each hypothesis is paired with the entry at the same position in the first
    reference set (``refs[0]``); both strings are lower-cased before being
    passed to ``sentence_wer``. The per-sentence scores are summed and divided
    by ``len(hyps)``.

    Returns:
        ``100.0`` when ``hyps`` is empty; otherwise the averaged score times
        100. If scoring raises ``UnicodeDecodeError`` or ``ZeroDivisionError``
        a message is printed and ``-100`` is returned (the ``-1`` sentinel goes
        through the same final scaling), so callers should treat any negative
        value as "score unavailable".
    """
    if not hyps:
        return 100.0

    try:
        total = sum((sentence_wer(ref.lower(), hyp.lower()) for hyp, ref in zip(hyps, refs[0])), 0.0)
        result = total / len(hyps)
    except UnicodeDecodeError:
        print("Unable to compute WER score")
        result = -1
    except ZeroDivisionError:
        # len(hyps) is non-zero here, so this can only originate inside sentence_wer.
        print("Cannot divide by zero. Check for empty lines.")
        result = -1

    return result * 100


def compute_meteor_score(lang: str, hyps: List[str], refs: List[List[str]]) -> Optional[float]:
if lang.lower() not in METEOR_FULLY_SUPPORTED_LANGS:
return None
Expand Down
7 changes: 4 additions & 3 deletions silnlp/common/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
Normalization tooling for cleaning up whitespace and punctuation in extract sentences
See normalize_extracts.py for context
"""
import logging
import regex

import logging
from dataclasses import dataclass
from enum import Enum, IntEnum
from typing import Dict, List, Optional, Set, Tuple

import regex


class PunctuationCategory(Enum):
LEFT_CLINGING = "LEFT_CLINGING"
Expand Down Expand Up @@ -46,7 +47,7 @@ def shift_slice(slice: StringSlice, offset: int, new_outer: str) -> StringSlice:
)


def find_slices(reg: regex.Regex, text: str) -> List[StringSlice]:
def find_slices(reg: regex.Pattern, text: str) -> List[StringSlice]:
return [
StringSlice(start_index=match.span()[0], end_index=match.span()[1], slice=match.group(), outer=text)
for match in regex.finditer(reg, text)
Expand Down
32 changes: 0 additions & 32 deletions silnlp/common/tf_utils.py

This file was deleted.

7 changes: 5 additions & 2 deletions silnlp/common/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from machine.corpora import (
FileParatextProjectSettingsParser,
FileParatextProjectTextUpdater,
UpdateUsfmBehavior,
UpdateUsfmParserHandler,
UsfmFileText,
UsfmStylesheet,
Expand Down Expand Up @@ -207,7 +208,9 @@ def translate_usfm(
dest_project_path = get_project_dir(trg_format_project)
dest_updater = FileParatextProjectTextUpdater(dest_project_path)
usfm_out = dest_updater.update_usfm(
src_file_text.id, rows, strip_all_text=use_src_project, prefer_existing_text=False
src_file_text.id,
rows,
UpdateUsfmBehavior.STRIP_EXISTING if use_src_project else UpdateUsfmBehavior.PREFER_NEW,
)

if usfm_out is None:
Expand All @@ -216,7 +219,7 @@ def translate_usfm(
else:
with open(src_file_path, encoding="utf-8-sig") as f:
usfm = f.read()
handler = UpdateUsfmParserHandler(rows, vrefs[0].book, strip_all_text=True)
handler = UpdateUsfmParserHandler(rows, vrefs[0].book, UpdateUsfmBehavior.STRIP_EXISTING)
parse_usfm(usfm, handler)
usfm_out = handler.get_usfm()

Expand Down
6 changes: 0 additions & 6 deletions silnlp/nmt/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +0,0 @@
# Importing this package configures TensorFlow's logger so that its messages are not logged twice.
# Background on the double-logging problem:
# https://stackoverflow.com/questions/33662648/tensorflow-causes-logging-messages-to-double
import tensorflow as tf

# Logger object returned by TensorFlow's own accessor.
logger = tf.get_logger()
# Stop records from also being handed to ancestor loggers' handlers.
logger.propagate = False
4 changes: 0 additions & 4 deletions silnlp/nmt/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from ..common.utils import get_mt_exp_dir
from .config import Config
from .hugging_face_config import HuggingFaceConfig
from .open_nmt_config import OpenNMTConfig, is_open_nmt_model


def load_config(exp_name: str) -> Config:
Expand All @@ -20,7 +19,4 @@ def load_config(exp_name: str) -> Config:

def create_config(exp_dir: Path, config: dict) -> Config:
    """Instantiate the configuration class that matches the experiment's model.

    ``OpenNMTConfig`` is used when ``config`` has no ``"model"`` value (missing
    or ``None``) or when ``is_open_nmt_model`` accepts the model name; every
    other model name yields a ``HuggingFaceConfig``. The selected class is
    constructed with ``exp_dir`` and ``config``.
    """
    model_name: Optional[str] = config.get("model")
    uses_open_nmt = model_name is None or is_open_nmt_model(model_name)
    config_cls = OpenNMTConfig if uses_open_nmt else HuggingFaceConfig
    return config_cls(exp_dir, config)
31 changes: 17 additions & 14 deletions silnlp/nmt/diff_predictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from tqdm import tqdm

from ..common.corpus import load_corpus
from ..common.metrics import compute_wer_score
from ..common.utils import get_git_revision_hash
from .config import get_mt_exp_dir
from .sp_utils import decode_sp, decode_sp_lines
Expand Down Expand Up @@ -47,12 +46,11 @@
CHRF3_SCORE = "chrF3"
CHRF3P_SCORE = "chrF3+"
CHRF3PP_SCORE = "chrF3++"
WER_SCORE = "WER"
TER_SCORE = "TER"
DICT_SRC = "Source"
DICT_TRG = "Target"

_SUPPORTED_SCORERS = {BLEU_SCORE, SPBLEU_SCORE, CHRF3_SCORE, CHRF3P_SCORE, CHRF3PP_SCORE, WER_SCORE, TER_SCORE}
_SUPPORTED_SCORERS = {BLEU_SCORE, SPBLEU_SCORE, CHRF3_SCORE, CHRF3P_SCORE, CHRF3PP_SCORE, TER_SCORE}


def sentence_bleu(
Expand Down Expand Up @@ -165,7 +163,7 @@ def add_stats(df: pd.DataFrame, sheet):
sheet.write_string("A4", "STD")
column_list = ["B", "C", "D", "E", "F"]
column_idx = 0
for column_name in [BLEU_SCORE, SPBLEU_SCORE, CHRF3_SCORE, CHRF3P_SCORE, CHRF3PP_SCORE, WER_SCORE, TER_SCORE]:
for column_name in [BLEU_SCORE, SPBLEU_SCORE, CHRF3_SCORE, CHRF3P_SCORE, CHRF3PP_SCORE, TER_SCORE]:
if column_name in df:
column_id = column_list[column_idx]
sheet.write_string(f"{column_id}1", f"{column_name}")
Expand Down Expand Up @@ -441,22 +439,29 @@ def add_scores(df: pd.DataFrame, scorers: List[str], preserve_case: bool, tokeni
elif scorer == CHRF3P_SCORE.lower():
for index, row in tqdm(df.iterrows(), desc="Calculating chrF3+ scores ..."):
chrf3p_score = sacrebleu.corpus_chrf(
[row[PREDICTION]], [[row[TRG_SENTENCE]]], char_order=6, beta=3, word_order=1, remove_whitespace=True, eps_smoothing=True
[row[PREDICTION]],
[[row[TRG_SENTENCE]]],
char_order=6,
beta=3,
word_order=1,
remove_whitespace=True,
eps_smoothing=True,
)
scores.append(chrf3p_score.score)
df[CHRF3P_SCORE] = scores
elif scorer == CHRF3PP_SCORE.lower():
for index, row in tqdm(df.iterrows(), desc="Calculating chrF3++ scores ..."):
chrf3pp_score = sacrebleu.corpus_chrf(
[row[PREDICTION]], [[row[TRG_SENTENCE]]], char_order=6, beta=3, word_order=2, remove_whitespace=True, eps_smoothing=True
[row[PREDICTION]],
[[row[TRG_SENTENCE]]],
char_order=6,
beta=3,
word_order=2,
remove_whitespace=True,
eps_smoothing=True,
)
scores.append(chrf3pp_score.score)
df[CHRF3PP_SCORE] = scores
elif scorer == WER_SCORE.lower():
for index, row in tqdm(df.iterrows(), desc="Calculating WER scores ..."):
wer_score = compute_wer_score([row[PREDICTION]], [[row[TRG_SENTENCE]]])
scores.append(wer_score if wer_score >= 0 else 0)
df[WER_SCORE] = scores
elif scorer == TER_SCORE.lower():
for index, row in tqdm(df.iterrows(), desc="Calculating TER scores ..."):
ter_score = sacrebleu.corpus_ter([row[PREDICTION]], [[row[TRG_SENTENCE]]])
Expand Down Expand Up @@ -526,9 +531,7 @@ def main() -> None:
exp1_step = (
0
if exp1_type == "SMT"
else get_last_checkpoint(str(exp1_dir))
if args.last
else get_best_checkpoint(str(exp1_dir))
else get_last_checkpoint(str(exp1_dir)) if args.last else get_best_checkpoint(str(exp1_dir))
)
output_path = os.path.join(exp1_dir, f"diff_predictions.{exp1_step}.xlsx")

Expand Down
5 changes: 0 additions & 5 deletions silnlp/nmt/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import yaml

from ..common.environment import SIL_NLP_ENV
from ..common.tf_utils import enable_memory_growth
from ..common.utils import get_git_revision_hash, show_attrs
from .clearml_connection import SILClearML
from .config import Config, get_mt_exp_dir
Expand Down Expand Up @@ -141,7 +140,6 @@ def main() -> None:
"--force-align", default=False, action="store_true", help="Force recalculation of all alignment scores"
)
parser.add_argument("--disable-mixed-precision", default=False, action="store_true", help="Disable mixed precision")
parser.add_argument("--memory-growth", default=False, action="store_true", help="Enable memory growth")
parser.add_argument("--num-devices", type=int, default=1, help="Number of devices to train on")
parser.add_argument(
"--clearml-queue",
Expand Down Expand Up @@ -190,9 +188,6 @@ def main() -> None:
show_attrs(cli_args=args)
exit()

if args.memory_growth:
enable_memory_growth()

if not (args.preprocess or args.train or args.test):
args.preprocess = True
args.train = True
Expand Down
Loading

0 comments on commit 539b213

Please sign in to comment.