Commit 9cbf74e
change dockerfile
jalencato committed Feb 1, 2024
1 parent a2d36d5
Showing 3 changed files with 124 additions and 10 deletions.
10 changes: 0 additions & 10 deletions graphstorm-processing/docker/0.2.1/emr-serverless/Dockerfile.cpu
@@ -47,16 +47,6 @@ RUN pip install -r /usr/lib/spark/code/requirements.txt \
 # GSProcessing codebase
 COPY code/ /usr/lib/spark/code/
 
-# Install Hugging Face model cache if it is necessary
-ARG MODEL=""
-ENV TRANSFORMERS_CACHE=/home/hadoop/.cache/huggingface/hub
-RUN if [ $MODEL == "" ]; then \
-    echo "Skip installing model cache"; \
-else \
-    echo "Installing model cache for $MODEL" && \
-    python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')"; \
-fi
-
 FROM runtime AS prod
 RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm_processing-*.whl && \
     rm /usr/lib/spark/code/graphstorm_processing-*.whl && rm -rf /root/.cache
68 changes: 68 additions & 0 deletions graphstorm-processing/docker/0.2.2/emr-serverless/Dockerfile.cpu
@@ -0,0 +1,68 @@
ARG ARCH=x86_64
FROM public.ecr.aws/emr-serverless/spark/emr-6.13.0:20230906-${ARCH} AS base
FROM base AS runtime

USER root
ENV PYTHON_VERSION=3.9.18

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8

# Set up pyenv
ENV PYENV_ROOT="${HOME}/.pyenv"
ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}"
ENV PYSPARK_DRIVER_PYTHON=${PYENV_ROOT}/shims/python
ENV PYSPARK_PYTHON=${PYENV_ROOT}/shims/python

# TODO: These can probably all go to another builder stage?
RUN yum erase -y openssl-devel && \
yum install -y \
bzip2-devel \
gcc \
git \
libffi-devel \
ncurses-devel \
openssl11-devel \
readline-devel \
sqlite-devel \
sudo \
xz-devel && \
rm -rf /var/cache/yum
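# Build the pinned Python version from source with pyenv and make it the global default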
RUN git clone https://github.com/pyenv/pyenv.git ${PYENV_ROOT} && \
pyenv install ${PYTHON_VERSION} && \
pyenv global ${PYTHON_VERSION}

WORKDIR /usr/lib/spark/code/

# Install GSProcessing requirements to pyenv Python
COPY requirements.txt requirements.txt
# Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed:
# https://github.com/moby/buildkit/issues/1512
RUN pip install -r /usr/lib/spark/code/requirements.txt \
&& rm -rf /root/.cache

# GSProcessing codebase
COPY code/ /usr/lib/spark/code/

# Install the Hugging Face model cache if necessary
ARG MODEL=""
ENV TRANSFORMERS_CACHE=/home/hadoop/.cache/huggingface/hub
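# MODEL is empty by default; pass --build-arg MODEL=<model-id> at build time to pre-download that model's tokenizer into the cache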
RUN if [ -z "${MODEL}" ]; then \
        echo "Skipping model cache installation"; \
    else \
        echo "Installing model cache for ${MODEL}" && \
        python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')"; \
    fi

FROM runtime AS prod
RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm_processing-*.whl && \
rm /usr/lib/spark/code/graphstorm_processing-*.whl && rm -rf /root/.cache

FROM runtime AS test
RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/ && rm -rf /root/.cache

USER hadoop:hadoop
WORKDIR /home/hadoop
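
For reference, a minimal sketch of how this image might be built; the tag name and example model ID are assumptions, while ARCH and MODEL are the build args declared above. The build context must contain the code/ directory and requirements.txt that the COPY steps expect.

# Hypothetical build command for the prod stage (tag and model ID are examples only)
docker build \
    --target prod \
    --build-arg ARCH=x86_64 \
    --build-arg MODEL=bert-base-uncased \
    -t graphstorm-processing-emr-serverless:0.2.2 \
    -f graphstorm-processing/docker/0.2.2/emr-serverless/Dockerfile.cpu .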
56 changes: 56 additions & 0 deletions graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu
@@ -0,0 +1,56 @@
# syntax=docker/dockerfile:experimental
FROM 153931337802.dkr.ecr.us-west-2.amazonaws.com/sagemaker-spark-processing:3.4-cpu-py39-v1.0 AS base

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/conda/lib"
ENV PATH=/opt/conda/bin:$PATH

# Install GSProcessing requirements to pipenv Python
RUN pipenv install \
boto3==1.28.38 \
joblib==1.3.1 \
mock==5.1.0 \
pandas==1.3.5 \
pip==23.1.2 \
protobuf==3.20.3 \
psutil==5.9.5 \
pyarrow==13.0.0 \
pyspark==3.4.1 \
scipy==1.11.3 \
setuptools \
spacy==3.6.0 \
wheel \
&& rm -rf /root/.cache
# Do a pipenv sync so our base libs are independent from our editable code, making them cacheable
RUN pipenv sync --system && python3 -m spacy download en_core_web_lg \
&& rm -rf /root/.cache

# Graphloader codebase
COPY code/ /usr/lib/spark/code/
WORKDIR /usr/lib/spark/code/

# Base container assumes this is the workdir
ENV SPARK_HOME=/usr/lib/spark
WORKDIR $SPARK_HOME

# Ensure our python3 installation is the one used
RUN echo 'alias python3=python3.9' >> ~/.bashrc

# Starts framework
ENTRYPOINT ["bash", "/usr/lib/spark/code/docker-entry.sh"]

FROM base AS prod
RUN python3 -m pip install /usr/lib/spark/code/graphstorm_processing-*.whl && \
rm /usr/lib/spark/code/graphstorm_processing-*.whl
CMD ["gs-processing"]

FROM base AS test
RUN python3 -m pip install /usr/lib/spark/code/graphstorm-processing/
CMD ["sh", "-c", "pytest ./code/tests/"]
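
Similarly, a hedged sketch for the SageMaker image; the tags are assumptions, and pulling the base image from the ECR registry above may first require authenticating with aws ecr get-login-password.

# Hypothetical build of the prod stage
docker build --target prod \
    -t graphstorm-processing-sagemaker:0.2.2-cpu \
    -f graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu .

# Hypothetical build of the test stage, then run the pytest suite from its CMD
docker build --target test \
    -t graphstorm-processing-sagemaker:0.2.2-cpu-test \
    -f graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu .
docker run --rm graphstorm-processing-sagemaker:0.2.2-cpu-test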
