Commit 9cbf74e
change dockerfile
jalencato committed Feb 1, 2024
1 parent a2d36d5
Showing 3 changed files with 124 additions and 10 deletions.
10 changes: 0 additions & 10 deletions graphstorm-processing/docker/0.2.1/emr-serverless/Dockerfile.cpu
@@ -47,16 +47,6 @@ RUN pip install -r /usr/lib/spark/code/requirements.txt \
 # GSProcessing codebase
 COPY code/ /usr/lib/spark/code/
 
-# Install Hugging Face model cache if it is necessary
-ARG MODEL=""
-ENV TRANSFORMERS_CACHE=/home/hadoop/.cache/huggingface/hub
-RUN if [ $MODEL == "" ]; then \
-    echo "Skip installing model cache"; \
-else \
-    echo "Installing model cache for $MODEL" && \
-    python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')"; \
-fi
-
 FROM runtime AS prod
 RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm_processing-*.whl && \
     rm /usr/lib/spark/code/graphstorm_processing-*.whl && rm -rf /root/.cache
68 changes: 68 additions & 0 deletions graphstorm-processing/docker/0.2.2/emr-serverless/Dockerfile.cpu
@@ -0,0 +1,68 @@
ARG ARCH=x86_64
FROM public.ecr.aws/emr-serverless/spark/emr-6.13.0:20230906-${ARCH} AS base
FROM base AS runtime

USER root
ENV PYTHON_VERSION=3.9.18

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8

# Set up pyenv
ENV PYENV_ROOT="${HOME}/.pyenv"
ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}"
ENV PYSPARK_DRIVER_PYTHON=${PYENV_ROOT}/shims/python
ENV PYSPARK_PYTHON=${PYENV_ROOT}/shims/python

# TODO: These can probably all go to another builder stage?
RUN yum erase -y openssl-devel && \
yum install -y \
bzip2-devel \
gcc \
git \
libffi-devel \
ncurses-devel \
openssl11-devel \
readline-devel \
sqlite-devel \
sudo \
xz-devel && \
rm -rf /var/cache/yum
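# Build the pinned Python version from source with pyenv and make it the global default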
RUN git clone https://github.com/pyenv/pyenv.git ${PYENV_ROOT} && \
pyenv install ${PYTHON_VERSION} && \
pyenv global ${PYTHON_VERSION}

WORKDIR /usr/lib/spark/code/

# Install GSProcessing requirements to pyenv Python
COPY requirements.txt requirements.txt
# Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed:
# https://github.com/moby/buildkit/issues/1512
RUN pip install -r /usr/lib/spark/code/requirements.txt \
&& rm -rf /root/.cache

# GSProcessing codebase
COPY code/ /usr/lib/spark/code/

# Install the Hugging Face model cache if necessary
ARG MODEL=""
ENV TRANSFORMERS_CACHE=/home/hadoop/.cache/huggingface/hub
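# MODEL is empty by default; pass --build-arg MODEL=<model-id> at build time to pre-download that model's tokenizer into the cache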
RUN if [ -z "${MODEL}" ]; then \
        echo "Skipping model cache installation"; \
    else \
        echo "Installing model cache for ${MODEL}" && \
        python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')"; \
    fi

FROM runtime AS prod
RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm_processing-*.whl && \
rm /usr/lib/spark/code/graphstorm_processing-*.whl && rm -rf /root/.cache

FROM runtime AS test
RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/ && rm -rf /root/.cache

USER hadoop:hadoop
WORKDIR /home/hadoop
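
For reference, a minimal sketch of how this image might be built; the tag name and example model ID are assumptions, while ARCH and MODEL are the build args declared above. The build context must contain the code/ directory and requirements.txt that the COPY steps expect.

# Hypothetical build command for the prod stage (tag and model ID are examples only)
docker build \
    --target prod \
    --build-arg ARCH=x86_64 \
    --build-arg MODEL=bert-base-uncased \
    -t graphstorm-processing-emr-serverless:0.2.2 \
    -f graphstorm-processing/docker/0.2.2/emr-serverless/Dockerfile.cpu .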
56 changes: 56 additions & 0 deletions graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu
@@ -0,0 +1,56 @@
# syntax=docker/dockerfile:experimental
FROM 153931337802.dkr.ecr.us-west-2.amazonaws.com/sagemaker-spark-processing:3.4-cpu-py39-v1.0 AS base

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/conda/lib"
ENV PATH=/opt/conda/bin:$PATH

# Install GSProcessing requirements to pipenv Python
RUN pipenv install \
boto3==1.28.38 \
joblib==1.3.1 \
mock==5.1.0 \
pandas==1.3.5 \
pip==23.1.2 \
protobuf==3.20.3 \
psutil==5.9.5 \
pyarrow==13.0.0 \
pyspark==3.4.1 \
scipy==1.11.3 \
setuptools \
spacy==3.6.0 \
wheel \
&& rm -rf /root/.cache
# Do a pipenv sync so our base libs are independent from our editable code, making them cacheable
RUN pipenv sync --system && python3 -m spacy download en_core_web_lg \
&& rm -rf /root/.cache

# Graphloader codebase
COPY code/ /usr/lib/spark/code/
WORKDIR /usr/lib/spark/code/

# Base container assumes this is the workdir
ENV SPARK_HOME=/usr/lib/spark
WORKDIR $SPARK_HOME

# Ensure our python3 installation is the one used
RUN echo 'alias python3=python3.9' >> ~/.bashrc

# Starts framework
ENTRYPOINT ["bash", "/usr/lib/spark/code/docker-entry.sh"]

FROM base AS prod
RUN python3 -m pip install /usr/lib/spark/code/graphstorm_processing-*.whl && \
rm /usr/lib/spark/code/graphstorm_processing-*.whl
CMD ["gs-processing"]

FROM base AS test
RUN python3 -m pip install /usr/lib/spark/code/graphstorm-processing/
CMD ["sh", "-c", "pytest ./code/tests/"]
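
Similarly, a hedged sketch for the SageMaker image; the tags are assumptions, and pulling the base image from the ECR registry above may first require authenticating with aws ecr get-login-password.

# Hypothetical build of the prod stage
docker build --target prod \
    -t graphstorm-processing-sagemaker:0.2.2-cpu \
    -f graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu .

# Hypothetical build of the test stage, then run the pytest suite from its CMD
docker build --target test \
    -t graphstorm-processing-sagemaker:0.2.2-cpu-test \
    -f graphstorm-processing/docker/0.2.2/sagemaker/Dockerfile.cpu .
docker run --rm graphstorm-processing-sagemaker:0.2.2-cpu-test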
