deepspeed mii fastgen example #2779

Merged: 21 commits, Dec 14, 2023
Changes from 9 commits
87 changes: 87 additions & 0 deletions examples/large_models/deepspeed_mii/LLM/DeepSpeed_mii_handler.py
@@ -0,0 +1,87 @@
import logging
import os
from abc import ABC

import mii

from ts.context import Context
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
logger.info("DeepSpeed MII version %s", mii.__version__)


class DeepSpeedMIIHandler(BaseHandler, ABC):
    """
    DeepSpeed-MII handler class for text generation with large language models.
    """

    def __init__(self):
        self.device = int(os.getenv("LOCAL_RANK", 0))
        self.initialized = False

    def initialize(self, ctx: Context):
        """In this initialize function, the DeepSpeed-MII pipeline for the
        configured model is loaded and initialized.
        Args:
            ctx (Context): Object containing information pertaining to
            the model artifacts and model configuration parameters.
        """
        model_dir = ctx.system_properties.get("model_dir")
        model_name = ctx.model_yaml_config["handler"]["model_name"]
        model_path = ctx.model_yaml_config["handler"]["model_path"]
        self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])

        model_config = {
            "tensor_parallel": int(ctx.model_yaml_config["handler"]["tensor_parallel"]),
            "max_length": int(ctx.model_yaml_config["handler"]["max_length"]),
        }
        self.pipe = mii.pipeline(
            model_name_or_path=model_path,
            model_config=model_config,
        )
        logger.info("Model %s loaded successfully", model_name)
        self.initialized = True

    def preprocess(self, requests):
        """Basic text preprocessing of the user's prompt.
        Args:
            requests (list): A list of requests, each carrying the prompt text
            in its "data" or "body" field.
        Returns:
            list: A list of prompt strings.
        """
        inputs = []
        for _, data in enumerate(requests):
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode("utf-8")
            logger.info("Received text: '%s'", input_text)
            inputs.append(input_text)
        return inputs

    def inference(self, inputs):
        """Generates text for the received prompts.
        Args:
            inputs (list): List of prompts from the preprocess function.
        Returns:
            list: A list of generated texts, one per input prompt.
        """
        inferences = self.pipe(
            inputs, max_new_tokens=self.max_new_tokens
        ).generated_texts

        logger.info("Generated text: %s", inferences)
        return inferences

    def postprocess(self, inference_output):
        """Post-process function that returns the generated text in a
        TorchServe-readable format.
        Args:
            inference_output (list): The generated texts for the input prompts.
        Returns:
            list: A list of the generated texts.
        """

        return inference_output
5 changes: 5 additions & 0 deletions examples/large_models/deepspeed_mii/LLM/Readme.md
@@ -0,0 +1,5 @@
# Running an LLM using Microsoft DeepSpeed-MII in TorchServe

This example demonstrates serving a Hugging Face LLM with Microsoft DeepSpeed-MII in TorchServe. DeepSpeed-MII brings significant system optimizations for deep learning model inference, drastically reducing both latency and cost.

The notebook example can be found in `mii-deepspeed-fastgen.ipynb`.
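For orientation, the handler in this example wraps a DeepSpeed-MII FastGen pipeline. Below is a minimal standalone sketch of the same call pattern; the model name is a placeholder, and the `.generated_texts` attribute mirrors the handler code in this example (the exact response type may differ across `deepspeed-mii` releases):

```python
# Minimal sketch of the DeepSpeed-MII FastGen pipeline that the handler wraps.
# Assumes `pip install deepspeed-mii`, a CUDA GPU, and access to the model
# weights; the model name below is a placeholder.
import mii

pipe = mii.pipeline("meta-llama/Llama-2-13b-hf")
result = pipe(
    ["The museum format went through significant transformations"],
    max_new_tokens=128,
)
# The handler in this example reads `result.generated_texts`; adjust for the
# response type of your installed deepspeed-mii version.
print(result)
```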
162 changes: 162 additions & 0 deletions examples/large_models/deepspeed_mii/LLM/mii-deepspeed-fastgen.ipynb
@@ -0,0 +1,162 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## Running LLM model using Microsoft DeepSpeed-MII in Torchserve.\n",
"This notebook briefs on serving HF LLM model with Microsoft DeepSpeed-MII in Torchserve. With DeepSpeed-MII there has been significant progress in system optimizations for DL model inference, drastically reducing both latency and cost."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Step 1: Download model\n",
"Login into huggingface hub with token by running the below command"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"huggingface-cli login"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!python ../../utils/Download_model.py --model_name meta-llama/Llama-2-13b-hf"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Step 2: Generate model artifacts"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2045.86s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n"
]
}
],
"source": [
"!torch-model-archiver --model-name mii-llama--Llama-2-13b-hf --version 1.0 --handler DeepSpeed_mii_handler.py --config-file model-config.yaml -r requirements.txt --archive-format no-archive"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!mv model mii-llama--Llama-2-13b-hf\n",
"!cd ../../../../ && mkdir model_store && mv mii-llama--Llama-2-13b-hf model_store"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Step 3: Start torchserve"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!torchserve --ncs --start --model-store model_store --models mii-llama--Llama-2-13b-hf --ts-config benchmarks/config.properties"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Step 4: Run inference\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!curl \"http://localhost:8080/predictions/mii-Llama-2-13b-hf\" -T examples/large_models/deepspeed_mii/LLM/sample.txt"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
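As an alternative to the curl call in Step 4 of the notebook, the same inference request can be sent from Python. A minimal sketch, assuming the `requests` package is installed, TorchServe is running on the default inference port 8080, and the model was registered under the name used above:

```python
# Minimal sketch: send the same inference request as the curl call in Step 4
# from Python. Assumes the `requests` package, TorchServe on the default
# inference port 8080, and the model name registered above.
import requests

prompt = "The museum format went through significant transformations in the 20th century."

resp = requests.post(
    "http://localhost:8080/predictions/mii-llama--Llama-2-13b-hf",
    data=prompt.encode("utf-8"),
)
resp.raise_for_status()
print(resp.text)  # generated continuation of the prompt
```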
20 changes: 20 additions & 0 deletions examples/large_models/deepspeed_mii/LLM/model-config.yaml
@@ -0,0 +1,20 @@
# TorchServe frontend parameters
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 1200
parallelType: "tp"
deviceType: "gpu"
# example of user specified GPU deviceIds
deviceIds: [0,1,2,3] # setting CUDA_VISIBLE_DEVICES

torchrun:
nproc-per-node: 4

# TorchServe Backend parameters
handler:
model_name: "meta-llama/Llama-2-13b-hf"
model_path: "model/models--meta-llama--Llama-2-13b-hf/snapshots/99afe33d7eaa87c7fc6ea2594a0e4e7e588ee0a4"
tensor_parallel: 4
max_length: 4096
max_new_tokens: 256
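For reference, the `handler:` block above is what `DeepSpeed_mii_handler.py` reads through `ctx.model_yaml_config["handler"]`. A minimal sketch that simulates that lookup by loading the YAML directly (assumes PyYAML is installed and the file is in the current directory):

```python
# Minimal sketch: simulate how the handler consumes the `handler:` block of
# model-config.yaml (the handler itself receives it via ctx.model_yaml_config).
# Assumes PyYAML is installed and model-config.yaml is in the current directory.
import yaml

with open("model-config.yaml") as f:
    cfg = yaml.safe_load(f)

handler_cfg = cfg["handler"]
model_config = {
    "tensor_parallel": int(handler_cfg["tensor_parallel"]),  # matches torchrun nproc-per-node
    "max_length": int(handler_cfg["max_length"]),
}
print(handler_cfg["model_name"], handler_cfg["model_path"], model_config)
```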
1 change: 1 addition & 0 deletions examples/large_models/deepspeed_mii/LLM/requirements.txt
@@ -0,0 +1 @@
deepspeed-mii
1 change: 1 addition & 0 deletions examples/large_models/deepspeed_mii/LLM/sample.txt
@@ -0,0 +1 @@
The museum format went through significant transformations in the 20th century. For a long time, museums collected the art of previous generations. The demonstration of contemporary art required new approaches and fresh ideas. Modernization attempts appeared most often in the design of the outer parts of buildings; museums received attractive exterior decoration, such as the glass pyramids of the Louvre. The museum was supposed to evoke a respectful attitude towards what was stored within its walls. That is why museums were arranged in palaces or in specially built buildings, the appearance of which was supposed to inspire respect. However, it gradually became clear that this approach did not attract modern visitors. It became apparent that contemporary art needed a contemporary place of expression.
@@ -227,6 +227,9 @@ public void run() {
long begin = System.currentTimeMillis();
for (int i = 0; i < repeats; i++) {
reply = replies.poll(responseTimeout, TimeUnit.SECONDS);
if (req.getCommand() != WorkerCommands.LOAD) {
break;
}
}

long duration = System.currentTimeMillis() - begin;
5 changes: 4 additions & 1 deletion ts/model_service_worker.py
@@ -180,7 +180,10 @@ def handle_connection(self, cl_socket):
if cmd == b"I":
if service is not None:
resp = service.predict(msg)
cl_socket.sendall(resp)
if LOCAL_RANK == 0:
cl_socket.sendall(resp)
else:
logging.info("skip sending response at rank %d", LOCAL_RANK)
else:
raise RuntimeError(
"Received command: {}, but service is not loaded".format(cmd)
3 changes: 3 additions & 0 deletions ts/protocol/otf_message_handler.py
@@ -64,6 +64,9 @@ def create_predict_response(
:param code:
:return:
"""
if str(os.getenv("LOCAL_RANK", 0)) != "0":
Collaborator:

This low-level method should not be concerned with checking the LOCAL_RANK environment variable. We should just not call it when it's not "0".

Collaborator Author:

The check is a central control that avoids repeating it in every handler implementation.

Collaborator:

I see the intention here. I still think this is not the preferred place to put this check. A function that creates a response from its arguments should not need to know about the concept of ranks. In fact, we already have a central point in this PR where the rank is checked. The only point where that check is missing is here. Looking at this, I think we should move send_intermediate_predict_response out of ts.protocol.otf_message_handler (users should not need to deal with the modules containing our comms between frontend and backend — what if we drop the OTF protocol?). My first instinct is to move it under ts.handler_utils. What do you think?

Collaborator Author:

I'm fine with moving it to handler_utils. The only concern is backward compatibility; it will break existing cx.

Collaborator:

AFAIK we do not guarantee backward compatibility as of now. It's probably time to plan out our BC strategy for future releases, as we're running out of 0.X version numbers. Anyway, if your concern is very high, you can leave a wrapper in otf_message_handler that fires a deprecation warning and then calls the one in handler_utils until the next release. cx will only get worse if we let the code rot.

return None

msg = bytearray()
msg += struct.pack("!i", code)

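For illustration, the backward-compatible option raised in the thread above — keeping the old entry point in `otf_message_handler` while the implementation moves — could be sketched roughly as below. The target module path follows the reviewer's `ts.handler_utils` suggestion and is hypothetical, not something finalized in this PR:

```python
# Sketch of the deprecation-wrapper idea from the review thread: keep the old
# name importable from ts.protocol.otf_message_handler, warn callers, and
# delegate to a relocated implementation. The target module path below is
# hypothetical, following the reviewer's ts.handler_utils suggestion.
import warnings


def send_intermediate_predict_response(*args, **kwargs):
    warnings.warn(
        "send_intermediate_predict_response now lives in ts.handler_utils; "
        "importing it from ts.protocol.otf_message_handler is deprecated "
        "and will be removed in a future release.",
        DeprecationWarning,
        stacklevel=2,
    )
    from ts.handler_utils.utils import send_intermediate_predict_response as _impl

    return _impl(*args, **kwargs)
```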