From e83d306ad0456055dc2e6c90c263851b94dd1691 Mon Sep 17 00:00:00 2001 From: dnth Date: Wed, 27 Nov 2024 13:32:52 +0800 Subject: [PATCH] update --- nbs/smolvlm.ipynb | 54 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/nbs/smolvlm.ipynb b/nbs/smolvlm.ipynb index 397ed2f..086b524 100644 --- a/nbs/smolvlm.ipynb +++ b/nbs/smolvlm.ipynb @@ -51,9 +51,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-11-27 13:18:13.627\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m63\u001b[0m - \u001b[1mModel: HuggingFaceTB/SmolVLM-Instruct\u001b[0m\n", - "\u001b[32m2024-11-27 13:18:13.628\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n", - "\u001b[32m2024-11-27 13:18:13.628\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m65\u001b[0m - \u001b[1mDtype: bfloat16\u001b[0m\n" + "\u001b[32m2024-11-27 13:31:49.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m63\u001b[0m - \u001b[1mModel: HuggingFaceTB/SmolVLM-Instruct\u001b[0m\n", + "\u001b[32m2024-11-27 13:31:49.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n", + "\u001b[32m2024-11-27 13:31:49.435\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m65\u001b[0m - \u001b[1mDtype: bfloat16\u001b[0m\n" ] } ], @@ -65,16 +65,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'User:Describe this image. \\nAssistant: The image depicts a street scene with a parade taking place. The street is lined with buildings, some of which are shops, and there are people standing on both sides of the street, watching the parade. The parade consists of a number of vehicles, including a truck, and a float, which is decorated with flags and banners. The people on the street are dressed in casual attire, and some of them are waving small flags. The sky is cloudy, and there is a clock tower visible in the background. The clock tower is part of a building that is located on the side of a hill. The buildings are mostly made of wood and have a rustic appearance. The image captures a moment of community celebration and togetherness.'" + "'The image depicts a street scene with a parade taking place. The street is lined with buildings, some of which are shops, and there are people standing on both sides of the street, watching the parade. The parade consists of a number of vehicles, including a truck, and a float with a large American flag prominently displayed. The truck is carrying a large number of flags, including the American flag, and there are also smaller flags on the float. The people on the street are dressed in a variety of ways, including some wearing winter coats and hats, indicating that the weather is likely cold.\\n\\nIn the background, there is a clock tower on a building, and a mountain can be seen in the distance. The sky is overcast, and the weather appears to be cloudy and possibly rainy. The people on the street are standing in various poses, some with their hands raised in the air, indicating excitement or participation in the parade.\\n\\nThe image captures a moment of community celebration and togetherness, with people from all walks of life coming together to enjoy a parade. The parade is likely an annual event, given the presence of the clock tower and the presence of multiple buildings, suggesting that it takes place in a town or city.\\n\\n### Analysis and Description:\\n1. **Parade Participants**:\\n - The parade includes a truck and a float with an American flag prominently displayed.\\n - The truck is carrying a large number of flags, including the American flag.'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -88,10 +88,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
                          Model Info                          \n",
+       "╭───────────────────────────┬────────────────────────────────╮\n",
+       "│ Attribute                  Value                          │\n",
+       "├───────────────────────────┼────────────────────────────────┤\n",
+       "│ Model ID                   HuggingFaceTB/SmolVLM-Instruct │\n",
+       "│ Device                     cuda                           │\n",
+       "│ Dtype                      torch.bfloat16                 │\n",
+       "│ Number of Inferences       1                              │\n",
+       "│ Total Inference Time (ms)  5042.1868                      │\n",
+       "│ Average Latency (ms)       5042.1868                      │\n",
+       "╰───────────────────────────┴────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Model Info \u001b[0m\n", + "╭───────────────────────────┬────────────────────────────────╮\n", + "│\u001b[1m \u001b[0m\u001b[1mAttribute \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0m│\n", + "├───────────────────────────┼────────────────────────────────┤\n", + "│\u001b[36m \u001b[0m\u001b[36mModel ID \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mHuggingFaceTB/SmolVLM-Instruct\u001b[0m\u001b[35m \u001b[0m│\n", + "│\u001b[36m \u001b[0m\u001b[36mDevice \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mcuda \u001b[0m\u001b[35m \u001b[0m│\n", + "│\u001b[36m \u001b[0m\u001b[36mDtype \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mtorch.bfloat16 \u001b[0m\u001b[35m \u001b[0m│\n", + "│\u001b[36m \u001b[0m\u001b[36mNumber of Inferences \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m│\n", + "│\u001b[36m \u001b[0m\u001b[36mTotal Inference Time (ms)\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m5042.1868 \u001b[0m\u001b[35m \u001b[0m│\n", + "│\u001b[36m \u001b[0m\u001b[36mAverage Latency (ms) \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m5042.1868 \u001b[0m\u001b[35m \u001b[0m│\n", + "╰───────────────────────────┴────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "model.print_stats()" + ] } ], "metadata": {