Skip to content

Commit

Permalink
Add Apple system metrics support (pytorch#3377)
Browse files Browse the repository at this point in the history
* Add Apple system metrics support

Co-authored-by: Bipradip Chowdhury <[email protected]>
Co-authored-by: Rony Leppänen <[email protected]>
Co-authored-by: Anders Smedegaard Pedersen <[email protected]>

* Fix ModelServerTest.testMetricManager for other HW vendors

* Add GPUUtilization as expected metric

---------

Co-authored-by: Bipradip Chowdhury <[email protected]>
Co-authored-by: Rony Leppänen <[email protected]>
Co-authored-by: Anders Smedegaard Pedersen <[email protected]>
  • Loading branch information
4 people authored Dec 20, 2024
1 parent 9bcbd22 commit 1a3b18b
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.pytorch.serve.device.Accelerator;
import org.pytorch.serve.device.AcceleratorVendor;
import org.pytorch.serve.device.interfaces.IAcceleratorUtility;
Expand Down Expand Up @@ -75,15 +74,12 @@ public List<JsonObject> extractAccelerators(JsonElement rootObject) {
.getAsJsonObject() // Gets the outer object
.get("SPDisplaysDataType") // Gets the "SPDisplaysDataType" element
.getAsJsonArray();

JsonObject gpuObject = displaysArray.get(0).getAsJsonObject();
int number_of_cores = Integer.parseInt(gpuObject.get("sppci_cores").getAsString());

// Add the object `number_of_cores` times to maintain the existing
// functionality
accelerators =
IntStream.range(0, number_of_cores)
.mapToObj(i -> gpuObject)
.collect(Collectors.toList());

// Create a list with only a single accelerator object, as
// M1, M2, and M3 Macs have only a single integrated GPU
accelerators = Collections.singletonList(gpuObject);

return accelerators;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1372,8 +1372,17 @@ public void testMetricManager() throws JsonParseException, InterruptedException
Assert.assertTrue(++count < 5);
}

// 7 system-level metrics + 3 gpu-specific metrics
Assert.assertEquals(metrics.size(), 7 + 3 * configManager.getNumberOfGpu());
// Determine if the device is Apple or not
String vendor = System.getProperty("os.name");
if (vendor != null) {
if (vendor.startsWith("Mac")) {
// 7 system-level metrics + 2 gpu-specific metrics (per GPU) for Apple devices
Assert.assertEquals(metrics.size(), 7 + 2 * configManager.getNumberOfGpu());
} else {
// 7 system-level metrics + 3 gpu-specific metrics (per GPU) for non-Apple devices
Assert.assertEquals(metrics.size(), 7 + 3 * configManager.getNumberOfGpu());
}
}

for (Metric metric : metrics) {
String metricName = metric.getMetricName();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ public void testExtractAcceleratorId() {
public void testExtractAccelerators() {
List<JsonObject> accelerators = appleUtil.extractAccelerators(sampleOutputJson);

assertEquals(accelerators.size(), 7);
assertEquals(accelerators.size(), 1);
assertEquals(accelerators.get(0).get("sppci_model").getAsString(), "Apple M1");
}

Expand All @@ -88,7 +88,7 @@ public void testSmiOutputToUpdatedAccelerators() {
ArrayList<Accelerator> updatedAccelerators =
appleUtil.smiOutputToUpdatedAccelerators(sampleOutputJson.toString(), parsedGpuIds);

assertEquals(updatedAccelerators.size(), 7);
assertEquals(updatedAccelerators.size(), 1);
Accelerator accelerator = updatedAccelerators.get(0);
assertEquals(accelerator.getAcceleratorModel(), "Apple M1");
assertEquals(accelerator.getVendor(), AcceleratorVendor.APPLE);
Expand All @@ -112,7 +112,7 @@ public String[] getUtilizationSmiCommand() {
ArrayList<Accelerator> availableAccelerators =
spyAppleUtil.getAvailableAccelerators(availableAcceleratorIds);

assertEquals(availableAccelerators.size(), 7);
assertEquals(availableAccelerators.size(), 1);
Accelerator accelerator = availableAccelerators.get(0);
assertEquals(accelerator.getAcceleratorModel(), "Apple M1");
assertEquals(accelerator.getVendor(), AcceleratorVendor.APPLE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ public void testNumGpuM1() throws ReflectiveOperationException, IOException {
String mac_arm64_cpu_only = System.getenv().getOrDefault("TS_MAC_ARM64_CPU_ONLY", "False");
if (arch.equals("aarch64")) {
if (mac_arm64_cpu_only.equals("True")) {
Assert.assertEquals(configManager.getNumberOfGpu(), 0);
// Mac M1 returns 1 accelerator device
Assert.assertEquals(configManager.getNumberOfGpu(), 1);
} else {
Assert.assertTrue(configManager.getNumberOfGpu() > 0);
}
Expand Down
14 changes: 14 additions & 0 deletions ts/metrics/system_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,20 @@ def collect_gpu_metrics(num_of_gpus):
amdsmi.amdsmi_shut_down()
except amdsmi.AmdSmiException as e:
logging.error("Could not shut down AMD-SMI library.")
elif torch.backends.mps.is_available():
try:
total_memory = torch.mps.driver_allocated_memory()
mem_used = torch.mps.current_allocated_memory()
gpu_mem_utilization = (
(mem_used / total_memory * 100) if total_memory > 0 else 0
)
# Currently there is no way to calculate GPU utilization with MPS.
gpu_utilization = None
except Exception as e:
logging.error(f"Could not capture MPS memory metrics")
mem_used = 0
gpu_mem_utilization = 0
gpu_utilization = None

dimension_gpu = [
Dimension("Level", "Host"),
Expand Down

0 comments on commit 1a3b18b

Please sign in to comment.