Skip to content

Commit

Permalink
Best case scenario for MPS SDPA.
Browse files Browse the repository at this point in the history
  • Loading branch information
liuliu committed Dec 21, 2024
1 parent d78aa08 commit e3013d7
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 21 deletions.
26 changes: 13 additions & 13 deletions bin/nnc/sdpa_bench.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ int main(int argc, char** argv)
int is_causal = is_causal_candidates[trial];
float scale = 1.0 / sqrt((float)D);

ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hq, R, D), 0);
ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hk, C, D), 0);
ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hk, C, D), 0);

for (int i = 0; i < B * R * Hq * D; ++i) {
q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D);
Expand All @@ -51,18 +51,18 @@ int main(int argc, char** argv)
v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
}

ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hq, R, D), 0);
// ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, NULL, NULL, NULL), TENSOR_LIST(o_tensor, NULL), 0);
ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, Hq, R, D), 0);
ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, Hk, C, D), 0);
ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, Hk, C, D), 0);
ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), 0);

// Why is there 000 at the beginning of the argument list for GPU_TENSOR_NHWC?
ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, Hq, R, D), 0);
ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, Hk, C, D), 0);
ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, Hk, C, D), 0);
ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, Hq, R, D), 0);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);

ccv_nnc_cmd_t scaled_dot_product_attention = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal);
Expand All @@ -75,9 +75,9 @@ int main(int argc, char** argv)
elapsed_time = get_current_time() - elapsed_time;
printf("%d, %d, %d, %d, %d, %d, %2.3f\n", B, R, C, Hq, Hk, D, elapsed_time);

ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, Hq, R, D), 0);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor_f16), 0);
ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hq, R, D), 0);
ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_o_tensor_f16), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

// REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 3e-3, "GPU computed output should be the same as CPU computed ones");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,7 @@ static int _ccv_nnc_scaled_dot_product_attention_forw(const ccv_nnc_cmd_t cmd, c
[inputShapedTypes addObject:mps_k_shape];
[inputTensors addObject:mps_input_v];
[inputShapedTypes addObject:mps_v_shape];
mps_q = [graph transposeTensor:mps_q dimension:-3 withDimension:-2 name:nil];
mps_k = [graph transposeTensor:mps_k dimension:-3 withDimension:-2 name:nil];
mps_v = [graph transposeTensor:mps_v dimension:-3 withDimension:-2 name:nil];
MPSGraphTensor* mps_o = [graph scaledDotProductAttentionWithQueryTensor:mps_q keyTensor:mps_k valueTensor:mps_v scale:scale name:nil];
mps_o = [graph transposeTensor:mps_o dimension:-3 withDimension:-2 name:nil];
[resultTensors addObject:mps_o];
});
MPSGraphTensorData* data_q = ccv_nnc_mps_graph_tensor_data(q, qdim, qstride);
Expand Down
8 changes: 4 additions & 4 deletions lib/nnc/mps/ccv_nnc_mps.m
Original file line number Diff line number Diff line change
Expand Up @@ -459,8 +459,8 @@ void ccv_nnc_mps_clear_graph_executable_cache(void)
assert(inputTensors.count == inputShapedTypes.count);
MPSGraphCompilationDescriptor* compilationDescriptor = [MPSGraphCompilationDescriptor new];
// Need more investigation into what this does.
compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
compilationDescriptor.optimizationProfile = MPSGraphOptimizationProfilePerformance;
// compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
// compilationDescriptor.optimizationProfile = MPSGraphOptimizationProfilePerformance;
MPSGraphExecutable* executable = [[graph compileWithDevice:ccv_nnc_default_mps_device() feeds:[NSDictionary dictionaryWithObjects:inputShapedTypes forKeys:inputTensors] targetTensors:targetTensors targetOperations:nil compilationDescriptor:compilationDescriptor] retain];
executable.options = MPSGraphOptionsSynchronizeResults;
[compilationDescriptor release];
Expand Down Expand Up @@ -1238,8 +1238,8 @@ void ccv_nnc_mps_graph_result(MPSGraph* graph, MPSCommandBuffer* command_buffer,
off_t offset = mpgetoffset((ccv_nnc_tensor_t*)data);
MPSGraphCompilationDescriptor* compilationDescriptor = [MPSGraphCompilationDescriptor new];
// Need more investigation into what this does.
compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
compilationDescriptor.optimizationProfile = MPSGraphOptimizationProfilePerformance;
// compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
// compilationDescriptor.optimizationProfile = MPSGraphOptimizationProfilePerformance;
MPSGraphExecutionDescriptor* executionDescriptor = [MPSGraphExecutionDescriptor new];
executionDescriptor.compilationDescriptor = compilationDescriptor;
if (CCV_IS_TENSOR_CONTIGUOUS(data) && offset == 0)
Expand Down

0 comments on commit e3013d7

Please sign in to comment.