diff --git a/bin/nnc/sdpa_bench.c b/bin/nnc/sdpa_bench.c index 5ca64a25f..c314eb9cc 100644 --- a/bin/nnc/sdpa_bench.c +++ b/bin/nnc/sdpa_bench.c @@ -37,9 +37,9 @@ int main(int argc, char** argv) int is_causal = is_causal_candidates[trial]; float scale = 1.0 / sqrt((float)D); - ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0); - ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0); - ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0); + ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hq, R, D), 0); + ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hk, C, D), 0); + ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hk, C, D), 0); for (int i = 0; i < B * R * Hq * D; ++i) { q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D); @@ -51,18 +51,18 @@ int main(int argc, char** argv) v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D); } - ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0); + ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hq, R, D), 0); // ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, NULL, NULL, NULL), TENSOR_LIST(o_tensor, NULL), 0); - ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0); - ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0); - ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0); + ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, Hq, R, D), 0); + ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, Hk, C, D), 0); + ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, Hk, C, D), 0); ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), 0); // Why it there 000 in the beginning of the argument list for GPU_TENSOR_NHWC? - ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0); - ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0); - ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0); - ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0); + ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, Hq, R, D), 0); + ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, Hk, C, D), 0); + ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, Hk, C, D), 0); + ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, Hq, R, D), 0); ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0); ccv_nnc_cmd_t scaled_dot_product_attention = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal); @@ -75,9 +75,9 @@ int main(int argc, char** argv) elapsed_time = get_current_time() - elapsed_time; printf("%d, %d, %d, %d, %d, %d, %2.3f\n", B, R, C, Hq, Hk, D, elapsed_time); - ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0); + ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, Hq, R, D), 0); ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor_f16), 0); - ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0); + ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, Hq, R, D), 0); ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_o_tensor_f16), TENSOR_LIST(copy_of_gpu_o_tensor), 0); // REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 3e-3, "GPU computed output should be the same as CPU computed ones"); diff --git a/lib/nnc/cmd/scaled_dot_product_attention/mps/ccv_nnc_scaled_dot_product_attention_mps.m b/lib/nnc/cmd/scaled_dot_product_attention/mps/ccv_nnc_scaled_dot_product_attention_mps.m index 6c285c748..1bfbd8c9b 100644 --- a/lib/nnc/cmd/scaled_dot_product_attention/mps/ccv_nnc_scaled_dot_product_attention_mps.m +++ b/lib/nnc/cmd/scaled_dot_product_attention/mps/ccv_nnc_scaled_dot_product_attention_mps.m @@ -184,11 +184,7 @@ static int _ccv_nnc_scaled_dot_product_attention_forw(const ccv_nnc_cmd_t cmd, c [inputShapedTypes addObject:mps_k_shape]; [inputTensors addObject:mps_input_v]; [inputShapedTypes addObject:mps_v_shape]; - mps_q = [graph transposeTensor:mps_q dimension:-3 withDimension:-2 name:nil]; - mps_k = [graph transposeTensor:mps_k dimension:-3 withDimension:-2 name:nil]; - mps_v = [graph transposeTensor:mps_v dimension:-3 withDimension:-2 name:nil]; MPSGraphTensor* mps_o = [graph scaledDotProductAttentionWithQueryTensor:mps_q keyTensor:mps_k valueTensor:mps_v scale:scale name:nil]; - mps_o = [graph transposeTensor:mps_o dimension:-3 withDimension:-2 name:nil]; [resultTensors addObject:mps_o]; }); MPSGraphTensorData* data_q = ccv_nnc_mps_graph_tensor_data(q, qdim, qstride); diff --git a/lib/nnc/mps/ccv_nnc_mps.m b/lib/nnc/mps/ccv_nnc_mps.m index 17815ae63..f882aed07 100644 --- a/lib/nnc/mps/ccv_nnc_mps.m +++ b/lib/nnc/mps/ccv_nnc_mps.m @@ -459,8 +459,8 @@ void ccv_nnc_mps_clear_graph_executable_cache(void) assert(inputTensors.count == inputShapedTypes.count); MPSGraphCompilationDescriptor* compilationDescriptor = [MPSGraphCompilationDescriptor new]; // Need more investigation into what this does. - compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0; - compilationDescriptor.optimizationProfile = MPSGraphOptimizationProfilePerformance; + // compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0; + // compilationDescriptor.optimizationProfile = MPSGraphOptimizationProfilePerformance; MPSGraphExecutable* executable = [[graph compileWithDevice:ccv_nnc_default_mps_device() feeds:[NSDictionary dictionaryWithObjects:inputShapedTypes forKeys:inputTensors] targetTensors:targetTensors targetOperations:nil compilationDescriptor:compilationDescriptor] retain]; executable.options = MPSGraphOptionsSynchronizeResults; [compilationDescriptor release]; @@ -1238,8 +1238,8 @@ void ccv_nnc_mps_graph_result(MPSGraph* graph, MPSCommandBuffer* command_buffer, off_t offset = mpgetoffset((ccv_nnc_tensor_t*)data); MPSGraphCompilationDescriptor* compilationDescriptor = [MPSGraphCompilationDescriptor new]; // Need more investigation into what this does. - compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0; - compilationDescriptor.optimizationProfile = MPSGraphOptimizationProfilePerformance; + // compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0; + // compilationDescriptor.optimizationProfile = MPSGraphOptimizationProfilePerformance; MPSGraphExecutionDescriptor* executionDescriptor = [MPSGraphExecutionDescriptor new]; executionDescriptor.compilationDescriptor = compilationDescriptor; if (CCV_IS_TENSOR_CONTIGUOUS(data) && offset == 0)