From 10e029b59fa376bc09d3dc1127c3e5fdc99f3b04 Mon Sep 17 00:00:00 2001 From: Sven-Bodo Scholz Date: Tue, 14 Aug 2018 21:13:54 +0100 Subject: [PATCH 1/3] fixed the error checking for CUDA block sizes and refined the error messages --- src/libsac2c/codegen/icm2c_cuda.c | 35 ++++++++++++++++----- src/libsac2c/cuda/annotate_cuda_withloop2.c | 35 +++++++++++++++------ src/libsac2c/global/globals.mac | 7 +++-- 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/src/libsac2c/codegen/icm2c_cuda.c b/src/libsac2c/codegen/icm2c_cuda.c index fd172979b..aa902f95e 100644 --- a/src/libsac2c/codegen/icm2c_cuda.c +++ b/src/libsac2c/codegen/icm2c_cuda.c @@ -270,14 +270,15 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) INDENT; \ INDENT; \ fprintf (global.outfile, "if (grid.x > %u || grid.y > %u || grid.z > %u) {\n", \ - global.cuda_max_x_dim, global.cuda_max_yz_dim, global.cuda_max_yz_dim); \ + global.cuda_max_x_grid, global.cuda_max_yz_grid, global.cuda_max_yz_grid); \ INDENT; \ INDENT; \ INDENT; \ fprintf (global.outfile, \ - "SAC_RuntimeError(\"CUDA XYZ grid dimension exceeds compute " \ - "compatibilities max value: %u x %u x %u\");\n", \ - global.cuda_max_x_dim, global.cuda_max_yz_dim, global.cuda_max_yz_dim); \ + "SAC_RuntimeError(\"CUDA XYZ grid dimension of %%u x %%u x %%u exceeds " \ + "the compute capability's max value: %u x %u x %u\"," \ + " grid.x, grid.y, grid.z );\n", \ + global.cuda_max_x_grid, global.cuda_max_yz_grid, global.cuda_max_yz_grid); \ INDENT; \ INDENT; \ fprintf (global.outfile, "}\n"); @@ -287,14 +288,32 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) INDENT; \ INDENT; \ fprintf (global.outfile, "if (block.x > %u || block.y > %u || block.z > %u) {\n", \ - global.cuda_max_x_dim, global.cuda_max_yz_dim, global.cuda_max_yz_dim); \ + global.cuda_max_xy_block, global.cuda_max_xy_block, \ + global.cuda_max_z_block); \ INDENT; \ INDENT; \ INDENT; \ fprintf (global.outfile, \ - "SAC_RuntimeError(\"CUDA XYZ block dimension exceeds compute " \ - "compatibilities max value: %u x %u x %u\");\n", \ - global.cuda_max_x_dim, global.cuda_max_yz_dim, global.cuda_max_yz_dim); \ + "SAC_RuntimeError(\"CUDA XYZ block dimension of %%u x %%u x %%u exceeds " \ + "the compute capability's max value: %u x %u x %u\", " \ + "block.x, block.y, block.z);\n", \ + global.cuda_max_xy_block, global.cuda_max_xy_block, \ + global.cuda_max_z_block); \ + INDENT; \ + INDENT; \ + fprintf (global.outfile, "}\n"); \ + INDENT; \ + INDENT; \ + fprintf (global.outfile, "if (block.x * block.y *block.z > %u ) {\n", \ + global.cuda_max_threads_block); \ + INDENT; \ + INDENT; \ + INDENT; \ + fprintf (global.outfile, \ + "SAC_RuntimeError(\"CUDA XYZ block dimension of %%u x %%u x %%u = %%u " \ + "exceeds compute capability's max number of threads per block: %u\", " \ + " block.x, block.y, block.zi, block.x * block.y *block.z );\n", \ + global.cuda_max_threads_block); \ INDENT; \ INDENT; \ fprintf (global.outfile, "}\n"); diff --git a/src/libsac2c/cuda/annotate_cuda_withloop2.c b/src/libsac2c/cuda/annotate_cuda_withloop2.c index 8d37c8d58..e4b2076e7 100644 --- a/src/libsac2c/cuda/annotate_cuda_withloop2.c +++ b/src/libsac2c/cuda/annotate_cuda_withloop2.c @@ -116,8 +116,11 @@ InitCudaBlockSizes (void) global.cuda_blocking_factor = 16; global.cuda_2d_block_x = 16; global.cuda_2d_block_y = 16; - global.cuda_max_x_dim = 65535; - global.cuda_max_yz_dim = 65535; + global.cuda_max_x_grid = 65535; + global.cuda_max_yz_grid = 65535; + global.cuda_max_xy_block = 512; + global.cuda_max_z_block = 64; + global.cuda_max_threads_block = 512; } else if (STReq (global.config.cuda_arch, "-arch=sm_12") || STReq (global.config.cuda_arch, "-arch=sm_13")) { global.optimal_threads = 256; @@ -127,8 +130,11 @@ InitCudaBlockSizes (void) global.cuda_blocking_factor = 16; global.cuda_2d_block_x = 16; global.cuda_2d_block_y = 16; - global.cuda_max_x_dim = 65535; - global.cuda_max_yz_dim = 65535; + global.cuda_max_x_grid = 65535; + global.cuda_max_yz_grid = 65535; + global.cuda_max_xy_block = 512; + global.cuda_max_z_block = 64; + global.cuda_max_threads_block = 512; } else if (STReq (global.config.cuda_arch, "-arch=sm_20")) { /* global.optimal_threads = 512; @@ -148,8 +154,11 @@ InitCudaBlockSizes (void) global.cuda_blocking_factor = 32; global.cuda_2d_block_x = 16; global.cuda_2d_block_y = 16; - global.cuda_max_x_dim = 65535; - global.cuda_max_yz_dim = 65535; + global.cuda_max_x_grid = 65535; + global.cuda_max_yz_grid = 65535; + global.cuda_max_xy_block = 1024; + global.cuda_max_z_block = 64; + global.cuda_max_threads_block = 1024; } else if (STReq (global.config.cuda_arch, "-arch=sm_35")) { global.optimal_threads = 512; global.optimal_blocks = 3; @@ -163,8 +172,11 @@ InitCudaBlockSizes (void) global.cuda_blocking_factor = 32; global.cuda_2d_block_x = 16; global.cuda_2d_block_y = 16; - global.cuda_max_x_dim = 2147483647; - global.cuda_max_yz_dim = 65535; + global.cuda_max_x_grid = 2147483647; + global.cuda_max_yz_grid = 65535; + global.cuda_max_xy_block = 1024; + global.cuda_max_z_block = 64; + global.cuda_max_threads_block = 1024; } else if (STReq (global.config.cuda_arch, "-arch=sm_50")) { global.optimal_threads = 512; global.optimal_blocks = 3; @@ -178,8 +190,11 @@ InitCudaBlockSizes (void) global.cuda_blocking_factor = 32; global.cuda_2d_block_x = 32; global.cuda_2d_block_y = 32; - global.cuda_max_x_dim = 2147483647; - global.cuda_max_yz_dim = 65535; + global.cuda_max_x_grid = 2147483647; + global.cuda_max_yz_grid = 65535; + global.cuda_max_xy_block = 1024; + global.cuda_max_z_block = 64; + global.cuda_max_threads_block = 1024; } else { if (STReq (global.config.cuda_arch, "no")) { CTIwarn ("CUDA architecture was not detected during install, setting to " diff --git a/src/libsac2c/global/globals.mac b/src/libsac2c/global/globals.mac index 2b05172ae..0044680e3 100644 --- a/src/libsac2c/global/globals.mac +++ b/src/libsac2c/global/globals.mac @@ -1320,8 +1320,11 @@ GLOBAL (int, cuda_1d_block_small, 0, xfree_dummy, ) GLOBAL (int, cuda_2d_block_x, 0, xfree_dummy, ) GLOBAL (int, cuda_2d_block_y, 0, xfree_dummy, ) GLOBAL (int, cuda_blocking_factor, 0, xfree_dummy, ) -GLOBAL (unsigned int, cuda_max_x_dim, 0, xfree_dummy, ) -GLOBAL (unsigned int, cuda_max_yz_dim, 0, xfree_dummy, ) /* for both the Y and Z dimensions */ +GLOBAL (unsigned int, cuda_max_x_grid, 0, xfree_dummy, ) +GLOBAL (unsigned int, cuda_max_yz_grid, 0, xfree_dummy, ) /* for both the Y and Z dimensions */ +GLOBAL (unsigned int, cuda_max_xy_block, 0, xfree_dummy, ) +GLOBAL (unsigned int, cuda_max_z_block, 0, xfree_dummy, ) +GLOBAL (unsigned int, cuda_max_threads_block, 0, xfree_dummy, ) /* * DistMem backend options -- GitLab From 35f9ca18bebded32545897c167b6b3f416bc79ce Mon Sep 17 00:00:00 2001 From: Sven-Bodo Scholz Date: Wed, 15 Aug 2018 15:43:57 +0100 Subject: [PATCH 2/3] fixed minor typo and added -trace g option trace g traces GPU actions; currently only kernel launches will be reported. --- src/libsac2c/codegen/gen_startup_code.c | 2 ++ src/libsac2c/codegen/icm2c_cuda.c | 32 ++++++++++++++++++++++++- src/libsac2c/global/flags.mac | 1 + src/libsac2c/global/usage.c | 1 + src/runtime/extras_h/rt_trace.h | 6 +++++ 5 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/libsac2c/codegen/gen_startup_code.c b/src/libsac2c/codegen/gen_startup_code.c index 21b8c3ad1..f8da00851 100644 --- a/src/libsac2c/codegen/gen_startup_code.c +++ b/src/libsac2c/codegen/gen_startup_code.c @@ -149,6 +149,8 @@ PrintGlobalSwitches (void) (global.trace.aa) ? 1 : 0); fprintf (global.outfile, "#define SAC_DO_TRACE_MT %d\n", (global.trace.mt) ? 1 : 0); + fprintf (global.outfile, "#define SAC_DO_TRACE_GPU %d\n", + (global.trace.gpu) ? 1 : 0); fprintf (global.outfile, "#define SAC_DO_TRACE_RTSPEC %d\n", (global.trace.rtspec) ? 1 : 0); fprintf (global.outfile, "#define SAC_DO_TRACE_DISTMEM %d\n", diff --git a/src/libsac2c/codegen/icm2c_cuda.c b/src/libsac2c/codegen/icm2c_cuda.c index aa902f95e..112d3c144 100644 --- a/src/libsac2c/codegen/icm2c_cuda.c +++ b/src/libsac2c/codegen/icm2c_cuda.c @@ -268,7 +268,21 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) #define CUDA_SET_GRID(fmt, ...) \ fprintf (global.outfile, "dim3 grid(" fmt ");\n", __VA_ARGS__); \ INDENT; \ + fprintf (global.outfile, "SAC_TR_GPU_PRINT (\"CUDA XYZ grid dimension of " \ + "%%u x %%u x %%u\", grid.x , grid.y , grid.z );\n" ); \ INDENT; \ + fprintf (global.outfile, "if (grid.x <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA X grid dimension must be bigger than zero. Current"\ + " value is %%u\", grid.x);" ); \ + fprintf (global.outfile, "}\n"); \ + fprintf (global.outfile, "if (grid.y <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA Y grid dimension must be bigger than zero. Current"\ + " value is %%u\", grid.y);" ); \ + fprintf (global.outfile, "}\n"); \ + fprintf (global.outfile, "if (grid.z <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA Z grid dimension must be bigger than zero. Current"\ + " value is %%u\", grid.z);" ); \ + fprintf (global.outfile, "}\n"); \ fprintf (global.outfile, "if (grid.x > %u || grid.y > %u || grid.z > %u) {\n", \ global.cuda_max_x_grid, global.cuda_max_yz_grid, global.cuda_max_yz_grid); \ INDENT; \ @@ -286,7 +300,21 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) #define CUDA_SET_BLOCK(fmt, ...) \ fprintf (global.outfile, "dim3 block(" fmt ");", __VA_ARGS__); \ INDENT; \ + fprintf (global.outfile, "SAC_TR_GPU_PRINT (\"CUDA XYZ block dimension of " \ + "%%u x %%u x %%u\\n\", block.x , block.y , block.z );\n" ); \ INDENT; \ + fprintf (global.outfile, "if (block.x <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA X block dimension must be bigger than zero. Current"\ + " value is %%u\", block.x);" ); \ + fprintf (global.outfile, "}\n"); \ + fprintf (global.outfile, "if (block.y <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA Y block dimension must be bigger than zero. Current"\ + " value is %%u\", block.y);" ); \ + fprintf (global.outfile, "}\n"); \ + fprintf (global.outfile, "if (block.z <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA Z block dimension must be bigger than zero. Current"\ + " value is %%u\", block.z);" ); \ + fprintf (global.outfile, "}\n"); \ fprintf (global.outfile, "if (block.x > %u || block.y > %u || block.z > %u) {\n", \ global.cuda_max_xy_block, global.cuda_max_xy_block, \ global.cuda_max_z_block); \ @@ -312,7 +340,7 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) fprintf (global.outfile, \ "SAC_RuntimeError(\"CUDA XYZ block dimension of %%u x %%u x %%u = %%u " \ "exceeds compute capability's max number of threads per block: %u\", " \ - " block.x, block.y, block.zi, block.x * block.y *block.z );\n", \ + " block.x, block.y, block.z, block.x * block.y *block.z );\n", \ global.cuda_max_threads_block); \ INDENT; \ INDENT; \ @@ -353,6 +381,8 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) INDENT; fprintf (global.outfile, "{\n"); + fprintf (global.outfile, "SAC_TR_GPU_PRINT (\"launching %dD kernel\");", bounds_count/3); + INDENT; if (bounds_count == 3) { /* 1D CUDA withloop */ INDENT; INDENT; diff --git a/src/libsac2c/global/flags.mac b/src/libsac2c/global/flags.mac index 70e2f7a85..bfe973287 100644 --- a/src/libsac2c/global/flags.mac +++ b/src/libsac2c/global/flags.mac @@ -27,6 +27,7 @@ TRACE (mem, 'm', FALSE) TRACE (wl, 'w', FALSE) TRACE (aa, 's', FALSE) TRACE (mt, 't', FALSE) +TRACE (gpu, 'g', FALSE) /* CUDA backend */ TRACE (cenv, 'c', FALSE) TRACE (distmem, 'd', FALSE) /* Distributed memory backend */ diff --git a/src/libsac2c/global/usage.c b/src/libsac2c/global/usage.c index 5e4d951e1..b00760dfe 100644 --- a/src/libsac2c/global/usage.c +++ b/src/libsac2c/global/usage.c @@ -850,6 +850,7 @@ PrintRuntimeTraceOptions (void) " p: Trace primitive function calls.\n" " w: Trace with-loop execution.\n" " s: Trace array accesses.\n" + " g: Trace CUDA runtime.\n" " t: Trace multi-threading specific operations.\n" " c: Trace runtime enviroment init/exit when\n" " using SAC libraries in C programs.\n" diff --git a/src/runtime/extras_h/rt_trace.h b/src/runtime/extras_h/rt_trace.h index 4635f4d87..7492eb2bd 100644 --- a/src/runtime/extras_h/rt_trace.h +++ b/src/runtime/extras_h/rt_trace.h @@ -176,6 +176,12 @@ typedef enum { #endif /* SAC_DO_TRACE_MT */ +#if SAC_DO_TRACE_GPU +#define SAC_TR_GPU_PRINT(...) SAC_TR_PRINT (("GPU -> " __VA_ARGS__)) +#else /* SAC_DO_TRACE_GPU */ +#define SAC_TR_GPU_PRINT( ...) +#endif /* SAC_DO_TRACE_GPU */ + #if SAC_DO_TRACE_DISTMEM #define SAC_TR_DISTMEM_PRINT(...) SAC_TR_PRINT (("DSM -> " __VA_ARGS__)) -- GitLab From 6c479dad94b0e5fd092835ff67ae6c0d5f8c8bce Mon Sep 17 00:00:00 2001 From: Sven-Bodo Scholz Date: Thu, 16 Aug 2018 15:51:37 +0100 Subject: [PATCH 3/3] style-corrections requested at merge --- src/libsac2c/codegen/icm2c_cuda.c | 36 ++++++++++++++++--------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/libsac2c/codegen/icm2c_cuda.c b/src/libsac2c/codegen/icm2c_cuda.c index 112d3c144..feec52f9c 100644 --- a/src/libsac2c/codegen/icm2c_cuda.c +++ b/src/libsac2c/codegen/icm2c_cuda.c @@ -268,20 +268,21 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) #define CUDA_SET_GRID(fmt, ...) \ fprintf (global.outfile, "dim3 grid(" fmt ");\n", __VA_ARGS__); \ INDENT; \ - fprintf (global.outfile, "SAC_TR_GPU_PRINT (\"CUDA XYZ grid dimension of " \ - "%%u x %%u x %%u\", grid.x , grid.y , grid.z );\n" ); \ + fprintf (global.outfile, \ + "SAC_TR_GPU_PRINT (\"CUDA XYZ grid dimension of " \ + "%%u x %%u x %%u\", grid.x , grid.y , grid.z );\n"); \ INDENT; \ fprintf (global.outfile, "if (grid.x <= 0 ) {\n" \ "SAC_RuntimeError(\"CUDA X grid dimension must be bigger than zero. Current"\ - " value is %%u\", grid.x);" ); \ + " value is %%u\", grid.x);"); \ fprintf (global.outfile, "}\n"); \ fprintf (global.outfile, "if (grid.y <= 0 ) {\n" \ "SAC_RuntimeError(\"CUDA Y grid dimension must be bigger than zero. Current"\ - " value is %%u\", grid.y);" ); \ + " value is %%u\", grid.y);"); \ fprintf (global.outfile, "}\n"); \ fprintf (global.outfile, "if (grid.z <= 0 ) {\n" \ "SAC_RuntimeError(\"CUDA Z grid dimension must be bigger than zero. Current"\ - " value is %%u\", grid.z);" ); \ + " value is %%u\", grid.z);"); \ fprintf (global.outfile, "}\n"); \ fprintf (global.outfile, "if (grid.x > %u || grid.y > %u || grid.z > %u) {\n", \ global.cuda_max_x_grid, global.cuda_max_yz_grid, global.cuda_max_yz_grid); \ @@ -300,20 +301,21 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) #define CUDA_SET_BLOCK(fmt, ...) \ fprintf (global.outfile, "dim3 block(" fmt ");", __VA_ARGS__); \ INDENT; \ - fprintf (global.outfile, "SAC_TR_GPU_PRINT (\"CUDA XYZ block dimension of " \ - "%%u x %%u x %%u\\n\", block.x , block.y , block.z );\n" ); \ + fprintf (global.outfile, \ + "SAC_TR_GPU_PRINT (\"CUDA XYZ block dimension of " \ + "%%u x %%u x %%u\\n\", block.x , block.y , block.z );\n"); \ INDENT; \ - fprintf (global.outfile, "if (block.x <= 0 ) {\n" \ - "SAC_RuntimeError(\"CUDA X block dimension must be bigger than zero. Current"\ - " value is %%u\", block.x);" ); \ + fprintf (global.outfile, "if (block.x <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA X block dimension must be bigger than zero. " \ + "Current value is %%u\", block.x);"); \ fprintf (global.outfile, "}\n"); \ - fprintf (global.outfile, "if (block.y <= 0 ) {\n" \ - "SAC_RuntimeError(\"CUDA Y block dimension must be bigger than zero. Current"\ - " value is %%u\", block.y);" ); \ + fprintf (global.outfile, "if (block.y <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA Y block dimension must be bigger than zero. " \ + "Current value is %%u\", block.y);"); \ fprintf (global.outfile, "}\n"); \ - fprintf (global.outfile, "if (block.z <= 0 ) {\n" \ - "SAC_RuntimeError(\"CUDA Z block dimension must be bigger than zero. Current"\ - " value is %%u\", block.z);" ); \ + fprintf (global.outfile, "if (block.z <= 0 ) {\n" \ + "SAC_RuntimeError(\"CUDA Z block dimension must be bigger than zero. " \ + "Current value is %%u\", block.z);"); \ fprintf (global.outfile, "}\n"); \ fprintf (global.outfile, "if (block.x > %u || block.y > %u || block.z > %u) {\n", \ global.cuda_max_xy_block, global.cuda_max_xy_block, \ @@ -340,7 +342,7 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY) fprintf (global.outfile, \ "SAC_RuntimeError(\"CUDA XYZ block dimension of %%u x %%u x %%u = %%u " \ "exceeds compute capability's max number of threads per block: %u\", " \ - " block.x, block.y, block.z, block.x * block.y *block.z );\n", \ + "block.x, block.y, block.z, block.x * block.y * block.z);\n", \ global.cuda_max_threads_block); \ INDENT; \ INDENT; \ -- GitLab