Add GPU (CUDA) runtime tracing and tighten the generated launch-size checks.

* New trace flag 'g' (global.trace.gpu): written out as SAC_DO_TRACE_GPU by
  PrintGlobalSwitches and consumed by the new SAC_TR_GPU_PRINT macro in
  rt_trace.h.
* cuda_max_x_dim / cuda_max_yz_dim are renamed to cuda_max_x_grid /
  cuda_max_yz_grid; new globals cuda_max_xy_block, cuda_max_z_block and
  cuda_max_threads_block are set per -arch value in InitCudaBlockSizes.
* CUDA_SET_GRID / CUDA_SET_BLOCK now emit a trace line, a zero check for each
  dimension, per-dimension upper-bound checks and (for blocks only) a
  threads-per-block check; the error messages report the offending values.

NOTE(review): line breaks, indentation and blank context lines were
reconstructed from a whitespace-collapsed copy of this patch. Verify them
against the tree, or apply with whitespace-insensitive matching.

diff --git a/src/libsac2c/codegen/gen_startup_code.c b/src/libsac2c/codegen/gen_startup_code.c
index 21b8c3ad1537fb76361cd08c6a778fc29c73e0f3..f8da00851d9e5204f15291ccd9c0b704b33b1e9d 100644
--- a/src/libsac2c/codegen/gen_startup_code.c
+++ b/src/libsac2c/codegen/gen_startup_code.c
@@ -149,6 +149,8 @@ PrintGlobalSwitches (void)
              (global.trace.aa) ? 1 : 0);
     fprintf (global.outfile, "#define SAC_DO_TRACE_MT %d\n",
              (global.trace.mt) ? 1 : 0);
+    fprintf (global.outfile, "#define SAC_DO_TRACE_GPU %d\n",
+             (global.trace.gpu) ? 1 : 0);
     fprintf (global.outfile, "#define SAC_DO_TRACE_RTSPEC %d\n",
              (global.trace.rtspec) ? 1 : 0);
     fprintf (global.outfile, "#define SAC_DO_TRACE_DISTMEM %d\n",
diff --git a/src/libsac2c/codegen/icm2c_cuda.c b/src/libsac2c/codegen/icm2c_cuda.c
index fd172979bc900a54262d0a88c8dea264c72799b0..feec52f9c6e08d57d4d8ed0d5a868474476b2fb3 100644
--- a/src/libsac2c/codegen/icm2c_cuda.c
+++ b/src/libsac2c/codegen/icm2c_cuda.c
@@ -268,16 +268,32 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY)
 #define CUDA_SET_GRID(fmt, ...) \
     fprintf (global.outfile, "dim3 grid(" fmt ");\n", __VA_ARGS__); \
     INDENT; \
+    fprintf (global.outfile, \
+             "SAC_TR_GPU_PRINT (\"CUDA XYZ grid dimension of " \
+             "%%u x %%u x %%u\", grid.x , grid.y , grid.z );\n"); \
     INDENT; \
+    fprintf (global.outfile, "if (grid.x <= 0 ) {\n" \
+             "SAC_RuntimeError(\"CUDA X grid dimension must be bigger than zero. Current"\
+             " value is %%u\", grid.x);\n"); \
+    fprintf (global.outfile, "}\n"); \
+    fprintf (global.outfile, "if (grid.y <= 0 ) {\n" \
+             "SAC_RuntimeError(\"CUDA Y grid dimension must be bigger than zero. Current"\
+             " value is %%u\", grid.y);\n"); \
+    fprintf (global.outfile, "}\n"); \
+    fprintf (global.outfile, "if (grid.z <= 0 ) {\n" \
+             "SAC_RuntimeError(\"CUDA Z grid dimension must be bigger than zero. Current"\
+             " value is %%u\", grid.z);\n"); \
+    fprintf (global.outfile, "}\n"); \
     fprintf (global.outfile, "if (grid.x > %u || grid.y > %u || grid.z > %u) {\n", \
-             global.cuda_max_x_dim, global.cuda_max_yz_dim, global.cuda_max_yz_dim); \
+             global.cuda_max_x_grid, global.cuda_max_yz_grid, global.cuda_max_yz_grid); \
     INDENT; \
     INDENT; \
     INDENT; \
     fprintf (global.outfile, \
-             "SAC_RuntimeError(\"CUDA XYZ grid dimension exceeds compute " \
-             "compatibilities max value: %u x %u x %u\");\n", \
-             global.cuda_max_x_dim, global.cuda_max_yz_dim, global.cuda_max_yz_dim); \
+             "SAC_RuntimeError(\"CUDA XYZ grid dimension of %%u x %%u x %%u exceeds " \
+             "the compute capability's max value: %u x %u x %u\"," \
+             " grid.x, grid.y, grid.z );\n", \
+             global.cuda_max_x_grid, global.cuda_max_yz_grid, global.cuda_max_yz_grid); \
     INDENT; \
     INDENT; \
     fprintf (global.outfile, "}\n");
@@ -285,16 +301,49 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY)
 #define CUDA_SET_BLOCK(fmt, ...) \
     fprintf (global.outfile, "dim3 block(" fmt ");", __VA_ARGS__); \
     INDENT; \
+    fprintf (global.outfile, \
+             "SAC_TR_GPU_PRINT (\"CUDA XYZ block dimension of " \
+             "%%u x %%u x %%u\", block.x , block.y , block.z );\n"); \
     INDENT; \
+    fprintf (global.outfile, "if (block.x <= 0 ) {\n" \
+             "SAC_RuntimeError(\"CUDA X block dimension must be bigger than zero. " \
+             "Current value is %%u\", block.x);\n"); \
+    fprintf (global.outfile, "}\n"); \
+    fprintf (global.outfile, "if (block.y <= 0 ) {\n" \
+             "SAC_RuntimeError(\"CUDA Y block dimension must be bigger than zero. " \
+             "Current value is %%u\", block.y);\n"); \
+    fprintf (global.outfile, "}\n"); \
+    fprintf (global.outfile, "if (block.z <= 0 ) {\n" \
+             "SAC_RuntimeError(\"CUDA Z block dimension must be bigger than zero. " \
+             "Current value is %%u\", block.z);\n"); \
+    fprintf (global.outfile, "}\n"); \
     fprintf (global.outfile, "if (block.x > %u || block.y > %u || block.z > %u) {\n", \
-             global.cuda_max_x_dim, global.cuda_max_yz_dim, global.cuda_max_yz_dim); \
+             global.cuda_max_xy_block, global.cuda_max_xy_block, \
+             global.cuda_max_z_block); \
     INDENT; \
     INDENT; \
     INDENT; \
     fprintf (global.outfile, \
-             "SAC_RuntimeError(\"CUDA XYZ block dimension exceeds compute " \
-             "compatibilities max value: %u x %u x %u\");\n", \
-             global.cuda_max_x_dim, global.cuda_max_yz_dim, global.cuda_max_yz_dim); \
+             "SAC_RuntimeError(\"CUDA XYZ block dimension of %%u x %%u x %%u exceeds " \
+             "the compute capability's max value: %u x %u x %u\", " \
+             "block.x, block.y, block.z);\n", \
+             global.cuda_max_xy_block, global.cuda_max_xy_block, \
+             global.cuda_max_z_block); \
+    INDENT; \
+    INDENT; \
+    fprintf (global.outfile, "}\n"); \
+    INDENT; \
+    INDENT; \
+    fprintf (global.outfile, "if (block.x * block.y * block.z > %u ) {\n", \
+             global.cuda_max_threads_block); \
+    INDENT; \
+    INDENT; \
+    INDENT; \
+    fprintf (global.outfile, \
+             "SAC_RuntimeError(\"CUDA XYZ block dimension of %%u x %%u x %%u = %%u " \
+             "exceeds compute capability's max number of threads per block: %u\", " \
+             "block.x, block.y, block.z, block.x * block.y * block.z);\n", \
+             global.cuda_max_threads_block); \
     INDENT; \
     INDENT; \
     fprintf (global.outfile, "}\n");
@@ -334,6 +383,8 @@ ICMCompileCUDA_GRID_BLOCK (int bounds_count, char **var_ANY)
 
     INDENT;
     fprintf (global.outfile, "{\n");
+    INDENT;
+    fprintf (global.outfile, "SAC_TR_GPU_PRINT (\"launching %dD kernel\");\n", bounds_count / 3);
     if (bounds_count == 3) { /* 1D CUDA withloop */
         INDENT;
         INDENT;
diff --git a/src/libsac2c/cuda/annotate_cuda_withloop2.c b/src/libsac2c/cuda/annotate_cuda_withloop2.c
index 8d37c8d5849c5a0332dfaf1376be2706f8321620..e4b2076e7fc29012e04c1e5d0ded69711d4f62f3 100644
--- a/src/libsac2c/cuda/annotate_cuda_withloop2.c
+++ b/src/libsac2c/cuda/annotate_cuda_withloop2.c
@@ -116,8 +116,11 @@ InitCudaBlockSizes (void)
         global.cuda_blocking_factor = 16;
         global.cuda_2d_block_x = 16;
         global.cuda_2d_block_y = 16;
-        global.cuda_max_x_dim = 65535;
-        global.cuda_max_yz_dim = 65535;
+        global.cuda_max_x_grid = 65535;
+        global.cuda_max_yz_grid = 65535;
+        global.cuda_max_xy_block = 512;
+        global.cuda_max_z_block = 64;
+        global.cuda_max_threads_block = 512;
     } else if (STReq (global.config.cuda_arch, "-arch=sm_12")
                || STReq (global.config.cuda_arch, "-arch=sm_13")) {
         global.optimal_threads = 256;
@@ -127,8 +130,11 @@ InitCudaBlockSizes (void)
         global.cuda_blocking_factor = 16;
         global.cuda_2d_block_x = 16;
         global.cuda_2d_block_y = 16;
-        global.cuda_max_x_dim = 65535;
-        global.cuda_max_yz_dim = 65535;
+        global.cuda_max_x_grid = 65535;
+        global.cuda_max_yz_grid = 65535;
+        global.cuda_max_xy_block = 512;
+        global.cuda_max_z_block = 64;
+        global.cuda_max_threads_block = 512;
     } else if (STReq (global.config.cuda_arch, "-arch=sm_20")) {
         /*
         global.optimal_threads = 512;
@@ -148,8 +154,11 @@ InitCudaBlockSizes (void)
         global.cuda_blocking_factor = 32;
         global.cuda_2d_block_x = 16;
         global.cuda_2d_block_y = 16;
-        global.cuda_max_x_dim = 65535;
-        global.cuda_max_yz_dim = 65535;
+        global.cuda_max_x_grid = 65535;
+        global.cuda_max_yz_grid = 65535;
+        global.cuda_max_xy_block = 1024;
+        global.cuda_max_z_block = 64;
+        global.cuda_max_threads_block = 1024;
     } else if (STReq (global.config.cuda_arch, "-arch=sm_35")) {
         global.optimal_threads = 512;
         global.optimal_blocks = 3;
@@ -163,8 +172,11 @@ InitCudaBlockSizes (void)
         global.cuda_blocking_factor = 32;
         global.cuda_2d_block_x = 16;
         global.cuda_2d_block_y = 16;
-        global.cuda_max_x_dim = 2147483647;
-        global.cuda_max_yz_dim = 65535;
+        global.cuda_max_x_grid = 2147483647;
+        global.cuda_max_yz_grid = 65535;
+        global.cuda_max_xy_block = 1024;
+        global.cuda_max_z_block = 64;
+        global.cuda_max_threads_block = 1024;
     } else if (STReq (global.config.cuda_arch, "-arch=sm_50")) {
         global.optimal_threads = 512;
         global.optimal_blocks = 3;
@@ -178,8 +190,11 @@ InitCudaBlockSizes (void)
         global.cuda_blocking_factor = 32;
         global.cuda_2d_block_x = 32;
         global.cuda_2d_block_y = 32;
-        global.cuda_max_x_dim = 2147483647;
-        global.cuda_max_yz_dim = 65535;
+        global.cuda_max_x_grid = 2147483647;
+        global.cuda_max_yz_grid = 65535;
+        global.cuda_max_xy_block = 1024;
+        global.cuda_max_z_block = 64;
+        global.cuda_max_threads_block = 1024;
     } else {
         if (STReq (global.config.cuda_arch, "no")) {
             CTIwarn ("CUDA architecture was not detected during install, setting to "
diff --git a/src/libsac2c/global/flags.mac b/src/libsac2c/global/flags.mac
index 70e2f7a85b66ed962fe271230dab33672c8a81eb..bfe973287beb318b7ae688b7ed495f10c6bc0e33 100644
--- a/src/libsac2c/global/flags.mac
+++ b/src/libsac2c/global/flags.mac
@@ -27,6 +27,7 @@ TRACE (mem, 'm', FALSE)
 TRACE (wl, 'w', FALSE)
 TRACE (aa, 's', FALSE)
 TRACE (mt, 't', FALSE)
+TRACE (gpu, 'g', FALSE) /* CUDA backend */
 TRACE (cenv, 'c', FALSE)
 TRACE (distmem, 'd', FALSE) /* Distributed memory backend */
 
diff --git a/src/libsac2c/global/globals.mac b/src/libsac2c/global/globals.mac
index 2b05172ae9a24f524809b99e0bc5f9c0d83c1ba7..0044680e36e1552ebfb943098375aef043c11645 100644
--- a/src/libsac2c/global/globals.mac
+++ b/src/libsac2c/global/globals.mac
@@ -1320,8 +1320,11 @@ GLOBAL (int, cuda_1d_block_small, 0, xfree_dummy, )
 GLOBAL (int, cuda_2d_block_x, 0, xfree_dummy, )
 GLOBAL (int, cuda_2d_block_y, 0, xfree_dummy, )
 GLOBAL (int, cuda_blocking_factor, 0, xfree_dummy, )
-GLOBAL (unsigned int, cuda_max_x_dim, 0, xfree_dummy, )
-GLOBAL (unsigned int, cuda_max_yz_dim, 0, xfree_dummy, ) /* for both the Y and Z dimensions */
+GLOBAL (unsigned int, cuda_max_x_grid, 0, xfree_dummy, )
+GLOBAL (unsigned int, cuda_max_yz_grid, 0, xfree_dummy, ) /* for both the Y and Z dimensions */
+GLOBAL (unsigned int, cuda_max_xy_block, 0, xfree_dummy, )
+GLOBAL (unsigned int, cuda_max_z_block, 0, xfree_dummy, )
+GLOBAL (unsigned int, cuda_max_threads_block, 0, xfree_dummy, )
 
 /*
  * DistMem backend options
diff --git a/src/libsac2c/global/usage.c b/src/libsac2c/global/usage.c
index 5e4d951e15b85029e58bdcd9625da7fb4c57b666..b00760dfe047f8f24f54ba8fd844144895827ea9 100644
--- a/src/libsac2c/global/usage.c
+++ b/src/libsac2c/global/usage.c
@@ -850,6 +850,7 @@ PrintRuntimeTraceOptions (void)
             " p: Trace primitive function calls.\n"
             " w: Trace with-loop execution.\n"
             " s: Trace array accesses.\n"
+            " g: Trace CUDA runtime.\n"
             " t: Trace multi-threading specific operations.\n"
             " c: Trace runtime enviroment init/exit when\n"
             " using SAC libraries in C programs.\n"
diff --git a/src/runtime/extras_h/rt_trace.h b/src/runtime/extras_h/rt_trace.h
index 4635f4d8741e469708baf13a8c9d35974ae89d6d..7492eb2bd29f7d3869166d8ad234369b7a1a869c 100644
--- a/src/runtime/extras_h/rt_trace.h
+++ b/src/runtime/extras_h/rt_trace.h
@@ -176,6 +176,12 @@ typedef enum {
 
 #endif /* SAC_DO_TRACE_MT */
 
+#if SAC_DO_TRACE_GPU
+#define SAC_TR_GPU_PRINT(...) SAC_TR_PRINT (("GPU -> " __VA_ARGS__))
+#else /* SAC_DO_TRACE_GPU */
+#define SAC_TR_GPU_PRINT( ...)
+#endif /* SAC_DO_TRACE_GPU */
+
 #if SAC_DO_TRACE_DISTMEM
 #define SAC_TR_DISTMEM_PRINT(...) SAC_TR_PRINT (("DSM -> " __VA_ARGS__))
 