diff --git a/CMakeLists.txt b/CMakeLists.txt index 3338b5b81dca0826e0961a12ba53ad59536e0e6d..c6067abb354a8502b96861751ba6ca0cea92a5fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ IF (ENABLE_ISL AND ENABLE_BARVINOK) PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -Wno-conversion" ) ADD_EXECUTABLE (sacislinterface src/tools/sacislinterface/sacislinterface.c) - ADD_DEPENDENCIES (sac2cShared check_repo_version) + ADD_DEPENDENCIES (sacislinterface check_repo_version) TARGET_LINK_LIBRARIES (sacislinterface ${LIB_ISL} ${LIB_BARVINOK} ${BARVINOK_LIB}) TARGET_INCLUDE_DIRECTORIES (sacislinterface PUBLIC ${BARVINOK_INC_PATH} ${ISL_INC_PATH}) ENDIF () @@ -115,7 +115,6 @@ STRING (REPLACE ";" ":" _TARGETS "${RT_TARGETS}") # This is where we call the build of the sac2c shared-libraries # This *depends* on sac2c having been build first! ExternalProject_Add(runtime_libraries - DEPENDS sac2cShared sac2c check_repo_version sac_h DOWNLOAD_COMMAND "" # this is to prevent any download target from being called INSTALL_COMMAND "" # this is to prevent any install target from being called PREFIX runtime_build @@ -133,7 +132,7 @@ ExternalProject_Add(runtime_libraries ) # We set dependencies on the configure step, this makes sure we propogate # certain values (such as from sac2crc). -ExternalProject_Add_StepDependencies(runtime_libraries configure sac2c sac2cShared check_repo_version) +ExternalProject_Add_StepDependencies(runtime_libraries configure sac2c sac2cShared sac_h check_repo_version) # Get runtime library build directory ExternalProject_Get_Property (runtime_libraries BINARY_DIR) SET (RUNTIME_BINARY_DIR ${BINARY_DIR}) # redefine name to something more useful @@ -159,7 +158,12 @@ ADD_DEPENDENCIES (fullclean runtime_libraries-clean) # directly as part of e.g. target_link_libraries functions. As such we instead # force add target properties which we can access explicitly. 
# Expose configure step as target -ExternalProject_Add_StepTargets (runtime_libraries configure) +# XXX (hans) additionally we explicitly set inter-step dependencies as this +# facility is broken in cmake >= version 3.10, +# - see https://gitlab.kitware.com/cmake/cmake/issues/18663 +ExternalProject_Add_StepTargets (runtime_libraries configure build install) +ExternalProject_Add_StepDependencies (runtime_libraries install runtime_libraries-build) +ExternalProject_Add_StepDependencies (runtime_libraries build runtime_libraries-configure) FOREACH (__target ${RT_TARGETS}) ADD_CUSTOM_TARGET (libsac-${__target} COMMAND +${CMAKE_COMMAND} --build ${RUNTIME_BINARY_DIR} --target libsac-${__target} @@ -202,7 +206,7 @@ ENDMACRO () ADD_INSTALL_TARGET ("applications" sac2c sac4c sac2tex "${PROJECT_BINARY_DIR}/saccc") ADD_INSTALL_TARGET ("config" "${SAC2CRC_BUILD_CONF}") ADD_INSTALL_TARGET ("headers" sac_h) -ADD_INSTALL_TARGET ("libraries" sac2cShared runtime_libraries) +ADD_INSTALL_TARGET ("libraries" runtime_libraries) ADD_INSTALL_TARGET ("rtapplications" runtime_libraries) ADD_INSTALL_TARGET ("sources") ADD_INSTALL_TARGET ("symlinks") diff --git a/src/libsac2c/CMakeLists.txt b/src/libsac2c/CMakeLists.txt index c96a052ac83847bd3d85511b47ad0fb64d7f39df..f5619ce6652592ae8d4eef1e3da746645e268e03 100644 --- a/src/libsac2c/CMakeLists.txt +++ b/src/libsac2c/CMakeLists.txt @@ -228,6 +228,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_block_transfers2.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_cond_transfers.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_cudast_transfers.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_loop_transfers.c +${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_emr_transfers.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_transfers.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/partial_fold.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/prepare_forloop_generation.c @@ -627,7 +628,7 @@ SET_SOURCE_FILES_PROPERTIES( ${CMAKE_CURRENT_SOURCE_DIR}/precompile/functionprecompile.c #RET & ARG_LINKSIGN 
${CMAKE_CURRENT_SOURCE_DIR}/scanparse/handle_dots.c #TBmakeNum warnings ${CMAKE_CURRENT_SOURCE_DIR}/scanparse/parser.c #SHmakeShape (n) - n is size_t - ${CMAKE_CURRENT_SOURCE_DIR}/scanparse/resolvepragma.c #PRAGMA_NUMPARAMS + ${CMAKE_CURRENT_SOURCE_DIR}/scanparse/resolvepragma.c #PRAGMA_NUMPARAMS ${CMAKE_CURRENT_SOURCE_DIR}/stdopt/makedimexpr.c #TBmakeNum with TCcountExprs ${CMAKE_CURRENT_SOURCE_DIR}/stdopt/makeshapeexpr.c #TBmakeNum with TCcountExprs ${CMAKE_CURRENT_SOURCE_DIR}/stdopt/prfunroll.c #SHgetUnrlen diff --git a/src/libsac2c/arrayopt/algebraic_wlfi.c b/src/libsac2c/arrayopt/algebraic_wlfi.c index 2dc726cafd6a6b85e445f15e048ac9ef11f7bb3a..1f9fbb7b8bd2532bd06716e8457748032fccda0e 100644 --- a/src/libsac2c/arrayopt/algebraic_wlfi.c +++ b/src/libsac2c/arrayopt/algebraic_wlfi.c @@ -77,6 +77,7 @@ #include "tree_basic.h" #include "tree_compound.h" +#include "traverse_optcounter.h" #include "node_basic.h" #include "print.h" @@ -312,20 +313,17 @@ SimplifySymbioticExpression (node *arg_node, info *arg_info) { int i = 0; int ct = 0; - size_t countDLIR = 0; - size_t countWLIR = global.optcounters.wlir_expr; - size_t countINL = 0; - size_t countCSE = global.optcounters.cse_expr; - size_t countTUP = 0; - size_t countCF = 0; - size_t countVP = 0; - size_t countREA = 0; - size_t countAS = 0; - size_t countAL = 0; - size_t countDL = 0; - size_t countESD = global.optcounters.esd_expr; - size_t countUESD = 0; - size_t countDCR = 0; + bool done = false; + + TOC_SETUP(14, COUNT_DLIR, COUNT_WLIR, COUNT_INL, + COUNT_CSE, COUNT_TUP, COUNT_CF, + COUNT_VP, COUNT_REA, COUNT_AS, + COUNT_AL, COUNT_DL, COUNT_ESD, + COUNT_UESD, COUNT_DCR) + + TOC_SETCOUNTER (COUNT_WLIR, global.optcounters.wlir_expr) + TOC_SETCOUNTER (COUNT_CSE, global.optcounters.cse_expr) + TOC_SETCOUNTER (COUNT_ESD, global.optcounters.esd_expr) DBUG_ENTER (); @@ -350,102 +348,75 @@ SimplifySymbioticExpression (node *arg_node, info *arg_info) #endif /* Invoke each opt */ - -#ifndef DBUG_OFF - /* debug compiler */ 
-#define RUNCHECK(Name) \ - if (global.check_frequency >= 4) { \ - DBUG_PRINT_TAG ("SSE", "Cycle iteration %d: running post-" #Name " check", i); \ - arg_node = PHrunConsistencyChecks (arg_node); \ - } -#else - /* production compiler does not have PHrunConsistencyChecks() */ -#define RUNCHECK(Name) /*empty*/ -#endif - -#define RUNOPT(Name, Cond, CntStmt, PassFun) \ - if (Cond) { \ - DBUG_PRINT_TAG ("SSE", "Cycle iteration %d: running " #Name, i); \ - CntStmt; \ - arg_node = PassFun (arg_node); \ - RUNCHECK (Name) \ - } - - RUNOPT (DLIR, global.optimize.dodlir, - countDLIR = global.optcounters.dlir_expr, DLIRdoLoopInvariantRemoval); - RUNOPT (WLIR, global.optimize.dowlir, - countWLIR = global.optcounters.wlir_expr, WLIRdoLoopInvariantRemoval); - RUNOPT (INL, global.optimize.doinl, countINL = global.optcounters.inl_fun, - INLdoInlining); - RUNOPT (ISAA, global.optimize.dosaa, , ISAAdoInsertShapeVariables); - RUNOPT (CSE, global.optimize.docse, countCSE = global.optcounters.cse_expr, - CSEdoCommonSubexpressionElimination); - RUNOPT (NTC, global.optimize.dotup, - countTUP = global.optcounters.tup_upgrades, NTCdoNewTypeCheck); - RUNOPT (EAT, global.optimize.dotup, , EATdoEliminateAlphaTypes); - RUNOPT (EBT, global.optimize.dotup, , EBTdoEliminateBottomTypes); - RUNOPT (DFC, TRUE, , DFCdoDispatchFunCalls); - RUNOPT (CF, global.optimize.docf, countCF = global.optcounters.cf_expr, - CFdoConstantFolding); - RUNOPT (VP, global.optimize.dovp, countVP = global.optcounters.vp_expr, - VPdoVarPropagation); - RUNOPT (REA, global.optimize.dorea, countREA = global.optcounters.rea_expr, - REAdoReorderEqualityprfArguments); - RUNOPT (TGTL, global.optimize.dotgtl, countREA = global.optcounters.tgtl_expr, - TGTLdoTransformGtgeToLtle); - RUNOPT (ESD, global.optimize.dosde, countESD = global.optcounters.esd_expr, - ESDdoElimSubDiv); - RUNOPT (AS, global.optimize.doas, countAS = global.optcounters.as_expr, - ASdoArithmeticSimplification); - RUNOPT (CF, global.optimize.docf, countCF = 
global.optcounters.cf_expr, - CFdoConstantFolding); - RUNOPT (CSE, global.optimize.docse, , CSEdoCommonSubexpressionElimination); - RUNOPT (AL, global.optimize.doal, countAL = global.optcounters.al_expr, - ALdoAssocLawOptimization); - RUNOPT (DL, global.optimize.dodl, countDL = global.optcounters.dl_expr, - DLdoDistributiveLawOptimization); - RUNOPT (UESD, global.optimize.dosde, countUESD = global.optcounters.uesd_expr, - UESDdoUndoElimSubDiv); - RUNOPT (DCR, global.optimize.dodcr, - countDCR = global.optcounters.dead_var + global.optcounters.dead_expr, - DCRdoDeadCodeRemoval); - -#undef RUNOPT -#undef RUNCHECK + TOC_RUNOPT_TAG ("SSE", "DLIR", global.optimize.dodlir, COUNT_DLIR, + global.optcounters.dlir_expr, arg_node, DLIRdoLoopInvariantRemoval); + TOC_RUNOPT_TAG ("SSE", "WLIR", global.optimize.dowlir, COUNT_WLIR, + global.optcounters.wlir_expr, arg_node, WLIRdoLoopInvariantRemoval); + TOC_RUNOPT_TAG ("SSE", "INL", global.optimize.doinl, COUNT_INL, global.optcounters.inl_fun, + arg_node, INLdoInlining); + TOC_RUNOPT_TAG ("SSE", "ISAA", global.optimize.dosaa, TOC_IGNORE, 0, arg_node, + ISAAdoInsertShapeVariables); + TOC_RUNOPT_TAG ("SSE", "CSE", global.optimize.docse, COUNT_CSE, global.optcounters.cse_expr, + arg_node, CSEdoCommonSubexpressionElimination); + TOC_RUNOPT_TAG ("SSE", "NTC", global.optimize.dotup, COUNT_TUP, global.optcounters.tup_upgrades, + arg_node, NTCdoNewTypeCheck); + TOC_RUNOPT_TAG ("SSE", "EAT", global.optimize.dotup, TOC_IGNORE, 0, arg_node, + EATdoEliminateAlphaTypes); + TOC_RUNOPT_TAG ("SSE", "EBT", global.optimize.dotup, TOC_IGNORE, 0, arg_node, + EBTdoEliminateBottomTypes); + TOC_RUNOPT_TAG ("SSE", "DFC", TRUE, TOC_IGNORE, 0, arg_node, DFCdoDispatchFunCalls); + TOC_RUNOPT_TAG ("SSE", "CF", global.optimize.docf, COUNT_CF, global.optcounters.cf_expr, + arg_node, CFdoConstantFolding); + TOC_RUNOPT_TAG ("SSE", "VP", global.optimize.dovp, COUNT_VP, global.optcounters.vp_expr, + arg_node, VPdoVarPropagation); + TOC_RUNOPT_TAG ("SSE", "REA", 
global.optimize.dorea, COUNT_REA, global.optcounters.rea_expr, + arg_node, REAdoReorderEqualityprfArguments); + TOC_RUNOPT_TAG ("SSE", "TGTL", global.optimize.dotgtl, COUNT_REA, global.optcounters.tgtl_expr, + arg_node, TGTLdoTransformGtgeToLtle); + TOC_RUNOPT_TAG ("SSE", "ESD", global.optimize.dosde, COUNT_ESD, global.optcounters.esd_expr, + arg_node, ESDdoElimSubDiv); + TOC_RUNOPT_TAG ("SSE", "AS", global.optimize.doas, COUNT_AS, global.optcounters.as_expr, + arg_node, ASdoArithmeticSimplification); + TOC_RUNOPT_TAG ("SSE", "CF", global.optimize.docf, COUNT_CF, global.optcounters.cf_expr, + arg_node, CFdoConstantFolding); + TOC_RUNOPT_TAG ("SSE", "CSE", global.optimize.docse, TOC_IGNORE, 0, arg_node, + CSEdoCommonSubexpressionElimination); + TOC_RUNOPT_TAG ("SSE", "AL", global.optimize.doal, COUNT_AL, global.optcounters.al_expr, + arg_node, ALdoAssocLawOptimization); + TOC_RUNOPT_TAG ("SSE", "DL", global.optimize.dodl, COUNT_DL, global.optcounters.dl_expr, + arg_node, DLdoDistributiveLawOptimization); + TOC_RUNOPT_TAG ("SSE", "UESD", global.optimize.dosde, COUNT_UESD, global.optcounters.uesd_expr, + arg_node, UESDdoUndoElimSubDiv); + TOC_RUNOPT_TAG ("SSE", "DCR", global.optimize.dodcr, COUNT_DCR, + global.optcounters.dead_var + global.optcounters.dead_expr, arg_node, + DCRdoDeadCodeRemoval); /* We do not count DCR, as it's merely for cleanup */ DBUG_PRINT_TAG ("SSE", "DLIR= %zu, WLIR= %zu, INL=%zu, CSE=%zu, TUP=%zu, CF=%zu, VP=%zu, " "AS=%zu, AL=%zu, DL=%zu, " "ESD=%zu, UESD=%zu, DCR=%zu", - (global.optcounters.dlir_expr - countDLIR), - (global.optcounters.wlir_expr - countWLIR), - (global.optcounters.inl_fun - countINL), - (global.optcounters.cse_expr - countCSE), - (global.optcounters.tup_upgrades - countTUP), - (global.optcounters.cf_expr - countCF), - (global.optcounters.vp_expr - countVP), - (global.optcounters.as_expr - countAS), - (global.optcounters.al_expr - countAL), - (global.optcounters.dl_expr - countDL), + (global.optcounters.dlir_expr - 
TOC_GETCOUNTER (COUNT_DLIR)), + (global.optcounters.wlir_expr - TOC_GETCOUNTER (COUNT_WLIR)), + (global.optcounters.inl_fun - TOC_GETCOUNTER (COUNT_INL)), + (global.optcounters.cse_expr - TOC_GETCOUNTER (COUNT_CSE)), + (global.optcounters.tup_upgrades - TOC_GETCOUNTER (COUNT_TUP)), + (global.optcounters.cf_expr - TOC_GETCOUNTER (COUNT_CF)), + (global.optcounters.vp_expr - TOC_GETCOUNTER (COUNT_VP)), + (global.optcounters.as_expr - TOC_GETCOUNTER (COUNT_AS)), + (global.optcounters.al_expr - TOC_GETCOUNTER (COUNT_AL)), + (global.optcounters.dl_expr - TOC_GETCOUNTER (COUNT_DL)), /* The following are not for some reason in the fixpoint check below: */ - (global.optcounters.esd_expr - countESD), - (global.optcounters.uesd_expr - countUESD), + (global.optcounters.esd_expr - TOC_GETCOUNTER (COUNT_ESD)), + (global.optcounters.uesd_expr - TOC_GETCOUNTER (COUNT_UESD)), ((global.optcounters.dead_var + global.optcounters.dead_expr) - - countDCR)); - - if (/* Fix point check */ - (countDLIR == global.optcounters.dlir_expr) - && (countWLIR == global.optcounters.wlir_expr) - && (countINL == global.optcounters.inl_fun) - && (countCSE == global.optcounters.cse_expr) - && (countTUP == global.optcounters.tup_upgrades) - && (countCF == global.optcounters.cf_expr) - && (countVP == global.optcounters.vp_expr) - && (countAS == global.optcounters.as_expr) - && (countAL == global.optcounters.al_expr) - && (countDL == global.optcounters.dl_expr)) { + - TOC_GETCOUNTER (COUNT_DCR))); + + /* Fix point check */ + TOC_COMPARE_RANGE (COUNT_DLIR, COUNT_DL, done) + + if (done) { i = global.max_optcycles; } } diff --git a/src/libsac2c/cuda/annotate_memory_transfers.c b/src/libsac2c/cuda/annotate_memory_transfers.c index f00639edc580f6dc52eef7faa4fa0b74e7990181..66eace9dc3750f3d3f620c2570c866fbdf2d728b 100644 --- a/src/libsac2c/cuda/annotate_memory_transfers.c +++ b/src/libsac2c/cuda/annotate_memory_transfers.c @@ -1,33 +1,28 @@ 
-/***************************************************************************** +/** + * @file + * @defgroup amtran Annotate Memory Transfers + * @ingroup cuda + * + * @brief Annotate the memory transfers that are allowed to be + * lifted from a do-fun. + * + * This module decides which host2device and device2host transfers can be + * lifted out of the enclosing do-fun. Since host<->device transfers + * are expensive operations to perform in CUDA programs, and transfers + * within loops make this even more severe, eliminating transfers within + * loops as much as possible is crucial to program performance. For a + * detailed explanation of which transfers can be moved out and which + * cannot, please see the comments in the code. * - * @defgroup Annotate the memory transfers that are allowed to be - * lifted from a do-fun. - * - * - * This module decides which and can be - * lifted out of the enclosing do-fun. Since host<->device transfers - * are expensive operations to perform in CUDA programs, and transfers - * within loop make it even more severe, eliminating transfers within - * loops as much as possible is crucial to program performance. For - * detailed explanation of what transfers can be moved out and what - * cannot, please see commets in the code. 
- * - *****************************************************************************/ - -/** - * - * @file annotate_memory_transfers.c - * - * Prefix: AMTRAN - * - *****************************************************************************/ + * @{ + */ #include "annotate_memory_transfers.h" #include #include "tree_basic.h" #include "tree_compound.h" -#define DBUG_PREFIX "UNDEFINED" +#define DBUG_PREFIX "AMTRAN" #include "debug.h" #include "traverse.h" @@ -43,12 +38,10 @@ */ enum traverse_mode { trav_collect, trav_consolidate, trav_annotate }; -/** - * +/** * @name INFO structure * @{ - * - *****************************************************************************/ + */ struct INFO { bool indofun; nlut_t *nlut; @@ -106,10 +99,16 @@ FreeInfo (info *info) DBUG_RETURN (info); } -/** - * @} - *****************************************************************************/ +/** @} */ +/** + * @brief Find fundef arguments in application arguments + * + * @param fundef_args + * @param ap_args + * @param id + * @return matching fundef arguments + */ static node * GetFundefArgFromApArg (node *fundef_args, node *ap_args, node *id) { @@ -127,17 +126,18 @@ GetFundefArgFromApArg (node *fundef_args, node *ap_args, node *id) DBUG_RETURN (fundef_args); } -/** - * +/** * @name Entry functions * @{ + */ + +/** + * @brief Entry function for annotating memory transfers * - *****************************************************************************/ -/** - * - * @fn node *AMTRANdoAnnotateMemoryTransfers( node *syntax_tree) - * - *****************************************************************************/ + * @param syntax_tree syntax tree handed in by the caller + * + * @return the syntax tree + */ node * AMTRANdoAnnotateMemoryTransfers (node *syntax_tree) { @@ -153,24 +153,20 @@ AMTRANdoAnnotateMemoryTransfers (node *syntax_tree) DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Traversal functions * @{ - * - 
*****************************************************************************/ + */ -/** - * - * @fn node *AMTRANfundef( node *arg_node, info *arg_info) - * +/** * @brief * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANfundef (node *arg_node, info *arg_info) { @@ -180,6 +176,7 @@ AMTRANfundef (node *arg_node, info *arg_info) /* We only traverse do-fun. */ if (FUNDEF_ISLOOPFUN (arg_node)) { + DBUG_PRINT ("(LOOP) Looking at %s...", FUNDEF_NAME (arg_node)); INFO_INDOFUN (arg_info) = TRUE; INFO_NLUT (arg_info) = NLUTgenerateNlut (FUNDEF_ARGS (arg_node), FUNDEF_VARDECS (arg_node)); @@ -208,14 +205,13 @@ AMTRANfundef (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANarg( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANarg (node *arg_node, info *arg_info) { @@ -231,14 +227,13 @@ AMTRANarg (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANassign( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANassign (node *arg_node, info *arg_info) { @@ -257,14 +252,13 @@ AMTRANassign (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANlet( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANlet (node *arg_node, info *arg_info) { @@ -299,14 +293,13 @@ AMTRANlet (node *arg_node, info 
*arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANfuncond( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANfuncond (node *arg_node, info *arg_info) { @@ -328,24 +321,26 @@ AMTRANfuncond (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANap( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANap (node *arg_node, info *arg_info) { DBUG_ENTER (); + DBUG_PRINT ("inspecting N_ap of %s...", FUNDEF_NAME (AP_FUNDEF (arg_node))); + if (INFO_INDOFUN (arg_info)) { /* If the N_ap is a recursive do-fun application * and the traverse mode is collect. */ if (INFO_FUNDEF (arg_info) == AP_FUNDEF (arg_node) && INFO_TRAVMODE (arg_info) == trav_collect) { + DBUG_PRINT ("(mode: collect), at recursive N_ap"); /* The arguments of the recursive do-fun application * need to be stored and will be used in the annotate * traversal. 
*/ @@ -362,6 +357,7 @@ AMTRANap (node *arg_node, info *arg_info) INFO_INRECURSIVEAPARGS (arg_info) = FALSE; } else if (INFO_FUNDEF (arg_info) == AP_FUNDEF (arg_node) && INFO_TRAVMODE (arg_info) == trav_annotate) { + DBUG_PRINT ("(mode: annotate), at recursive N_ap"); INFO_INRECURSIVEAPARGS (arg_info) = TRUE; AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); INFO_INRECURSIVEAPARGS (arg_info) = FALSE; @@ -372,19 +368,22 @@ AMTRANap (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANid( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANid (node *arg_node, info *arg_info) { + int nlut_num; + DBUG_ENTER (); + DBUG_PRINT ("inspecting N_id of %s...", ID_NAME (arg_node)); + if (INFO_INDOFUN (arg_info)) { if (INFO_TRAVMODE (arg_info) == trav_collect) { /* If the N_id is: @@ -445,6 +444,7 @@ AMTRANid (node *arg_node, info *arg_info) * a_host = host2device( a_dev); */ if (!INFO_INRECURSIVEAPARGS (arg_info) && !INFO_INFUNCOND (arg_info)) { + DBUG_PRINT ("(mode: collect), adding %s to NLUT", ID_NAME (arg_node)); NLUTincNum (INFO_NLUT (arg_info), ID_AVIS (arg_node), 1); } } else if (INFO_TRAVMODE (arg_info) == trav_annotate) { @@ -456,7 +456,9 @@ AMTRANid (node *arg_node, info *arg_info) /* If the N_arg at correpsonding position cannot be * replaced by its cuda counterpart, this devicetohost * cannot be lifted */ - if (NLUTgetNum (INFO_NLUT (arg_info), ARG_AVIS (arg)) != 0) { + nlut_num = NLUTgetNum (INFO_NLUT (arg_info), ARG_AVIS (arg)); + if (nlut_num != 0) { + DBUG_PRINT ("(mode: annotate), N_avis %s found %d time, can not move done D2H", ID_NAME (arg_node), nlut_num); ASSIGN_ISNOTALLOWEDTOBEMOVEDDOWN (ID_SSAASSIGN (arg_node)) = TRUE; } } @@ -468,23 +470,24 @@ AMTRANid (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * 
@fn node *AMTRANprf( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANprf (node *arg_node, info *arg_info) { node *id; DBUG_ENTER (); + if (INFO_INDOFUN (arg_info)) { switch (PRF_PRF (arg_node)) { case F_host2device: + DBUG_PRINT ("inspecting N_prf `F_host2device`"); /* Ensure that each is initially * tagged as can be moved out. */ if (INFO_TRAVMODE (arg_info) == trav_collect) { @@ -492,6 +495,7 @@ AMTRANprf (node *arg_node, info *arg_info) } /* If we are in trav_annotate traverse mode */ if (INFO_TRAVMODE (arg_info) == trav_annotate) { + DBUG_PRINT ("(mode: annoate), checking N_prf argument refcount"); id = PRF_ARG1 (arg_node); /* We only look at whose host N_id @@ -506,6 +510,7 @@ AMTRANprf (node *arg_node, info *arg_info) /* If the reference count of the host N_id is not 0, * we annotates the transfer to be not allowed to be moved out. */ if (NLUTgetNum (INFO_NLUT (arg_info), ID_AVIS (id)) != 0) { + DBUG_PRINT (" cannot move-out h2d of %s", ID_NAME (id)); ASSIGN_ISNOTALLOWEDTOBEMOVEDUP (INFO_LASTASSIGN (arg_info)) = TRUE; } else { @@ -595,12 +600,14 @@ AMTRANprf (node *arg_node, info *arg_info) } break; case F_device2host: + DBUG_PRINT ("inspecting N_prf `F_device2host`"); /* Ensure that each device2host is initially * tagged as can be moved out */ if (INFO_TRAVMODE (arg_info) == trav_collect) { ASSIGN_ISNOTALLOWEDTOBEMOVEDDOWN (INFO_LASTASSIGN (arg_info)) = FALSE; } if (INFO_TRAVMODE (arg_info) == trav_annotate) { + DBUG_PRINT ("(mode: annoate), checking N_prf argument refcount"); /* If the reference count of the host N_id is not 0, * we annotates the transfer to be not allowed to be moved out. 
*/ if (NLUTgetNum (INFO_NLUT (arg_info), IDS_AVIS (INFO_LETIDS (arg_info))) @@ -618,12 +625,6 @@ AMTRANprf (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ - +/** @} */ +/** @} */ #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/cuda_utils.c b/src/libsac2c/cuda/cuda_utils.c index 4b21a487339c441a4eb47c19f3cf0f1231d8c781..89511ffa7a52df5d02c8cd4992f194a062b4909f 100644 --- a/src/libsac2c/cuda/cuda_utils.c +++ b/src/libsac2c/cuda/cuda_utils.c @@ -1,6 +1,13 @@ - +/** + * @file + * @defgroup cutil CUDA utils + * @ingroup cuda + * + * @{ + */ #include "cuda_utils.h" +#include "type_utils.h" #include "tree_basic.h" #include "tree_compound.h" #include "str.h" @@ -273,4 +280,36 @@ CUisDeviceArrayTypeNew (ntype *ty) DBUG_RETURN (res); } +/** + * @brief Convert from a host ntype to a device ntype, while preserving shape information. + * + * @param host_type The host ntype + * @return A device ntype struct, or NULL if the host_type is a scalar (dim 0) or its scalar type is not simple + */ +ntype * +CUconvertHostToDeviceType (ntype *host_type) +{ + ntype *scalar_type, *dev_type = NULL; + simpletype sty; + + DBUG_ENTER (); + + /* The host_type must be of known dimension; AUD types are rejected */ + if (!TUdimKnown (host_type)) + CTIerrorInternal ("AUD type found!"); + + /* Only convert non-scalar types whose scalar type is simple, e.g. int, float ... */ + if (TYgetDim (host_type) > 0 + && TYisSimple (TYgetScalar (host_type))) { + dev_type = TYcopyType (host_type); + scalar_type = TYgetScalar (dev_type); + /* Get the corresponding device simple type e.g. 
int_dev, float_dev...*/ + sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); + /* Set the device simple type */ + scalar_type = TYsetSimpleType (scalar_type, sty); + } + + DBUG_RETURN (dev_type); +} + #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/cuda_utils.h b/src/libsac2c/cuda/cuda_utils.h index 553619739abb90bd1ce4a0d417dbb384575603fa..05b36630536d9a339681710c2471a322527ba025 100644 --- a/src/libsac2c/cuda/cuda_utils.h +++ b/src/libsac2c/cuda/cuda_utils.h @@ -1,4 +1,3 @@ - #ifndef _SAC_CUDA_UTILS_H_ #define _SAC_CUDA_UTILS_H_ @@ -33,5 +32,6 @@ extern bool CUisShmemTypeNew (ntype *ty); extern bool CUisDeviceTypeOld (types *ty); extern bool CUisShmemTypeOld (types *ty); extern bool CUisDeviceArrayTypeNew (ntype *ty); +extern ntype *CUconvertHostToDeviceType (ntype *host_type); -#endif +#endif /* _SAC_CUDA_UTILS_H_ */ diff --git a/src/libsac2c/cuda/insert_memory_transfers.c b/src/libsac2c/cuda/insert_memory_transfers.c deleted file mode 100644 index 926a77e9d455d192b90cd504a676ce3eb8ba5184..0000000000000000000000000000000000000000 --- a/src/libsac2c/cuda/insert_memory_transfers.c +++ /dev/null @@ -1,806 +0,0 @@ -/** - * - * @defgroup Insert CUDA memory transfer primitives - * - * - * This module inserts CUDA type conversion primitives before and after - * each cudarizable N_with. The two primitives are and - * . They are used to trasfer the data of a host(device) array - * variable to a device(host) array variable. This is essentially - * compiled into host<->device memory transfers in the backend. As an - * example: - * - * a_host = with - * { - * ... = b_host; - * ... = c_host; - * ... = d_host; - * }:genarray( shp); - * - * is transformed into: - * - * b_dev = host2device( b_host); - * c_dev = host2device( c_host); - * d_dev = host2device( d_host); - * a_dev = with - * { - * ... = b_dev; - * ... = c_dev; - * ... 
= d_dev; - * }:genarray( shp); - * a_host = device2host( a_dev); - * - * Note that simple scalar variables need not be type converted since they - * can be passed as function parameters directly to CUDA kernels. - * - * @ingroup - * - * @{ASSIGN_STMT( arg_node) - * - *****************************************************************************/ - -/** - * - * @file cuda_type_conversion.c - * - * Prefix: IMEM - * - *****************************************************************************/ -#include "insert_memory_transfers.h" - -/* - * Other includes go here - */ -#include -#include "tree_basic.h" -#include "tree_compound.h" -#include "str.h" -#include "str_buffer.h" -#include "memory.h" -#include "globals.h" - -#define DBUG_PREFIX "UNDEFINED" -#include "debug.h" - -#include "ctinfo.h" -#include "traverse.h" -#include "free.h" -#include "DupTree.h" -#include "print.h" -#include "new_types.h" -#include "LookUpTable.h" -#include "math_utils.h" -#include "types.h" -#include "type_utils.h" -#include "cuda_utils.h" -#include "DataFlowMask.h" -#include "DataFlowMaskUtils.h" -#include "remove_dfms.h" -#include "infer_dfms.h" - -/** - * - * @name INFO structure - * @{ - * - *****************************************************************************/ -struct INFO { - node *fundef; - bool in_cudawl; - bool create_d2h; - node *postassigns; - node *preassigns; - lut_t *lut; - lut_t *notran; - node *let_expr; - bool is_modarr; - bool in_cexprs; - bool from_ap; -}; - -/* - * INFO_FUNDEF N_fundef node of the enclosing function - * - * INFO_INCUDAWL Flag indicating whether the code currently being - * traversed is in a cudarizable N_with - * - * INFO_CREATE_D2H Flag indicating whether needs to be - * created for the N_let->N_ids - * - * INFO_POSTASSIGNS Chain of that needs to be appended - * at the end of the current N_assign - * - * INFO_PREASSIGNS Chain of that needs to be prepended - * at the beginning of the current N_assign - * - * INFO_LUT Lookup table storing pairs of 
Avis(host)->Avis(device) - * e.g. Given a_dev = host2device( a_host), - * Avis(a_host)->Avis(a_dev) will be stored into the table - * - * INFO_NOTRAN Lookup table storing N_avis of arrays varaibles that - * no data transfers should be created. - * - */ - -#define INFO_FUNDEF(n) (n->fundef) -#define INFO_INCUDAWL(n) (n->in_cudawl) -#define INFO_CREATE_D2H(n) (n->create_d2h) -#define INFO_POSTASSIGNS(n) (n->postassigns) -#define INFO_PREASSIGNS(n) (n->preassigns) -#define INFO_LUT(n) (n->lut) -#define INFO_NOTRAN(n) (n->notran) -#define INFO_LETEXPR(n) (n->let_expr) -#define INFO_IS_MODARR(n) (n->is_modarr) -#define INFO_IN_CEXPRS(n) (n->in_cexprs) -#define INFO_FROM_AP(n) (n->from_ap) - -static info * -MakeInfo () -{ - info *result; - - DBUG_ENTER (); - - result = MEMmalloc (sizeof (info)); - - INFO_FUNDEF (result) = NULL; - INFO_INCUDAWL (result) = FALSE; - INFO_CREATE_D2H (result) = FALSE; - INFO_POSTASSIGNS (result) = NULL; - INFO_PREASSIGNS (result) = NULL; - INFO_LUT (result) = NULL; - INFO_NOTRAN (result) = NULL; - INFO_IS_MODARR (result) = FALSE; - INFO_IN_CEXPRS (result) = FALSE; - INFO_FROM_AP (result) = FALSE; - - DBUG_RETURN (result); -} - -static info * -FreeInfo (info *info) -{ - DBUG_ENTER (); - - info = MEMfree (info); - - DBUG_RETURN (info); -} - -/** - * @} - *****************************************************************************/ - -static void CreateHost2Device (node **id, node *host_avis, node *dev_avis, - info *arg_info); - -/** - * - * @name Entry functions - * @{ - * - *****************************************************************************/ -/** - * - * @fn node *IMEMdoInsertMemoryTransfers( node *syntax_tree) - * - *****************************************************************************/ -node * -IMEMdoInsertMemoryTransfers (node *syntax_tree) -{ - info *info; - - DBUG_ENTER (); - - info = MakeInfo (); - - /* - * Infer dataflow masks - */ - // syntax_tree = INFDFMSdoInferDfms( syntax_tree, HIDE_LOCALS_NEVER); - - TRAVpush 
(TR_imem); - syntax_tree = TRAVdo (syntax_tree, info); - TRAVpop (); - - info = FreeInfo (info); - - DBUG_RETURN (syntax_tree); -} - -/** - * @} - *****************************************************************************/ - -/** - * - * @name Static helper functions - * @{ - * - *****************************************************************************/ - -/** - * - * @fn node* TypeConvert( node *host_avis) - * - * @brief - * - *****************************************************************************/ -static ntype * -TypeConvert (ntype *host_type, nodetype nty, info *arg_info) -{ - ntype *scalar_type, *dev_type = NULL; - simpletype sty; - - DBUG_ENTER (); - - if (nty == N_id) { - /* If the N_ids is of known dimension and is not a scalar */ - DBUG_ASSERT (TUdimKnown (host_type), "AUD N_id found in cudarizable N_with!"); - if (TYgetDim (host_type) > 0) { - /* If the scalar type is simple, e.g. int, float ... */ - if (TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); - } - } - } - /* If the node to be type converted is N_ids, its original type - * can be AUD as well as long as the N_with on the RHS is cudarizable. - * The reason a cudarizbale can produce a AUD result illustrated by - * the following example: - * - * cond_fun() - * { - * int[*] aa; - * int bb; - * - * if( cond) { - * aa = with {}:genarray( shp); (cudarizable N_with) - * } - * else { - * bb = 1; - * } - * ret = cond ? aa : bb; - * } - * - */ - else if (nty == N_ids) { - if (NODE_TYPE (INFO_LETEXPR (arg_info)) == N_with) { - /* If the scalar type is simple, e.g. int, float ... 
*/ - if (WITH_CUDARIZABLE (INFO_LETEXPR (arg_info)) - && TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); - } - } - } else { - DBUG_UNREACHABLE ("Neither N_id nor N_ids found in TypeConvert!"); - } - - DBUG_RETURN (dev_type); -} - -/** - * @} - *****************************************************************************/ - -/** - * - * @name Traversal functions - * @{ - * - *****************************************************************************/ - -/** - * - * @fn node *IMEMfundef( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ -node * -IMEMfundef (node *arg_node, info *arg_info) -{ - node *old_fundef; - - DBUG_ENTER (); - - /* During the main traversal, we only look at non-lac functions */ - if (!FUNDEF_ISLACFUN (arg_node)) { - INFO_FUNDEF (arg_info) = arg_node; - FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); - INFO_FUNDEF (arg_info) = NULL; - - FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); - } else { - if (INFO_FROM_AP (arg_info)) { - old_fundef = INFO_FUNDEF (arg_info); - INFO_FUNDEF (arg_info) = arg_node; - /* Traversal of lac functions are initiated from the calling site */ - FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); - INFO_FUNDEF (arg_info) = old_fundef; - } else { - FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); - } - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMap( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ -node * -IMEMap (node *arg_node, info *arg_info) -{ - bool 
traverse_lac_fun, old_from_ap; - node *ap_args, *fundef_args; - node *avis, *id_avis, *new_avis, *dup_avis; - ntype *dev_type; - node *fundef; - - DBUG_ENTER (); - - fundef = AP_FUNDEF (arg_node); - - /* For us to traverse a function from calling site, it must be a - * condictional function or a loop function and must not be the - * recursive function call in the loop function. */ - traverse_lac_fun = (FUNDEF_ISLACFUN (fundef) && fundef != INFO_FUNDEF (arg_info)); - - if (traverse_lac_fun) { - old_from_ap = INFO_FROM_AP (arg_info); - INFO_FROM_AP (arg_info) = TRUE; - if (!INFO_INCUDAWL (arg_info)) { - AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); - } else { - ap_args = AP_ARGS (arg_node); - fundef_args = FUNDEF_ARGS (AP_FUNDEF (arg_node)); - - while (ap_args != NULL) { - DBUG_ASSERT (fundef_args != NULL, "# of Ap args != # of Fundef args!"); - - DBUG_ASSERT (NODE_TYPE (EXPRS_EXPR (ap_args)) == N_id, - "N_ap argument is not N_id node!"); - - id_avis = ID_AVIS (EXPRS_EXPR (ap_args)); - avis = LUTsearchInLutPp (INFO_LUT (arg_info), id_avis); - - /* If the avis has NOT been come across before */ - if (avis == id_avis) { - /* If the id is NOT the one we don't want to create data transfer for - */ - if (LUTsearchInLutPp (INFO_NOTRAN (arg_info), id_avis) == id_avis) { - dev_type = TypeConvert (AVIS_TYPE (id_avis), N_id, arg_info); - - if( dev_type != NULL /* && - NODE_TYPE( AVIS_DECL( avis)) == N_arg */) { - new_avis = TBmakeAvis (TRAVtmpVarName ("dev"), dev_type); - CreateHost2Device (&EXPRS_EXPR (ap_args), id_avis, new_avis, - arg_info); - - dup_avis = DUPdoDupNode (new_avis); - AVIS_SSAASSIGN (dup_avis) = NULL; - - INFO_LUT (arg_info) - = LUTinsertIntoLutP (INFO_LUT (arg_info), - ARG_AVIS (fundef_args), dup_avis); - ARG_AVIS (fundef_args) = dup_avis; - AVIS_DECL (dup_avis) = fundef_args; - } - } else { - /* If the N_id is the one we don't want to create host2device for, - * propogate that information to the traversal of LAC functions */ - INFO_NOTRAN 
(arg_info) - = LUTinsertIntoLutP (INFO_NOTRAN (arg_info), - ARG_AVIS (fundef_args), NULL); - } - } else { - /* If the N_avis has been come across before, replace its - * N_avis by the device N_avis */ - ID_AVIS (EXPRS_EXPR (ap_args)) = avis; - dup_avis = DUPdoDupNode (avis); - AVIS_SSAASSIGN (dup_avis) = NULL; - - /* Insert the pair of N_avis(fun arg)->N_avis(device variable) - * into the lookup table, so that when we later traverse the - * body of the fundef, old reference to the arg will be replaced - * by the new device varaible. */ - INFO_LUT (arg_info) - = LUTinsertIntoLutP (INFO_LUT (arg_info), ARG_AVIS (fundef_args), - dup_avis); - - /* Change N_avis of the fun arg to the device variable */ - ARG_AVIS (fundef_args) = dup_avis; - AVIS_DECL (dup_avis) = fundef_args; - } - - ap_args = EXPRS_NEXT (ap_args); - fundef_args = ARG_NEXT (fundef_args); - } - - AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); - } - - INFO_FROM_AP (arg_info) = old_from_ap; - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMassign( node *arg_node, info *arg_info) - * - * @brief Add newly created and to - * the assign chain. - * - *****************************************************************************/ -node * -IMEMassign (node *arg_node, info *arg_info) -{ - node *next; - - DBUG_ENTER (); - - /* - * Here we have to do a top-down traversal for the following reason: - * We need to check whether there is any array variables being defined - * in a cudarizable N_with. If there is, we don't want to create a - * host2devcice when we later come across it in the same block of code. 
- */ - - ASSIGN_STMT (arg_node) = TRAVdo (ASSIGN_STMT (arg_node), arg_info); - - /* If we are no longer in a cudarizable N_with, we insert - * data transfer primitives into the AST */ - if (!INFO_INCUDAWL (arg_info)) { - next = ASSIGN_NEXT (arg_node); - ASSIGN_NEXT (arg_node) = NULL; - - if (INFO_POSTASSIGNS (arg_info) != NULL) { - arg_node = TCappendAssign (arg_node, INFO_POSTASSIGNS (arg_info)); - INFO_POSTASSIGNS (arg_info) = NULL; - } - - if (INFO_PREASSIGNS (arg_info) != NULL) { - arg_node = TCappendAssign (INFO_PREASSIGNS (arg_info), arg_node); - INFO_PREASSIGNS (arg_info) = NULL; - } - - node *last_assign = arg_node; - while (ASSIGN_NEXT (last_assign) != NULL) { - last_assign = ASSIGN_NEXT (last_assign); - } - - ASSIGN_NEXT (last_assign) = next; - ASSIGN_NEXT (last_assign) = TRAVopt (ASSIGN_NEXT (last_assign), arg_info); - } else { - ASSIGN_NEXT (arg_node) = TRAVopt (ASSIGN_NEXT (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMlet( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ -node * -IMEMlet (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - LET_EXPR (arg_node) = TRAVdo (LET_EXPR (arg_node), arg_info); - INFO_LETEXPR (arg_info) = LET_EXPR (arg_node); - LET_IDS (arg_node) = TRAVopt (LET_IDS (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMwith( node *arg_node, info *arg_info) - * - * @brief Traverse both withop and N_code of a cudarizable N_with - * - *****************************************************************************/ -node * -IMEMwith (node *arg_node, info *arg_info) -{ - lut_t *old_lut; - - DBUG_ENTER (); - - /* If the N_with is cudarizable */ - if (WITH_CUDARIZABLE (arg_node)) { - INFO_LUT (arg_info) = LUTgenerateLut (); - INFO_INCUDAWL (arg_info) = TRUE; - WITH_WITHOP (arg_node) = TRAVdo (WITH_WITHOP (arg_node), arg_info); - - old_lut = INFO_NOTRAN (arg_info); - INFO_NOTRAN 
(arg_info) = LUTgenerateLut (); - - /* we do not want to create a host2device for index vector */ - INFO_NOTRAN (arg_info) = LUTinsertIntoLutP (INFO_NOTRAN (arg_info), - IDS_AVIS (WITH_VEC (arg_node)), NULL); - - WITH_CODE (arg_node) = TRAVdo (WITH_CODE (arg_node), arg_info); - INFO_NOTRAN (arg_info) = old_lut; - INFO_NOTRAN (arg_info) = LUTremoveLut (INFO_NOTRAN (arg_info)); - - INFO_INCUDAWL (arg_info) = FALSE; - INFO_LUT (arg_info) = LUTremoveLut (INFO_LUT (arg_info)); - - /* We need to create for N_ids on the LHS */ - INFO_CREATE_D2H (arg_info) = TRUE; - } else if (INFO_INCUDAWL (arg_info)) { - /* If we are already in a cudarizable N_with but the - * N_with itself is not a cudarizable N_with */ - - WITH_WITHOP (arg_node) = TRAVdo (WITH_WITHOP (arg_node), arg_info); - INFO_NOTRAN (arg_info) = LUTinsertIntoLutP (INFO_NOTRAN (arg_info), - IDS_AVIS (WITH_VEC (arg_node)), NULL); - - WITH_CODE (arg_node) = TRAVdo (WITH_CODE (arg_node), arg_info); - } else { - /* The following traversal has been commented out because if the outermost - * N_with is not cudarizable, none of its inner N_withs (if - * there is any) will be cudarizable since we only cudarize - * the outermost N_with. 
*/ - - /* WITH_CODE( arg_node) = TRAVdo( WITH_CODE( arg_node), arg_info); */ - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMcode( node *arg_node, info *arg_info) - * - * @brief Traverse the code block - * - *****************************************************************************/ -node * -IMEMcode (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - CODE_CBLOCK (arg_node) = TRAVopt (CODE_CBLOCK (arg_node), arg_info); - - INFO_IN_CEXPRS (arg_info) = TRUE; - CODE_CEXPRS (arg_node) = TRAVopt (CODE_CEXPRS (arg_node), arg_info); - INFO_IN_CEXPRS (arg_info) = FALSE; - - CODE_NEXT (arg_node) = TRAVopt (CODE_NEXT (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMgenarray( node *arg_node, info *arg_info) - * - * @brief Traverse default element of a N_genarray - * - *****************************************************************************/ -node * -IMEMgenarray (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - if (INFO_INCUDAWL (arg_info)) { - /* Note that we do not traverse N_genarray->shape. This is - * because it can be an N_id node and we do not want to insert - * for it in this case. Therefore, the only son - * of N_genarray we traverse is the default element. 
*/ - if (GENARRAY_DEFAULT (arg_node) != NULL) { - DBUG_ASSERT (NODE_TYPE (GENARRAY_DEFAULT (arg_node)) == N_id, - "Non N_id default element found in N_genarray!"); - GENARRAY_DEFAULT (arg_node) = TRAVdo (GENARRAY_DEFAULT (arg_node), arg_info); - } - - GENARRAY_RC (arg_node) = TRAVopt (GENARRAY_RC (arg_node), arg_info); - - GENARRAY_NEXT (arg_node) = TRAVopt (GENARRAY_NEXT (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMmodarray( node *arg_node, info *arg_info) - * - * @brief Traverse default element of a N_modarray - * - *****************************************************************************/ -node * -IMEMmodarray (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - if (INFO_INCUDAWL (arg_info)) { - DBUG_ASSERT (NODE_TYPE (MODARRAY_ARRAY (arg_node)) == N_id, - "Non N_id modified array found in N_modarray!"); - INFO_IS_MODARR (arg_info) = TRUE; - MODARRAY_ARRAY (arg_node) = TRAVdo (MODARRAY_ARRAY (arg_node), arg_info); - INFO_IS_MODARR (arg_info) = FALSE; - MODARRAY_RC (arg_node) = TRAVopt (MODARRAY_RC (arg_node), arg_info); - MODARRAY_NEXT (arg_node) = TRAVopt (MODARRAY_NEXT (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMids( node *arg_node, info *arg_info) - * - * @brief For N_ids needed to be type converted, create . 
- * - *****************************************************************************/ -node * -IMEMids (node *arg_node, info *arg_info) -{ - node *new_avis, *ids_avis; - ntype *ids_type, *dev_type; - - DBUG_ENTER (); - - ids_avis = IDS_AVIS (arg_node); - ids_type = AVIS_TYPE (ids_avis); - - /* If the array is define in Cuda wl, we do not create - * a host2device transfer for it */ - if (INFO_INCUDAWL (arg_info)) { - if (TYisArray (ids_type)) { - INFO_NOTRAN (arg_info) - = LUTinsertIntoLutP (INFO_NOTRAN (arg_info), ids_avis, NULL); - } - } else { - if (INFO_CREATE_D2H (arg_info)) { - dev_type = TypeConvert (ids_type, NODE_TYPE (arg_node), arg_info); - if (dev_type != NULL) { - new_avis = TBmakeAvis (TRAVtmpVarName ("dev"), dev_type); - IDS_AVIS (arg_node) = new_avis; - FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) - = TBmakeVardec (new_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); - - INFO_POSTASSIGNS (arg_info) - = TBmakeAssign (TBmakeLet (TBmakeIds (ids_avis, NULL), - TBmakePrf (F_device2host, - TBmakeExprs (TBmakeId (new_avis), - NULL))), - INFO_POSTASSIGNS (arg_info)); - /* Maintain SSA property */ - AVIS_SSAASSIGN (new_avis) = AVIS_SSAASSIGN (ids_avis); - AVIS_SSAASSIGN (ids_avis) = INFO_POSTASSIGNS (arg_info); - } - // IDS_NEXT( arg_node) = TRAVopt( IDS_NEXT( arg_node), arg_info); - INFO_CREATE_D2H (arg_info) = FALSE; - } - } - - IDS_NEXT (arg_node) = TRAVopt (IDS_NEXT (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMid( node *arg_node, info *arg_info) - * - * @brief For each host array N_id in the cudarizable N_with, either create - * type conversion for it (i.e. ) or set its N_avis to - * that of an already converted device array N_id depending on whether - * the N_id is encountered for the first time or not. 
- * - *****************************************************************************/ -node * -IMEMid (node *arg_node, info *arg_info) -{ - node *new_avis, *avis, *id_avis; - ntype *dev_type, *id_type; - - DBUG_ENTER (); - - id_avis = ID_AVIS (arg_node); - id_type = AVIS_TYPE (id_avis); - - /* if we are in cudarizable N_with */ - if (INFO_INCUDAWL (arg_info)) { - avis = LUTsearchInLutPp (INFO_LUT (arg_info), id_avis); - - /* If the N_avis node hasn't been come across before AND the id is - * NOT in cexprs. This is because we don't want to create a host2device - * for N_id in the cexprs. However, if the N_id has been come across - * before, even if it's in cexprs, we still need to replace its avis - * by the new avis, i.e. the transferred device variable (See the - * "else" case). This might happen that the N_id in cexprs is not - * a scalar and it's a default element of the withloop. Therefore, - * a early traverse of the withop will insert a host2device for this - * N_id and we here simply need to set it's avis to the device variable - * avis. (This is fix to the bug discovered in compiling tvd2d.sac) */ - - if (avis == id_avis && !INFO_IN_CEXPRS (arg_info)) { - dev_type = TypeConvert (id_type, NODE_TYPE (arg_node), arg_info); - /* Definition of the N_id must not be in the same block as - * reference of the N_id. Otherwise, no host2device will be - * created. e.g. - * - * a = with - * { - * b = [x, y, z]; - * ... - * ... = prf( b); - * }:genarray(); - * - * We do not create b_dev = host2device( b) in this case. 
- */ - if (dev_type != NULL - && (/* NODE_TYPE( AVIS_DECL( avis)) == N_arg || */ - /* INFO_IS_MODARR( arg_info) || */ - LUTsearchInLutPp (INFO_NOTRAN (arg_info), id_avis) == id_avis)) { - new_avis = TBmakeAvis (TRAVtmpVarName ("dev"), dev_type); - CreateHost2Device (&arg_node, id_avis, new_avis, arg_info); - } - } else { - /* If the N_avis has been come across before, replace its - * N_avis by the device N_avis */ - ID_AVIS (arg_node) = avis; - } - } - DBUG_RETURN (arg_node); -} - -static void -CreateHost2Device (node **id, node *host_avis, node *dev_avis, info *arg_info) -{ - DBUG_ENTER (); - - ID_AVIS (*id) = dev_avis; - FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) - = TBmakeVardec (dev_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); - - INFO_PREASSIGNS (arg_info) - = TBmakeAssign (TBmakeLet (TBmakeIds (dev_avis, NULL), - TBmakePrf (F_host2device, - TBmakeExprs (TBmakeId (host_avis), NULL))), - INFO_PREASSIGNS (arg_info)); - - /* Maintain SSA property */ - AVIS_SSAASSIGN (dev_avis) = INFO_PREASSIGNS (arg_info); - - /* Insert pair host_avis->dev_avis into lookup table. 
*/ - INFO_LUT (arg_info) = LUTinsertIntoLutP (INFO_LUT (arg_info), host_avis, dev_avis); - - DBUG_RETURN (); -} - -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ - -#undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/insert_memory_transfers.h b/src/libsac2c/cuda/insert_memory_transfers.h deleted file mode 100644 index db89663428eb83b9b8134893b32efaf62b9bf9d8..0000000000000000000000000000000000000000 --- a/src/libsac2c/cuda/insert_memory_transfers.h +++ /dev/null @@ -1,20 +0,0 @@ - - -#ifndef _SAC_INSERT_MEMORY_TRANSFERS_H_ -#define _SAC_INSERT_MEMORY_TRANSFERS_H_ - -#include "types.h" - -extern node *IMEMdoInsertMemoryTransfers (node *arg_node); -extern node *IMEMfundef (node *arg_node, info *arg_info); -extern node *IMEMap (node *arg_node, info *arg_info); -extern node *IMEMid (node *arg_node, info *arg_info); -extern node *IMEMlet (node *arg_node, info *arg_info); -extern node *IMEMassign (node *arg_node, info *arg_info); -extern node *IMEMwith (node *arg_node, info *arg_info); -extern node *IMEMids (node *arg_node, info *arg_info); -extern node *IMEMgenarray (node *arg_node, info *arg_info); -extern node *IMEMmodarray (node *arg_node, info *arg_info); -extern node *IMEMcode (node *arg_node, info *arg_info); - -#endif diff --git a/src/libsac2c/cuda/insert_withloop_memtran.c b/src/libsac2c/cuda/insert_withloop_memtran.c index 55b3dd9803d8d426834755ccc29ae48074fcb5df..8bf8445f7db5e4ef5d9b0a5fde42e83a6d6e9610 100644 --- a/src/libsac2c/cuda/insert_withloop_memtran.c +++ b/src/libsac2c/cuda/insert_withloop_memtran.c @@ -1,57 +1,47 @@ -/** +/** + * @file + * @defgroup iwlmem Insert CUDA memory transfer primitives + * @ingroup cuda + * + * This module inserts CUDA type conversion primitives before and after + * each cudarizable N_with. The two primitives are and + * . 
They are used to trasfer the data of a host(device) array + * variable to a device(host) array variable. This is essentially + * compiled into host<->device memory transfers in the backend. As an + * example: + * + * ~~~~ + * a_host = with + * { + * ... = b_host; + * ... = c_host; + * ... = d_host; + * }:genarray( shp); + * ~~~~ + * + * is transformed into: + * + * ~~~~ + * b_dev = host2device( b_host); + * c_dev = host2device( c_host); + * d_dev = host2device( d_host); + * a_dev = with + * { + * ... = b_dev; + * ... = c_dev; + * ... = d_dev; + * }:genarray( shp); + * a_host = device2host( a_dev); + * ~~~~ + * + * @note + * Simple scalar variables need not be type converted since they + * can be passed as function parameters directly to CUDA kernels. * - * @defgroup Insert CUDA memory transfer primitives - * - * - * This module inserts CUDA type conversion primitives before and after - * each cudarizable N_with. The two primitives are and - * . They are used to trasfer the data of a host(device) array - * variable to a device(host) array variable. This is essentially - * compiled into host<->device memory transfers in the backend. As an - * example: - * - * a_host = with - * { - * ... = b_host; - * ... = c_host; - * ... = d_host; - * }:genarray( shp); - * - * is transformed into: - * - * b_dev = host2device( b_host); - * c_dev = host2device( c_host); - * d_dev = host2device( d_host); - * a_dev = with - * { - * ... = b_dev; - * ... = c_dev; - * ... = d_dev; - * }:genarray( shp); - * a_host = device2host( a_dev); - * - * Note that simple scalar variables need not be type converted since they - * can be passed as function parameters directly to CUDA kernels. 
- * - * @ingroup - * - * @{ASSIGN_STMT( arg_node) - * - *****************************************************************************/ - -/** - * - * @file cuda_type_conversion.c - * - * Prefix: IWLMEM - * - *****************************************************************************/ + * @{ + */ #include "insert_withloop_memtran.h" -/* - * Other includes go here - */ -#include #include "tree_basic.h" #include "tree_compound.h" #include "str.h" @@ -79,54 +69,34 @@ #include "infer_dfms.h" #include "NumLookUpTable.h" -/** - * - * @name INFO structure - * @{ - * - *****************************************************************************/ +/** @name INFO structure + * @{ + */ struct INFO { - node *fundef; - bool in_cudawl; - bool create_d2h; - node *postassigns; - node *preassigns; - lut_t *lut; - lut_t *notran; - node *let_expr; - bool is_modarr; - bool in_cexprs; - bool from_ap; - node *letids; - node *apids; - node *topblock; - nlut_t *at_nlut; + node *fundef; /**< N_fundef node of the enclosing function */ + bool in_cudawl; /**< Flag indicating whether the code currently being traversed is in + a cudarizable N_with */ + bool create_d2h; /**< Flag indicating whether needs to be created for + the N_let->N_ids */ + node *postassigns; /**< Chain of that needs to be appended at the end of + the current N_assign */ + node *preassigns; /**< Chain of that needs to be prepended at the + beginning of the current N_assign */ + lut_t *lut; /**< Lookup table storing pairs of Avis(host)->Avis(device) e.g. Given + a_dev = host2device( a_host), Avis(a_host)->Avis(a_dev) will be stored + into the table */ + lut_t *notran; /**< Lookup table storing N_avis of arrays varaibles that no data + transfers should be created. 
*/ + node *let_expr; /**< Holds the current N_let expressions, used to check if the RHS is + a with-loop */ + node *let_ids; /**< Holds the current N_let N_ids chain */ + bool in_cexprs; /**< Flag indicating where are in N_code cexprs */ + bool from_ap; /**< Flag indicating where are coming from a N_ap */ + node *apids; /**< Holds LHS of current N_ap */ + node *topblock; /**< Holds the N_block (body) of the current N_fundef */ + nlut_t *at_nlut; /**< Used to count the number of references of N_avis */ }; -/* - * INFO_FUNDEF N_fundef node of the enclosing function - * - * INFO_INCUDAWL Flag indicating whether the code currently being - * traversed is in a cudarizable N_with - * - * INFO_CREATE_D2H Flag indicating whether needs to be - * created for the N_let->N_ids - * - * INFO_POSTASSIGNS Chain of that needs to be appended - * at the end of the current N_assign - * - * INFO_PREASSIGNS Chain of that needs to be prepended - * at the beginning of the current N_assign - * - * INFO_LUT Lookup table storing pairs of Avis(host)->Avis(device) - * e.g. Given a_dev = host2device( a_host), - * Avis(a_host)->Avis(a_dev) will be stored into the table - * - * INFO_NOTRAN Lookup table storing N_avis of arrays varaibles that - * no data transfers should be created. 
- * - */ - #define INFO_FUNDEF(n) (n->fundef) #define INFO_INCUDAWL(n) (n->in_cudawl) #define INFO_CREATE_D2H(n) (n->create_d2h) @@ -135,10 +105,9 @@ struct INFO { #define INFO_LUT(n) (n->lut) #define INFO_NOTRAN(n) (n->notran) #define INFO_LETEXPR(n) (n->let_expr) -#define INFO_IS_MODARR(n) (n->is_modarr) +#define INFO_LETIDS(n) (n->let_ids) #define INFO_IN_CEXPRS(n) (n->in_cexprs) #define INFO_FROM_AP(n) (n->from_ap) -#define INFO_LETIDS(n) (n->letids) #define INFO_APIDS(n) (n->apids) #define INFO_TOPBLOCK(n) (n->topblock) #define INFO_AT_NLUT(n) (n->at_nlut) @@ -159,7 +128,6 @@ MakeInfo (void) INFO_PREASSIGNS (result) = NULL; INFO_LUT (result) = NULL; INFO_NOTRAN (result) = NULL; - INFO_IS_MODARR (result) = FALSE; INFO_IN_CEXPRS (result) = FALSE; INFO_FROM_AP (result) = FALSE; INFO_LETIDS (result) = NULL; @@ -180,25 +148,18 @@ FreeInfo (info *info) DBUG_RETURN (info); } -/** - * @} - *****************************************************************************/ +/** @} */ -static void CreateHost2Device (node **id, node *host_avis, node *dev_avis, - info *arg_info); -static bool AssignInTopBlock (node *assign, info *arg_info); +/** @name Entry functions + * @{ + */ -/** - * - * @name Entry functions - * @{ +/** + * @brief Perform the IWLMEM traversal, and additionally call the EMRTU traversal. 
* - *****************************************************************************/ -/** - * - * @fn node *IWLMEMdoInsertWithloopMemtran( node *syntax_tree) - * - *****************************************************************************/ + * @param syntax_tree + * @return syntax_tree + */ node * IWLMEMdoInsertWithloopMemtran (node *syntax_tree) { @@ -206,13 +167,13 @@ IWLMEMdoInsertWithloopMemtran (node *syntax_tree) DBUG_ENTER (); - info = MakeInfo (); - /* * Infer dataflow masks */ // syntax_tree = INFDFMSdoInferDfms( syntax_tree, HIDE_LOCALS_NEVER); + info = MakeInfo (); + TRAVpush (TR_iwlmem); syntax_tree = TRAVdo (syntax_tree, info); TRAVpop (); @@ -222,78 +183,120 @@ IWLMEMdoInsertWithloopMemtran (node *syntax_tree) DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * - * @name Static helper functions - * @{ - * - *****************************************************************************/ +/** @name Static helper functions + * @{ + */ -/** +/** + * @brief Create host2device call, and add to the info struct to be added to + * the syntax tree later. * - * @fn node* TypeConvert( node *host_avis) + * @param id The argument position to place the device N_avis + * @param host_avis The host N_avis + * @param dev_avis The new device N_avis + * @param info The info struct + * @return + */ +static void +CreateHost2Device (node **id, node *host_avis, node *dev_avis, info *arg_info) +{ + DBUG_ENTER (); + + ID_AVIS (*id) = dev_avis; + FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) + = TBmakeVardec (dev_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); + + INFO_PREASSIGNS (arg_info) + = TBmakeAssign (TBmakeLet (TBmakeIds (dev_avis, NULL), + TBmakePrf (F_host2device, + TBmakeExprs (TBmakeId (host_avis), NULL))), + INFO_PREASSIGNS (arg_info)); + + /* Maintain SSA property */ + AVIS_SSAASSIGN (dev_avis) = INFO_PREASSIGNS (arg_info); + + /* Insert pair host_avis->dev_avis into lookup table. 
*/ + INFO_LUT (arg_info) = LUTinsertIntoLutP (INFO_LUT (arg_info), host_avis, dev_avis); + + DBUG_RETURN (); +} + +/** + * @brief Search through N_block (passed in via the info struct) for a specific + * assignment. * - * @brief + * @param assign The N_assign to search for + * @param info The info struct (which holds link to the N_block) + * @return True if the N_assign was found, False otherwise + */ +static bool +AssignInTopBlock (node *assign, info *arg_info) +{ + bool res = FALSE; + node *assign_chain; + + DBUG_ENTER (); + + assign_chain = BLOCK_ASSIGNS (INFO_TOPBLOCK (arg_info)); + + while (assign_chain != NULL) { + if (assign_chain == assign) { + res = TRUE; + break; + } + assign_chain = ASSIGN_NEXT (assign_chain); + } + + DBUG_RETURN (res); +} + +/** + * @brief Convert from a host ntype to a device ntype, while preserving shape information. * - *****************************************************************************/ + * @param host_type The host ntype + * @param nty The nodetype of the node being converted (support N_id and N_ids) + * @param info The info struct + * @return A device ntype struct + */ static ntype * TypeConvert (ntype *host_type, nodetype nty, info *arg_info) { - ntype *scalar_type, *dev_type = NULL; - simpletype sty; + ntype *dev_type = NULL; DBUG_ENTER (); if (nty == N_id) { - /* If the N_id is of known dimension and is not a scalar */ - DBUG_ASSERT (TUdimKnown (host_type), "AUD N_id found in cudarizable N_with!"); - if (TYgetDim (host_type) > 0) { - /* If the scalar type is simple, e.g. int, float ... */ - if (TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. 
int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); - } - } + dev_type = CUconvertHostToDeviceType (host_type); } /* If the node to be type converted is N_ids, its original type * can be AUD as well as long as the N_with on the RHS is cudarizable. * The reason a cudarizbale can produce a AUD result illustrated by * the following example: * - * cond_fun() - * { - * int[*] aa; - * int bb; + * ~~~~ + * cond_fun() + * { + * int[*] aa; + * int bb; * - * if( cond) { - * aa = with {}:genarray( shp); (cudarizable N_with) - * } - * else { - * bb = 1; - * } - * ret = cond ? aa : bb; + * if( cond) { + * aa = with {}:genarray( shp); (cudarizable N_with) + * } + * else { + * bb = 1; * } + * ret = cond ? aa : bb; + * } + * ~~~~ * */ else if (nty == N_ids) { if (NODE_TYPE (INFO_LETEXPR (arg_info)) == N_with) { /* If the scalar type is simple, e.g. int, float ... */ - if (WITH_CUDARIZABLE (INFO_LETEXPR (arg_info)) - && TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); + if (WITH_CUDARIZABLE (INFO_LETEXPR (arg_info))) { + dev_type = CUconvertHostToDeviceType (host_type); } } } else { @@ -303,6 +306,13 @@ TypeConvert (ntype *host_type, nodetype nty, info *arg_info) DBUG_RETURN (dev_type); } +/** + * @brief Anonymouse traversal function (N_with) + * + * @param arg_node N_with + * @param arg_info info struct + * @return N_with + */ static node * ATravWith (node *arg_node, info *arg_info) { @@ -315,6 +325,13 @@ ATravWith (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } +/** + * @brief Anonymouse traversal function (N_id). For every N_avis, increment a counter. 
+ * + * @param arg_node N_id + * @param arg_info info struct + * @return N_id + */ static node * ATravId (node *arg_node, info *arg_info) { @@ -325,6 +342,13 @@ ATravId (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } +/** + * @brief Anonymouse traversal function (N_genarray). Traverse through all N_id sons. + * + * @param arg_node N_genarray + * @param arg_info info struct + * @return N_genarray + */ static node * ATravGenarray (node *arg_node, info *arg_info) { @@ -342,29 +366,29 @@ ATravGenarray (node *arg_node, info *arg_info) GENARRAY_RC (arg_node) = TRAVopt (GENARRAY_RC (arg_node), arg_info); GENARRAY_ERC (arg_node) = TRAVopt (GENARRAY_ERC (arg_node), arg_info); GENARRAY_PRC (arg_node) = TRAVopt (GENARRAY_PRC (arg_node), arg_info); + GENARRAY_NEXT (arg_node) = TRAVopt (GENARRAY_NEXT (arg_node), arg_info); DBUG_RETURN (arg_node); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * - * @name Traversal functions - * @{ - * - *****************************************************************************/ +/** @name Traversal functions + * @{ + */ -/** +/** + * @brief Traverse N_fundef * - * @fn node *IWLMEMfundef( node *arg_node, info *arg_info) + * If the current N_fundef is not a LaC function, traverse the body and next. Otherwise, + * if we are coming from a N_ap that is a LaC function, traverse *only* the body, passing + * a link to the body. Otherwise, we go to the next N_fundef. 
* - * @brief - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info struct + * @return N_fundef node + */ node * IWLMEMfundef (node *arg_node, info *arg_info) { @@ -373,15 +397,21 @@ IWLMEMfundef (node *arg_node, info *arg_info) DBUG_ENTER (); + DBUG_PRINT ("at %s", FUNDEF_NAME (arg_node)); + /* During the main traversal, we only look at non-lac functions */ if (!FUNDEF_ISLACFUN (arg_node)) { + DBUG_PRINT ("...inspecting body"); INFO_FUNDEF (arg_info) = arg_node; INFO_TOPBLOCK (arg_info) = FUNDEF_BODY (arg_node); FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); INFO_FUNDEF (arg_info) = NULL; FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); } else { + DBUG_PRINT ("...inspecting LAC body"); if (INFO_FROM_AP (arg_info)) { + DBUG_PRINT ("...from application"); + old_fundef = INFO_FUNDEF (arg_info); old_topblock = INFO_TOPBLOCK (arg_info); INFO_FUNDEF (arg_info) = arg_node; @@ -398,32 +428,37 @@ IWLMEMfundef (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMap( node *arg_node, info *arg_info) +/** + * @brief Traverse N_ap which is a LaC function, and *not* the recursive call. * - * @brief + * If the application is outside a CUDA withloop, we traverse into its N_fundef. + * If its from within a CUDA withloop, we check all its arguments against the LUT and if + * we find a match and its not marked for NOTRAN (no transfer), we create a host2device + * call. 
* - *****************************************************************************/ + * @param arg_node N_ap + * @param arg_info info struct + * @return N_ap node + */ node * IWLMEMap (node *arg_node, info *arg_info) { bool traverse_lac_fun, old_from_ap; - node *ap_args, *fundef_args; - node *avis, *id_avis, *new_avis, *dup_avis; + node *ap_args, *fundef_args, *avis, *new_avis, *dup_avis, *id_avis; ntype *dev_type; node *fundef, *old_apids; DBUG_ENTER (); fundef = AP_FUNDEF (arg_node); + DBUG_PRINT ("ap_fun %s", FUNDEF_NAME (fundef)); /* For us to traverse a function from calling site, it must be a * condictional function or a loop function and must not be the * recursive function call in the loop function. */ traverse_lac_fun = (FUNDEF_ISLACFUN (fundef) && fundef != INFO_FUNDEF (arg_info)); - if (traverse_lac_fun) { + if (traverse_lac_fun) { /* inside loop or conditional */ old_from_ap = INFO_FROM_AP (arg_info); INFO_FROM_AP (arg_info) = TRUE; @@ -431,10 +466,13 @@ IWLMEMap (node *arg_node, info *arg_info) INFO_APIDS (arg_info) = INFO_LETIDS (arg_info); if (!INFO_INCUDAWL (arg_info)) { + DBUG_PRINT ("...not in CUDAWL"); + AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); } else { + /* Used to add h2d transfers for applications within WL N_code */ ap_args = AP_ARGS (arg_node); - fundef_args = FUNDEF_ARGS (AP_FUNDEF (arg_node)); + fundef_args = FUNDEF_ARGS (fundef); while (ap_args != NULL) { DBUG_ASSERT (fundef_args != NULL, "# of Ap args != # of Fundef args!"); @@ -447,8 +485,8 @@ IWLMEMap (node *arg_node, info *arg_info) /* If the avis has NOT been come across before */ if (avis == id_avis) { - DBUG_PRINT ("fundef %s, id %s", FUNDEF_NAME (AP_FUNDEF (arg_node)), - AVIS_NAME (avis)); + DBUG_PRINT ("new arg for ap_fun %s, id %s", FUNDEF_NAME (fundef), + AVIS_NAME (avis)); /* If the id is NOT the one we don't want to create data transfer for */ if (LUTsearchInLutPp (INFO_NOTRAN (arg_info), id_avis) == id_avis) { @@ -482,6 +520,8 @@ IWLMEMap (node 
*arg_node, info *arg_info) ARG_AVIS (fundef_args), NULL); } } else { + DBUG_PRINT ("existing arg on ap_fun %s, id %s", FUNDEF_NAME (fundef), + AVIS_NAME (avis)); /* If the N_avis has been come across before, replace its * N_avis by the device N_avis */ ID_AVIS (EXPRS_EXPR (ap_args)) = avis; @@ -524,14 +564,14 @@ IWLMEMap (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMassign( node *arg_node, info *arg_info) - * +/** * @brief Add newly created and to * the assign chain. * - *****************************************************************************/ + * @param arg_node N_assign + * @param arg_info info struct + * @return N_assign node + */ node * IWLMEMassign (node *arg_node, info *arg_info) { @@ -577,13 +617,13 @@ IWLMEMassign (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMlet( node *arg_node, info *arg_info) - * - * @brief +/** + * @brief Traverse N_let, carrying both the LHS and RHS. * - *****************************************************************************/ + * @param arg_node N_let + * @param arg_info info struct + * @return N_let node + */ node * IWLMEMlet (node *arg_node, info *arg_info) { @@ -598,13 +638,14 @@ IWLMEMlet (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMfuncond( node *arg_node, info *arg_info) - * - * @brief +/** + * @brief Traverse N_funcond that are within a CUDA withloop and change N_avis basetypes + * to device types. 
* - *****************************************************************************/ + * @param arg_node N_funcond + * @param arg_info info struct + * @return N_funcond node + */ node * IWLMEMfuncond (node *arg_node, info *arg_info) { @@ -683,13 +724,13 @@ IWLMEMfuncond (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMwith( node *arg_node, info *arg_info) - * +/** * @brief Traverse both withop and N_code of a cudarizable N_with * - *****************************************************************************/ + * @param arg_node N_with + * @param arg_info info struct + * @return N_with node + */ node * IWLMEMwith (node *arg_node, info *arg_info) { @@ -698,6 +739,8 @@ IWLMEMwith (node *arg_node, info *arg_info) DBUG_ENTER (); + DBUG_PRINT ("at WL"); + /* If the N_with is cudarizable */ if (WITH_CUDARIZABLE (arg_node)) { @@ -774,13 +817,13 @@ IWLMEMwith (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMcode( node *arg_node, info *arg_info) +/** + * @brief Traverse N_code of withloop. 
* - * @brief Traverse the code block - * - *****************************************************************************/ + * @param arg_node N_code + * @param arg_info info struct + * @return N_code node + */ node * IWLMEMcode (node *arg_node, info *arg_info) { @@ -797,13 +840,13 @@ IWLMEMcode (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMgenarray( node *arg_node, info *arg_info) - * +/** * @brief Traverse default element of a N_genarray * - *****************************************************************************/ + * @param arg_node N_genarray + * @param arg_info info struct + * @return N_genarray node + */ node * IWLMEMgenarray (node *arg_node, info *arg_info) { @@ -830,13 +873,13 @@ IWLMEMgenarray (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMmodarray( node *arg_node, info *arg_info) - * +/** * @brief Traverse default element of a N_modarray * - *****************************************************************************/ + * @param arg_node N_modarray + * @param arg_info info struct + * @return N_modarray node + */ node * IWLMEMmodarray (node *arg_node, info *arg_info) { @@ -845,24 +888,26 @@ IWLMEMmodarray (node *arg_node, info *arg_info) if (INFO_INCUDAWL (arg_info)) { DBUG_ASSERT (NODE_TYPE (MODARRAY_ARRAY (arg_node)) == N_id, "Non N_id modified array found in N_modarray!"); - INFO_IS_MODARR (arg_info) = TRUE; MODARRAY_ARRAY (arg_node) = TRAVdo (MODARRAY_ARRAY (arg_node), arg_info); - INFO_IS_MODARR (arg_info) = FALSE; + MODARRAY_RC (arg_node) = TRAVopt (MODARRAY_RC (arg_node), arg_info); MODARRAY_ERC (arg_node) = TRAVopt (MODARRAY_ERC (arg_node), arg_info); + MODARRAY_NEXT (arg_node) = TRAVopt (MODARRAY_NEXT (arg_node), arg_info); } DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMids( node *arg_node, info *arg_info) - * - * @brief For N_ids needed to be type converted, create . 
+/** + * @brief For N_ids on the LHS of CUDA-WLs, create a new LHS avis with device type, and + * create a assign to be placed after the WL. If the N_ids is in a + * CUDA-WL, add it to the NOTRAN LUT. * - *****************************************************************************/ + * @param arg_node N_ids + * @param arg_info info struct + * @return N_ids node + */ node * IWLMEMids (node *arg_node, info *arg_info) { @@ -874,6 +919,8 @@ IWLMEMids (node *arg_node, info *arg_info) ids_avis = IDS_AVIS (arg_node); ids_type = AVIS_TYPE (ids_avis); + DBUG_PRINT ("at IDS of %s", AVIS_NAME (ids_avis)); + /* If the array is defined in cuda withloop, we do not create * a host2device transfer for it */ if (INFO_INCUDAWL (arg_info)) { @@ -890,26 +937,40 @@ IWLMEMids (node *arg_node, info *arg_info) TYgetSimpleType (TYgetScalar (ids_type)))); } } - } else { + } else { /* not in CUDAWL */ if (INFO_CREATE_D2H (arg_info)) { + /* if we come this this point after a CUDAWL, we probably need to + * create a device2host transfer. 
*/ dev_type = TypeConvert (ids_type, NODE_TYPE (arg_node), arg_info); if (dev_type != NULL) { + + /* create new avis for WL return */ new_avis = TBmakeAvis (TRAVtmpVarName ("dev"), dev_type); IDS_AVIS (arg_node) = new_avis; + DBUG_PRINT ("...replacing WL return %s -> %s", AVIS_NAME (ids_avis), + AVIS_NAME (new_avis)); + + /* add to fundef vardecs */ FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) = TBmakeVardec (new_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); + /* create device2host */ INFO_POSTASSIGNS (arg_info) = TBmakeAssign (TBmakeLet (TBmakeIds (ids_avis, NULL), TBmakePrf (F_device2host, TBmakeExprs (TBmakeId (new_avis), NULL))), INFO_POSTASSIGNS (arg_info)); - /* Maintain SSA property */ + DBUG_PRINT ("Creating device2host for %s -> %s", AVIS_NAME (new_avis), + AVIS_NAME (ids_avis)); + + /* maintain SSA property */ AVIS_SSAASSIGN (new_avis) = AVIS_SSAASSIGN (ids_avis); AVIS_SSAASSIGN (ids_avis) = INFO_POSTASSIGNS (arg_info); } - // IDS_NEXT( arg_node) = TRAVopt( IDS_NEXT( arg_node), arg_info); + + /* We stop creating any further device2host assigns */ + /* XXX what about multi-operator WLs? */ INFO_CREATE_D2H (arg_info) = FALSE; } } @@ -919,16 +980,16 @@ IWLMEMids (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMid( node *arg_node, info *arg_info) - * +/** * @brief For each host array N_id in the cudarizable N_with, either create * type conversion for it (i.e. ) or set its N_avis to * that of an already converted device array N_id depending on whether * the N_id is encountered for the first time or not. 
* - *****************************************************************************/ + * @param arg_node N_id + * @param arg_info info struct + * @return N_id node + */ node * IWLMEMid (node *arg_node, info *arg_info) { @@ -943,7 +1004,9 @@ IWLMEMid (node *arg_node, info *arg_info) /* if we are in cudarizable N_with */ if (INFO_INCUDAWL (arg_info)) { - avis = (node *)LUTsearchInLutPp (INFO_LUT (arg_info), id_avis); + DBUG_PRINT ("inspecting %s", AVIS_NAME (id_avis)); + + avis = LUTsearchInLutPp (INFO_LUT (arg_info), id_avis); /* If the N_avis node hasn't been come across before AND the id is * NOT in cexprs. This is because we don't want to create a host2device @@ -976,7 +1039,7 @@ IWLMEMid (node *arg_node, info *arg_info) if (((INFO_IN_CEXPRS (arg_info) && ssaassign != NULL && AssignInTopBlock (ssaassign, arg_info)) || !INFO_IN_CEXPRS (arg_info)) - && !CUisShmemTypeNew (id_type) + && !CUisDeviceTypeNew (id_type) && !CUisShmemTypeNew (id_type) && LUTsearchInLutPp (INFO_NOTRAN (arg_info), id_avis) == id_avis) { dev_type = TypeConvert (id_type, NODE_TYPE (arg_node), arg_info); if (dev_type != NULL) { @@ -994,58 +1057,7 @@ IWLMEMid (node *arg_node, info *arg_info) } DBUG_RETURN (arg_node); } - -static void -CreateHost2Device (node **id, node *host_avis, node *dev_avis, info *arg_info) -{ - DBUG_ENTER (); - - ID_AVIS (*id) = dev_avis; - FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) - = TBmakeVardec (dev_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); - - INFO_PREASSIGNS (arg_info) - = TBmakeAssign (TBmakeLet (TBmakeIds (dev_avis, NULL), - TBmakePrf (F_host2device, - TBmakeExprs (TBmakeId (host_avis), NULL))), - INFO_PREASSIGNS (arg_info)); - - /* Maintain SSA property */ - AVIS_SSAASSIGN (dev_avis) = INFO_PREASSIGNS (arg_info); - - /* Insert pair host_avis->dev_avis into lookup table. 
*/ - INFO_LUT (arg_info) = LUTinsertIntoLutP (INFO_LUT (arg_info), host_avis, dev_avis); - - DBUG_RETURN (); -} - -static bool -AssignInTopBlock (node *assign, info *arg_info) -{ - bool res = FALSE; - node *assign_chain; - - DBUG_ENTER (); - - assign_chain = BLOCK_ASSIGNS (INFO_TOPBLOCK (arg_info)); - - while (assign_chain != NULL) { - if (assign_chain == assign) { - res = TRUE; - break; - } - assign_chain = ASSIGN_NEXT (assign_chain); - } - - DBUG_RETURN (res); -} - -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ +/** @} */ +/** @} */ #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/minimize_block_transfers2.c b/src/libsac2c/cuda/minimize_block_transfers2.c index d40de27f5e8b066f50fe3ced36c9d639a916134a..b997a8a380740de7021f003e0c19428a6a6f769c 100644 --- a/src/libsac2c/cuda/minimize_block_transfers2.c +++ b/src/libsac2c/cuda/minimize_block_transfers2.c @@ -1,54 +1,53 @@ -/** +/** + * @file + * @defgroup mbtran2 Minimize Block Transfers + * @ingroup cuda + * + * @brief Minimize the number of host<->device transfers in a + * sequential block of instructions. + * + * This modules tries to eliminate / instructions + * in a sequential block of code. Two difference cases expose the opportunities + * for elimination: + * + * 1. + * ~~~~ + * a_host = device2host( b_dev); + * ... + * ... + * a_dev = host2device( a_host); + * ~~~~ + * + * The second memory transfer, i.e. a_dev = host2device( a_host) + * can be eliminated. Any reference to a_dev after it will be + * replaced by b_dev. + * + * 2. + * ~~~~ + * b_dev = host2device( a_host); + * ... + * ... + * c_dev = host2device( a_host); + * ~~~~ + * + * The second memory transfer, i.e. c_dev = host2device( a_host) + * can be eliminated. Any reference to c_dev after it will be + * replaced by b_dev. 
* - * @defgroup Minimize the number of host<->device transfers in a - * sequential block of instructions. - * - * This modules tries to eliminate / instructions - * in a sequential block of code. Two difference cases expose the opportunities - * for elimination: - * - * 1) a_host = device2host( b_dev); - * ... - * ... - * a_dev = host2device( a_host); - * - * The second memory transfer, i.e. a_dev = host2device( a_host) - * can be eliminated. Any reference to a_dev after it will be - * replaced by b_dev. - * - * - * - * 2) b_dev = host2device( a_host); - * ... - * ... - * c_dev = host2device( a_host); - * - * The second memory transfer, i.e. c_dev = host2device( a_host) - * can be eliminated. Any reference to c_dev after it will be - * replaced by b_dev. - * - * - *****************************************************************************/ - -/** - * - * @file minimize_block_transfers.c - * - * Prefix: MBTRAN2 - * - *****************************************************************************/ + * @{ + */ #include "minimize_block_transfers2.h" -#include #include "new_types.h" #include "tree_compound.h" #include "free.h" +#include "globals.h" #include "traverse.h" #include "tree_basic.h" #include "LookUpTable.h" #include "memory.h" -#define DBUG_PREFIX "UNDEFINED" +#define DBUG_PREFIX "MBTRAN2" #include "debug.h" #include "deadcoderemoval.h" @@ -56,12 +55,10 @@ #include "SSACSE.h" #include "DupTree.h" -/** - * +/** * @name INFO structure * @{ - * - *****************************************************************************/ + */ struct INFO { node *current_block; node *lastassign; @@ -95,23 +92,18 @@ FreeInfo (info *info) DBUG_RETURN (info); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Entry functions * @{ - * - *****************************************************************************/ -/** - * - * @fn node *MBTRAN2doMinimizeBlockTransfers( node *syntax_tree) - * - * @brief - 
* - *****************************************************************************/ + */ + +/** + * @brief Invoke the CUDA block transfer minimisation traversal + * @param syntax_tree + * @return syntax tree + */ node * MBTRAN2doMinimizeBlockTransfers (node *syntax_tree) { @@ -126,33 +118,31 @@ MBTRAN2doMinimizeBlockTransfers (node *syntax_tree) info = FreeInfo (info); + DBUG_PRINT ("invoking CSE"); syntax_tree = CSEdoCommonSubexpressionElimination (syntax_tree); /* We rely on Dead Code Removal to remove the * unused / */ + DBUG_PRINT ("invoking DCR"); syntax_tree = DCRdoDeadCodeRemoval (syntax_tree); DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Traversal functions * @{ - * - *****************************************************************************/ + */ -/** +/** + * @brief Store current N_block in info struct and traverse the N_assigns * - * @fn node *MBTRAN2block( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ + * @param arg_node N_block + * @param arg_info info structure + * @return N_block + */ node * MBTRAN2block (node *arg_node, info *arg_info) { @@ -170,13 +160,13 @@ MBTRAN2block (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MBTRAN2assign( node *arg_node, info *arg_info) - * - * @brief +/** + * @brief Store the current N_assign in traverse statements top-down * - *****************************************************************************/ + * @param arg_node N_assign + * @param arg_info info structure + * @return N_assign + */ node * MBTRAN2assign (node *arg_node, info *arg_info) { @@ -193,13 +183,14 @@ MBTRAN2assign (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** +/** + * @brief Check if `F_host2device` argument is assigned to via `F_device2host`, + * delete the current N_prf and replace the RHS of the 
assign. * - * @fn node *MBTRAN2prf( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ + * @param arg_node N_prf + * @param arg_info info structure + * @return N_prf + */ node * MBTRAN2prf (node *arg_node, info *arg_info) { @@ -209,6 +200,7 @@ MBTRAN2prf (node *arg_node, info *arg_info) switch (PRF_PRF (arg_node)) { case F_host2device: + DBUG_PRINT ("Checking H2D to elimitating preceeding D2H"); ssaassign = AVIS_SSAASSIGN (ID_AVIS (PRF_ARG1 (arg_node))); /* if( ISDEVICE2HOST( ssaassign) && @@ -216,10 +208,12 @@ MBTRAN2prf (node *arg_node, info *arg_info) ASSIGN_CONTAINING_BLOCK( INFO_LASTASSIGN( arg_info)))) { ( */ if (ISDEVICE2HOST (ssaassign)) { + DBUG_PRINT ("...eliminating H2D and replacing LHS"); node *dev_id = PRF_ARG1 (ASSIGN_RHS (ssaassign)); node *dev_avis = ID_AVIS (dev_id); arg_node = FREEdoFreeNode (arg_node); arg_node = TBmakeId (dev_avis); + global.optcounters.cuda_min_trans++; } break; default: @@ -228,12 +222,6 @@ MBTRAN2prf (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ - +/** @} */ +/** @} */ #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/minimize_cond_transfers.c b/src/libsac2c/cuda/minimize_cond_transfers.c index 01587830f15b540e9328dc1e1ffba78d5e1b0fcb..b6d68de157ea1c9c3cc11c456e92b56c1ace7440 100644 --- a/src/libsac2c/cuda/minimize_cond_transfers.c +++ b/src/libsac2c/cuda/minimize_cond_transfers.c @@ -304,10 +304,12 @@ MCTRANassign (node *arg_node, info *arg_info) if (INFO_APPOSTASSIGNS (arg_info) != NULL) { ASSIGN_NEXT (arg_node) = INFO_APPOSTASSIGNS (arg_info); + global.optcounters.cuda_min_trans++; } if (INFO_APPREASSIGNS (arg_info) != NULL) { arg_node = TCappendAssign (INFO_APPREASSIGNS (arg_info), arg_node); + global.optcounters.cuda_min_trans++; } 
FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) diff --git a/src/libsac2c/cuda/minimize_cudast_transfers.c b/src/libsac2c/cuda/minimize_cudast_transfers.c index e0ef2d22e06fe12daebf56d90f2a25c6e3cccb7e..648fb8149113eba6362a08fd4ace98c81b8a0a32 100644 --- a/src/libsac2c/cuda/minimize_cudast_transfers.c +++ b/src/libsac2c/cuda/minimize_cudast_transfers.c @@ -219,11 +219,13 @@ MCSTRANassign (node *arg_node, info *arg_info) ASSIGN_NEXT (arg_node) = NULL; arg_node = TCappendAssign (arg_node, assigns); INFO_POSTASSIGNS (arg_info) = NULL; + global.optcounters.cuda_min_trans++; } if (INFO_PREASSIGNS (arg_info) != NULL) { arg_node = TCappendAssign (INFO_PREASSIGNS (arg_info), arg_node); INFO_PREASSIGNS (arg_info) = NULL; + global.optcounters.cuda_min_trans++; } } diff --git a/src/libsac2c/cuda/minimize_emr_transfers.c b/src/libsac2c/cuda/minimize_emr_transfers.c new file mode 100644 index 0000000000000000000000000000000000000000..690050c147582f75be222147e53f0f05f5858113 --- /dev/null +++ b/src/libsac2c/cuda/minimize_emr_transfers.c @@ -0,0 +1,633 @@ +/** + * @file + * @defgroup memrt Minimize EMR Transfers + * @ingroup cuda + * + * @brief Convert all ERCs in EMRL affected fundefs with CUDA-WL to CUDA device types. + * + * The general idea is similar to what MLTRAN does, which is to lift out h2d/d2h memcpys + * from a loopfun. This traversal though works for a special case, concerning loopfuns + * which have been affected by the EMRL optimisation *and* where the existing optimisations + * are not able to lift out the h2d. + * + * The latter point is especially important, as traversals like MLTRAN only lift out h2d/d2h + * if there are no further references to the RHS of h2d/d2h. When using EMRL, this check can + * fail because of an extra argument in the recursive loopfun application used for the + * buffer-swapping. + * + * A point to consider, MEMRT only works on loopfuns marked with the ISEMRLIFTED flag + * (on the N_fundef).
If we are dealing with a series of nested loops, it will only move + * the h2d one-level up. This is also limited by the EMRL traversal + * (@see memory/emr_loop_optimisation.c) which is conservative in how many levels it will + * lift out allocations. Typically it lifts out allocations from only the innermost loop. + * + * To give a concrete example, we have: + * + * ~~~~ + * lets_loop (...) { + * ... + * ret = let_loop_LOOPFUN (..., input, emr_lift); + * ... + * } + * + * let_loop_LOOPFUN (..., input, emr_tmp) { + * ... + * emr_dev = h2d (emr_tmp); + * input_dev = h2d (input); + * ... + * output_dev = wl (input_dev); [ERC: emr_dev] + * ... + * output = d2h (output_dev); + * ... + * intra = let_loop_LOOPFUN (..., ouput, input); + * } + * ~~~~ + * + * Through this traversal, we transform the above into: + * + * ~~~~ + * lets_loop (...) { + * ... + * emr_dev = h2d (emr_lift); + * ret = let_loop_LOOPFUN (..., input, emr_dev); + * ... + * } + * + * let_loop_LOOPFUN (..., input, emr_dev) { + * ... + * input_dev = h2d (input); + * ... + * output_dev = wl (input_dev); [ERC: emr_dev] + * ... + * output = d2h (output_dev); + * ... 
+ * intra = let_loop_LOOPFUN (..., ouput, output_dev); + * } + * ~~~~ + * + * @{ + */ +#include "minimize_emr_transfers.h" + +#define DBUG_PREFIX "MEMRT" +#include "debug.h" + +#include "types.h" +#include "traverse.h" +#include "tree_basic.h" +#include "tree_compound.h" +#include "memory.h" +#include "globals.h" + +#include "free.h" +#include "cuda_utils.h" +#include "LookUpTable.h" +#include "DupTree.h" +#include "deadcoderemoval.h" + +enum trav_mode { bypass, inap, afterap }; + +/** + * @name INFO structure + * @{ + */ +struct INFO { + int funargnum; /**< used to assign ordinal values to fundef args */ + bool inemrloop; /**< flag indicating we are in a EMRL affected loop */ + enum trav_mode apmode; /**< specifies which mode we are for the N_ap traversal */ + node *fundef; /**< Holds current N_fundef */ + lut_t *lut; /**< LUT is used for storing EMRL lifted h2d RHS -> LHS mappings */ + lut_t *reclut; /**< LUT is used to store all h2d RHS -> LHS mappings */ + node *letids; /**< The the LHS of N_prf */ + node *apargs; /**< N_ap arguments */ + node *apvardecs; /**< Used to update vardecs in N_ap calling context */ + node *apassigns; /**< Used to update assigns in N_ap calling context */ + node *rec_ap; /**< the recursive loopfun N_ap */ +}; + +#define INFO_FUNDEF(n) ((n)->fundef) +#define INFO_LUT(n) ((n)->lut) +#define INFO_RECLUT(n) ((n)->reclut) +#define INFO_LETIDS(n) ((n)->letids) +#define INFO_FUNARGNUM(n) ((n)->funargnum) +#define INFO_APARGS(n) ((n)->apargs) +#define INFO_APVARDECS(n) ((n)->apvardecs) +#define INFO_APASSIGNS(n) ((n)->apassigns) +#define INFO_REC_AP(n) ((n)->rec_ap) +#define INFO_INEMRLOOP(n) ((n)->inemrloop) +#define INFO_APMODE(n) ((n)->apmode) + +static info * +MakeInfo (void) +{ + info *result; + + DBUG_ENTER (); + + result = (info *)MEMmalloc (sizeof (info)); + + INFO_FUNARGNUM (result) = 0; + INFO_FUNDEF (result) = NULL; + INFO_LUT (result) = NULL; + INFO_RECLUT (result) = NULL; + INFO_LETIDS (result) = NULL; + INFO_APARGS (result) = 
NULL; + INFO_APVARDECS (result) = NULL; + INFO_APASSIGNS (result) = NULL; + INFO_REC_AP (result) = NULL; + INFO_INEMRLOOP (result) = FALSE; + INFO_APMODE (result) = bypass; + + DBUG_RETURN (result); +} + +static info * +FreeInfo (info *info) +{ + DBUG_ENTER (); + + info = MEMfree (info); + + DBUG_RETURN (info); +} + +/** @} */ + +/** + * @name Anonymous Traversal + * @{ + */ + +/** + * @brief If the application is the recursive application of the do-loop, + * store it in the info structure + * + * @param arg_node N_ap + * @param arg_info info structure + * @return N_ap + */ +static node * +MEMRTapAnon (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + if (INFO_FUNDEF (arg_info) == AP_FUNDEF (arg_node)) { + DBUG_PRINT ("found recursive application of %s...", FUNDEF_NAME (INFO_FUNDEF (arg_info))); + INFO_REC_AP (arg_info) = arg_node; + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief Store LHS in info structure before traversing RHS + * + * @param arg_node N_let + * @param arg_info info structure + * @return N_let + */ +static node * +MEMRTletAnon (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + INFO_LETIDS (arg_info) = LET_IDS (arg_node); + LET_EXPR (arg_node) = TRAVdo (LET_EXPR (arg_node), arg_info); + + DBUG_RETURN (arg_node); +} + +/** + * @brief If the N_prf is `F_host2device`, store the mapping of + * RHS to LHS in the LUT + * + * @param arg_node N_prf + * @param arg_info info structure + * @return N_prf + */ +static node * +MEMRTprfAnon (node *arg_node, info *arg_info) +{ + node *arg_avis, *ret_avis; + + DBUG_ENTER (); + + switch (PRF_PRF (arg_node)) { + case F_host2device: + arg_avis = ID_AVIS (PRF_ARG1 (arg_node)); + ret_avis = IDS_AVIS (INFO_LETIDS (arg_info)); + DBUG_PRINT ("found h2d, adding mapping of arg to ret: %s -> %s", AVIS_NAME (arg_avis), AVIS_NAME (ret_avis)); + INFO_RECLUT (arg_info) + = LUTinsertIntoLutP (INFO_RECLUT (arg_info), arg_avis, ret_avis); + break; + default: + break; + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief Use an
anonymous traversal to the recursive do-loop application. Additionally + * for all `F_host2device` primitives, store the mapping of RHS to LHS in a LUT. + * + * Both the recursive N_ap and the LUT containing mappings is used later to appropriately + * replace arguments in the recursive N_ap with those matching in the LUT. + * + * @param fundef A N_fundef node, from an N_ap node + * @param arg_info info structure + * @return the first argument, fundef + */ +static node * +MEMRTtravToRecAp (node *fundef, info *arg_info) +{ + node *old_fundef, *old_letids; + anontrav_t trav[4] = {{N_let, &MEMRTletAnon}, {N_ap, &MEMRTapAnon}, {N_prf, &MEMRTprfAnon}, {(nodetype)0, NULL}}; + + DBUG_ENTER (); + + DBUG_ASSERT (NODE_TYPE (fundef) == N_fundef, "First argument must be a N_fundef node!"); + DBUG_ASSERT (INFO_RECLUT (arg_info) != NULL, "The recursive LUT must be created first!"); + + old_fundef = INFO_FUNDEF (arg_info); + old_letids = INFO_LETIDS (arg_info); + INFO_FUNDEF (arg_info) = fundef; + INFO_LETIDS (arg_info) = NULL; + + TRAVpushAnonymous (trav, &TRAVsons); + FUNDEF_BODY (fundef) = TRAVdo (FUNDEF_BODY (fundef), arg_info); + TRAVpop (); + + INFO_FUNDEF (arg_info) = old_fundef; + INFO_LETIDS (arg_info) = old_letids; + + DBUG_RETURN (fundef); +} + +/** @} */ + +/** + * @name Entry function + * @{ + */ + +/** + * @brief The entry function into the MEMRT traversal. + * + * @param syntax_tree + * @return syntax tree + */ +node * +MEMRTdoMinimizeEMRTransfers (node *syntax_tree) +{ + info *info; + + DBUG_ENTER (); + + info = MakeInfo (); + + TRAVpush (TR_memrt); + syntax_tree = TRAVdo (syntax_tree, info); + TRAVpop (); + + info = FreeInfo (info); + + DBUG_PRINT ("invoking DCR"); + syntax_tree = DCRdoDeadCodeRemoval (syntax_tree); + + DBUG_RETURN (syntax_tree); +} + +/** @} */ + +/** + * @name Traversal functions + * @{ + */ + +/** + * @brief Traverse N_fundefs, if its an EMRL affected loopfun, traverse + * only the body. 
+ * + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ +node * +MEMRTfundef (node *arg_node, info *arg_info) +{ + bool old_inemrloop; + + DBUG_ENTER (); + + INFO_FUNDEF (arg_info) = arg_node; + + if (!FUNDEF_ISEMRLIFTED (arg_node)) { + FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); + FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); + } else if (INFO_APMODE (arg_info) == inap) { /* EMR lifted loop */ + DBUG_PRINT ("inspecting EMR affected do-loop %s...", FUNDEF_NAME (arg_node)); + /* We assign a sequential number (starting from 0) to each argument of the loopfun */ + INFO_FUNARGNUM (arg_info) = 0; + FUNDEF_ARGS (arg_node) = TRAVopt (FUNDEF_ARGS (arg_node), arg_info); + + old_inemrloop = INFO_INEMRLOOP (arg_info); + INFO_INEMRLOOP (arg_info) = TRUE; + FUNDEF_BODY (arg_node) = TRAVdo (FUNDEF_BODY (arg_node), arg_info); + INFO_INEMRLOOP (arg_info) = old_inemrloop; + } else { + FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); + } + + + DBUG_RETURN (arg_node); +} + +/** + * @brief Traverse N_fundef arguments and assign each an ordinal value + * + * With this we can retrieve an argument using the ordinal value. + * + * @param arg_node N_arg + * @param arg_info info structure + * @return N_arg + */ +node * +MEMRTarg (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + ARG_LINKSIGN (arg_node) = INFO_FUNARGNUM (arg_info); + INFO_FUNARGNUM (arg_info) += 1; + + ARG_NEXT (arg_node) = TRAVopt (ARG_NEXT (arg_node), arg_info); + + DBUG_RETURN (arg_node); +} + +/** + * @brief Traverse statements, if statement is initial N_ap of a loopfun, + * update the current context with values from info structure.
+ * + * @param arg_node N_assign + * @param arg_info info structure + * @return N_assign + */ +node * +MEMRTassign (node *arg_node, info *arg_info) +{ + node *old_next, *newold_assign, *old_ap_assigns, *old_ap_vardecs; + + DBUG_ENTER (); + + /* stack info fields */ + old_ap_assigns = INFO_APASSIGNS (arg_info); + old_ap_vardecs = INFO_APVARDECS (arg_info); + + ASSIGN_STMT (arg_node) = TRAVdo (ASSIGN_STMT (arg_node), arg_info); + + if (INFO_APMODE (arg_info) == afterap) { + DBUG_PRINT ("updating assigns in calling context"); + old_next = ASSIGN_NEXT (arg_node); + ASSIGN_NEXT (arg_node) = NULL; + + /* add h2d in calling context before N_ap */ + if (INFO_APASSIGNS (arg_info) != NULL) { + arg_node = TCappendAssign (INFO_APASSIGNS (arg_info), arg_node); + global.optcounters.cuda_min_trans++; + } + + /* add needed vardecs to calling context */ + FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) + = TCappendVardec (INFO_APVARDECS (arg_info), + FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); + + /* restore values */ + INFO_APASSIGNS (arg_info) = old_ap_assigns; + INFO_APVARDECS (arg_info) = old_ap_vardecs; + INFO_APMODE (arg_info) = bypass; + + /* re-attach original next node to end of new assigns */ + newold_assign = arg_node; + while (ASSIGN_NEXT (newold_assign) != NULL) { + newold_assign = ASSIGN_NEXT (newold_assign); + } + + ASSIGN_NEXT (newold_assign) = old_next; + ASSIGN_NEXT (newold_assign) = TRAVopt (ASSIGN_NEXT (newold_assign), arg_info); + } else { + ASSIGN_NEXT (arg_node) = TRAVopt (ASSIGN_NEXT (arg_node), arg_info); + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief Store LHS of N_let before traversing RHS + * + * @param arg_node N_let + * @param arg_info info structure + * @return N_let + */ +node * +MEMRTlet (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + INFO_LETIDS (arg_info) = LET_IDS (arg_node); + LET_EXPR (arg_node) = TRAVdo (LET_EXPR (arg_node), arg_info); + INFO_LETIDS (arg_info) = NULL; + + DBUG_RETURN (arg_node); +} + +/** + * @brief Replace current N_id 
with one stored in LUT + * + * @param arg_node N_id + * @param arg_info info structure + * @return N_id + */ +node * +MEMRTid (node *arg_node, info *arg_info) +{ + node *avis; + + DBUG_ENTER (); + + if (INFO_INEMRLOOP (arg_info)) { + /* If this N_id occurs in a place other than the argument list + * of a recursive application of the enclosing do-fun, reset its + * N_avis to the new N_avis. This is necessary when + * a is lifted out of the do-fun, and therefore + * the device variable is passed to the do-fun as an argument + * instead of a locally declared/defined variable. */ + avis = LUTsearchInLutPp (INFO_LUT (arg_info), ID_AVIS (arg_node)); + if (avis != ID_AVIS (arg_node)) { + ID_AVIS (arg_node) = avis; + } + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief If the N_ap is the initial call for a EMRL affected loopfun, + * traverse the loopfun's body and lift out H2D memcpys + * + * Here we setup the info structure, creating two LUTs and storing some + * stateful information. Additionally we call `MEMRTtravToRecAp` to + * store the recursive loopfun N_ap and populate one of the LUTs. + * + * After traversing, we reset the info structure to a previous state. 
+ * + * @param arg_node N_ap + * @param arg_info info structure + * @return N_ap + */ +node * +MEMRTap (node *arg_node, info *arg_info) +{ + node *old_ap_args, *old_fundef, *old_rec_ap; + lut_t *old_lut, *old_reclut; + + DBUG_ENTER (); + + if (FUNDEF_ISLOOPFUN (AP_FUNDEF (arg_node)) + && FUNDEF_ISEMRLIFTED (AP_FUNDEF (arg_node))) { + if (INFO_FUNDEF (arg_info) != AP_FUNDEF (arg_node)) { /* initial application */ + DBUG_PRINT ("inspecting initial application of %s...", FUNDEF_NAME (AP_FUNDEF (arg_node))); + + /* traverse arguments first */ + AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); + + /* stack info fields */ + old_fundef = INFO_FUNDEF (arg_info); + old_ap_args = INFO_APARGS (arg_info); + old_rec_ap = INFO_REC_AP (arg_info); + old_lut = INFO_LUT (arg_info); + old_reclut = INFO_RECLUT (arg_info); + + /* initialise info fields */ + INFO_APARGS (arg_info) = AP_ARGS (arg_node); + INFO_APASSIGNS (arg_info) = NULL; + INFO_APVARDECS (arg_info) = NULL; + INFO_LUT (arg_info) = LUTgenerateLut (); + INFO_RECLUT (arg_info) = LUTgenerateLut (); + + /* we find the recursive N_ap and fill RECLUT with h2d arg to ret mappings */ + AP_FUNDEF (arg_node) = MEMRTtravToRecAp (AP_FUNDEF (arg_node), arg_info); + + INFO_APMODE (arg_info) = inap; + AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); + INFO_APMODE (arg_info) = afterap; + + /* reset all the info fields */ + INFO_LUT (arg_info) = LUTremoveLut (INFO_LUT (arg_info)); + INFO_LUT (arg_info) = old_lut; + INFO_RECLUT (arg_info) = LUTremoveLut (INFO_RECLUT (arg_info)); + INFO_RECLUT (arg_info) = old_reclut; + INFO_FUNDEF (arg_info) = old_fundef; + INFO_APARGS (arg_info) = old_ap_args; + INFO_REC_AP (arg_info) = old_rec_ap; + } + } else { + AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief If we find a `F_host2device` primitive, we check if its argument + * was created via EMRL lifting out an allocation. 
If so, we lift + * out the primitive and update the loopfun appropriately. + * + * Assuming we are in a EMRL affected loopfun, if the argument of a `F_host2device` + * primitive is a lifted allocation, we transfer the primitive and declaration via + * the info structure (see N_assign for application). Additionally we place into + * LUT the primitives RHS -> LHS, such that we update all subsequent references + * correctly. Finally we update the correct argument in the recursive N_ap. + * + * @param arg_node N_prf + * @param arg_info info structure + * @return N_prf + */ +node * +MEMRTprf (node *arg_node, info *arg_info) +{ + node *id, *id_decl, *aparg, *ret_avis, *recaparg, *recapexprs; + + DBUG_ENTER (); + + if (INFO_INEMRLOOP (arg_info)) { + switch (PRF_PRF (arg_node)) { + case F_host2device: + id = PRF_ARG1 (arg_node); + id_decl = ID_DECL (id); + + if (NODE_TYPE (id_decl) == N_arg) { + /* host var is passed as argument of do-loop */ + aparg = CUnthApArg (INFO_APARGS (arg_info), ARG_LINKSIGN (id_decl)); + DBUG_ASSERT (NODE_TYPE (aparg) == N_id, + "Arguments of N_ap must be N_id nodes!"); + if (AVIS_ISALLOCLIFT (ID_AVIS (aparg))) { + /* this var is the result of EMRL alloc lifting */ + DBUG_PRINT ("Found H2D that was EMRL lifted: %s (ap) -> %s", ID_NAME (aparg), ID_NAME (id)); + /* We change the argument, e.g. a_host to + * device variable, e.g. a_dev */ + node *vardec = IDS_DECL (INFO_LETIDS (arg_info)); + ARG_AVIS (id_decl) = DUPdoDupNode (VARDEC_AVIS (vardec)); + AVIS_SSAASSIGN (ARG_AVIS (id_decl)) = NULL; + AVIS_DECL (ARG_AVIS (id_decl)) = id_decl; + + /* Insert pair [N_vardec->avis] -> [N_arg->avis] into H2D + * table. Therefore, N_vardec->avis of any subsequent N_id + * nodes will be replaced by N_arg->avis. */ + INFO_LUT (arg_info) + = LUTinsertIntoLutP (INFO_LUT (arg_info), VARDEC_AVIS (vardec), + ARG_AVIS (id_decl)); + + /* Create N_vardec and in the calling context + * i.e. 
lifting the */ + node *new_avis = DUPdoDupNode (ARG_AVIS (id_decl)); + INFO_APVARDECS (arg_info) + = TBmakeVardec (new_avis, INFO_APVARDECS (arg_info)); + + INFO_APASSIGNS (arg_info) + = TBmakeAssign (TBmakeLet (TBmakeIds (new_avis, NULL), + TBmakePrf (F_host2device, + TBmakeExprs (TBmakeId ( + ID_AVIS (aparg)), + NULL))), + INFO_APASSIGNS (arg_info)); + + /* Replace the N_avis of ap_arg to the new device N_avis */ + ID_AVIS (aparg) = new_avis; + /* Maintain SSA property */ + AVIS_SSAASSIGN (new_avis) = INFO_APASSIGNS (arg_info); + + /* update recursive N_ap argument appropriately */ + recapexprs = TCgetNthExprs ((size_t)ARG_LINKSIGN (id_decl), AP_ARGS (INFO_REC_AP (arg_info))); + recaparg = EXPRS_EXPR (recapexprs); + ret_avis = LUTsearchInLutPp (INFO_RECLUT (arg_info), ID_AVIS (recaparg)); + if (ret_avis == ID_AVIS (recaparg)) { + DBUG_UNREACHABLE ("%s does not exist in RECLUT!", ID_NAME (recaparg)); + } + DBUG_PRINT ("replacing %s -> %s in recursive N_ap", ID_NAME (recaparg), AVIS_NAME (ret_avis)); + ID_AVIS (recaparg) = ret_avis; + } + } + break; + default: + PRF_ARGS (arg_node) = TRAVopt (PRF_ARGS (arg_node), arg_info); + break; + } + } + + DBUG_RETURN (arg_node); +} + +/** @} */ +/** @} */ +#undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/minimize_emr_transfers.h b/src/libsac2c/cuda/minimize_emr_transfers.h new file mode 100644 index 0000000000000000000000000000000000000000..4a05cecc4affe27f1c816614b098dc67ea1b1caf --- /dev/null +++ b/src/libsac2c/cuda/minimize_emr_transfers.h @@ -0,0 +1,16 @@ +#ifndef _SAC_CUDA_MEMRT_H_ +#define _SAC_CUDA_MEMRT_H_ + +#include "types.h" + +extern node *MEMRTdoMinimizeEMRTransfers (node *syntax_tree); + +extern node *MEMRTfundef (node *arg_node, info *arg_info); +extern node *MEMRTarg (node *arg_node, info *arg_info); +extern node *MEMRTassign (node *arg_node, info *arg_info); +extern node *MEMRTlet (node *arg_node, info *arg_info); +extern node *MEMRTid (node *arg_node, info *arg_info); +extern node *MEMRTap (node *arg_node, 
info *arg_info); +extern node *MEMRTprf (node *arg_node, info *arg_info); + +#endif /* _SAC_CUDA_MEMRT_H_ */ diff --git a/src/libsac2c/cuda/minimize_loop_transfers.c b/src/libsac2c/cuda/minimize_loop_transfers.c index 219ec57a85dae8000036ec5b1ceb5198f0ecf92e..1767313162462bab966a47f6cacd3650889d18d8 100644 --- a/src/libsac2c/cuda/minimize_loop_transfers.c +++ b/src/libsac2c/cuda/minimize_loop_transfers.c @@ -1,23 +1,17 @@ -/***************************************************************************** +/** + * @file + * @defgroup mltran Minimize Loop Transfers + * @ingroup cuda * - * @defgroup Lift memory transfers in loops whenever possible + * @brief Lift memory transfers in loops whenever possible * + * This module implements the transformation of lifting memory transfers + * (/) out of a do-fun. Memory transfers that + * are allowed to be moved out were tagged in the previous phase, i.e. + * Annotate Memory Transfer (AMTRAN). * - * This module implements the transformation of lifting memory transfers - * (/) out of a do-fun. Memory transfers that - * are allowed to be moved out were tagged in the previous phase, i.e. - * Annotate Memory Transfer (AMTRAN). 
- * - * - *****************************************************************************/ - -/** - * - * @file minimize_loop_transfers.c - * - * Prefix: MLTRAN - * - *****************************************************************************/ + * @{ + */ #include "minimize_loop_transfers.h" #include @@ -29,7 +23,7 @@ #include "memory.h" #include "globals.h" -#define DBUG_PREFIX "MTRAN" +#define DBUG_PREFIX "MLTRAN" #include "debug.h" #include "ctinfo.h" @@ -54,12 +48,10 @@ enum traverse_mode { trav_normalfun, trav_dofun }; -/** - * +/** * @name INFO structure * @{ - * - *****************************************************************************/ + */ struct INFO { bool indofun; node *letids; @@ -135,21 +127,18 @@ FreeInfo (info *info) DBUG_RETURN (info); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Entry functions * @{ - * - *****************************************************************************/ -/** - * - * @fn node *MLTRANdoMinimizeLoopTransfers( node *syntax_tree) - * - *****************************************************************************/ + */ + +/** + * @brief + * @param syntax_tree + * @return syntax tree + */ node * MLTRANdoMinimizeLoopTransfers (node *syntax_tree) { @@ -164,29 +153,26 @@ MLTRANdoMinimizeLoopTransfers (node *syntax_tree) info = FreeInfo (info); + DBUG_PRINT ("invoking DCR"); syntax_tree = DCRdoDeadCodeRemoval (syntax_tree); DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Traversal functions * @{ - * - *****************************************************************************/ + */ -/** - * - * @fn node *MLTRANfundef( node *arg_node, info *arg_info) - * +/** * @brief * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + 
*/ node * MLTRANfundef (node *arg_node, info *arg_info) { @@ -198,6 +184,7 @@ MLTRANfundef (node *arg_node, info *arg_info) /* If the function is not a do-fun, we traverse as normal */ if (!FUNDEF_ISLOOPFUN (arg_node)) { + DBUG_PRINT ("(not LOOP) Entering %s...", FUNDEF_NAME (arg_node)); FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); } else { @@ -205,6 +192,8 @@ MLTRANfundef (node *arg_node, info *arg_info) * otherwise we traverse the next N_fundef. */ if (INFO_TRAVMODE (arg_info) == trav_dofun) { + DBUG_PRINT ("(LOOP) Entering %s...", FUNDEF_NAME (arg_node)); + /* We assign a sequential number (starting from 0) * to each argument of the do-fun */ INFO_FUNARGNUM (arg_info) = 0; @@ -222,14 +211,13 @@ MLTRANfundef (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANarg( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_arg + * @param arg_info info structure + * @return N_arg + */ node * MLTRANarg (node *arg_node, info *arg_info) { @@ -243,14 +231,13 @@ MLTRANarg (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANassign( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_assign + * @param arg_info info structure + * @return N_assign + */ node * MLTRANassign (node *arg_node, info *arg_info) { @@ -279,10 +266,12 @@ MLTRANassign (node *arg_node, info *arg_info) if (INFO_APPOSTASSIGNS (arg_info) != NULL) { ASSIGN_NEXT (arg_node) = INFO_APPOSTASSIGNS (arg_info); + global.optcounters.cuda_min_trans+=1; } if (INFO_APPREASSIGNS (arg_info) != NULL) { arg_node = TCappendAssign (INFO_APPREASSIGNS (arg_info), arg_node); + global.optcounters.cuda_min_trans+=1; } FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) @@ 
-310,14 +299,13 @@ MLTRANassign (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANlet( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_let + * @param arg_info info structure + * @return N_let + */ node * MLTRANlet (node *arg_node, info *arg_info) { @@ -330,14 +318,13 @@ MLTRANlet (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANap( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_ap + * @param arg_info info structure + * @return N_ap + */ node * MLTRANap (node *arg_node, info *arg_info) { @@ -348,10 +335,14 @@ MLTRANap (node *arg_node, info *arg_info) DBUG_ENTER (); + DBUG_PRINT ("ap %s", FUNDEF_NAME (AP_FUNDEF (arg_node))); + /* If the N_ap->N_fundef is a do-fun */ if (FUNDEF_ISLOOPFUN (AP_FUNDEF (arg_node))) { /* If this is NOT a recursive application of the enclosing do-fun */ if (AP_FUNDEF (arg_node) != INFO_FUNDEF (arg_info)) { + DBUG_PRINT ("...non-recursive application"); + /* Traverse the N_ap arguments first */ AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); @@ -389,6 +380,7 @@ MLTRANap (node *arg_node, info *arg_info) } /* If this is a recursive application of the enclosing do-fun. 
*/ else { + DBUG_PRINT ("...recursive application"); INFO_ISRECURSIVEAPARGS (arg_info) = TRUE; INFO_RECURSIVEAPARGS (arg_info) = AP_ARGS (arg_node); AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); @@ -401,14 +393,13 @@ MLTRANap (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANid( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_id + * @param arg_info info structure + * @return N_id + */ node * MLTRANid (node *arg_node, info *arg_info) { @@ -482,14 +473,13 @@ MLTRANid (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANfuncond( node *syntax_tree) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_funcond + * @param arg_info info structure + * @return N_funcond + */ node * MLTRANfuncond (node *arg_node, info *arg_info) { @@ -566,14 +556,13 @@ MLTRANfuncond (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANreturn( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_return + * @param arg_info info structure + * @return N_return + */ node * MLTRANreturn (node *arg_node, info *arg_info) { @@ -634,14 +623,13 @@ MLTRANreturn (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANprf( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_prf + * @param arg_info info structure + * @return N_prf + */ node * MLTRANprf (node *arg_node, info *arg_info) { @@ -652,10 +640,12 @@ MLTRANprf (node *arg_node, info *arg_info) if (INFO_INDOFUN (arg_info)) { switch (PRF_PRF (arg_node)) { case F_host2device: + id = PRF_ARG1 (arg_node); + 
DBUG_PRINT ("prf host2device %s -> %s", ID_NAME (id), IDS_NAME (INFO_LETIDS (arg_info))); if (!ASSIGN_ISNOTALLOWEDTOBEMOVEDUP ((INFO_LASTASSIGN (arg_info)))) { - id = PRF_ARG1 (arg_node); + DBUG_PRINT ("...can be moved up"); DBUG_ASSERT (NODE_TYPE (ID_DECL (id)) == N_arg, - "Host variable of is not declared as an N_arg!"); + "Host variable of H2D is not declared as an N_arg!"); /* If the is allowed to be moved out of the do-fun, * the host variable argument can be replaced by the device variable. * Note that if can be moved out, the host variable @@ -722,7 +712,9 @@ MLTRANprf (node *arg_node, info *arg_info) } break; case F_device2host: + DBUG_PRINT ("prf device2host"); if (!ASSIGN_ISNOTALLOWEDTOBEMOVEDDOWN ((INFO_LASTASSIGN (arg_info)))) { + DBUG_PRINT ("...can be moved down"); /* We insert the pair [N_id(host)->avis] -> [N_id(device)->avis] * into D2H table. */ INFO_D2HLUT (arg_info) @@ -739,12 +731,6 @@ MLTRANprf (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ - +/** @} */ +/** @} */ #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/minimize_transfers.c b/src/libsac2c/cuda/minimize_transfers.c index 7d35775280529829851130bd1ab5802e2959f801..8e116e0cfec08e3cdf652cf0a538d01e76bac79d 100644 --- a/src/libsac2c/cuda/minimize_transfers.c +++ b/src/libsac2c/cuda/minimize_transfers.c @@ -1,30 +1,21 @@ -/***************************************************************************** +/** + * @file + * @defgroup mtran Minimize Transfers + * @ingroup cuda * - * @defgroup + * This is a driver module for several transformations aiming at minimizing + * the number of host<->device memory transfers. These three transformations + * are applied in a cyclic fashion since one optimization might expose more + * opportunities for another optimization. 
The number of cycles is currently + * set at max_optcycles (globals), but will terminate early if we've reached a + * fixed point. * - * - * This is a driver module for three transformations aiming at minimizing - * the number of host<->device memory transfers. These three transformations - * are applied in a cyclic fashion since one optimization might expose more - * opportunities for another optimization. The number of cycles is currently - * set at 10. However, a better approach would be to stop the cycle when no - * changes occur to the AST (Unfortunately, I have yet figurred out how to - * do it). For details of each transformation, please refer to the individual - * module files. - * - * - *****************************************************************************/ - -/** - * - * @file minimize_transfers.c - * - * Prefix: MTRAN - * - *****************************************************************************/ + * @{ + */ #include "minimize_transfers.h" -#include +#include "phase.h" +#include "traverse_optcounter.h" #define DBUG_PREFIX "MTRAN" #include "debug.h" @@ -35,52 +26,101 @@ #include "minimize_loop_transfers.h" #include "minimize_cond_transfers.h" #include "minimize_cudast_transfers.h" +#include "minimize_emr_transfers.h" +#include "loop_invariant_removal.h" #include "globals.h" #include "wl_descalarization.h" -/** - * +/** * @name Entry functions * @{ + */ + +/** + * @brief Applies various optimisations to the syntax tree, to minimize CUDA + * memcpy operations.
* - *****************************************************************************/ -/** - * - * @fn node *MLTRANdoMinimizeLoopTransfers( node *syntax_tree) - * - *****************************************************************************/ + * @param syntax_tree + * @return the syntax tree + */ node * MTRANdoMinimizeTransfers (node *syntax_tree) { + int i; + bool done = false; + + TOC_SETUP(1, COUNT_TRL) + DBUG_ENTER (); - int i, j; + DBUG_PRINT ("Performaing CUDA Minimize Transfers Optimistions"); + + if (global.optimize.doexpar) { + DBUG_PRINT ("Doing expar optimisation cycle:"); + for (i = 1; i < global.max_optcycles; i++) { + /* XXX disabled for some reason, further investigation needed */ + TOC_RUNOPT ("MBTRAN2", false, COUNT_TRL, + global.optcounters.cuda_min_trans, + syntax_tree, MBTRAN2doMinimizeBlockTransfers) + TOC_RUNOPT ("ACTRAN", true, TOC_IGNORE, 0, + syntax_tree, ACTRANdoAnnotateCondTransfers) + TOC_RUNOPT ("MCTRAN", true, COUNT_TRL, + global.optcounters.cuda_min_trans, + syntax_tree, MCTRANdoMinimizeCondTransfers) + + TOC_COMPARE (done) - if (global.backend == BE_cuda && global.optimize.doexpar) { - i = 0; - while (i < 10) { - /* syntax_tree = MBTRAN2doMinimizeBlockTransfers( syntax_tree); */ - syntax_tree = ACTRANdoAnnotateCondTransfers (syntax_tree); - syntax_tree = MCTRANdoMinimizeCudastCondTransfers (syntax_tree); - i++; + if (done) { + break; + } } + DBUG_PRINT ("Completed expar optimisation cycle after %d cycles", i); } - j = 0; - while (j < 10) { - syntax_tree = MCSTRANdoMinimizeCudastTransfers (syntax_tree); - syntax_tree = MBTRAN2doMinimizeBlockTransfers (syntax_tree); - syntax_tree = ACTRANdoAnnotateCondTransfers (syntax_tree); - syntax_tree = MCTRANdoMinimizeCondTransfers (syntax_tree); + /* reset counters for next cycle */ + TOC_RESETCOUNTERS () + done = false; + + DBUG_PRINT ("Doing general optimisation cycle:"); + for (i = 1; i < global.max_optcycles; i++) { + + TOC_RUNOPT ("MCSTRAN", true, COUNT_TRL, 
global.optcounters.cuda_min_trans, + syntax_tree, MCSTRANdoMinimizeCudastTransfers) + TOC_RUNOPT ("MBTRAN2", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MBTRAN2doMinimizeBlockTransfers) + TOC_RUNOPT ("ACTRAN", true, TOC_IGNORE, 0, + syntax_tree, ACTRANdoAnnotateCondTransfers) + TOC_RUNOPT ("MCTRAN", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MCTRANdoMinimizeCondTransfers) /* make sure the lifted transfer are removed when ever * possible before minimizing transfers in loops. */ - syntax_tree = MBTRAN2doMinimizeBlockTransfers (syntax_tree); - /*********************************************************/ - syntax_tree = AMTRANdoAnnotateMemoryTransfers (syntax_tree); - syntax_tree = MLTRANdoMinimizeLoopTransfers (syntax_tree); - j++; + TOC_RUNOPT ("MBTRAN2", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MBTRAN2doMinimizeBlockTransfers) + TOC_RUNOPT ("AMTRAN", true, TOC_IGNORE, 0, + syntax_tree, AMTRANdoAnnotateMemoryTransfers) + TOC_RUNOPT ("MLTRAN", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MLTRANdoMinimizeLoopTransfers) + + /* For any EMR lifted allocations which are H2Ds within a do-loop, + * we artificially lift these out, similar to MLTRAN above. We assume + * that because of the buffer-swapping, there is always a suitable + * device type to pass in as part of recursive call within the do-loop. 
+ */ + TOC_RUNOPT ("MEMRT", global.optimize.doemrci && global.optimize.domemrt, + COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MEMRTdoMinimizeEMRTransfers) + + TOC_COMPARE (done) + + DBUG_PRINT ("Counter: Lift -> %zu", + (global.optcounters.cuda_min_trans - TOC_GETCOUNTER (COUNT_TRL))); + + if (done) { + break; + } } + DBUG_PRINT ("Completed general optimisation cycle after %d cycles", i); /* We perform loop invariant removal here because we found out * that that there are certained cases that are ignored by our @@ -89,13 +129,11 @@ MTRANdoMinimizeTransfers (node *syntax_tree) * regard to array "features" in kmeans.sac in the CUDA Rodinia * benchmark suite. */ - // syntax_tree = LIRdoLoopInvariantRemoval( syntax_tree); + // syntax_tree = DLIRdoLoopInvariantRemoval (syntax_tree); DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ - +/** @} */ +/** @} */ #undef DBUG_PREFIX diff --git a/src/libsac2c/memory/emr_loop_optimisation.c b/src/libsac2c/memory/emr_loop_optimisation.c index c935bf2056749708b8df925c4896a7c5a7721802..046f666d0c5d99cfacf434478f840cd82326496a 100644 --- a/src/libsac2c/memory/emr_loop_optimisation.c +++ b/src/libsac2c/memory/emr_loop_optimisation.c @@ -56,7 +56,7 @@ typedef enum emrl_context {EMRL_rec, EMRL_ap} emrl_context_t; * node. See EMRL related functions for more info. 
*/ typedef struct stack_node_s { - node * wl; /**< either a N_modarray or N_genarray */ + node *wl; /**< either a N_modarray or N_genarray */ node * avis; /**< our new avis */ struct stack_node_s * next; } stack_node_t; @@ -206,6 +206,25 @@ isSameShapeAvis (node * avis, node * exprs) DBUG_RETURN (ret); } +/** + * @brief Create a new temporary avis which copies the ntype of + * an existing avis + * + * @param type Some NType + * @return a new avis + */ +static inline node * +createTmpAvis (ntype *type) +{ + node *avis; + + avis = TBmakeAvis (TRAVtmpVarName ("emr_tmp"), TYcopyType (type)); + + DBUG_PRINT (" created %s var", AVIS_NAME (avis)); + + return avis; +} + /** * @brief Collect LHS of N_let and traverse the exprs * @@ -269,9 +288,7 @@ EMRLgenarray (node * arg_node, info * arg_info) DBUG_PRINT (" genarray in loopfun has no RCs or ERCs, generating tmp one!"); /* the new avis must have the same type/shape as genarray shape */ - new_avis = TBmakeAvis (TRAVtmpVarName ("emr_tmp"), - TYcopyType (IDS_NTYPE (INFO_LHS (arg_info)))); - DBUG_PRINT (" created %s var", AVIS_NAME (new_avis)); + new_avis = createTmpAvis (IDS_NTYPE (INFO_LHS (arg_info))); /* add to stack - this will be used in N_ap */ INFO_STACK (arg_info) = stack_push (INFO_STACK (arg_info), arg_node, new_avis); @@ -312,9 +329,7 @@ EMRLmodarray (node * arg_node, info * arg_info) DBUG_PRINT (" modarray in loopfun has no RCs or ERCs, generating tmp one!"); /* the new avis must have the same type/shape as modarray shape */ - new_avis = TBmakeAvis (TRAVtmpVarName ("emr_tmp"), - TYcopyType (IDS_NTYPE (INFO_LHS (arg_info)))); - DBUG_PRINT (" created %s var", AVIS_NAME (new_avis)); + new_avis = createTmpAvis (IDS_NTYPE (INFO_LHS (arg_info))); /* add to stack - this will be used in N_ap */ INFO_STACK (arg_info) = stack_push (INFO_STACK (arg_info), arg_node, new_avis); @@ -487,6 +502,9 @@ EMRLfundef (node * arg_node, info * arg_info) FUNDEF_ARGS (arg_node) = TCappendArgs (FUNDEF_ARGS (arg_node), INFO_ARGS (arg_info)); 
INFO_ARGS (arg_info) = NULL; + + /* mark fundef as having been touched by EMRL - this used later in CUDA MEMRT */ + FUNDEF_ISEMRLIFTED (arg_node) = TRUE; } INFO_FUNDEF (arg_info) = NULL; diff --git a/src/libsac2c/stdopt/optimize.mac b/src/libsac2c/stdopt/optimize.mac index 251e2fc82b1c736fdf274c610f0da5daf0488880..a4ee41559b85e209824fb87a2acb71b6449e66df 100644 --- a/src/libsac2c/stdopt/optimize.mac +++ b/src/libsac2c/stdopt/optimize.mac @@ -107,6 +107,7 @@ OPTIMIZE ("pra", pra, FALSE, FALSE, "polyhedra data reuse optimization") OPTIMIZE ("emrci", emrci, FALSE, FALSE, "EMR candidate inference") OPTIMIZE ("emrcf", emrcf, TRUE, TRUE, "EMR candidate filtering") OPTIMIZE ("emrl", emrl, TRUE, TRUE, "EMR loop memory optimisation") +OPTIMIZE ("memrt", memrt, TRUE, TRUE, "Minimize memcpy transfers for EMRL affected loop functions") OPTIMIZE ("rnb", rnb, FALSE, FALSE, "remove noop conditional branch in with-loops") OPTIMIZE ("rwo", rwo, TRUE, TRUE, "memory reuse with offset") OPTIMIZE ("rip", rip, TRUE, TRUE, "memory reuse with in place selection") @@ -223,6 +224,8 @@ OPTCOUNTER (safa_expr, TRUE, "associative function argument(s) sorted") OPTCOUNTER (pogo_expr, FALSE, "guards removed by pogo") OPTCOUNTER (pwlf_expr, FALSE, "with-loops folded using polyhedra") OPTCOUNTER (ssawl_expr, FALSE, "with-loops converted to SSA form") +/* optimisation counters for CUDA backend */ +OPTCOUNTER (cuda_min_trans, TRUE, "transfer primitives are out lifted") #undef OPTIMIZEstr #undef OPTIMIZEabbr diff --git a/src/libsac2c/tree/traverse_optcounter.h b/src/libsac2c/tree/traverse_optcounter.h new file mode 100644 index 0000000000000000000000000000000000000000..a163ab611cba3e1d3b1f051381676977e9c731de --- /dev/null +++ b/src/libsac2c/tree/traverse_optcounter.h @@ -0,0 +1,196 @@ +/** + * @file + * @brief Alternative phase cycle driver, similar to actual phase cycle driver + * + * This set of macros are meant to be used to driver some sort of cyclical + * operation, such as apply several 
traversals one after the other, to a + * fixed point. This is very similar to what is happening in the compiler phase + * driver (@see global/phase.c). The main difference here is that we can count + * anything we want, as the user specifies what variable to assign to. There + * is no requirement to only use the OPTCOUNTERS (@see stdopt/optimize.mac). + * + * This does make things more verbose though, compared to using the statistics + * helper (@see stdopt/statistics.c) to manipulate the OPTCOUNTERS. + * + * The typical use case is where one currently applies several traversals + * n times, and always n times: + * + * ~~~~ + * for (i = 1; i < 10; i++) { + * node = fun1 (node); + * if (doopt) { + * node = fun2 (node); + * } + * node = fun3 (node); + * node = fun4 (node); + * } + * ~~~~ + * + * We can change this over to: + * + * ~~~~ + * TOC_SETUP (2, COUNT_ONE, COUNT_TWO) + * bool test = false; + * + * TOC_SETCOUNTER (COUNT_TWO, 10) + * + * for (i = 1; i < global.max_optcycles; i++) { + * TOC_RUNOPT ("OPT1", true, COUNT_ONE, some_count_value, node, fun1) + * TOC_RUNOPT ("OPT2", doopt, TOC_IGNORE, 0, node, fun2) + * TOC_RUNOPT ("OPT3", true, COUNT_ONE, some_count_value2, node, fun3) + * TOC_RUNOPT ("OPT4", true, COUNT_TWO, some_count_value3, node, fun4) + * + * TOC_COMPARE (test) + * + * printf ("Counter: ONE -> %zu, TWO -> %zu\n", + * TOC_GETCOUNTER (COUNT_ONE), + * TOC_GETCOUNTER (COUNT_TWO)); + * + * if (test) + * break; + * } + * ~~~~ + * + */ +#ifndef _TREE_TRAVERSE_OPT_COUNTER_H_ +#define _TREE_TRAVERSE_OPT_COUNTER_H_ + +#include "phase.h" + +/** + * @brief Setup and initialise all needed variables + * + * Here the user passes in _names_ of counters, which are kept stored in an + * enum which is used to access an array, where all the counter values are stored. + * + * A special counter, called `TOC_IGNORE`, is already set. This can be used instead + * of a real counter in cases where nothing is being stored. 
+ * + * @param num Number of counter names being passed in + * @param ... The counter names (it is suggested that these should be in all-caps) + */ +#define TOC_SETUP(num, ...) \ + enum toc_optcounter_labels { TOC_IGNORE, __VA_ARGS__ }; \ + __attribute__((unused)) const size_t toc_optcount_size = num+1; \ + size_t toc_store[num+1] = {0}; \ + size_t toc_store_old[num+1] = {0}; \ + __attribute__((unused)) size_t toc_i; + +/** + * @brief Compare current counter state with previous counter state over a + * specified _range_ of counters. + * + * We iteratively compare the counter values between the current state and previous + * state. If all counter states are found to be **equal**, then the cycle has reached + * a fixed-point. If however one or more counter states are **unequal**, we continue + * the next iteration of the cycle. + * + * @param start Some label or integer indicating the start; **cannot** be less-than zero + * @param end Some label or integer indicating the end; **cannot** be greater than number + * of total counters + * @param out Variable used to store boolean result: if _true_, we've reached a + * fixed-point + */ +#define TOC_COMPARE_RANGE(start, end, out) \ + for (toc_i = start, out = true; toc_i < end; toc_i++) { \ + out = out && toc_store[toc_i] == toc_store_old[toc_i]; \ + toc_store_old[toc_i] = toc_store[toc_i]; \ + } + +/** + * @brief Compare current counter state with previous counter state for all counters + * + * @see TOC_COMPARE_RANGE + * + * @param out Variable used to store boolean result: if _true_, we've reached a + * fixed-point + */ +#define TOC_COMPARE(out) TOC_COMPARE_RANGE(1, toc_optcount_size, out) + +/** + * @brief Set counter to specified value + * + * @param label The counter name + * @param val The value to set + */ +#define TOC_SETCOUNTER(label, val) \ + toc_store[label] = toc_store_old[label] = val; + +/** + * @brief Get the current counter value + * + * @param label The counter name + * @return a value as type `size_t` + */
+#define TOC_GETCOUNTER(label) (toc_store[label]) + +/** + * @brief Reset all counters (current and previous state, including TOC_IGNORE) to default value (zero) + */ +#define TOC_RESETCOUNTERS() \ + for (toc_i = 0; toc_i < toc_optcount_size; toc_i++) { \ + toc_store[toc_i] = 0; \ + toc_store_old[toc_i] = 0; \ + } + +#ifdef DBUG_OFF + +/* in production compiler PHrunConsistencyChecks is disabled */ +#define TOC_RUNCHECK(name, node) +#define TOC_RUNCHECK_TAG(tag, name, node) + +#else /* !DBUG_OFF */ + +#define TOC_RUNCHECK(name, node) \ + if (global.check_frequency >= 3) { \ + DBUG_PRINT ("Cycle iteration %d: running post-" name " check", i); \ + node = PHrunConsistencyChecks (node); \ + } + +#define TOC_RUNCHECK_TAG(tag, name, node) \ + if (global.check_frequency >= 3) { \ + DBUG_PRINT_TAG (tag, "Cycle iteration %d: running post-" name " check", i); \ + node = PHrunConsistencyChecks (node); \ + } + +#endif /* DBUG_OFF */ + +/** + * @brief Perform one call of the given optimisation/traversal function. Note that `i`, which is not a parameter, is passed to DBUG_PRINT as the cycle iteration number. + * + * @param name String literal to use for printouts (it is concatenated onto the message text) + * @param cond Some condition to control when the traversal should run + * @param label Name of the counter to use + * @param stmt Expression whose value is stored in the counter; evaluated before `fun` is called + * @param node The node to pass to the function; it is reassigned with the result of `fun` + * @param fun The function to be called + */ +#define TOC_RUNOPT(name, cond, label, stmt, node, fun) \ + if (cond) { \ + DBUG_PRINT ("Cycle iteration %d: running " name, i); \ + toc_store[label] = stmt; \ + node = fun (node); \ + TOC_RUNCHECK (name, node) \ + } + +/** + * @brief Perform one call of the given optimisation/traversal function using + * a specified TAG for printing. Note that `i`, which is not a parameter, is passed to DBUG_PRINT_TAG as the cycle iteration number. + * + * @param tag Some string label for printouts + * @param name String literal to use for printouts (it is concatenated onto the message text) + * @param cond Some condition to control when the traversal should run + * @param label Name of the counter to use + * @param stmt Expression whose value is stored in the counter; evaluated before `fun` is called + * @param node The node to pass to the function; it is reassigned with the result of `fun` + * @param fun The function to be called + */ +#define
TOC_RUNOPT_TAG(tag, name, cond, label, stmt, node, fun) \ + if (cond) { \ + DBUG_PRINT_TAG (tag, "Cycle iteration %d: running " name, i); \ + toc_store[label] = stmt; \ + node = fun (node); \ + TOC_RUNCHECK_TAG (tag, name, node) \ + } + +#endif /* _TREE_TRAVERSE_OPT_COUNTER_H_ */ diff --git a/src/libsac2c/xml/ast.xml b/src/libsac2c/xml/ast.xml index bb476d0145887b13ae7b45c0c9afeb950b9882c9..9a813dc999b846b5382934607a3d52d6c802bc8f 100644 --- a/src/libsac2c/xml/ast.xml +++ b/src/libsac2c/xml/ast.xml @@ -494,6 +494,17 @@ + + + + + + + + + + + @@ -6000,6 +6011,11 @@ N_tfarg : Indicates whether or not we need to generate a declaration within header.c when linking to an external library. + + + TRUE iff this fundef has been affected by the EMRL optimisation. + + diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 4a25ece10ab05a454df3df303a77d1cf4e393e13..1294194181bcddc0f38a9f497a25e6252be03e65 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -78,6 +78,7 @@ ADD_FUNC_TEST (string-tests string-tests.cpp) ADD_FUNC_TEST (test-assoc-law test-assoc-law.cpp) ADD_FUNC_TEST (test-icm-compilation test-icm-compilation.cpp) ADD_FUNC_TEST (test-macros test-macros.cpp) +ADD_FUNC_TEST (test-traverse-optcounter test-traverse-optcounter.cpp) # libsac + runtime tests # XXX (hans) we can only create one test suite, *not* per-target, due to name-clashes diff --git a/src/tests/test-traverse-optcounter.cpp b/src/tests/test-traverse-optcounter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..54f02ba5a8ac3219003394d5a170fc228cfc06e7 --- /dev/null +++ b/src/tests/test-traverse-optcounter.cpp @@ -0,0 +1,82 @@ +#include "gtest/gtest.h" +#include "config.h" + +/* we safely ignore these */ +#define DBUG_PRINT(smt, ...) +#define DBUG_PRINT_TAG(tag, smt, ...)
+#define DBUG_OFF /* to not call phase.c functions */ + +extern "C" { +#include "traverse_optcounter.h" +} + +static int counter = 0; + +static int +testFunction (int input) +{ + counter++; + return input; +} + +TEST (MACRO_OPTCOUNTER, Setup) +{ + TOC_SETUP (2, COUNT_ONE, COUNT_TWO) + + ASSERT_TRUE (toc_optcount_size == 3); + ASSERT_TRUE (toc_store[TOC_IGNORE] == 0); + ASSERT_TRUE (toc_store[COUNT_ONE] == 0); + ASSERT_TRUE (toc_store_old[COUNT_ONE] == 0); +} + +TEST (MACRO_OPTCOUNTER, SetAndGetCounter) +{ + TOC_SETUP (1, COUNT_ONE) + + ASSERT_TRUE (TOC_GETCOUNTER (COUNT_ONE) == 0); + TOC_SETCOUNTER (COUNT_ONE, 2) + ASSERT_TRUE (TOC_GETCOUNTER (COUNT_ONE) == 2); + ASSERT_TRUE (toc_store_old[COUNT_ONE] == 2); + + TOC_RESETCOUNTERS () + ASSERT_TRUE (TOC_GETCOUNTER (COUNT_ONE) == 0); + ASSERT_TRUE (toc_store_old[COUNT_ONE] == 0); +} + +TEST (MACRO_OPTCOUNTER, CompareCounters) +{ + bool test = false; + TOC_SETUP (3, COUNT_ONE, COUNT_TWO, COUNT_THREE) + + TOC_COMPARE (test) + + ASSERT_TRUE (test); + + toc_store[COUNT_TWO] = 10; + + TOC_COMPARE (test) + + ASSERT_FALSE (test); +} + +TEST (MACRO_OPTCOUNTER, RunOpt) +{ + int t = 4; + TOC_SETUP (2, COUNT_ONE, COUNT_TWO) + counter = 0; /* the expected values below assume a fresh call count (e.g. under --gtest_repeat) */ + TOC_RUNOPT ("Blah", true, COUNT_ONE, counter, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 0); + ASSERT_TRUE (toc_store[COUNT_TWO] == 0); + TOC_RUNOPT ("Blah", true, COUNT_ONE, counter, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 1); + ASSERT_TRUE (toc_store[COUNT_TWO] == 0); + TOC_RUNOPT ("Blah", true, TOC_IGNORE, 0, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 1); + ASSERT_TRUE (toc_store[COUNT_TWO] == 0); + TOC_RUNOPT ("Blah", false, COUNT_TWO, counter, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 1); + ASSERT_TRUE (toc_store[COUNT_TWO] == 0); + TOC_RUNOPT ("Blah", true, COUNT_TWO, counter, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 1); + ASSERT_TRUE (toc_store[COUNT_TWO] == 3); +} diff --git a/tests/cuda/test-memrt-lift1.sac
b/tests/cuda/test-memrt-lift1.sac new file mode 100644 index 0000000000000000000000000000000000000000..13d451dbe12ee856820f5508a87066c4d816e0f4 --- /dev/null +++ b/tests/cuda/test-memrt-lift1.sac @@ -0,0 +1,50 @@ +// This is only for the CUDA-backend, to test the MEMRT traversal, which in effect +// translates an existing transformation by the EMRL traversal (to lift out allocations +// from loops) into something more suitable for CUDA execution. +// +// SAC_TEST|include common.mk +// SAC_TEST|SAC2C_FLAGS += -t cuda -doemrci -doemrcf -doemrl -bcuda:mtran +// SAC_TEST|all: +// SAC_TEST|@$(SAC2C) $(SAC2C_FLAGS) -nomemrt $< | $(GREP_COMMAND_OUTPUT) '_Loop_1( .*, A, [^, ]*_emr_lifted)\|_Loop_1( .*, [^, ]*_A, A)' 2 +// SAC_TEST|@$(SAC2C) $(SAC2C_FLAGS) -domemrt $< | $(GREP_COMMAND_OUTPUT) '_Loop_1( .*, [^, ]*_A, [^, ]*_dev)\|_host2device_( [^, ]*_emr_lifted);' 2 +#include "mini-stdlib.sac" + +inline +int[+] onestep (int[+] B) +{ + A = with { + (. < x < .) : 2 * (B[x+[1,0]] + + B[x-[1,0]] + + B[x+[0,1]] + + B[x-[0,1]]); + } : modarray (B); + + return(A); +} + +noinline +int[+] operation (int[+] A) +{ + steps = 100; + + do { + B = A; + A = onestep (B); + steps--; + } while ((sum (A) > sum (B)) && (steps > 0)); + + return (A); +} + +int main () +{ + A = with { + ([0,1] <= x <= .) : 0; + } : genarray ([1000,1000], 500); + + A = operation (A); + + return _toi_S_ (A[100,100]); +} + + diff --git a/tests/mini-stdlib.sac b/tests/mini-stdlib.sac index 593b11e468f9c105ecf09cf3dae86c1d6bb7eed4..2b2932f87aa8142743a79b71894da98a60bb67b3 100644 --- a/tests/mini-stdlib.sac +++ b/tests/mini-stdlib.sac @@ -21,33 +21,25 @@ inline bool != (bool a, bool b) { return _neq_SxS_ (a, b); } // Selection functions -inline int[*] sel(int[.] idx, int[*] array) -{ - new_shape = _drop_SxV_ (_sel_VxA_ ([0], _shape_A_ (idx)), - _shape_A_ (array)); - return with { - (. <= iv <= .)
{ - new_idx = _cat_VxV_ (idx, iv); - } : _sel_VxA_ (new_idx, array); - } : genarray (new_shape, 0); -} - -inline int[*] sel (int idx, int[*] a) -{ - return sel ([idx], a); -} - -inline bool[*] sel (int[.] idx, bool[*] array) -{ - new_shape = _drop_SxV_ (_sel_VxA_ ([0], _shape_A_ (idx)), - _shape_A_ (array)); - return with { - (. <= iv <= .) { - new_idx = _cat_VxV_ (idx, iv); - } : _sel_VxA_ (new_idx, array); - } : genarray (new_shape, false); +#define SEL_A_(typ, def) \ +inline typ[*] sel(int[.] idx, typ[*] array) \ +{ \ + new_shape = _drop_SxV_ (_sel_VxA_ ([0], _shape_A_ (idx)), \ + _shape_A_ (array)); \ + return with { \ + (. <= iv <= .) { \ + new_idx = _cat_VxV_ (idx, iv); \ + } : _sel_VxA_ (new_idx, array); \ + } : genarray (new_shape, def); \ +} \ + \ +inline typ[*] sel (int idx, typ[*] a) \ +{ \ + return sel ([idx], a); \ } +SEL_A_(int, 0) +SEL_A_(bool, false) // Shape inline int[.] shape (bool[*] a) { return _shape_A_ (a); } @@ -61,6 +53,7 @@ inline int[.] drop (int a, int[.] b) { return _drop_SxV_ (a,b); } // Increment inline int ++ (int a) { return _add_SxS_ (a, 1); } +inline int -- (int a) { return _sub_SxS_ (a, 1); } // Mixed scalar-vector operations inline int[.] + (int a, int[.] b) { return _add_SxV_ (a, b); }