From 9af256c35c9c944868e9ed693a0e7355edb37d53 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Tue, 2 Apr 2019 13:06:17 +0100 Subject: [PATCH 01/17] remove stale CUDA module The insert_memory_transfers module was split into several different files (insert_withloop_memtran.c & insert_cudast_memtran.c) by Jing starting from the 26-03-2010, and this was left in the git history. This module is not used anywhere, and as far as I can tell, provides nothing in addition to the other modules. As such its safe to remove the files. Additionally, I've identified the following libsac2c modules which are also stale: icm_betest.c icm_trace.c icm_comment.c BEtest.c wl_empty_result_handling.c create_lac_fun.c wl_descalarization.c cuda_create_dfg.c idag.c elemqueue.c elemlist.c cygcompat.c cygwinhelpers.c One needs to take a closer look at this and see if they are still useful. --- src/libsac2c/cuda/insert_memory_transfers.c | 806 -------------------- src/libsac2c/cuda/insert_memory_transfers.h | 20 - 2 files changed, 826 deletions(-) delete mode 100644 src/libsac2c/cuda/insert_memory_transfers.c delete mode 100644 src/libsac2c/cuda/insert_memory_transfers.h diff --git a/src/libsac2c/cuda/insert_memory_transfers.c b/src/libsac2c/cuda/insert_memory_transfers.c deleted file mode 100644 index 926a77e9d..000000000 --- a/src/libsac2c/cuda/insert_memory_transfers.c +++ /dev/null @@ -1,806 +0,0 @@ -/** - * - * @defgroup Insert CUDA memory transfer primitives - * - * - * This module inserts CUDA type conversion primitives before and after - * each cudarizable N_with. The two primitives are and - * . They are used to trasfer the data of a host(device) array - * variable to a device(host) array variable. This is essentially - * compiled into host<->device memory transfers in the backend. As an - * example: - * - * a_host = with - * { - * ... = b_host; - * ... = c_host; - * ... 
= d_host; - * }:genarray( shp); - * - * is transformed into: - * - * b_dev = host2device( b_host); - * c_dev = host2device( c_host); - * d_dev = host2device( d_host); - * a_dev = with - * { - * ... = b_dev; - * ... = c_dev; - * ... = d_dev; - * }:genarray( shp); - * a_host = device2host( a_dev); - * - * Note that simple scalar variables need not be type converted since they - * can be passed as function parameters directly to CUDA kernels. - * - * @ingroup - * - * @{ASSIGN_STMT( arg_node) - * - *****************************************************************************/ - -/** - * - * @file cuda_type_conversion.c - * - * Prefix: IMEM - * - *****************************************************************************/ -#include "insert_memory_transfers.h" - -/* - * Other includes go here - */ -#include -#include "tree_basic.h" -#include "tree_compound.h" -#include "str.h" -#include "str_buffer.h" -#include "memory.h" -#include "globals.h" - -#define DBUG_PREFIX "UNDEFINED" -#include "debug.h" - -#include "ctinfo.h" -#include "traverse.h" -#include "free.h" -#include "DupTree.h" -#include "print.h" -#include "new_types.h" -#include "LookUpTable.h" -#include "math_utils.h" -#include "types.h" -#include "type_utils.h" -#include "cuda_utils.h" -#include "DataFlowMask.h" -#include "DataFlowMaskUtils.h" -#include "remove_dfms.h" -#include "infer_dfms.h" - -/** - * - * @name INFO structure - * @{ - * - *****************************************************************************/ -struct INFO { - node *fundef; - bool in_cudawl; - bool create_d2h; - node *postassigns; - node *preassigns; - lut_t *lut; - lut_t *notran; - node *let_expr; - bool is_modarr; - bool in_cexprs; - bool from_ap; -}; - -/* - * INFO_FUNDEF N_fundef node of the enclosing function - * - * INFO_INCUDAWL Flag indicating whether the code currently being - * traversed is in a cudarizable N_with - * - * INFO_CREATE_D2H Flag indicating whether needs to be - * created for the N_let->N_ids - * - * 
INFO_POSTASSIGNS Chain of that needs to be appended - * at the end of the current N_assign - * - * INFO_PREASSIGNS Chain of that needs to be prepended - * at the beginning of the current N_assign - * - * INFO_LUT Lookup table storing pairs of Avis(host)->Avis(device) - * e.g. Given a_dev = host2device( a_host), - * Avis(a_host)->Avis(a_dev) will be stored into the table - * - * INFO_NOTRAN Lookup table storing N_avis of arrays varaibles that - * no data transfers should be created. - * - */ - -#define INFO_FUNDEF(n) (n->fundef) -#define INFO_INCUDAWL(n) (n->in_cudawl) -#define INFO_CREATE_D2H(n) (n->create_d2h) -#define INFO_POSTASSIGNS(n) (n->postassigns) -#define INFO_PREASSIGNS(n) (n->preassigns) -#define INFO_LUT(n) (n->lut) -#define INFO_NOTRAN(n) (n->notran) -#define INFO_LETEXPR(n) (n->let_expr) -#define INFO_IS_MODARR(n) (n->is_modarr) -#define INFO_IN_CEXPRS(n) (n->in_cexprs) -#define INFO_FROM_AP(n) (n->from_ap) - -static info * -MakeInfo () -{ - info *result; - - DBUG_ENTER (); - - result = MEMmalloc (sizeof (info)); - - INFO_FUNDEF (result) = NULL; - INFO_INCUDAWL (result) = FALSE; - INFO_CREATE_D2H (result) = FALSE; - INFO_POSTASSIGNS (result) = NULL; - INFO_PREASSIGNS (result) = NULL; - INFO_LUT (result) = NULL; - INFO_NOTRAN (result) = NULL; - INFO_IS_MODARR (result) = FALSE; - INFO_IN_CEXPRS (result) = FALSE; - INFO_FROM_AP (result) = FALSE; - - DBUG_RETURN (result); -} - -static info * -FreeInfo (info *info) -{ - DBUG_ENTER (); - - info = MEMfree (info); - - DBUG_RETURN (info); -} - -/** - * @} - *****************************************************************************/ - -static void CreateHost2Device (node **id, node *host_avis, node *dev_avis, - info *arg_info); - -/** - * - * @name Entry functions - * @{ - * - *****************************************************************************/ -/** - * - * @fn node *IMEMdoInsertMemoryTransfers( node *syntax_tree) - * - *****************************************************************************/ 
-node * -IMEMdoInsertMemoryTransfers (node *syntax_tree) -{ - info *info; - - DBUG_ENTER (); - - info = MakeInfo (); - - /* - * Infer dataflow masks - */ - // syntax_tree = INFDFMSdoInferDfms( syntax_tree, HIDE_LOCALS_NEVER); - - TRAVpush (TR_imem); - syntax_tree = TRAVdo (syntax_tree, info); - TRAVpop (); - - info = FreeInfo (info); - - DBUG_RETURN (syntax_tree); -} - -/** - * @} - *****************************************************************************/ - -/** - * - * @name Static helper functions - * @{ - * - *****************************************************************************/ - -/** - * - * @fn node* TypeConvert( node *host_avis) - * - * @brief - * - *****************************************************************************/ -static ntype * -TypeConvert (ntype *host_type, nodetype nty, info *arg_info) -{ - ntype *scalar_type, *dev_type = NULL; - simpletype sty; - - DBUG_ENTER (); - - if (nty == N_id) { - /* If the N_ids is of known dimension and is not a scalar */ - DBUG_ASSERT (TUdimKnown (host_type), "AUD N_id found in cudarizable N_with!"); - if (TYgetDim (host_type) > 0) { - /* If the scalar type is simple, e.g. int, float ... */ - if (TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); - } - } - } - /* If the node to be type converted is N_ids, its original type - * can be AUD as well as long as the N_with on the RHS is cudarizable. - * The reason a cudarizbale can produce a AUD result illustrated by - * the following example: - * - * cond_fun() - * { - * int[*] aa; - * int bb; - * - * if( cond) { - * aa = with {}:genarray( shp); (cudarizable N_with) - * } - * else { - * bb = 1; - * } - * ret = cond ? 
aa : bb; - * } - * - */ - else if (nty == N_ids) { - if (NODE_TYPE (INFO_LETEXPR (arg_info)) == N_with) { - /* If the scalar type is simple, e.g. int, float ... */ - if (WITH_CUDARIZABLE (INFO_LETEXPR (arg_info)) - && TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); - } - } - } else { - DBUG_UNREACHABLE ("Neither N_id nor N_ids found in TypeConvert!"); - } - - DBUG_RETURN (dev_type); -} - -/** - * @} - *****************************************************************************/ - -/** - * - * @name Traversal functions - * @{ - * - *****************************************************************************/ - -/** - * - * @fn node *IMEMfundef( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ -node * -IMEMfundef (node *arg_node, info *arg_info) -{ - node *old_fundef; - - DBUG_ENTER (); - - /* During the main traversal, we only look at non-lac functions */ - if (!FUNDEF_ISLACFUN (arg_node)) { - INFO_FUNDEF (arg_info) = arg_node; - FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); - INFO_FUNDEF (arg_info) = NULL; - - FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); - } else { - if (INFO_FROM_AP (arg_info)) { - old_fundef = INFO_FUNDEF (arg_info); - INFO_FUNDEF (arg_info) = arg_node; - /* Traversal of lac functions are initiated from the calling site */ - FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); - INFO_FUNDEF (arg_info) = old_fundef; - } else { - FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); - } - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMap( node *arg_node, info *arg_info) - * - * 
@brief - * - *****************************************************************************/ -node * -IMEMap (node *arg_node, info *arg_info) -{ - bool traverse_lac_fun, old_from_ap; - node *ap_args, *fundef_args; - node *avis, *id_avis, *new_avis, *dup_avis; - ntype *dev_type; - node *fundef; - - DBUG_ENTER (); - - fundef = AP_FUNDEF (arg_node); - - /* For us to traverse a function from calling site, it must be a - * condictional function or a loop function and must not be the - * recursive function call in the loop function. */ - traverse_lac_fun = (FUNDEF_ISLACFUN (fundef) && fundef != INFO_FUNDEF (arg_info)); - - if (traverse_lac_fun) { - old_from_ap = INFO_FROM_AP (arg_info); - INFO_FROM_AP (arg_info) = TRUE; - if (!INFO_INCUDAWL (arg_info)) { - AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); - } else { - ap_args = AP_ARGS (arg_node); - fundef_args = FUNDEF_ARGS (AP_FUNDEF (arg_node)); - - while (ap_args != NULL) { - DBUG_ASSERT (fundef_args != NULL, "# of Ap args != # of Fundef args!"); - - DBUG_ASSERT (NODE_TYPE (EXPRS_EXPR (ap_args)) == N_id, - "N_ap argument is not N_id node!"); - - id_avis = ID_AVIS (EXPRS_EXPR (ap_args)); - avis = LUTsearchInLutPp (INFO_LUT (arg_info), id_avis); - - /* If the avis has NOT been come across before */ - if (avis == id_avis) { - /* If the id is NOT the one we don't want to create data transfer for - */ - if (LUTsearchInLutPp (INFO_NOTRAN (arg_info), id_avis) == id_avis) { - dev_type = TypeConvert (AVIS_TYPE (id_avis), N_id, arg_info); - - if( dev_type != NULL /* && - NODE_TYPE( AVIS_DECL( avis)) == N_arg */) { - new_avis = TBmakeAvis (TRAVtmpVarName ("dev"), dev_type); - CreateHost2Device (&EXPRS_EXPR (ap_args), id_avis, new_avis, - arg_info); - - dup_avis = DUPdoDupNode (new_avis); - AVIS_SSAASSIGN (dup_avis) = NULL; - - INFO_LUT (arg_info) - = LUTinsertIntoLutP (INFO_LUT (arg_info), - ARG_AVIS (fundef_args), dup_avis); - ARG_AVIS (fundef_args) = dup_avis; - AVIS_DECL (dup_avis) = fundef_args; - } - } else { 
- /* If the N_id is the one we don't want to create host2device for, - * propogate that information to the traversal of LAC functions */ - INFO_NOTRAN (arg_info) - = LUTinsertIntoLutP (INFO_NOTRAN (arg_info), - ARG_AVIS (fundef_args), NULL); - } - } else { - /* If the N_avis has been come across before, replace its - * N_avis by the device N_avis */ - ID_AVIS (EXPRS_EXPR (ap_args)) = avis; - dup_avis = DUPdoDupNode (avis); - AVIS_SSAASSIGN (dup_avis) = NULL; - - /* Insert the pair of N_avis(fun arg)->N_avis(device variable) - * into the lookup table, so that when we later traverse the - * body of the fundef, old reference to the arg will be replaced - * by the new device varaible. */ - INFO_LUT (arg_info) - = LUTinsertIntoLutP (INFO_LUT (arg_info), ARG_AVIS (fundef_args), - dup_avis); - - /* Change N_avis of the fun arg to the device variable */ - ARG_AVIS (fundef_args) = dup_avis; - AVIS_DECL (dup_avis) = fundef_args; - } - - ap_args = EXPRS_NEXT (ap_args); - fundef_args = ARG_NEXT (fundef_args); - } - - AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); - } - - INFO_FROM_AP (arg_info) = old_from_ap; - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMassign( node *arg_node, info *arg_info) - * - * @brief Add newly created and to - * the assign chain. - * - *****************************************************************************/ -node * -IMEMassign (node *arg_node, info *arg_info) -{ - node *next; - - DBUG_ENTER (); - - /* - * Here we have to do a top-down traversal for the following reason: - * We need to check whether there is any array variables being defined - * in a cudarizable N_with. If there is, we don't want to create a - * host2devcice when we later come across it in the same block of code. 
- */ - - ASSIGN_STMT (arg_node) = TRAVdo (ASSIGN_STMT (arg_node), arg_info); - - /* If we are no longer in a cudarizable N_with, we insert - * data transfer primitives into the AST */ - if (!INFO_INCUDAWL (arg_info)) { - next = ASSIGN_NEXT (arg_node); - ASSIGN_NEXT (arg_node) = NULL; - - if (INFO_POSTASSIGNS (arg_info) != NULL) { - arg_node = TCappendAssign (arg_node, INFO_POSTASSIGNS (arg_info)); - INFO_POSTASSIGNS (arg_info) = NULL; - } - - if (INFO_PREASSIGNS (arg_info) != NULL) { - arg_node = TCappendAssign (INFO_PREASSIGNS (arg_info), arg_node); - INFO_PREASSIGNS (arg_info) = NULL; - } - - node *last_assign = arg_node; - while (ASSIGN_NEXT (last_assign) != NULL) { - last_assign = ASSIGN_NEXT (last_assign); - } - - ASSIGN_NEXT (last_assign) = next; - ASSIGN_NEXT (last_assign) = TRAVopt (ASSIGN_NEXT (last_assign), arg_info); - } else { - ASSIGN_NEXT (arg_node) = TRAVopt (ASSIGN_NEXT (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMlet( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ -node * -IMEMlet (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - LET_EXPR (arg_node) = TRAVdo (LET_EXPR (arg_node), arg_info); - INFO_LETEXPR (arg_info) = LET_EXPR (arg_node); - LET_IDS (arg_node) = TRAVopt (LET_IDS (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMwith( node *arg_node, info *arg_info) - * - * @brief Traverse both withop and N_code of a cudarizable N_with - * - *****************************************************************************/ -node * -IMEMwith (node *arg_node, info *arg_info) -{ - lut_t *old_lut; - - DBUG_ENTER (); - - /* If the N_with is cudarizable */ - if (WITH_CUDARIZABLE (arg_node)) { - INFO_LUT (arg_info) = LUTgenerateLut (); - INFO_INCUDAWL (arg_info) = TRUE; - WITH_WITHOP (arg_node) = TRAVdo (WITH_WITHOP (arg_node), arg_info); - - old_lut = INFO_NOTRAN (arg_info); - INFO_NOTRAN 
(arg_info) = LUTgenerateLut (); - - /* we do not want to create a host2device for index vector */ - INFO_NOTRAN (arg_info) = LUTinsertIntoLutP (INFO_NOTRAN (arg_info), - IDS_AVIS (WITH_VEC (arg_node)), NULL); - - WITH_CODE (arg_node) = TRAVdo (WITH_CODE (arg_node), arg_info); - INFO_NOTRAN (arg_info) = old_lut; - INFO_NOTRAN (arg_info) = LUTremoveLut (INFO_NOTRAN (arg_info)); - - INFO_INCUDAWL (arg_info) = FALSE; - INFO_LUT (arg_info) = LUTremoveLut (INFO_LUT (arg_info)); - - /* We need to create for N_ids on the LHS */ - INFO_CREATE_D2H (arg_info) = TRUE; - } else if (INFO_INCUDAWL (arg_info)) { - /* If we are already in a cudarizable N_with but the - * N_with itself is not a cudarizable N_with */ - - WITH_WITHOP (arg_node) = TRAVdo (WITH_WITHOP (arg_node), arg_info); - INFO_NOTRAN (arg_info) = LUTinsertIntoLutP (INFO_NOTRAN (arg_info), - IDS_AVIS (WITH_VEC (arg_node)), NULL); - - WITH_CODE (arg_node) = TRAVdo (WITH_CODE (arg_node), arg_info); - } else { - /* The following traversal has been commented out because if the outermost - * N_with is not cudarizable, none of its inner N_withs (if - * there is any) will be cudarizable since we only cudarize - * the outermost N_with. 
*/ - - /* WITH_CODE( arg_node) = TRAVdo( WITH_CODE( arg_node), arg_info); */ - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMcode( node *arg_node, info *arg_info) - * - * @brief Traverse the code block - * - *****************************************************************************/ -node * -IMEMcode (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - CODE_CBLOCK (arg_node) = TRAVopt (CODE_CBLOCK (arg_node), arg_info); - - INFO_IN_CEXPRS (arg_info) = TRUE; - CODE_CEXPRS (arg_node) = TRAVopt (CODE_CEXPRS (arg_node), arg_info); - INFO_IN_CEXPRS (arg_info) = FALSE; - - CODE_NEXT (arg_node) = TRAVopt (CODE_NEXT (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMgenarray( node *arg_node, info *arg_info) - * - * @brief Traverse default element of a N_genarray - * - *****************************************************************************/ -node * -IMEMgenarray (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - if (INFO_INCUDAWL (arg_info)) { - /* Note that we do not traverse N_genarray->shape. This is - * because it can be an N_id node and we do not want to insert - * for it in this case. Therefore, the only son - * of N_genarray we traverse is the default element. 
*/ - if (GENARRAY_DEFAULT (arg_node) != NULL) { - DBUG_ASSERT (NODE_TYPE (GENARRAY_DEFAULT (arg_node)) == N_id, - "Non N_id default element found in N_genarray!"); - GENARRAY_DEFAULT (arg_node) = TRAVdo (GENARRAY_DEFAULT (arg_node), arg_info); - } - - GENARRAY_RC (arg_node) = TRAVopt (GENARRAY_RC (arg_node), arg_info); - - GENARRAY_NEXT (arg_node) = TRAVopt (GENARRAY_NEXT (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMmodarray( node *arg_node, info *arg_info) - * - * @brief Traverse default element of a N_modarray - * - *****************************************************************************/ -node * -IMEMmodarray (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - if (INFO_INCUDAWL (arg_info)) { - DBUG_ASSERT (NODE_TYPE (MODARRAY_ARRAY (arg_node)) == N_id, - "Non N_id modified array found in N_modarray!"); - INFO_IS_MODARR (arg_info) = TRUE; - MODARRAY_ARRAY (arg_node) = TRAVdo (MODARRAY_ARRAY (arg_node), arg_info); - INFO_IS_MODARR (arg_info) = FALSE; - MODARRAY_RC (arg_node) = TRAVopt (MODARRAY_RC (arg_node), arg_info); - MODARRAY_NEXT (arg_node) = TRAVopt (MODARRAY_NEXT (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMids( node *arg_node, info *arg_info) - * - * @brief For N_ids needed to be type converted, create . 
- * - *****************************************************************************/ -node * -IMEMids (node *arg_node, info *arg_info) -{ - node *new_avis, *ids_avis; - ntype *ids_type, *dev_type; - - DBUG_ENTER (); - - ids_avis = IDS_AVIS (arg_node); - ids_type = AVIS_TYPE (ids_avis); - - /* If the array is define in Cuda wl, we do not create - * a host2device transfer for it */ - if (INFO_INCUDAWL (arg_info)) { - if (TYisArray (ids_type)) { - INFO_NOTRAN (arg_info) - = LUTinsertIntoLutP (INFO_NOTRAN (arg_info), ids_avis, NULL); - } - } else { - if (INFO_CREATE_D2H (arg_info)) { - dev_type = TypeConvert (ids_type, NODE_TYPE (arg_node), arg_info); - if (dev_type != NULL) { - new_avis = TBmakeAvis (TRAVtmpVarName ("dev"), dev_type); - IDS_AVIS (arg_node) = new_avis; - FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) - = TBmakeVardec (new_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); - - INFO_POSTASSIGNS (arg_info) - = TBmakeAssign (TBmakeLet (TBmakeIds (ids_avis, NULL), - TBmakePrf (F_device2host, - TBmakeExprs (TBmakeId (new_avis), - NULL))), - INFO_POSTASSIGNS (arg_info)); - /* Maintain SSA property */ - AVIS_SSAASSIGN (new_avis) = AVIS_SSAASSIGN (ids_avis); - AVIS_SSAASSIGN (ids_avis) = INFO_POSTASSIGNS (arg_info); - } - // IDS_NEXT( arg_node) = TRAVopt( IDS_NEXT( arg_node), arg_info); - INFO_CREATE_D2H (arg_info) = FALSE; - } - } - - IDS_NEXT (arg_node) = TRAVopt (IDS_NEXT (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * - * @fn node *IMEMid( node *arg_node, info *arg_info) - * - * @brief For each host array N_id in the cudarizable N_with, either create - * type conversion for it (i.e. ) or set its N_avis to - * that of an already converted device array N_id depending on whether - * the N_id is encountered for the first time or not. 
- * - *****************************************************************************/ -node * -IMEMid (node *arg_node, info *arg_info) -{ - node *new_avis, *avis, *id_avis; - ntype *dev_type, *id_type; - - DBUG_ENTER (); - - id_avis = ID_AVIS (arg_node); - id_type = AVIS_TYPE (id_avis); - - /* if we are in cudarizable N_with */ - if (INFO_INCUDAWL (arg_info)) { - avis = LUTsearchInLutPp (INFO_LUT (arg_info), id_avis); - - /* If the N_avis node hasn't been come across before AND the id is - * NOT in cexprs. This is because we don't want to create a host2device - * for N_id in the cexprs. However, if the N_id has been come across - * before, even if it's in cexprs, we still need to replace its avis - * by the new avis, i.e. the transferred device variable (See the - * "else" case). This might happen that the N_id in cexprs is not - * a scalar and it's a default element of the withloop. Therefore, - * a early traverse of the withop will insert a host2device for this - * N_id and we here simply need to set it's avis to the device variable - * avis. (This is fix to the bug discovered in compiling tvd2d.sac) */ - - if (avis == id_avis && !INFO_IN_CEXPRS (arg_info)) { - dev_type = TypeConvert (id_type, NODE_TYPE (arg_node), arg_info); - /* Definition of the N_id must not be in the same block as - * reference of the N_id. Otherwise, no host2device will be - * created. e.g. - * - * a = with - * { - * b = [x, y, z]; - * ... - * ... = prf( b); - * }:genarray(); - * - * We do not create b_dev = host2device( b) in this case. 
- */ - if (dev_type != NULL - && (/* NODE_TYPE( AVIS_DECL( avis)) == N_arg || */ - /* INFO_IS_MODARR( arg_info) || */ - LUTsearchInLutPp (INFO_NOTRAN (arg_info), id_avis) == id_avis)) { - new_avis = TBmakeAvis (TRAVtmpVarName ("dev"), dev_type); - CreateHost2Device (&arg_node, id_avis, new_avis, arg_info); - } - } else { - /* If the N_avis has been come across before, replace its - * N_avis by the device N_avis */ - ID_AVIS (arg_node) = avis; - } - } - DBUG_RETURN (arg_node); -} - -static void -CreateHost2Device (node **id, node *host_avis, node *dev_avis, info *arg_info) -{ - DBUG_ENTER (); - - ID_AVIS (*id) = dev_avis; - FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) - = TBmakeVardec (dev_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); - - INFO_PREASSIGNS (arg_info) - = TBmakeAssign (TBmakeLet (TBmakeIds (dev_avis, NULL), - TBmakePrf (F_host2device, - TBmakeExprs (TBmakeId (host_avis), NULL))), - INFO_PREASSIGNS (arg_info)); - - /* Maintain SSA property */ - AVIS_SSAASSIGN (dev_avis) = INFO_PREASSIGNS (arg_info); - - /* Insert pair host_avis->dev_avis into lookup table. 
*/ - INFO_LUT (arg_info) = LUTinsertIntoLutP (INFO_LUT (arg_info), host_avis, dev_avis); - - DBUG_RETURN (); -} - -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ - -#undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/insert_memory_transfers.h b/src/libsac2c/cuda/insert_memory_transfers.h deleted file mode 100644 index db8966342..000000000 --- a/src/libsac2c/cuda/insert_memory_transfers.h +++ /dev/null @@ -1,20 +0,0 @@ - - -#ifndef _SAC_INSERT_MEMORY_TRANSFERS_H_ -#define _SAC_INSERT_MEMORY_TRANSFERS_H_ - -#include "types.h" - -extern node *IMEMdoInsertMemoryTransfers (node *arg_node); -extern node *IMEMfundef (node *arg_node, info *arg_info); -extern node *IMEMap (node *arg_node, info *arg_info); -extern node *IMEMid (node *arg_node, info *arg_info); -extern node *IMEMlet (node *arg_node, info *arg_info); -extern node *IMEMassign (node *arg_node, info *arg_info); -extern node *IMEMwith (node *arg_node, info *arg_info); -extern node *IMEMids (node *arg_node, info *arg_info); -extern node *IMEMgenarray (node *arg_node, info *arg_info); -extern node *IMEMmodarray (node *arg_node, info *arg_info); -extern node *IMEMcode (node *arg_node, info *arg_info); - -#endif -- GitLab From d8b237a027cab4290648aa5cdc77d523d7dcb128 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Wed, 3 Apr 2019 18:41:43 +0100 Subject: [PATCH 02/17] fixup documentation for IWLMEM --- src/libsac2c/cuda/insert_withloop_memtran.c | 553 ++++++++++---------- 1 file changed, 291 insertions(+), 262 deletions(-) diff --git a/src/libsac2c/cuda/insert_withloop_memtran.c b/src/libsac2c/cuda/insert_withloop_memtran.c index 55b3dd980..3f503d98d 100644 --- a/src/libsac2c/cuda/insert_withloop_memtran.c +++ b/src/libsac2c/cuda/insert_withloop_memtran.c @@ -1,57 +1,47 @@ -/** +/** + * @file + * @defgroup iwlmem Insert CUDA memory transfer primitives + * @ingroup cuda + 
* + * This module inserts CUDA type conversion primitives before and after + * each cudarizable N_with. The two primitives are host2device and + * device2host. They are used to transfer the data of a host(device) array + * variable to a device(host) array variable. This is essentially + * compiled into host<->device memory transfers in the backend. As an + * example: + * + * ~~~~ + * a_host = with + * { + * ... = b_host; + * ... = c_host; + * ... = d_host; + * }:genarray( shp); + * ~~~~ + * + * is transformed into: + * + * ~~~~ + * b_dev = host2device( b_host); + * c_dev = host2device( c_host); + * d_dev = host2device( d_host); + * a_dev = with + * { + * ... = b_dev; + * ... = c_dev; + * ... = d_dev; + * }:genarray( shp); + * a_host = device2host( a_dev); + * ~~~~ + * + * @note + * Simple scalar variables need not be type converted since they + * can be passed as function parameters directly to CUDA kernels. * - * @defgroup Insert CUDA memory transfer primitives - * - * - * This module inserts CUDA type conversion primitives before and after - * each cudarizable N_with. The two primitives are and - * . They are used to trasfer the data of a host(device) array - * variable to a device(host) array variable. This is essentially - * compiled into host<->device memory transfers in the backend. As an - * example: - * - * a_host = with - * { - * ... = b_host; - * ... = c_host; - * ... = d_host; - * }:genarray( shp); - * - * is transformed into: - * - * b_dev = host2device( b_host); - * c_dev = host2device( c_host); - * d_dev = host2device( d_host); - * a_dev = with - * { - * ... = b_dev; - * ... = c_dev; - * ... = d_dev; - * }:genarray( shp); - * a_host = device2host( a_dev); - * - * Note that simple scalar variables need not be type converted since they - * can be passed as function parameters directly to CUDA kernels. 
- * - * @ingroup - * - * @{ASSIGN_STMT( arg_node) - * - *****************************************************************************/ - -/** - * - * @file cuda_type_conversion.c - * - * Prefix: IWLMEM - * - *****************************************************************************/ + * @{ + */ #include "insert_withloop_memtran.h" -/* - * Other includes go here - */ -#include #include "tree_basic.h" #include "tree_compound.h" #include "str.h" @@ -79,54 +69,34 @@ #include "infer_dfms.h" #include "NumLookUpTable.h" -/** - * - * @name INFO structure - * @{ - * - *****************************************************************************/ +/** @name INFO structure + * @{ + */ struct INFO { - node *fundef; - bool in_cudawl; - bool create_d2h; - node *postassigns; - node *preassigns; - lut_t *lut; - lut_t *notran; - node *let_expr; - bool is_modarr; - bool in_cexprs; - bool from_ap; - node *letids; - node *apids; - node *topblock; - nlut_t *at_nlut; + node *fundef; /**< N_fundef node of the enclosing function */ + bool in_cudawl; /**< Flag indicating whether the code currently being traversed is in + a cudarizable N_with */ + bool create_d2h; /**< Flag indicating whether device2host needs to be created for + the N_let->N_ids */ + node *postassigns; /**< Chain of device2host assignments that need to be appended at the end of + the current N_assign */ + node *preassigns; /**< Chain of host2device assignments that need to be prepended at the + beginning of the current N_assign */ + lut_t *lut; /**< Lookup table storing pairs of Avis(host)->Avis(device) e.g. Given + a_dev = host2device( a_host), Avis(a_host)->Avis(a_dev) will be stored + into the table */ + lut_t *notran; /**< Lookup table storing N_avis of array variables for which no data + transfers should be created. 
*/ + node *let_expr; /**< Holds the current N_let expression, used to check if the RHS is + a with-loop */ + node *let_ids; /**< Holds the current N_let N_ids chain */ + bool in_cexprs; /**< Flag indicating whether we are in N_code cexprs */ + bool from_ap; /**< Flag indicating whether we are coming from an N_ap */ + node *apids; /**< Holds LHS of current N_ap */ + node *topblock; /**< Holds the N_block (body) of the current N_fundef */ + nlut_t *at_nlut; /**< Used to count the number of references of N_avis */ }; -/* - * INFO_FUNDEF N_fundef node of the enclosing function - * - * INFO_INCUDAWL Flag indicating whether the code currently being - * traversed is in a cudarizable N_with - * - * INFO_CREATE_D2H Flag indicating whether needs to be - * created for the N_let->N_ids - * - * INFO_POSTASSIGNS Chain of that needs to be appended - * at the end of the current N_assign - * - * INFO_PREASSIGNS Chain of that needs to be prepended - * at the beginning of the current N_assign - * - * INFO_LUT Lookup table storing pairs of Avis(host)->Avis(device) - * e.g. Given a_dev = host2device( a_host), - * Avis(a_host)->Avis(a_dev) will be stored into the table - * - * INFO_NOTRAN Lookup table storing N_avis of arrays varaibles that - * no data transfers should be created. 
- * - */ - #define INFO_FUNDEF(n) (n->fundef) #define INFO_INCUDAWL(n) (n->in_cudawl) #define INFO_CREATE_D2H(n) (n->create_d2h) @@ -135,10 +105,9 @@ struct INFO { #define INFO_LUT(n) (n->lut) #define INFO_NOTRAN(n) (n->notran) #define INFO_LETEXPR(n) (n->let_expr) -#define INFO_IS_MODARR(n) (n->is_modarr) +#define INFO_LETIDS(n) (n->let_ids) #define INFO_IN_CEXPRS(n) (n->in_cexprs) #define INFO_FROM_AP(n) (n->from_ap) -#define INFO_LETIDS(n) (n->letids) #define INFO_APIDS(n) (n->apids) #define INFO_TOPBLOCK(n) (n->topblock) #define INFO_AT_NLUT(n) (n->at_nlut) @@ -159,7 +128,6 @@ MakeInfo (void) INFO_PREASSIGNS (result) = NULL; INFO_LUT (result) = NULL; INFO_NOTRAN (result) = NULL; - INFO_IS_MODARR (result) = FALSE; INFO_IN_CEXPRS (result) = FALSE; INFO_FROM_AP (result) = FALSE; INFO_LETIDS (result) = NULL; @@ -180,25 +148,18 @@ FreeInfo (info *info) DBUG_RETURN (info); } -/** - * @} - *****************************************************************************/ +/** @} */ -static void CreateHost2Device (node **id, node *host_avis, node *dev_avis, - info *arg_info); -static bool AssignInTopBlock (node *assign, info *arg_info); +/** @name Entry functions + * @{ + */ -/** +/** + * @brief Perform the IWLMEM traversal * - * @name Entry functions - * @{ - * - *****************************************************************************/ -/** - * - * @fn node *IWLMEMdoInsertWithloopMemtran( node *syntax_tree) - * - *****************************************************************************/ + * @param syntax_tree + * @return syntax_tree + */ node * IWLMEMdoInsertWithloopMemtran (node *syntax_tree) { @@ -222,24 +183,83 @@ IWLMEMdoInsertWithloopMemtran (node *syntax_tree) DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * - * @name Static helper functions - * @{ - * - *****************************************************************************/ +/** @name Static helper functions 
+ * @{ + */ -/** +/** + * @brief Create host2device call, and add to the info struct to be added to + * the syntax tree later. * - * @fn node* TypeConvert( node *host_avis) + * @param id The argument position to place the device N_avis + * @param host_avis The host N_avis + * @param dev_avis The new device N_avis + * @param info The info struct + * @return + */ +static void +CreateHost2Device (node **id, node *host_avis, node *dev_avis, info *arg_info) +{ + DBUG_ENTER (); + + ID_AVIS (*id) = dev_avis; + FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) + = TBmakeVardec (dev_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); + + INFO_PREASSIGNS (arg_info) + = TBmakeAssign (TBmakeLet (TBmakeIds (dev_avis, NULL), + TBmakePrf (F_host2device, + TBmakeExprs (TBmakeId (host_avis), NULL))), + INFO_PREASSIGNS (arg_info)); + + /* Maintain SSA property */ + AVIS_SSAASSIGN (dev_avis) = INFO_PREASSIGNS (arg_info); + + /* Insert pair host_avis->dev_avis into lookup table. */ + INFO_LUT (arg_info) = LUTinsertIntoLutP (INFO_LUT (arg_info), host_avis, dev_avis); + + DBUG_RETURN (); +} + +/** + * @brief Search through N_block (passed in via the info struct) for a specific + * assignment. * - * @brief + * @param assign The N_assign to search for + * @param info The info struct (which holds link to the N_block) + * @return True if the N_assign was found, False otherwise + */ +static bool +AssignInTopBlock (node *assign, info *arg_info) +{ + bool res = FALSE; + node *assign_chain; + + DBUG_ENTER (); + + assign_chain = BLOCK_ASSIGNS (INFO_TOPBLOCK (arg_info)); + + while (assign_chain != NULL) { + if (assign_chain == assign) { + res = TRUE; + break; + } + assign_chain = ASSIGN_NEXT (assign_chain); + } + + DBUG_RETURN (res); +} + +/** + * @brief Convert from a host ntype to a device ntype, while preserving shape information. 
* - *****************************************************************************/ + * @param host_type The host ntype + * @param nty The nodetype of the node being converted (support N_id and N_ids) + * @param info The info struct + * @return A device ntype struct + */ static ntype * TypeConvert (ntype *host_type, nodetype nty, info *arg_info) { @@ -303,6 +323,13 @@ TypeConvert (ntype *host_type, nodetype nty, info *arg_info) DBUG_RETURN (dev_type); } +/** + * @brief Anonymous traversal function (N_with) + * + * @param arg_node N_with + * @param arg_info info struct + * @return N_with + */ static node * ATravWith (node *arg_node, info *arg_info) { @@ -315,6 +342,13 @@ ATravWith (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } +/** + * @brief Anonymous traversal function (N_id). For every N_avis, increment a counter. + * + * @param arg_node N_id + * @param arg_info info struct + * @return N_id + */ static node * ATravId (node *arg_node, info *arg_info) { @@ -325,6 +359,13 @@ ATravId (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } +/** + * @brief Anonymous traversal function (N_genarray). Traverse through all N_id sons. 
+ * + * @param arg_node N_genarray + * @param arg_info info struct + * @return N_genarray + */ static node * ATravGenarray (node *arg_node, info *arg_info) { @@ -342,29 +383,29 @@ ATravGenarray (node *arg_node, info *arg_info) GENARRAY_RC (arg_node) = TRAVopt (GENARRAY_RC (arg_node), arg_info); GENARRAY_ERC (arg_node) = TRAVopt (GENARRAY_ERC (arg_node), arg_info); GENARRAY_PRC (arg_node) = TRAVopt (GENARRAY_PRC (arg_node), arg_info); + GENARRAY_NEXT (arg_node) = TRAVopt (GENARRAY_NEXT (arg_node), arg_info); DBUG_RETURN (arg_node); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * - * @name Traversal functions - * @{ - * - *****************************************************************************/ +/** @name Traversal functions + * @{ + */ -/** +/** + * @brief Traverse N_fundef * - * @fn node *IWLMEMfundef( node *arg_node, info *arg_info) + * If the current N_fundef is not a LaC function, traverse the body and next. Otherwise, + * if we are coming from a N_ap that is a LaC function, traverse *only* the body, passing + * a link to the body. Otherwise, we go to the next N_fundef. 
* - * @brief - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info struct + * @return N_fundef node + */ node * IWLMEMfundef (node *arg_node, info *arg_info) { @@ -373,15 +414,21 @@ IWLMEMfundef (node *arg_node, info *arg_info) DBUG_ENTER (); + DBUG_PRINT ("at %s", FUNDEF_NAME (arg_node)); + /* During the main traversal, we only look at non-lac functions */ if (!FUNDEF_ISLACFUN (arg_node)) { + DBUG_PRINT ("...inspecting body"); INFO_FUNDEF (arg_info) = arg_node; INFO_TOPBLOCK (arg_info) = FUNDEF_BODY (arg_node); FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); INFO_FUNDEF (arg_info) = NULL; FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); } else { + DBUG_PRINT ("...inspecting LAC body"); if (INFO_FROM_AP (arg_info)) { + DBUG_PRINT ("...from application"); + old_fundef = INFO_FUNDEF (arg_info); old_topblock = INFO_TOPBLOCK (arg_info); INFO_FUNDEF (arg_info) = arg_node; @@ -398,32 +445,37 @@ IWLMEMfundef (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMap( node *arg_node, info *arg_info) +/** + * @brief Traverse N_ap which is a LaC function, and *not* the recursive call. * - * @brief + * If the application is outside a CUDA withloop, we traverse into its N_fundef. + * If its from within a CUDA withloop, we check all its arguments against the LUT and if + * we find a match and its not marked for NOTRAN (no transfer), we create a host2device + * call. 
* - *****************************************************************************/ + * @param arg_node N_ap + * @param arg_info info struct + * @return N_ap node + */ node * IWLMEMap (node *arg_node, info *arg_info) { bool traverse_lac_fun, old_from_ap; - node *ap_args, *fundef_args; - node *avis, *id_avis, *new_avis, *dup_avis; + node *ap_args, *fundef_args, *avis, *new_avis, *dup_avis, *id_avis; ntype *dev_type; node *fundef, *old_apids; DBUG_ENTER (); fundef = AP_FUNDEF (arg_node); + DBUG_PRINT ("ap_fun %s", FUNDEF_NAME (fundef)); /* For us to traverse a function from calling site, it must be a * condictional function or a loop function and must not be the * recursive function call in the loop function. */ traverse_lac_fun = (FUNDEF_ISLACFUN (fundef) && fundef != INFO_FUNDEF (arg_info)); - if (traverse_lac_fun) { + if (traverse_lac_fun) { /* inside loop or conditional */ old_from_ap = INFO_FROM_AP (arg_info); INFO_FROM_AP (arg_info) = TRUE; @@ -431,10 +483,13 @@ IWLMEMap (node *arg_node, info *arg_info) INFO_APIDS (arg_info) = INFO_LETIDS (arg_info); if (!INFO_INCUDAWL (arg_info)) { + DBUG_PRINT ("...not in CUDAWL"); + AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); } else { + /* Used to add h2d transfers for applications within WL N_code */ ap_args = AP_ARGS (arg_node); - fundef_args = FUNDEF_ARGS (AP_FUNDEF (arg_node)); + fundef_args = FUNDEF_ARGS (fundef); while (ap_args != NULL) { DBUG_ASSERT (fundef_args != NULL, "# of Ap args != # of Fundef args!"); @@ -447,8 +502,8 @@ IWLMEMap (node *arg_node, info *arg_info) /* If the avis has NOT been come across before */ if (avis == id_avis) { - DBUG_PRINT ("fundef %s, id %s", FUNDEF_NAME (AP_FUNDEF (arg_node)), - AVIS_NAME (avis)); + DBUG_PRINT ("new arg for ap_fun %s, id %s", FUNDEF_NAME (fundef), + AVIS_NAME (avis)); /* If the id is NOT the one we don't want to create data transfer for */ if (LUTsearchInLutPp (INFO_NOTRAN (arg_info), id_avis) == id_avis) { @@ -482,6 +537,8 @@ IWLMEMap (node 
*arg_node, info *arg_info) ARG_AVIS (fundef_args), NULL); } } else { + DBUG_PRINT ("existing arg on ap_fun %s, id %s", FUNDEF_NAME (fundef), + AVIS_NAME (avis)); /* If the N_avis has been come across before, replace its * N_avis by the device N_avis */ ID_AVIS (EXPRS_EXPR (ap_args)) = avis; @@ -524,14 +581,14 @@ IWLMEMap (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMassign( node *arg_node, info *arg_info) - * +/** * @brief Add newly created and to * the assign chain. * - *****************************************************************************/ + * @param arg_node N_assign + * @param arg_info info struct + * @return N_assign node + */ node * IWLMEMassign (node *arg_node, info *arg_info) { @@ -577,13 +634,13 @@ IWLMEMassign (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMlet( node *arg_node, info *arg_info) - * - * @brief +/** + * @brief Traverse N_let, carrying both the LHS and RHS. * - *****************************************************************************/ + * @param arg_node N_let + * @param arg_info info struct + * @return N_let node + */ node * IWLMEMlet (node *arg_node, info *arg_info) { @@ -598,13 +655,14 @@ IWLMEMlet (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMfuncond( node *arg_node, info *arg_info) - * - * @brief +/** + * @brief Traverse N_funcond that are within a CUDA withloop and change N_avis basetypes + * to device types. 
* - *****************************************************************************/ + * @param arg_node N_funcond + * @param arg_info info struct + * @return N_funcond node + */ node * IWLMEMfuncond (node *arg_node, info *arg_info) { @@ -683,13 +741,13 @@ IWLMEMfuncond (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMwith( node *arg_node, info *arg_info) - * +/** * @brief Traverse both withop and N_code of a cudarizable N_with * - *****************************************************************************/ + * @param arg_node N_with + * @param arg_info info struct + * @return N_with node + */ node * IWLMEMwith (node *arg_node, info *arg_info) { @@ -698,6 +756,8 @@ IWLMEMwith (node *arg_node, info *arg_info) DBUG_ENTER (); + DBUG_PRINT ("at WL"); + /* If the N_with is cudarizable */ if (WITH_CUDARIZABLE (arg_node)) { @@ -774,13 +834,13 @@ IWLMEMwith (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMcode( node *arg_node, info *arg_info) +/** + * @brief Traverse N_code of withloop. 
* - * @brief Traverse the code block - * - *****************************************************************************/ + * @param arg_node N_code + * @param arg_info info struct + * @return N_code node + */ node * IWLMEMcode (node *arg_node, info *arg_info) { @@ -797,13 +857,13 @@ IWLMEMcode (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMgenarray( node *arg_node, info *arg_info) - * +/** * @brief Traverse default element of a N_genarray * - *****************************************************************************/ + * @param arg_node N_genarray + * @param arg_info info struct + * @return N_genarray node + */ node * IWLMEMgenarray (node *arg_node, info *arg_info) { @@ -830,13 +890,13 @@ IWLMEMgenarray (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMmodarray( node *arg_node, info *arg_info) - * +/** * @brief Traverse default element of a N_modarray * - *****************************************************************************/ + * @param arg_node N_modarray + * @param arg_info info struct + * @return N_modarray node + */ node * IWLMEMmodarray (node *arg_node, info *arg_info) { @@ -845,24 +905,26 @@ IWLMEMmodarray (node *arg_node, info *arg_info) if (INFO_INCUDAWL (arg_info)) { DBUG_ASSERT (NODE_TYPE (MODARRAY_ARRAY (arg_node)) == N_id, "Non N_id modified array found in N_modarray!"); - INFO_IS_MODARR (arg_info) = TRUE; MODARRAY_ARRAY (arg_node) = TRAVdo (MODARRAY_ARRAY (arg_node), arg_info); - INFO_IS_MODARR (arg_info) = FALSE; + MODARRAY_RC (arg_node) = TRAVopt (MODARRAY_RC (arg_node), arg_info); MODARRAY_ERC (arg_node) = TRAVopt (MODARRAY_ERC (arg_node), arg_info); + MODARRAY_NEXT (arg_node) = TRAVopt (MODARRAY_NEXT (arg_node), arg_info); } DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMids( node *arg_node, info *arg_info) - * - * @brief For N_ids needed to be type converted, create . 
+/** + * @brief For N_ids on the LHS of CUDA-WLs, create a new LHS avis with device type, and + * create a assign to be placed after the WL. If the N_ids is in a + * CUDA-WL, add it to the NOTRAN LUT. * - *****************************************************************************/ + * @param arg_node N_ids + * @param arg_info info struct + * @return N_ids node + */ node * IWLMEMids (node *arg_node, info *arg_info) { @@ -874,6 +936,8 @@ IWLMEMids (node *arg_node, info *arg_info) ids_avis = IDS_AVIS (arg_node); ids_type = AVIS_TYPE (ids_avis); + DBUG_PRINT ("at IDS of %s", AVIS_NAME (ids_avis)); + /* If the array is defined in cuda withloop, we do not create * a host2device transfer for it */ if (INFO_INCUDAWL (arg_info)) { @@ -890,26 +954,40 @@ IWLMEMids (node *arg_node, info *arg_info) TYgetSimpleType (TYgetScalar (ids_type)))); } } - } else { + } else { /* not in CUDAWL */ if (INFO_CREATE_D2H (arg_info)) { + /* if we come this this point after a CUDAWL, we probably need to + * create a device2host transfer. 
*/ dev_type = TypeConvert (ids_type, NODE_TYPE (arg_node), arg_info); if (dev_type != NULL) { + + /* create new avis for WL return */ new_avis = TBmakeAvis (TRAVtmpVarName ("dev"), dev_type); IDS_AVIS (arg_node) = new_avis; + DBUG_PRINT ("...replacing WL return %s -> %s", AVIS_NAME (ids_avis), + AVIS_NAME (new_avis)); + + /* add to fundef vardecs */ FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) = TBmakeVardec (new_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); + /* create device2host */ INFO_POSTASSIGNS (arg_info) = TBmakeAssign (TBmakeLet (TBmakeIds (ids_avis, NULL), TBmakePrf (F_device2host, TBmakeExprs (TBmakeId (new_avis), NULL))), INFO_POSTASSIGNS (arg_info)); - /* Maintain SSA property */ + DBUG_PRINT ("Creating device2host for %s -> %s", AVIS_NAME (new_avis), + AVIS_NAME (ids_avis)); + + /* maintain SSA property */ AVIS_SSAASSIGN (new_avis) = AVIS_SSAASSIGN (ids_avis); AVIS_SSAASSIGN (ids_avis) = INFO_POSTASSIGNS (arg_info); } - // IDS_NEXT( arg_node) = TRAVopt( IDS_NEXT( arg_node), arg_info); + + /* We stop creating any further device2host assigns */ + /* XXX what about multi-operator WLs? */ INFO_CREATE_D2H (arg_info) = FALSE; } } @@ -919,16 +997,16 @@ IWLMEMids (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *IWLMEMid( node *arg_node, info *arg_info) - * +/** * @brief For each host array N_id in the cudarizable N_with, either create * type conversion for it (i.e. ) or set its N_avis to * that of an already converted device array N_id depending on whether * the N_id is encountered for the first time or not. 
* - *****************************************************************************/ + * @param arg_node N_id + * @param arg_info info struct + * @return N_id node + */ node * IWLMEMid (node *arg_node, info *arg_info) { @@ -943,7 +1021,9 @@ IWLMEMid (node *arg_node, info *arg_info) /* if we are in cudarizable N_with */ if (INFO_INCUDAWL (arg_info)) { - avis = (node *)LUTsearchInLutPp (INFO_LUT (arg_info), id_avis); + DBUG_PRINT ("inspecting %s", AVIS_NAME (id_avis)); + + avis = LUTsearchInLutPp (INFO_LUT (arg_info), id_avis); /* If the N_avis node hasn't been come across before AND the id is * NOT in cexprs. This is because we don't want to create a host2device @@ -976,7 +1056,7 @@ IWLMEMid (node *arg_node, info *arg_info) if (((INFO_IN_CEXPRS (arg_info) && ssaassign != NULL && AssignInTopBlock (ssaassign, arg_info)) || !INFO_IN_CEXPRS (arg_info)) - && !CUisShmemTypeNew (id_type) + && !CUisDeviceTypeNew (id_type) && !CUisShmemTypeNew (id_type) && LUTsearchInLutPp (INFO_NOTRAN (arg_info), id_avis) == id_avis) { dev_type = TypeConvert (id_type, NODE_TYPE (arg_node), arg_info); if (dev_type != NULL) { @@ -994,58 +1074,7 @@ IWLMEMid (node *arg_node, info *arg_info) } DBUG_RETURN (arg_node); } - -static void -CreateHost2Device (node **id, node *host_avis, node *dev_avis, info *arg_info) -{ - DBUG_ENTER (); - - ID_AVIS (*id) = dev_avis; - FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) - = TBmakeVardec (dev_avis, FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); - - INFO_PREASSIGNS (arg_info) - = TBmakeAssign (TBmakeLet (TBmakeIds (dev_avis, NULL), - TBmakePrf (F_host2device, - TBmakeExprs (TBmakeId (host_avis), NULL))), - INFO_PREASSIGNS (arg_info)); - - /* Maintain SSA property */ - AVIS_SSAASSIGN (dev_avis) = INFO_PREASSIGNS (arg_info); - - /* Insert pair host_avis->dev_avis into lookup table. 
*/ - INFO_LUT (arg_info) = LUTinsertIntoLutP (INFO_LUT (arg_info), host_avis, dev_avis); - - DBUG_RETURN (); -} - -static bool -AssignInTopBlock (node *assign, info *arg_info) -{ - bool res = FALSE; - node *assign_chain; - - DBUG_ENTER (); - - assign_chain = BLOCK_ASSIGNS (INFO_TOPBLOCK (arg_info)); - - while (assign_chain != NULL) { - if (assign_chain == assign) { - res = TRUE; - break; - } - assign_chain = ASSIGN_NEXT (assign_chain); - } - - DBUG_RETURN (res); -} - -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ +/** @} */ +/** @} */ #undef DBUG_PREFIX -- GitLab From c27c2728a4accd29f8a9071743e28ab7d68ff802 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Thu, 4 Apr 2019 15:40:23 +0100 Subject: [PATCH 03/17] EMRL mark fundef has allocation lifts We will use this later for EMRTU (relating to CUDA IWLMEM optimisation). --- src/libsac2c/memory/emr_loop_optimisation.c | 37 +++++++++++++++++---- src/libsac2c/xml/ast.xml | 5 +++ 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/src/libsac2c/memory/emr_loop_optimisation.c b/src/libsac2c/memory/emr_loop_optimisation.c index c935bf205..67fd382ee 100644 --- a/src/libsac2c/memory/emr_loop_optimisation.c +++ b/src/libsac2c/memory/emr_loop_optimisation.c @@ -56,7 +56,7 @@ typedef enum emrl_context {EMRL_rec, EMRL_ap} emrl_context_t; * node. See EMRL related functions for more info. 
*/ typedef struct stack_node_s { - node * wl; /**< either a N_modarray or N_genarray */ + node *wl; /**< either a N_modarray or N_genarray */ node * avis; /**< our new avis */ struct stack_node_s * next; } stack_node_t; @@ -206,6 +206,30 @@ isSameShapeAvis (node * avis, node * exprs) DBUG_RETURN (ret); } +/** + * @brief Create a new temporary avis which copies the ntype of + * an existing avis + * + * @param type Some NType + * @return a new avis + */ +static inline node * +createTmpAvis (ntype *type) +{ + node *avis; + + avis = TBmakeAvis (TRAVtmpVarName ("emr_tmp"), TYcopyType (type)); + + /* XXX setting this for a loop-fun signature could cause + * problems latter in alloc.c, as we might try to create + * a local allocation. We need to check this! + */ + AVIS_ISALLOCLIFT (avis) = TRUE; + DBUG_PRINT (" created %s var", AVIS_NAME (avis)); + + return avis; +} + /** * @brief Collect LHS of N_let and traverse the exprs * @@ -269,9 +293,7 @@ EMRLgenarray (node * arg_node, info * arg_info) DBUG_PRINT (" genarray in loopfun has no RCs or ERCs, generating tmp one!"); /* the new avis must have the same type/shape as genarray shape */ - new_avis = TBmakeAvis (TRAVtmpVarName ("emr_tmp"), - TYcopyType (IDS_NTYPE (INFO_LHS (arg_info)))); - DBUG_PRINT (" created %s var", AVIS_NAME (new_avis)); + new_avis = createTmpAvis (IDS_NTYPE (INFO_LHS (arg_info))); /* add to stack - this will be used in N_ap */ INFO_STACK (arg_info) = stack_push (INFO_STACK (arg_info), arg_node, new_avis); @@ -312,9 +334,7 @@ EMRLmodarray (node * arg_node, info * arg_info) DBUG_PRINT (" modarray in loopfun has no RCs or ERCs, generating tmp one!"); /* the new avis must have the same type/shape as modarray shape */ - new_avis = TBmakeAvis (TRAVtmpVarName ("emr_tmp"), - TYcopyType (IDS_NTYPE (INFO_LHS (arg_info)))); - DBUG_PRINT (" created %s var", AVIS_NAME (new_avis)); + new_avis = createTmpAvis (IDS_NTYPE (INFO_LHS (arg_info))); /* add to stack - this will be used in N_ap */ INFO_STACK (arg_info) = 
stack_push (INFO_STACK (arg_info), arg_node, new_avis); @@ -487,6 +507,9 @@ EMRLfundef (node * arg_node, info * arg_info) FUNDEF_ARGS (arg_node) = TCappendArgs (FUNDEF_ARGS (arg_node), INFO_ARGS (arg_info)); INFO_ARGS (arg_info) = NULL; + + /* mark fundef as having been touched by EMRL - this used later in EMRTU */ + FUNDEF_ISEMRLIFTED (arg_node) = TRUE; } INFO_FUNDEF (arg_info) = NULL; diff --git a/src/libsac2c/xml/ast.xml b/src/libsac2c/xml/ast.xml index bb476d014..943c0a319 100644 --- a/src/libsac2c/xml/ast.xml +++ b/src/libsac2c/xml/ast.xml @@ -6000,6 +6000,11 @@ N_tfarg : Indicates whether or not we need to generate a declaration within header.c when linking to an external library. + + + TRUE iff this fundef has been affected by the EMRL optimisation. + + -- GitLab From 0db3f2547fd7bc8cec9e552e63c6f07c626e6c1a Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Thu, 4 Apr 2019 15:50:59 +0100 Subject: [PATCH 04/17] add EMRTU optimistation This optimisation works together with the CUDA IWLMEM optimisation to introduce CUDA memory transfers into the AST. We need this traversal when using the EMR optimisation as IWLMEM would otherwise lead to inoptimal code generation. IWLMEM, when encountering a lift ERC, would within the loop function create a host2device and cause a memory allocation and free on the CUDA device. 
The EMRTU (EMR Type Update) traversal identifies such cases, and correctly transforms the lifted ERCs into CUDA device_types *and* fixes the arguments of the loop function applications (initial and recursive) --- src/libsac2c/CMakeLists.txt | 3 +- src/libsac2c/cuda/emr_type_update.c | 699 ++++++++++++++++++++ src/libsac2c/cuda/emr_type_update.h | 16 + src/libsac2c/cuda/insert_withloop_memtran.c | 15 +- src/libsac2c/xml/ast.xml | 10 + 5 files changed, 739 insertions(+), 4 deletions(-) create mode 100644 src/libsac2c/cuda/emr_type_update.c create mode 100644 src/libsac2c/cuda/emr_type_update.h diff --git a/src/libsac2c/CMakeLists.txt b/src/libsac2c/CMakeLists.txt index c96a052ac..554ca1170 100644 --- a/src/libsac2c/CMakeLists.txt +++ b/src/libsac2c/CMakeLists.txt @@ -219,6 +219,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/cuda/cuda_sink_code.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/cuda_tag_executionmode.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/cuda_utils.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/data_access_analysis.c +${CMAKE_CURRENT_SOURCE_DIR}/cuda/emr_type_update.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/expand_shmem_boundary_load.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/infer_reusable_arrays.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/insert_cudast_memtran.c @@ -627,7 +628,7 @@ SET_SOURCE_FILES_PROPERTIES( ${CMAKE_CURRENT_SOURCE_DIR}/precompile/functionprecompile.c #RET & ARG_LINKSIGN ${CMAKE_CURRENT_SOURCE_DIR}/scanparse/handle_dots.c #TBmakeNum warnings ${CMAKE_CURRENT_SOURCE_DIR}/scanparse/parser.c #SHmakeShape (n) - n is size_t - ${CMAKE_CURRENT_SOURCE_DIR}/scanparse/resolvepragma.c #PRAGMA_NUMPARAMS + ${CMAKE_CURRENT_SOURCE_DIR}/scanparse/resolvepragma.c #PRAGMA_NUMPARAMS ${CMAKE_CURRENT_SOURCE_DIR}/stdopt/makedimexpr.c #TBmakeNum with TCcountExprs ${CMAKE_CURRENT_SOURCE_DIR}/stdopt/makeshapeexpr.c #TBmakeNum with TCcountExprs ${CMAKE_CURRENT_SOURCE_DIR}/stdopt/prfunroll.c #SHgetUnrlen diff --git a/src/libsac2c/cuda/emr_type_update.c b/src/libsac2c/cuda/emr_type_update.c new file mode 100644 index 
000000000..e2c48d021 --- /dev/null +++ b/src/libsac2c/cuda/emr_type_update.c @@ -0,0 +1,699 @@ +/** + * @file + * @defgroup emrcudalift EMR loop allocation lifting for CUDA + * @ingroup cuda + * + * @brief Convert all ERCs in EMRL affected fundefs with CUDA-WL to CUDA device types. + * + * This traversal works *only* with IWLMEM in two modes: + * * we update initial application of loop fun, and loop fun arguments + * * we update the recursive application. + * + * The first mode is done *before* applying CUDA WL transfer insertion (IWLMEM), as we + * want to avoid adding host2device transfers for lifted ERCs. The second mode is applied + * *after* IWLMEM to update the recursive application with any LHS of host2device calls. + * + * To illustrate what this traversal does, let's walk through with a code example. Before + * we reach IWLMEM, we have this situation: + * + * ~~~~ + * loop_entry (A) + * { + * type var_emr_lift; + * + * return loop_0 (A, var_emr_lift); + * } + * + * loop_0 (A, var_emr_tmp) + * { + * type a; + * type a_0; + * + * a = wl ( A ) [ERC: var_emr_tmp]; + * + * if (some cond) + * a_0 = loop_0 (a, A); + * return (some cond) ? a_0 : a; + * } + * ~~~~ + * + * After first mode step of EMRTU: + * + * ~~~~ + * loop_entry (A) + * { + * type_dev var_emr_lift_dev; + * + * return loop_0 (A, var_emr_lift_dev); + * } + * + * loop_0 (A, var_emr_tmp_dev) + * { + * type a; + * type a_0; + * + * a = wl ( A ) [ERC: var_emr_tmp_dev]; + * + * if (some cond) + * a_0 = loop_0 (a, A); + * return (some cond) ? a_0 : a; + * } + * ~~~~ + * + * After IWLMEM traversal: + * + * ~~~~ + * loop_entry (A) + * { + * type_dev var_emr_lift_dev; + * + * return loop_0 (A, var_emr_lift_dev); + * } + * + * loop_0 (A, var_emr_tmp_dev) + * { + * type a; + * type_dev A_dev; + * type_dev a_dev; + * type a_0; + * + * A_dev = host2device (A); + * a_dev = wl ( A_dev ) [ERC: var_emr_tmp_dev]; + * a = device2host (a_dev); + * + * if (some cond) + * a_0 = loop_0 (a, A); + * return (some cond) ? 
a_0 : a; + * } + * ~~~~ + * + * And finally after step two of EMRTU: + * + * ~~~~ + * loop_entry (A) + * { + * type_dev var_emr_lift_dev; + * + * return loop_0 (A, var_emr_lift_dev); + * } + * + * loop_0 (A, var_emr_tmp_dev) + * { + * type a; + * type_dev A_dev; + * type_dev a_dev; + * type a_0; + * + * A_dev = host2device (A); + * a_dev = wl ( A_dev ) [ERC: var_emr_tmp_dev]; + * a = device2host (a_dev); + * + * if (some cond) + * a_0 = loop_0 (a, A_dev); + * return (some cond) ? a_0 : a; + * } + * ~~~~ + * + * @note This transformation can only be applied *after* cudarizable WLs have been + * identified. + * + * @{ + */ +#include "emr_type_update.h" + +#include "types.h" +#include "type_utils.h" +#include "traverse.h" +#include "tree_basic.h" +#include "tree_compound.h" +#include "memory.h" + +#define DBUG_PREFIX "EMRTU" +#include "debug.h" + +#include "free.h" +#include "new_types.h" +#include "cuda_utils.h" +#include "LookUpTable.h" +#include "DupTree.h" + +/** @name INFO structure + * @{ + */ +struct INFO { + node *fundef; /**< Holds current N_fundef */ + lut_t *lut; /**< LUT is used for storing either new device N_avis (for fundef update), + or N_avis pairs (for ap update) */ + node *ap_args; /**< N_ap arguments */ + bool update_ap; /**< Flag indicating what mode we are in (either fundef updating, or + ap updating) */ + node *letids; /**< The the LHS of N_prf */ + node *wl_ercs; /**< Holds all CUDA-WL ERCs for a given fundef */ +}; + +#define INFO_FUNDEF(n) ((n)->fundef) +#define INFO_LUT(n) ((n)->lut) +#define INFO_AP_ARGS(n) ((n)->ap_args) +#define INFO_UPDATE_AP(n) ((n)->update_ap) +#define INFO_LETIDS(n) ((n)->letids) +#define INFO_WLERCS(n) ((n)->wl_ercs) + +static info * +MakeInfo (void) +{ + info *result; + + DBUG_ENTER (); + + result = (info *)MEMmalloc (sizeof (info)); + + INFO_FUNDEF (result) = NULL; + INFO_LUT (result) = NULL; + INFO_AP_ARGS (result) = NULL; + INFO_UPDATE_AP (result) = FALSE; + INFO_LETIDS (result) = NULL; + INFO_WLERCS (result) = 
NULL; + + DBUG_RETURN (result); +} + +static info * +FreeInfo (info *info) +{ + DBUG_ENTER (); + + info = MEMfree (info); + + DBUG_RETURN (info); +} + +/** @} */ + +/** @name Static helper functions + * @{ + */ + +/** + * @brief Convert from a host ntype to a device ntype, while preserving shape information. + * + * @param host_type The host ntype + * @return A device ntype struct + */ +static ntype * +ConvertHost2DeviceType (ntype *host_type) +{ + ntype *scalar_type, *dev_type = NULL; + simpletype sty; + + DBUG_ENTER (); + + /* If the N_id is of known dimension and is not a scalar */ + DBUG_ASSERT (TUdimKnown (host_type), "AUD N_id found!"); + if (TYgetDim (host_type) > 0) { + /* If the scalar type is simple, e.g. int, float ... */ + if (TYisSimple (TYgetScalar (host_type))) { + dev_type = TYcopyType (host_type); + scalar_type = TYgetScalar (dev_type); + /* Get the corresponding device simple type e.g. int_dev, float_dev...*/ + sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); + /* Set the device simple type */ + scalar_type = TYsetSimpleType (scalar_type, sty); + } + } + + DBUG_RETURN (dev_type); +} + +/** + * @brief Used as part of LUTmap operation to free LUT keys. 
+ * + * @param value The LUT value + * @param key The LUT key, which will be freed + * @return the value + */ +static void * +FreeLutArgs (void *value, void *key) +{ + key = MEMfree (key); + return value; +} + +/** + * @brief Search through N_exprs chain (of N_id) for N_avis + * + * @param exprs_chain N_exprs chain containing N_id + * @param avis The N_avis to look for + * @return true if found, otherwise false + */ +static bool +IsAvisInExprs (node *exprs_chain, node *avis) +{ + bool ret = FALSE; + + DBUG_ENTER (); + + while (exprs_chain != NULL) { + if (ID_AVIS (EXPRS_EXPR (exprs_chain)) == avis) { + ret = TRUE; + break; + } + exprs_chain = EXPRS_NEXT (exprs_chain); + } + + DBUG_RETURN (ret); +} + +/** + * @brief Anonymous traversal function (N_with) + * + * @param arg_node N_with + * @param arg_info info struct + * @return N_with + */ +static node * +ATravWith (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + /* we only need to deal with CUDA WLs */ + if (WITH_CUDARIZABLE (arg_node)) { + WITH_WITHOP (arg_node) = TRAVdo (WITH_WITHOP (arg_node), arg_info); + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief Anonymous traversal function (N_modarray) + * + * @param arg_node N_modarray + * @param arg_info info struct + * @return N_modarray + */ +static node * +ATravModarray (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + INFO_WLERCS (arg_info) + = TCappendExprs (INFO_WLERCS (arg_info), DUPdoDupTree (MODARRAY_ERC (arg_node))); + MODARRAY_NEXT (arg_node) = TRAVopt (MODARRAY_NEXT (arg_node), arg_info); + + DBUG_RETURN (arg_node); +} + +/** + * @brief Anonymous traversal function (N_genarray) + * + * @param arg_node N_genarray + * @param arg_info info struct + * @return N_genarray + */ +static node * +ATravGenarray (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + INFO_WLERCS (arg_info) + = TCappendExprs (INFO_WLERCS (arg_info), DUPdoDupTree (GENARRAY_ERC (arg_node))); + GENARRAY_NEXT (arg_node) = TRAVopt (GENARRAY_NEXT (arg_node), arg_info); + 
+ DBUG_RETURN (arg_node); +} +/** @} */ + +/** @name Entry functions + * @{ + */ + +/** + * @brief Traverse syntax_tree looking for loop functions with additional arguments + * due to EMRL (EMR loop optimisation), and convert these if they are used by + * CUDA withloops. + * + * @param syntax_tree + * @return syntax_tree + */ +node * +EMRTUdoEMRUpdateFun (node *syntax_tree) +{ + info *info; + + DBUG_ENTER (); + + info = MakeInfo (); + + TRAVpush (TR_emrtu); + syntax_tree = TRAVdo (syntax_tree, info); + TRAVpop (); + + info = FreeInfo (info); + + DBUG_RETURN (syntax_tree); +} + +/** + * @brief Traverse syntax_tree and update all loop function recursive calls where + * one or more arguments are the result of EMRL optimisation. We search for + * host2device primitives and replace their argument with the one in the + * application. + * + * @param syntax_tree + * @return syntax_tree + */ +node * +EMRTUdoEMRUpdateAp (node *syntax_tree) +{ + info *info; + + DBUG_ENTER (); + + info = MakeInfo (); + + INFO_UPDATE_AP (info) = TRUE; + + TRAVpush (TR_emrtu); + syntax_tree = TRAVdo (syntax_tree, info); + TRAVpop (); + + info = FreeInfo (info); + + DBUG_RETURN (syntax_tree); +} +/** @} */ + +/** @name Traversal functions + * @{ + */ + +/** + * @brief Traverse body of LAC functions + * + * We do this in two stages, the first stage looks for initial application of loop + * function. The second stage enters into the loop function to search for or modify its + * arguments. 
+ * + * @param arg_node N_fundef + * @param arg_info info struct + * @return N_fundef + */ +node * +EMRTUfundef (node *arg_node, info *arg_info) +{ + node *old_fundef; + + DBUG_ENTER (); + + if (!FUNDEF_ISLACFUN (arg_node)) { + DBUG_PRINT ("inspecting body of %s", FUNDEF_NAME (arg_node)); + INFO_FUNDEF (arg_info) = arg_node; + FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); + FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); + } else if (INFO_AP_ARGS (arg_info) != NULL) { + DBUG_PRINT ("inspecting application body of %s", FUNDEF_NAME (arg_node)); + old_fundef = INFO_FUNDEF (arg_info); + INFO_FUNDEF (arg_info) = arg_node; + FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); + INFO_FUNDEF (arg_info) = old_fundef; + } else + FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); + + DBUG_RETURN (arg_node); +} + +/** + * @brief Traverse application functions if they are LAC functions *and* were modified by + * EMRL previously (that they have lifted allocations) + * + * If we are at the initial application, we either + * + * a. search for CUDA-WL within fundef and collect ERCs, and if any of the ERCs are + * arguments of the fundef, we update these to be device type. + * b. check for host2device primitives, and if their RHS is an argument of the recursive + * N_ap, change the argument to be the LHS. 
+ * + * @param arg_node N_ap + * @param arg_info info struct + * @return N_ap + */ +node * +EMRTUap (node *arg_node, info *arg_info) +{ + ntype *dev_type; + node *ap_args, *fundef_args, *old_ap_args; + node *id_avis, *arg_avis, *new_avis, *vardec; + lut_t *old_lut; + + DBUG_ENTER (); + + DBUG_PRINT ("found application of %s", AP_NAME (arg_node)); + + /* we only traverse loop functions that were changed by the EMRL (EMR loop + * optimisation) previously */ + if (FUNDEF_ISLACFUN (AP_FUNDEF (arg_node)) + && FUNDEF_ISEMRLIFTED (AP_FUNDEF (arg_node))) { + if (AP_FUNDEF (arg_node) != INFO_FUNDEF (arg_info)) { /* initial application */ + DBUG_PRINT ("...checking for lifted allocations"); + + ap_args = AP_ARGS (arg_node); + fundef_args = FUNDEF_ARGS (AP_FUNDEF (arg_node)); + + /* We use the LUT for two different stages: + * 1. when looking for fundef arguments that have to be changed to device + * type) + * 2. when replacing recursive N_ap arguments with device types (to maintain + * the buffer swapping pattern) + * + * This is why we create a new LUT in all cases. + */ + old_lut = INFO_LUT (arg_info); + INFO_LUT (arg_info) = LUTgenerateLut (); + + /* if we are only updating the recurisve N_ap arguments, we don't need to + * update the initial N_ap. 
*/ + if (!INFO_UPDATE_AP (arg_info)) { + + /* We use an anonymous traversal to find all ERCs in CUDA-WLs */ + anontrav_t atrav[4] = {{N_with, &ATravWith}, + {N_genarray, &ATravGenarray}, + {N_modarray, &ATravModarray}, + {(nodetype)0, NULL}}; + + TRAVpushAnonymous (atrav, &TRAVsons); + AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); + TRAVpop (); + + if (INFO_WLERCS (arg_info) != NULL) { /* found ERCs in CUDA-WLs */ + while (ap_args != NULL) { + DBUG_ASSERT (fundef_args != NULL, + "# of Ap args != # of Fundef args!"); + + id_avis = ID_AVIS (EXPRS_EXPR (ap_args)); + arg_avis = ARG_AVIS (fundef_args); + + /* if the lifted argument is *not* a ERC of a CUDA-WL, ignore */ + if (AVIS_ISALLOCLIFT (id_avis) + && IsAvisInExprs (INFO_WLERCS (arg_info), arg_avis)) { + DBUG_PRINT ("......found lifted argument ap arg %s, fundef " + "arg %s", + AVIS_NAME (id_avis), AVIS_NAME (arg_avis)); + + /* create new local avis (cuda device type) */ + dev_type = ConvertHost2DeviceType (AVIS_TYPE (id_avis)); + new_avis + = TBmakeAvis (TRAVtmpVarName ("emr_lift_dev"), dev_type); + AVIS_ISALLOCLIFT (new_avis) = TRUE; + + /* get old vardec and update it */ + DBUG_ASSERT (AVIS_DECL (id_avis) != NULL + && NODE_TYPE (AVIS_DECL (id_avis)) == N_vardec, + "Local avis has no vardec!"); + vardec = AVIS_DECL (id_avis); + VARDEC_AVIS (vardec) = new_avis; + AVIS_DECL (new_avis) = vardec; + + /* update application arguments */ + ID_AVIS (EXPRS_EXPR (ap_args)) = FREEdoFreeTree (id_avis); + ID_AVIS (EXPRS_EXPR (ap_args)) = new_avis; + + /* update function signature */ + dev_type = ConvertHost2DeviceType (AVIS_TYPE (arg_avis)); + new_avis + = TBmakeAvis (TRAVtmpVarName ("emr_tmp_dev"), dev_type); + + ARG_AVIS (fundef_args) = new_avis; + AVIS_DECL (new_avis) = fundef_args; + + /* add N_avis to LUT, and use to replace values within the + * function body */ + INFO_LUT (arg_info) = LUTinsertIntoLutP (INFO_LUT (arg_info), + arg_avis, new_avis); + } + + ap_args = EXPRS_NEXT (ap_args); + fundef_args 
= ARG_NEXT (fundef_args); + } + INFO_WLERCS (arg_info) = FREEdoFreeTree (INFO_WLERCS (arg_info)); + } + } + + /* now update all N_avis in the fundef */ + DBUG_PRINT ("...going into application fundef"); + old_ap_args = INFO_AP_ARGS (arg_info); + INFO_AP_ARGS (arg_info) = AP_ARGS (arg_node); + AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); + INFO_AP_ARGS (arg_info) = old_ap_args; + + if (!INFO_UPDATE_AP (arg_info)) { + /* free all old fundef args */ + INFO_LUT (arg_info) = LUTmapLutP (INFO_LUT (arg_info), FreeLutArgs); + } + INFO_LUT (arg_info) = LUTremoveLut (INFO_LUT (arg_info)); + INFO_LUT (arg_info) = old_lut; + } else if (AP_FUNDEF (arg_node) == INFO_FUNDEF (arg_info) + && INFO_UPDATE_AP (arg_info)) { /* is recursive application */ + DBUG_PRINT ("...is recursive call"); + ap_args = INFO_AP_ARGS (arg_info); + fundef_args = AP_ARGS (arg_node); + + DBUG_ASSERT (INFO_LUT (arg_info) != NULL, "There is no LUT!"); + + while (ap_args != NULL) { + DBUG_ASSERT (fundef_args != NULL, + "# of outer Ap args != # of recursive Ap args!"); + + arg_avis = ID_AVIS (EXPRS_EXPR (fundef_args)); + + new_avis = LUTsearchInLutPp (INFO_LUT (arg_info), arg_avis); + if (arg_avis != new_avis) { + DBUG_PRINT ("......found matching ERC lift %s -> %s", + AVIS_NAME (arg_avis), AVIS_NAME (new_avis)); + ID_AVIS (EXPRS_EXPR (fundef_args)) = new_avis; + } + + ap_args = EXPRS_NEXT (ap_args); + fundef_args = EXPRS_NEXT (fundef_args); + } + } + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief If we are updating the function argument, replace the ERC with the new device + * type function argument + * + * @param arg_node N_modarray + * @param arg_info info struct + * @return N_modarray + */ +node * +EMRTUmodarray (node *arg_node, info *arg_info) +{ + node *ercs, *erc; + + DBUG_ENTER (); + + if (!INFO_UPDATE_AP (arg_info)) { + ercs = MODARRAY_ERC (arg_node); + while (ercs != NULL) { + erc = LUTsearchInLutPp (INFO_LUT (arg_info), ID_AVIS (EXPRS_EXPR (ercs))); + if (ID_AVIS (EXPRS_EXPR 
(ercs)) != erc) { + DBUG_PRINT ("...found %s, replacing with %s", ID_NAME (EXPRS_EXPR (ercs)), + AVIS_NAME (erc)); + ID_AVIS (EXPRS_EXPR (ercs)) = erc; + break; + } + ercs = EXPRS_NEXT (ercs); + } + + MODARRAY_NEXT (arg_node) = TRAVopt (MODARRAY_NEXT (arg_node), arg_info); + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief If we are updating the function argument, replace the ERC with the new device + * type function argument + * + * @param arg_node N_genarray + * @param arg_info info struct + * @return N_genarray + */ +node * +EMRTUgenarray (node *arg_node, info *arg_info) +{ + node *ercs, *erc; + + DBUG_ENTER (); + + if (!INFO_UPDATE_AP (arg_info)) { + ercs = GENARRAY_ERC (arg_node); + while (ercs != NULL) { + erc = LUTsearchInLutPp (INFO_LUT (arg_info), ID_AVIS (EXPRS_EXPR (ercs))); + if (ID_AVIS (EXPRS_EXPR (ercs)) != erc) { + DBUG_PRINT ("...found %s, replacing with %s", ID_NAME (EXPRS_EXPR (ercs)), + AVIS_NAME (erc)); + ID_AVIS (EXPRS_EXPR (ercs)) = erc; + break; + } + ercs = EXPRS_NEXT (ercs); + } + + GENARRAY_NEXT (arg_node) = TRAVopt (GENARRAY_NEXT (arg_node), arg_info); + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief Store LHS before traversing into RHS + * + * @param arg_node N_let + * @param arg_info info struct + * @return N_let + */ +node * +EMRTUlet (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + LET_IDS (arg_node) = TRAVdo (LET_IDS (arg_node), arg_info); + + INFO_LETIDS (arg_info) = LET_IDS (arg_node); + LET_EXPR (arg_node) = TRAVdo (LET_EXPR (arg_node), arg_info); + + DBUG_RETURN (arg_node); +} + +/** + * @brief If we find a host2device primitive, add its LHS and argument to the LUT + * + * @param arg_node N_prf + * @param arg_info info struct + * @return N_prf + */ +node * +EMRTUprf (node *arg_node, info *arg_info) +{ + node *id_avis, *let_avis; + + DBUG_ENTER (); + + if (INFO_UPDATE_AP (arg_info)) { + if (PRF_PRF (arg_node) == F_host2device) { + id_avis = ID_AVIS (PRF_ARG1 (arg_node)); + let_avis = IDS_AVIS (INFO_LETIDS 
(arg_info)); + DBUG_PRINT ("Found host2device, %s -> %s", AVIS_NAME (id_avis), + AVIS_NAME (let_avis)); + DBUG_ASSERT (INFO_LUT (arg_info) != NULL, "There is no LUT!"); + INFO_LUT (arg_info) + = LUTinsertIntoLutP (INFO_LUT (arg_info), id_avis, let_avis); + } + } + + DBUG_RETURN (arg_node); +} +/** @} */ diff --git a/src/libsac2c/cuda/emr_type_update.h b/src/libsac2c/cuda/emr_type_update.h new file mode 100644 index 000000000..f35a1f19a --- /dev/null +++ b/src/libsac2c/cuda/emr_type_update.h @@ -0,0 +1,16 @@ +#ifndef _SAC_CUDA_EMR_LOOP_H_ +#define _SAC_CUDA_EMR_LOOP_H_ + +#include "types.h" + +extern node *EMRTUdoEMRUpdateFun (node *syntax_tree); +extern node *EMRTUdoEMRUpdateAp (node *syntax_tree); + +extern node *EMRTUfundef (node *arg_node, info *arg_info); +extern node *EMRTUap (node *arg_node, info *arg_info); +extern node *EMRTUmodarray (node *arg_node, info *arg_info); +extern node *EMRTUgenarray (node *arg_node, info *arg_info); +extern node *EMRTUlet (node *arg_node, info *arg_info); +extern node *EMRTUprf (node *arg_node, info *arg_info); + +#endif /* _SAC_CUDA_EMR_LOOP_H_ */ diff --git a/src/libsac2c/cuda/insert_withloop_memtran.c b/src/libsac2c/cuda/insert_withloop_memtran.c index 3f503d98d..f3cdc86cc 100644 --- a/src/libsac2c/cuda/insert_withloop_memtran.c +++ b/src/libsac2c/cuda/insert_withloop_memtran.c @@ -63,6 +63,7 @@ #include "types.h" #include "type_utils.h" #include "cuda_utils.h" +#include "emr_type_update.h" #include "DataFlowMask.h" #include "DataFlowMaskUtils.h" #include "remove_dfms.h" @@ -155,7 +156,7 @@ FreeInfo (info *info) */ /** - * @brief Perform the IWLMEM traversal + * @brief Perform the IWLMEM traversal, and additionally call the EMRTU traversal. 
* * @param syntax_tree * @return syntax_tree @@ -167,17 +168,25 @@ IWLMEMdoInsertWithloopMemtran (node *syntax_tree) DBUG_ENTER (); - info = MakeInfo (); - /* * Infer dataflow masks */ // syntax_tree = INFDFMSdoInferDfms( syntax_tree, HIDE_LOCALS_NEVER); + /* Convert EMRL allocations lifts to device type (update fundef signiture) */ + if (global.optimize.doemrl) + syntax_tree = EMRTUdoEMRUpdateFun (syntax_tree); + + info = MakeInfo (); + TRAVpush (TR_iwlmem); syntax_tree = TRAVdo (syntax_tree, info); TRAVpop (); + /* Convert EMRL allocations lifts to device type (update ap arguments) */ + if (global.optimize.doemrl) + syntax_tree = EMRTUdoEMRUpdateAp (syntax_tree); + info = FreeInfo (info); DBUG_RETURN (syntax_tree); diff --git a/src/libsac2c/xml/ast.xml b/src/libsac2c/xml/ast.xml index 943c0a319..43e214056 100644 --- a/src/libsac2c/xml/ast.xml +++ b/src/libsac2c/xml/ast.xml @@ -529,6 +529,16 @@ + + + + + + + + + + -- GitLab From 284fc0960c94fb95561597c69ad7e6ef3515be9b Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Thu, 4 Apr 2019 21:34:12 +0100 Subject: [PATCH 05/17] minor fix to EMRTU (forgot undef DBUG_PREFIX) --- src/libsac2c/cuda/emr_type_update.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libsac2c/cuda/emr_type_update.c b/src/libsac2c/cuda/emr_type_update.c index e2c48d021..d6f1fb25c 100644 --- a/src/libsac2c/cuda/emr_type_update.c +++ b/src/libsac2c/cuda/emr_type_update.c @@ -697,3 +697,5 @@ EMRTUprf (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } /** @} */ + +#undef DBUG_PREFIX -- GitLab From e0ecc1fc4312989a0f8e1fb138e536d7ddbc642f Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Mon, 8 Apr 2019 18:09:54 +0100 Subject: [PATCH 06/17] refactoring of EMRTU and IWLMEM Have moved out a function used in both traversals to the cuda_utils module. Also, I've removed the ISEMRLIFTED flag from EMRL/EMRTU, as it was preventing certain cases from being identified (such as cond functions). 
--- src/libsac2c/cuda/cuda_utils.c | 40 ++++++++++++++- src/libsac2c/cuda/cuda_utils.h | 4 +- src/libsac2c/cuda/emr_type_update.c | 39 ++------------- src/libsac2c/cuda/insert_withloop_memtran.c | 54 ++++++++------------- src/libsac2c/memory/emr_loop_optimisation.c | 3 -- src/libsac2c/xml/ast.xml | 5 -- 6 files changed, 63 insertions(+), 82 deletions(-) diff --git a/src/libsac2c/cuda/cuda_utils.c b/src/libsac2c/cuda/cuda_utils.c index 4b21a4873..33d066737 100644 --- a/src/libsac2c/cuda/cuda_utils.c +++ b/src/libsac2c/cuda/cuda_utils.c @@ -1,6 +1,13 @@ - +/** + * @file + * @defgroup cutil CUDA utils + * @ingroup cuda + * + * @{ + */ #include "cuda_utils.h" +#include "type_utils.h" #include "tree_basic.h" #include "tree_compound.h" #include "str.h" @@ -273,4 +280,35 @@ CUisDeviceArrayTypeNew (ntype *ty) DBUG_RETURN (res); } +/** + * @brief Convert from a host ntype to a device ntype, while preserving shape information. + * + * @param host_type The host ntype + * @return A device ntype struct, or NULL if the host_type *does not* have a simpletype + */ +ntype * +CUconvertHostToDeviceType (ntype *host_type) +{ + ntype *scalar_type, *dev_type = NULL; + simpletype sty; + + DBUG_ENTER (); + + /* If the host_type is of known dimension */ + DBUG_ASSERT (TUdimKnown (host_type), "AUD type found!"); + + /* If the scalar type is simple, e.g. int, float ... */ + if (TYgetDim (host_type) > 0 + && TYisSimple (TYgetScalar (host_type))) { + dev_type = TYcopyType (host_type); + scalar_type = TYgetScalar (dev_type); + /* Get the corresponding device simple type e.g. 
int_dev, float_dev...*/ + sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); + /* Set the device simple type */ + scalar_type = TYsetSimpleType (scalar_type, sty); + } + + DBUG_RETURN (dev_type); +} + #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/cuda_utils.h b/src/libsac2c/cuda/cuda_utils.h index 553619739..05b366305 100644 --- a/src/libsac2c/cuda/cuda_utils.h +++ b/src/libsac2c/cuda/cuda_utils.h @@ -1,4 +1,3 @@ - #ifndef _SAC_CUDA_UTILS_H_ #define _SAC_CUDA_UTILS_H_ @@ -33,5 +32,6 @@ extern bool CUisShmemTypeNew (ntype *ty); extern bool CUisDeviceTypeOld (types *ty); extern bool CUisShmemTypeOld (types *ty); extern bool CUisDeviceArrayTypeNew (ntype *ty); +extern ntype *CUconvertHostToDeviceType (ntype *host_type); -#endif +#endif /* _SAC_CUDA_UTILS_H_ */ diff --git a/src/libsac2c/cuda/emr_type_update.c b/src/libsac2c/cuda/emr_type_update.c index d6f1fb25c..272423d7d 100644 --- a/src/libsac2c/cuda/emr_type_update.c +++ b/src/libsac2c/cuda/emr_type_update.c @@ -122,7 +122,6 @@ #include "emr_type_update.h" #include "types.h" -#include "type_utils.h" #include "traverse.h" #include "tree_basic.h" #include "tree_compound.h" @@ -193,37 +192,6 @@ FreeInfo (info *info) * @{ */ -/** - * @brief Convert from a host ntype to a device ntype, while preserving shape information. - * - * @param host_type The host ntype - * @return A device ntype struct - */ -static ntype * -ConvertHost2DeviceType (ntype *host_type) -{ - ntype *scalar_type, *dev_type = NULL; - simpletype sty; - - DBUG_ENTER (); - - /* If the N_id is of known dimension and is not a scalar */ - DBUG_ASSERT (TUdimKnown (host_type), "AUD N_id found!"); - if (TYgetDim (host_type) > 0) { - /* If the scalar type is simple, e.g. int, float ... */ - if (TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. 
int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); - } - } - - DBUG_RETURN (dev_type); -} - /** * @brief Used as part of LUTmap operation to free LUT keys. * @@ -450,8 +418,7 @@ EMRTUap (node *arg_node, info *arg_info) /* we only traverse loop functions that were changed by the EMRL (EMR loop * optimisation) previously */ - if (FUNDEF_ISLACFUN (AP_FUNDEF (arg_node)) - && FUNDEF_ISEMRLIFTED (AP_FUNDEF (arg_node))) { + if (FUNDEF_ISLACFUN (AP_FUNDEF (arg_node))) { if (AP_FUNDEF (arg_node) != INFO_FUNDEF (arg_info)) { /* initial application */ DBUG_PRINT ("...checking for lifted allocations"); @@ -499,7 +466,7 @@ EMRTUap (node *arg_node, info *arg_info) AVIS_NAME (id_avis), AVIS_NAME (arg_avis)); /* create new local avis (cuda device type) */ - dev_type = ConvertHost2DeviceType (AVIS_TYPE (id_avis)); + dev_type = CUconvertHostToDeviceType (AVIS_TYPE (id_avis)); new_avis = TBmakeAvis (TRAVtmpVarName ("emr_lift_dev"), dev_type); AVIS_ISALLOCLIFT (new_avis) = TRUE; @@ -517,7 +484,7 @@ EMRTUap (node *arg_node, info *arg_info) ID_AVIS (EXPRS_EXPR (ap_args)) = new_avis; /* update function signature */ - dev_type = ConvertHost2DeviceType (AVIS_TYPE (arg_avis)); + dev_type = CUconvertHostToDeviceType (AVIS_TYPE (arg_avis)); new_avis = TBmakeAvis (TRAVtmpVarName ("emr_tmp_dev"), dev_type); diff --git a/src/libsac2c/cuda/insert_withloop_memtran.c b/src/libsac2c/cuda/insert_withloop_memtran.c index f3cdc86cc..80bb0b380 100644 --- a/src/libsac2c/cuda/insert_withloop_memtran.c +++ b/src/libsac2c/cuda/insert_withloop_memtran.c @@ -272,57 +272,41 @@ AssignInTopBlock (node *assign, info *arg_info) static ntype * TypeConvert (ntype *host_type, nodetype nty, info *arg_info) { - ntype *scalar_type, *dev_type = NULL; - simpletype sty; + ntype *dev_type = NULL; DBUG_ENTER (); if (nty == N_id) { - /* If the N_id is of known dimension and is not a scalar */ - 
DBUG_ASSERT (TUdimKnown (host_type), "AUD N_id found in cudarizable N_with!"); - if (TYgetDim (host_type) > 0) { - /* If the scalar type is simple, e.g. int, float ... */ - if (TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); - } - } + dev_type = CUconvertHostToDeviceType (host_type); } - /* If the node to be type converted is N_ids, its original type + /** + * If the node to be type converted is N_ids, its original type * can be AUD as well as long as the N_with on the RHS is cudarizable. * The reason a cudarizbale can produce a AUD result illustrated by * the following example: * - * cond_fun() - * { - * int[*] aa; - * int bb; + * ~~~~ + * cond_fun() + * { + * int[*] aa; + * int bb; * - * if( cond) { - * aa = with {}:genarray( shp); (cudarizable N_with) - * } - * else { - * bb = 1; - * } - * ret = cond ? aa : bb; + * if( cond) { + * aa = with {}:genarray( shp); (cudarizable N_with) + * } + * else { + * bb = 1; * } + * ret = cond ? aa : bb; + * } + * ~~~~ * */ else if (nty == N_ids) { if (NODE_TYPE (INFO_LETEXPR (arg_info)) == N_with) { /* If the scalar type is simple, e.g. int, float ... */ - if (WITH_CUDARIZABLE (INFO_LETEXPR (arg_info)) - && TYisSimple (TYgetScalar (host_type))) { - dev_type = TYcopyType (host_type); - scalar_type = TYgetScalar (dev_type); - /* Get the corresponding device simple type e.g. 
int_dev, float_dev...*/ - sty = CUh2dSimpleTypeConversion (TYgetSimpleType (scalar_type)); - /* Set the device simple type */ - scalar_type = TYsetSimpleType (scalar_type, sty); + if (WITH_CUDARIZABLE (INFO_LETEXPR (arg_info))) { + dev_type = CUconvertHostToDeviceType (host_type); } } } else { diff --git a/src/libsac2c/memory/emr_loop_optimisation.c b/src/libsac2c/memory/emr_loop_optimisation.c index 67fd382ee..06b26915f 100644 --- a/src/libsac2c/memory/emr_loop_optimisation.c +++ b/src/libsac2c/memory/emr_loop_optimisation.c @@ -507,9 +507,6 @@ EMRLfundef (node * arg_node, info * arg_info) FUNDEF_ARGS (arg_node) = TCappendArgs (FUNDEF_ARGS (arg_node), INFO_ARGS (arg_info)); INFO_ARGS (arg_info) = NULL; - - /* mark fundef as having been touched by EMRL - this used later in EMRTU */ - FUNDEF_ISEMRLIFTED (arg_node) = TRUE; } INFO_FUNDEF (arg_info) = NULL; diff --git a/src/libsac2c/xml/ast.xml b/src/libsac2c/xml/ast.xml index 43e214056..b5a418850 100644 --- a/src/libsac2c/xml/ast.xml +++ b/src/libsac2c/xml/ast.xml @@ -6010,11 +6010,6 @@ N_tfarg : Indicates whether or not we need to generate a declaration within header.c when linking to an external library. - - - TRUE iff this fundef has been affected by the EMRL optimisation. - - -- GitLab From 54490710febd211d9001e711e7874b37b1d55d79 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Mon, 8 Apr 2019 18:12:17 +0100 Subject: [PATCH 07/17] Fix incorrect traversal doflag check Completely forgot that EMRL is one by default, therefore whenever we entered IWLMEM, we entered EMRTU, which in some very rare cases did cause problems. We now check that the EMRCI and EMRTU flag is on as well! 
--- src/libsac2c/cuda/insert_withloop_memtran.c | 12 ++++++++---- src/libsac2c/stdopt/optimize.mac | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/libsac2c/cuda/insert_withloop_memtran.c b/src/libsac2c/cuda/insert_withloop_memtran.c index 80bb0b380..23757d4a8 100644 --- a/src/libsac2c/cuda/insert_withloop_memtran.c +++ b/src/libsac2c/cuda/insert_withloop_memtran.c @@ -174,7 +174,9 @@ IWLMEMdoInsertWithloopMemtran (node *syntax_tree) // syntax_tree = INFDFMSdoInferDfms( syntax_tree, HIDE_LOCALS_NEVER); /* Convert EMRL allocations lifts to device type (update fundef signiture) */ - if (global.optimize.doemrl) + if (global.optimize.doemrci + && global.optimize.doemrl + && global.optimize.doemrtu) syntax_tree = EMRTUdoEMRUpdateFun (syntax_tree); info = MakeInfo (); @@ -183,12 +185,14 @@ IWLMEMdoInsertWithloopMemtran (node *syntax_tree) syntax_tree = TRAVdo (syntax_tree, info); TRAVpop (); + info = FreeInfo (info); + /* Convert EMRL allocations lifts to device type (update ap arguments) */ - if (global.optimize.doemrl) + if (global.optimize.doemrci + && global.optimize.doemrl + && global.optimize.doemrtu) syntax_tree = EMRTUdoEMRUpdateAp (syntax_tree); - info = FreeInfo (info); - DBUG_RETURN (syntax_tree); } diff --git a/src/libsac2c/stdopt/optimize.mac b/src/libsac2c/stdopt/optimize.mac index 251e2fc82..56fc38c46 100644 --- a/src/libsac2c/stdopt/optimize.mac +++ b/src/libsac2c/stdopt/optimize.mac @@ -107,6 +107,7 @@ OPTIMIZE ("pra", pra, FALSE, FALSE, "polyhedra data reuse optimization") OPTIMIZE ("emrci", emrci, FALSE, FALSE, "EMR candidate inference") OPTIMIZE ("emrcf", emrcf, TRUE, TRUE, "EMR candidate filtering") OPTIMIZE ("emrl", emrl, TRUE, TRUE, "EMR loop memory optimisation") +OPTIMIZE ("emrtu", emrtu, TRUE, TRUE, "EMR type update optimisation for CUDA") OPTIMIZE ("rnb", rnb, FALSE, FALSE, "remove noop conditional branch in with-loops") OPTIMIZE ("rwo", rwo, TRUE, TRUE, "memory reuse with offset") OPTIMIZE ("rip", rip, TRUE, TRUE, 
"memory reuse with in place selection") -- GitLab From 53c24192ae53a1212e48c3e68d6291f27e85bed5 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Tue, 16 Apr 2019 18:39:07 +0100 Subject: [PATCH 08/17] Remove EMRTU optimisation Some testing showed that the implementation was not ideal, leading to problems in later CUDA traversals. I'm removing this, in favour of a better implementation. --- src/libsac2c/CMakeLists.txt | 1 - src/libsac2c/cuda/emr_type_update.c | 668 -------------------- src/libsac2c/cuda/emr_type_update.h | 16 - src/libsac2c/cuda/insert_withloop_memtran.c | 13 - src/libsac2c/memory/emr_loop_optimisation.c | 5 - src/libsac2c/stdopt/optimize.mac | 1 - src/libsac2c/xml/ast.xml | 10 - 7 files changed, 714 deletions(-) delete mode 100644 src/libsac2c/cuda/emr_type_update.c delete mode 100644 src/libsac2c/cuda/emr_type_update.h diff --git a/src/libsac2c/CMakeLists.txt b/src/libsac2c/CMakeLists.txt index 554ca1170..adcc25401 100644 --- a/src/libsac2c/CMakeLists.txt +++ b/src/libsac2c/CMakeLists.txt @@ -219,7 +219,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/cuda/cuda_sink_code.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/cuda_tag_executionmode.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/cuda_utils.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/data_access_analysis.c -${CMAKE_CURRENT_SOURCE_DIR}/cuda/emr_type_update.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/expand_shmem_boundary_load.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/infer_reusable_arrays.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/insert_cudast_memtran.c diff --git a/src/libsac2c/cuda/emr_type_update.c b/src/libsac2c/cuda/emr_type_update.c deleted file mode 100644 index 272423d7d..000000000 --- a/src/libsac2c/cuda/emr_type_update.c +++ /dev/null @@ -1,668 +0,0 @@ -/** - * @file - * @defgroup emrcudalift EMR loop allocation lifting for CUDA - * @ingroup cuda - * - * @brief Convert all ERCs in EMRL affected fundefs with CUDA-WL to CUDA device types. 
- * - * This traverse works *only* with IWLMEM in two modes: - * * we update initial application of loop fun, and loop fun arguments - * * we update the recursive application. - * - * The first mode is done *before* applying CUDA WL transfer insertion (IWLMEM), as we - * want to avoid adding host2device transfers for lifted ERCs. The second mode is applied - * *after* IWLMEM to update the recursive application with any LHS of host2device calls. - * - * To ilistrate what this traversal does, lets walk through with a code example. Before - * we reach IWLMEM, we have this situation: - * - * ~~~~ - * loop_entry (A) - * { - * type var_emr_lift; - * - * return loop_0 (A, var_emr_lift); - * } - * - * loop_0 (A, var_emr_tmp) - * { - * type a; - * type a_0; - * - * a = wl ( A ) [ERC: var_emr_tmp]; - * - * if (some cond) - * a_0 = loop_0 (a, A); - * return (some cond) ? a_0 : a; - * } - * ~~~~ - * - * After first mode step of EMRTU: - * - * ~~~~ - * loop_entry (A) - * { - * type_dev var_emr_lift_dev; - * - * return loop_0 (A, var_emr_lift_dev); - * } - * - * loop_0 (A, var_emr_tmp_dev) - * { - * type a; - * type a_0; - * - * a = wl ( A ) [ERC: var_emr_tmp_dev]; - * - * if (some cond) - * a_0 = loop_0 (a, A); - * return (some cond) ? a_0 : a; - * } - * ~~~~ - * - * After IWLMEM traversal: - * - * ~~~~ - * loop_entry (A) - * { - * type_dev var_emr_lift_dev; - * - * return loop_0 (A, var_emr_lift_dev); - * } - * - * loop_0 (A, var_emr_tmp_dev) - * { - * type a; - * type_dev A_dev; - * type_dev a_dev; - * type a_0; - * - * A_dev = host2device (A); - * a_dev = wl ( A_dev ) [ERC: var_emr_tmp_dev]; - * a = device2host (a_dev); - * - * if (some cond) - * a_0 = loop_0 (a, A); - * return (some cond) ? 
a_0 : a; - * } - * ~~~~ - * - * And finally after step two of EMRTU: - * - * ~~~~ - * loop_entry (A) - * { - * type_dev var_emr_lift_dev; - * - * return loop_0 (A, var_emr_lift_dev); - * } - * - * loop_0 (A, var_emr_tmp_dev) - * { - * type a; - * type_dev A_dev; - * type_dev a_dev; - * type a_0; - * - * A_dev = host2device (A); - * a_dev = wl ( A_dev ) [ERC: var_emr_tmp_dev]; - * a = device2host (a_dev); - * - * if (some cond) - * a_0 = loop_0 (a, A_dev); - * return (some cond) ? a_0 : a; - * } - * ~~~~ - * - * @note This transformation can only be applied *after* cudarizable WLs have been - * identified. - * - * @{ - */ -#include "emr_type_update.h" - -#include "types.h" -#include "traverse.h" -#include "tree_basic.h" -#include "tree_compound.h" -#include "memory.h" - -#define DBUG_PREFIX "EMRTU" -#include "debug.h" - -#include "free.h" -#include "new_types.h" -#include "cuda_utils.h" -#include "LookUpTable.h" -#include "DupTree.h" - -/** @name INFO structure - * @{ - */ -struct INFO { - node *fundef; /**< Holds current N_fundef */ - lut_t *lut; /**< LUT is used for storing either new device N_avis (for fundef update), - or N_avis pairs (for ap update) */ - node *ap_args; /**< N_ap arguments */ - bool update_ap; /**< Flag indicating what mode we are in (either fundef updating, or - ap updating) */ - node *letids; /**< The the LHS of N_prf */ - node *wl_ercs; /**< Holds all CUDA-WL ERCs for a given fundef */ -}; - -#define INFO_FUNDEF(n) ((n)->fundef) -#define INFO_LUT(n) ((n)->lut) -#define INFO_AP_ARGS(n) ((n)->ap_args) -#define INFO_UPDATE_AP(n) ((n)->update_ap) -#define INFO_LETIDS(n) ((n)->letids) -#define INFO_WLERCS(n) ((n)->wl_ercs) - -static info * -MakeInfo (void) -{ - info *result; - - DBUG_ENTER (); - - result = (info *)MEMmalloc (sizeof (info)); - - INFO_FUNDEF (result) = NULL; - INFO_LUT (result) = NULL; - INFO_AP_ARGS (result) = NULL; - INFO_UPDATE_AP (result) = FALSE; - INFO_LETIDS (result) = NULL; - INFO_WLERCS (result) = NULL; - - DBUG_RETURN 
(result); -} - -static info * -FreeInfo (info *info) -{ - DBUG_ENTER (); - - info = MEMfree (info); - - DBUG_RETURN (info); -} - -/** @} */ - -/** @name Static helper functions - * @{ - */ - -/** - * @brief Used as part of LUTmap operation to free LUT keys. - * - * @param value The LUT value - * @param key The LUT key, which will be freed - * @return the value - */ -static void * -FreeLutArgs (void *value, void *key) -{ - key = MEMfree (key); - return value; -} - -/** - * @brief Search through N_exprs chain (of N_id) for N_avis - * - * @param exprs_chain N_exprs chain containing N_id - * @param avis The N_avis to look for - * @return true if found, otherwise false - */ -static bool -IsAvisInExprs (node *exprs_chain, node *avis) -{ - bool ret = FALSE; - - DBUG_ENTER (); - - while (exprs_chain != NULL) { - if (ID_AVIS (EXPRS_EXPR (exprs_chain)) == avis) { - ret = TRUE; - break; - } - exprs_chain = EXPRS_NEXT (exprs_chain); - } - - DBUG_RETURN (ret); -} - -/** - * @brief Anonymouse traversal function (N_with) - * - * @param arg_node N_with - * @param arg_info info struct - * @return N_with - */ -static node * -ATravWith (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - /* we only need to deal with CUDA WLs */ - if (WITH_CUDARIZABLE (arg_node)) { - WITH_WITHOP (arg_node) = TRAVdo (WITH_WITHOP (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * @brief Anonymouse traversal function (N_modarray) - * - * @param arg_node N_modarray - * @param arg_info info struct - * @return N_modarray - */ -static node * -ATravModarray (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - INFO_WLERCS (arg_info) - = TCappendExprs (INFO_WLERCS (arg_info), DUPdoDupTree (MODARRAY_ERC (arg_node))); - MODARRAY_NEXT (arg_node) = TRAVopt (MODARRAY_NEXT (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * @brief Anonymouse traversal function (N_genarray) - * - * @param arg_node N_genarray - * @param arg_info info struct - * @return N_genarray - */ -static node 
* -ATravGenarray (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - INFO_WLERCS (arg_info) - = TCappendExprs (INFO_WLERCS (arg_info), DUPdoDupTree (GENARRAY_ERC (arg_node))); - GENARRAY_NEXT (arg_node) = TRAVopt (GENARRAY_NEXT (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} -/** @} */ - -/** @name Entry functions - * @{ - */ - -/** - * @brief Traverse syntax_tree looking for loop functions with additional arguments - * due to EMRL (EMR loop optimisation), and convert these if they used by - * CUDA wothloops. - * - * @param syntax_tree - * @return syntax_tree - */ -node * -EMRTUdoEMRUpdateFun (node *syntax_tree) -{ - info *info; - - DBUG_ENTER (); - - info = MakeInfo (); - - TRAVpush (TR_emrtu); - syntax_tree = TRAVdo (syntax_tree, info); - TRAVpop (); - - info = FreeInfo (info); - - DBUG_RETURN (syntax_tree); -} - -/** - * @brief Traverse syntax_tree and update all loop function recursive calls where - * one or more arguments are the result of EMRL optimisation. We search for - * host2device primitives and replace their argument with the one in the - * application. - * - * @param syntax_tree - * @return syntax_tree - */ -node * -EMRTUdoEMRUpdateAp (node *syntax_tree) -{ - info *info; - - DBUG_ENTER (); - - info = MakeInfo (); - - INFO_UPDATE_AP (info) = TRUE; - - TRAVpush (TR_emrtu); - syntax_tree = TRAVdo (syntax_tree, info); - TRAVpop (); - - info = FreeInfo (info); - - DBUG_RETURN (syntax_tree); -} -/** @} */ - -/** @name Traversal functions - * @{ - */ - -/** - * @brief Traverse body of LAC functions - * - * We do this in two stages, the first stage looks for initial application of loop - * function. The second stage enters into the loop function to search for or modify its - * arguments. 
- * - * @param arg_node N_fundef - * @param arg_info info struct - * @return N_fundef - */ -node * -EMRTUfundef (node *arg_node, info *arg_info) -{ - node *old_fundef; - - DBUG_ENTER (); - - if (!FUNDEF_ISLACFUN (arg_node)) { - DBUG_PRINT ("inspecting body of %s", FUNDEF_NAME (arg_node)); - INFO_FUNDEF (arg_info) = arg_node; - FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); - FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); - } else if (INFO_AP_ARGS (arg_info) != NULL) { - DBUG_PRINT ("inspecting application body of %s", FUNDEF_NAME (arg_node)); - old_fundef = INFO_FUNDEF (arg_info); - INFO_FUNDEF (arg_info) = arg_node; - FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); - INFO_FUNDEF (arg_info) = old_fundef; - } else - FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * @brief Traverse application functions if they are LAC functions *and* were modified by - * EMRL previously (that they have lifted allocations) - * - * If we are at the initial application, we either - * - * a. search for CUDA-WL within fundef and collect ERCs, and if any of the ERCs are - * arguments of the fundef, we update these to be device type. - * b. check for host2device primitives, and if their RHS is an argument of the recursive - * N_ap, change the argument to be the LHS. 
- * - * @param arg_node N_ap - * @param arg_info info struct - * @return N_ap - */ -node * -EMRTUap (node *arg_node, info *arg_info) -{ - ntype *dev_type; - node *ap_args, *fundef_args, *old_ap_args; - node *id_avis, *arg_avis, *new_avis, *vardec; - lut_t *old_lut; - - DBUG_ENTER (); - - DBUG_PRINT ("found application of %s", AP_NAME (arg_node)); - - /* we only traverse loop functions that were changed by the EMRL (EMR loop - * optimisation) previously */ - if (FUNDEF_ISLACFUN (AP_FUNDEF (arg_node))) { - if (AP_FUNDEF (arg_node) != INFO_FUNDEF (arg_info)) { /* initial application */ - DBUG_PRINT ("...checking for lifted allocations"); - - ap_args = AP_ARGS (arg_node); - fundef_args = FUNDEF_ARGS (AP_FUNDEF (arg_node)); - - /* We use the LUT for two different stages: - * 1. when looking for fundef arguments that have to be changed to device - * type) - * 2. when replacing recursive N_ap arguments with device types (to maintain - * the buffer swapping pattern) - * - * This is why we create a new LUT in all cases. - */ - old_lut = INFO_LUT (arg_info); - INFO_LUT (arg_info) = LUTgenerateLut (); - - /* if we are only updating the recurisve N_ap arguments, we don't need to - * update the initial N_ap. 
*/ - if (!INFO_UPDATE_AP (arg_info)) { - - /* We use an anonymous traversal to find all ERCs in CUDA-WLs */ - anontrav_t atrav[4] = {{N_with, &ATravWith}, - {N_genarray, &ATravGenarray}, - {N_modarray, &ATravModarray}, - {(nodetype)0, NULL}}; - - TRAVpushAnonymous (atrav, &TRAVsons); - AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); - TRAVpop (); - - if (INFO_WLERCS (arg_info) != NULL) { /* found ERCs in CUDA-WLs */ - while (ap_args != NULL) { - DBUG_ASSERT (fundef_args != NULL, - "# of Ap args != # of Fundef args!"); - - id_avis = ID_AVIS (EXPRS_EXPR (ap_args)); - arg_avis = ARG_AVIS (fundef_args); - - /* if the lifted argument is *not* a ERC of a CUDA-WL, ignore */ - if (AVIS_ISALLOCLIFT (id_avis) - && IsAvisInExprs (INFO_WLERCS (arg_info), arg_avis)) { - DBUG_PRINT ("......found lifted argument ap arg %s, fundef " - "arg %s", - AVIS_NAME (id_avis), AVIS_NAME (arg_avis)); - - /* create new local avis (cuda device type) */ - dev_type = CUconvertHostToDeviceType (AVIS_TYPE (id_avis)); - new_avis - = TBmakeAvis (TRAVtmpVarName ("emr_lift_dev"), dev_type); - AVIS_ISALLOCLIFT (new_avis) = TRUE; - - /* get old vardec and update it */ - DBUG_ASSERT (AVIS_DECL (id_avis) != NULL - && NODE_TYPE (AVIS_DECL (id_avis)) == N_vardec, - "Local avis has no vardec!"); - vardec = AVIS_DECL (id_avis); - VARDEC_AVIS (vardec) = new_avis; - AVIS_DECL (new_avis) = vardec; - - /* update application arguments */ - ID_AVIS (EXPRS_EXPR (ap_args)) = FREEdoFreeTree (id_avis); - ID_AVIS (EXPRS_EXPR (ap_args)) = new_avis; - - /* update function signature */ - dev_type = CUconvertHostToDeviceType (AVIS_TYPE (arg_avis)); - new_avis - = TBmakeAvis (TRAVtmpVarName ("emr_tmp_dev"), dev_type); - - ARG_AVIS (fundef_args) = new_avis; - AVIS_DECL (new_avis) = fundef_args; - - /* add N_avis to LUT, and use to replace values within the - * function body */ - INFO_LUT (arg_info) = LUTinsertIntoLutP (INFO_LUT (arg_info), - arg_avis, new_avis); - } - - ap_args = EXPRS_NEXT (ap_args); - 
fundef_args = ARG_NEXT (fundef_args); - } - INFO_WLERCS (arg_info) = FREEdoFreeTree (INFO_WLERCS (arg_info)); - } - } - - /* now update all N_avis in the fundef */ - DBUG_PRINT ("...going into application fundef"); - old_ap_args = INFO_AP_ARGS (arg_info); - INFO_AP_ARGS (arg_info) = AP_ARGS (arg_node); - AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); - INFO_AP_ARGS (arg_info) = old_ap_args; - - if (!INFO_UPDATE_AP (arg_info)) { - /* free all old fundef args */ - INFO_LUT (arg_info) = LUTmapLutP (INFO_LUT (arg_info), FreeLutArgs); - } - INFO_LUT (arg_info) = LUTremoveLut (INFO_LUT (arg_info)); - INFO_LUT (arg_info) = old_lut; - } else if (AP_FUNDEF (arg_node) == INFO_FUNDEF (arg_info) - && INFO_UPDATE_AP (arg_info)) { /* is recursive application */ - DBUG_PRINT ("...is recursive call"); - ap_args = INFO_AP_ARGS (arg_info); - fundef_args = AP_ARGS (arg_node); - - DBUG_ASSERT (INFO_LUT (arg_info) != NULL, "There is no LUT!"); - - while (ap_args != NULL) { - DBUG_ASSERT (fundef_args != NULL, - "# of outer Ap args != # of recursive Ap args!"); - - arg_avis = ID_AVIS (EXPRS_EXPR (fundef_args)); - - new_avis = LUTsearchInLutPp (INFO_LUT (arg_info), arg_avis); - if (arg_avis != new_avis) { - DBUG_PRINT ("......found matching ERC lift %s -> %s", - AVIS_NAME (arg_avis), AVIS_NAME (new_avis)); - ID_AVIS (EXPRS_EXPR (fundef_args)) = new_avis; - } - - ap_args = EXPRS_NEXT (ap_args); - fundef_args = EXPRS_NEXT (fundef_args); - } - } - } - - DBUG_RETURN (arg_node); -} - -/** - * @brief If we are updating the function argument, replace the ERC with the new device - * type function argument - * - * @param arg_node N_modarray - * @param arg_info info struct - * @return N_modarray - */ -node * -EMRTUmodarray (node *arg_node, info *arg_info) -{ - node *ercs, *erc; - - DBUG_ENTER (); - - if (!INFO_UPDATE_AP (arg_info)) { - ercs = MODARRAY_ERC (arg_node); - while (ercs != NULL) { - erc = LUTsearchInLutPp (INFO_LUT (arg_info), ID_AVIS (EXPRS_EXPR (ercs))); - if (ID_AVIS 
(EXPRS_EXPR (ercs)) != erc) { - DBUG_PRINT ("...found %s, replacing with %s", ID_NAME (EXPRS_EXPR (ercs)), - AVIS_NAME (erc)); - ID_AVIS (EXPRS_EXPR (ercs)) = erc; - break; - } - ercs = EXPRS_NEXT (ercs); - } - - MODARRAY_NEXT (arg_node) = TRAVopt (MODARRAY_NEXT (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * @brief If we are updating the function argument, replace the ERC with the new device - * type function argument - * - * @param arg_node N_genarray - * @param arg_info info struct - * @return N_genarray - */ -node * -EMRTUgenarray (node *arg_node, info *arg_info) -{ - node *ercs, *erc; - - DBUG_ENTER (); - - if (!INFO_UPDATE_AP (arg_info)) { - ercs = GENARRAY_ERC (arg_node); - while (ercs != NULL) { - erc = LUTsearchInLutPp (INFO_LUT (arg_info), ID_AVIS (EXPRS_EXPR (ercs))); - if (ID_AVIS (EXPRS_EXPR (ercs)) != erc) { - DBUG_PRINT ("...found %s, replacing with %s", ID_NAME (EXPRS_EXPR (ercs)), - AVIS_NAME (erc)); - ID_AVIS (EXPRS_EXPR (ercs)) = erc; - break; - } - ercs = EXPRS_NEXT (ercs); - } - - GENARRAY_NEXT (arg_node) = TRAVopt (GENARRAY_NEXT (arg_node), arg_info); - } - - DBUG_RETURN (arg_node); -} - -/** - * @brief Store LHS before traversing into RHS - * - * @param arg_node N_let - * @param arg_info info struct - * @return N_let - */ -node * -EMRTUlet (node *arg_node, info *arg_info) -{ - DBUG_ENTER (); - - LET_IDS (arg_node) = TRAVdo (LET_IDS (arg_node), arg_info); - - INFO_LETIDS (arg_info) = LET_IDS (arg_node); - LET_EXPR (arg_node) = TRAVdo (LET_EXPR (arg_node), arg_info); - - DBUG_RETURN (arg_node); -} - -/** - * @brief If we find a host2device primitive, add its LHS and argument to the LUT - * - * @param arg_node N_prf - * @param arg_info info struct - * @return N_prf - */ -node * -EMRTUprf (node *arg_node, info *arg_info) -{ - node *id_avis, *let_avis; - - DBUG_ENTER (); - - if (INFO_UPDATE_AP (arg_info)) { - if (PRF_PRF (arg_node) == F_host2device) { - id_avis = ID_AVIS (PRF_ARG1 (arg_node)); - let_avis = IDS_AVIS 
(INFO_LETIDS (arg_info)); - DBUG_PRINT ("Found host2device, %s -> %s", AVIS_NAME (id_avis), - AVIS_NAME (let_avis)); - DBUG_ASSERT (INFO_LUT (arg_info) != NULL, "There is no LUT!"); - INFO_LUT (arg_info) - = LUTinsertIntoLutP (INFO_LUT (arg_info), id_avis, let_avis); - } - } - - DBUG_RETURN (arg_node); -} -/** @} */ - -#undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/emr_type_update.h b/src/libsac2c/cuda/emr_type_update.h deleted file mode 100644 index f35a1f19a..000000000 --- a/src/libsac2c/cuda/emr_type_update.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _SAC_CUDA_EMR_LOOP_H_ -#define _SAC_CUDA_EMR_LOOP_H_ - -#include "types.h" - -extern node *EMRTUdoEMRUpdateFun (node *syntax_tree); -extern node *EMRTUdoEMRUpdateAp (node *syntax_tree); - -extern node *EMRTUfundef (node *arg_node, info *arg_info); -extern node *EMRTUap (node *arg_node, info *arg_info); -extern node *EMRTUmodarray (node *arg_node, info *arg_info); -extern node *EMRTUgenarray (node *arg_node, info *arg_info); -extern node *EMRTUlet (node *arg_node, info *arg_info); -extern node *EMRTUprf (node *arg_node, info *arg_info); - -#endif /* _SAC_CUDA_EMR_LOOP_H_ */ diff --git a/src/libsac2c/cuda/insert_withloop_memtran.c b/src/libsac2c/cuda/insert_withloop_memtran.c index 23757d4a8..efc932ee0 100644 --- a/src/libsac2c/cuda/insert_withloop_memtran.c +++ b/src/libsac2c/cuda/insert_withloop_memtran.c @@ -63,7 +63,6 @@ #include "types.h" #include "type_utils.h" #include "cuda_utils.h" -#include "emr_type_update.h" #include "DataFlowMask.h" #include "DataFlowMaskUtils.h" #include "remove_dfms.h" @@ -173,12 +172,6 @@ IWLMEMdoInsertWithloopMemtran (node *syntax_tree) */ // syntax_tree = INFDFMSdoInferDfms( syntax_tree, HIDE_LOCALS_NEVER); - /* Convert EMRL allocations lifts to device type (update fundef signiture) */ - if (global.optimize.doemrci - && global.optimize.doemrl - && global.optimize.doemrtu) - syntax_tree = EMRTUdoEMRUpdateFun (syntax_tree); - info = MakeInfo (); TRAVpush (TR_iwlmem); @@ -187,12 +180,6 
@@ IWLMEMdoInsertWithloopMemtran (node *syntax_tree) info = FreeInfo (info); - /* Convert EMRL allocations lifts to device type (update ap arguments) */ - if (global.optimize.doemrci - && global.optimize.doemrl - && global.optimize.doemrtu) - syntax_tree = EMRTUdoEMRUpdateAp (syntax_tree); - DBUG_RETURN (syntax_tree); } diff --git a/src/libsac2c/memory/emr_loop_optimisation.c b/src/libsac2c/memory/emr_loop_optimisation.c index 06b26915f..2481fb417 100644 --- a/src/libsac2c/memory/emr_loop_optimisation.c +++ b/src/libsac2c/memory/emr_loop_optimisation.c @@ -220,11 +220,6 @@ createTmpAvis (ntype *type) avis = TBmakeAvis (TRAVtmpVarName ("emr_tmp"), TYcopyType (type)); - /* XXX setting this for a loop-fun signature could cause - * problems latter in alloc.c, as we might try to create - * a local allocation. We need to check this! - */ - AVIS_ISALLOCLIFT (avis) = TRUE; DBUG_PRINT (" created %s var", AVIS_NAME (avis)); return avis; diff --git a/src/libsac2c/stdopt/optimize.mac b/src/libsac2c/stdopt/optimize.mac index 56fc38c46..251e2fc82 100644 --- a/src/libsac2c/stdopt/optimize.mac +++ b/src/libsac2c/stdopt/optimize.mac @@ -107,7 +107,6 @@ OPTIMIZE ("pra", pra, FALSE, FALSE, "polyhedra data reuse optimization") OPTIMIZE ("emrci", emrci, FALSE, FALSE, "EMR candidate inference") OPTIMIZE ("emrcf", emrcf, TRUE, TRUE, "EMR candidate filtering") OPTIMIZE ("emrl", emrl, TRUE, TRUE, "EMR loop memory optimisation") -OPTIMIZE ("emrtu", emrtu, TRUE, TRUE, "EMR type update optimisation for CUDA") OPTIMIZE ("rnb", rnb, FALSE, FALSE, "remove noop conditional branch in with-loops") OPTIMIZE ("rwo", rwo, TRUE, TRUE, "memory reuse with offset") OPTIMIZE ("rip", rip, TRUE, TRUE, "memory reuse with in place selection") diff --git a/src/libsac2c/xml/ast.xml b/src/libsac2c/xml/ast.xml index b5a418850..bb476d014 100644 --- a/src/libsac2c/xml/ast.xml +++ b/src/libsac2c/xml/ast.xml @@ -529,16 +529,6 @@ - - - - - - - - - - -- GitLab From 322a9c6e379b15c39560c95a83eeb1dc30723e41 Mon Sep 17 
00:00:00 2001 From: Hans-Nikolai Viessmann Date: Tue, 16 Apr 2019 18:49:31 +0100 Subject: [PATCH 09/17] Add comments and dbuging --- src/libsac2c/cuda/annotate_memory_transfers.c | 203 +++++++++--------- src/libsac2c/cuda/minimize_block_transfers2.c | 167 +++++++------- src/libsac2c/cuda/minimize_loop_transfers.c | 192 ++++++++--------- src/libsac2c/cuda/minimize_transfers.c | 73 +++---- 4 files changed, 299 insertions(+), 336 deletions(-) diff --git a/src/libsac2c/cuda/annotate_memory_transfers.c b/src/libsac2c/cuda/annotate_memory_transfers.c index f00639edc..66eace9dc 100644 --- a/src/libsac2c/cuda/annotate_memory_transfers.c +++ b/src/libsac2c/cuda/annotate_memory_transfers.c @@ -1,33 +1,28 @@ -/***************************************************************************** +/** + * @file + * @defgroup amtran Annotate Memory Transfers + * @ingroup cuda + * + * @brief Annotate the memory transfers that are allowed to be + * lifted from a do-fun. + * + * This module decides which and can be + * lifted out of the enclosing do-fun. Since host<->device transfers + * are expensive operations to perform in CUDA programs, and transfers + * within loop make it even more severe, eliminating transfers within + * loops as much as possible is crucial to program performance. For + * detailed explanation of what transfers can be moved out and what + * cannot, please see commets in the code. * - * @defgroup Annotate the memory transfers that are allowed to be - * lifted from a do-fun. - * - * - * This module decides which and can be - * lifted out of the enclosing do-fun. Since host<->device transfers - * are expensive operations to perform in CUDA programs, and transfers - * within loop make it even more severe, eliminating transfers within - * loops as much as possible is crucial to program performance. For - * detailed explanation of what transfers can be moved out and what - * cannot, please see commets in the code. 
- * - *****************************************************************************/ - -/** - * - * @file annotate_memory_transfers.c - * - * Prefix: AMTRAN - * - *****************************************************************************/ + * @{ + */ #include "annotate_memory_transfers.h" #include #include "tree_basic.h" #include "tree_compound.h" -#define DBUG_PREFIX "UNDEFINED" +#define DBUG_PREFIX "AMTRAN" #include "debug.h" #include "traverse.h" @@ -43,12 +38,10 @@ */ enum traverse_mode { trav_collect, trav_consolidate, trav_annotate }; -/** - * +/** * @name INFO structure * @{ - * - *****************************************************************************/ + */ struct INFO { bool indofun; nlut_t *nlut; @@ -106,10 +99,16 @@ FreeInfo (info *info) DBUG_RETURN (info); } -/** - * @} - *****************************************************************************/ +/** @} */ +/** + * @brief Find fundef arguments in application arguments + * + * @param fundef_args + * @param ap_args + * @param id + * @return matching fundef arguments + */ static node * GetFundefArgFromApArg (node *fundef_args, node *ap_args, node *id) { @@ -127,17 +126,18 @@ GetFundefArgFromApArg (node *fundef_args, node *ap_args, node *id) DBUG_RETURN (fundef_args); } -/** - * +/** * @name Entry functions * @{ + */ + +/** + * @brief * - *****************************************************************************/ -/** - * - * @fn node *AMTRANdoAnnotateMemoryTransfers( node *syntax_tree) - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANdoAnnotateMemoryTransfers (node *syntax_tree) { @@ -153,24 +153,20 @@ AMTRANdoAnnotateMemoryTransfers (node *syntax_tree) DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Traversal functions * @{ - * - 
*****************************************************************************/ + */ -/** - * - * @fn node *AMTRANfundef( node *arg_node, info *arg_info) - * +/** * @brief * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANfundef (node *arg_node, info *arg_info) { @@ -180,6 +176,7 @@ AMTRANfundef (node *arg_node, info *arg_info) /* We only traverse do-fun. */ if (FUNDEF_ISLOOPFUN (arg_node)) { + DBUG_PRINT ("(LOOP) Looking at %s...", FUNDEF_NAME (arg_node)); INFO_INDOFUN (arg_info) = TRUE; INFO_NLUT (arg_info) = NLUTgenerateNlut (FUNDEF_ARGS (arg_node), FUNDEF_VARDECS (arg_node)); @@ -208,14 +205,13 @@ AMTRANfundef (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANarg( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANarg (node *arg_node, info *arg_info) { @@ -231,14 +227,13 @@ AMTRANarg (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANassign( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANassign (node *arg_node, info *arg_info) { @@ -257,14 +252,13 @@ AMTRANassign (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANlet( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANlet (node *arg_node, info *arg_info) { @@ -299,14 +293,13 @@ AMTRANlet (node *arg_node, info 
*arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANfuncond( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANfuncond (node *arg_node, info *arg_info) { @@ -328,24 +321,26 @@ AMTRANfuncond (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANap( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANap (node *arg_node, info *arg_info) { DBUG_ENTER (); + DBUG_PRINT ("inspecting N_ap of %s...", FUNDEF_NAME (AP_FUNDEF (arg_node))); + if (INFO_INDOFUN (arg_info)) { /* If the N_ap is a recursive do-fun application * and the traverse mode is collect. */ if (INFO_FUNDEF (arg_info) == AP_FUNDEF (arg_node) && INFO_TRAVMODE (arg_info) == trav_collect) { + DBUG_PRINT ("(mode: collect), at recursive N_ap"); /* The arguments of the recursive do-fun application * need to be stored and will be used in the annotate * traversal. 
*/ @@ -362,6 +357,7 @@ AMTRANap (node *arg_node, info *arg_info) INFO_INRECURSIVEAPARGS (arg_info) = FALSE; } else if (INFO_FUNDEF (arg_info) == AP_FUNDEF (arg_node) && INFO_TRAVMODE (arg_info) == trav_annotate) { + DBUG_PRINT ("(mode: annotate), at recursive N_ap"); INFO_INRECURSIVEAPARGS (arg_info) = TRUE; AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); INFO_INRECURSIVEAPARGS (arg_info) = FALSE; @@ -372,19 +368,22 @@ AMTRANap (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *AMTRANid( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANid (node *arg_node, info *arg_info) { + int nlut_num; + DBUG_ENTER (); + DBUG_PRINT ("inspecting N_id of %s...", ID_NAME (arg_node)); + if (INFO_INDOFUN (arg_info)) { if (INFO_TRAVMODE (arg_info) == trav_collect) { /* If the N_id is: @@ -445,6 +444,7 @@ AMTRANid (node *arg_node, info *arg_info) * a_host = host2device( a_dev); */ if (!INFO_INRECURSIVEAPARGS (arg_info) && !INFO_INFUNCOND (arg_info)) { + DBUG_PRINT ("(mode: collect), adding %s to NLUT", ID_NAME (arg_node)); NLUTincNum (INFO_NLUT (arg_info), ID_AVIS (arg_node), 1); } } else if (INFO_TRAVMODE (arg_info) == trav_annotate) { @@ -456,7 +456,9 @@ AMTRANid (node *arg_node, info *arg_info) /* If the N_arg at correpsonding position cannot be * replaced by its cuda counterpart, this devicetohost * cannot be lifted */ - if (NLUTgetNum (INFO_NLUT (arg_info), ARG_AVIS (arg)) != 0) { + nlut_num = NLUTgetNum (INFO_NLUT (arg_info), ARG_AVIS (arg)); + if (nlut_num != 0) { + DBUG_PRINT ("(mode: annotate), N_avis %s found %d time, can not move done D2H", ID_NAME (arg_node), nlut_num); ASSIGN_ISNOTALLOWEDTOBEMOVEDDOWN (ID_SSAASSIGN (arg_node)) = TRUE; } } @@ -468,23 +470,24 @@ AMTRANid (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * 
@fn node *AMTRANprf( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * AMTRANprf (node *arg_node, info *arg_info) { node *id; DBUG_ENTER (); + if (INFO_INDOFUN (arg_info)) { switch (PRF_PRF (arg_node)) { case F_host2device: + DBUG_PRINT ("inspecting N_prf `F_host2device`"); /* Ensure that each is initially * tagged as can be moved out. */ if (INFO_TRAVMODE (arg_info) == trav_collect) { @@ -492,6 +495,7 @@ AMTRANprf (node *arg_node, info *arg_info) } /* If we are in trav_annotate traverse mode */ if (INFO_TRAVMODE (arg_info) == trav_annotate) { + DBUG_PRINT ("(mode: annoate), checking N_prf argument refcount"); id = PRF_ARG1 (arg_node); /* We only look at whose host N_id @@ -506,6 +510,7 @@ AMTRANprf (node *arg_node, info *arg_info) /* If the reference count of the host N_id is not 0, * we annotates the transfer to be not allowed to be moved out. */ if (NLUTgetNum (INFO_NLUT (arg_info), ID_AVIS (id)) != 0) { + DBUG_PRINT (" cannot move-out h2d of %s", ID_NAME (id)); ASSIGN_ISNOTALLOWEDTOBEMOVEDUP (INFO_LASTASSIGN (arg_info)) = TRUE; } else { @@ -595,12 +600,14 @@ AMTRANprf (node *arg_node, info *arg_info) } break; case F_device2host: + DBUG_PRINT ("inspecting N_prf `F_device2host`"); /* Ensure that each device2host is initially * tagged as can be moved out */ if (INFO_TRAVMODE (arg_info) == trav_collect) { ASSIGN_ISNOTALLOWEDTOBEMOVEDDOWN (INFO_LASTASSIGN (arg_info)) = FALSE; } if (INFO_TRAVMODE (arg_info) == trav_annotate) { + DBUG_PRINT ("(mode: annoate), checking N_prf argument refcount"); /* If the reference count of the host N_id is not 0, * we annotates the transfer to be not allowed to be moved out. 
*/ if (NLUTgetNum (INFO_NLUT (arg_info), IDS_AVIS (INFO_LETIDS (arg_info))) @@ -618,12 +625,6 @@ AMTRANprf (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ - +/** @} */ +/** @} */ #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/minimize_block_transfers2.c b/src/libsac2c/cuda/minimize_block_transfers2.c index d40de27f5..64e5d3c00 100644 --- a/src/libsac2c/cuda/minimize_block_transfers2.c +++ b/src/libsac2c/cuda/minimize_block_transfers2.c @@ -1,42 +1,41 @@ -/** +/** + * @file + * @defgroup mbtran2 Minimize Block Transfers + * @ingroup cuda + * + * @brief Minimize the number of host<->device transfers in a + * sequential block of instructions. + * + * This modules tries to eliminate / instructions + * in a sequential block of code. Two difference cases expose the opportunities + * for elimination: + * + * 1. + * ~~~~ + * a_host = device2host( b_dev); + * ... + * ... + * a_dev = host2device( a_host); + * ~~~~ + * + * The second memory transfer, i.e. a_dev = host2device( a_host) + * can be eliminated. Any reference to a_dev after it will be + * replaced by b_dev. + * + * 2. + * ~~~~ + * b_dev = host2device( a_host); + * ... + * ... + * c_dev = host2device( a_host); + * ~~~~ + * + * The second memory transfer, i.e. c_dev = host2device( a_host) + * can be eliminated. Any reference to c_dev after it will be + * replaced by b_dev. * - * @defgroup Minimize the number of host<->device transfers in a - * sequential block of instructions. - * - * This modules tries to eliminate / instructions - * in a sequential block of code. Two difference cases expose the opportunities - * for elimination: - * - * 1) a_host = device2host( b_dev); - * ... - * ... - * a_dev = host2device( a_host); - * - * The second memory transfer, i.e. a_dev = host2device( a_host) - * can be eliminated. 
Any reference to a_dev after it will be - * replaced by b_dev. - * - * - * - * 2) b_dev = host2device( a_host); - * ... - * ... - * c_dev = host2device( a_host); - * - * The second memory transfer, i.e. c_dev = host2device( a_host) - * can be eliminated. Any reference to c_dev after it will be - * replaced by b_dev. - * - * - *****************************************************************************/ - -/** - * - * @file minimize_block_transfers.c - * - * Prefix: MBTRAN2 - * - *****************************************************************************/ + * @{ + */ #include "minimize_block_transfers2.h" #include @@ -48,7 +47,7 @@ #include "LookUpTable.h" #include "memory.h" -#define DBUG_PREFIX "UNDEFINED" +#define DBUG_PREFIX "MBTRAN2" #include "debug.h" #include "deadcoderemoval.h" @@ -56,12 +55,10 @@ #include "SSACSE.h" #include "DupTree.h" -/** - * +/** * @name INFO structure * @{ - * - *****************************************************************************/ + */ struct INFO { node *current_block; node *lastassign; @@ -95,23 +92,18 @@ FreeInfo (info *info) DBUG_RETURN (info); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Entry functions * @{ - * - *****************************************************************************/ -/** - * - * @fn node *MBTRAN2doMinimizeBlockTransfers( node *syntax_tree) - * - * @brief - * - *****************************************************************************/ + */ + +/** + * @brief Invoke the CUDA block transfer minimisation traversal + * @param syntax_tree + * @return syntax tree + */ node * MBTRAN2doMinimizeBlockTransfers (node *syntax_tree) { @@ -126,33 +118,31 @@ MBTRAN2doMinimizeBlockTransfers (node *syntax_tree) info = FreeInfo (info); + DBUG_PRINT ("invoking CSE"); syntax_tree = CSEdoCommonSubexpressionElimination (syntax_tree); /* We rely on Dead Code Removal to remove the * unused / */ + DBUG_PRINT ("invoking DCR"); 
syntax_tree = DCRdoDeadCodeRemoval (syntax_tree); DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Traversal functions * @{ - * - *****************************************************************************/ + */ -/** +/** + * @brief Store current N_block in info struct and traverse the N_assigns * - * @fn node *MBTRAN2block( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ + * @param arg_node N_block + * @param arg_info info structure + * @return N_block + */ node * MBTRAN2block (node *arg_node, info *arg_info) { @@ -170,13 +160,13 @@ MBTRAN2block (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MBTRAN2assign( node *arg_node, info *arg_info) - * - * @brief +/** + * @brief Store the current N_assign in traverse statements top-down * - *****************************************************************************/ + * @param arg_node N_assign + * @param arg_info info structure + * @return N_assign + */ node * MBTRAN2assign (node *arg_node, info *arg_info) { @@ -193,13 +183,14 @@ MBTRAN2assign (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** +/** + * @brief Check if `F_host2device` argument is assigned to via `F_device2host`, + * delete the current N_prf and replace the RHS of the assign. 
* - * @fn node *MBTRAN2prf( node *arg_node, info *arg_info) - * - * @brief - * - *****************************************************************************/ + * @param arg_node N_prf + * @param arg_info info structure + * @return N_prf + */ node * MBTRAN2prf (node *arg_node, info *arg_info) { @@ -209,6 +200,7 @@ MBTRAN2prf (node *arg_node, info *arg_info) switch (PRF_PRF (arg_node)) { case F_host2device: + DBUG_PRINT ("Checking H2D to elimitating preceeding D2H"); ssaassign = AVIS_SSAASSIGN (ID_AVIS (PRF_ARG1 (arg_node))); /* if( ISDEVICE2HOST( ssaassign) && @@ -216,6 +208,7 @@ MBTRAN2prf (node *arg_node, info *arg_info) ASSIGN_CONTAINING_BLOCK( INFO_LASTASSIGN( arg_info)))) { ( */ if (ISDEVICE2HOST (ssaassign)) { + DBUG_PRINT ("...eliminating H2D and replacing LHS"); node *dev_id = PRF_ARG1 (ASSIGN_RHS (ssaassign)); node *dev_avis = ID_AVIS (dev_id); arg_node = FREEdoFreeNode (arg_node); @@ -228,12 +221,6 @@ MBTRAN2prf (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ - +/** @} */ +/** @} */ #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/minimize_loop_transfers.c b/src/libsac2c/cuda/minimize_loop_transfers.c index 219ec57a8..cbee505b7 100644 --- a/src/libsac2c/cuda/minimize_loop_transfers.c +++ b/src/libsac2c/cuda/minimize_loop_transfers.c @@ -1,23 +1,17 @@ -/***************************************************************************** +/** + * @file + * @defgroup mltran Minimize Loop Transfers + * @ingroup cuda * - * @defgroup Lift memory transfers in loops whenever possible + * @brief Lift memory transfers in loops whenever possible * + * This module implements the transformation of lifting memory transfers + * (/) out of a do-fun. Memory transfers that + * are allowed to be moved out were tagged in the previous phase, i.e. + * Annotate Memory Transfer (AMTRAN). 
* - * This module implements the transformation of lifting memory transfers - * (/) out of a do-fun. Memory transfers that - * are allowed to be moved out were tagged in the previous phase, i.e. - * Annotate Memory Transfer (AMTRAN). - * - * - *****************************************************************************/ - -/** - * - * @file minimize_loop_transfers.c - * - * Prefix: MLTRAN - * - *****************************************************************************/ + * @{ + */ #include "minimize_loop_transfers.h" #include @@ -29,7 +23,7 @@ #include "memory.h" #include "globals.h" -#define DBUG_PREFIX "MTRAN" +#define DBUG_PREFIX "MLTRAN" #include "debug.h" #include "ctinfo.h" @@ -54,12 +48,10 @@ enum traverse_mode { trav_normalfun, trav_dofun }; -/** - * +/** * @name INFO structure * @{ - * - *****************************************************************************/ + */ struct INFO { bool indofun; node *letids; @@ -135,21 +127,18 @@ FreeInfo (info *info) DBUG_RETURN (info); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Entry functions * @{ - * - *****************************************************************************/ -/** - * - * @fn node *MLTRANdoMinimizeLoopTransfers( node *syntax_tree) - * - *****************************************************************************/ + */ + +/** + * @brief + * @param syntax_tree + * @return syntax tree + */ node * MLTRANdoMinimizeLoopTransfers (node *syntax_tree) { @@ -164,29 +153,26 @@ MLTRANdoMinimizeLoopTransfers (node *syntax_tree) info = FreeInfo (info); + DBUG_PRINT ("invoking DCR"); syntax_tree = DCRdoDeadCodeRemoval (syntax_tree); DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ +/** @} */ -/** - * +/** * @name Traversal functions * @{ - * - *****************************************************************************/ + */ -/** - * - * @fn node 
*MLTRANfundef( node *arg_node, info *arg_info) - * +/** * @brief * - *****************************************************************************/ + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ node * MLTRANfundef (node *arg_node, info *arg_info) { @@ -198,6 +184,7 @@ MLTRANfundef (node *arg_node, info *arg_info) /* If the function is not a do-fun, we traverse as normal */ if (!FUNDEF_ISLOOPFUN (arg_node)) { + DBUG_PRINT ("(not LOOP) Entering %s...", FUNDEF_NAME (arg_node)); FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); } else { @@ -205,6 +192,8 @@ MLTRANfundef (node *arg_node, info *arg_info) * otherwise we traverse the next N_fundef. */ if (INFO_TRAVMODE (arg_info) == trav_dofun) { + DBUG_PRINT ("(LOOP) Entering %s...", FUNDEF_NAME (arg_node)); + /* We assign a sequential number (starting from 0) * to each argument of the do-fun */ INFO_FUNARGNUM (arg_info) = 0; @@ -222,14 +211,13 @@ MLTRANfundef (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANarg( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_arg + * @param arg_info info structure + * @return N_arg + */ node * MLTRANarg (node *arg_node, info *arg_info) { @@ -243,14 +231,13 @@ MLTRANarg (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANassign( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_assign + * @param arg_info info structure + * @return N_assign + */ node * MLTRANassign (node *arg_node, info *arg_info) { @@ -310,14 +297,13 @@ MLTRANassign (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANlet( node *arg_node, info *arg_info) - * +/** * 
@brief * - * - *****************************************************************************/ + * @param arg_node N_let + * @param arg_info info structure + * @return N_let + */ node * MLTRANlet (node *arg_node, info *arg_info) { @@ -330,14 +316,13 @@ MLTRANlet (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANap( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_ap + * @param arg_info info structure + * @return N_ap + */ node * MLTRANap (node *arg_node, info *arg_info) { @@ -348,10 +333,14 @@ MLTRANap (node *arg_node, info *arg_info) DBUG_ENTER (); + DBUG_PRINT ("ap %s", FUNDEF_NAME (AP_FUNDEF (arg_node))); + /* If the N_ap->N_fundef is a do-fun */ if (FUNDEF_ISLOOPFUN (AP_FUNDEF (arg_node))) { /* If this is NOT a recursive application of the enclosing do-fun */ if (AP_FUNDEF (arg_node) != INFO_FUNDEF (arg_info)) { + DBUG_PRINT ("...non-recursive application"); + /* Traverse the N_ap arguments first */ AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); @@ -389,6 +378,7 @@ MLTRANap (node *arg_node, info *arg_info) } /* If this is a recursive application of the enclosing do-fun. 
*/ else { + DBUG_PRINT ("...recursive application"); INFO_ISRECURSIVEAPARGS (arg_info) = TRUE; INFO_RECURSIVEAPARGS (arg_info) = AP_ARGS (arg_node); AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); @@ -401,14 +391,13 @@ MLTRANap (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANid( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_id + * @param arg_info info structure + * @return N_id + */ node * MLTRANid (node *arg_node, info *arg_info) { @@ -482,14 +471,13 @@ MLTRANid (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANfuncond( node *syntax_tree) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_funcond + * @param arg_info info structure + * @return N_funcond + */ node * MLTRANfuncond (node *arg_node, info *arg_info) { @@ -566,14 +554,13 @@ MLTRANfuncond (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANreturn( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_return + * @param arg_info info structure + * @return N_return + */ node * MLTRANreturn (node *arg_node, info *arg_info) { @@ -634,14 +621,13 @@ MLTRANreturn (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * - * @fn node *MLTRANprf( node *arg_node, info *arg_info) - * +/** * @brief * - * - *****************************************************************************/ + * @param arg_node N_prf + * @param arg_info info structure + * @return N_prf + */ node * MLTRANprf (node *arg_node, info *arg_info) { @@ -652,10 +638,12 @@ MLTRANprf (node *arg_node, info *arg_info) if (INFO_INDOFUN (arg_info)) { switch (PRF_PRF (arg_node)) { case F_host2device: + id = PRF_ARG1 (arg_node); + 
DBUG_PRINT ("prf host2device %s -> %s", ID_NAME (id), IDS_NAME (INFO_LETIDS (arg_info))); if (!ASSIGN_ISNOTALLOWEDTOBEMOVEDUP ((INFO_LASTASSIGN (arg_info)))) { - id = PRF_ARG1 (arg_node); + DBUG_PRINT ("...can be moved up"); DBUG_ASSERT (NODE_TYPE (ID_DECL (id)) == N_arg, - "Host variable of is not declared as an N_arg!"); + "Host variable of H2D is not declared as an N_arg!"); /* If the is allowed to be moved out of the do-fun, * the host variable argument can be replaced by the device variable. * Note that if can be moved out, the host variable @@ -722,7 +710,9 @@ MLTRANprf (node *arg_node, info *arg_info) } break; case F_device2host: + DBUG_PRINT ("prf device2host"); if (!ASSIGN_ISNOTALLOWEDTOBEMOVEDDOWN ((INFO_LASTASSIGN (arg_info)))) { + DBUG_PRINT ("...can be moved down"); /* We insert the pair [N_id(host)->avis] -> [N_id(device)->avis] * into D2H table. */ INFO_D2HLUT (arg_info) @@ -739,12 +729,6 @@ MLTRANprf (node *arg_node, info *arg_info) DBUG_RETURN (arg_node); } -/** - * @} - *****************************************************************************/ - -/** - * @} - *****************************************************************************/ - +/** @} */ +/** @} */ #undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/minimize_transfers.c b/src/libsac2c/cuda/minimize_transfers.c index 7d3577528..79ee77fb7 100644 --- a/src/libsac2c/cuda/minimize_transfers.c +++ b/src/libsac2c/cuda/minimize_transfers.c @@ -1,27 +1,19 @@ -/***************************************************************************** +/** + * @file + * @defgroup mtran Minimize Transfers + * @ingroup cuda * - * @defgroup + * This is a driver module for three transformations aiming at minimizing + * the number of host<->device memory transfers. These three transformations + * are applied in a cyclic fashion since one optimization might expose more + * opportunities for another optimization. The number of cycles is currently + * set at 10. 
However, a better approach would be to stop the cycle when no + * changes occur to the AST (Unfortunately, I have yet figurred out how to + * do it). For details of each transformation, please refer to the individual + * module files. * - * - * This is a driver module for three transformations aiming at minimizing - * the number of host<->device memory transfers. These three transformations - * are applied in a cyclic fashion since one optimization might expose more - * opportunities for another optimization. The number of cycles is currently - * set at 10. However, a better approach would be to stop the cycle when no - * changes occur to the AST (Unfortunately, I have yet figurred out how to - * do it). For details of each transformation, please refer to the individual - * module files. - * - * - *****************************************************************************/ - -/** - * - * @file minimize_transfers.c - * - * Prefix: MTRAN - * - *****************************************************************************/ + * @{ + */ #include "minimize_transfers.h" #include @@ -35,20 +27,22 @@ #include "minimize_loop_transfers.h" #include "minimize_cond_transfers.h" #include "minimize_cudast_transfers.h" +#include "loop_invariant_removal.h" #include "globals.h" #include "wl_descalarization.h" -/** - * +/** * @name Entry functions * @{ + */ + +/** + * @brief Applies various optimisation to the syntax tree, to minimize CUDA + * memcpy operations. 
* - *****************************************************************************/ -/** - * - * @fn node *MLTRANdoMinimizeLoopTransfers( node *syntax_tree) - * - *****************************************************************************/ + * @param syntax_tree + * @return the syntax tree + */ node * MTRANdoMinimizeTransfers (node *syntax_tree) { @@ -56,18 +50,18 @@ MTRANdoMinimizeTransfers (node *syntax_tree) int i, j; + DBUG_PRINT ("Performaing CUDA Minimize Transfers Optimistions"); + if (global.backend == BE_cuda && global.optimize.doexpar) { - i = 0; - while (i < 10) { + DBUG_PRINT ("Performing `Expand Partitions' optimisation"); + for (i = 0; i < 10; i++) { /* syntax_tree = MBTRAN2doMinimizeBlockTransfers( syntax_tree); */ syntax_tree = ACTRANdoAnnotateCondTransfers (syntax_tree); syntax_tree = MCTRANdoMinimizeCudastCondTransfers (syntax_tree); - i++; } } - j = 0; - while (j < 10) { + for (j = 0; j < 10; j++) { syntax_tree = MCSTRANdoMinimizeCudastTransfers (syntax_tree); syntax_tree = MBTRAN2doMinimizeBlockTransfers (syntax_tree); syntax_tree = ACTRANdoAnnotateCondTransfers (syntax_tree); @@ -79,7 +73,6 @@ MTRANdoMinimizeTransfers (node *syntax_tree) /*********************************************************/ syntax_tree = AMTRANdoAnnotateMemoryTransfers (syntax_tree); syntax_tree = MLTRANdoMinimizeLoopTransfers (syntax_tree); - j++; } /* We perform loop invariant removal here because we found out @@ -89,13 +82,11 @@ MTRANdoMinimizeTransfers (node *syntax_tree) * regard to array "features" in kmeans.sac in the CUDA Rodinia * benchmark suite. 
*/ - // syntax_tree = LIRdoLoopInvariantRemoval( syntax_tree); + // syntax_tree = DLIRdoLoopInvariantRemoval (syntax_tree); DBUG_RETURN (syntax_tree); } -/** - * @} - *****************************************************************************/ - +/** @} */ +/** @} */ #undef DBUG_PREFIX -- GitLab From e27a8e8630ccb25acdc045239e38ebda20b185e0 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Tue, 16 Apr 2019 18:50:57 +0100 Subject: [PATCH 10/17] Add the MEMRT (minimize EMR transfers) traversal --- src/libsac2c/CMakeLists.txt | 1 + src/libsac2c/cuda/minimize_emr_transfers.c | 622 ++++++++++++++++++++ src/libsac2c/cuda/minimize_emr_transfers.h | 16 + src/libsac2c/cuda/minimize_transfers.c | 12 + src/libsac2c/memory/emr_loop_optimisation.c | 3 + src/libsac2c/stdopt/optimize.mac | 1 + src/libsac2c/xml/ast.xml | 16 + 7 files changed, 671 insertions(+) create mode 100644 src/libsac2c/cuda/minimize_emr_transfers.c create mode 100644 src/libsac2c/cuda/minimize_emr_transfers.h diff --git a/src/libsac2c/CMakeLists.txt b/src/libsac2c/CMakeLists.txt index adcc25401..f5619ce66 100644 --- a/src/libsac2c/CMakeLists.txt +++ b/src/libsac2c/CMakeLists.txt @@ -228,6 +228,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_block_transfers2.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_cond_transfers.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_cudast_transfers.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_loop_transfers.c +${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_emr_transfers.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/minimize_transfers.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/partial_fold.c ${CMAKE_CURRENT_SOURCE_DIR}/cuda/prepare_forloop_generation.c diff --git a/src/libsac2c/cuda/minimize_emr_transfers.c b/src/libsac2c/cuda/minimize_emr_transfers.c new file mode 100644 index 000000000..f27cb594d --- /dev/null +++ b/src/libsac2c/cuda/minimize_emr_transfers.c @@ -0,0 +1,622 @@ +/** + * @file + * @defgroup memrt Minimize EMR Transfers + * @ingroup cuda + * + * @brief Convert all ERCs in EMRL 
affected fundefs with CUDA-WL to CUDA device types. + * + * The general idea is similar to what MLTRAN does, which is to lift out h2d/d2h memcpys + * from a loopfun. This traversal, though, works for a special case, concerning loopfuns + * which have been affected by the EMRL optimisation *and* where the existing optimisations + * are not able to lift out the h2d. + * + * The latter point is especially important, as traversals like MLTRAN only lift out h2d/d2h + * if there are no further references to the RHS of h2d/d2h. When using EMRL, this check fails + * because of an extra argument in the recursive loopfun application used for the buffer-swapping. + * + * To give a concrete example, we have: + * + * ~~~~ + * lets_loop (...) { + * ... + * ret = let_loop_LOOPFUN (..., input, emr_lift); + * ... + * } + * + * let_loop_LOOPFUN (..., input, emr_tmp) { + * ... + * emr_dev = h2d (emr_tmp); + * input_dev = h2d (input); + * ... + * output_dev = wl (input_dev); [ERC: emr_dev] + * ... + * output = d2h (output_dev); + * ... + * intra = let_loop_LOOPFUN (..., output, input); + * } + * ~~~~ + * + * Through this traversal, we transform the above into: + * + * ~~~~ + * lets_loop (...) { + * ... + * emr_dev = h2d (emr_lift); + * ret = let_loop_LOOPFUN (..., input, emr_dev); + * ... + * } + * + * let_loop_LOOPFUN (..., input, emr_dev) { + * ... + * input_dev = h2d (input); + * ... + * output_dev = wl (input_dev); [ERC: emr_dev] + * ... + * output = d2h (output_dev); + * ... 
+ * intra = let_loop_LOOPFUN (..., ouput, output_dev); + * } + * ~~~~ + * + * @{ + */ +#include "minimize_emr_transfers.h" + +#define DBUG_PREFIX "MEMRT" +#include "debug.h" + +#include "types.h" +#include "traverse.h" +#include "tree_basic.h" +#include "tree_compound.h" +#include "memory.h" + +#include "free.h" +#include "cuda_utils.h" +#include "LookUpTable.h" +#include "DupTree.h" +#include "deadcoderemoval.h" + +enum trav_mode { bypass, inap, afterap }; + +/** + * @name INFO structure + * @{ + */ +struct INFO { + int funargnum; /**< used to assign ordinal values to fundef args */ + bool inemrloop; /**< flag indicating we are in a EMRL affected loop */ + enum trav_mode apmode; /**< specifies which mode we are for the N_ap traversal */ + node *fundef; /**< Holds current N_fundef */ + lut_t *lut; /**< LUT is used for storing EMRL lifted h2d RHS -> LHS mappings */ + lut_t *reclut; /**< LUT is used to store all h2d RHS -> LHS mappings */ + node *letids; /**< The the LHS of N_prf */ + node *apargs; /**< N_ap arguments */ + node *apvardecs; /**< Used to update vardecs in N_ap calling context */ + node *apassigns; /**< Used to update assigns in N_ap calling context */ + node *rec_ap; /**< the recursive loopfun N_ap */ +}; + +#define INFO_FUNDEF(n) ((n)->fundef) +#define INFO_LUT(n) ((n)->lut) +#define INFO_RECLUT(n) ((n)->reclut) +#define INFO_LETIDS(n) ((n)->letids) +#define INFO_FUNARGNUM(n) ((n)->funargnum) +#define INFO_APARGS(n) ((n)->apargs) +#define INFO_APVARDECS(n) ((n)->apvardecs) +#define INFO_APASSIGNS(n) ((n)->apassigns) +#define INFO_REC_AP(n) ((n)->rec_ap) +#define INFO_INEMRLOOP(n) ((n)->inemrloop) +#define INFO_APMODE(n) ((n)->apmode) + +static info * +MakeInfo (void) +{ + info *result; + + DBUG_ENTER (); + + result = (info *)MEMmalloc (sizeof (info)); + + INFO_FUNARGNUM (result) = 0; + INFO_FUNDEF (result) = NULL; + INFO_LUT (result) = NULL; + INFO_RECLUT (result) = NULL; + INFO_LETIDS (result) = NULL; + INFO_APARGS (result) = NULL; + INFO_APVARDECS 
(result) = NULL; + INFO_APASSIGNS (result) = NULL; + INFO_REC_AP (result) = NULL; + INFO_INEMRLOOP (result) = FALSE; + INFO_APMODE (result) = bypass; + + DBUG_RETURN (result); +} + +static info * +FreeInfo (info *info) +{ + DBUG_ENTER (); + + info = MEMfree (info); + + DBUG_RETURN (info); +} + +/** @} */ + +/** + * @name Anonymous Traversal + * @{ + */ + +/** + * @brief If the application is a the do-loop recursive loop, + * store it in the info structure + * + * @param arg_node N_ap + * @param arg_info info structure + * @return N_ap + */ +static node * +MEMRTapAnon (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + if (INFO_FUNDEF (arg_info) == AP_FUNDEF (arg_node)) { + DBUG_PRINT ("found recursive application of %s...", FUNDEF_NAME (INFO_FUNDEF (arg_info))); + INFO_REC_AP (arg_info) = arg_node; + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief Store LHS in info structure before traversing RHS + * + * @param arg_node N_let + * @param arg_info info structure + * @return N_let + */ +static node * +MEMRTletAnon (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + INFO_LETIDS (arg_info) = LET_IDS (arg_node); + LET_EXPR (arg_node) = TRAVdo (LET_EXPR (arg_node), arg_info); + + DBUG_RETURN (arg_node); +} + +/** + * @brief If the N_prf is `F_host2device`, store the mapping of + * RHS to LHS in the LUT + * + * @param arg_node N_prf + * @param arg_info info structure + * @return N_prf + */ +static node * +MEMRTprfAnon (node *arg_node, info *arg_info) +{ + node *arg_avis, *ret_avis; + + DBUG_ENTER (); + + switch (PRF_PRF (arg_node)) { + case F_host2device: + arg_avis = ID_AVIS (PRF_ARG1 (arg_node)); + ret_avis = IDS_AVIS (INFO_LETIDS (arg_info)); + DBUG_PRINT ("found h2d, adding mapping of arg to ret: %s -> %s", AVIS_NAME (arg_avis), AVIS_NAME (ret_avis)); + INFO_RECLUT (arg_info) + = LUTinsertIntoLutP (INFO_RECLUT (arg_info), arg_avis, ret_avis); + break; + default: + break; + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief Use an anonymous traversal to the 
recursive do-loop application. Additionally + * for all `F_host2device` primitives, store the mapping of RHS to LHS in a LUT. + * + * Both the recursive N_ap and the LUT containing mappings is used later to appropriately + * replace arguments in the recursive N_ap with those matching in the LUT. + * + * @param fundef A N_fundef node, from an N_ap node + * @param arg_info info structure + * @return the first argument, fundef + */ +static node * +MEMRTtravToRecAp (node *fundef, info *arg_info) +{ + node *old_fundef, *old_letids; + anontrav_t trav[4] = {{N_let, &MEMRTletAnon}, {N_ap, &MEMRTapAnon}, {N_prf, &MEMRTprfAnon}, {(nodetype)0, NULL}}; + + DBUG_ENTER (); + + DBUG_ASSERT (NODE_TYPE (fundef) == N_fundef, "First argument must be a N_fundef node!"); + DBUG_ASSERT (INFO_RECLUT (arg_info) != NULL, "The recursive LUT must be created first!"); + + old_fundef = INFO_FUNDEF (arg_info); + old_letids = INFO_LETIDS (arg_info); + INFO_FUNDEF (arg_info) = fundef; + INFO_LETIDS (arg_info) = NULL; + + TRAVpushAnonymous (trav, &TRAVsons); + FUNDEF_BODY (fundef) = TRAVdo (FUNDEF_BODY (fundef), arg_info); + TRAVpop (); + + INFO_FUNDEF (arg_info) = old_fundef; + INFO_LETIDS (arg_info) = old_letids; + + DBUG_RETURN (fundef); +} + +/** @} */ + +/** + * @name Entry function + * @{ + */ + +/** + * @brief The entry function into the MEMRT traversal. + * + * @param syntax_tree + * @return syntax tree + */ +node * +MEMRTdoMinimizeEMRTransfers (node *syntax_tree) +{ + info *info; + + DBUG_ENTER (); + + info = MakeInfo (); + + TRAVpush (TR_memrt); + syntax_tree = TRAVdo (syntax_tree, info); + TRAVpop (); + + info = FreeInfo (info); + + DBUG_PRINT ("invoking DCR"); + syntax_tree = DCRdoDeadCodeRemoval (syntax_tree); + + DBUG_RETURN (syntax_tree); +} + +/** @} */ + +/** + * @name Traversal functions + * @{ + */ + +/** + * @brief Traverse N_fundefs, if its an EMRL affected loopfun, traverse + * only the body. 
+ * + * @param arg_node N_fundef + * @param arg_info info structure + * @return N_fundef + */ +node * +MEMRTfundef (node *arg_node, info *arg_info) +{ + bool old_inemrloop; + + DBUG_ENTER (); + + INFO_FUNDEF (arg_info) = arg_node; + + if (!FUNDEF_ISEMRLIFTED (arg_node)) { + FUNDEF_BODY (arg_node) = TRAVopt (FUNDEF_BODY (arg_node), arg_info); + FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); + } else if (INFO_APMODE (arg_info) == inap) { /* EMR lifted loop */ + DBUG_PRINT ("inspecting EMR affected do-loop %s...", FUNDEF_NAME (arg_node)); + /* We assign a sequential number (starting from 0) to each argument of the loopfun */ + INFO_FUNARGNUM (arg_info) = 0; + FUNDEF_ARGS (arg_node) = TRAVopt (FUNDEF_ARGS (arg_node), arg_info); + + old_inemrloop = INFO_INEMRLOOP (arg_info); + INFO_INEMRLOOP (arg_info) = TRUE; + FUNDEF_BODY (arg_node) = TRAVdo (FUNDEF_BODY (arg_node), arg_info); + INFO_INEMRLOOP (arg_info) = old_inemrloop; + } else { + FUNDEF_NEXT (arg_node) = TRAVopt (FUNDEF_NEXT (arg_node), arg_info); + } + + + DBUG_RETURN (arg_node); +} + +/** + * @brief Traversal N_fundef arguments and assign an ordinal value + * + * With this we can retrieve an argument using the ordinal value. + * + * @param arg_node N_arg + * @param arg_info info structure + * @return N_arg + */ +node * +MEMRTarg (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + ARG_LINKSIGN (arg_node) = INFO_FUNARGNUM (arg_info); + INFO_FUNARGNUM (arg_info) += 1; + + ARG_NEXT (arg_node) = TRAVopt (ARG_NEXT (arg_node), arg_info); + + DBUG_RETURN (arg_node); +} + +/** + * @brief Traverse statements, if statement is initial N_ap of a loopfun, + * update the current context with values from info structure. 
+ * + * @param arg_node N_assign + * @param arg_info info structure + * @return N_assign + */ +node * +MEMRTassign (node *arg_node, info *arg_info) +{ + node *old_next, *newold_assign, *old_ap_assigns, *old_ap_vardecs; + + DBUG_ENTER (); + + /* stack info fields */ + old_ap_assigns = INFO_APASSIGNS (arg_info); + old_ap_vardecs = INFO_APVARDECS (arg_info); + + ASSIGN_STMT (arg_node) = TRAVdo (ASSIGN_STMT (arg_node), arg_info); + + if (INFO_APMODE (arg_info) == afterap) { + DBUG_PRINT ("updating assigns in calling context"); + old_next = ASSIGN_NEXT (arg_node); + ASSIGN_NEXT (arg_node) = NULL; + + /* add h2d in calling context before N_ap */ + arg_node = TCappendAssign (INFO_APASSIGNS (arg_info), arg_node); + + /* add needed vardecs to calling context */ + FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) + = TCappendVardec (INFO_APVARDECS (arg_info), + FUNDEF_VARDECS (INFO_FUNDEF (arg_info))); + + /* restore values */ + INFO_APASSIGNS (arg_info) = old_ap_assigns; + INFO_APVARDECS (arg_info) = old_ap_vardecs; + INFO_APMODE (arg_info) = bypass; + + /* re-attach original next node to end of new assigns */ + newold_assign = arg_node; + while (ASSIGN_NEXT (newold_assign) != NULL) { + newold_assign = ASSIGN_NEXT (newold_assign); + } + + ASSIGN_NEXT (newold_assign) = old_next; + ASSIGN_NEXT (newold_assign) = TRAVopt (ASSIGN_NEXT (newold_assign), arg_info); + } else { + ASSIGN_NEXT (arg_node) = TRAVopt (ASSIGN_NEXT (arg_node), arg_info); + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief Store LHS of N_let before traversing RHS + * + * @param arg_node N_let + * @param arg_info info structure + * @return N_let + */ +node * +MEMRTlet (node *arg_node, info *arg_info) +{ + DBUG_ENTER (); + + INFO_LETIDS (arg_info) = LET_IDS (arg_node); + LET_EXPR (arg_node) = TRAVdo (LET_EXPR (arg_node), arg_info); + INFO_LETIDS (arg_info) = NULL; + + DBUG_RETURN (arg_node); +} + +/** + * @brief Replace current N_id with one stored in LUT + * + * @param arg_node N_id + * @param arg_info info structure 
+ * @return N_id + */ +node * +MEMRTid (node *arg_node, info *arg_info) +{ + node *avis; + + DBUG_ENTER (); + + if (INFO_INEMRLOOP (arg_info)) { + /* If this N_id occurs in a place other than the argument list + * of a recursive application of the enclosing do-fun, reset its + * N_avis to the new N_avis. This is necessary when + * a is lifted out of the do-fun, and therefore + * the device variable is passed to the do-fun as an argument + * instead of a locally declared/defined variable. */ + avis = LUTsearchInLutPp (INFO_LUT (arg_info), ID_AVIS (arg_node)); + if (avis != ID_AVIS (arg_node)) { + ID_AVIS (arg_node) = avis; + } + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief If the N_ap is the initial call for a EMRL affected loopfun, + * traverse the loopfun's body and lift out H2D memcpys + * + * Here we setup the info structure, creating two LUTs and storing some + * stateful information. Additionally we call `MEMRTtravToRecAp` to + * store the recursive loopfun N_ap and populate one of the LUTs. + * + * After traversing, we reset the info structure to a previous state. 
+ * + * @param arg_node N_ap + * @param arg_info info structure + * @return N_ap + */ +node * +MEMRTap (node *arg_node, info *arg_info) +{ + node *old_ap_args, *old_fundef, *old_rec_ap; + lut_t *old_lut, *old_reclut; + + DBUG_ENTER (); + + if (FUNDEF_ISLOOPFUN (AP_FUNDEF (arg_node)) + && FUNDEF_ISEMRLIFTED (AP_FUNDEF (arg_node))) { + if (INFO_FUNDEF (arg_info) != AP_FUNDEF (arg_node)) { /* initial application */ + DBUG_PRINT ("inspecting initial application of %s...", FUNDEF_NAME (AP_FUNDEF (arg_node))); + + /* traverse arguments first */ + AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); + + /* stack info fields */ + old_fundef = INFO_FUNDEF (arg_info); + old_ap_args = INFO_APARGS (arg_info); + old_rec_ap = INFO_REC_AP (arg_info); + old_lut = INFO_LUT (arg_info); + old_reclut = INFO_RECLUT (arg_info); + + /* initialise info fields */ + INFO_APARGS (arg_info) = AP_ARGS (arg_node); + INFO_APASSIGNS (arg_info) = NULL; + INFO_APVARDECS (arg_info) = NULL; + INFO_LUT (arg_info) = LUTgenerateLut (); + INFO_RECLUT (arg_info) = LUTgenerateLut (); + + /* we find the recursive N_ap and fill RECLUT with h2d arg to ret mappings */ + AP_FUNDEF (arg_node) = MEMRTtravToRecAp (AP_FUNDEF (arg_node), arg_info); + + INFO_APMODE (arg_info) = inap; + AP_FUNDEF (arg_node) = TRAVdo (AP_FUNDEF (arg_node), arg_info); + INFO_APMODE (arg_info) = afterap; + + /* reset all the info fields */ + INFO_LUT (arg_info) = LUTremoveLut (INFO_LUT (arg_info)); + INFO_LUT (arg_info) = old_lut; + INFO_RECLUT (arg_info) = LUTremoveLut (INFO_RECLUT (arg_info)); + INFO_RECLUT (arg_info) = old_reclut; + INFO_FUNDEF (arg_info) = old_fundef; + INFO_APARGS (arg_info) = old_ap_args; + INFO_REC_AP (arg_info) = old_rec_ap; + } + } else { + AP_ARGS (arg_node) = TRAVopt (AP_ARGS (arg_node), arg_info); + } + + DBUG_RETURN (arg_node); +} + +/** + * @brief If we find a `F_host2device` primitive, we check if its argument + * was created via EMRL lifting out an allocation. 
If so, we lift + * out the primitive and update the loopfun appropriately. + * + * Assuming we are in a EMRL affected loopfun, if the argument of a `F_host2device` + * primitive is a lifted allocation, we transfer the primitive and declaration via + * the info structure (see N_assign for application). Additionally we place into + * LUT the primitives RHS -> LHS, such that we update all subsequent references + * correctly. Finally we update the correct argument in the recursive N_ap. + * + * @param arg_node N_prf + * @param arg_info info structure + * @return N_prf + */ +node * +MEMRTprf (node *arg_node, info *arg_info) +{ + node *id, *id_decl, *aparg, *ret_avis, *recaparg, *recapexprs; + + DBUG_ENTER (); + + if (INFO_INEMRLOOP (arg_info)) { + switch (PRF_PRF (arg_node)) { + case F_host2device: + id = PRF_ARG1 (arg_node); + id_decl = ID_DECL (id); + + if (NODE_TYPE (id_decl) == N_arg) { + /* host var is passed as argument of do-loop */ + aparg = CUnthApArg (INFO_APARGS (arg_info), ARG_LINKSIGN (id_decl)); + DBUG_ASSERT (NODE_TYPE (aparg) == N_id, + "Arguments of N_ap must be N_id nodes!"); + if (AVIS_ISALLOCLIFT (ID_AVIS (aparg))) { + /* this var is the result of EMRL alloc lifting */ + DBUG_PRINT ("Found H2D that was EMRL lifted: %s (ap) -> %s", ID_NAME (aparg), ID_NAME (id)); + /* We change the argument, e.g. a_host to + * device variable, e.g. a_dev */ + node *vardec = IDS_DECL (INFO_LETIDS (arg_info)); + ARG_AVIS (id_decl) = DUPdoDupNode (VARDEC_AVIS (vardec)); + AVIS_SSAASSIGN (ARG_AVIS (id_decl)) = NULL; + AVIS_DECL (ARG_AVIS (id_decl)) = id_decl; + + /* Insert pair [N_vardec->avis] -> [N_arg->avis] into H2D + * table. Therefore, N_vardec->avis of any subsequent N_id + * nodes will be replaced by N_arg->avis. */ + INFO_LUT (arg_info) + = LUTinsertIntoLutP (INFO_LUT (arg_info), VARDEC_AVIS (vardec), + ARG_AVIS (id_decl)); + + /* Create N_vardec and in the calling context + * i.e. 
lifting the */ + node *new_avis = DUPdoDupNode (ARG_AVIS (id_decl)); + INFO_APVARDECS (arg_info) + = TBmakeVardec (new_avis, INFO_APVARDECS (arg_info)); + + INFO_APASSIGNS (arg_info) + = TBmakeAssign (TBmakeLet (TBmakeIds (new_avis, NULL), + TBmakePrf (F_host2device, + TBmakeExprs (TBmakeId ( + ID_AVIS (aparg)), + NULL))), + INFO_APASSIGNS (arg_info)); + + /* Replace the N_avis of ap_arg to the new device N_avis */ + ID_AVIS (aparg) = new_avis; + /* Maintain SSA property */ + AVIS_SSAASSIGN (new_avis) = INFO_APASSIGNS (arg_info); + + /* update recursive N_ap argument appropriately */ + recapexprs = TCgetNthExprs ((size_t)ARG_LINKSIGN (id_decl), AP_ARGS (INFO_REC_AP (arg_info))); + recaparg = EXPRS_EXPR (recapexprs); + ret_avis = LUTsearchInLutPp (INFO_RECLUT (arg_info), ID_AVIS (recaparg)); + if (ret_avis == ID_AVIS (recaparg)) { + DBUG_UNREACHABLE ("%s does not exist in RECLUT!", ID_NAME (recaparg)); + } + DBUG_PRINT ("replacing %s -> %s in recursive N_ap", ID_NAME (recaparg), AVIS_NAME (ret_avis)); + ID_AVIS (recaparg) = ret_avis; + } + } + break; + default: + PRF_ARGS (arg_node) = TRAVopt (PRF_ARGS (arg_node), arg_info); + break; + } + } + + DBUG_RETURN (arg_node); +} + +/** @} */ +/** @} */ +#undef DBUG_PREFIX diff --git a/src/libsac2c/cuda/minimize_emr_transfers.h b/src/libsac2c/cuda/minimize_emr_transfers.h new file mode 100644 index 000000000..4a05cecc4 --- /dev/null +++ b/src/libsac2c/cuda/minimize_emr_transfers.h @@ -0,0 +1,16 @@ +#ifndef _SAC_CUDA_MEMRT_H_ +#define _SAC_CUDA_MEMRT_H_ + +#include "types.h" + +extern node *MEMRTdoMinimizeEMRTransfers (node *syntax_tree); + +extern node *MEMRTfundef (node *arg_node, info *arg_info); +extern node *MEMRTarg (node *arg_node, info *arg_info); +extern node *MEMRTassign (node *arg_node, info *arg_info); +extern node *MEMRTlet (node *arg_node, info *arg_info); +extern node *MEMRTid (node *arg_node, info *arg_info); +extern node *MEMRTap (node *arg_node, info *arg_info); +extern node *MEMRTprf (node *arg_node, info 
*arg_info); + +#endif /* _SAC_CUDA_MEMRT_H_ */ diff --git a/src/libsac2c/cuda/minimize_transfers.c b/src/libsac2c/cuda/minimize_transfers.c index 79ee77fb7..da2c706ee 100644 --- a/src/libsac2c/cuda/minimize_transfers.c +++ b/src/libsac2c/cuda/minimize_transfers.c @@ -27,6 +27,7 @@ #include "minimize_loop_transfers.h" #include "minimize_cond_transfers.h" #include "minimize_cudast_transfers.h" +#include "minimize_emr_transfers.h" #include "loop_invariant_removal.h" #include "globals.h" #include "wl_descalarization.h" @@ -75,6 +76,17 @@ MTRANdoMinimizeTransfers (node *syntax_tree) syntax_tree = MLTRANdoMinimizeLoopTransfers (syntax_tree); } + /* For any EMR lifted allocations which are H2Ds within a do-loop, + * we artificially lift these out, similar to MLTRAN above. We assume + * that because of the buffer-swapping, there is always a suitable + * device type to pass in as part of recursive call within the do-loop. + * We apply this optimisation *after* we have completed all other + * transfer minimisations, this avoids a problem whereby vardecs within + * the do-loop are not removed, leaving dangling assigns. 
+ */ + if (global.optimize.doemrci && global.optimize.domemrt) + syntax_tree = MEMRTdoMinimizeEMRTransfers (syntax_tree); + /* We perform loop invariant removal here because we found out * that that there are certained cases that are ignored by our * CUDA specific transfer removal, namely if a transfer is loop diff --git a/src/libsac2c/memory/emr_loop_optimisation.c b/src/libsac2c/memory/emr_loop_optimisation.c index 2481fb417..c03640ae4 100644 --- a/src/libsac2c/memory/emr_loop_optimisation.c +++ b/src/libsac2c/memory/emr_loop_optimisation.c @@ -502,6 +502,9 @@ EMRLfundef (node * arg_node, info * arg_info) FUNDEF_ARGS (arg_node) = TCappendArgs (FUNDEF_ARGS (arg_node), INFO_ARGS (arg_info)); INFO_ARGS (arg_info) = NULL; + + /* mark fundef as having been touched by EMRL - this used later in EMRTU */ + FUNDEF_ISEMRLIFTED (arg_node) = TRUE; } INFO_FUNDEF (arg_info) = NULL; diff --git a/src/libsac2c/stdopt/optimize.mac b/src/libsac2c/stdopt/optimize.mac index 251e2fc82..0884b6cfd 100644 --- a/src/libsac2c/stdopt/optimize.mac +++ b/src/libsac2c/stdopt/optimize.mac @@ -107,6 +107,7 @@ OPTIMIZE ("pra", pra, FALSE, FALSE, "polyhedra data reuse optimization") OPTIMIZE ("emrci", emrci, FALSE, FALSE, "EMR candidate inference") OPTIMIZE ("emrcf", emrcf, TRUE, TRUE, "EMR candidate filtering") OPTIMIZE ("emrl", emrl, TRUE, TRUE, "EMR loop memory optimisation") +OPTIMIZE ("memrt", memrt, TRUE, TRUE, "Minimize memcpy transfers for EMRL affected loop functions") OPTIMIZE ("rnb", rnb, FALSE, FALSE, "remove noop conditional branch in with-loops") OPTIMIZE ("rwo", rwo, TRUE, TRUE, "memory reuse with offset") OPTIMIZE ("rip", rip, TRUE, TRUE, "memory reuse with in place selection") diff --git a/src/libsac2c/xml/ast.xml b/src/libsac2c/xml/ast.xml index bb476d014..9a813dc99 100644 --- a/src/libsac2c/xml/ast.xml +++ b/src/libsac2c/xml/ast.xml @@ -494,6 +494,17 @@ + + + + + + + + + + + @@ -6000,6 +6011,11 @@ N_tfarg : Indicates whether or not we need to generate a declaration within 
header.c when linking to an external library. + + + TRUE iff this fundef has been affected by the EMRL optimisation. + + -- GitLab From 9f102ca1a6c51de0351f9f5c95e0dc5ad11daead Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Tue, 16 Apr 2019 19:51:06 +0100 Subject: [PATCH 11/17] added test for MEMRT --- tests/cuda/test-memrt-lift1.sac | 50 +++++++++++++++++++++++++++++++++ tests/mini-stdlib.sac | 43 ++++++++++++---------------- 2 files changed, 68 insertions(+), 25 deletions(-) create mode 100644 tests/cuda/test-memrt-lift1.sac diff --git a/tests/cuda/test-memrt-lift1.sac b/tests/cuda/test-memrt-lift1.sac new file mode 100644 index 000000000..13d451dbe --- /dev/null +++ b/tests/cuda/test-memrt-lift1.sac @@ -0,0 +1,50 @@ +// This is only for the CUDA-backend, to test the MEMRT traversal, which in effect +// translates an existing transformation by the EMRL traversal (to lift out allocations +// from loops) into something more suitable for CUDA execution. +// +// SAC_TEST|include common.mk +// SAC_TEST|SAC2C_FLAGS += -t cuda -doemrci -doemrcf -doemrl -bcuda:mtran +// SAC_TEST|all: +// SAC_TEST|@$(SAC2C) $(SAC2C_FLAGS) -nomemrt $< | $(GREP_COMMAND_OUTPUT) '_Loop_1( .*, A, [^, ]*_emr_lifted)\|_Loop_1( .*, [^, ]*_A, A)' 2 +// SAC_TEST|@$(SAC2C) $(SAC2C_FLAGS) -domemrt $< | $(GREP_COMMAND_OUTPUT) '_Loop_1( .*, [^, ]*_A, [^, ]*_dev)\|_host2device_( [^, ]*_emr_lifted);' 2 +#include "mini-stdlib.sac" + +inline +int[+] onestep (int[+] B) +{ + A = with { + (. < x < .) : 2 * (B[x+[1,0]] + + B[x-[1,0]] + + B[x+[0,1]] + + B[x-[0,1]]); + } : modarray (B); + + return(A); +} + +noinline +int[+] operation (int[+] A) +{ + steps = 100; + + do { + B = A; + A = onestep (B); + steps--; + } while ((sum (A) > sum (B)) && (steps > 0)); + + return (A); +} + +int main () +{ + A = with { + ([0,1] <= x <= .) 
: 0; + } : genarray ([1000,1000], 500); + + A = operation (A); + + return _toi_S_ (A[100,100]); +} + + diff --git a/tests/mini-stdlib.sac b/tests/mini-stdlib.sac index 593b11e46..2b2932f87 100644 --- a/tests/mini-stdlib.sac +++ b/tests/mini-stdlib.sac @@ -21,33 +21,25 @@ inline bool != (bool a, bool b) { return _neq_SxS_ (a, b); } // Selection functions -inline int[*] sel(int[.] idx, int[*] array) -{ - new_shape = _drop_SxV_ (_sel_VxA_ ([0], _shape_A_ (idx)), - _shape_A_ (array)); - return with { - (. <= iv <= .) { - new_idx = _cat_VxV_ (idx, iv); - } : _sel_VxA_ (new_idx, array); - } : genarray (new_shape, 0); -} - -inline int[*] sel (int idx, int[*] a) -{ - return sel ([idx], a); -} - -inline bool[*] sel (int[.] idx, bool[*] array) -{ - new_shape = _drop_SxV_ (_sel_VxA_ ([0], _shape_A_ (idx)), - _shape_A_ (array)); - return with { - (. <= iv <= .) { - new_idx = _cat_VxV_ (idx, iv); - } : _sel_VxA_ (new_idx, array); - } : genarray (new_shape, false); +#define SEL_A_(typ, def) \ +inline typ[*] sel(int[.] idx, typ[*] array) \ +{ \ + new_shape = _drop_SxV_ (_sel_VxA_ ([0], _shape_A_ (idx)), \ + _shape_A_ (array)); \ + return with { \ + (. <= iv <= .) { \ + new_idx = _cat_VxV_ (idx, iv); \ + } : _sel_VxA_ (new_idx, array); \ + } : genarray (new_shape, def); \ +} \ + \ +inline typ[*] sel (int idx, typ[*] a) \ +{ \ + return sel ([idx], a); \ } +SEL_A_(int, 0) +SEL_A_(bool, false) // Shape inline int[.] shape (bool[*] a) { return _shape_A_ (a); } @@ -61,6 +53,7 @@ inline int[.] drop (int a, int[.] b) { return _drop_SxV_ (a,b); } // Increment inline int ++ (int a) { return _add_SxS_ (a, 1); } +inline int -- (int a) { return _sub_SxS_ (a, 1); } // Mixed scalar-vector operations inline int[.] + (int a, int[.] 
b) { return _add_SxV_ (a, b); } -- GitLab From 66aa782b11ce93cbb1963b33ffd2b888ae36f119 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Thu, 18 Apr 2019 20:04:32 +0100 Subject: [PATCH 12/17] Add macros for cyclic app of transformations Macros perform similar purpose as phase driver (see phase.c), the main difference is to decouple it away from just running phases cyclically. Still uses counters, but the user controls what these are. Used for minimize_transfers.c and algebraic_wlfi.c. --- src/libsac2c/arrayopt/algebraic_wlfi.c | 173 +++++++--------- src/libsac2c/cuda/minimize_block_transfers2.c | 1 + src/libsac2c/cuda/minimize_cond_transfers.c | 2 + src/libsac2c/cuda/minimize_cudast_transfers.c | 2 + src/libsac2c/cuda/minimize_emr_transfers.c | 5 +- src/libsac2c/cuda/minimize_loop_transfers.c | 2 + src/libsac2c/cuda/minimize_transfers.c | 103 ++++++--- src/libsac2c/stdopt/optimize.mac | 2 + src/libsac2c/tree/traverse_optcounter.h | 196 ++++++++++++++++++ src/tests/CMakeLists.txt | 1 + src/tests/test-traverse-optcounter.cpp | 82 ++++++++ 11 files changed, 433 insertions(+), 136 deletions(-) create mode 100644 src/libsac2c/tree/traverse_optcounter.h create mode 100644 src/tests/test-traverse-optcounter.cpp diff --git a/src/libsac2c/arrayopt/algebraic_wlfi.c b/src/libsac2c/arrayopt/algebraic_wlfi.c index 2dc726caf..1f9fbb7b8 100644 --- a/src/libsac2c/arrayopt/algebraic_wlfi.c +++ b/src/libsac2c/arrayopt/algebraic_wlfi.c @@ -77,6 +77,7 @@ #include "tree_basic.h" #include "tree_compound.h" +#include "traverse_optcounter.h" #include "node_basic.h" #include "print.h" @@ -312,20 +313,17 @@ SimplifySymbioticExpression (node *arg_node, info *arg_info) { int i = 0; int ct = 0; - size_t countDLIR = 0; - size_t countWLIR = global.optcounters.wlir_expr; - size_t countINL = 0; - size_t countCSE = global.optcounters.cse_expr; - size_t countTUP = 0; - size_t countCF = 0; - size_t countVP = 0; - size_t countREA = 0; - size_t countAS = 0; - size_t countAL = 0; - size_t 
countDL = 0; - size_t countESD = global.optcounters.esd_expr; - size_t countUESD = 0; - size_t countDCR = 0; + bool done = false; + + TOC_SETUP(14, COUNT_DLIR, COUNT_WLIR, COUNT_INL, + COUNT_CSE, COUNT_TUP, COUNT_CF, + COUNT_VP, COUNT_REA, COUNT_AS, + COUNT_AL, COUNT_DL, COUNT_ESD, + COUNT_UESD, COUNT_DCR) + + TOC_SETCOUNTER (COUNT_WLIR, global.optcounters.wlir_expr) + TOC_SETCOUNTER (COUNT_CSE, global.optcounters.cse_expr) + TOC_SETCOUNTER (COUNT_ESD, global.optcounters.esd_expr) DBUG_ENTER (); @@ -350,102 +348,75 @@ SimplifySymbioticExpression (node *arg_node, info *arg_info) #endif /* Invoke each opt */ - -#ifndef DBUG_OFF - /* debug compiler */ -#define RUNCHECK(Name) \ - if (global.check_frequency >= 4) { \ - DBUG_PRINT_TAG ("SSE", "Cycle iteration %d: running post-" #Name " check", i); \ - arg_node = PHrunConsistencyChecks (arg_node); \ - } -#else - /* production compiler does not have PHrunConsistencyChecks() */ -#define RUNCHECK(Name) /*empty*/ -#endif - -#define RUNOPT(Name, Cond, CntStmt, PassFun) \ - if (Cond) { \ - DBUG_PRINT_TAG ("SSE", "Cycle iteration %d: running " #Name, i); \ - CntStmt; \ - arg_node = PassFun (arg_node); \ - RUNCHECK (Name) \ - } - - RUNOPT (DLIR, global.optimize.dodlir, - countDLIR = global.optcounters.dlir_expr, DLIRdoLoopInvariantRemoval); - RUNOPT (WLIR, global.optimize.dowlir, - countWLIR = global.optcounters.wlir_expr, WLIRdoLoopInvariantRemoval); - RUNOPT (INL, global.optimize.doinl, countINL = global.optcounters.inl_fun, - INLdoInlining); - RUNOPT (ISAA, global.optimize.dosaa, , ISAAdoInsertShapeVariables); - RUNOPT (CSE, global.optimize.docse, countCSE = global.optcounters.cse_expr, - CSEdoCommonSubexpressionElimination); - RUNOPT (NTC, global.optimize.dotup, - countTUP = global.optcounters.tup_upgrades, NTCdoNewTypeCheck); - RUNOPT (EAT, global.optimize.dotup, , EATdoEliminateAlphaTypes); - RUNOPT (EBT, global.optimize.dotup, , EBTdoEliminateBottomTypes); - RUNOPT (DFC, TRUE, , DFCdoDispatchFunCalls); - RUNOPT (CF, 
global.optimize.docf, countCF = global.optcounters.cf_expr, - CFdoConstantFolding); - RUNOPT (VP, global.optimize.dovp, countVP = global.optcounters.vp_expr, - VPdoVarPropagation); - RUNOPT (REA, global.optimize.dorea, countREA = global.optcounters.rea_expr, - REAdoReorderEqualityprfArguments); - RUNOPT (TGTL, global.optimize.dotgtl, countREA = global.optcounters.tgtl_expr, - TGTLdoTransformGtgeToLtle); - RUNOPT (ESD, global.optimize.dosde, countESD = global.optcounters.esd_expr, - ESDdoElimSubDiv); - RUNOPT (AS, global.optimize.doas, countAS = global.optcounters.as_expr, - ASdoArithmeticSimplification); - RUNOPT (CF, global.optimize.docf, countCF = global.optcounters.cf_expr, - CFdoConstantFolding); - RUNOPT (CSE, global.optimize.docse, , CSEdoCommonSubexpressionElimination); - RUNOPT (AL, global.optimize.doal, countAL = global.optcounters.al_expr, - ALdoAssocLawOptimization); - RUNOPT (DL, global.optimize.dodl, countDL = global.optcounters.dl_expr, - DLdoDistributiveLawOptimization); - RUNOPT (UESD, global.optimize.dosde, countUESD = global.optcounters.uesd_expr, - UESDdoUndoElimSubDiv); - RUNOPT (DCR, global.optimize.dodcr, - countDCR = global.optcounters.dead_var + global.optcounters.dead_expr, - DCRdoDeadCodeRemoval); - -#undef RUNOPT -#undef RUNCHECK + TOC_RUNOPT_TAG ("SSE", "DLIR", global.optimize.dodlir, COUNT_DLIR, + global.optcounters.dlir_expr, arg_node, DLIRdoLoopInvariantRemoval); + TOC_RUNOPT_TAG ("SSE", "WLIR", global.optimize.dowlir, COUNT_WLIR, + global.optcounters.wlir_expr, arg_node, WLIRdoLoopInvariantRemoval); + TOC_RUNOPT_TAG ("SSE", "INL", global.optimize.doinl, COUNT_INL, global.optcounters.inl_fun, + arg_node, INLdoInlining); + TOC_RUNOPT_TAG ("SSE", "ISAA", global.optimize.dosaa, TOC_IGNORE, 0, arg_node, + ISAAdoInsertShapeVariables); + TOC_RUNOPT_TAG ("SSE", "CSE", global.optimize.docse, COUNT_CSE, global.optcounters.cse_expr, + arg_node, CSEdoCommonSubexpressionElimination); + TOC_RUNOPT_TAG ("SSE", "NTC", global.optimize.dotup, 
COUNT_TUP, global.optcounters.tup_upgrades, + arg_node, NTCdoNewTypeCheck); + TOC_RUNOPT_TAG ("SSE", "EAT", global.optimize.dotup, TOC_IGNORE, 0, arg_node, + EATdoEliminateAlphaTypes); + TOC_RUNOPT_TAG ("SSE", "EBT", global.optimize.dotup, TOC_IGNORE, 0, arg_node, + EBTdoEliminateBottomTypes); + TOC_RUNOPT_TAG ("SSE", "DFC", TRUE, TOC_IGNORE, 0, arg_node, DFCdoDispatchFunCalls); + TOC_RUNOPT_TAG ("SSE", "CF", global.optimize.docf, COUNT_CF, global.optcounters.cf_expr, + arg_node, CFdoConstantFolding); + TOC_RUNOPT_TAG ("SSE", "VP", global.optimize.dovp, COUNT_VP, global.optcounters.vp_expr, + arg_node, VPdoVarPropagation); + TOC_RUNOPT_TAG ("SSE", "REA", global.optimize.dorea, COUNT_REA, global.optcounters.rea_expr, + arg_node, REAdoReorderEqualityprfArguments); + TOC_RUNOPT_TAG ("SSE", "TGTL", global.optimize.dotgtl, COUNT_REA, global.optcounters.tgtl_expr, + arg_node, TGTLdoTransformGtgeToLtle); + TOC_RUNOPT_TAG ("SSE", "ESD", global.optimize.dosde, COUNT_ESD, global.optcounters.esd_expr, + arg_node, ESDdoElimSubDiv); + TOC_RUNOPT_TAG ("SSE", "AS", global.optimize.doas, COUNT_AS, global.optcounters.as_expr, + arg_node, ASdoArithmeticSimplification); + TOC_RUNOPT_TAG ("SSE", "CF", global.optimize.docf, COUNT_CF, global.optcounters.cf_expr, + arg_node, CFdoConstantFolding); + TOC_RUNOPT_TAG ("SSE", "CSE", global.optimize.docse, TOC_IGNORE, 0, arg_node, + CSEdoCommonSubexpressionElimination); + TOC_RUNOPT_TAG ("SSE", "AL", global.optimize.doal, COUNT_AL, global.optcounters.al_expr, + arg_node, ALdoAssocLawOptimization); + TOC_RUNOPT_TAG ("SSE", "DL", global.optimize.dodl, COUNT_DL, global.optcounters.dl_expr, + arg_node, DLdoDistributiveLawOptimization); + TOC_RUNOPT_TAG ("SSE", "UESD", global.optimize.dosde, COUNT_UESD, global.optcounters.uesd_expr, + arg_node, UESDdoUndoElimSubDiv); + TOC_RUNOPT_TAG ("SSE", "DCR", global.optimize.dodcr, COUNT_DCR, + global.optcounters.dead_var + global.optcounters.dead_expr, arg_node, + DCRdoDeadCodeRemoval); /* We do not count 
DCR, as it's merely for cleanup */ DBUG_PRINT_TAG ("SSE", "DLIR= %zu, WLIR= %zu, INL=%zu, CSE=%zu, TUP=%zu, CF=%zu, VP=%zu, " "AS=%zu, AL=%zu, DL=%zu, " "ESD=%zu, UESD=%zu, DCR=%zu", - (global.optcounters.dlir_expr - countDLIR), - (global.optcounters.wlir_expr - countWLIR), - (global.optcounters.inl_fun - countINL), - (global.optcounters.cse_expr - countCSE), - (global.optcounters.tup_upgrades - countTUP), - (global.optcounters.cf_expr - countCF), - (global.optcounters.vp_expr - countVP), - (global.optcounters.as_expr - countAS), - (global.optcounters.al_expr - countAL), - (global.optcounters.dl_expr - countDL), + (global.optcounters.dlir_expr - TOC_GETCOUNTER (COUNT_DLIR)), + (global.optcounters.wlir_expr - TOC_GETCOUNTER (COUNT_WLIR)), + (global.optcounters.inl_fun - TOC_GETCOUNTER (COUNT_INL)), + (global.optcounters.cse_expr - TOC_GETCOUNTER (COUNT_CSE)), + (global.optcounters.tup_upgrades - TOC_GETCOUNTER (COUNT_TUP)), + (global.optcounters.cf_expr - TOC_GETCOUNTER (COUNT_CF)), + (global.optcounters.vp_expr - TOC_GETCOUNTER (COUNT_VP)), + (global.optcounters.as_expr - TOC_GETCOUNTER (COUNT_AS)), + (global.optcounters.al_expr - TOC_GETCOUNTER (COUNT_AL)), + (global.optcounters.dl_expr - TOC_GETCOUNTER (COUNT_DL)), /* The following are not for some reason in the fixpoint check below: */ - (global.optcounters.esd_expr - countESD), - (global.optcounters.uesd_expr - countUESD), + (global.optcounters.esd_expr - TOC_GETCOUNTER (COUNT_ESD)), + (global.optcounters.uesd_expr - TOC_GETCOUNTER (COUNT_UESD)), ((global.optcounters.dead_var + global.optcounters.dead_expr) - - countDCR)); - - if (/* Fix point check */ - (countDLIR == global.optcounters.dlir_expr) - && (countWLIR == global.optcounters.wlir_expr) - && (countINL == global.optcounters.inl_fun) - && (countCSE == global.optcounters.cse_expr) - && (countTUP == global.optcounters.tup_upgrades) - && (countCF == global.optcounters.cf_expr) - && (countVP == global.optcounters.vp_expr) - && (countAS == 
global.optcounters.as_expr) - && (countAL == global.optcounters.al_expr) - && (countDL == global.optcounters.dl_expr)) { + - TOC_GETCOUNTER (COUNT_DCR))); + + /* Fix point check */ + TOC_COMPARE_RANGE (COUNT_DLIR, COUNT_DL, done) + + if (done) { i = global.max_optcycles; } } diff --git a/src/libsac2c/cuda/minimize_block_transfers2.c b/src/libsac2c/cuda/minimize_block_transfers2.c index 64e5d3c00..d3e5bc579 100644 --- a/src/libsac2c/cuda/minimize_block_transfers2.c +++ b/src/libsac2c/cuda/minimize_block_transfers2.c @@ -213,6 +213,7 @@ MBTRAN2prf (node *arg_node, info *arg_info) node *dev_avis = ID_AVIS (dev_id); arg_node = FREEdoFreeNode (arg_node); arg_node = TBmakeId (dev_avis); + global.optcounters.cuda_min_trans++; } break; default: diff --git a/src/libsac2c/cuda/minimize_cond_transfers.c b/src/libsac2c/cuda/minimize_cond_transfers.c index 01587830f..b6d68de15 100644 --- a/src/libsac2c/cuda/minimize_cond_transfers.c +++ b/src/libsac2c/cuda/minimize_cond_transfers.c @@ -304,10 +304,12 @@ MCTRANassign (node *arg_node, info *arg_info) if (INFO_APPOSTASSIGNS (arg_info) != NULL) { ASSIGN_NEXT (arg_node) = INFO_APPOSTASSIGNS (arg_info); + global.optcounters.cuda_min_trans++; } if (INFO_APPREASSIGNS (arg_info) != NULL) { arg_node = TCappendAssign (INFO_APPREASSIGNS (arg_info), arg_node); + global.optcounters.cuda_min_trans++; } FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) diff --git a/src/libsac2c/cuda/minimize_cudast_transfers.c b/src/libsac2c/cuda/minimize_cudast_transfers.c index e0ef2d22e..648fb8149 100644 --- a/src/libsac2c/cuda/minimize_cudast_transfers.c +++ b/src/libsac2c/cuda/minimize_cudast_transfers.c @@ -219,11 +219,13 @@ MCSTRANassign (node *arg_node, info *arg_info) ASSIGN_NEXT (arg_node) = NULL; arg_node = TCappendAssign (arg_node, assigns); INFO_POSTASSIGNS (arg_info) = NULL; + global.optcounters.cuda_min_trans++; } if (INFO_PREASSIGNS (arg_info) != NULL) { arg_node = TCappendAssign (INFO_PREASSIGNS (arg_info), arg_node); INFO_PREASSIGNS (arg_info) = NULL; 
+ global.optcounters.cuda_min_trans++; } } diff --git a/src/libsac2c/cuda/minimize_emr_transfers.c b/src/libsac2c/cuda/minimize_emr_transfers.c index f27cb594d..0a821d9c2 100644 --- a/src/libsac2c/cuda/minimize_emr_transfers.c +++ b/src/libsac2c/cuda/minimize_emr_transfers.c @@ -384,7 +384,10 @@ MEMRTassign (node *arg_node, info *arg_info) ASSIGN_NEXT (arg_node) = NULL; /* add h2d in calling context before N_ap */ - arg_node = TCappendAssign (INFO_APASSIGNS (arg_info), arg_node); + if (INFO_APASSIGNS (arg_info) != NULL) { + arg_node = TCappendAssign (INFO_APASSIGNS (arg_info), arg_node); + global.optcounters.cuda_min_trans++; + } /* add needed vardecs to calling context */ FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) diff --git a/src/libsac2c/cuda/minimize_loop_transfers.c b/src/libsac2c/cuda/minimize_loop_transfers.c index cbee505b7..176731316 100644 --- a/src/libsac2c/cuda/minimize_loop_transfers.c +++ b/src/libsac2c/cuda/minimize_loop_transfers.c @@ -266,10 +266,12 @@ MLTRANassign (node *arg_node, info *arg_info) if (INFO_APPOSTASSIGNS (arg_info) != NULL) { ASSIGN_NEXT (arg_node) = INFO_APPOSTASSIGNS (arg_info); + global.optcounters.cuda_min_trans+=1; } if (INFO_APPREASSIGNS (arg_info) != NULL) { arg_node = TCappendAssign (INFO_APPREASSIGNS (arg_info), arg_node); + global.optcounters.cuda_min_trans+=1; } FUNDEF_VARDECS (INFO_FUNDEF (arg_info)) diff --git a/src/libsac2c/cuda/minimize_transfers.c b/src/libsac2c/cuda/minimize_transfers.c index da2c706ee..8e116e0cf 100644 --- a/src/libsac2c/cuda/minimize_transfers.c +++ b/src/libsac2c/cuda/minimize_transfers.c @@ -3,20 +3,19 @@ * @defgroup mtran Minimize Transfers * @ingroup cuda * - * This is a driver module for three transformations aiming at minimizing + * This is a driver module for several transformations aiming at minimizing * the number of host<->device memory transfers. These three transformations * are applied in a cyclic fashion since one optimization might expose more * opportunities for another optimization. 
The number of cycles is currently - set at 10. However, a better approach would be to stop the cycle when no - changes occur to the AST (Unfortunately, I have yet figurred out how to - do it). For details of each transformation, please refer to the individual - module files. + set at max_optcycles (globals), but will terminate early if we've reached a + * fixed point. * * @{ */ #include "minimize_transfers.h" -#include +#include "phase.h" +#include "traverse_optcounter.h" #define DBUG_PREFIX "MTRAN" #include "debug.h" @@ -47,45 +46,81 @@ node * MTRANdoMinimizeTransfers (node *syntax_tree) { - DBUG_ENTER (); + int i; + bool done = false; + + TOC_SETUP(1, COUNT_TRL) - int i, j; + DBUG_ENTER (); DBUG_PRINT ("Performaing CUDA Minimize Transfers Optimistions"); - if (global.backend == BE_cuda && global.optimize.doexpar) { - DBUG_PRINT ("Performing `Expand Partitions' optimisation"); - for (i = 0; i < 10; i++) { - /* syntax_tree = MBTRAN2doMinimizeBlockTransfers( syntax_tree); */ - syntax_tree = ACTRANdoAnnotateCondTransfers (syntax_tree); - syntax_tree = MCTRANdoMinimizeCudastCondTransfers (syntax_tree); + if (global.optimize.doexpar) { + DBUG_PRINT ("Doing expar optimisation cycle:"); + for (i = 1; i < global.max_optcycles; i++) { + /* XXX disabled for some reason, further investigation needed */ + TOC_RUNOPT ("MBTRAN2", false, COUNT_TRL, + global.optcounters.cuda_min_trans, + syntax_tree, MBTRAN2doMinimizeBlockTransfers) + TOC_RUNOPT ("ACTRAN", true, TOC_IGNORE, 0, + syntax_tree, ACTRANdoAnnotateCondTransfers) + TOC_RUNOPT ("MCTRAN", true, COUNT_TRL, + global.optcounters.cuda_min_trans, + syntax_tree, MCTRANdoMinimizeCondTransfers) + + TOC_COMPARE (done) + + if (done) { + break; + } } + DBUG_PRINT ("Completed expar optimisation cycle after %d cycles", i); } - for (j = 0; j < 10; j++) { - syntax_tree = MCSTRANdoMinimizeCudastTransfers (syntax_tree); - syntax_tree = MBTRAN2doMinimizeBlockTransfers (syntax_tree); - syntax_tree = ACTRANdoAnnotateCondTransfers 
(syntax_tree); - syntax_tree = MCTRANdoMinimizeCondTransfers (syntax_tree); + /* reset counters for next cycle */ + TOC_RESETCOUNTERS () + done = false; + + DBUG_PRINT ("Doing general optimisation cycle:"); + for (i = 1; i < global.max_optcycles; i++) { + + TOC_RUNOPT ("MCSTRAN", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MCSTRANdoMinimizeCudastTransfers) + TOC_RUNOPT ("MBTRAN2", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MBTRAN2doMinimizeBlockTransfers) + TOC_RUNOPT ("ACTRAN", true, TOC_IGNORE, 0, + syntax_tree, ACTRANdoAnnotateCondTransfers) + TOC_RUNOPT ("MCTRAN", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MCTRANdoMinimizeCondTransfers) /* make sure the lifted transfer are removed when ever * possible before minimizing transfers in loops. */ - syntax_tree = MBTRAN2doMinimizeBlockTransfers (syntax_tree); - /*********************************************************/ - syntax_tree = AMTRANdoAnnotateMemoryTransfers (syntax_tree); - syntax_tree = MLTRANdoMinimizeLoopTransfers (syntax_tree); - } + TOC_RUNOPT ("MBTRAN2", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MBTRAN2doMinimizeBlockTransfers) + TOC_RUNOPT ("AMTRAN", true, TOC_IGNORE, 0, + syntax_tree, AMTRANdoAnnotateMemoryTransfers) + TOC_RUNOPT ("MLTRAN", true, COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MLTRANdoMinimizeLoopTransfers) - /* For any EMR lifted allocations which are H2Ds within a do-loop, - * we artificially lift these out, similar to MLTRAN above. We assume - * that because of the buffer-swapping, there is always a suitable - * device type to pass in as part of recursive call within the do-loop. - * We apply this optimisation *after* we have completed all other - * transfer minimisations, this avoids a problem whereby vardecs within - * the do-loop are not removed, leaving dangling assigns. 
- */ - if (global.optimize.doemrci && global.optimize.domemrt) - syntax_tree = MEMRTdoMinimizeEMRTransfers (syntax_tree); + /* For any EMR lifted allocations which are H2Ds within a do-loop, + * we artificially lift these out, similar to MLTRAN above. We assume + * that because of the buffer-swapping, there is always a suitable + * device type to pass in as part of recursive call within the do-loop. + */ + TOC_RUNOPT ("MEMRT", global.optimize.doemrci && global.optimize.domemrt, + COUNT_TRL, global.optcounters.cuda_min_trans, + syntax_tree, MEMRTdoMinimizeEMRTransfers) + + TOC_COMPARE (done) + + DBUG_PRINT ("Counter: Lift -> %zu", + (global.optcounters.cuda_min_trans - TOC_GETCOUNTER (COUNT_TRL))); + + if (done) { + break; + } + } + DBUG_PRINT ("Completed general optimisation cycle after %d cycles", i); /* We perform loop invariant removal here because we found out * that that there are certained cases that are ignored by our diff --git a/src/libsac2c/stdopt/optimize.mac b/src/libsac2c/stdopt/optimize.mac index 0884b6cfd..a4ee41559 100644 --- a/src/libsac2c/stdopt/optimize.mac +++ b/src/libsac2c/stdopt/optimize.mac @@ -224,6 +224,8 @@ OPTCOUNTER (safa_expr, TRUE, "associative function argument(s) sorted") OPTCOUNTER (pogo_expr, FALSE, "guards removed by pogo") OPTCOUNTER (pwlf_expr, FALSE, "with-loops folded using polyhedra") OPTCOUNTER (ssawl_expr, FALSE, "with-loops converted to SSA form") +/* optimisation counters for CUDA backend */ +OPTCOUNTER (cuda_min_trans, TRUE, "transfer primitives are out lifted") #undef OPTIMIZEstr #undef OPTIMIZEabbr diff --git a/src/libsac2c/tree/traverse_optcounter.h b/src/libsac2c/tree/traverse_optcounter.h new file mode 100644 index 000000000..a163ab611 --- /dev/null +++ b/src/libsac2c/tree/traverse_optcounter.h @@ -0,0 +1,196 @@ +/** + * @file + * @brief Alternative phase cycle driver, similar to actual phase cycle driver + * + * This set of macros is meant to be used to drive some sort of cyclical + * operation, such as applying 
several traversals one after the other, to a + * fixed point. This is very similar to what is happening in the compiler phase + * driver (@see global/phase.c). The main difference here is that we can count + * anything we want, as the user specifies what variable to assign to. There + * is no requirement to only use the OPTCOUNTERS (@see stdopt/optimize.mac). + * + * This does make things more verbose though, compared to using the statistics + * helper (@see stdopt/statistics.c) to manipulate the OPTCOUNTERS. + * + * The typical use case is where one currently applies several traversals + * n times, and always n times: + * + * ~~~~ + * for (i = 1; i < 10; i++) { + * node = fun1 (node); + * if (doopt) { + * node = fun2 (node); + * } + * node = fun3 (node); + * node = fun4 (node); + * } + * ~~~~ + * + * We can change this over to: + * + * ~~~~ + * TOC_SETUP (2, COUNT_ONE, COUNT_TWO) + * bool test = false; + * + * TOC_SETCOUNTER (COUNT_TWO, 10) + * + * for (i = 1; i < global.max_optcycles; i++) { + * TOC_RUNOPT ("OPT1", true, COUNT_ONE, some_count_value, node, fun1) + * TOC_RUNOPT ("OPT2", doopt, TOC_IGNORE, 0, node, fun2) + * TOC_RUNOPT ("OPT3", true, COUNT_ONE, some_count_value2, node, fun3) + * TOC_RUNOPT ("OPT4", true, COUNT_TWO, some_count_value3, node, fun4) + * + * TOC_COMPARE (test) + * + * printf ("Counter: ONE -> %zu, TWO -> %zu\n", + * TOC_GETCOUNTER (COUNT_ONE), + * TOC_GETCOUNTER (COUNT_TWO)); + * + * if (test) + * break; + * } + * ~~~~ + * + */ +#ifndef _TREE_TRAVERSE_OPT_COUNTER_H_ +#define _TREE_TRAVERSE_OPT_COUNTER_H_ + +#include "phase.h" + +/** + * @brief Setup and initialise all needed variables + * + * Here the user passes in _names_ of counters, which are kept stored in an + * enum which is used to access an array, where all the counter values are stored. + * + * A special counter, called `TOC_IGNORE`, is already set. This can be used instead + * of a real counter in cases where nothing is being stored. 
+ * + * @param num Number of counter names being passed in + * @param ... The counter names (its suggested that these should be in all-caps) + */ +#define TOC_SETUP(num, ...) \ + enum toc_optcounter_labels { TOC_IGNORE, __VA_ARGS__ }; \ + __attribute__((unused)) const size_t toc_optcount_size = num+1; \ + size_t toc_store[num+1] = {0}; \ + size_t toc_store_old[num+1] = {0}; \ + __attribute__((unused)) size_t toc_i; + +/** + * @brief Compare current counter state with previous counter state over a + * specified _range_ of counters. + * + * We iteratively compare the counter values between the current state and previous + * state. If all counters states are found to be **equal**, then the cycle has reached + * a fixed-point. If however one or more counter states are **unequal**, we continue + * the next iteration of the cycle. + * + * @param start Some label or integer indicating the start; **cannot** be less-than zero + * @param end Some label or integer indicating the end; **cannot** be greater than number + * of total counters + * @param out Variable used to store boolean result: if _true_, we've reached a + * fixed-point + */ +#define TOC_COMPARE_RANGE(start, end, out) \ + for (toc_i = start, out = true; toc_i < end; toc_i++) { \ + out = out && toc_store[toc_i] == toc_store_old[toc_i]; \ + toc_store_old[toc_i] = toc_store[toc_i]; \ + } + +/** + * @brief Compare current counter state with previous counter state for all counters + * + * @see TOC_COMPARE_RANGE + * + * @param out Variable used to store boolean result: if _true_, we've reached a + * fixed-point + */ +#define TOC_COMPARE(out) TOC_COMPARE_RANGE(1, toc_optcount_size, out) + +/** + * @brief Set counter to specified value + * + * @param label The counter name + * @param val The value to set + */ +#define TOC_SETCOUNTER(label, val) \ + toc_store[label] = toc_store_old[label] = val; + +/** + * @brief Get the current counter value + * + * @param label The counter name + * @return a value as type `size_t` + */ 
+#define TOC_GETCOUNTER(label) (toc_store[label]) + +/** + * @brief Reset all counters to default value (zero) + */ +#define TOC_RESETCOUNTERS() \ + for (toc_i = 0; toc_i < toc_optcount_size; toc_i++) { \ + toc_store[toc_i] = 0; \ + toc_store_old[toc_i] = 0; \ + } + +#ifdef DBUG_OFF + +/* in production compiler PHrunConsistencyChecks is disabled */ +#define TOC_RUNCHECK(name, node) +#define TOC_RUNCHECK_TAG(tag, name, node) + +#else /* DBUG_OFF */ + +#define TOC_RUNCHECK(name, node) \ + if (global.check_frequency >= 3) { \ + DBUG_PRINT ("Cycle iteration %d: running post-" name " check", i); \ + node = PHrunConsistencyChecks (node); \ + } + +#define TOC_RUNCHECK_TAG(tag, name, node) \ + if (global.check_frequency >= 3) { \ + DBUG_PRINT_TAG (tag, "Cycle iteration %d: running post-" name " check", i); \ + node = PHrunConsistencyChecks (node); \ + } + +#endif /* DBUG_OFF */ + +/** + * @brief Perform one call of the given optimisation/traversal function + * + * @param name Some string name to use for printouts + * @param cond Some condition to control when the traversal should run + * @param label Name of the counter to use + * @param stmt Some value(s) to set the counter + * @param node The node to pass to the function + * @param fun The function to be called + */ +#define TOC_RUNOPT(name, cond, label, stmt, node, fun) \ + if (cond) { \ + DBUG_PRINT ("Cycle iteration %d: running " name, i); \ + toc_store[label] = stmt; \ + node = fun (node); \ + TOC_RUNCHECK (name, node) \ + } + +/** + * @brief Perform one call of the given optimisation/traversal function using + * a specified TAG for printing + * + * @param tag Some string label for printouts + * @param name Some string name to use for printouts + * @param cond Some condition to control when the traversal should run + * @param label Name of the counter to use + * @param stmt Some value(s) to set the counter + * @param node The node to pass to the function + * @param fun The function to be called + */ +#define 
TOC_RUNOPT_TAG(tag, name, cond, label, stmt, node, fun) \ + if (cond) { \ + DBUG_PRINT_TAG (tag, "Cycle iteration %d: running " name, i); \ + toc_store[label] = stmt; \ + node = fun (node); \ + TOC_RUNCHECK_TAG (tag, name, node) \ + } + +#endif /* _TREE_TRAVERSE_OPT_COUNTER_H_ */ diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 4a25ece10..129419418 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -78,6 +78,7 @@ ADD_FUNC_TEST (string-tests string-tests.cpp) ADD_FUNC_TEST (test-assoc-law test-assoc-law.cpp) ADD_FUNC_TEST (test-icm-compilation test-icm-compilation.cpp) ADD_FUNC_TEST (test-macros test-macros.cpp) +ADD_FUNC_TEST (test-traverse-optcounter test-traverse-optcounter.cpp) # libsac + runtime tests # XXX (hans) we can only create one test suite, *not* per-target, due to name-clashes diff --git a/src/tests/test-traverse-optcounter.cpp b/src/tests/test-traverse-optcounter.cpp new file mode 100644 index 000000000..54f02ba5a --- /dev/null +++ b/src/tests/test-traverse-optcounter.cpp @@ -0,0 +1,82 @@ +#include "gtest/gtest.h" +#include "config.h" + +/* we safely ignore these */ +#define DBUG_PRINT(smt, ...) +#define DBUG_PRINT_TAG(tag, smt, ...) 
+#define DBUG_OFF /* to not call phase.c functions */ + +extern "C" { +#include "traverse_optcounter.h" +} + +static int counter = 0; + +static int +testFunction (int input) +{ + counter++; + return input; +} + +TEST (MACRO_OPTCOUNTER, Setup) +{ + TOC_SETUP (2, COUNT_ONE, COUNT_TWO) + + ASSERT_TRUE (toc_optcount_size == 3); + ASSERT_TRUE (toc_store[TOC_IGNORE] == 0); + ASSERT_TRUE (toc_store[COUNT_ONE] == 0); + ASSERT_TRUE (toc_store_old[COUNT_ONE] == 0); +} + +TEST (MACRO_OPTCOUNTER, SetAndGetCounter) +{ + TOC_SETUP (1, COUNT_ONE) + + ASSERT_TRUE (TOC_GETCOUNTER (COUNT_ONE) == 0); + TOC_SETCOUNTER (COUNT_ONE, 2) + ASSERT_TRUE (TOC_GETCOUNTER (COUNT_ONE) == 2); + ASSERT_TRUE (toc_store_old[COUNT_ONE] == 2); + + TOC_RESETCOUNTERS () + ASSERT_TRUE (TOC_GETCOUNTER (COUNT_ONE) == 0); + ASSERT_TRUE (toc_store_old[COUNT_ONE] == 0); +} + +TEST (MACRO_OPTCOUNTER, CompareCounters) +{ + bool test = false; + TOC_SETUP (3, COUNT_ONE, COUNT_TWO, COUNT_THREE) + + TOC_COMPARE (test) + + ASSERT_TRUE (test); + + toc_store[COUNT_TWO] = 10; + + TOC_COMPARE (test) + + ASSERT_FALSE (test); +} + +TEST (MACRO_OPTCOUNTER, RunOpt) +{ + int t = 4; + TOC_SETUP (2, COUNT_ONE, COUNT_TWO) + + TOC_RUNOPT ("Blah", true, COUNT_ONE, counter, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 0); + ASSERT_TRUE (toc_store[COUNT_TWO] == 0); + TOC_RUNOPT ("Blah", true, COUNT_ONE, counter, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 1); + ASSERT_TRUE (toc_store[COUNT_TWO] == 0); + TOC_RUNOPT ("Blah", true, TOC_IGNORE, 0, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 1); + ASSERT_TRUE (toc_store[COUNT_TWO] == 0); + TOC_RUNOPT ("Blah", false, COUNT_TWO, counter, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 1); + ASSERT_TRUE (toc_store[COUNT_TWO] == 0); + TOC_RUNOPT ("Blah", true, COUNT_TWO, counter, t, testFunction) + ASSERT_TRUE (toc_store[COUNT_ONE] == 1); + ASSERT_TRUE (toc_store[COUNT_TWO] == 3); +} -- GitLab From b7d31658b7054dd7d398815f51386f47d65f1fb1 Mon Sep 
17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Thu, 18 Apr 2019 20:12:51 +0100 Subject: [PATCH 13/17] Fixed comments as per MR --- src/libsac2c/cuda/insert_withloop_memtran.c | 47 ++++++++++----------- src/libsac2c/cuda/minimize_emr_transfers.c | 20 ++++----- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/src/libsac2c/cuda/insert_withloop_memtran.c b/src/libsac2c/cuda/insert_withloop_memtran.c index efc932ee0..8bf8445f7 100644 --- a/src/libsac2c/cuda/insert_withloop_memtran.c +++ b/src/libsac2c/cuda/insert_withloop_memtran.c @@ -73,28 +73,28 @@ * @{ */ struct INFO { - node *fundef; /**< N_fundef node of the enclosing function */ - bool in_cudawl; /**< Flag indicating whether the code currently being traversed is in - a cudarizable N_with */ - bool create_d2h; /**< Flag indicating whether needs to be created for - the N_let->N_ids */ - node *postassigns; /**< Chain of that needs to be appended at the end of - the current N_assign */ - node *preassigns; /**< Chain of that needs to be prepended at the - beginning of the current N_assign */ - lut_t *lut; /**< Lookup table storing pairs of Avis(host)->Avis(device) e.g. Given - a_dev = host2device( a_host), Avis(a_host)->Avis(a_dev) will be stored - into the table */ - lut_t *notran; /**< Lookup table storing N_avis of arrays varaibles that no data - transfers should be created. 
*/ - node *let_expr; /**< Holds the current N_let expressions, used to check if the RHS is - a with-loop */ - node *let_ids; /**< Holds the current N_let N_ids chain */ - bool in_cexprs; /**< Flag indicating where are in N_code cexprs */ - bool from_ap; /**< Flag indicating where are coming from a N_ap */ - node *apids; /**< Holds LHS of current N_ap */ - node *topblock; /**< Holds the N_block (body) of the current N_fundef */ - nlut_t *at_nlut; /**< Used to count the number of references of N_avis */ + node *fundef; /**< N_fundef node of the enclosing function */ + bool in_cudawl; /**< Flag indicating whether the code currently being traversed is in + a cudarizable N_with */ + bool create_d2h; /**< Flag indicating whether <device2host> needs to be created for + the N_let->N_ids */ + node *postassigns; /**< Chain of <device2host> that needs to be appended at the end of + the current N_assign */ + node *preassigns; /**< Chain of <host2device> that needs to be prepended at the + beginning of the current N_assign */ + lut_t *lut; /**< Lookup table storing pairs of Avis(host)->Avis(device) e.g. Given + a_dev = host2device( a_host), Avis(a_host)->Avis(a_dev) will be stored + into the table */ + lut_t *notran; /**< Lookup table storing N_avis of array variables that no data + transfers should be created. 
*/ + node *let_expr; /**< Holds the current N_let expressions, used to check if the RHS is + a with-loop */ + node *let_ids; /**< Holds the current N_let N_ids chain */ + bool in_cexprs; /**< Flag indicating where are in N_code cexprs */ + bool from_ap; /**< Flag indicating where are coming from a N_ap */ + node *apids; /**< Holds LHS of current N_ap */ + node *topblock; /**< Holds the N_block (body) of the current N_fundef */ + nlut_t *at_nlut; /**< Used to count the number of references of N_avis */ }; #define INFO_FUNDEF(n) (n->fundef) @@ -270,8 +270,7 @@ TypeConvert (ntype *host_type, nodetype nty, info *arg_info) if (nty == N_id) { dev_type = CUconvertHostToDeviceType (host_type); } - /** - * If the node to be type converted is N_ids, its original type + /* If the node to be type converted is N_ids, its original type * can be AUD as well as long as the N_with on the RHS is cudarizable. * The reason a cudarizbale can produce a AUD result illustrated by * the following example: diff --git a/src/libsac2c/cuda/minimize_emr_transfers.c b/src/libsac2c/cuda/minimize_emr_transfers.c index 0a821d9c2..fadb4d010 100644 --- a/src/libsac2c/cuda/minimize_emr_transfers.c +++ b/src/libsac2c/cuda/minimize_emr_transfers.c @@ -84,17 +84,17 @@ enum trav_mode { bypass, inap, afterap }; * @{ */ struct INFO { - int funargnum; /**< used to assign ordinal values to fundef args */ - bool inemrloop; /**< flag indicating we are in a EMRL affected loop */ + int funargnum; /**< used to assign ordinal values to fundef args */ + bool inemrloop; /**< flag indicating we are in a EMRL affected loop */ enum trav_mode apmode; /**< specifies which mode we are for the N_ap traversal */ - node *fundef; /**< Holds current N_fundef */ - lut_t *lut; /**< LUT is used for storing EMRL lifted h2d RHS -> LHS mappings */ - lut_t *reclut; /**< LUT is used to store all h2d RHS -> LHS mappings */ - node *letids; /**< The the LHS of N_prf */ - node *apargs; /**< N_ap arguments */ - node *apvardecs; /**< Used to 
update vardecs in N_ap calling context */ - node *apassigns; /**< Used to update assigns in N_ap calling context */ - node *rec_ap; /**< the recursive loopfun N_ap */ + node *fundef; /**< Holds current N_fundef */ + lut_t *lut; /**< LUT is used for storing EMRL lifted h2d RHS -> LHS mappings */ + lut_t *reclut; /**< LUT is used to store all h2d RHS -> LHS mappings */ + node *letids; /**< The the LHS of N_prf */ + node *apargs; /**< N_ap arguments */ + node *apvardecs; /**< Used to update vardecs in N_ap calling context */ + node *apassigns; /**< Used to update assigns in N_ap calling context */ + node *rec_ap; /**< the recursive loopfun N_ap */ }; #define INFO_FUNDEF(n) ((n)->fundef) -- GitLab From ade710c9f16ad7bfd00f8304b2dd49ea90ceabdd Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Thu, 18 Apr 2019 20:25:55 +0100 Subject: [PATCH 14/17] Change to using CTIerror instead of ASSERT --- src/libsac2c/cuda/cuda_utils.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libsac2c/cuda/cuda_utils.c b/src/libsac2c/cuda/cuda_utils.c index 33d066737..89511ffa7 100644 --- a/src/libsac2c/cuda/cuda_utils.c +++ b/src/libsac2c/cuda/cuda_utils.c @@ -295,7 +295,8 @@ CUconvertHostToDeviceType (ntype *host_type) DBUG_ENTER (); /* If the host_type is of known dimension */ - DBUG_ASSERT (TUdimKnown (host_type), "AUD type found!"); + if (!TUdimKnown (host_type)) + CTIerrorInternal ("AUD type found!"); /* If the scalar type is simple, e.g. int, float ... 
*/ if (TYgetDim (host_type) > 0 -- GitLab From b6931c587a9f07343c4c2ef4690d860decb3aa21 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Fri, 19 Apr 2019 14:20:54 +0100 Subject: [PATCH 15/17] Fix missing includes --- src/libsac2c/cuda/minimize_block_transfers2.c | 2 +- src/libsac2c/cuda/minimize_emr_transfers.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libsac2c/cuda/minimize_block_transfers2.c b/src/libsac2c/cuda/minimize_block_transfers2.c index d3e5bc579..b997a8a38 100644 --- a/src/libsac2c/cuda/minimize_block_transfers2.c +++ b/src/libsac2c/cuda/minimize_block_transfers2.c @@ -38,10 +38,10 @@ */ #include "minimize_block_transfers2.h" -#include #include "new_types.h" #include "tree_compound.h" #include "free.h" +#include "globals.h" #include "traverse.h" #include "tree_basic.h" #include "LookUpTable.h" diff --git a/src/libsac2c/cuda/minimize_emr_transfers.c b/src/libsac2c/cuda/minimize_emr_transfers.c index fadb4d010..2b64157e5 100644 --- a/src/libsac2c/cuda/minimize_emr_transfers.c +++ b/src/libsac2c/cuda/minimize_emr_transfers.c @@ -70,6 +70,7 @@ #include "tree_basic.h" #include "tree_compound.h" #include "memory.h" +#include "globals.h" #include "free.h" #include "cuda_utils.h" -- GitLab From fd1eb0078500a069431de2fcba5d1ec6c5d752b7 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Fri, 19 Apr 2019 15:11:11 +0100 Subject: [PATCH 16/17] fix spurious CMake build problem This is a known issue with running a CMake built Makefile system that eventually calls an External_Project. The various _steps_ in the External_Project are **not** considered individual jobs with interdependencies. This can therefore cause multiple instances of one _step_ to be run, leading to undefined behavior. The issue is documented in [1] and affects all versions of CMake > 3.10. The workaround for this is to explicitly set dependencies on the _steps_ and remove DEPENDS from the External_Project. 
[1]: https://gitlab.kitware.com/cmake/cmake/issues/18663 --- CMakeLists.txt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3338b5b81..c6067abb3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ IF (ENABLE_ISL AND ENABLE_BARVINOK) PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -Wno-conversion" ) ADD_EXECUTABLE (sacislinterface src/tools/sacislinterface/sacislinterface.c) - ADD_DEPENDENCIES (sac2cShared check_repo_version) + ADD_DEPENDENCIES (sacislinterface check_repo_version) TARGET_LINK_LIBRARIES (sacislinterface ${LIB_ISL} ${LIB_BARVINOK} ${BARVINOK_LIB}) TARGET_INCLUDE_DIRECTORIES (sacislinterface PUBLIC ${BARVINOK_INC_PATH} ${ISL_INC_PATH}) ENDIF () @@ -115,7 +115,6 @@ STRING (REPLACE ";" ":" _TARGETS "${RT_TARGETS}") # This is where we call the build of the sac2c shared-libraries # This *depends* on sac2c having been build first! ExternalProject_Add(runtime_libraries - DEPENDS sac2cShared sac2c check_repo_version sac_h DOWNLOAD_COMMAND "" # this is to prevent any download target from being called INSTALL_COMMAND "" # this is to prevent any install target from being called PREFIX runtime_build @@ -133,7 +132,7 @@ ExternalProject_Add(runtime_libraries ) # We set dependencies on the configure step, this makes sure we propogate # certain values (such as from sac2crc). -ExternalProject_Add_StepDependencies(runtime_libraries configure sac2c sac2cShared check_repo_version) +ExternalProject_Add_StepDependencies(runtime_libraries configure sac2c sac2cShared sac_h check_repo_version) # Get runtime library build directory ExternalProject_Get_Property (runtime_libraries BINARY_DIR) SET (RUNTIME_BINARY_DIR ${BINARY_DIR}) # redefine name to something more useful @@ -159,7 +158,12 @@ ADD_DEPENDENCIES (fullclean runtime_libraries-clean) # directly as part of e.g. target_link_libraries functions. As such we instead # force add target properties which we can access explicitly. 
# Expose configure step as target -ExternalProject_Add_StepTargets (runtime_libraries configure) +# XXX (hans) additionally we explicitly set inter-step dependencies as this +# facility is broken in cmake >= version 3.10, +# - see https://gitlab.kitware.com/cmake/cmake/issues/18663 +ExternalProject_Add_StepTargets (runtime_libraries configure build install) +ExternalProject_Add_StepDependencies (runtime_libraries install runtime_libraries-build) +ExternalProject_Add_StepDependencies (runtime_libraries build runtime_libraries-configure) FOREACH (__target ${RT_TARGETS}) ADD_CUSTOM_TARGET (libsac-${__target} COMMAND +${CMAKE_COMMAND} --build ${RUNTIME_BINARY_DIR} --target libsac-${__target} @@ -202,7 +206,7 @@ ENDMACRO () ADD_INSTALL_TARGET ("applications" sac2c sac4c sac2tex "${PROJECT_BINARY_DIR}/saccc") ADD_INSTALL_TARGET ("config" "${SAC2CRC_BUILD_CONF}") ADD_INSTALL_TARGET ("headers" sac_h) -ADD_INSTALL_TARGET ("libraries" sac2cShared runtime_libraries) +ADD_INSTALL_TARGET ("libraries" runtime_libraries) ADD_INSTALL_TARGET ("rtapplications" runtime_libraries) ADD_INSTALL_TARGET ("sources") ADD_INSTALL_TARGET ("symlinks") -- GitLab From cbf195535160218f39c78a192c75da003b03ce83 Mon Sep 17 00:00:00 2001 From: Hans-Nikolai Viessmann Date: Sat, 20 Apr 2019 14:20:01 +0100 Subject: [PATCH 17/17] update MEMRT and EMRL comments --- src/libsac2c/cuda/minimize_emr_transfers.c | 11 +++++++++-- src/libsac2c/memory/emr_loop_optimisation.c | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/libsac2c/cuda/minimize_emr_transfers.c b/src/libsac2c/cuda/minimize_emr_transfers.c index 2b64157e5..690050c14 100644 --- a/src/libsac2c/cuda/minimize_emr_transfers.c +++ b/src/libsac2c/cuda/minimize_emr_transfers.c @@ -11,8 +11,15 @@ * are not able to lift out the h2d. * * The latter point is especially important, as traversals like MLTRAN only lift out h2d/d2h - * if there are no further references to the RHS of h2d/d2h. 
When using EMRL, this check fails - * because of an extra argument in recursive loopfun application used for the buffer-swapping. + * if there are no further references to the RHS of h2d/d2h. When using EMRL, this check can + * fail because of an extra argument in the recursive loopfun application used for the + * buffer-swapping. + * + * A point to consider: MEMRT only works on loopfuns marked with the ISEMRLIFTED flag + * (on the N_fundef). If we are dealing with a series of nested loops, it will only move + * the h2d one level up. This is also limited by the EMRL traversal + * (@see memory/emr_loop_optimisation.c) which is conservative in how many levels it will + * lift out allocations. Typically it lifts out allocations from only the innermost loop. * * To give a concrete example, we have: * diff --git a/src/libsac2c/memory/emr_loop_optimisation.c b/src/libsac2c/memory/emr_loop_optimisation.c index c03640ae4..046f666d0 100644 --- a/src/libsac2c/memory/emr_loop_optimisation.c +++ b/src/libsac2c/memory/emr_loop_optimisation.c @@ -503,7 +503,7 @@ EMRLfundef (node * arg_node, info * arg_info) = TCappendArgs (FUNDEF_ARGS (arg_node), INFO_ARGS (arg_info)); INFO_ARGS (arg_info) = NULL; - /* mark fundef as having been touched by EMRL - this used later in EMRTU */ + /* mark fundef as having been touched by EMRL - this is used later in CUDA MEMRT */ FUNDEF_ISEMRLIFTED (arg_node) = TRUE; } -- GitLab