eff.c:1761处loop vect 分析
2.6 带有mask的向量数学函数
gcc 支持的svml向量数学函数
GCC currently emits calls to @code{vmldExp2}, @code{vmldLn2}, @code{vmldLog102}, @code{vmldPow2}, @code{vmldTanh2}, @code{vmldTan2}, @code{vmldAtan2}, @code{vmldAtanh2}, @code{vmldCbrt2}, @code{vmldSinh2}, @code{vmldSin2}, @code{vmldAsinh2}, @code{vmldAsin2}, @code{vmldCosh2}, @code{vmldCos2}, @code{vmldAcosh2}, @code{vmldAcos2}, @code{vmlsExp4}, @code{vmlsLn4}, @code{vmlsLog104}, @code{vmlsPow4}, @code{vmlsTanh4}, @code{vmlsTan4}, @code{vmlsAtan4}, @code{vmlsAtanh4}, @code{vmlsCbrt4}, @code{vmlsSinh4}, @code{vmlsSin4}, @code{vmlsAsinh4}, @code{vmlsAsin4}, @code{vmlsCosh4}, @code{vmlsCos4}, @code{vmlsAcosh4} and @code{vmlsAcos4} for corresponding function type when @option{-mveclibabi=svml} is used |
oneapi的IR:
%3970 = call fast cc104 <4 x double> @__svml_log4_mask(<4 x double> %3968, <4 x i64> %3969)
gcc的IR : _799 = _ZGVdN4v_logD.6143 (_800);
<__svml_log4_mask_e9>汇编代码的函数原名。
从如何调用不带mask的svml向量数学函数的流程出发,找出调用带有mask的方法。
设计方案:
vect__ifc__1252.1526_717 = VEC_COND_EXPR <mask__1460.1449_910, vect__1761.1465_870, { 0.0, 0.0 }>; 找到一个VEC_COND_EXPR,在同一个基本块中,根据第二个或者第三个参数所涉及到的运算(建立一个栈暂存每次找到的结果),顺着运算的关系一步步往上找,直到找到了需要进行mask的数学函数。如果在第二个参数中找到,VEC_COND_EXPR中的第一个参数mask就是数学函数需要进行mask的值。如果在第三个参数的关系链中找到,其所需的mask就是VEC_COND_EXPR中的mask的取反。将数学函数和mask一起生成带有mask的数学函数的IR,替换掉原来的不带mask的。(在生成cond_expr之后做还是在loop vect pass之后另外新建一个pass做。)
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "tree-pass.h"
#include "ssa.h"
#include "cgraph.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimple-walk.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-cfg.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "tree-ssa-propagate.h"
#include "dbgcnt.h"
#include "tree-scalar-evolution.h"
#include "stringpool.h"
#include "attribs.h"
#include "gimple-pretty-print.h"
#include "opt-problem.h"
#include "internal-fn.h"
#include "tree-ssa-sccvn.h"
#include "gimple-expr.h"
#include <cstdio>

namespace
{

const pass_data pass_data_test = {
  GIMPLE_PASS, /* type */
  "mask_vecmath_func", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_VECT_MASK_VECMATH_FUNC, /* tv_id */
  (PROP_cfg | PROP_ssa), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

/* Prototype pass: after loop vectorization, find VEC_COND_EXPRs whose
   selected value is (transitively) produced by a vectorized SVML math
   call, and rewrite that call into the masked SVML entry point, passing
   the VEC_COND_EXPR condition as an extra mask argument.  */
class pass_mask_vecmath_func : public gimple_opt_pass
{
public:
  pass_mask_vecmath_func (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_test, ctxt) {}

  /* Run only when the dedicated flag is enabled.  */
  virtual bool
  gate (function *)
  {
    return flag_tree_mask_vecmath_func;
  }

  virtual unsigned int execute (function *);
};

/* Walk the SSA use-def chain upwards starting from OPERAND (whose
   defining statement is STMT) and return the first call statement to
   the vector math function we want to mask, or NULL if none is
   reachable through GIMPLE assignments.

   Only gimple assigns are followed, so PHI-carried SSA cycles cannot
   be entered and the recursion terminates.

   NOTE(review): the target name is hard-coded for this prototype.  It
   currently matches the *masked* SVML symbol; to convert unmasked calls
   it presumably should match the unmasked name (e.g. "vmldLn2", see the
   commented-out check) — confirm against the intended pipeline.  */
static gimple *
find_relate_operand (tree operand, gimple *stmt)
{
  if (!stmt)
    return NULL;

  if (TREE_CODE (operand) == SSA_NAME && is_gimple_call (stmt))
    {
      /* Is STMT a call to the function we are looking for?  */
      tree fndecl = gimple_call_fndecl (stmt);
      if (fndecl && DECL_P (fndecl))
	{
	  const char *func_name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
	  /* if (strcmp (func_name, "vmldLn2") == 0)  */
	  if (strcmp (func_name, "__svml_log4_mask_e9") == 0)
	    return stmt;
	}
    }

  if (TREE_CODE (operand) == SSA_NAME && is_gimple_assign (stmt))
    {
      /* Recurse into every SSA operand on the right-hand side
	 (operand 0 is the lhs).  */
      for (unsigned i = 1; i < gimple_num_ops (stmt); ++i)
	{
	  tree op = gimple_op (stmt, i);
	  if (TREE_CODE (op) == SSA_NAME)
	    {
	      gimple *def = SSA_NAME_DEF_STMT (op);
	      gimple *result = find_relate_operand (op, def);
	      if (result)
		return result;
	    }
	}
    }
  return NULL;
}

/* Replace the call STMT with a call to the masked SVML entry point,
   appending NEW_ARG (the vector mask) to the original argument list.
   The lhs of the original call is preserved.  */
static void
add_mask_to_call (gimple *stmt, tree new_arg)
{
  if (!is_gimple_call (stmt))
    return;

  tree call_fn = gimple_call_fndecl (stmt);

  /* Build an external FUNCTION_DECL for the masked variant.
     TODO(review): the decl reuses the unmasked function's type, so the
     extra mask parameter is not reflected in the type — build a proper
     function type if later passes start type-checking the call.  */
  tree new_func_id = get_identifier ("__svml_log4_mask_e9");
  tree fntype = TREE_TYPE (call_fn);
  tree new_fndecl
    = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, new_func_id, fntype);
  TREE_PUBLIC (new_fndecl) = 1;
  DECL_EXTERNAL (new_fndecl) = 1;
  DECL_IS_NOVOPS (new_fndecl) = 1;
  TREE_READONLY (new_fndecl) = 1;

  /* Copy the original arguments and append the mask.  */
  int num_args = gimple_call_num_args (stmt);
  vec<tree> vargs = vNULL;
  vargs.create (num_args + 1);
  for (int i = 0; i < num_args; i++)
    vargs.safe_push (gimple_call_arg (stmt, i));
  vargs.safe_push (new_arg);

  /* Build the replacement call, keep the original lhs, and swap it in
     place of STMT.  */
  gimple *new_call = gimple_build_call_vec (new_fndecl, vargs);
  gimple_call_set_lhs (new_call, gimple_call_lhs (stmt));

  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
  gsi_replace (&gsi, new_call, true);

  vargs.release ();
}

/* Scan every basic block for VEC_COND_EXPRs whose "true" operand is
   (transitively) defined by the target vector math call, and rewrite
   that call into its masked form using the VEC_COND_EXPR condition.

   TODO(review): the design notes also call for searching the "false"
   operand and using the inverted mask; only the true operand is
   handled here.  */
unsigned
pass_mask_vecmath_func::execute (function *fun)
{
  basic_block bb;

  FOR_EACH_BB_FN (bb, fun)
    {
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gassign *stmt_assign = dyn_cast <gassign *> (gsi_stmt (gsi));
	  if (!stmt_assign)
	    continue;
	  if (gimple_assign_rhs_code (stmt_assign) != VEC_COND_EXPR)
	    continue;

	  /* For a VEC_COND_EXPR, rhs1 is the mask and rhs2 the value
	     selected where the mask is true.  */
	  tree mask_operand = gimple_assign_rhs1 (stmt_assign);
	  tree true_vector_operand = gimple_assign_rhs2 (stmt_assign);
	  if (TREE_CODE (true_vector_operand) != SSA_NAME)
	    continue;

	  gimple *stmt_def = SSA_NAME_DEF_STMT (true_vector_operand);
	  gimple *stmt_vecmath
	    = find_relate_operand (true_vector_operand, stmt_def);
	  if (stmt_vecmath)
	    add_mask_to_call (stmt_vecmath, mask_operand);
	}
    }
  return 0;
}

} /* anonymous namespace */

gimple_opt_pass *
make_pass_mask_vecmath_func (gcc::context *ctxt)
{
  return new pass_mask_vecmath_func (ctxt);
}
生成了正确的IR之后,使用builtin的方式调用svml中的带有mask的数学函数。
gcc调用svml函数在gimple阶段的过程:
1:examining statement:
vect_analyze_stmt函数中检查stmt, 在vectorizable_xxx函数里面判断操作数的类型。vect_is_simple_use: 计算向量化的cost, vect_model_simple_cost,先不进行transform。
调用svml需要使用target-specific built-in function,使用此函数targetm.vectorize.builtin_vectorized_function,根据优化选项(config/i386/i386-options.cc:2567)定位到(ix86_veclib_handler = &ix86_veclibabi_svml)后端ix86_veclibabi_svml函数处,返回向量svml函数fndecl。
2:vectorizing statement:
vect_transform_loop_stmt函数中,进行transform,同样也会调用vectorizable_xxx函数进行此转化。gimple_build_call_vec (fndecl, vargs):根据获取到的fndecl以及对参数的向量化,构建一个新的gimple vec call。
loop vec pass的调用栈
vect_analyze_loop_2:
Apply a set of analyses on LOOP, and create a loop_vec_info struct for it. The different analyses will record information in the loop_vec_info struct
loop_vec_info 里面放的是对loop 分析完成后的整个loop的信息
vect_analyze_loop_operations:
Scan the loop stmts and make sure they are all vectorizable.
vect_analyze_stmt:
Make sure the statement is vectorizable.
ziyuan 2.3 和 2.4修改对于其他课题的影响 aggressive_if_conv && use_gather_2parts result.xlsx 采用HygonGCC 1.3.2编译器最新版本 和最新配置文件Hygon7490-2p-HygonGCC1.3.2.202403-hgalloc-znver1-base.cfg
跑1copy的时候整个node最好不要跑其他程序,不然性能数据会波动较大。会抢占node的内存等资源。
可能优化的方向:
- gcc调用svml向量数学库的接口函数只能支持128bit的输入。修改接口调用256bit的输入。
- -mtune-ctrl=^avx256_split_regs,^avx128_optimal,256_unaligned_store_optimal可以使程序使用256bit的ymm寄存器,提高循环向量化的vf,对性能有提高(2069: 4%,1761: 8%)。
- oneapi使用将条件和条件里面的计算分别放在不同的bb块中,通过控制流来选择需要执行哪些分支,可以减少冗余运算。Gcc向量化只能在同一个bb块中进行,无法控制每个分支,只支持在log函数上进行mask操作,和最终运算的结果上进行选择,其他操作- + *等只能在支持avx512的机器上。只能想办法在gcc上也进行将不同分支分为不同bb块的操作,模仿oneapi。
- gcc上的vf是8,使用两次log4,oneapi的vf为4,使用一次log4,通过将i32扩展为i64,使用256bit ymm,尝试将gcc变为vf4使用一次log4,使用相似的方法,未能成功。并且怀疑3才是性能的主要点,此操作应该不是性能的主要点。
5. gcc循环向量化无法处理跨bb的问题,如果向量化后拆分成不同bb,后续的pass可能无法处理会对拆分的bb做一些未知的操作,不建议使用此方法,可以在原有的bb里面插入一些 根据mask进行选择的指令,来模拟分支选择的操作。
void calc(double *src1,double *src2,double *src3) 5 { 6 int i; 7 for(i=0;i<100;i++) 8 { 9 if(src3[i] > 10.0) 10 { 11 src1[i] = exp(src2[i]); 12 } 13 else if(src3[i] > 5.0) 14 { 15 src1[i] = log(src2[i]); 16 } 17 else if(src3[i] > 2.5) 18 { 19 src1[i] = sin(src2[i]); 20 } 21 } 22 } |
对于有mask store的操作,会将if-conversion操作进行回退。optimize_mask_stores
1:新建一个对mask进行判断是否全为0的GIMPLE_COND。
2:新建一个then bb块,并且维护其边。
3:在mask store后分割一个新的bb,并且把stmt全部移到bb里面,新建一个边。
create_basic_block_1 (void *head, void *end, basic_block after):
int vf为4,double vf 为2.
test_mask_vecmath.c:13:18: note: === vect_determine_vectorization_factor === 681 test_mask_vecmath.c:13:18: note: ==> examining phi: i_114 = PHI <i_85(20), 0(35)> 682 test_mask_vecmath.c:13:18: note: ==> examining phi: sumi1_115 = PHI <_136(20), 0.0(35)> 683 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 684 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 685 test_mask_vecmath.c:13:18: note: nunits = 2 686 test_mask_vecmath.c:13:18: note: ==> examining phi: sumi2_117 = PHI <_138(20), 0.0(35)> 687 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 688 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 689 test_mask_vecmath.c:13:18: note: nunits = 2 690 test_mask_vecmath.c:13:18: note: ==> examining phi: sumi3_119 = PHI <_140(20), 0.0(35)> 691 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 692 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 693 test_mask_vecmath.c:13:18: note: nunits = 2 694 test_mask_vecmath.c:13:18: note: ==> examining phi: ivtmp_106 = PHI <ivtmp_101(20), 100(35)> 695 test_mask_vecmath.c:13:18: note: ==> examining statement: _62 = (long unsigned int) i_114; 696 test_mask_vecmath.c:13:18: note: skip. 697 test_mask_vecmath.c:13:18: note: ==> examining statement: _63 = _62 * 4; 698 test_mask_vecmath.c:13:18: note: skip. 699 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_151 = i_114 w* 4; 700 test_mask_vecmath.c:13:18: note: skip. 701 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_152 = (long unsigned int) patt_151; 702 test_mask_vecmath.c:13:18: note: skip. 703 test_mask_vecmath.c:13:18: note: ==> examining statement: _64 = &src3 + _63; 704 test_mask_vecmath.c:13:18: note: skip. 
705 test_mask_vecmath.c:13:18: note: ==> examining statement: j_65 = *_64; 706 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(4) int 707 test_mask_vecmath.c:13:18: note: nunits = 4 708 test_mask_vecmath.c:13:18: note: ==> examining statement: _66 = (long unsigned int) j_65; 709 test_mask_vecmath.c:13:18: note: skip. 710 test_mask_vecmath.c:13:18: note: ==> examining statement: _67 = _66 * 8; 711 test_mask_vecmath.c:13:18: note: skip. 712 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_153 = j_65 w* 8; 713 test_mask_vecmath.c:13:18: note: skip. 714 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_154 = (long unsigned int) patt_153; 715 test_mask_vecmath.c:13:18: note: skip. 716 test_mask_vecmath.c:13:18: note: ==> examining statement: _142 = _141 + _67; 717 test_mask_vecmath.c:13:18: note: skip. test_mask_vecmath.c:13:18: note: ==> examining statement: _68 = (double *) _142; 719 test_mask_vecmath.c:13:18: note: skip. 720 test_mask_vecmath.c:13:18: note: ==> examining statement: _143 = j_65 > 10; 721 test_mask_vecmath.c:13:18: note: vectype: vector(4) <signed-boolean:32> 722 test_mask_vecmath.c:13:18: note: nunits = 4 723 test_mask_vecmath.c:13:18: note: ==> examining statement: _69 = .MASK_LOAD (_68, 64B, _143); 724 test_mask_vecmath.c:13:18: note: skip. 
725 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_155 = (<signed-boolean:64>) _143; 726 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 727 test_mask_vecmath.c:13:18: note: nunits = 2 728 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_156 = .MASK_LOAD (_68, 64B, patt_155); 729 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 730 test_mask_vecmath.c:13:18: note: nunits = 2 731 test_mask_vecmath.c:13:18: note: ==> examining statement: _70 = log (_69); 732 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 733 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 734 test_mask_vecmath.c:13:18: note: nunits = 2 735 test_mask_vecmath.c:13:18: note: ==> examining statement: _89 = (unsigned int) j_65; 736 test_mask_vecmath.c:13:18: note: get vectype for scalar type: unsigned int 737 test_mask_vecmath.c:13:18: note: vectype: vector(4) unsigned int 738 test_mask_vecmath.c:13:18: note: nunits = 4 739 test_mask_vecmath.c:13:18: note: ==> examining statement: _87 = _89 + 4294967288; 740 test_mask_vecmath.c:13:18: note: get vectype for scalar type: unsigned int 741 test_mask_vecmath.c:13:18: note: vectype: vector(4) unsigned int 742 test_mask_vecmath.c:13:18: note: nunits = 4 743 test_mask_vecmath.c:13:18: note: ==> examining statement: _73 = _62 * 8; 744 test_mask_vecmath.c:13:18: note: skip. 745 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_157 = i_114 w* 8; 746 test_mask_vecmath.c:13:18: note: skip. 747 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_158 = (long unsigned int) patt_157; 748 test_mask_vecmath.c:13:18: note: skip. 749 test_mask_vecmath.c:13:18: note: ==> examining statement: _145 = _73 + _141; 750 test_mask_vecmath.c:13:18: note: skip. 751 test_mask_vecmath.c:13:18: note: ==> examining statement: _74 = (double *) _145; 752 test_mask_vecmath.c:13:18: note: skip. 
753 test_mask_vecmath.c:13:18: note: ==> examining statement: _146 = _87 <= 2; 754 test_mask_vecmath.c:13:18: note: vectype: vector(4) <signed-boolean:32> 755 test_mask_vecmath.c:13:18: note: nunits = 4 756 test_mask_vecmath.c:13:18: note: ==> examining statement: _75 = .MASK_LOAD (_74, 64B, _146); 757 test_mask_vecmath.c:13:18: note: skip. 758 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_159 = (<signed-boolean:64>) _146; 759 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 760 test_mask_vecmath.c:13:18: note: nunits = 2 761 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_160 = .MASK_LOAD (_74, 64B, patt_159); 762 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 763 test_mask_vecmath.c:13:18: note: nunits = 2 764 test_mask_vecmath.c:13:18: note: ==> examining statement: _76 = log (_75); 765 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 766 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 767 test_mask_vecmath.c:13:18: note: nunits = 2 768 test_mask_vecmath.c:13:18: note: ==> examining statement: _148 = _73 + _147; 769 test_mask_vecmath.c:13:18: note: skip. 770 test_mask_vecmath.c:13:18: note: ==> examining statement: _80 = (double *) _148; 771 test_mask_vecmath.c:13:18: note: skip. 772 test_mask_vecmath.c:13:18: note: ==> examining statement: _149 = j_65 == 7; 773 test_mask_vecmath.c:13:18: note: vectype: vector(4) <signed-boolean:32> 774 test_mask_vecmath.c:13:18: note: nunits = 4 775 test_mask_vecmath.c:13:18: note: ==> examining statement: _81 = .MASK_LOAD (_80, 64B, _149); 776 test_mask_vecmath.c:13:18: note: skip. 
777 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_161 = (<signed-boolean:64>) _149; 778 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 779 test_mask_vecmath.c:13:18: note: nunits = 2 780 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_162 = .MASK_LOAD (_80, 64B, patt_161); 781 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 782 test_mask_vecmath.c:13:18: note: nunits = 2 783 test_mask_vecmath.c:13:18: note: ==> examining statement: _82 = log (_81); 784 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 785 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 786 test_mask_vecmath.c:13:18: note: nunits = 2 787 test_mask_vecmath.c:13:18: note: ==> examining statement: _ifc__135 = j_65 > 10 ? _70 : 0.0; 788 test_mask_vecmath.c:13:18: note: skip. 789 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_163 = j_65 > 10; 790 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(4) <signed-boolean:32> 791 test_mask_vecmath.c:13:18: note: nunits = 4 792 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_164 = (<signed-boolean:64>) patt_163; 793 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 794 test_mask_vecmath.c:13:18: note: nunits = 2 795 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_165 = patt_164 ? _70 : 0.0; 796 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 797 test_mask_vecmath.c:13:18: note: nunits = 2 798 test_mask_vecmath.c:13:18: note: ==> examining statement: _136 = sumi1_115 + _ifc__135; 799 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 800 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 801 test_mask_vecmath.c:13:18: note: nunits = 2 802 test_mask_vecmath.c:13:18: note: ==> examining statement: _ifc__137 = _87 <= 2 ? _76 : 0.0; 803 test_mask_vecmath.c:13:18: note: skip. 
804 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_166 = _87 <= 2; 805 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(4) <signed-boolean:32> 806 test_mask_vecmath.c:13:18: note: nunits = 4 807 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_167 = (<signed-boolean:64>) patt_166; 808 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 809 test_mask_vecmath.c:13:18: note: nunits = 2 810 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_168 = patt_167 ? _76 : 0.0; 811 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 812 test_mask_vecmath.c:13:18: note: nunits = 2 813 test_mask_vecmath.c:13:18: note: ==> examining statement: _138 = sumi2_117 + _ifc__137; 814 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 815 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 816 test_mask_vecmath.c:13:18: note: nunits = 2 817 test_mask_vecmath.c:13:18: note: ==> examining statement: _ifc__139 = j_65 == 7 ? _82 : 0.0; 818 test_mask_vecmath.c:13:18: note: skip. 819 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_169 = j_65 == 7; 820 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(4) <signed-boolean:32> 821 test_mask_vecmath.c:13:18: note: nunits = 4 822 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_170 = (<signed-boolean:64>) patt_169; 823 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 824 test_mask_vecmath.c:13:18: note: nunits = 2 825 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_171 = patt_170 ? 
_82 : 0.0; 826 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 827 test_mask_vecmath.c:13:18: note: nunits = 2 828 test_mask_vecmath.c:13:18: note: ==> examining statement: _140 = sumi3_119 + _ifc__139; 829 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 830 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 831 test_mask_vecmath.c:13:18: note: nunits = 2 832 test_mask_vecmath.c:13:18: note: ==> examining statement: i_85 = i_114 + 1; 833 test_mask_vecmath.c:13:18: note: skip. 834 test_mask_vecmath.c:13:18: note: ==> examining statement: ivtmp_101 = ivtmp_106 - 1; 835 test_mask_vecmath.c:13:18: note: skip. 836 test_mask_vecmath.c:13:18: note: ==> examining statement: if (ivtmp_101 != 0) 837 test_mask_vecmath.c:13:18: note: skip. 838 test_mask_vecmath.c:13:18: note: vectorization factor = 4 |
既有int 也有double的loop
#include <stdio.h>
#include <math.h>
#include <stdlib.h>

/* Test kernel for masked vector math-function generation:
   accumulate log() terms into three separate sums, one per branch of a
   mutually-exclusive if / else-if chain keyed on src3[i], then print
   the combined total.  The exclusive branches are what force the
   vectorizer to emit VEC_COND_EXPR masks around each log() call.  */
void calc(double *src1, double *src2, int *src3)
{
    double sumi = 0;
    double sumi1 = 0;       /* branch: src3[i] > 10            */
    double sumi2 = 0;       /* branch: 7 < src3[i] <= 10       */
    double sumi3 = 0;       /* branch: 6 < src3[i] <= 7        */
    double sumi_temp[100];  /* kept from the original; never read */
    int j;

    for (int i = 0; i < 100; i++)
    {
        j = src3[i];
        if (src3[i] > 10)
            sumi1 += log(src2[j]);   /* indirect (gather-style) access */
        else if (src3[i] > 7)
            sumi2 += log(src2[i]);
        else if (src3[i] > 6)
            sumi3 += log(src1[i]);
    }

    sumi = sumi1 + sumi2 + sumi3;
    printf("sumi is %lf\n", sumi);
}

/* Driver: fill the inputs from a fixed seed, run calc(), and print a
   checksum of src1.  The rand() call order (doubles first, two calls
   per iteration, then the int pass) must stay exactly as written so
   the generated sequence matches the original program.  */
int main()
{
    srand(12);

    double src1[100];
    double src2[100];
    int src3[100];

    const double dbl_lo = 5.0;
    const double dbl_hi = 15.0;
    const int int_lo = 5;
    const int int_hi = 15;

    for (int k = 0; k < 100; k++) {
        src1[k] = dbl_lo + 1.0 * rand() / RAND_MAX * (dbl_hi - dbl_lo);
        src2[k] = dbl_lo + 1.0 * rand() / RAND_MAX * (dbl_hi - dbl_lo);
    }
    for (int k = 0; k < 100; k++)
        src3[k] = int_lo + rand() % (int_hi - int_lo);  /* values in [5,14] */

    for (int k = 0; k < 100; k++)
        printf("src1 is %lf ", src1[k]);

    calc(src1, src2, src3);

    double res = 0;
    for (int m = 0; m < 100; m++)
        res += src1[m];
    printf("res is %lf\n", res);
    return 0;
}
bb分块
COUNT:1604735257<bb 78>: # # RANGE [0, 2147483647] NONZERO 2147483647 k_3019 = PHI <k_1827(216), 0(301)> # temp0_1543 = PHI <_1251(216), 0.0(301)> # temp1_2883 = PHI <_1249(216), 0.0(301)> # temp2_224 = PHI <_1247(216), 0.0(301)> # temp3_2699 = PHI <_1245(216), 0.0(301)> # temp4_1545 = PHI <_1243(216), 0.0(301)> # vect_temp0_1543.1410_1003 = PHI <vect__1251.1527_708(216), { 0.0, 0.0, 0.0, 0.0 }(301)> # vect_temp1_2883.1411_1002 = PHI <vect__1249.1530_701(216), { 0.0, 0.0, 0.0, 0.0 }(301)> # vect_temp2_224.1412_1001 = PHI <vect__1247.1533_694(216), { 0.0, 0.0, 0.0, 0.0 }(301)> # vect_temp3_2699.1413_1000 = PHI <vect__1245.1536_687(216), { 0.0, 0.0, 0.0, 0.0 }(301)> # vect_temp4_1545.1414_999 = PHI <vect__1243.1539_670(216), { 0.0, 0.0, 0.0, 0.0 }(301)> # # PT = nonlocal escaped null # ALIGN = 4, MISALIGN = 0 vectp.1415_998 = PHI <vectp.1415_997(216), _1703(301)> # ivtmp_667 = PHI <ivtmp_666(216), 0(301)> # DEBUG temp4D.7772 => NULL # DEBUG temp3D.7771 => NULL # DEBUG temp2D.7770 => NULL # DEBUG temp1D.7769 => NULL # DEBUG temp0D.7768 => NULL # DEBUG kD.7615 => NULL # DEBUG BEGIN_STMT # DEBUG BEGIN_STMT # RANGE [0, 2147483646] NONZERO 2147483647 _1705 = (long unsigned intD.10) k_3019; # RANGE [0, 8589934584] NONZERO 8589934588 _1706 = _1705 * 4; # PT = nonlocal escaped null _1707 = _1703 + _1706; # VUSE <.MEM_2600> vect_j_1708.1417_996 = MEM <vector(8) intD.6> [(INT_TD.3736 *)vectp.1415_998]; # VUSE <.MEM_2600> j_1708 = *_1707; # DEBUG jD.7613 => NULL # DEBUG BEGIN_STMT vect__1709.1418_994 = vect_j_1708.1417_996 * { 3, 3, 3, 3, 3, 3, 3, 3 }; _1709 = j_1708 * 3; # RANGE ~[2147483648, 18446744071562067967] _1710 = (long unsigned intD.10) _1709; # RANGE [0, 18446744073709551608] NONZERO 18446744073709551608 _1711 = _1710 * 8; # PT = nonlocal null _1712 = x_242(D) + _1711; # VUSE <.MEM_2600> # USE = anything vect__1713.1419_991 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, x_242(D), vect__1709.1418_994, { Nan, Nan, Nan, Nan }, 8); vect__1713.1420_990 = 
VEC_PERM_EXPR <vect__1709.1418_994, vect__1709.1418_994, { 4, 5, 6, 7, 4, 5, 6, 7 }>; # VUSE <.MEM_2600> # USE = anything vect__1713.1419_989 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, x_242(D), vect__1713.1420_990, { Nan, Nan, Nan, Nan }, 8); # VUSE <.MEM_2600> _1713 = *_1712; vect_xij_1714.1421_987 = vect_cst__988 - vect__1713.1419_991; vect_xij_1714.1421_986 = vect_cst__988 - vect__1713.1419_989; xij_1714 = xi_1687 - _1713; # DEBUG xijD.7655 => NULL # DEBUG BEGIN_STMT # RANGE ~[2147483649, 18446744071562067968] _1715 = _1710 + 1; # RANGE [0, 18446744073709551608] NONZERO 18446744073709551608 _1716 = _1715 * 8; # PT = nonlocal null _1717 = x_242(D) + _1716; # VUSE <.MEM_2600> # USE = anything vect__1718.1422_980 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _983, vect__1709.1418_994, { Nan, Nan, Nan, Nan }, 8); # VUSE <.MEM_2600> # USE = anything vect__1718.1422_977 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _983, vect__1713.1420_990, { Nan, Nan, Nan, Nan }, 8); # VUSE <.MEM_2600> _1718 = *_1717; vect_yij_1719.1424_975 = vect_cst__976 - vect__1718.1422_980; vect_yij_1719.1424_974 = vect_cst__976 - vect__1718.1422_977; yij_1719 = yi_1691 - _1718; # DEBUG yijD.7656 => NULL # DEBUG BEGIN_STMT # RANGE ~[2147483650, 18446744071562067969] _1720 = _1710 + 2; # RANGE [0, 18446744073709551608] NONZERO 18446744073709551608 _1721 = _1720 * 8; # PT = nonlocal null _1722 = x_242(D) + _1721; # VUSE <.MEM_2600> # USE = anything vect__1723.1425_967 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _971, vect__1709.1418_994, { Nan, Nan, Nan, Nan }, 8); # VUSE <.MEM_2600> # USE = anything vect__1723.1425_965 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _971, vect__1713.1420_990, { Nan, Nan, Nan, Nan }, 8); # VUSE <.MEM_2600> _1723 = *_1722; vect_zij_1724.1427_963 = vect_cst__964 - vect__1723.1425_967; vect_zij_1724.1427_962 = vect_cst__964 - vect__1723.1425_965; zij_1724 = zi_1695 - 
_1723; # DEBUG zijD.7657 => NULL # DEBUG BEGIN_STMT vect_powmult_2740.1428_961 = vect_xij_1714.1421_987 * vect_xij_1714.1421_987; vect_powmult_2740.1428_960 = vect_xij_1714.1421_986 * vect_xij_1714.1421_986; powmult_2740 = xij_1714 * xij_1714; vect_powmult_2713.1429_959 = vect_yij_1719.1424_975 * vect_yij_1719.1424_975; vect_powmult_2713.1429_958 = vect_yij_1719.1424_974 * vect_yij_1719.1424_974; powmult_2713 = yij_1719 * yij_1719; vect_powmult_1661.1430_957 = vect_zij_1724.1427_963 * vect_zij_1724.1427_963; vect_powmult_1661.1430_956 = vect_zij_1724.1427_962 * vect_zij_1724.1427_962; powmult_1661 = zij_1724 * zij_1724; vect__1971.1431_955 = vect_powmult_1661.1430_957 + vect_powmult_2713.1429_959; vect__1971.1431_954 = vect_powmult_1661.1430_956 + vect_powmult_2713.1429_958; _1971 = powmult_1661 + powmult_2713; vect_r2_1729.1432_953 = vect__1971.1431_955 + vect_powmult_2740.1428_961; vect_r2_1729.1432_952 = vect__1971.1431_954 + vect_powmult_2740.1428_960; // compute r2 r2_1729 = _1971 + powmult_2740; # DEBUG r2D.7683 => NULL # DEBUG BEGIN_STMT # DEBUG r2D.7683 => NULL # DEBUG BEGIN_STMT # DEBUG BEGIN_STMT vect__1730.1433_950 = .SQRT (vect_r2_1729.1432_953); // after if (r2 > rgbmaxpsmax2) compute vect__1730.1433_949 = .SQRT (vect_r2_1729.1432_952); vect_dij1i_1731.1434_947 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1730.1433_950; vect_dij1i_1731.1434_946 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1730.1433_949; # DEBUG dij1iD.7664 => NULL # DEBUG BEGIN_STMT vect_dij_1732.1435_945 = vect_r2_1729.1432_953 * vect_dij1i_1731.1434_947; vect_dij_1732.1435_944 = vect_r2_1729.1432_952 * vect_dij1i_1731.1434_946; dij_1732 = r2_1729 * Inf; # DEBUG dijD.7673 => NULL # DEBUG BEGIN_STMT _1733 = (long unsigned intD.10) j_1708; _1734 = _1733 * 8; _1241 = _1242 + _1734; # PT = nonlocal escaped null _1735 = (doubleD.32 *) _1241; mask__1239.1436_942 = vect_r2_1729.1432_953 <= vect_cst__943; // if (r2 > rgbmaxpsmax2) mask__1239.1436_941 = vect_r2_1729.1432_952 <= vect_cst__943; 
_1239 = r2_1729 <= powmult_2494; stmp_938 = VIEW_CONVERT_EXPR<vector(4) doubleD.32>(mask__1239.1436_942); # VUSE <.MEM_2600> # USE = anything vect__1736.1437_937 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _939, vect_j_1708.1417_996, stmp_938, 8); // after if (r2 > rgbmaxpsmax2) compute vect__1736.1438_936 = VEC_PERM_EXPR <vect_j_1708.1417_996, vect_j_1708.1417_996, { 4, 5, 6, 7, 4, 5, 6, 7 }>; stmp_935 = VIEW_CONVERT_EXPR<vector(4) doubleD.32>(mask__1239.1436_941); # VUSE <.MEM_2600> # USE = anything vect__1736.1437_934 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _939, vect__1736.1438_936, stmp_935, 8); _1237 = _1238 + _1734; # PT = nonlocal escaped null _1737 = (doubleD.32 *) _1237; # VUSE <.MEM_2600> # USE = anything vect__1738.1439_931 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _933, vect_j_1708.1417_996, stmp_938, 8); # VUSE <.MEM_2600> # USE = anything vect__1738.1439_924 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _933, vect__1736.1438_936, stmp_935, 8); vect__1739.1441_922 = vect__1738.1439_931 + { -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2 }; vect__1739.1441_921 = vect__1738.1439_924 + { -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2 }; vect_sj_1740.1442_920 = vect__1736.1437_937 * vect__1739.1441_922; vect_sj_1740.1442_919 = vect__1736.1437_934 * vect__1739.1441_921; # DEBUG sjD.7686 => NULL # DEBUG BEGIN_STMT # DEBUG sj2D.7687 => NULL # DEBUG BEGIN_STMT vect__1743.1443_917 = vect_sj_1740.1442_920 + { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 }; vect__1743.1443_916 = vect_sj_1740.1442_919 + { 
2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 }; mask__1463.1444_915 = vect_dij_1732.1435_945 <= vect__1743.1443_917; mask__1463.1444_914 = vect_dij_1732.1435_944 <= vect__1743.1443_916; _1463 = dij_1732 <= 2.0e+1; mask__1462.1445_913 = mask__1239.1436_942 & mask__1463.1444_915; // if (dij > rgbmax + sj) mask__1462.1445_912 = mask__1239.1436_941 & mask__1463.1444_914; _1462 = _1239 & _1463; vect_powmult_1725.1446_911 = vect_sj_1740.1442_920 * vect_sj_1740.1442_920; vect_powmult_1725.1446_910 = vect_sj_1740.1442_919 * vect_sj_1740.1442_919; # DEBUG BEGIN_STMT vect__1744.1447_908 = { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 } - vect_sj_1740.1442_920; // begin if ((dij > rgbmax - sj)) vect__1744.1447_907 = { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 } - vect_sj_1740.1442_919; mask__1461.1448_906 = vect_dij_1732.1435_945 > vect__1744.1447_908; mask__1461.1448_905 = vect_dij_1732.1435_944 > vect__1744.1447_907; _1461 = dij_1732 > 2.0e+1; mask__1460.1449_904 = mask__1461.1448_906 & mask__1462.1445_913; // if ((dij > rgbmax - sj)) enter if-else chain mask__1460.1449_903 = mask__1461.1448_905 & mask__1462.1445_912; _1460 = _1461 & _1462; else add # DEBUG BEGIN_STMT vect__1745.1450_902 = vect_dij_1732.1435_945 - vect_sj_1740.1442_920; vect__1745.1450_901 = vect_dij_1732.1435_944 - vect_sj_1740.1442_919; vect_uij_1746.1451_899 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1745.1450_902; vect_uij_1746.1451_898 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1745.1450_901; uij_1746 = 0.0 / r2_1729; # DEBUG uijD.7689 => NULL # DEBUG BEGIN_STMT vect__1748.1452_896 = vect_dij_1732.1435_945 * { 8.0e+1, 8.0e+1, 8.0e+1, 8.0e+1 }; vect__1748.1452_895 = vect_dij_1732.1435_944 * { 8.0e+1, 8.0e+1, 8.0e+1, 8.0e+1 }; _1748 = dij_1732 * 8.0e+1; vect__2057.1453_894 = vect_powmult_1725.1446_911 - vect_r2_1729.1432_953; vect__2057.1453_893 = vect_powmult_1725.1446_910 - vect_r2_1729.1432_952; _2057 = -r2_1729; vect__1750.1454_892 = vect__1748.1452_896 + vect__2057.1453_894; vect__1750.1454_891 = vect__1748.1452_895 + 
vect__2057.1453_893; _1750 = _1748 + _2057; vect__1751.1455_889 = vect__1750.1454_892 * { 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3 }; vect__1751.1455_888 = vect__1750.1454_891 * { 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3 }; _1751 = _1750 * 2.50000000000000048572257327350598643533885478973388671875e-3; vect__2086.1456_886 = vect_dij_1732.1435_945 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 }; vect__2086.1456_885 = vect_dij_1732.1435_944 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 }; _2086 = dij_1732 * 2.0e+0; vect__1753.1457_884 = vect_uij_1746.1451_899 * vect__2086.1456_886; vect__1753.1457_883 = vect_uij_1746.1451_898 * vect__2086.1456_885; _1753 = uij_1746 * _2086; vect__1754.1458_882 = vect__1751.1455_889 - vect__1753.1457_884; vect__1754.1458_881 = vect__1751.1455_888 - vect__1753.1457_883; _1754 = _1751 - _1753; vect__1755.1459_879 = vect__1745.1450_902 * { 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2 }; vect__1755.1459_878 = vect__1745.1450_901 * { 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2 }; _1755 = dij_1732 * 5.000000000000000277555756156289135105907917022705078125e-2; vect__1756.1460_877 = __svml_log4_mask_e9D.7954 (vect__1755.1459_879); vect__1756.1460_876 = 
__svml_log4_mask_e9D.7954 (vect__1755.1459_878); vect__1757.1461_874 = vect__1756.1460_877 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 }; vect__1757.1461_873 = vect__1756.1460_876 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 }; vect__2097.1462_871 = vect__1754.1458_882 + { -1.0e+0, -1.0e+0, -1.0e+0, -1.0e+0 }; vect__2097.1462_870 = vect__1754.1458_881 + { -1.0e+0, -1.0e+0, -1.0e+0, -1.0e+0 }; _2097 = _1754 - 1.0e+0; vect__1759.1463_869 = vect__2097.1462_871 - vect__1757.1461_874; vect__1759.1463_868 = vect__2097.1462_870 - vect__1757.1461_873; vect__2099.1464_866 = vect_dij1i_1731.1434_947 * { 1.25e-1, 1.25e-1, 1.25e-1, 1.25e-1 }; vect__2099.1464_865 = vect_dij1i_1731.1434_946 * { 1.25e-1, 1.25e-1, 1.25e-1, 1.25e-1 }; vect__1761.1465_864 = vect__1759.1463_869 * vect__2099.1464_866; vect__1761.1465_863 = vect__1759.1463_868 * vect__2099.1464_865; _1761 = _2097 * Inf; /// else add # DEBUG temp0D.7768 => NULL mask__1458.1466_862 = vect_dij_1732.1435_945 <= vect__1744.1447_908; // begin else if (dij > 4.0 * sj) mask__1458.1466_861 = vect_dij_1732.1435_944 <= vect__1744.1447_907; mask__1457.1467_860 = mask__1458.1466_862 & mask__1462.1445_913; mask__1457.1467_859 = mask__1458.1466_861 & mask__1462.1445_912; # DEBUG BEGIN_STMT vect__1764.1468_857 = vect_sj_1740.1442_920 * { 4.0e+0, 4.0e+0, 4.0e+0, 4.0e+0 }; vect__1764.1468_856 = vect_sj_1740.1442_919 * { 4.0e+0, 4.0e+0, 4.0e+0, 4.0e+0 }; mask__1456.1469_855 = vect_dij_1732.1435_945 > vect__1764.1468_857; mask__1456.1469_854 = vect_dij_1732.1435_944 > vect__1764.1468_856; _1456 = dij_1732 > 0.0; mask__1455.1470_853 = mask__1456.1469_855 & mask__1457.1467_860; // else if (dij > 4.0 * sj) mask__1455.1470_852 = mask__1456.1469_854 & mask__1457.1467_859; _1455 = _1456 & _1462; /// else add # DEBUG BEGIN_STMT vect_powmult_1726.1471_851 = vect_dij1i_1731.1434_947 * vect_dij1i_1731.1434_947; vect_powmult_1726.1471_846 = vect_dij1i_1731.1434_946 * vect_dij1i_1731.1434_946; # DEBUG dij2iD.7672 => NULL # DEBUG BEGIN_STMT vect_tmpsd_1766.1472_845 = 
vect_powmult_1725.1446_911 * vect_powmult_1726.1471_851; vect_tmpsd_1766.1472_844 = vect_powmult_1725.1446_910 * vect_powmult_1726.1471_846; # DEBUG tmpsdD.7695 => NULL # DEBUG BEGIN_STMT vect__1767.1473_842 = vect_tmpsd_1766.1472_845 * { 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1 }; vect__1767.1473_841 = vect_tmpsd_1766.1472_844 * { 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1 }; vect__1768.1474_839 = vect__1767.1473_842 + { 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1 }; vect__1768.1474_838 = vect__1767.1473_841 + { 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1 }; vect__1769.1475_837 = vect_tmpsd_1766.1472_845 * vect__1768.1474_839; vect__1769.1475_836 = vect_tmpsd_1766.1472_844 * vect__1768.1474_838; vect__1770.1476_834 = vect__1769.1475_837 + { 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1 }; vect__1770.1476_832 = vect__1769.1475_836 + { 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 
4.28571428571428547638078043746645562350749969482421875e-1 }; vect__1771.1477_831 = vect_tmpsd_1766.1472_845 * vect__1770.1476_834; vect__1771.1477_830 = vect_tmpsd_1766.1472_844 * vect__1770.1476_832; vect__1772.1478_824 = vect__1771.1477_831 + { 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1 }; vect__1772.1478_823 = vect__1771.1477_830 + { 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1 }; vect__1773.1479_822 = vect_tmpsd_1766.1472_845 * vect__1772.1478_824; vect__1773.1479_821 = vect_tmpsd_1766.1472_844 * vect__1772.1478_823; vect_dumbo_1774.1480_819 = vect__1773.1479_822 + { 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1 }; vect_dumbo_1774.1480_818 = vect__1773.1479_821 + { 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1 }; # DEBUG dumboD.7694 => NULL # DEBUG BEGIN_STMT vect__2892.1481_817 = vect_powmult_1726.1471_851 * vect_sj_1740.1442_920; vect__2892.1481_816 = vect_powmult_1726.1471_846 * vect_sj_1740.1442_919; vect__1776.1482_815 = vect_tmpsd_1766.1472_845 * vect__2892.1481_817; vect__1776.1482_814 = vect_tmpsd_1766.1472_844 * vect__2892.1481_816; vect__1777.1483_813 = vect_dumbo_1774.1480_819 * vect__1776.1482_815; vect__1777.1483_812 = vect_dumbo_1774.1480_818 * vect__1776.1482_814; # DEBUG temp1D.7769 => NULL 
mask__1453.1484_811 = vect_dij_1732.1435_945 <= vect__1764.1468_857; // begin else if (dij > ri + sj) mask__1453.1484_810 = vect_dij_1732.1435_944 <= vect__1764.1468_856; _1453 = dij_1732 <= 0.0; mask__1452.1485_809 = mask__1453.1484_811 & mask__1457.1467_860; mask__1452.1485_808 = mask__1453.1484_810 & mask__1457.1467_859; _1452 = _1453 & _1462; // esle add # DEBUG BEGIN_STMT vect__1780.1486_806 = vect_cst__807 + vect_sj_1740.1442_920; vect__1780.1486_805 = vect_cst__807 + vect_sj_1740.1442_919; _1780 = ri_1700; mask__1451.1487_804 = vect_dij_1732.1435_945 > vect__1780.1486_806; mask__1451.1487_803 = vect_dij_1732.1435_944 > vect__1780.1486_805; _1451 = dij_1732 > _1780; mask__1450.1488_802 = mask__1451.1487_804 & mask__1452.1485_809; mask__1450.1488_801 = mask__1451.1487_803 & mask__1452.1485_808; // else if (dij > ri + sj) _1450 = _1451 & _1452; # DEBUG BEGIN_STMT vect__1782.1489_800 = vect_sj_1740.1442_920 / vect__2057.1453_894; vect__1782.1489_799 = vect_sj_1740.1442_919 / vect__2057.1453_893; _1782 = 0.0 / r2_1729; vect__1784.1490_797 = vect_dij_1732.1435_945 + vect_sj_1740.1442_920; vect__1784.1490_796 = vect_dij_1732.1435_944 + vect_sj_1740.1442_919; vect__1785.1491_795 = vect__1745.1450_902 / vect__1784.1490_797; vect__1785.1491_794 = vect__1745.1450_901 / vect__1784.1490_796; vect__1786.1492_793 = __svml_log4_mask_e9D.7987 (vect__1785.1491_795); vect__1786.1492_792 = __svml_log4_mask_e9D.7987 (vect__1785.1491_794); vect__1894.1493_790 = vect_dij1i_1731.1434_947 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 }; vect__1894.1493_789 = vect_dij1i_1731.1434_946 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 }; vect__1788.1494_788 = vect__1786.1492_793 * vect__1894.1493_790; vect__1788.1494_787 = vect__1786.1492_792 * vect__1894.1493_789; vect__1789.1495_786 = vect__1782.1489_800 - vect__1788.1494_788; vect__1789.1495_785 = vect__1782.1489_799 - vect__1788.1494_787; _1789 = _1782 - Nan; vect__1790.1496_783 = vect__1789.1495_786 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 }; vect__1790.1496_782 
= vect__1789.1495_785 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 }; _1790 = _1789 * 5.0e-1; # DEBUG temp2D.7770 => NULL mask__1448.1497_781 = vect_dij_1732.1435_945 <= vect__1780.1486_806; // begin else if (dij > fabs(ri - sj)) mask__1448.1497_780 = vect_dij_1732.1435_944 <= vect__1780.1486_805; _1448 = dij_1732 <= _1780; mask__1447.1498_779 = mask__1448.1497_781 & mask__1452.1485_809; mask__1447.1498_778 = mask__1448.1497_780 & mask__1452.1485_808; _1447 = _1448 & _1452; # DEBUG BEGIN_STMT vect__1793.1499_776 = vect_cst__807 - vect_sj_1740.1442_920; vect__1793.1499_775 = vect_cst__807 - vect_sj_1740.1442_919; vect__1794.1500_774 = ABS_EXPR <vect__1793.1499_776>; vect__1794.1500_773 = ABS_EXPR <vect__1793.1499_775>; _1794 = ABS_EXPR <_1780>; mask__1446.1501_772 = vect_dij_1732.1435_945 > vect__1794.1500_774; mask__1446.1501_771 = vect_dij_1732.1435_944 > vect__1794.1500_773; _1446 = dij_1732 > _1794; mask__1445.1502_770 = mask__1446.1501_772 & mask__1447.1498_779; mask__1445.1502_769 = mask__1446.1501_771 & mask__1447.1498_778; // else if (dij > fabs(ri - sj)) _1445 = _1446 & _1447; # DEBUG BEGIN_STMT vect__2372.1503_767 = vect_cst__768 - vect_powmult_1725.1446_911; vect__2372.1503_766 = vect_cst__768 - vect_powmult_1725.1446_910; _2372 = powmult_1728; vect__1798.1504_765 = vect_r2_1729.1432_953 + vect__2372.1503_767; vect__1798.1504_764 = vect_r2_1729.1432_952 + vect__2372.1503_766; _1798 = r2_1729 + _2372; vect__2373.1505_762 = vect__1798.1504_765 * vect_cst__763; vect__2373.1505_761 = vect__1798.1504_764 * vect_cst__763; _2373 = _1798 * _2894; vect_theta_1800.1506_760 = vect_dij1i_1731.1434_947 * vect__2373.1505_762; vect_theta_1800.1506_759 = vect_dij1i_1731.1434_946 * vect__2373.1505_761; theta_1800 = _2373 * Inf; # DEBUG thetaD.7670 => NULL # DEBUG BEGIN_STMT vect_uij_1802.1507_757 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1784.1490_797; vect_uij_1802.1507_756 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1784.1490_796; # DEBUG uijD.7689 => NULL # DEBUG BEGIN_STMT 
vect__1803.1508_754 = vect_theta_1800.1506_760 + { -2.0e+0, -2.0e+0, -2.0e+0, -2.0e+0 }; vect__1803.1508_753 = vect_theta_1800.1506_759 + { -2.0e+0, -2.0e+0, -2.0e+0, -2.0e+0 }; _1803 = theta_1800 - 2.0e+0; vect__1804.1509_751 = vect_cst__752 * vect__1803.1508_754; vect__1804.1509_750 = vect_cst__752 * vect__1803.1508_753; _1804 = ri1i_1701 * _1803; vect__1805.1510_749 = vect_uij_1802.1507_757 + vect__1804.1509_751; vect__1805.1510_748 = vect_uij_1802.1507_756 + vect__1804.1509_750; _1805 = uij_1746 + _1804; vect__1806.1511_746 = vect_uij_1802.1507_757 * vect_cst__807; vect__1806.1511_745 = vect_uij_1802.1507_756 * vect_cst__807; _1806 = ri_1700 * uij_1746; vect__1807.1512_744 = __svml_log4_mask_e9D.8008 (vect__1806.1511_746); vect__1807.1512_743 = __svml_log4_mask_e9D.8008 (vect__1806.1511_745); vect__1808.1513_742 = vect_dij1i_1731.1434_947 * vect__1807.1512_744; vect__1808.1513_741 = vect_dij1i_1731.1434_946 * vect__1807.1512_743; vect__1809.1514_740 = vect__1805.1510_749 - vect__1808.1513_742; vect__1809.1514_739 = vect__1805.1510_748 - vect__1808.1513_741; _1809 = _1805 - Nan; vect__1810.1515_737 = vect__1809.1514_740 * { 2.5e-1, 2.5e-1, 2.5e-1, 2.5e-1 }; vect__1810.1515_736 = vect__1809.1514_739 * { 2.5e-1, 2.5e-1, 2.5e-1, 2.5e-1 }; _1810 = _1809 * 2.5e-1; # DEBUG temp3D.7771 => NULL mask__1443.1516_735 = vect_dij_1732.1435_945 <= vect__1794.1500_774; // begin else if (ri < sj) mask__1443.1516_734 = vect_dij_1732.1435_944 <= vect__1794.1500_773; _1443 = dij_1732 <= _1794; mask__1442.1517_733 = mask__1443.1516_735 & mask__1447.1498_779; mask__1442.1517_732 = mask__1443.1516_734 & mask__1447.1498_778; _1442 = _1443 & _1447; # DEBUG BEGIN_STMT mask__1441.1518_730 = vect_cst__807 < vect_sj_1740.1442_920; mask__1441.1518_729 = vect_cst__807 < vect_sj_1740.1442_919; _1441 = _1699 < 8.99999999999999966693309261245303787291049957275390625e-2; mask__1406.1519_728 = mask__1441.1518_730 & mask__1442.1517_733; mask__1406.1519_727 = mask__1441.1518_729 & 
mask__1442.1517_732; // else if (ri < sj) _1406 = _1441 & _1442; # DEBUG BEGIN_STMT vect__1816.1520_725 = vect__1782.1489_800 - vect_cst__726; vect__1816.1520_724 = vect__1782.1489_799 - vect_cst__726; _1816 = _1782 - _1815; vect__1235.1521_723 = -vect__1785.1491_795; vect__1235.1521_722 = -vect__1785.1491_794; vect__1820.1522_721 = __svml_log4_mask_e9D.8019 (vect__1235.1521_723); vect__1820.1522_720 = __svml_log4_mask_e9D.8019 (vect__1235.1521_722); vect__1822.1523_719 = vect__1820.1522_721 * vect__1894.1493_790; vect__1822.1523_718 = vect__1820.1522_720 * vect__1894.1493_789; vect__1823.1524_717 = vect__1816.1520_725 - vect__1822.1523_719; vect__1823.1524_716 = vect__1816.1520_724 - vect__1822.1523_718; _1823 = _1816 - Nan; vect__1824.1525_714 = vect__1823.1524_717 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 }; vect__1824.1525_713 = vect__1823.1524_716 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 }; // end if-else _1824 = _1823 * 5.0e-1; # DEBUG temp4D.7772 => NULL vect__ifc__1252.1526_711 = VEC_COND_EXPR <mask__1460.1449_904, vect__1761.1465_864, { 0.0, 0.0, 0.0, 0.0 }>; vect__ifc__1252.1526_710 = VEC_COND_EXPR <mask__1460.1449_903, vect__1761.1465_863, { 0.0, 0.0, 0.0, 0.0 }>; _ifc__1252 = _1460 ? _1761 : 0.0; vect__1251.1527_709 = vect_temp0_1543.1410_1003 + vect__ifc__1252.1526_711; vect__1251.1527_708 = vect__1251.1527_709 + vect__ifc__1252.1526_710; _1251 = temp0_1543 + _ifc__1252; vect__ifc__1250.1529_704 = VEC_COND_EXPR <mask__1455.1470_853, vect__1777.1483_813, { 0.0, 0.0, 0.0, 0.0 }>; vect__ifc__1250.1529_703 = VEC_COND_EXPR <mask__1455.1470_852, vect__1777.1483_812, { 0.0, 0.0, 0.0, 0.0 }>; _ifc__1250 = _1455 ? 
Nan : 0.0; vect__1249.1530_702 = vect_temp1_2883.1411_1002 - vect__ifc__1250.1529_704; vect__1249.1530_701 = vect__1249.1530_702 - vect__ifc__1250.1529_703; _1249 = temp1_2883 - _ifc__1250; vect__ifc__1248.1532_697 = VEC_COND_EXPR <mask__1450.1488_802, vect__1790.1496_783, { 0.0, 0.0, 0.0, 0.0 }>; vect__ifc__1248.1532_696 = VEC_COND_EXPR <mask__1450.1488_801, vect__1790.1496_782, { 0.0, 0.0, 0.0, 0.0 }>; _ifc__1248 = _1450 ? _1790 : 0.0; vect__1247.1533_695 = vect_temp2_224.1412_1001 + vect__ifc__1248.1532_697; vect__1247.1533_694 = vect__1247.1533_695 + vect__ifc__1248.1532_696; _1247 = temp2_224 + _ifc__1248; vect__ifc__1246.1535_690 = VEC_COND_EXPR <mask__1445.1502_770, vect__1810.1515_737, { 0.0, 0.0, 0.0, 0.0 }>; vect__ifc__1246.1535_689 = VEC_COND_EXPR <mask__1445.1502_769, vect__1810.1515_736, { 0.0, 0.0, 0.0, 0.0 }>; _ifc__1246 = _1445 ? _1810 : 0.0; vect__1245.1536_688 = vect_temp3_2699.1413_1000 + vect__ifc__1246.1535_690; vect__1245.1536_687 = vect__1245.1536_688 + vect__ifc__1246.1535_689; _1245 = temp3_2699 + _ifc__1246; vect__ifc__1244.1538_673 = VEC_COND_EXPR <mask__1406.1519_728, vect__1824.1525_714, { 0.0, 0.0, 0.0, 0.0 }>; vect__ifc__1244.1538_672 = VEC_COND_EXPR <mask__1406.1519_727, vect__1824.1525_713, { 0.0, 0.0, 0.0, 0.0 }>; _ifc__1244 = _1406 ? 
_1824 : 0.0; vect__1243.1539_671 = vect_temp4_1545.1414_999 + vect__ifc__1244.1538_673; vect__1243.1539_670 = vect__1243.1539_671 + vect__ifc__1244.1538_672; _1243 = temp4_1545 + _ifc__1244; # DEBUG temp4D.7772 => _1243 # DEBUG temp3D.7771 => _1245 # DEBUG temp2D.7770 => _1247 # DEBUG temp1D.7769 => _1249 # DEBUG temp0D.7768 => _1251 # DEBUG BEGIN_STMT # RANGE [1, 2147483647] NONZERO 2147483647 k_1827 = k_3019 + 1; # DEBUG temp4D.7772 => _1243 # DEBUG temp3D.7771 => _1245 # DEBUG temp2D.7770 => _1247 # DEBUG temp1D.7769 => _1249 # DEBUG temp0D.7768 => _1251 # DEBUG kD.7615 => k_1827 # DEBUG BEGIN_STMT # PT = nonlocal escaped null vectp.1415_997 = vectp.1415_998 + 32; ivtmp_666 = ivtmp_667 + 1; if (ivtmp_666 < bnd.1407_1013) goto <bb 216>; [83.33%] else goto <bb 303>; [16.67% |
bb 分块的优化方案:
1:找到vec_cond_expr,将其中第一个参数mask作为上一个bb的结束,(其后还有一个mask)并且在其后新建一个该mask与0进行比较的gimple_cond,将这两个mask相与。同时新建该mask判断为true 和 false的edge,分别指向分割的bb和其下一个bb。
2:以vec_cond_expr的第二个参数的ssa_name_def作为要分割bb的末尾,进行分割。并且生成一条指向其下一个bb的edge。同时将其作为mask判断为false的edge的dest。
optimize_mask_stores 代码
/* The code below is trying to perform simple optimization - revert
   if-conversion for masked stores, i.e. if the mask of a store is zero
   do not perform it and all stored value producers also if possible.
   For example,
     for (i=0; i<n; i++)
       if (c[i])
	 {
	   p1[i] += 1;
	   p2[i] = p3[i] +2;
	 }
   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
 */

void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      /* Argument 2 of an IFN_MASK_STORE call is the mask vector.  */
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      /* TRUE edge (mask all-zero) bypasses the stores straight to
	 JOIN_BB; FALSE edge enters STORE_BB, which then falls through
	 back into JOIN_BB.  */
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  Walk backwards
	     from the store: each producer is sunk only when it is safe
	     (pure, SSA def, used solely inside STORE_BB).  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.
	     Only fuse the next worklist entry when it uses the same mask
	     AND is exactly the statement the backward walk stopped on.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
optimize_mask_vec_cond 代码
10093 void 10094 optimize_mask_vec_cond (class loop *loop) 10095 { 10096 basic_block *bbs = get_loop_body (loop); 10097 unsigned nbbs = loop->num_nodes; 10098 unsigned i; 10099 basic_block bb, bb_mask; 10100 class loop *bb_loop; 10101 gimple_stmt_iterator gsi; 10102 gimple *stmt; 10103 auto_vec<gimple *> worklist; 10104 auto_purge_vect_location sentinel; 10105 10106 enum tree_code code; 10107 10108 vect_location = find_loop_location (loop); 10109 /* Pick up all vec_cond_expr in loop if any. */ 10110 for (i = 0; i < nbbs; i++) 10111 { 10112 bb = bbs[i]; 10113 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 10114 gsi_next (&gsi)) 10115 { 10116 stmt = gsi_stmt (gsi); 10117 if (is_gimple_assign(stmt)) { 10118 gassign *stmt_assign = dyn_cast <gassign *> (gsi_stmt (gsi)); 10119 code = gimple_assign_rhs_code (stmt_assign); 10120 // 检查语句是否为 VEC_COND_EXPR 10121 if (code == VEC_COND_EXPR) { 10122 worklist.safe_push (stmt); 10123 } 10124 } 10125 } 10126 } 10128 free (bbs); 10129 if (worklist.is_empty ()) 10130 return; 10131 10132 /* Loop has vec_cond_expr. */ 10133 while (!worklist.is_empty ()) 10134 { 10135 gimple *last, *last_store, *last1; 10136 edge e, efalse; 10137 tree mask; 10138 basic_block store_bb, join_bb; 10139 gimple_stmt_iterator gsi_to; 10140 gimple_stmt_iterator gsi_stmt_def; 10141 tree vdef, new_vdef; 10142 gphi *phi; 10143 tree vectype; 10144 tree zero; 10145 10146 last = worklist.pop (); 10147 gassign *stmt_assign = dyn_cast <gassign *> (last); 10148 mask = gimple_assign_rhs1(stmt_assign); 10149 tree true_vector_operand = gimple_assign_rhs2(stmt_assign); 10150 10151 gimple *mask_def = SSA_NAME_DEF_STMT (mask); 10152 10153 gimple *stmt_def = SSA_NAME_DEF_STMT (true_vector_operand); 10154 10155 bb = gimple_bb (stmt_def); 10156 10157 // bb_mask = gimple_bb (mask_def); 10158 /* Create then_bb and if-then structure in CFG, then_bb belongs to 10159 the same loop as if_bb. 
It could be different to LOOP when two 10160 level loop-nest is vectorized and mask_store belongs to the inner 10161 one. */ 10162 10163 gsi_stmt_def = gsi_for_stmt (stmt_def); 10164 gsi_next(&gsi_stmt_def); 10165 10166 stmt_def = gsi_stmt(gsi_stmt_def); 10167 10168 e = split_block (bb, stmt_def); 10169 bb_loop = bb->loop_father; 10170 // gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop)); 10171 join_bb = e->dest; 10172 store_bb = create_empty_bb (bb); 10173 add_bb_to_loop (store_bb, bb_loop); 10174 e->flags = EDGE_TRUE_VALUE; 10175 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE); 10176 /* Put STORE_BB to likely part. */ 10177 efalse->probability = profile_probability::unlikely (); 10178 store_bb->count = efalse->count (); 10179 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU); 10180 if (dom_info_available_p (CDI_DOMINATORS)) 10181 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb); 10182 if (dump_enabled_p ()) 10183 dump_printf_loc (MSG_NOTE, vect_location, 10184 "Create new block %d to sink vect cond expr", 10185 store_bb->index); 10186 /* Create vector comparison with boolean result. 
*/ 10187 vectype = TREE_TYPE (mask); 10188 zero = build_zero_cst (vectype); 10189 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE); 10190 // gsi = gsi_last_bb (bb); 10191 gsi = gsi_for_stmt (mask_def); 10192 gsi_next(&gsi); 10193 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT); 10194 /* Create new PHI node for vdef of the last masked store: 10195 .MEM_2 = VDEF <.MEM_1> 10196 will be converted to 10197 .MEM.3 = VDEF <.MEM_1> 10198 and new PHI node will be created in join bb 10199 .MEM_2 = PHI <.MEM_1, .MEM_3> 10200 */ 10201 /* vdef = gimple_vdef (last); 10202 new_vdef = make_ssa_name (gimple_vop (cfun), last); 10203 gimple_set_vdef (last, new_vdef); 10204 phi = create_phi_node (vdef, join_bb); 10205 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);*/ 10206 10207 /* Put all masked stores with the same mask to STORE_BB if possible. */ 10208 // while (true) 10209 // { 10210 gimple_stmt_iterator gsi_from; 10211 gimple *stmt1 = NULL; 10213 /* Move vec_cond second var def to STORE_BB. */ 10214 last_store = stmt_def; 10215 gsi = gsi_for_stmt (stmt_def); 10216 gsi_from = gsi; 10217 /* Shift GSI to the previous stmt for further traversal. */ 10218 gsi_prev (&gsi); 10219 gsi_to = gsi_start_bb (store_bb); 10220 gsi_move_before (&gsi_from, &gsi_to); 10221 /* Setup GSI_TO to the non-empty block start. */ 10222 gsi_to = gsi_start_bb (store_bb); 10223 if (dump_enabled_p ()) 10224 dump_printf_loc (MSG_NOTE, vect_location, 10225 "Move stmt to created bb\n%G", last); 10226 /* Move all stored value producers if possible. */ 10227 while (!gsi_end_p (gsi)) 10228 { 10229 tree lhs; 10230 imm_use_iterator imm_iter; 10231 use_operand_p use_p; 10232 bool res; 10233 10234 /* Skip debug statements. */ 10235 if (is_gimple_debug (gsi_stmt (gsi))) 10236 { 10237 gsi_prev (&gsi); 10238 continue; 10239 } 10240 stmt1 = gsi_stmt (gsi); 10241 /* Do not consider statements writing to memory or having 10242 volatile operand. 
*/ 10243 if (gimple_vdef (stmt1) 10244 || gimple_has_volatile_ops (stmt1)) 10245 break; 10246 gsi_from = gsi; 10247 gsi_prev (&gsi); 10248 lhs = gimple_get_lhs (stmt1); 10249 if (!lhs) 10250 break; 10251 10252 /* LHS of vectorized stmt must be SSA_NAME. */ 10253 if (TREE_CODE (lhs) != SSA_NAME) 10254 break; 10255 10256 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 10257 { 10258 /* Remove dead scalar statement. */ 10259 /* if (has_zero_uses (lhs)) 10260 { 10261 gsi_remove (&gsi_from, true); 10262 continue; 10263 }*/ 10264 } 10265 10266 /* Check that LHS does not have uses outside of STORE_BB. */ 10267 res = true; 10268 /* FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 10269 { 10270 gimple *use_stmt; 10271 use_stmt = USE_STMT (use_p); 10272 if (is_gimple_debug (use_stmt)) 10273 continue; 10274 if (gimple_bb (use_stmt) != store_bb) 10275 { 10276 res = false; 10277 break; 10278 } 10279 }*/ 10280 if (!res) 10281 break; 10282 10283 /* if (gimple_vuse (stmt1) 10284 && gimple_vuse (stmt1) != gimple_vuse (last_store)) 10285 break;*/ 10286 10287 /* Can move STMT1 to STORE_BB. */ 10288 if (dump_enabled_p ()) 10289 dump_printf_loc (MSG_NOTE, vect_location, 10290 "Move stmt to created bb\n%G", stmt1); 10291 gsi_move_before (&gsi_from, &gsi_to); 10292 /* Shift GSI_TO for further insertion. */ 10293 gsi_prev (&gsi_to); 10294 } 10295 /* Put other masked stores with the same mask to STORE_BB. */ 10296 /* if (worklist.is_empty () 10297 || gimple_call_arg (worklist.last (), 2) != mask 10298 || worklist.last () != stmt1) 10299 break; 10300 last = worklist.pop ();*/ 10301 // last1 = worklist.pop (); 10302 // } 10303 // add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION); 10304 if (!worklist.is_empty ()) 10305 last = worklist.pop (); 10306 } 10307 } |
目前能够按照预期完成 BB 块的拆分,但还需解决两个导致编译不通过的问题:
1:加上 -g 之后,在 fre pass 中会报错:在分析并删除 debug gimple 时,找不到某个标量的定义。原因是最后一个分支的标量 gimple 被直接删除,却没有为其生成对应的 debug gimple,导致后续 debug gimple 使用该标量时找不到其定义,从而报编译错误。解决方法:先去掉 -g;后续再参考 dce pass 中删除标量并插入 debug 语句的逻辑进行修复。# DEBUG D#583 => D#597 ? _2164 : 0.0
2:在 sink pass 中报编译错误:gimple_redirect_edge_and_branch 函数中的 assert 不通过,它要求该 edge 必须是 fallthru edge,因此在构造 edge 时需要设置 EDGE_FALLTHRU 标志。目前暂时将该 assert 注释掉(见下方代码)。
default: 6134 /* Otherwise it must be a fallthru edge, and we don't need to 6135 do anything besides redirecting it. */ 6136 // gcc_assert (e->flags & EDGE_FALLTHRU); |
解决掉编译错误后,可以正确编译运行,但是结果错误。
原因是该 loop 的 vf 是 8:每次会对 loop 中的 8 个元素进行运算,而参与 mask 计算的数据是 double 类型(一个向量只能容纳 4 个 double),因此会生成两个 mask。每个分支需要把两个 mask 同时与 {0,0,0,0} 比较是否为 0,而目前只能完成单个 mask 的比较。可行的方法:
1:修改loop 中int 的类型使其在确定vf的时候将其作为double 看待(VIEW_CONVERT_EXPR),这样vf 是4, 就不存在两个mask。
2:gimple cond 不能支持这种if ( a==0 && b==0) 这种复杂条件表达,构造两个gimple cond。然后做&运算,将此条件作为需要判断的cond。
1761处循环:
1:在每个分支条件构造后插入两个mask按位或的gimple,并且以此新建一个gimple cond,作为分支判断的条件。
2:课题运算结果出现 VE(验证错误),需查找原因。从打印各分支运算结果来看,temp4 的结果恒为 0,即最后一个分支完全没有被执行,存在问题;同时加上 -g 后报错的也是最后一个分支的标量被删除,因此怀疑最后一个分支在拆分时存在问题。(为核对正确结果,曾尝试在源码中打印中间结果,但无法打印。)
LHS 在 BB 之外被使用(lhs use outside of BB)的处理:当使用它的外部 BB 正是 VEC_COND 所在的 BB 时,认为没有问题;其他情况则需要添加 phi 节点。
2中的stmt的 lhs res在4 里面被使用,原本在同一个bb里面不需要做额外的操作,当分到不同的bb后,走不走2 res的值会不同,如果不走4中用的res会使用上一次2中计算的res值,显然结果错误,需要添加phi节点来解决。
若2中的lhs res0 被 4 use ,需要在 2的上一个bb 1新建一个向量变量res1 = 0,在2 的下一个bb 3中,新建一个phi节点,res2 = phi<res1(1),res0(2)>, 并且将4中用到res0的地方改为res2。
若2中的lhs res0 被 4 use ,需要在 2的上一个bb 1新建一个向量变量res1 = 0,将2中的res0 = xx 修改为 res2 = xx,在2 的下一个bb 3中,新建一个phi节点,res0 = phi<res1(1),res2(2)>。
若2中的res0 2中的其他stmt使用到,则需要将所有用到res0的地方改成res2
对于多个分支都需要的计算结果:可以在第二个分支中直接用到该结果的位置,复制其所依赖的全部计算语句,即在使用处重新完成这一计算。
新增phi节点的代码
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 10283 { 10284 gimple *use_stmt; 10285 use_stmt = USE_STMT (use_p); 10286 if (is_gimple_debug (use_stmt)) 10287 continue; 10288 if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last)) 10289 { 10290 // res = false; 10291 10292 if (dump_enabled_p ()) 10293 dump_printf_loc (MSG_NOTE, vect_location, 10294 "LHS have use outside of store_BB\n%G", stmt1); 10295 tree lhs_use_out,new_lhs,new_lhs1,new_lhs2; 10296 tree new_lhs_phi; 10297 gphi *phi; 10298 tree vectype; 10299 tree zero; 10300 gimple *zero_def; 10301 lhs_use_out = gimple_assign_lhs(stmt1); 10302 10303 /* if (is_gimple_assign(stmt1)) { 10304 lhs_use_out = gimple_assign_lhs(stmt1); 10305 new_lhs = create_tmp_var(TREE_TYPE(lhs_use_out), "new_tmp_var"); 10306 new_lhs_phi = make_ssa_name(new_lhs,NULL); 10307 // gimple_assign_set_lhs(stmt1, new_lhs1); 10308 10309 10310 phi = create_phi_node (new_lhs_phi, join_bb); 10311 add_phi_arg (phi, lhs_use_out, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION); 10312 10313 vectype = TREE_TYPE (lhs_use_out); 10314 zero = build_zero_cst (vectype); 10315 new_lhs1 = create_tmp_var(TREE_TYPE(lhs_use_out), "new_tmp_var1"); 10316 new_lhs2 = make_ssa_name(new_lhs1,NULL); 10317 zero_def = gimple_build_assign(new_lhs2, zero); 10318 10319 // basic_block stmt_bb = gimple_bb(stmt1); 10320 edge e_temp; 10321 edge_iterator ei; 10322 basic_block pred_bb; 10323 gimple_stmt_iterator gsi_temp; 10324 10325 // if (EDGE_COUNT(stmt_bb->preds) == 1) { 10326 e_temp = EDGE_PRED(store_bb, 0); 10327 pred_bb = e_temp->src; 10328 gsi_temp = gsi_start_bb(pred_bb); 10329 gsi_insert_before(&gsi_temp, zero_def, GSI_SAME_STMT); 10330 // } 10331 10332 add_phi_arg (phi, new_lhs2, e, UNKNOWN_LOCATION); 10333 // update_stmt (phi); 10334 10335 /* edge e_join; 10336 edge_iterator ei_join; 10337 10338 FOR_EACH_EDGE(e_join, ei_join, join_bb->succs) 10339 { 10340 if (EDGE_TRUE_P(e_join)) 10341 { 10342 *true_bb = e->dest; 10343 } 10344 }*/ 10345 10346 for 
(unsigned int i = 0; i < gimple_num_ops(use_stmt); i++) { 10347 tree rhs = gimple_op(use_stmt, i); 10348 if(rhs == lhs_use_out) { 10349 gimple_stmt_iterator gsi = gsi_for_stmt(use_stmt); 10350 gsi_insert_before (&gsi,stmt1,GSI_SAME_STMT); 10351 break; 10352 // create_new_def_for (rhs, phi,gimple_phi_result_ptr (phi)); 10353 // update_stmt (phi); 10354 } 10355 } 10356 // } |
2069处循环:
1:需要进行 dim=3 的常量传播,并加上拆分循环,这两个条件同时满足才能向量化。验证表明:前一个循环向量化后有 7% 的性能提升,再使用 ymm 寄存器后提升达到 11%。
2:查看gcc的loop split 和 loop distribute pass,发现loop distribute的总体思想是将能够向量化的代码最大限度拆分到一个循环中,(1)但其只对非嵌套循环的最内层循环分析,发现其dump的信息中没有对2069循环进行distribute。(2)同时其只能对没有数据依赖的部分distribute,源码有数据依赖的部分使用临时数组存储后进行拆分,需要自行编写代码实现。
549课题在mask store中涉及的运算上对数学函数添加mask代码
1 #include "config.h" 2 #include "system.h" 3 #include "coretypes.h" 4 #include "backend.h" 5 #include "tree.h" 6 #include "gimple.h" 7 #include "predict.h" 8 #include "tree-pass.h" 9 #include "ssa.h" 10 #include "cgraph.h" 11 #include "fold-const.h" 12 #include "stor-layout.h" 13 #include "gimple-iterator.h" 14 #include "gimple-walk.h" 15 #include "tree-ssa-loop-manip.h" 16 #include "tree-ssa-loop-niter.h" 17 #include "tree-cfg.h" 18 #include "cfgloop.h" 19 #include "tree-vectorizer.h" 20 #include "tree-ssa-propagate.h" 21 #include "dbgcnt.h" 22 #include "tree-scalar-evolution.h" 23 #include "stringpool.h" 24 #include "attribs.h" 25 #include "gimple-pretty-print.h" 26 #include "opt-problem.h" 27 #include "internal-fn.h" 28 #include "tree-ssa-sccvn.h" 29 #include "gimple-expr.h" 30 #include <cstdio> 31 32 namespace 33 { 34 const pass_data pass_data_test = { 35 GIMPLE_PASS, /* type */ 36 "mask_vecmath_func", /* name */ 37 OPTGROUP_NONE, /* optinfo_flags */ 38 TV_TREE_VECT_MASK_VECMATH_FUNC, /* tv_id */ 39 (PROP_cfg | PROP_ssa), /* properties_required */ 40 0, /* properties_provided */ 41 0, /* properties_destroyed */ 42 0, /* todo_flags_start */ 43 0, /* todo_flags_finish */ 44 }; 45 46 class pass_mask_vecmath_func : public gimple_opt_pass 47 { 48 public: 49 pass_mask_vecmath_func (gcc::context *ctxt) : gimple_opt_pass (pass_data_test, ctxt) {} 50 virtual bool 51 gate (function *fun) 52 { 53 // printf ("gate function noipa.\n"); 54 return flag_tree_mask_vecmath_func; 55 } 56 57 virtual unsigned int execute (function *); 58 }; 59 60 61 static void add_mask_to_call(gimple *stmt, tree new_arg, const char *func_name) { 62 if (!is_gimple_call(stmt)) { 63 // 如果不是函数调用语句,则不做任何操作 64 return; 65 } 66 67 // 获取原始函数调用的目标和参数列表 68 tree call_fn = gimple_call_fndecl(stmt); 69 70 // 获取或创建新的标识符节点来表示新的函数名称 71 tree new_func_id; 72 if(strcmp(func_name, "vmldCos2") == 0) 73 new_func_id = get_identifier("__svml_cos2_mask_e9"); 74 else if (strcmp(func_name, "vmldExp2") == 0) 75 new_func_id = 
get_identifier("__svml_exp2_mask_e9"); 76 else if (strcmp(func_name, "vmldSin2") == 0) 77 new_func_id = get_identifier("__svml_sin2_mask_e9"); 78 else if (strcmp(func_name, "sin.simdclone.2") == 0) 79 new_func_id = get_identifier("__svml_sin4_mask_e9"); 80 else if (strcmp(func_name, "cos.simdclone.2") == 0) 81 new_func_id = get_identifier("__svml_cos4_mask_e9"); 82 else if (strcmp(func_name, "exp.simdclone.2") == 0) 83 new_func_id = get_identifier("__svml_exp4_mask_e9"); 84 85 tree fntype = TREE_TYPE(call_fn); 87 tree new_fndecl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, new_func_id, fntype); 88 89 TREE_PUBLIC (new_fndecl) = 1; 90 DECL_EXTERNAL (new_fndecl) = 1; 91 DECL_IS_NOVOPS (new_fndecl) = 1; 92 TREE_READONLY (new_fndecl) = 1; 93 94 95 // 将新的标识符节点分配给函数声明的汇编名 96 // DECL_ASSEMBLER_NAME(call_fn) = new_func_id; 97 98 int num_args = gimple_call_num_args(stmt); 99 vec<tree> vargs = vNULL; 100 vargs.create (num_args+1); 101 102 // 创建一个新的参数列表,包含原始的参数和新的参数 103 for (int i = 0; i < num_args; i++) { 104 tree arg = gimple_call_arg(stmt, i); 105 vargs.safe_push(arg); 106 } 107 vargs.safe_push(new_arg); 108 109 tree lhs = gimple_call_lhs(stmt); 110 111 // 创建新的函数调用语句,包含新的参数 112 gimple *new_call = gimple_build_call_vec(new_fndecl,vargs); 113 gimple_call_set_lhs (new_call, lhs); 114 115 // 替换原始的函数调用语句 116 gimple_stmt_iterator gsi = gsi_for_stmt (stmt); 117 118 // printf ("-------------finish add mask to vecmath func call------------.\n"); 119 120 gsi_replace(&gsi, new_call,true); 121 stmt = new_call; 122 123 // 释放参数列表的内存 124 vargs.release (); 125 } 126 127 static void find_relate_operand(tree operand, gimple *stmt, tree mask) 128 { 129 if (!stmt) 130 return ; 131 132 if (TREE_CODE (operand) == SSA_NAME && is_gimple_call(stmt)) { // operand is ssa && stmt is gimple call 133 tree fndecl = gimple_call_fndecl(stmt); // 获取函数声明 134 if (fndecl && DECL_P(fndecl)) { // 确保fndecl有效并且是一个声明 135 const char *func_name = IDENTIFIER_POINTER(DECL_NAME(fndecl)); // 获取函数名称 136 // if 
(strcmp(func_name, "vmldLn2") == 0) { 137 if (strcmp(func_name, "vmldCos2") == 0 || 138 strcmp(func_name, "vmldExp2") == 0 || 139 strcmp(func_name, "vmldSin2") == 0 || 140 strcmp(func_name, "exp.simdclone.2") == 0 || 141 strcmp(func_name, "cos.simdclone.2") == 0 || 142 strcmp(func_name, "sin.simdclone.2") == 0) { 143 // printf ("-------------find math func------------.\n"); 144 add_mask_to_call(stmt,mask,func_name); 145 return ; 146 } 147 } 148 } 149 if (TREE_CODE (operand) == SSA_NAME && is_gimple_assign(stmt)) { // only find gimple assign 150 151 for (unsigned i = 1; i < gimple_num_ops(stmt); ++i) { // get gimple assign right hand side operand 152 tree op = gimple_op(stmt, i); 153 if(TREE_CODE (op) == SSA_NAME) { 154 155 gimple *stmt_2 = SSA_NAME_DEF_STMT (op); 156 find_relate_operand(op,stmt_2,mask); 157 // if(result) return result; 158 } 159 } 160 } 161 return ; 162 } 163 164 165 unsigned 166 pass_mask_vecmath_func::execute (function *fun) 167 { 168 unsigned ret = 0; 169 170 basic_block bb; 171 enum tree_code code; 172 FOR_EACH_BB_FN(bb, fun) { 173 gimple_stmt_iterator gsi; 174 175 /* for (int i = 1; i < number_of_loops (fun); i++) 176 { 177 loop_vec_info loop_vinfo; 178 bool has_mask_store; 179 180 class loop *loop = get_loop (fun, i); 181 if (!loop || !loop->aux) 182 continue; 183 loop_vinfo = (loop_vec_info) loop->aux; 184 has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo); 185 delete loop_vinfo; 186 if (has_mask_store) { 187 188 printf ("-------------have mask store------------.\n"); 189 190 basic_block *bbs = get_loop_body (loop); 191 unsigned nbbs = loop->num_nodes; 192 unsigned i; 193 basic_block bb; 194 class loop *bb_loop; 195 gimple_stmt_iterator gsi; 196 gimple *stmt; 197 198 for (i = 0; i < nbbs; i++) 199 { 200 bb = bbs[i];*/ 201 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 202 gsi_next (&gsi)) 203 { 204 gimple *stmt = gsi_stmt (gsi); 205 if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) { 206 // printf ("------------ find mask 
store------------.\n"); 207 basic_block bb1 = gimple_bb(stmt); 208 tree mask = gimple_call_arg (stmt, 2); 209 tree value = gimple_call_arg (stmt, 3); 210 if(TREE_CODE (value) == SSA_NAME) { 211 gimple *value_def = SSA_NAME_DEF_STMT (value); 212 basic_block bb2 = gimple_bb(value_def); 213 // printf ("-------------begin find relate operand------------.\n"); 214 if(bb1 == bb2) // mask store and value def in same bb 215 find_relate_operand(value,value_def,mask); 216 } 217 } 218 } 219 220 // free (bbs); 221 } 222 // } 223 // } 224 225 return ret; 226 227 } 228 } 229 230 gimple_opt_pass * 231 make_pass_mask_vecmath_func (gcc::context *ctxt) 232 { 233 return new pass_mask_vecmath_func (ctxt); 234 } |
10092 10093 10094 void 10095 optimize_mask_vec_cond (class loop *loop) 10096 { 10097 basic_block *bbs = get_loop_body (loop); 10098 unsigned nbbs = loop->num_nodes; 10099 unsigned i; 10100 basic_block bb, bb_mask; 10101 class loop *bb_loop; 10102 gimple_stmt_iterator gsi; 10103 gimple *stmt; 10104 auto_vec<gimple *> worklist; 10105 auto_purge_vect_location sentinel; 10106 10107 enum tree_code code; 10108 10109 vect_location = find_loop_location (loop); 10110 /* Pick up all vec_cond_expr in loop if any. */ 10111 for (i = 0; i < nbbs; i++) 10112 { 10113 bb = bbs[i]; 10114 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 10115 gsi_next (&gsi)) 10116 { 10117 stmt = gsi_stmt (gsi); 10118 if (is_gimple_assign(stmt)) { 10119 gassign *stmt_assign = dyn_cast <gassign *> (gsi_stmt (gsi)); 10120 code = gimple_assign_rhs_code (stmt_assign); 10121 // 检查语句是否为 VEC_COND_EXPR 10122 if (code == VEC_COND_EXPR) { 10123 worklist.safe_push (stmt); 10124 } 10125 } 10126 } 10127 } 10128 10129 free (bbs); 10130 if (worklist.is_empty () || worklist.length()==1) 10131 return; 10132 10133 /* Loop has vec_cond_expr. 
*/ 10134 while (!worklist.is_empty ()) 10135 { 10136 gimple *last, *last_store, *last1; 10137 edge e, efalse; 10138 tree mask,mask2; 10139 basic_block store_bb, join_bb; 10140 gimple_stmt_iterator gsi_to; 10141 gimple_stmt_iterator gsi_stmt_def,gsi_mask_def; 10142 tree vdef, new_vdef; 10143 gphi *phi; 10144 tree vectype; 10145 tree zero_vector; 10146 10147 last = worklist.pop (); 10148 gassign *stmt_assign = dyn_cast <gassign *> (last); 10149 mask = gimple_assign_rhs1(stmt_assign); 10150 tree true_vector_operand = gimple_assign_rhs2(stmt_assign); 10151 10152 gimple *mask_def = SSA_NAME_DEF_STMT (mask); 10153 10154 gsi_mask_def = gsi_for_stmt(mask_def); 10155 gsi_prev(&gsi_mask_def); 10156 gimple *mask2_def = gsi_stmt(gsi_mask_def); 10157 gassign *stmt_mask2 = dyn_cast <gassign *> (mask2_def); 10158 mask2 = gimple_assign_lhs(stmt_mask2); 10159 10160 10161 gimple *stmt_def = SSA_NAME_DEF_STMT (true_vector_operand); 10162 10163 bb = gimple_bb (stmt_def); 10164 10165 /* Create then_bb and if-then structure in CFG, then_bb belongs to 10166 the same loop as if_bb. It could be different to LOOP when two 10167 level loop-nest is vectorized and mask_store belongs to the inner 10168 one. */ 10169 10170 gsi_stmt_def = gsi_for_stmt (stmt_def); 10171 gsi_next(&gsi_stmt_def); 10172 10173 stmt_def = gsi_stmt(gsi_stmt_def); 10174 10175 e = split_block (bb, stmt_def); 10176 bb_loop = bb->loop_father; 10177 // gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop)); 10178 join_bb = e->dest; 10179 store_bb = create_empty_bb (bb); 10180 add_bb_to_loop (store_bb, bb_loop); 10181 e->flags = EDGE_TRUE_VALUE; 10182 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE); 10183 /* Put STORE_BB to likely part. 
*/ 10184 efalse->probability = profile_probability::unlikely (); 10185 store_bb->count = efalse->count (); 10186 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU); 10187 if (dom_info_available_p (CDI_DOMINATORS)) 10188 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb); 10189 if (dump_enabled_p ()) 10190 dump_printf_loc (MSG_NOTE, vect_location, 10191 "Create new block %d to sink vect cond expr", 10192 store_bb->index); 10193 /* Create vector comparison with boolean result. */ 10194 vectype = TREE_TYPE (mask); 10195 zero_vector = build_zero_cst (vectype); 10196 10197 tree combined_mask = create_tmp_var(TREE_TYPE(zero_vector), "combined_mask"); 10198 10199 gimple *combine_stmt1 = gimple_build_assign(combined_mask, BIT_IOR_EXPR, mask, mask2); 10200 10201 gsi = gsi_for_stmt (mask_def); 10202 gsi_next(&gsi); 10203 gsi_insert_after (&gsi, combine_stmt1, GSI_SAME_STMT); 10204 10205 /* vec<constructor_elt, va_gc> *ret_ctor_elts_tmp = NULL; 10206 vec_alloc (ret_ctor_elts_tmp, 2); 10207 CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask2); // 添加第二个左子树 10208 CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask); // 添加第一个左子树 10209 10210 // tree signed_boolean_type = build_nonstandard_integer_type(64, 1); 10211 tree signed_boolean_type = build_nonstandard_boolean_type(64); 10212 10213 tree vect_type = build_vector_type(signed_boolean_type, 4); 10214 tree constructor = build_constructor(vect_type, ret_ctor_elts_tmp); 10215 10216 tree new_var_constru = create_tmp_var(vect_type, "mask_array"); 10217 gimple *new_stmt_construc = gimple_build_assign(make_ssa_name(new_var_constru), constructor); 10218 gsi_next(&gsi); 10219 gsi_insert_after (&gsi, new_stmt_construc, GSI_SAME_STMT);*/ 10220 10221 gimple *gcond = gimple_build_cond(EQ_EXPR, combined_mask, zero_vector, NULL, NULL); 10222 gsi_next(&gsi); 10223 gsi_insert_after(&gsi, gcond, GSI_NEW_STMT); 10224 10225 10226 /* Put all masked stores with the same mask to STORE_BB if possible. 
*/ 10227 // while (true) 10228 // { 10229 gimple_stmt_iterator gsi_from; 10230 gimple *stmt1 = NULL; 10231 10232 /* Move vec_cond second var def to STORE_BB. */ 10233 last_store = stmt_def; 10234 gsi = gsi_for_stmt (stmt_def); 10235 gsi_from = gsi; 10236 /* Shift GSI to the previous stmt for further traversal. */ 10237 gsi_prev (&gsi); 10238 gsi_to = gsi_start_bb (store_bb); 10239 gsi_move_before (&gsi_from, &gsi_to); 10240 /* Setup GSI_TO to the non-empty block start. */ 10241 gsi_to = gsi_start_bb (store_bb); 10242 if (dump_enabled_p ()) 10243 dump_printf_loc (MSG_NOTE, vect_location, 10244 "Move stmt to created bb\n%G", last); 10245 /* Move all stored value producers if possible. */ 10246 while (!gsi_end_p (gsi)) 10247 { 10248 tree lhs; 10249 imm_use_iterator imm_iter; 10250 use_operand_p use_p; 10251 bool res; 10252 10253 /* Skip debug statements. */ 10254 if (is_gimple_debug (gsi_stmt (gsi))) 10255 { 10256 gsi_prev (&gsi); 10257 continue; 10258 } 10259 stmt1 = gsi_stmt (gsi); 10260 /* Do not consider statements writing to memory or having 10261 volatile operand. */ 10262 if (gimple_vdef (stmt1) 10263 || gimple_has_volatile_ops (stmt1)) 10264 break; 10265 gsi_from = gsi; 10266 gsi_prev (&gsi); 10267 lhs = gimple_get_lhs (stmt1); 10268 if (!lhs) 10269 break; 10270 10271 /* LHS of vectorized stmt must be SSA_NAME. */ 10272 if (TREE_CODE (lhs) != SSA_NAME) 10273 break; 10274 10275 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 10276 { 10277 /* Remove dead scalar statement. */ 10278 if (has_zero_uses (lhs)) 10279 { 10280 gsi_remove (&gsi_from, true); 10281 continue; 10282 } 10283 } 10284 10285 /* Check that LHS does not have uses outside of STORE_BB. 
*/ 10286 res = true; 10287 // FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 10288 gimple *use_lhs; 10289 FOR_EACH_IMM_USE_STMT (use_lhs, imm_iter, lhs) 10290 { 10291 gimple *use_stmt; 10292 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) { 10293 10294 // gimple *use_stmt; 10295 use_stmt = USE_STMT (use_p); 10296 if (is_gimple_debug (use_stmt)) 10297 continue; 10298 if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last)) 10299 { 10300 // res = false; 10301 10302 if (dump_enabled_p ()) 10303 dump_printf_loc (MSG_NOTE, vect_location, 10304 "LHS have use outside of store_BB\n%G", stmt1); 10305 tree new_lhs,new_lhs1,new_lhs2; 10306 tree new_lhs_phi; 10307 gphi *phi; 10308 tree vectype; 10309 tree zero; 10310 gimple *zero_def; 10311 10312 gimple *new_assign_stmt; 10313 10314 if (is_gimple_assign(stmt1) && is_gimple_assign(use_lhs)) { 10315 for (unsigned int i = 1; i < gimple_num_ops(use_stmt); i++) { 10316 tree rhs = gimple_op(use_stmt, i); 10317 if(TREE_CODE (rhs) == SSA_NAME && (rhs == lhs)) { 10318 10319 if (dump_enabled_p ()) 10320 dump_printf_loc (MSG_NOTE, vect_location, 10321 "insert new stmt to use out of BB\n"); 10322 new_lhs = create_tmp_var(TREE_TYPE(lhs), "new_tmp_var"); 10323 new_lhs1 = make_ssa_name(new_lhs,NULL); 10324 tree rhs1 = gimple_assign_rhs1(stmt1); 10325 tree rhs2 = gimple_assign_rhs2(stmt1); 10326 new_assign_stmt = gimple_build_assign(new_lhs1, gimple_assign_rhs_code(stmt1), rhs1, rhs2); 10327 10328 gimple_stmt_iterator gsi_temp = gsi_for_stmt(use_stmt); 10329 gsi_insert_before (&gsi_temp,new_assign_stmt,GSI_SAME_STMT); 10330 update_stmt(new_assign_stmt); 10331 10332 if( i == 1) { 10333 10334 gimple_assign_set_rhs1(use_stmt, new_lhs1); 10335 // update_stmt(use_stmt); 10336 } 10337 else if (i == 2) { 10338 gimple_assign_set_rhs2(use_stmt, new_lhs1); 10339 // update_stmt(use_stmt); 10340 } 10341 10342 // update_stmt(use_stmt); 10343 } 10344 } 10345 } 10346 } 10347 } 10348 10349 update_stmt(use_stmt); 10350 } 10351 10352 /* Can 
move STMT1 to STORE_BB. */ 10353 /* if (dump_enabled_p ()) 10354 dump_printf_loc (MSG_NOTE, vect_location, 10355 "Move stmt to created bb\n%G", stmt1);*/ 10356 gsi_move_before (&gsi_from, &gsi_to); 10357 /* Shift GSI_TO for further insertion. */ 10358 gsi_prev (&gsi_to); 10359 } 10360 if (!worklist.is_empty ()) 10361 last = worklist.pop (); 10362 } 10363 10364 } |
对 if continue的分块
10161 /* if(worklist.length()== 1) { 10162 if (dump_enabled_p ()) 10163 dump_printf_loc (MSG_NOTE, vect_location, 10164 " if-continue split bb\n"); 10165 tree mask_tmp2 = gimple_assign_rhs2(stmt_mask2); 10166 tree mask_tmp1 = gimple_assign_rhs2(stmt_mask1); 10167 10168 gimple *mask_temp2_def = SSA_NAME_DEF_STMT (mask_tmp2); 10169 gimple *mask_temp1_def = SSA_NAME_DEF_STMT (mask_tmp1); 10170 10171 gassign *stmt_mask_tmp2 = dyn_cast <gassign *> (mask_temp2_def); 10172 gassign *stmt_mask_tmp1 = dyn_cast <gassign *> (mask_temp1_def); 10173 10174 tree temp2_rhs1 = gimple_assign_rhs1(stmt_mask_tmp2); 10175 tree temp1_rhs1 = gimple_assign_rhs1(stmt_mask_tmp1); 10176 10177 tree target_mask3 = gimple_assign_lhs(stmt_mask_tmp2); 10178 tree target_mask4 = gimple_assign_lhs(stmt_mask_tmp1); 10179 10180 tree temp2_rhs2 = gimple_assign_rhs2(stmt_mask_tmp2); 10181 tree temp1_rhs2 = gimple_assign_rhs2(stmt_mask_tmp1); 10182 10183 gimple *target_stmt1 = SSA_NAME_DEF_STMT (temp2_rhs1); 10184 gimple *target_stmt2 = SSA_NAME_DEF_STMT (temp1_rhs1); 10185 10186 gassign *stmt_target_stmt1 = dyn_cast <gassign *> (target_stmt1); 10187 gassign *stmt_target_stmt2 = dyn_cast <gassign *> (target_stmt2); 10188 10189 tree target_mask1 = gimple_assign_lhs(stmt_target_stmt1); 10190 tree target_mask2 = gimple_assign_lhs(stmt_target_stmt2); 10191 10192 10193 gimple *target_stmt3 = SSA_NAME_DEF_STMT (temp2_rhs2); 10194 gimple *target_stmt4 = SSA_NAME_DEF_STMT (temp1_rhs2); 10195 10196 basic_block bb_tmp = gimple_bb (target_stmt1); 10197 basic_block bb_tmp_next = gimple_bb (target_stmt4); 10198 edge e_tmp; 10199 gimple_stmt_iterator target_stmt4_gsi = gsi_for_stmt(mask_temp1_def); 10200 gsi_next(&target_stmt4_gsi); 10201 gimple *target_stmt4_next = gsi_stmt(target_stmt4_gsi); 10202 10203 gimple_stmt_iterator target_stmt2_gsi = gsi_for_stmt(target_stmt2); 10204 gsi_next(&target_stmt2_gsi); 10205 gimple *target_stmt2_next = gsi_stmt(target_stmt2_gsi); 10206 10207 e_tmp = split_block (bb_tmp, 
target_stmt4_next); 10208 class loop *bb_loop_tmp = bb_tmp->loop_father; 10209 gcc_assert (loop == bb_loop_tmp || flow_loop_nested_p (loop, bb_loop_tmp)); 10210 10211 basic_block bb_last_tmp = gimple_bb(last); 10212 basic_block join_bb_tmp; 10213 gimple *last_stmt_tmp = last_stmt(bb_last_tmp); 10214 if (last_stmt_tmp && gimple_code(last_stmt_tmp) == GIMPLE_COND) { 10215 10216 edge e_tmp2; 10217 edge_iterator ei_tmp2; 10218 basic_block true_bb; 10219 10220 FOR_EACH_EDGE(e_tmp2, ei_tmp2, bb_last_tmp->succs) { 10221 // 检查是否为 true 分支 10222 if (e_tmp2->flags & EDGE_TRUE_VALUE) { 10223 true_bb = e_tmp2->dest; 10224 } 10225 } 10226 join_bb_tmp = e_tmp->dest; 10227 basic_block store_bb_tmp = create_empty_bb (bb_tmp); 10228 add_bb_to_loop (store_bb_tmp, bb_loop_tmp); 10229 // e_tmp->flags = EDGE_TRUE_VALUE; 10230 10231 edge efalse_tmp_true = make_edge (bb_tmp, bb_last_tmp, EDGE_TRUE_VALUE); 10232 /* Put STORE_BB to likely part. */ 10233 /* efalse_tmp_true->probability = profile_probability::likely (); 10234 store_bb_tmp->count = efalse_tmp_true->count (); 10235 10236 edge efalse_tmp = make_edge (bb_tmp, store_bb_tmp, EDGE_FALSE_VALUE); 10237 /* Put STORE_BB to likely part. 
*/ 10238 /* efalse_tmp->probability = profile_probability::unlikely (); 10239 store_bb_tmp->count = efalse_tmp->count (); 10240 // make_single_succ_edge (store_bb_tmp, join_bb_tmp, EDGE_FALLTHRU); 10241 10242 edge efalse_tmp_next = make_edge (store_bb_tmp, join_bb_tmp, EDGE_FALSE_VALUE); 10243 efalse_tmp_next->probability = profile_probability::unlikely (); 10244 // store_bb_tmp->count = efalse_tmp_true->count (); 10245 10246 edge etrue_tmp_next = make_edge (store_bb_tmp, bb_last_tmp, EDGE_TRUE_VALUE); 10247 etrue_tmp_next->probability = profile_probability::likely (); 10248 store_bb_tmp->count = efalse_tmp_true->count (); 10249 // true_bb = e_tmp->dest; 10250 10251 // e_tmp->dest = NULL; 10252 // e_tmp->flags = EDGE_TRUE_VALUE; 10253 10254 edge e_dele = find_edge(bb_tmp, join_bb_tmp); 10255 if (e_dele) { 10256 remove_edge(e_dele); // 删除这条边 10257 } 10258 10259 // true_bb->preds = chainon(true_bb->preds, e_tmp); 10260 add_to_dominance_info(CDI_DOMINATORS,join_bb_tmp); 10261 10262 if (dom_info_available_p (CDI_DOMINATORS)) { 10263 set_immediate_dominator (CDI_DOMINATORS, store_bb_tmp, bb_tmp); 10264 set_immediate_dominator (CDI_DOMINATORS, join_bb_tmp, store_bb_tmp); 10265 set_immediate_dominator (CDI_DOMINATORS, bb_last_tmp, bb_tmp); 10266 // free_dominance_info(CDI_DOMINATORS); 10267 calculate_dominance_info(CDI_DOMINATORS); 10268 } 10269 10270 // free_dominance_info(CDI_DOMINATORS); 10271 // calculate_dominance_info(CDI_DOMINATORS); 10272 10273 tree vectype_tmp = TREE_TYPE (mask_tmp1); 10274 tree zero_vector_tmp = build_zero_cst (vectype_tmp); 10275 10276 tree combined_mask_tmp = create_tmp_var(TREE_TYPE(zero_vector_tmp), "combined_mask_ifconti"); 10277 10278 tree combined_mask_tmp2 = create_tmp_var(TREE_TYPE(zero_vector_tmp), "combined_mask_ifconti2"); 10279 10280 gimple *combine_stmt1_tmp = gimple_build_assign(combined_mask_tmp, BIT_IOR_EXPR, target_mask1, target_mask2); 10281 10282 gimple *combine_stmt1_tmp2 = gimple_build_assign(combined_mask_tmp2, 
BIT_IOR_EXPR, target_mask3, target_mask4); 10283 10284 gimple_stmt_iterator gsi_tmp = gsi_for_stmt (target_stmt2); 10285 gsi_next(&gsi_tmp); 10286 gsi_insert_after (&gsi_tmp, combine_stmt1_tmp, GSI_SAME_STMT); 10287 10288 gimple_stmt_iterator gsi_tmp_next_if = gsi_last_bb (store_bb_tmp); 10289 // gsi_prev(&gsi_tmp_next_if); 10290 gsi_insert_before (&gsi_tmp_next_if, combine_stmt1_tmp2, GSI_SAME_STMT); 10291 10292 gimple *gcond_tmp = gimple_build_cond(EQ_EXPR, combined_mask_tmp, zero_vector_tmp, NULL, NULL); 10293 gsi_next(&gsi_tmp); 10294 gsi_insert_after(&gsi_tmp, gcond_tmp, GSI_NEW_STMT); 10295 10296 gimple *gcond_tmp_next = gimple_build_cond(EQ_EXPR, combined_mask_tmp2, zero_vector_tmp, NULL, NULL); 10297 // gsi_next(&gsi_tmp_next_if); 10298 gsi_insert_before(&gsi_tmp_next_if, gcond_tmp_next, GSI_NEW_STMT); 10299 10300 // calculate_dominance_info(CDI_DOMINATORS); 10301 10302 gimple_stmt_iterator gsi_from_tmp; 10303 gimple *stmt1 = NULL; 10304 10305 /* Move vec_cond second var def to STORE_BB. */ 10306 /* gimple *last_store = target_stmt4_next; 10307 gimple_stmt_iterator gsi_tmp4 = gsi_for_stmt (target_stmt4_next); 10308 gsi_from_tmp = gsi_tmp4; 10309 /* Shift GSI to the previous stmt for further traversal. */ 10310 /* gsi_prev (&gsi_tmp4); 10311 gimple_stmt_iterator gsi_to_tmp = gsi_start_bb (store_bb_tmp); 10312 gsi_move_before (&gsi_from_tmp, &gsi_to_tmp); 10313 /* Setup GSI_TO to the non-empty block start. */ 10314 /* gsi_to_tmp = gsi_start_bb (store_bb_tmp); 10315 if (dump_enabled_p ()) 10316 dump_printf_loc (MSG_NOTE, vect_location, 10317 "Move if-continue stmt to created bb\n%G", last); 10318 /* Move all stored value producers if possible. */ 10319 /* while (!gsi_end_p (gsi_tmp4)) { 10320 10321 tree lhs; 10322 imm_use_iterator imm_iter; 10323 use_operand_p use_p; 10324 bool res; 10325 10326 /* Skip debug statements. 
*/ 10327 /* if (is_gimple_debug (gsi_stmt (gsi_tmp4))) 10328 { 10329 gsi_prev (&gsi_tmp4); 10330 continue; 10331 } 10332 stmt1 = gsi_stmt (gsi_tmp4); 10333 /* Do not consider statements writing to memory or having 10334 volatile operand. */ 10335 /* if (gimple_vdef (stmt1) || gimple_has_volatile_ops (stmt1)) 10336 break; 10337 gsi_from_tmp = gsi_tmp4; 10338 gsi_prev (&gsi_tmp4); 10339 lhs = gimple_get_lhs (stmt1); 10340 if (!lhs) 10341 break; 10342 10343 /* LHS of vectorized stmt must be SSA_NAME. */ 10344 /* if (TREE_CODE (lhs) != SSA_NAME) 10345 break; 10346 10347 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 10348 { 10349 /* Remove dead scalar statement. */ 10350 /* if (has_zero_uses (lhs)) 10351 { 10352 gsi_remove (&gsi_from_tmp, true); 10353 continue; 10354 } 10355 } 10356 10357 gsi_move_before (&gsi_from_tmp, &gsi_to_tmp); 10358 /* Shift GSI_TO for further insertion. */ 10359 /* gsi_prev (&gsi_to_tmp); 10360 } 10361 } 10362 }*/ |
当 VF 为 4 的时候,进行 mask 的合并,并将合并后的 mask 加入到数学函数调用里面。
mask合并代码
10410 vec<constructor_elt, va_gc> *ret_ctor_elts_tmp = NULL; 10411 vec_alloc (ret_ctor_elts_tmp, 2); 10412 CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask2); // 添加第二个左子树 10413 CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask); // 添加第一个左子树 10414 10415 // tree signed_boolean_type = build_nonstandard_integer_type(64, 1); 10416 tree signed_boolean_type = build_nonstandard_boolean_type(64); 10417 10418 tree vect_type = build_vector_type(signed_boolean_type, 4); 10419 tree constructor = build_constructor(vect_type, ret_ctor_elts_tmp); 10420 10421 tree new_var_constru = create_tmp_var(vect_type, "mask_array"); 10422 gimple *new_stmt_construc = gimple_build_assign(make_ssa_name(new_var_constru), constructor); 10423 gsi_next(&gsi); 10424 gsi_insert_after (&gsi, new_stmt_construc, GSI_SAME_STMT); |
将合并后的mask加入到数学函数里面
195 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mask_operand) 196 { 197 gimple *use_stmt; 198 use_stmt = USE_STMT (use_p); 199 if(is_gimple_assign(use_stmt)) { 200 tree rhs1_tmp1 = gimple_assign_rhs1(use_stmt); 201 if (TREE_CODE(rhs1_tmp1) == CONSTRUCTOR) { 202 tree lhs_tmp1 = gimple_assign_lhs(use_stmt); 203 if(stmt_vecmath) 204 add_mask_to_call(stmt_vecmath,lhs_tmp1); 205 } 206 } 207 } |
oneapi的cfg图
在移动的过程中,如果 store bb 中的 LHS 在除 store bb 外的其他 bb 中被使用,则需要重新计算。
10490 /* Check that LHS does not have uses outside of STORE_BB. */ 10491 res = true; 10492 // FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 10493 gimple *use_lhs; 10494 FOR_EACH_IMM_USE_STMT (use_lhs, imm_iter, lhs) 10495 { 10496 gimple *use_stmt; 10497 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) { 10498 10499 // gimple *use_stmt; 10500 use_stmt = USE_STMT (use_p); 10501 if (is_gimple_debug (use_stmt)) 10502 continue; 10503 if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last)) 10504 { 10505 // res = false; 10506 10507 if (dump_enabled_p ()) 10508 dump_printf_loc (MSG_NOTE, vect_location, 10509 "LHS have use outside of store_BB\n%G", stmt1); 10510 tree new_lhs,new_lhs1,new_lhs2; 10511 tree new_lhs_phi; 10512 gphi *phi; 10513 tree vectype; 10514 tree zero; 10515 gimple *zero_def; 10516 10517 gimple *new_assign_stmt; 10518 10519 if (is_gimple_assign(stmt1) && is_gimple_assign(use_lhs)) { 10520 for (unsigned int i = 1; i < gimple_num_ops(use_stmt); i++) { 10521 tree rhs = gimple_op(use_stmt, i); 10522 if(TREE_CODE (rhs) == SSA_NAME && (rhs == lhs)) { 10523 10524 if (dump_enabled_p ()) 10525 dump_printf_loc (MSG_NOTE, vect_location, 10526 "insert new stmt to use out of BB\n"); 10527 new_lhs = create_tmp_var(TREE_TYPE(lhs), "new_tmp_var"); 10528 new_lhs1 = make_ssa_name(new_lhs,NULL); 10529 tree rhs1 = gimple_assign_rhs1(stmt1); 10530 tree rhs2 = gimple_assign_rhs2(stmt1); 10531 new_assign_stmt = gimple_build_assign(new_lhs1, gimple_assign_rhs_code(stmt1), rhs1, rhs2); 10532 10533 gimple_stmt_iterator gsi_temp = gsi_for_stmt(use_stmt); 10534 gsi_insert_before (&gsi_temp,new_assign_stmt,GSI_SAME_STMT); 10535 update_stmt(new_assign_stmt); 10536 10537 if( i == 1) { 10538 10539 gimple_assign_set_rhs1(use_stmt, new_lhs1); 10540 // update_stmt(use_stmt); 10541 } 10542 else if (i == 2) { 10543 gimple_assign_set_rhs2(use_stmt, new_lhs1); 10544 // update_stmt(use_stmt); 10545 } 10546 10547 // update_stmt(use_stmt); 10548 } 10549 } 10550 } 10551 } 10552 } 
10553 10554 update_stmt(use_stmt); 10555 } */ |
消除同一个reduction 在loop 中使用多次
# temp_value.920_2824 = PHI <tmp_var.921_2823(234), 0.0(279)> 48420 # temp_value.923_2821 = PHI <tmp_var.924_2820(234), 0.0(279)> 48421 # temp_value.926_2814 = PHI <tmp_var.927_2813(234), 0.0(279)> 48422 # temp_value.929_2807 = PHI <tmp_var.930_2806(234), 0.0(279)> 48423 # temp_value.932_2800 = PHI <tmp_var.933_2798(234), 0.0(279)> _ifc__2843 = _3089 ? _2132 : 0.0; 48574 tmp_var.927_2813 = _ifc__2843 + temp_value.926_2814; 48575 _ifc__2842 = _3084 ? _2145 : 0.0; 48576 tmp_var.930_2806 = _ifc__2842 + temp_value.929_2807; 48577 _ifc__2841 = _3192 ? _2085 : 0.0; 48578 tmp_var.921_2823 = _ifc__2841 + temp_value.920_2824; 48579 _ifc__2840 = _3172 ? _2101 : 0.0; 48580 tmp_var.933_2798 = _ifc__2840 + temp_value.932_2800; 48581 _ifc__2839 = _3161 ? _2113 : 0.0; 48582 tmp_var.924_2820 = _ifc__2839 + temp_value.923_2821; # tmp_sumi.922_2822 = PHI <tmp_var.921_2823(83), 0.0(81), 0.0(276)> 48880 # tmp_sumi.925_2816 = PHI <tmp_var.924_2820(83), 0.0(81), 0.0(276)> 48881 # tmp_sumi.928_2809 = PHI <tmp_var.927_2813(83), 0.0(81), 0.0(276)> 48882 # tmp_sumi.931_2805 = PHI <tmp_var.930_2806(83), 0.0(81), 0.0(276)> 48883 # tmp_sumi.934_2793 = PHI <tmp_var.933_2798(83), 0.0(81), 0.0(276)> _2752 = tmp_sumi.922_2822 + tmp_sumi.925_2816; 48885 _2750 = _2752 + tmp_sumi.928_2809; 48886 _2747 = _2750 + tmp_sumi.931_2805; 48887 _2746 = _2747 + tmp_sumi.934_2793; _2156 = ri1i_2025 + _2746; 48931 _2163 = _2160 * _2746; |
1761 for (k = 0; k < lpears[i] + upears[i]; k++) { 1762 1763 if (pearlist[i] == NULL) { 1764 fprintf(nabout, 1765 "NULL pair list entry in egb loop 1, taskid = %d\n", 1766 mytaskid); 1767 fflush(nabout); 1768 } 1769 j = pearlist[i][k]; 1770 1771 xij = xi - x[dim * j]; 1772 yij = yi - x[dim * j + 1]; 1773 zij = zi - x[dim * j + 2]; 1774 r2 = xij * xij + yij * yij + zij * zij; 1775 1776 if (dim == 4) { // delete 1777 wij = wi - x[dim * j + 3]; 1778 r2 += wij * wij; 1779 } 1780 1781 if (r2 > rgbmaxpsmax2) // %hir.cmp.4310 ule 1782 continue; 1783 dij1i = 1.0 / sqrt(r2); 1784 dij = r2 * dij1i; 1785 sj = fs[j] * (rborn[j] - BOFFSET); // select fast 1786 sj2 = sj * sj; 1787 1788 /* 1789 * ---following are from the Appendix of Schaefer and Froemmel, 1790 * JMB 216:1045-1066, 1990; Taylor series expansion for d>>s 1791 * is by Andreas Svrcek-Seiler; smooth rgbmax idea is from 1792 * Andreas Svrcek-Seiler and Alexey Onufriev. 1793 */ 1794 1795 if (dij > rgbmax + sj) // rgbmax = 20; %hir.cmp.4333 ule 1796 continue; 1797 1798 if ((dij > rgbmax - sj)) { // %hir.cmp.4349 ogt 1799 uij = 1. / (dij - sj); 1800 sumi -= 0.125 * dij1i * (1.0 + 2.0 * dij * uij + 1801 rgbmax2i * (r2 - 1802 4.0 * rgbmax * 1803 dij - sj2) + 1804 2.0 * log((dij - sj) * rgbmax1i)); 1805 1806 } else if (dij > 4.0 * sj) { 1807 dij2i = dij1i * dij1i; 1808 tmpsd = sj2 * dij2i; 1809 dumbo = 1810 TA + tmpsd * (TB + 1811 tmpsd * (TC + 1812 tmpsd * (TD + tmpsd * TDD))); 1813 sumi -= sj * tmpsd * dij2i * dumbo; 1814 1815 } else if (dij > ri + sj) { 1816 sumi -= 0.5 * (sj / (r2 - sj2) + 1817 0.5 * dij1i * log((dij - sj) / (dij + sj))); 1818 1819 } else if (dij > fabs(ri - sj)) { 1820 theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2); 1821 uij = 1. / (dij + sj); 1822 sumi -= 0.25 * (ri1i * (2. - theta) - uij + 1823 dij1i * log(ri * uij)); 1824 1825 } else if (ri < sj) { 1826 sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i + 1827 0.5 * dij1i * log((sj - dij) / (sj + dij))); 1828 1829 } 1830 1831 } |
1:if 中的 fprintf 分析不出内存关系,无法 ifcvt。(lim pass 无法将其外提,也是因为 fprintf 中的内存关系无法分析。)
解决:将其外提到最内层循环外面。
2 : dim常量传播 (ipa-cp pass)
mme → mme34 → egb
dim 作为全局变量无法常量传播,作为函数参数的时候可以传播到。
解决:新建一个pass,识别全局变量(当其没有作为函数传参时)和函数调用关系,在函数调用的地方将变量替换为常量值。(pass 的位置?是否有参数能解决)
根据 inline pass 的 debug 信息,发现 mme34 无法 inline 进 mme,原因是 --param early-inlining-insns= 的值过小;将此值调大后,可以成功 inline。
inline 过后
;; basic block 2, loop depth 0, count 27580514 (estimated locally), maybe hot 74798 ;; prev block 0, next block 3, flags: (NEW, REACHABLE, VISITED) 74799 ;; pred: ENTRY [always] count:27580514 (estimated locally) (FALLTHRU,EXECUTABLE) 74800 # .MEM_2325 = VDEF <.MEM_2324(D)> 74801 dim.lto_priv.0D.4751 = 3; 74802 # VUSE <.MEM_2325> basic block 96, loop depth 2, count 954868629 (estimated locally), maybe hot 77095 ;; prev block 95, next block 97, flags: (NEW, REACHABLE, VISITED) 77096 ;; pred: 94 [82.6% (guessed)] count:788435027 (estimated locally) (FALSE_VALUE,EXECUTABLE) 77097 ;; 95 [always] count:166433602 (estimated locally) (FALLTHRU,EXECUTABLE) _698 = dim.lto_priv.0D.4751; 77112 _699 = j_697 * _698; if (_698 == 4) 77146 goto <bb 97>; [34.00%] 77147 else 77148 goto <bb 98>; [66.00%] |
怀疑是 mme34 函数中其他部分的代码影响了常量传播的分析;注释掉 mme34 函数中的部分代码后,发现其能够将 dim = 3 作为常量进行传播。
_77 = j_76 * 3; |
但是需要同时注释掉的内容较多,无法准确找到哪部分代码影响了传播,以及这部分代码的特性。
写了一个例子发现其静态全局变量可以成功作为常量计算,怀疑是mme34函数中的其他部分,影响到dim的常量传播。
1 #include<stdio.h> 2 #include<math.h> 3 #include<stdlib.h> 4 5 6 static int threshold = 5; 7 8 static inline int check_value1(int x) { 9 if(threshold < 20) 10 return x*threshold; 11 else return threshold; 12 } 13 14 static inline int check_value2(int x) { 15 if(threshold < 5) 16 return x+threshold; 17 else return threshold; 18 } 19 static inline int check_value3(int x) { 20 threshold = 10; 21 return check_value1(x); 22 } 23 static inline int check_value4(int x) { 24 threshold = 50; 25 return check_value2(x); 26 } 27 28 int use_threshold(int threshold) { 29 30 return 10 + threshold; 31 } 32 int main() 33 { 34 int num = 30; 35 int num2 = 5; 36 int ans3 = use_threshold(threshold); 37 int ans1 = check_value3(num); 38 int ans2 = check_value4(num2); 39 int ans = ans1 + ans2 +ans3; 40 printf("ans is %d\n",ans); 41 return 0; 42 } |
查看ccp pass 中的debug的信息
39040 Visiting statement: 39041 # VUSE <.MEM_2279> 39042 _698 = dim.lto_priv.0D.4751; 39043 which is likely CONSTANT 39044 Lattice value changed to VARYING. Adding SSA edges to worklist. |
在这里进行 gdb 调试。
69046 Substituting values and folding statements 69048 Folding statement: dim = 3; 69049 Not folded |
1761 for (k = 0; k < lpears[i] + upears[i]; k++) { 1762 1763 if (pearlist[i] == NULL) { 1764 fprintf(nabout, 1765 "NULL pair list entry in egb loop 1, taskid = %d\n", 1766 mytaskid); 1767 fflush(nabout); abort(); 1768 } 1769 j = pearlist[i][k]; 1770 |
在 ifcvt pass 中查看,if 并没有被外提,无法 ifcvt。
插入 abort 需要识别的 pattern
14044 <bb 148> [local count: 919275880]: 14045 _2044 = _127 + _2039; 14046 _2045 = *_2044; 14047 if (_2045 == 0B) 14048 goto <bb 149>; [17.43%] 14049 else 14050 goto <bb 150>; [82.57%] 14051 14052 <bb 149> [local count: 160229786]: 14053 _2046 = 0; 14054 _2047 = nabout; 14055 fprintf (_2047, "NULL pair list entry in egb loop 1, taskid = %d\n", _2046); 14056 _2048 = nabout; 14057 fflush (_2048); 14058 14059 <bb 150> [local count: 919275880]: 14060 _2049 = *_2044; 14061 _2051 = (long unsigned int) k_2050; 14062 _2052 = _2051 * 4; 14063 _2053 = _2049 + _2052; 14064 j_2054 = *_2053; |
Eff.c:3282
build_base_HygonGCC_Spec2017_rate_perf-test.cfg-64.0000
build_base_HygonGCC_Spec2017_rate_perf.cfg-64.0001
加上一个参数使 mme34 内联进 mme 中,但是 dim = 3 的常量传播仍无法做到。写了一个静态全局变量的例子,发现其能够传播到;怀疑是函数中的其他代码影响了对常量的分析使其无法传播,因此通过注释源程序中的代码来定位。
加上 if-continue:107
不加:106
Base:99.6