544 eff.c:1761处loop vect 分析
2.6 带有mask的向量数学函数
gcc 支持的svml向量数学函数
32652 GCC currently emits calls to @code{vmldExp2},
32653 @code{vmldLn2}, @code{vmldLog102}, @code{vmldPow2},
32654 @code{vmldTanh2}, @code{vmldTan2}, @code{vmldAtan2}, @code{vmldAtanh2},
32655 @code{vmldCbrt2}, @code{vmldSinh2}, @code{vmldSin2}, @code{vmldAsinh2},
32656 @code{vmldAsin2}, @code{vmldCosh2}, @code{vmldCos2}, @code{vmldAcosh2},
32657 @code{vmldAcos2}, @code{vmlsExp4}, @code{vmlsLn4},
32658 @code{vmlsLog104}, @code{vmlsPow4}, @code{vmlsTanh4}, @code{vmlsTan4},
32659 @code{vmlsAtan4}, @code{vmlsAtanh4}, @code{vmlsCbrt4}, @code{vmlsSinh4},
32660 @code{vmlsSin4}, @code{vmlsAsinh4}, @code{vmlsAsin4}, @code{vmlsCosh4},
32661 @code{vmlsCos4}, @code{vmlsAcosh4} and @code{vmlsAcos4} for corresponding
32662 function type when @option{-mveclibabi=svml} is used |
oneapi的IR:%3970 = call fast cc104 <4 x double> @__svml_log4_mask(<4 x double> %3968, <4 x i64> %3969)
gcc的IR : _799 = _ZGVdN4v_logD.6143 (_800);
<__svml_log4_mask_e9>汇编代码的函数原名。
从如何调用不带mask的svml向量数学函数的流程出发,找出调用带有mask的方法。
设计方案:
vect__ifc__1252.1526_717 = VEC_COND_EXPR <mask__1460.1449_910, vect__1761.1465_870, { 0.0, 0.0 }>; 找到一个VEC_COND_EXPR,在同一个基本块中,根据第二个或者第三个参数所涉及到的运算(建立一个栈暂存每次找到的结果),顺着运算的关系一步步往上找,直到找到了需要进行mask的数学函数。如果在第二个参数中找到,VEC_COND_EXPR中的第一个参数mask就是数学函数需要进行mask的值。如果在第三个参数的关系链中找到,其所需的mask就是VEC_COND_EXPR中的mask的取反。将数学函数和mask一起生成带有mask的数学函数的IR,替换掉原来的不带mask的。(在生成cond_expr之后做还是在loop vect pass之后另外新建一个pass做。)
#include "config.h"
2 #include "system.h"
3 #include "coretypes.h"
4 #include "backend.h"
5 #include "tree.h"
6 #include "gimple.h"
7 #include "predict.h"
8 #include "tree-pass.h"
9 #include "ssa.h"
10 #include "cgraph.h"
11 #include "fold-const.h"
12 #include "stor-layout.h"
13 #include "gimple-iterator.h"
14 #include "gimple-walk.h"
15 #include "tree-ssa-loop-manip.h"
16 #include "tree-ssa-loop-niter.h"
17 #include "tree-cfg.h"
18 #include "cfgloop.h"
19 #include "tree-vectorizer.h"
20 #include "tree-ssa-propagate.h"
21 #include "dbgcnt.h"
22 #include "tree-scalar-evolution.h"
23 #include "stringpool.h"
24 #include "attribs.h"
25 #include "gimple-pretty-print.h"
26 #include "opt-problem.h"
27 #include "internal-fn.h"
28 #include "tree-ssa-sccvn.h"
29 #include "gimple-expr.h"
30 #include <cstdio>
31
32 namespace
33 {
34 const pass_data pass_data_test = {
35 GIMPLE_PASS, /* type */
36 "mask_vecmath_func", /* name */
37 OPTGROUP_NONE, /* optinfo_flags */
38 TV_TREE_VECT_MASK_VECMATH_FUNC, /* tv_id */
39 (PROP_cfg | PROP_ssa), /* properties_required */
40 0, /* properties_provided */
41 0, /* properties_destroyed */
42 0, /* todo_flags_start */
43 0, /* todo_flags_finish */
44 };
46 class pass_mask_vecmath_func : public gimple_opt_pass
47 {
48 public:
49 pass_mask_vecmath_func (gcc::context *ctxt) : gimple_opt_pass (pass_data_test, ctxt) {}
50 virtual bool
51 gate (function *fun)
52 {
53 // printf ("gate function noipa.\n");
54 return flag_tree_mask_vecmath_func;
55 }
56
57 virtual unsigned int execute (function *);
58 };
59
60
61 static gimple *find_relate_operand(tree operand, gimple *stmt)
62 {
63 if (!stmt)
64 return NULL;
65
66 if (TREE_CODE (operand) == SSA_NAME && is_gimple_call(stmt)) { // operand is ssa && stmt is gimple call
67 tree fndecl = gimple_call_fndecl(stmt); // 获取函数声明
68 if (fndecl && DECL_P(fndecl)) { // 确保fndecl有效并且是一个声明
69 const char *func_name = IDENTIFIER_POINTER(DECL_NAME(fndecl)); // 获取函数名称
70 // if (strcmp(func_name, "vmldLn2") == 0) {
71 if (strcmp(func_name, "__svml_log4_mask_e9") == 0) {
72 return stmt;
73 }
74 }
75 }
76 if (TREE_CODE (operand) == SSA_NAME && is_gimple_assign(stmt)) { // only find gimple assign
77
78 for (unsigned i = 1; i < gimple_num_ops(stmt); ++i) { // get gimple assign right hand side operand
79 tree op = gimple_op(stmt, i);
80 if(TREE_CODE (op) == SSA_NAME) {
81
82 gimple *stmt_2 = SSA_NAME_DEF_STMT (op);
83 gimple *result = find_relate_operand(op,stmt_2);
84 if(result) return result;
85 }
86 }
87 }
88 return NULL;
89 }
90
91 static void add_mask_to_call(gimple *stmt, tree new_arg) {
92 if (!is_gimple_call(stmt)) {
93 // 如果不是函数调用语句,则不做任何操作
94 return;
95 }
96
97 // 获取原始函数调用的目标和参数列表
98 tree call_fn = gimple_call_fndecl(stmt);
99
100 // 获取或创建新的标识符节点来表示新的函数名称
101 // tree new_func_id = get_identifier("vmldLn2Mask");
102 tree new_func_id = get_identifier("__svml_log4_mask_e9");
103 tree fntype = TREE_TYPE(call_fn);
104
105 tree new_fndecl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, new_func_id, fntype);
106
107 TREE_PUBLIC (new_fndecl) = 1;
108 DECL_EXTERNAL (new_fndecl) = 1;
109 DECL_IS_NOVOPS (new_fndecl) = 1;
110 TREE_READONLY (new_fndecl) = 1;
111
112
113 // 将新的标识符节点分配给函数声明的汇编名
114 // DECL_ASSEMBLER_NAME(call_fn) = new_func_id;
115
116 int num_args = gimple_call_num_args(stmt);
117 vec<tree> vargs = vNULL;
118 vargs.create (num_args+1);
119
120 // 创建一个新的参数列表,包含原始的参数和新的参数
121 for (int i = 0; i < num_args; i++) {
122 tree arg = gimple_call_arg(stmt, i);
123 vargs.safe_push(arg);
124 }
125 vargs.safe_push(new_arg);
126
127 tree lhs = gimple_call_lhs(stmt);
128
129 // 创建新的函数调用语句,包含新的参数
130 gimple *new_call = gimple_build_call_vec(new_fndecl,vargs);
131 gimple_call_set_lhs (new_call, lhs);
132
133 // 替换原始的函数调用语句
134 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
135
136 // printf ("-------------finish add mask to vecmath func call------------.\n");
137
138 gsi_replace(&gsi, new_call,true);
139 stmt = new_call;
140
141 // 释放参数列表的内存
142 vargs.release ();
143 }
144
145 unsigned
146 pass_mask_vecmath_func::execute (function *fun)
147 {
148 unsigned ret = 0;
149
150 // printf ("-----------begin mask vecmath func------------.\n");
151 // printf ("current function name:%s\n", function_name (fun));
152 basic_block bb;
153 enum tree_code code;
154
155 // 遍历所有基本块
156 FOR_EACH_BB_FN(bb, fun) {
157 gimple_stmt_iterator gsi;
158
159 // 遍历基本块中的所有 GIMPLE 语句
160 for (gsi = gsi_start_bb(bb); !gsi_end_p(gsi); gsi_next(&gsi)) {
161 gimple *stmt = gsi_stmt(gsi);
162 if (is_gimple_assign(stmt)) {
163
164 gassign *stmt_assign = dyn_cast <gassign *> (gsi_stmt (gsi));
165 code = gimple_assign_rhs_code (stmt_assign);
166
167 // 检查语句是否为 VEC_COND_EXPR
168 if (code == VEC_COND_EXPR) {
169
170 // printf ("-----------find out vec cond expr------------.\n");
171 tree true_vector_operand = gimple_assign_rhs2(stmt_assign); // add wrong vec operand
172 tree mask_operand = gimple_assign_rhs1(stmt_assign);
173 if(TREE_CODE (true_vector_operand) == SSA_NAME) {
174
175 gimple *stmt_def = SSA_NAME_DEF_STMT (true_vector_operand);
176 gimple *stmt_vecmath = find_relate_operand(true_vector_operand,stmt_def);
177 if(stmt_vecmath) {
178 // printf ("-----------find out vecmath stmt------------.\n");
179 add_mask_to_call(stmt_vecmath,mask_operand);
180
181 }
182 }
183 }
184 }
185 }
186 }
187 return ret;
188 }
189 }
190
191 gimple_opt_pass *
192 make_pass_mask_vecmath_func (gcc::context *ctxt)
193 {
194 return new pass_mask_vecmath_func (ctxt);
195 }
|
生成了正确的IR之后,使用builtin的方式调用svml中的带有mask的数学函数。
gcc调用svml函数在gimple阶段的过程:
1:examining statement:
vect_analyze_stmt函数中检查stmt, 在vectorizable_xxx函数里面判断操作数的类型。vect_is_simple_use: 计算向量化的cost, vect_model_simple_cost,先不进行transform。
调用svml需要使用target-specific built-in function,使用此函数targetm.vectorize.builtin_vectorized_function,根据优化选项(config/i386/i386-options.cc:2567)定位到(ix86_veclib_handler = &ix86_veclibabi_svml)后端ix86_veclibabi_svml函数处,返回向量svml函数fndecl。


2:vectorizing statement:
vect_transform_loop_stmt函数中,进行transform,同样也会调用vectorizable_xxx函数进行此转化。gimple_build_call_vec (fndecl, vargs):根据获取到的fndecl以及对参数的向量化,构建一个新的gimple vec call。

loop vec pass的调用栈

vect_analyze_loop_2:
Apply a set of analyses on LOOP, and create a loop_vec_info struct for it. The different analyses will record information in the loop_vec_info struct
loop_vec_info 里面放的是对loop 分析完成后的整个loop的信息
vect_analyze_loop_operations:
Scan the loop stmts and make sure they are all vectorizable.
vect_analyze_stmt:
Make sure the statement is vectorizable.
ziyuan 2.3 和 2.4修改对于其他课题的影响 aggressive_if_conv && use_gather_2parts result.xlsx 采用HygonGCC 1.3.2编译器最新版本 和最新配置文件Hygon7490-2p-HygonGCC1.3.2.202403-hgalloc-znver1-base.cfg
跑1copy的时候整个node最好不要跑其他程序,不然性能数据会波动较大。会抢占node的内存等资源。



可能优化的方向:
- gcc调用svml向量数学库的接口函数只能支持128bit的输入。修改接口调用256bit的输入。
- -mtune-ctrl=^avx256_split_regs,^avx128_optimal,256_unaligned_store_optimal可以使程序使用256bit的ymm寄存器,提高循环向量化的vf,对性能有提高(2069提升4%,1761提升8%)。
- oneapi使用将条件和条件里面的计算分别放在不同的bb块中,通过控制流来选择需要执行哪些分支,可以减少冗余运算。Gcc向量化只能在同一个bb块中进行,无法控制每个分支,只支持在log函数上进行mask操作,和最终运算的结果上进行选择,其他操作- + *等只能在支持avx512的机器上。只能想办法在gcc上也进行将不同分支分为不同bb块的操作,模仿oneapi。
- gcc上的vf是8,使用两次log4,oneapi的vf为4,使用一次log4,通过将i32扩展为i64,使用256bit ymm,尝试将gcc变为vf4使用一次log4,使用相似的方法,未能成功。并且怀疑3才是性能的主要点,此操作应该不是性能的主要点。
5. gcc循环向量化无法处理跨bb的问题,如果向量化后拆分成不同bb,后续的pass可能无法处理会对拆分的bb做一些未知的操作,不建议使用此方法,可以在原有的bb里面插入一些 根据mask进行选择的指令,来模拟分支选择的操作。
void calc(double *src1,double *src2,double *src3)
5 {
6 int i;
7 for(i=0;i<100;i++)
8 {
9 if(src3[i] > 10.0)
10 {
11 src1[i] = exp(src2[i]);
12 }
13 else if(src3[i] > 5.0)
14 {
15 src1[i] = log(src2[i]);
16 }
17 else if(src3[i] > 2.5)
18 {
19 src1[i] = sin(src2[i]);
20 }
21 }
22 }
|

对于有mask store的操作,会将if-conversion操作进行回退。optimize_mask_stores
1:新建一个对mask进行判断是否全为0的GIMPLE_COND。
2:新建一个then bb块,并且维护其边。
3:在mask store后分割一个新的bb,并且把stmt全部移到bb里面,新建一个边。
create_basic_block_1 (void *head, void *end, basic_block after):
int vf为4,double vf 为2.
test_mask_vecmath.c:13:18: note: === vect_determine_vectorization_factor === 681 test_mask_vecmath.c:13:18: note: ==> examining phi: i_114 = PHI <i_85(20), 0(35)> 682 test_mask_vecmath.c:13:18: note: ==> examining phi: sumi1_115 = PHI <_136(20), 0.0(35)> 683 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 684 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 685 test_mask_vecmath.c:13:18: note: nunits = 2 686 test_mask_vecmath.c:13:18: note: ==> examining phi: sumi2_117 = PHI <_138(20), 0.0(35)> 687 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 688 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 689 test_mask_vecmath.c:13:18: note: nunits = 2 690 test_mask_vecmath.c:13:18: note: ==> examining phi: sumi3_119 = PHI <_140(20), 0.0(35)> 691 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 692 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 693 test_mask_vecmath.c:13:18: note: nunits = 2 694 test_mask_vecmath.c:13:18: note: ==> examining phi: ivtmp_106 = PHI <ivtmp_101(20), 100(35)> 695 test_mask_vecmath.c:13:18: note: ==> examining statement: _62 = (long unsigned int) i_114; 696 test_mask_vecmath.c:13:18: note: skip. 697 test_mask_vecmath.c:13:18: note: ==> examining statement: _63 = _62 * 4; 698 test_mask_vecmath.c:13:18: note: skip. 699 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_151 = i_114 w* 4; 700 test_mask_vecmath.c:13:18: note: skip. 701 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_152 = (long unsigned int) patt_151; 702 test_mask_vecmath.c:13:18: note: skip. 703 test_mask_vecmath.c:13:18: note: ==> examining statement: _64 = &src3 + _63; 704 test_mask_vecmath.c:13:18: note: skip. 
705 test_mask_vecmath.c:13:18: note: ==> examining statement: j_65 = *_64; 706 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(4) int 707 test_mask_vecmath.c:13:18: note: nunits = 4 708 test_mask_vecmath.c:13:18: note: ==> examining statement: _66 = (long unsigned int) j_65; 709 test_mask_vecmath.c:13:18: note: skip. 710 test_mask_vecmath.c:13:18: note: ==> examining statement: _67 = _66 * 8; 711 test_mask_vecmath.c:13:18: note: skip. 712 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_153 = j_65 w* 8; 713 test_mask_vecmath.c:13:18: note: skip. 714 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_154 = (long unsigned int) patt_153; 715 test_mask_vecmath.c:13:18: note: skip. 716 test_mask_vecmath.c:13:18: note: ==> examining statement: _142 = _141 + _67; 717 test_mask_vecmath.c:13:18: note: skip. test_mask_vecmath.c:13:18: note: ==> examining statement: _68 = (double *) _142; 719 test_mask_vecmath.c:13:18: note: skip. 720 test_mask_vecmath.c:13:18: note: ==> examining statement: _143 = j_65 > 10; 721 test_mask_vecmath.c:13:18: note: vectype: vector(4) <signed-boolean:32> 722 test_mask_vecmath.c:13:18: note: nunits = 4 723 test_mask_vecmath.c:13:18: note: ==> examining statement: _69 = .MASK_LOAD (_68, 64B, _143); 724 test_mask_vecmath.c:13:18: note: skip. 
725 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_155 = (<signed-boolean:64>) _143; 726 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 727 test_mask_vecmath.c:13:18: note: nunits = 2 728 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_156 = .MASK_LOAD (_68, 64B, patt_155); 729 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 730 test_mask_vecmath.c:13:18: note: nunits = 2 731 test_mask_vecmath.c:13:18: note: ==> examining statement: _70 = log (_69); 732 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 733 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 734 test_mask_vecmath.c:13:18: note: nunits = 2 735 test_mask_vecmath.c:13:18: note: ==> examining statement: _89 = (unsigned int) j_65; 736 test_mask_vecmath.c:13:18: note: get vectype for scalar type: unsigned int 737 test_mask_vecmath.c:13:18: note: vectype: vector(4) unsigned int 738 test_mask_vecmath.c:13:18: note: nunits = 4 739 test_mask_vecmath.c:13:18: note: ==> examining statement: _87 = _89 + 4294967288; 740 test_mask_vecmath.c:13:18: note: get vectype for scalar type: unsigned int 741 test_mask_vecmath.c:13:18: note: vectype: vector(4) unsigned int 742 test_mask_vecmath.c:13:18: note: nunits = 4 743 test_mask_vecmath.c:13:18: note: ==> examining statement: _73 = _62 * 8; 744 test_mask_vecmath.c:13:18: note: skip. 745 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_157 = i_114 w* 8; 746 test_mask_vecmath.c:13:18: note: skip. 747 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_158 = (long unsigned int) patt_157; 748 test_mask_vecmath.c:13:18: note: skip. 749 test_mask_vecmath.c:13:18: note: ==> examining statement: _145 = _73 + _141; 750 test_mask_vecmath.c:13:18: note: skip. 751 test_mask_vecmath.c:13:18: note: ==> examining statement: _74 = (double *) _145; 752 test_mask_vecmath.c:13:18: note: skip. 
753 test_mask_vecmath.c:13:18: note: ==> examining statement: _146 = _87 <= 2; 754 test_mask_vecmath.c:13:18: note: vectype: vector(4) <signed-boolean:32> 755 test_mask_vecmath.c:13:18: note: nunits = 4 756 test_mask_vecmath.c:13:18: note: ==> examining statement: _75 = .MASK_LOAD (_74, 64B, _146); 757 test_mask_vecmath.c:13:18: note: skip. 758 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_159 = (<signed-boolean:64>) _146; 759 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 760 test_mask_vecmath.c:13:18: note: nunits = 2 761 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_160 = .MASK_LOAD (_74, 64B, patt_159); 762 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 763 test_mask_vecmath.c:13:18: note: nunits = 2 764 test_mask_vecmath.c:13:18: note: ==> examining statement: _76 = log (_75); 765 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 766 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 767 test_mask_vecmath.c:13:18: note: nunits = 2 768 test_mask_vecmath.c:13:18: note: ==> examining statement: _148 = _73 + _147; 769 test_mask_vecmath.c:13:18: note: skip. 770 test_mask_vecmath.c:13:18: note: ==> examining statement: _80 = (double *) _148; 771 test_mask_vecmath.c:13:18: note: skip. 772 test_mask_vecmath.c:13:18: note: ==> examining statement: _149 = j_65 == 7; 773 test_mask_vecmath.c:13:18: note: vectype: vector(4) <signed-boolean:32> 774 test_mask_vecmath.c:13:18: note: nunits = 4 775 test_mask_vecmath.c:13:18: note: ==> examining statement: _81 = .MASK_LOAD (_80, 64B, _149); 776 test_mask_vecmath.c:13:18: note: skip. 
777 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_161 = (<signed-boolean:64>) _149; 778 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 779 test_mask_vecmath.c:13:18: note: nunits = 2 780 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_162 = .MASK_LOAD (_80, 64B, patt_161); 781 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 782 test_mask_vecmath.c:13:18: note: nunits = 2 783 test_mask_vecmath.c:13:18: note: ==> examining statement: _82 = log (_81); 784 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 785 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 786 test_mask_vecmath.c:13:18: note: nunits = 2 787 test_mask_vecmath.c:13:18: note: ==> examining statement: _ifc__135 = j_65 > 10 ? _70 : 0.0; 788 test_mask_vecmath.c:13:18: note: skip. 789 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_163 = j_65 > 10; 790 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(4) <signed-boolean:32> 791 test_mask_vecmath.c:13:18: note: nunits = 4 792 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_164 = (<signed-boolean:64>) patt_163; 793 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 794 test_mask_vecmath.c:13:18: note: nunits = 2 795 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_165 = patt_164 ? _70 : 0.0; 796 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 797 test_mask_vecmath.c:13:18: note: nunits = 2 798 test_mask_vecmath.c:13:18: note: ==> examining statement: _136 = sumi1_115 + _ifc__135; 799 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 800 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 801 test_mask_vecmath.c:13:18: note: nunits = 2 802 test_mask_vecmath.c:13:18: note: ==> examining statement: _ifc__137 = _87 <= 2 ? _76 : 0.0; 803 test_mask_vecmath.c:13:18: note: skip. 
804 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_166 = _87 <= 2; 805 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(4) <signed-boolean:32> 806 test_mask_vecmath.c:13:18: note: nunits = 4 807 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_167 = (<signed-boolean:64>) patt_166; 808 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 809 test_mask_vecmath.c:13:18: note: nunits = 2 810 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_168 = patt_167 ? _76 : 0.0; 811 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 812 test_mask_vecmath.c:13:18: note: nunits = 2 813 test_mask_vecmath.c:13:18: note: ==> examining statement: _138 = sumi2_117 + _ifc__137; 814 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 815 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 816 test_mask_vecmath.c:13:18: note: nunits = 2 817 test_mask_vecmath.c:13:18: note: ==> examining statement: _ifc__139 = j_65 == 7 ? _82 : 0.0; 818 test_mask_vecmath.c:13:18: note: skip. 819 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_169 = j_65 == 7; 820 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(4) <signed-boolean:32> 821 test_mask_vecmath.c:13:18: note: nunits = 4 822 test_mask_vecmath.c:13:18: note: ==> examining pattern def stmt: patt_170 = (<signed-boolean:64>) patt_169; 823 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) <signed-boolean:64> 824 test_mask_vecmath.c:13:18: note: nunits = 2 825 test_mask_vecmath.c:13:18: note: ==> examining pattern statement: patt_171 = patt_170 ? 
_82 : 0.0; 826 test_mask_vecmath.c:13:18: note: precomputed vectype: vector(2) double 827 test_mask_vecmath.c:13:18: note: nunits = 2 828 test_mask_vecmath.c:13:18: note: ==> examining statement: _140 = sumi3_119 + _ifc__139; 829 test_mask_vecmath.c:13:18: note: get vectype for scalar type: double 830 test_mask_vecmath.c:13:18: note: vectype: vector(2) double 831 test_mask_vecmath.c:13:18: note: nunits = 2 832 test_mask_vecmath.c:13:18: note: ==> examining statement: i_85 = i_114 + 1; 833 test_mask_vecmath.c:13:18: note: skip. 834 test_mask_vecmath.c:13:18: note: ==> examining statement: ivtmp_101 = ivtmp_106 - 1; 835 test_mask_vecmath.c:13:18: note: skip. 836 test_mask_vecmath.c:13:18: note: ==> examining statement: if (ivtmp_101 != 0) 837 test_mask_vecmath.c:13:18: note: skip. 838 test_mask_vecmath.c:13:18: note: vectorization factor = 4 |
既有int 也有double的loop
#include<stdio.h>
2 #include<math.h>
3 #include<stdlib.h>
/* Mixed int/double loop used to exercise masked vectorization: sums
   log() over elements of src2/src1 selected by value ranges of src3,
   then prints the total.  The first branch indexes src2 indirectly with
   j = src3[i] (a gather pattern); this assumes src3 values stay within
   [0, 99] -- TODO confirm with the caller (main() fills them in [5, 15)).  */
void calc(double *src1, double *src2, int *src3)
{
    double sumi1 = 0;
    double sumi2 = 0;
    double sumi3 = 0;

    for (int i = 0; i < 100; i++)
    {
        int j = src3[i];            /* indirect index for the gather */
        if (src3[i] > 10)
            sumi1 += log(src2[j]);
        else if (src3[i] > 7)
            sumi2 += log(src2[i]);
        else if (src3[i] > 6)
            sumi3 += log(src1[i]);
    }

    double sumi = sumi1 + sumi2 + sumi3;
    printf("sumi is %lf\n", sumi);
}
/* Test driver: fills src1/src2 with uniform doubles in [5, 15) and src3
   with ints in [5, 15), prints the inputs, runs calc(), then prints a
   checksum of src1.  */
int main()
{
    double src1[100], src2[100];
    int src3[100];
    const double dmin = 5.0, dmax = 15.0;
    const int imin = 5, imax = 15;

    srand(12);

    /* Two rand() calls per element here, then one per element below --
       keep this order so the seeded sequence matches.  */
    for (int i = 0; i < 100; i++) {
        src1[i] = dmin + 1.0 * rand() / RAND_MAX * (dmax - dmin);
        src2[i] = dmin + 1.0 * rand() / RAND_MAX * (dmax - dmin);
    }
    for (int i = 0; i < 100; i++)
        src3[i] = imin + rand() % (imax - imin);

    for (int i = 0; i < 100; i++)
        printf("src1 is %lf ", src1[i]);

    calc(src1, src2, src3);

    double res = 0;
    for (int i = 0; i < 100; i++)
        res += src1[i];
    printf("res is %lf\n", res);
    return 0;
}
bb分块
COUNT:1604735257<bb 78>:
# # RANGE [0, 2147483647] NONZERO 2147483647
k_3019 = PHI <k_1827(216), 0(301)>
# temp0_1543 = PHI <_1251(216), 0.0(301)>
# temp1_2883 = PHI <_1249(216), 0.0(301)>
# temp2_224 = PHI <_1247(216), 0.0(301)>
# temp3_2699 = PHI <_1245(216), 0.0(301)>
# temp4_1545 = PHI <_1243(216), 0.0(301)>
# vect_temp0_1543.1410_1003 = PHI <vect__1251.1527_708(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# vect_temp1_2883.1411_1002 = PHI <vect__1249.1530_701(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# vect_temp2_224.1412_1001 = PHI <vect__1247.1533_694(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# vect_temp3_2699.1413_1000 = PHI <vect__1245.1536_687(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# vect_temp4_1545.1414_999 = PHI <vect__1243.1539_670(216), { 0.0, 0.0, 0.0, 0.0 }(301)>
# # PT = nonlocal escaped null
# ALIGN = 4, MISALIGN = 0
vectp.1415_998 = PHI <vectp.1415_997(216), _1703(301)>
# ivtmp_667 = PHI <ivtmp_666(216), 0(301)>
# DEBUG temp4D.7772 => NULL
# DEBUG temp3D.7771 => NULL
# DEBUG temp2D.7770 => NULL
# DEBUG temp1D.7769 => NULL
# DEBUG temp0D.7768 => NULL
# DEBUG kD.7615 => NULL
# DEBUG BEGIN_STMT
# DEBUG BEGIN_STMT
# RANGE [0, 2147483646] NONZERO 2147483647
_1705 = (long unsigned intD.10) k_3019;
# RANGE [0, 8589934584] NONZERO 8589934588
_1706 = _1705 * 4;
# PT = nonlocal escaped null
_1707 = _1703 + _1706;
# VUSE <.MEM_2600>
vect_j_1708.1417_996 = MEM <vector(8) intD.6> [(INT_TD.3736 *)vectp.1415_998];
# VUSE <.MEM_2600>
j_1708 = *_1707;
# DEBUG jD.7613 => NULL
# DEBUG BEGIN_STMT
vect__1709.1418_994 = vect_j_1708.1417_996 * { 3, 3, 3, 3, 3, 3, 3, 3 };
_1709 = j_1708 * 3;
# RANGE ~[2147483648, 18446744071562067967]
_1710 = (long unsigned intD.10) _1709;
# RANGE [0, 18446744073709551608] NONZERO 18446744073709551608
_1711 = _1710 * 8;
# PT = nonlocal null
_1712 = x_242(D) + _1711;
# VUSE <.MEM_2600>
# USE = anything
vect__1713.1419_991 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, x_242(D), vect__1709.1418_994, { Nan, Nan, Nan, Nan }, 8);
vect__1713.1420_990 = VEC_PERM_EXPR <vect__1709.1418_994, vect__1709.1418_994, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
# VUSE <.MEM_2600>
# USE = anything
vect__1713.1419_989 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, x_242(D), vect__1713.1420_990, { Nan, Nan, Nan, Nan }, 8);
# VUSE <.MEM_2600>
_1713 = *_1712;
vect_xij_1714.1421_987 = vect_cst__988 - vect__1713.1419_991;
vect_xij_1714.1421_986 = vect_cst__988 - vect__1713.1419_989;
xij_1714 = xi_1687 - _1713;
# DEBUG xijD.7655 => NULL
# DEBUG BEGIN_STMT
# RANGE ~[2147483649, 18446744071562067968]
_1715 = _1710 + 1;
# RANGE [0, 18446744073709551608] NONZERO 18446744073709551608
_1716 = _1715 * 8;
# PT = nonlocal null
_1717 = x_242(D) + _1716;
# VUSE <.MEM_2600>
# USE = anything
vect__1718.1422_980 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _983, vect__1709.1418_994, { Nan, Nan, Nan, Nan }, 8);
# VUSE <.MEM_2600>
# USE = anything
vect__1718.1422_977 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _983, vect__1713.1420_990, { Nan, Nan, Nan, Nan }, 8);
# VUSE <.MEM_2600>
_1718 = *_1717;
vect_yij_1719.1424_975 = vect_cst__976 - vect__1718.1422_980;
vect_yij_1719.1424_974 = vect_cst__976 - vect__1718.1422_977;
yij_1719 = yi_1691 - _1718;
# DEBUG yijD.7656 => NULL
# DEBUG BEGIN_STMT
# RANGE ~[2147483650, 18446744071562067969]
_1720 = _1710 + 2;
# RANGE [0, 18446744073709551608] NONZERO 18446744073709551608
_1721 = _1720 * 8;
# PT = nonlocal null
_1722 = x_242(D) + _1721;
# VUSE <.MEM_2600>
# USE = anything
vect__1723.1425_967 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _971, vect__1709.1418_994, { Nan, Nan, Nan, Nan }, 8);
# VUSE <.MEM_2600>
# USE = anything
vect__1723.1425_965 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _971, vect__1713.1420_990, { Nan, Nan, Nan, Nan }, 8);
# VUSE <.MEM_2600>
_1723 = *_1722;
vect_zij_1724.1427_963 = vect_cst__964 - vect__1723.1425_967;
vect_zij_1724.1427_962 = vect_cst__964 - vect__1723.1425_965;
zij_1724 = zi_1695 - _1723;
# DEBUG zijD.7657 => NULL
# DEBUG BEGIN_STMT
vect_powmult_2740.1428_961 = vect_xij_1714.1421_987 * vect_xij_1714.1421_987;
vect_powmult_2740.1428_960 = vect_xij_1714.1421_986 * vect_xij_1714.1421_986;
powmult_2740 = xij_1714 * xij_1714;
vect_powmult_2713.1429_959 = vect_yij_1719.1424_975 * vect_yij_1719.1424_975;
vect_powmult_2713.1429_958 = vect_yij_1719.1424_974 * vect_yij_1719.1424_974;
powmult_2713 = yij_1719 * yij_1719;
vect_powmult_1661.1430_957 = vect_zij_1724.1427_963 * vect_zij_1724.1427_963;
vect_powmult_1661.1430_956 = vect_zij_1724.1427_962 * vect_zij_1724.1427_962;
powmult_1661 = zij_1724 * zij_1724;
vect__1971.1431_955 = vect_powmult_1661.1430_957 + vect_powmult_2713.1429_959;
vect__1971.1431_954 = vect_powmult_1661.1430_956 + vect_powmult_2713.1429_958;
_1971 = powmult_1661 + powmult_2713;
vect_r2_1729.1432_953 = vect__1971.1431_955 + vect_powmult_2740.1428_961;
vect_r2_1729.1432_952 = vect__1971.1431_954 + vect_powmult_2740.1428_960; // compute r2
r2_1729 = _1971 + powmult_2740;
# DEBUG r2D.7683 => NULL
# DEBUG BEGIN_STMT
# DEBUG r2D.7683 => NULL
# DEBUG BEGIN_STMT
# DEBUG BEGIN_STMT
vect__1730.1433_950 = .SQRT (vect_r2_1729.1432_953); // after if (r2 > rgbmaxpsmax2) compute
vect__1730.1433_949 = .SQRT (vect_r2_1729.1432_952);
vect_dij1i_1731.1434_947 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1730.1433_950;
vect_dij1i_1731.1434_946 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1730.1433_949;
# DEBUG dij1iD.7664 => NULL
# DEBUG BEGIN_STMT
vect_dij_1732.1435_945 = vect_r2_1729.1432_953 * vect_dij1i_1731.1434_947;
vect_dij_1732.1435_944 = vect_r2_1729.1432_952 * vect_dij1i_1731.1434_946;
dij_1732 = r2_1729 * Inf;
# DEBUG dijD.7673 => NULL
# DEBUG BEGIN_STMT
_1733 = (long unsigned intD.10) j_1708;
_1734 = _1733 * 8;
_1241 = _1242 + _1734;
# PT = nonlocal escaped null
_1735 = (doubleD.32 *) _1241;
mask__1239.1436_942 = vect_r2_1729.1432_953 <= vect_cst__943; // if (r2 > rgbmaxpsmax2)
mask__1239.1436_941 = vect_r2_1729.1432_952 <= vect_cst__943;
_1239 = r2_1729 <= powmult_2494;
stmp_938 = VIEW_CONVERT_EXPR<vector(4) doubleD.32>(mask__1239.1436_942);
# VUSE <.MEM_2600>
# USE = anything
vect__1736.1437_937 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _939, vect_j_1708.1417_996, stmp_938, 8); // after if (r2 > rgbmaxpsmax2) compute
vect__1736.1438_936 = VEC_PERM_EXPR <vect_j_1708.1417_996, vect_j_1708.1417_996, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
stmp_935 = VIEW_CONVERT_EXPR<vector(4) doubleD.32>(mask__1239.1436_941);
# VUSE <.MEM_2600>
# USE = anything
vect__1736.1437_934 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _939, vect__1736.1438_936, stmp_935, 8);
_1237 = _1238 + _1734;
# PT = nonlocal escaped null
_1737 = (doubleD.32 *) _1237;
# VUSE <.MEM_2600>
# USE = anything
vect__1738.1439_931 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _933, vect_j_1708.1417_996, stmp_938, 8);
# VUSE <.MEM_2600>
# USE = anything
vect__1738.1439_924 = __builtin_ia32_gatheraltsiv4df D.2164 ({ 0.0, 0.0, 0.0, 0.0 }, _933, vect__1736.1438_936, stmp_935, 8);
vect__1739.1441_922 = vect__1738.1439_931 + { -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2 };
vect__1739.1441_921 = vect__1738.1439_924 + { -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2, -8.99999999999999966693309261245303787291049957275390625e-2 };
vect_sj_1740.1442_920 = vect__1736.1437_937 * vect__1739.1441_922;
vect_sj_1740.1442_919 = vect__1736.1437_934 * vect__1739.1441_921;
# DEBUG sjD.7686 => NULL
# DEBUG BEGIN_STMT
# DEBUG sj2D.7687 => NULL
# DEBUG BEGIN_STMT
vect__1743.1443_917 = vect_sj_1740.1442_920 + { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 };
vect__1743.1443_916 = vect_sj_1740.1442_919 + { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 };
mask__1463.1444_915 = vect_dij_1732.1435_945 <= vect__1743.1443_917;
mask__1463.1444_914 = vect_dij_1732.1435_944 <= vect__1743.1443_916;
_1463 = dij_1732 <= 2.0e+1;
mask__1462.1445_913 = mask__1239.1436_942 & mask__1463.1444_915; // if (dij > rgbmax + sj)
mask__1462.1445_912 = mask__1239.1436_941 & mask__1463.1444_914;
_1462 = _1239 & _1463;
vect_powmult_1725.1446_911 = vect_sj_1740.1442_920 * vect_sj_1740.1442_920;
vect_powmult_1725.1446_910 = vect_sj_1740.1442_919 * vect_sj_1740.1442_919;
# DEBUG BEGIN_STMT
vect__1744.1447_908 = { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 } - vect_sj_1740.1442_920; // begin if ((dij > rgbmax - sj))
vect__1744.1447_907 = { 2.0e+1, 2.0e+1, 2.0e+1, 2.0e+1 } - vect_sj_1740.1442_919;
mask__1461.1448_906 = vect_dij_1732.1435_945 > vect__1744.1447_908;
mask__1461.1448_905 = vect_dij_1732.1435_944 > vect__1744.1447_907;
_1461 = dij_1732 > 2.0e+1;
mask__1460.1449_904 = mask__1461.1448_906 & mask__1462.1445_913; // if ((dij > rgbmax - sj)) enter if-else chain
mask__1460.1449_903 = mask__1461.1448_905 & mask__1462.1445_912;
_1460 = _1461 & _1462; else add
# DEBUG BEGIN_STMT
vect__1745.1450_902 = vect_dij_1732.1435_945 - vect_sj_1740.1442_920;
vect__1745.1450_901 = vect_dij_1732.1435_944 - vect_sj_1740.1442_919;
vect_uij_1746.1451_899 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1745.1450_902;
vect_uij_1746.1451_898 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1745.1450_901;
uij_1746 = 0.0 / r2_1729;
# DEBUG uijD.7689 => NULL
# DEBUG BEGIN_STMT
vect__1748.1452_896 = vect_dij_1732.1435_945 * { 8.0e+1, 8.0e+1, 8.0e+1, 8.0e+1 };
vect__1748.1452_895 = vect_dij_1732.1435_944 * { 8.0e+1, 8.0e+1, 8.0e+1, 8.0e+1 };
_1748 = dij_1732 * 8.0e+1;
vect__2057.1453_894 = vect_powmult_1725.1446_911 - vect_r2_1729.1432_953;
vect__2057.1453_893 = vect_powmult_1725.1446_910 - vect_r2_1729.1432_952;
_2057 = -r2_1729;
vect__1750.1454_892 = vect__1748.1452_896 + vect__2057.1453_894;
vect__1750.1454_891 = vect__1748.1452_895 + vect__2057.1453_893;
_1750 = _1748 + _2057;
vect__1751.1455_889 = vect__1750.1454_892 * { 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3 };
vect__1751.1455_888 = vect__1750.1454_891 * { 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3, 2.50000000000000048572257327350598643533885478973388671875e-3 };
_1751 = _1750 * 2.50000000000000048572257327350598643533885478973388671875e-3;
vect__2086.1456_886 = vect_dij_1732.1435_945 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 };
vect__2086.1456_885 = vect_dij_1732.1435_944 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 };
_2086 = dij_1732 * 2.0e+0;
vect__1753.1457_884 = vect_uij_1746.1451_899 * vect__2086.1456_886;
vect__1753.1457_883 = vect_uij_1746.1451_898 * vect__2086.1456_885;
_1753 = uij_1746 * _2086;
vect__1754.1458_882 = vect__1751.1455_889 - vect__1753.1457_884;
vect__1754.1458_881 = vect__1751.1455_888 - vect__1753.1457_883;
_1754 = _1751 - _1753;
vect__1755.1459_879 = vect__1745.1450_902 * { 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2 };
vect__1755.1459_878 = vect__1745.1450_901 * { 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2, 5.000000000000000277555756156289135105907917022705078125e-2 };
_1755 = dij_1732 * 5.000000000000000277555756156289135105907917022705078125e-2;
vect__1756.1460_877 = __svml_log4_mask_e9D.7954 (vect__1755.1459_879);
vect__1756.1460_876 = __svml_log4_mask_e9D.7954 (vect__1755.1459_878);
vect__1757.1461_874 = vect__1756.1460_877 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 };
vect__1757.1461_873 = vect__1756.1460_876 * { 2.0e+0, 2.0e+0, 2.0e+0, 2.0e+0 };
vect__2097.1462_871 = vect__1754.1458_882 + { -1.0e+0, -1.0e+0, -1.0e+0, -1.0e+0 };
vect__2097.1462_870 = vect__1754.1458_881 + { -1.0e+0, -1.0e+0, -1.0e+0, -1.0e+0 };
_2097 = _1754 - 1.0e+0;
vect__1759.1463_869 = vect__2097.1462_871 - vect__1757.1461_874;
vect__1759.1463_868 = vect__2097.1462_870 - vect__1757.1461_873;
vect__2099.1464_866 = vect_dij1i_1731.1434_947 * { 1.25e-1, 1.25e-1, 1.25e-1, 1.25e-1 };
vect__2099.1464_865 = vect_dij1i_1731.1434_946 * { 1.25e-1, 1.25e-1, 1.25e-1, 1.25e-1 };
vect__1761.1465_864 = vect__1759.1463_869 * vect__2099.1464_866;
vect__1761.1465_863 = vect__1759.1463_868 * vect__2099.1464_865;
_1761 = _2097 * Inf; /// else add
# DEBUG temp0D.7768 => NULL
mask__1458.1466_862 = vect_dij_1732.1435_945 <= vect__1744.1447_908; // begin else if (dij > 4.0 * sj)
mask__1458.1466_861 = vect_dij_1732.1435_944 <= vect__1744.1447_907;
mask__1457.1467_860 = mask__1458.1466_862 & mask__1462.1445_913;
mask__1457.1467_859 = mask__1458.1466_861 & mask__1462.1445_912;
# DEBUG BEGIN_STMT
vect__1764.1468_857 = vect_sj_1740.1442_920 * { 4.0e+0, 4.0e+0, 4.0e+0, 4.0e+0 };
vect__1764.1468_856 = vect_sj_1740.1442_919 * { 4.0e+0, 4.0e+0, 4.0e+0, 4.0e+0 };
mask__1456.1469_855 = vect_dij_1732.1435_945 > vect__1764.1468_857;
mask__1456.1469_854 = vect_dij_1732.1435_944 > vect__1764.1468_856;
_1456 = dij_1732 > 0.0;
mask__1455.1470_853 = mask__1456.1469_855 & mask__1457.1467_860; // else if (dij > 4.0 * sj)
mask__1455.1470_852 = mask__1456.1469_854 & mask__1457.1467_859;
_1455 = _1456 & _1462; /// else add
# DEBUG BEGIN_STMT
vect_powmult_1726.1471_851 = vect_dij1i_1731.1434_947 * vect_dij1i_1731.1434_947;
vect_powmult_1726.1471_846 = vect_dij1i_1731.1434_946 * vect_dij1i_1731.1434_946;
# DEBUG dij2iD.7672 => NULL
# DEBUG BEGIN_STMT
vect_tmpsd_1766.1472_845 = vect_powmult_1725.1446_911 * vect_powmult_1726.1471_851;
vect_tmpsd_1766.1472_844 = vect_powmult_1725.1446_910 * vect_powmult_1726.1471_846;
# DEBUG tmpsdD.7695 => NULL
# DEBUG BEGIN_STMT
vect__1767.1473_842 = vect_tmpsd_1766.1472_845 * { 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1 };
vect__1767.1473_841 = vect_tmpsd_1766.1472_844 * { 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1, 4.54545454545454530315140573293319903314113616943359375e-1 };
vect__1768.1474_839 = vect__1767.1473_842 + { 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1 };
vect__1768.1474_838 = vect__1767.1473_841 + { 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1, 4.444444444444444197728216749965213239192962646484375e-1 };
vect__1769.1475_837 = vect_tmpsd_1766.1472_845 * vect__1768.1474_839;
vect__1769.1475_836 = vect_tmpsd_1766.1472_844 * vect__1768.1474_838;
vect__1770.1476_834 = vect__1769.1475_837 + { 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1 };
vect__1770.1476_832 = vect__1769.1475_836 + { 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1, 4.28571428571428547638078043746645562350749969482421875e-1 };
vect__1771.1477_831 = vect_tmpsd_1766.1472_845 * vect__1770.1476_834;
vect__1771.1477_830 = vect_tmpsd_1766.1472_844 * vect__1770.1476_832;
vect__1772.1478_824 = vect__1771.1477_831 + { 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1 };
vect__1772.1478_823 = vect__1771.1477_830 + { 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1, 4.0000000000000002220446049250313080847263336181640625e-1 };
vect__1773.1479_822 = vect_tmpsd_1766.1472_845 * vect__1772.1478_824;
vect__1773.1479_821 = vect_tmpsd_1766.1472_844 * vect__1772.1478_823;
vect_dumbo_1774.1480_819 = vect__1773.1479_822 + { 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1 };
vect_dumbo_1774.1480_818 = vect__1773.1479_821 + { 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1, 3.33333333333333314829616256247390992939472198486328125e-1 };
# DEBUG dumboD.7694 => NULL
# DEBUG BEGIN_STMT
vect__2892.1481_817 = vect_powmult_1726.1471_851 * vect_sj_1740.1442_920;
vect__2892.1481_816 = vect_powmult_1726.1471_846 * vect_sj_1740.1442_919;
vect__1776.1482_815 = vect_tmpsd_1766.1472_845 * vect__2892.1481_817;
vect__1776.1482_814 = vect_tmpsd_1766.1472_844 * vect__2892.1481_816;
vect__1777.1483_813 = vect_dumbo_1774.1480_819 * vect__1776.1482_815;
vect__1777.1483_812 = vect_dumbo_1774.1480_818 * vect__1776.1482_814;
# DEBUG temp1D.7769 => NULL
mask__1453.1484_811 = vect_dij_1732.1435_945 <= vect__1764.1468_857; // begin else if (dij > ri + sj)
mask__1453.1484_810 = vect_dij_1732.1435_944 <= vect__1764.1468_856;
_1453 = dij_1732 <= 0.0;
mask__1452.1485_809 = mask__1453.1484_811 & mask__1457.1467_860;
mask__1452.1485_808 = mask__1453.1484_810 & mask__1457.1467_859;
_1452 = _1453 & _1462; // else add
# DEBUG BEGIN_STMT
vect__1780.1486_806 = vect_cst__807 + vect_sj_1740.1442_920;
vect__1780.1486_805 = vect_cst__807 + vect_sj_1740.1442_919;
_1780 = ri_1700;
mask__1451.1487_804 = vect_dij_1732.1435_945 > vect__1780.1486_806;
mask__1451.1487_803 = vect_dij_1732.1435_944 > vect__1780.1486_805;
_1451 = dij_1732 > _1780;
mask__1450.1488_802 = mask__1451.1487_804 & mask__1452.1485_809;
mask__1450.1488_801 = mask__1451.1487_803 & mask__1452.1485_808; // else if (dij > ri + sj)
_1450 = _1451 & _1452;
# DEBUG BEGIN_STMT
vect__1782.1489_800 = vect_sj_1740.1442_920 / vect__2057.1453_894;
vect__1782.1489_799 = vect_sj_1740.1442_919 / vect__2057.1453_893;
_1782 = 0.0 / r2_1729;
vect__1784.1490_797 = vect_dij_1732.1435_945 + vect_sj_1740.1442_920;
vect__1784.1490_796 = vect_dij_1732.1435_944 + vect_sj_1740.1442_919;
vect__1785.1491_795 = vect__1745.1450_902 / vect__1784.1490_797;
vect__1785.1491_794 = vect__1745.1450_901 / vect__1784.1490_796;
vect__1786.1492_793 = __svml_log4_mask_e9D.7987 (vect__1785.1491_795);
vect__1786.1492_792 = __svml_log4_mask_e9D.7987 (vect__1785.1491_794);
vect__1894.1493_790 = vect_dij1i_1731.1434_947 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
vect__1894.1493_789 = vect_dij1i_1731.1434_946 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
vect__1788.1494_788 = vect__1786.1492_793 * vect__1894.1493_790;
vect__1788.1494_787 = vect__1786.1492_792 * vect__1894.1493_789;
vect__1789.1495_786 = vect__1782.1489_800 - vect__1788.1494_788;
vect__1789.1495_785 = vect__1782.1489_799 - vect__1788.1494_787;
_1789 = _1782 - Nan;
vect__1790.1496_783 = vect__1789.1495_786 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
vect__1790.1496_782 = vect__1789.1495_785 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
_1790 = _1789 * 5.0e-1;
# DEBUG temp2D.7770 => NULL
mask__1448.1497_781 = vect_dij_1732.1435_945 <= vect__1780.1486_806; // begin else if (dij > fabs(ri - sj))
mask__1448.1497_780 = vect_dij_1732.1435_944 <= vect__1780.1486_805;
_1448 = dij_1732 <= _1780;
mask__1447.1498_779 = mask__1448.1497_781 & mask__1452.1485_809;
mask__1447.1498_778 = mask__1448.1497_780 & mask__1452.1485_808;
_1447 = _1448 & _1452;
# DEBUG BEGIN_STMT
vect__1793.1499_776 = vect_cst__807 - vect_sj_1740.1442_920;
vect__1793.1499_775 = vect_cst__807 - vect_sj_1740.1442_919;
vect__1794.1500_774 = ABS_EXPR <vect__1793.1499_776>;
vect__1794.1500_773 = ABS_EXPR <vect__1793.1499_775>;
_1794 = ABS_EXPR <_1780>;
mask__1446.1501_772 = vect_dij_1732.1435_945 > vect__1794.1500_774;
mask__1446.1501_771 = vect_dij_1732.1435_944 > vect__1794.1500_773;
_1446 = dij_1732 > _1794;
mask__1445.1502_770 = mask__1446.1501_772 & mask__1447.1498_779;
mask__1445.1502_769 = mask__1446.1501_771 & mask__1447.1498_778; // else if (dij > fabs(ri - sj))
_1445 = _1446 & _1447;
# DEBUG BEGIN_STMT
vect__2372.1503_767 = vect_cst__768 - vect_powmult_1725.1446_911;
vect__2372.1503_766 = vect_cst__768 - vect_powmult_1725.1446_910;
_2372 = powmult_1728;
vect__1798.1504_765 = vect_r2_1729.1432_953 + vect__2372.1503_767;
vect__1798.1504_764 = vect_r2_1729.1432_952 + vect__2372.1503_766;
_1798 = r2_1729 + _2372;
vect__2373.1505_762 = vect__1798.1504_765 * vect_cst__763;
vect__2373.1505_761 = vect__1798.1504_764 * vect_cst__763;
_2373 = _1798 * _2894;
vect_theta_1800.1506_760 = vect_dij1i_1731.1434_947 * vect__2373.1505_762;
vect_theta_1800.1506_759 = vect_dij1i_1731.1434_946 * vect__2373.1505_761;
theta_1800 = _2373 * Inf;
# DEBUG thetaD.7670 => NULL
# DEBUG BEGIN_STMT
vect_uij_1802.1507_757 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1784.1490_797;
vect_uij_1802.1507_756 = { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 } / vect__1784.1490_796;
# DEBUG uijD.7689 => NULL
# DEBUG BEGIN_STMT
vect__1803.1508_754 = vect_theta_1800.1506_760 + { -2.0e+0, -2.0e+0, -2.0e+0, -2.0e+0 };
vect__1803.1508_753 = vect_theta_1800.1506_759 + { -2.0e+0, -2.0e+0, -2.0e+0, -2.0e+0 };
_1803 = theta_1800 - 2.0e+0;
vect__1804.1509_751 = vect_cst__752 * vect__1803.1508_754;
vect__1804.1509_750 = vect_cst__752 * vect__1803.1508_753;
_1804 = ri1i_1701 * _1803;
vect__1805.1510_749 = vect_uij_1802.1507_757 + vect__1804.1509_751;
vect__1805.1510_748 = vect_uij_1802.1507_756 + vect__1804.1509_750;
_1805 = uij_1746 + _1804;
vect__1806.1511_746 = vect_uij_1802.1507_757 * vect_cst__807;
vect__1806.1511_745 = vect_uij_1802.1507_756 * vect_cst__807;
_1806 = ri_1700 * uij_1746;
vect__1807.1512_744 = __svml_log4_mask_e9D.8008 (vect__1806.1511_746);
vect__1807.1512_743 = __svml_log4_mask_e9D.8008 (vect__1806.1511_745);
vect__1808.1513_742 = vect_dij1i_1731.1434_947 * vect__1807.1512_744;
vect__1808.1513_741 = vect_dij1i_1731.1434_946 * vect__1807.1512_743;
vect__1809.1514_740 = vect__1805.1510_749 - vect__1808.1513_742;
vect__1809.1514_739 = vect__1805.1510_748 - vect__1808.1513_741;
_1809 = _1805 - Nan;
vect__1810.1515_737 = vect__1809.1514_740 * { 2.5e-1, 2.5e-1, 2.5e-1, 2.5e-1 };
vect__1810.1515_736 = vect__1809.1514_739 * { 2.5e-1, 2.5e-1, 2.5e-1, 2.5e-1 };
_1810 = _1809 * 2.5e-1;
# DEBUG temp3D.7771 => NULL
mask__1443.1516_735 = vect_dij_1732.1435_945 <= vect__1794.1500_774; // begin else if (ri < sj)
mask__1443.1516_734 = vect_dij_1732.1435_944 <= vect__1794.1500_773;
_1443 = dij_1732 <= _1794;
mask__1442.1517_733 = mask__1443.1516_735 & mask__1447.1498_779;
mask__1442.1517_732 = mask__1443.1516_734 & mask__1447.1498_778;
_1442 = _1443 & _1447;
# DEBUG BEGIN_STMT
mask__1441.1518_730 = vect_cst__807 < vect_sj_1740.1442_920;
mask__1441.1518_729 = vect_cst__807 < vect_sj_1740.1442_919;
_1441 = _1699 < 8.99999999999999966693309261245303787291049957275390625e-2;
mask__1406.1519_728 = mask__1441.1518_730 & mask__1442.1517_733;
mask__1406.1519_727 = mask__1441.1518_729 & mask__1442.1517_732; // else if (ri < sj)
_1406 = _1441 & _1442;
# DEBUG BEGIN_STMT
vect__1816.1520_725 = vect__1782.1489_800 - vect_cst__726;
vect__1816.1520_724 = vect__1782.1489_799 - vect_cst__726;
_1816 = _1782 - _1815;
vect__1235.1521_723 = -vect__1785.1491_795;
vect__1235.1521_722 = -vect__1785.1491_794;
vect__1820.1522_721 = __svml_log4_mask_e9D.8019 (vect__1235.1521_723);
vect__1820.1522_720 = __svml_log4_mask_e9D.8019 (vect__1235.1521_722);
vect__1822.1523_719 = vect__1820.1522_721 * vect__1894.1493_790;
vect__1822.1523_718 = vect__1820.1522_720 * vect__1894.1493_789;
vect__1823.1524_717 = vect__1816.1520_725 - vect__1822.1523_719;
vect__1823.1524_716 = vect__1816.1520_724 - vect__1822.1523_718;
_1823 = _1816 - Nan;
vect__1824.1525_714 = vect__1823.1524_717 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 };
vect__1824.1525_713 = vect__1823.1524_716 * { 5.0e-1, 5.0e-1, 5.0e-1, 5.0e-1 }; // end if-else
_1824 = _1823 * 5.0e-1;
# DEBUG temp4D.7772 => NULL
vect__ifc__1252.1526_711 = VEC_COND_EXPR <mask__1460.1449_904, vect__1761.1465_864, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1252.1526_710 = VEC_COND_EXPR <mask__1460.1449_903, vect__1761.1465_863, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1252 = _1460 ? _1761 : 0.0;
vect__1251.1527_709 = vect_temp0_1543.1410_1003 + vect__ifc__1252.1526_711;
vect__1251.1527_708 = vect__1251.1527_709 + vect__ifc__1252.1526_710;
_1251 = temp0_1543 + _ifc__1252;
vect__ifc__1250.1529_704 = VEC_COND_EXPR <mask__1455.1470_853, vect__1777.1483_813, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1250.1529_703 = VEC_COND_EXPR <mask__1455.1470_852, vect__1777.1483_812, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1250 = _1455 ? Nan : 0.0;
vect__1249.1530_702 = vect_temp1_2883.1411_1002 - vect__ifc__1250.1529_704;
vect__1249.1530_701 = vect__1249.1530_702 - vect__ifc__1250.1529_703;
_1249 = temp1_2883 - _ifc__1250;
vect__ifc__1248.1532_697 = VEC_COND_EXPR <mask__1450.1488_802, vect__1790.1496_783, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1248.1532_696 = VEC_COND_EXPR <mask__1450.1488_801, vect__1790.1496_782, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1248 = _1450 ? _1790 : 0.0;
vect__1247.1533_695 = vect_temp2_224.1412_1001 + vect__ifc__1248.1532_697;
vect__1247.1533_694 = vect__1247.1533_695 + vect__ifc__1248.1532_696;
_1247 = temp2_224 + _ifc__1248;
vect__ifc__1246.1535_690 = VEC_COND_EXPR <mask__1445.1502_770, vect__1810.1515_737, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1246.1535_689 = VEC_COND_EXPR <mask__1445.1502_769, vect__1810.1515_736, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1246 = _1445 ? _1810 : 0.0;
vect__1245.1536_688 = vect_temp3_2699.1413_1000 + vect__ifc__1246.1535_690;
vect__1245.1536_687 = vect__1245.1536_688 + vect__ifc__1246.1535_689;
_1245 = temp3_2699 + _ifc__1246;
vect__ifc__1244.1538_673 = VEC_COND_EXPR <mask__1406.1519_728, vect__1824.1525_714, { 0.0, 0.0, 0.0, 0.0 }>;
vect__ifc__1244.1538_672 = VEC_COND_EXPR <mask__1406.1519_727, vect__1824.1525_713, { 0.0, 0.0, 0.0, 0.0 }>;
_ifc__1244 = _1406 ? _1824 : 0.0;
vect__1243.1539_671 = vect_temp4_1545.1414_999 + vect__ifc__1244.1538_673;
vect__1243.1539_670 = vect__1243.1539_671 + vect__ifc__1244.1538_672;
_1243 = temp4_1545 + _ifc__1244;
# DEBUG temp4D.7772 => _1243
# DEBUG temp3D.7771 => _1245
# DEBUG temp2D.7770 => _1247
# DEBUG temp1D.7769 => _1249
# DEBUG temp0D.7768 => _1251
# DEBUG BEGIN_STMT
# RANGE [1, 2147483647] NONZERO 2147483647
k_1827 = k_3019 + 1;
# DEBUG temp4D.7772 => _1243
# DEBUG temp3D.7771 => _1245
# DEBUG temp2D.7770 => _1247
# DEBUG temp1D.7769 => _1249
# DEBUG temp0D.7768 => _1251
# DEBUG kD.7615 => k_1827
# DEBUG BEGIN_STMT
# PT = nonlocal escaped null
vectp.1415_997 = vectp.1415_998 + 32;
ivtmp_666 = ivtmp_667 + 1;
if (ivtmp_666 < bnd.1407_1013)
goto <bb 216>; [83.33%]
else
goto <bb 303>; [16.67%]
bb 分块的优化方案:
1:找到vec_cond_expr,将其中第一个参数mask作为上一个bb的结束,(其后还有一个mask)并且在其后新建一个该mask与0进行比较的gimple_cond,将这两个mask相与。同时新建该mask判断为true 和 false的edge,分别指向分割的bb和其下一个bb。
2:以vec_cond_expr的第二个参数的ssa_name_def作为要分割bb的末尾,进行分割。并且生成一条指向其下一个bb的edge。同时将其作为mask判断为false的edge的dest。
optimize_mask_stores 代码
10093 /* The code below is trying to perform simple optimization - revert
10094 if-conversion for masked stores, i.e. if the mask of a store is zero
10095 do not perform it and all stored value producers also if possible.
10096 For example,
10097 for (i=0; i<n; i++)
10098 if (c[i])
10099 {
10100 p1[i] += 1;
10101 p2[i] = p3[i] +2;
10102 }
10103 this transformation will produce the following semi-hammock:
10104
10105 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
10106 {
10107 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10108 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10109 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10110 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10111 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10112 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10113 }
10114 */
10115
10116 void
10117 optimize_mask_stores (class loop *loop)
10118 {
10119 basic_block *bbs = get_loop_body (loop);
10120 unsigned nbbs = loop->num_nodes;
10121 unsigned i;
10122 basic_block bb;
10123 class loop *bb_loop;
10124 gimple_stmt_iterator gsi;
10125 gimple *stmt;
10126 auto_vec<gimple *> worklist;
10127 auto_purge_vect_location sentinel;
10128
10129 vect_location = find_loop_location (loop);
10130 /* Pick up all masked stores in loop if any. */
10131 for (i = 0; i < nbbs; i++)
10132 {
10133 bb = bbs[i];
10134 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10135 gsi_next (&gsi))
10136 {
10137 stmt = gsi_stmt (gsi);
10138 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10139 worklist.safe_push (stmt);
10140 }
10141 }
10142
10143 free (bbs);
10144 if (worklist.is_empty ())
10145 return;
10146
10147 /* Loop has masked stores. */
10148 while (!worklist.is_empty ())
10149 {
10150 gimple *last, *last_store;
10151 edge e, efalse;
10152 tree mask;
10153 basic_block store_bb, join_bb;
10154 gimple_stmt_iterator gsi_to;
10155 tree vdef, new_vdef;
10156 gphi *phi;
10157 tree vectype;
10158 tree zero;
10159
10160 last = worklist.pop ();
10161 mask = gimple_call_arg (last, 2);
10162 bb = gimple_bb (last);
10163 /* Create then_bb and if-then structure in CFG, then_bb belongs to
10164 the same loop as if_bb. It could be different to LOOP when two
10165 level loop-nest is vectorized and mask_store belongs to the inner
10166 one. */
10167 e = split_block (bb, last);
10168 bb_loop = bb->loop_father;
10169 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10170 join_bb = e->dest;
10171 store_bb = create_empty_bb (bb);
10172 add_bb_to_loop (store_bb, bb_loop);
10173 e->flags = EDGE_TRUE_VALUE;
10174 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10175 /* Put STORE_BB to likely part. */
10176 efalse->probability = profile_probability::unlikely ();
10177 store_bb->count = efalse->count ();
10178 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10179 if (dom_info_available_p (CDI_DOMINATORS))
10180 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10181 if (dump_enabled_p ())
10182 dump_printf_loc (MSG_NOTE, vect_location,
10183 "Create new block %d to sink mask stores.",
10184 store_bb->index);
10185 /* Create vector comparison with boolean result. */
10186 vectype = TREE_TYPE (mask);
10187 zero = build_zero_cst (vectype);
10188 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10189 gsi = gsi_last_bb (bb);
10190 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10191 /* Create new PHI node for vdef of the last masked store:
10192 .MEM_2 = VDEF <.MEM_1>
10193 will be converted to
10194 .MEM.3 = VDEF <.MEM_1>
10195 and new PHI node will be created in join bb
10196 .MEM_2 = PHI <.MEM_1, .MEM_3>
10197 */
10198 vdef = gimple_vdef (last);
10199 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10200 gimple_set_vdef (last, new_vdef);
10201 phi = create_phi_node (vdef, join_bb);
10202 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10203
10204 /* Put all masked stores with the same mask to STORE_BB if possible. */
10205 while (true)
10206 {
10207 gimple_stmt_iterator gsi_from;
10208 gimple *stmt1 = NULL;
10209
10210 /* Move masked store to STORE_BB. */
10211 last_store = last;
10212 gsi = gsi_for_stmt (last);
10213 gsi_from = gsi;
10214 /* Shift GSI to the previous stmt for further traversal. */
10215 gsi_prev (&gsi);
10216 gsi_to = gsi_start_bb (store_bb);
10217 gsi_move_before (&gsi_from, &gsi_to);
10218 /* Setup GSI_TO to the non-empty block start. */
10219 gsi_to = gsi_start_bb (store_bb);
10220 if (dump_enabled_p ())
10221 dump_printf_loc (MSG_NOTE, vect_location,
10222 "Move stmt to created bb\n%G", last);
10223 /* Move all stored value producers if possible. */
10224 while (!gsi_end_p (gsi))
10225 {
10226 tree lhs;
10227 imm_use_iterator imm_iter;
10228 use_operand_p use_p;
10229 bool res;
10230
10231 /* Skip debug statements. */
10232 if (is_gimple_debug (gsi_stmt (gsi)))
10233 {
10234 gsi_prev (&gsi);
10235 continue;
10236 }
10237 stmt1 = gsi_stmt (gsi);
10238 /* Do not consider statements writing to memory or having
10239 volatile operand. */
10240 if (gimple_vdef (stmt1)
10241 || gimple_has_volatile_ops (stmt1))
10242 break;
10243 gsi_from = gsi;
10244 gsi_prev (&gsi);
10245 lhs = gimple_get_lhs (stmt1);
10246 if (!lhs)
10247 break;
10248
10249 /* LHS of vectorized stmt must be SSA_NAME. */
10250 if (TREE_CODE (lhs) != SSA_NAME)
10251 break;
10252
10253 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10254 {
10255 /* Remove dead scalar statement. */
10256 if (has_zero_uses (lhs))
10257 {
10258 gsi_remove (&gsi_from, true);
10259 continue;
10260 }
10261 }
10262
10263 /* Check that LHS does not have uses outside of STORE_BB. */
10264 res = true;
10265 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10266 {
10267 gimple *use_stmt;
10268 use_stmt = USE_STMT (use_p);
10269 if (is_gimple_debug (use_stmt))
10270 continue;
10271 if (gimple_bb (use_stmt) != store_bb)
10272 {
10273 res = false;
10274 break;
10275 }
10276 }
10277 if (!res)
10278 break;
10279
10280 if (gimple_vuse (stmt1)
10281 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10282 break;
10283
10284 /* Can move STMT1 to STORE_BB. */
10285 if (dump_enabled_p ())
10286 dump_printf_loc (MSG_NOTE, vect_location,
10287 "Move stmt to created bb\n%G", stmt1);
10288 gsi_move_before (&gsi_from, &gsi_to);
10289 /* Shift GSI_TO for further insertion. */
10290 gsi_prev (&gsi_to);
10291 }
10292 /* Put other masked stores with the same mask to STORE_BB. */
10293 if (worklist.is_empty ()
10294 || gimple_call_arg (worklist.last (), 2) != mask
10295 || worklist.last () != stmt1)
10296 break;
10297 last = worklist.pop ();
10298 }
10299 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10300 }
10301 }
|
optimize_mask_vec_cond 代码
10093 void
10094 optimize_mask_vec_cond (class loop *loop)
10095 {
10096 basic_block *bbs = get_loop_body (loop);
10097 unsigned nbbs = loop->num_nodes;
10098 unsigned i;
10099 basic_block bb, bb_mask;
10100 class loop *bb_loop;
10101 gimple_stmt_iterator gsi;
10102 gimple *stmt;
10103 auto_vec<gimple *> worklist;
10104 auto_purge_vect_location sentinel;
10105
10106 enum tree_code code;
10107
10108 vect_location = find_loop_location (loop);
10109 /* Pick up all vec_cond_expr in loop if any. */
10110 for (i = 0; i < nbbs; i++)
10111 {
10112 bb = bbs[i];
10113 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10114 gsi_next (&gsi))
10115 {
10116 stmt = gsi_stmt (gsi);
10117 if (is_gimple_assign(stmt)) {
10118 gassign *stmt_assign = dyn_cast <gassign *> (gsi_stmt (gsi));
10119 code = gimple_assign_rhs_code (stmt_assign);
10120 // 检查语句是否为 VEC_COND_EXPR
10121 if (code == VEC_COND_EXPR) {
10122 worklist.safe_push (stmt);
10123 }
10124 }
10125 }
10126 }
10128 free (bbs);
10129 if (worklist.is_empty ())
10130 return;
10131
10132 /* Loop has vec_cond_expr. */
10133 while (!worklist.is_empty ())
10134 {
10135 gimple *last, *last_store, *last1;
10136 edge e, efalse;
10137 tree mask;
10138 basic_block store_bb, join_bb;
10139 gimple_stmt_iterator gsi_to;
10140 gimple_stmt_iterator gsi_stmt_def;
10141 tree vdef, new_vdef;
10142 gphi *phi;
10143 tree vectype;
10144 tree zero;
10145
10146 last = worklist.pop ();
10147 gassign *stmt_assign = dyn_cast <gassign *> (last);
10148 mask = gimple_assign_rhs1(stmt_assign);
10149 tree true_vector_operand = gimple_assign_rhs2(stmt_assign);
10150
10151 gimple *mask_def = SSA_NAME_DEF_STMT (mask);
10152
10153 gimple *stmt_def = SSA_NAME_DEF_STMT (true_vector_operand);
10154
10155 bb = gimple_bb (stmt_def);
10156
10157 // bb_mask = gimple_bb (mask_def);
10158 /* Create then_bb and if-then structure in CFG, then_bb belongs to
10159 the same loop as if_bb. It could be different to LOOP when two
10160 level loop-nest is vectorized and mask_store belongs to the inner
10161 one. */
10162
10163 gsi_stmt_def = gsi_for_stmt (stmt_def);
10164 gsi_next(&gsi_stmt_def);
10165
10166 stmt_def = gsi_stmt(gsi_stmt_def);
10167
10168 e = split_block (bb, stmt_def);
10169 bb_loop = bb->loop_father;
10170 // gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10171 join_bb = e->dest;
10172 store_bb = create_empty_bb (bb);
10173 add_bb_to_loop (store_bb, bb_loop);
10174 e->flags = EDGE_TRUE_VALUE;
10175 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10176 /* Put STORE_BB to likely part. */
10177 efalse->probability = profile_probability::unlikely ();
10178 store_bb->count = efalse->count ();
10179 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10180 if (dom_info_available_p (CDI_DOMINATORS))
10181 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10182 if (dump_enabled_p ())
10183 dump_printf_loc (MSG_NOTE, vect_location,
10184 "Create new block %d to sink vect cond expr",
10185 store_bb->index);
10186 /* Create vector comparison with boolean result. */
10187 vectype = TREE_TYPE (mask);
10188 zero = build_zero_cst (vectype);
10189 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10190 // gsi = gsi_last_bb (bb);
10191 gsi = gsi_for_stmt (mask_def);
10192 gsi_next(&gsi);
10193 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10194 /* Create new PHI node for vdef of the last masked store:
10195 .MEM_2 = VDEF <.MEM_1>
10196 will be converted to
10197 .MEM.3 = VDEF <.MEM_1>
10198 and new PHI node will be created in join bb
10199 .MEM_2 = PHI <.MEM_1, .MEM_3>
10200 */
10201 /* vdef = gimple_vdef (last);
10202 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10203 gimple_set_vdef (last, new_vdef);
10204 phi = create_phi_node (vdef, join_bb);
10205 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);*/
10206
10207 /* Put all masked stores with the same mask to STORE_BB if possible. */
10208 // while (true)
10209 // {
10210 gimple_stmt_iterator gsi_from;
10211 gimple *stmt1 = NULL;
10213 /* Move vec_cond second var def to STORE_BB. */
10214 last_store = stmt_def;
10215 gsi = gsi_for_stmt (stmt_def);
10216 gsi_from = gsi;
10217 /* Shift GSI to the previous stmt for further traversal. */
10218 gsi_prev (&gsi);
10219 gsi_to = gsi_start_bb (store_bb);
10220 gsi_move_before (&gsi_from, &gsi_to);
10221 /* Setup GSI_TO to the non-empty block start. */
10222 gsi_to = gsi_start_bb (store_bb);
10223 if (dump_enabled_p ())
10224 dump_printf_loc (MSG_NOTE, vect_location,
10225 "Move stmt to created bb\n%G", last);
10226 /* Move all stored value producers if possible. */
10227 while (!gsi_end_p (gsi))
10228 {
10229 tree lhs;
10230 imm_use_iterator imm_iter;
10231 use_operand_p use_p;
10232 bool res;
10233
10234 /* Skip debug statements. */
10235 if (is_gimple_debug (gsi_stmt (gsi)))
10236 {
10237 gsi_prev (&gsi);
10238 continue;
10239 }
10240 stmt1 = gsi_stmt (gsi);
10241 /* Do not consider statements writing to memory or having
10242 volatile operand. */
10243 if (gimple_vdef (stmt1)
10244 || gimple_has_volatile_ops (stmt1))
10245 break;
10246 gsi_from = gsi;
10247 gsi_prev (&gsi);
10248 lhs = gimple_get_lhs (stmt1);
10249 if (!lhs)
10250 break;
10251
10252 /* LHS of vectorized stmt must be SSA_NAME. */
10253 if (TREE_CODE (lhs) != SSA_NAME)
10254 break;
10255
10256 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10257 {
10258 /* Remove dead scalar statement. */
10259 /* if (has_zero_uses (lhs))
10260 {
10261 gsi_remove (&gsi_from, true);
10262 continue;
10263 }*/
10264 }
10265
10266 /* Check that LHS does not have uses outside of STORE_BB. */
10267 res = true;
10268 /* FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10269 {
10270 gimple *use_stmt;
10271 use_stmt = USE_STMT (use_p);
10272 if (is_gimple_debug (use_stmt))
10273 continue;
10274 if (gimple_bb (use_stmt) != store_bb)
10275 {
10276 res = false;
10277 break;
10278 }
10279 }*/
10280 if (!res)
10281 break;
10282
10283 /* if (gimple_vuse (stmt1)
10284 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10285 break;*/
10286
10287 /* Can move STMT1 to STORE_BB. */
10288 if (dump_enabled_p ())
10289 dump_printf_loc (MSG_NOTE, vect_location,
10290 "Move stmt to created bb\n%G", stmt1);
10291 gsi_move_before (&gsi_from, &gsi_to);
10292 /* Shift GSI_TO for further insertion. */
10293 gsi_prev (&gsi_to);
10294 }
10295 /* Put other masked stores with the same mask to STORE_BB. */
10296 /* if (worklist.is_empty ()
10297 || gimple_call_arg (worklist.last (), 2) != mask
10298 || worklist.last () != stmt1)
10299 break;
10300 last = worklist.pop ();*/
10301 // last1 = worklist.pop ();
10302 // }
10303 // add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10304 if (!worklist.is_empty ())
10305 last = worklist.pop ();
10306 }
10307 }
|
能够按照预期进行拆分bb块,同时解决编译不过的两个问题:
1:加上-g 之后,在fre pass 会报错,在对debug gimple 进行分析删除的时候,找不到某个标量的定义。 最后一个分支的标量gimple被直接删除了,没有生成debug gimple。导致后面debug gimple 使用到该标量是找不到其定义,报编译错误。解决方法,先去掉-g。后续在dce pass 中找删除标量和插入debug的逻辑。# DEBUG D#583 => D#597 ? _2164 : 0.0
2:在sink pass 中报编译错误,gimple_redirect_edge_and_branch函数中,assert不通过,需要该edge 是一个fallthru edge。在构造edge的时候需要生成。暂时注释掉。
default: 6134 /* Otherwise it must be a fallthru edge, and we don't need to 6135 do anything besides redirecting it. */ 6136 // gcc_assert (e->flags & EDGE_FALLTHRU); |
解决掉编译错误后,可以正确编译运行,但是结果错误。
原因是该loop 的 vf是8.每次会对loop 中的8个元素进行运算,计算mask的数据是double类型,会生成两个mask。每个分支需要对两个mask同时和{0,0,0,0}比较是否为0,目前只能进行一个mask的比较。可以的方法:
1:修改loop 中int 的类型使其在确定vf的时候将其作为double 看待(VIEW_CONVERT_EXPR),这样vf 是4, 就不存在两个mask。
2:gimple cond 不能支持这种if ( a==0 && b==0) 这种复杂条件表达,构造两个gimple cond。然后做&运算,将此条件作为需要判断的cond。
1761处循环:
1:在每个分支条件构造后插入两个mask按位或的gimple,并且以此新建一个gimple cond,作为分支判断的条件。
2:课题运算结果出现 VE(结果校验错误),需查找原因。从打印每个分支运算结果来看,temp4 的结果恒为 0,即最后一个分支完全没有执行到,存在问题;同时加上 -g 后报错,也是最后一个分支的标量语句被删除,怀疑最后一个分支在拆分 bb 时存在问题。(为确认正确结果需在源码中打印中间结果,但目前无法进行打印)。
Lhs use outside of BB。当其使用的outside BB是 VEC_COND 所在的BB认为是没问题的,其他情况需要进行添加phi节点操作
![]()



2中的stmt的 lhs res在4 里面被使用,原本在同一个bb里面不需要做额外的操作,当分到不同的bb后,走不走2 res的值会不同,如果不走4中用的res会使用上一次2中计算的res值,显然结果错误,需要添加phi节点来解决。
若2中的lhs res0 被 4 use ,需要在 2的上一个bb 1新建一个向量变量res1 = 0,在2 的下一个bb 3中,新建一个phi节点,res2 = phi<res1(1),res0(2)>, 并且将4中用到res0的地方改为res2。
若2中的lhs res0 被 4 use ,需要在 2的上一个bb 1新建一个向量变量res1 = 0,将2中的res0 = xx 修改为 res2 = xx,在2 的下一个bb 3中,新建一个phi节点,res0 = phi<res1(1),res2(2)>。
若2中的res0 2中的其他stmt使用到,则需要将所有用到res0的地方改成res2
对于多个分支都要用到的计算结果,不能只在第一个分支所在的 bb 中计算一次;在每个用到该结果的分支 bb 中,都需要重新进行该计算(即把该计算涉及的全部语句复制到使用处之前)。
新增phi节点的代码
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10283 {
10284 gimple *use_stmt;
10285 use_stmt = USE_STMT (use_p);
10286 if (is_gimple_debug (use_stmt))
10287 continue;
10288 if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last))
10289 {
10290 // res = false;
10291
10292 if (dump_enabled_p ())
10293 dump_printf_loc (MSG_NOTE, vect_location,
10294 "LHS have use outside of store_BB\n%G", stmt1);
10295 tree lhs_use_out,new_lhs,new_lhs1,new_lhs2;
10296 tree new_lhs_phi;
10297 gphi *phi;
10298 tree vectype;
10299 tree zero;
10300 gimple *zero_def;
10301 lhs_use_out = gimple_assign_lhs(stmt1);
10302
10303 /* if (is_gimple_assign(stmt1)) {
10304 lhs_use_out = gimple_assign_lhs(stmt1);
10305 new_lhs = create_tmp_var(TREE_TYPE(lhs_use_out), "new_tmp_var");
10306 new_lhs_phi = make_ssa_name(new_lhs,NULL);
10307 // gimple_assign_set_lhs(stmt1, new_lhs1);
10308
10309
10310 phi = create_phi_node (new_lhs_phi, join_bb);
10311 add_phi_arg (phi, lhs_use_out, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10312
10313 vectype = TREE_TYPE (lhs_use_out);
10314 zero = build_zero_cst (vectype);
10315 new_lhs1 = create_tmp_var(TREE_TYPE(lhs_use_out), "new_tmp_var1");
10316 new_lhs2 = make_ssa_name(new_lhs1,NULL);
10317 zero_def = gimple_build_assign(new_lhs2, zero);
10318
10319 // basic_block stmt_bb = gimple_bb(stmt1);
10320 edge e_temp;
10321 edge_iterator ei;
10322 basic_block pred_bb;
10323 gimple_stmt_iterator gsi_temp;
10324
10325 // if (EDGE_COUNT(stmt_bb->preds) == 1) {
10326 e_temp = EDGE_PRED(store_bb, 0);
10327 pred_bb = e_temp->src;
10328 gsi_temp = gsi_start_bb(pred_bb);
10329 gsi_insert_before(&gsi_temp, zero_def, GSI_SAME_STMT);
10330 // }
10331
10332 add_phi_arg (phi, new_lhs2, e, UNKNOWN_LOCATION);
10333 // update_stmt (phi);
10334
10335 /* edge e_join;
10336 edge_iterator ei_join;
10337
10338 FOR_EACH_EDGE(e_join, ei_join, join_bb->succs)
10339 {
10340 if (EDGE_TRUE_P(e_join))
10341 {
10342 *true_bb = e->dest;
10343 }
10344 }*/
10345
10346 for (unsigned int i = 0; i < gimple_num_ops(use_stmt); i++) {
10347 tree rhs = gimple_op(use_stmt, i);
10348 if(rhs == lhs_use_out) {
10349 gimple_stmt_iterator gsi = gsi_for_stmt(use_stmt);
10350 gsi_insert_before (&gsi,stmt1,GSI_SAME_STMT);
10351 break;
10352 // create_new_def_for (rhs, phi,gimple_phi_result_ptr (phi));
10353 // update_stmt (phi);
10354 }
10355 }
10356 // }
|
2069处循环:
1:需要进行dim=3的常量传播,加上拆分循环这两个条件。验证前一个循环向量化后有7%的性能,加上ymm寄存器后有11%的性能。
2:查看gcc的loop split 和 loop distribute pass,发现loop distribute的总体思想是将能够向量化的代码最大限度拆分到一个循环中,(1)但其只对非嵌套循环的最内层循环分析,发现其dump的信息中没有对2069循环进行distribute。(2)同时其只能对没有数据依赖的部分distribute,源码有数据依赖的部分使用临时数组存储后进行拆分,需要自行编写代码实现。
549课题在mask store中涉及的运算上对数学函数添加mask代码
1 #include "config.h"
2 #include "system.h"
3 #include "coretypes.h"
4 #include "backend.h"
5 #include "tree.h"
6 #include "gimple.h"
7 #include "predict.h"
8 #include "tree-pass.h"
9 #include "ssa.h"
10 #include "cgraph.h"
11 #include "fold-const.h"
12 #include "stor-layout.h"
13 #include "gimple-iterator.h"
14 #include "gimple-walk.h"
15 #include "tree-ssa-loop-manip.h"
16 #include "tree-ssa-loop-niter.h"
17 #include "tree-cfg.h"
18 #include "cfgloop.h"
19 #include "tree-vectorizer.h"
20 #include "tree-ssa-propagate.h"
21 #include "dbgcnt.h"
22 #include "tree-scalar-evolution.h"
23 #include "stringpool.h"
24 #include "attribs.h"
25 #include "gimple-pretty-print.h"
26 #include "opt-problem.h"
27 #include "internal-fn.h"
28 #include "tree-ssa-sccvn.h"
29 #include "gimple-expr.h"
30 #include <cstdio>
31
32 namespace
33 {
/* Pass metadata for the new "mask_vecmath_func" GIMPLE pass: it runs after
   loop vectorization (its own timevar TV_TREE_VECT_MASK_VECMATH_FUNC) and
   requires CFG + SSA.  It rewrites unmasked SVML vector-math calls feeding
   a MASK_STORE into their masked "__svml_*_mask_e9" counterparts.  */
34 const pass_data pass_data_test = {
35 GIMPLE_PASS, /* type */
36 "mask_vecmath_func", /* name */
37 OPTGROUP_NONE, /* optinfo_flags */
38 TV_TREE_VECT_MASK_VECMATH_FUNC, /* tv_id */
39 (PROP_cfg | PROP_ssa), /* properties_required */
40 0, /* properties_provided */
41 0, /* properties_destroyed */
42 0, /* todo_flags_start */
43 0, /* todo_flags_finish */
44 };
45
/* GIMPLE optimization pass wrapper.  The pass is gated on the command-line
   flag flag_tree_mask_vecmath_func; the real work happens in execute ()
   (defined below, outside the class body).  */
46 class pass_mask_vecmath_func : public gimple_opt_pass
47 {
48 public:
49 pass_mask_vecmath_func (gcc::context *ctxt) : gimple_opt_pass (pass_data_test, ctxt) {}
50 virtual bool
51 gate (function *fun)
52 {
53 // Enabled only when -ftree-mask-vecmath-func (flag_tree_mask_vecmath_func) is set.
54 return flag_tree_mask_vecmath_func;
55 }
56
57 virtual unsigned int execute (function *);
58 };
59
60
61 static void add_mask_to_call(gimple *stmt, tree new_arg, const char *func_name) {
62 if (!is_gimple_call(stmt)) {
63 // 如果不是函数调用语句,则不做任何操作
64 return;
65 }
66
67 // 获取原始函数调用的目标和参数列表
68 tree call_fn = gimple_call_fndecl(stmt);
69
70 // 获取或创建新的标识符节点来表示新的函数名称
71 tree new_func_id;
72 if(strcmp(func_name, "vmldCos2") == 0)
73 new_func_id = get_identifier("__svml_cos2_mask_e9");
74 else if (strcmp(func_name, "vmldExp2") == 0)
75 new_func_id = get_identifier("__svml_exp2_mask_e9");
76 else if (strcmp(func_name, "vmldSin2") == 0)
77 new_func_id = get_identifier("__svml_sin2_mask_e9");
78 else if (strcmp(func_name, "sin.simdclone.2") == 0)
79 new_func_id = get_identifier("__svml_sin4_mask_e9");
80 else if (strcmp(func_name, "cos.simdclone.2") == 0)
81 new_func_id = get_identifier("__svml_cos4_mask_e9");
82 else if (strcmp(func_name, "exp.simdclone.2") == 0)
83 new_func_id = get_identifier("__svml_exp4_mask_e9");
84
85 tree fntype = TREE_TYPE(call_fn);
87 tree new_fndecl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, new_func_id, fntype);
88
89 TREE_PUBLIC (new_fndecl) = 1;
90 DECL_EXTERNAL (new_fndecl) = 1;
91 DECL_IS_NOVOPS (new_fndecl) = 1;
92 TREE_READONLY (new_fndecl) = 1;
93
94
95 // 将新的标识符节点分配给函数声明的汇编名
96 // DECL_ASSEMBLER_NAME(call_fn) = new_func_id;
97
98 int num_args = gimple_call_num_args(stmt);
99 vec<tree> vargs = vNULL;
100 vargs.create (num_args+1);
101
102 // 创建一个新的参数列表,包含原始的参数和新的参数
103 for (int i = 0; i < num_args; i++) {
104 tree arg = gimple_call_arg(stmt, i);
105 vargs.safe_push(arg);
106 }
107 vargs.safe_push(new_arg);
108
109 tree lhs = gimple_call_lhs(stmt);
110
111 // 创建新的函数调用语句,包含新的参数
112 gimple *new_call = gimple_build_call_vec(new_fndecl,vargs);
113 gimple_call_set_lhs (new_call, lhs);
114
115 // 替换原始的函数调用语句
116 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
117
118 // printf ("-------------finish add mask to vecmath func call------------.\n");
119
120 gsi_replace(&gsi, new_call,true);
121 stmt = new_call;
122
123 // 释放参数列表的内存
124 vargs.release ();
125 }
126
/* Walk backwards along the SSA def chain starting at OPERAND (whose
   defining statement is STMT), looking for a call to one of the known
   SVML vector math functions; when one is found, rewrite it into its
   masked variant by appending MASK (see add_mask_to_call) and stop
   descending that branch.  The walk only recurses through GIMPLE
   assignments; PHIs and other statement kinds terminate a branch, which
   also keeps the recursion finite on loop-carried SSA cycles.
   NOTE(review): several branches of the walk may each find a math call,
   so more than one call can be masked per invocation -- confirm that is
   intended.  */
127 static void find_relate_operand(tree operand, gimple *stmt, tree mask)
128 {
129 if (!stmt)
130 return ;
131
132 if (TREE_CODE (operand) == SSA_NAME && is_gimple_call(stmt)) { // operand is ssa && stmt is gimple call
133 tree fndecl = gimple_call_fndecl(stmt); // fetch the callee declaration
134 if (fndecl && DECL_P(fndecl)) { // make sure fndecl is valid and really a declaration
135 const char *func_name = IDENTIFIER_POINTER(DECL_NAME(fndecl)); // get the callee's name
136 // if (strcmp(func_name, "vmldLn2") == 0) {
137 if (strcmp(func_name, "vmldCos2") == 0 ||
138 strcmp(func_name, "vmldExp2") == 0 ||
139 strcmp(func_name, "vmldSin2") == 0 ||
140 strcmp(func_name, "exp.simdclone.2") == 0 ||
141 strcmp(func_name, "cos.simdclone.2") == 0 ||
142 strcmp(func_name, "sin.simdclone.2") == 0) {
143 // printf ("-------------find math func------------.\n");
144 add_mask_to_call(stmt,mask,func_name);
145 return ;
146 }
147 }
148 }
149 if (TREE_CODE (operand) == SSA_NAME && is_gimple_assign(stmt)) { // only recurse through gimple assigns
150
151 for (unsigned i = 1; i < gimple_num_ops(stmt); ++i) { // visit each RHS operand of the assign
152 tree op = gimple_op(stmt, i);
153 if(TREE_CODE (op) == SSA_NAME) {
154
155 gimple *stmt_2 = SSA_NAME_DEF_STMT (op);
156 find_relate_operand(op,stmt_2,mask);
157 // if(result) return result;
158 }
159 }
160 }
161 return ;
162 }
163
164
/* Pass entry point.  Scan every basic block of FUN for internal
   IFN_MASK_STORE calls; for each one, take its mask (arg 2) and stored
   value (arg 3) and, when the value's definition lives in the same basic
   block as the store, walk the value's def chain to find and mask the
   SVML math call that produced it (find_relate_operand).
   The commented-out region is an earlier per-loop variant driven by
   LOOP_VINFO_HAS_MASK_STORE, kept for reference.
   Always returns 0 (no TODO flags).  */
165 unsigned
166 pass_mask_vecmath_func::execute (function *fun)
167 {
168 unsigned ret = 0;
169
170 basic_block bb;
/* NOTE(review): CODE is declared but never used in the live code path.  */
171 enum tree_code code;
172 FOR_EACH_BB_FN(bb, fun) {
173 gimple_stmt_iterator gsi;
174
175 /* for (int i = 1; i < number_of_loops (fun); i++)
176 {
177 loop_vec_info loop_vinfo;
178 bool has_mask_store;
179
180 class loop *loop = get_loop (fun, i);
181 if (!loop || !loop->aux)
182 continue;
183 loop_vinfo = (loop_vec_info) loop->aux;
184 has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
185 delete loop_vinfo;
186 if (has_mask_store) {
187
188 printf ("-------------have mask store------------.\n");
189
190 basic_block *bbs = get_loop_body (loop);
191 unsigned nbbs = loop->num_nodes;
192 unsigned i;
193 basic_block bb;
194 class loop *bb_loop;
195 gimple_stmt_iterator gsi;
196 gimple *stmt;
197
198 for (i = 0; i < nbbs; i++)
199 {
200 bb = bbs[i];*/
201 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
202 gsi_next (&gsi))
203 {
204 gimple *stmt = gsi_stmt (gsi);
205 if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) {
206 // printf ("------------ find mask store------------.\n");
207 basic_block bb1 = gimple_bb(stmt);
/* IFN_MASK_STORE (ptr, align, mask, value): arg 2 is the mask vector,
   arg 3 is the vector value being conditionally stored.  */
208 tree mask = gimple_call_arg (stmt, 2);
209 tree value = gimple_call_arg (stmt, 3);
210 if(TREE_CODE (value) == SSA_NAME) {
211 gimple *value_def = SSA_NAME_DEF_STMT (value);
212 basic_block bb2 = gimple_bb(value_def);
213 // printf ("-------------begin find relate operand------------.\n");
214 if(bb1 == bb2) // only handle mask store and value def in the same bb
215 find_relate_operand(value,value_def,mask);
216 }
217 }
218 }
219
220 // free (bbs);
221 }
222 // }
223 // }
224
225 return ret;
226
227 }
228 }
229
230 gimple_opt_pass *
231 make_pass_mask_vecmath_func (gcc::context *ctxt)
232 {
233 return new pass_mask_vecmath_func (ctxt);
234 }
|
10092
10093
/* optimize_mask_vec_cond: run on a vectorized LOOP.  For each
   VEC_COND_EXPR assignment collected from the loop body, split the basic
   block just after the defining statement of the VEC_COND's "true"
   operand, build combined_mask = mask | mask2, insert a GIMPLE_COND
   "combined_mask == {0,...}" whose false edge leads to a new STORE_BB,
   and sink the producers of the selected value into STORE_BB so they only
   execute when at least one mask lane is set.  Producers whose LHS is
   also used outside STORE_BB are re-materialized right before the outside
   use (instead of adding a PHI).  The single-VEC_COND case is skipped.  */
10094 void
10095 optimize_mask_vec_cond (class loop *loop)
10096 {
10097 basic_block *bbs = get_loop_body (loop);
10098 unsigned nbbs = loop->num_nodes;
10099 unsigned i;
10100 basic_block bb, bb_mask;
10101 class loop *bb_loop;
10102 gimple_stmt_iterator gsi;
10103 gimple *stmt;
10104 auto_vec<gimple *> worklist;
10105 auto_purge_vect_location sentinel;
10106
10107 enum tree_code code;
10108
10109 vect_location = find_loop_location (loop);
10110 /* Pick up all vec_cond_expr in loop if any. */
10111 for (i = 0; i < nbbs; i++)
10112 {
10113 bb = bbs[i];
10114 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10115 gsi_next (&gsi))
10116 {
10117 stmt = gsi_stmt (gsi);
10118 if (is_gimple_assign(stmt)) {
10119 gassign *stmt_assign = dyn_cast <gassign *> (gsi_stmt (gsi));
10120 code = gimple_assign_rhs_code (stmt_assign);
10121 // check whether the statement is a VEC_COND_EXPR
10122 if (code == VEC_COND_EXPR) {
10123 worklist.safe_push (stmt);
10124 }
10125 }
10126 }
10127 }
10128
10129 free (bbs);
10130 if (worklist.is_empty () || worklist.length()==1)
10131 return;
10132
10133 /* Loop has vec_cond_expr. */
10134 while (!worklist.is_empty ())
10135 {
10136 gimple *last, *last_store, *last1;
10137 edge e, efalse;
10138 tree mask,mask2;
10139 basic_block store_bb, join_bb;
10140 gimple_stmt_iterator gsi_to;
10141 gimple_stmt_iterator gsi_stmt_def,gsi_mask_def;
10142 tree vdef, new_vdef;
10143 gphi *phi;
10144 tree vectype;
10145 tree zero_vector;
10146
/* LAST is the VEC_COND_EXPR assign: rhs1 is the mask, rhs2 the value
   selected when a mask lane is true.  */
10147 last = worklist.pop ();
10148 gassign *stmt_assign = dyn_cast <gassign *> (last);
10149 mask = gimple_assign_rhs1(stmt_assign);
10150 tree true_vector_operand = gimple_assign_rhs2(stmt_assign);
10151
10152 gimple *mask_def = SSA_NAME_DEF_STMT (mask);
10153
/* NOTE(review): the second mask is taken from the statement textually
   just before MASK's definition -- this relies on the statement layout
   produced by if-conversion (two masks per VF=8 double loop); confirm it
   holds for other inputs.  */
10154 gsi_mask_def = gsi_for_stmt(mask_def);
10155 gsi_prev(&gsi_mask_def);
10156 gimple *mask2_def = gsi_stmt(gsi_mask_def);
10157 gassign *stmt_mask2 = dyn_cast <gassign *> (mask2_def);
10158 mask2 = gimple_assign_lhs(stmt_mask2);
10159
10160
10161 gimple *stmt_def = SSA_NAME_DEF_STMT (true_vector_operand);
10162
10163 bb = gimple_bb (stmt_def);
10164
10165 /* Create then_bb and if-then structure in CFG, then_bb belongs to
10166 the same loop as if_bb. It could be different to LOOP when two
10167 level loop-nest is vectorized and mask_store belongs to the inner
10168 one. */
10169
/* Split after the statement FOLLOWING the value definition, so the
   definition itself stays in BB and can then be moved into STORE_BB.  */
10170 gsi_stmt_def = gsi_for_stmt (stmt_def);
10171 gsi_next(&gsi_stmt_def);
10172
10173 stmt_def = gsi_stmt(gsi_stmt_def);
10174
10175 e = split_block (bb, stmt_def);
10176 bb_loop = bb->loop_father;
10177 // gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10178 join_bb = e->dest;
10179 store_bb = create_empty_bb (bb);
10180 add_bb_to_loop (store_bb, bb_loop);
10181 e->flags = EDGE_TRUE_VALUE;
10182 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10183 /* Put STORE_BB to likely part. */
10184 efalse->probability = profile_probability::unlikely ();
10185 store_bb->count = efalse->count ();
10186 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10187 if (dom_info_available_p (CDI_DOMINATORS))
10188 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10189 if (dump_enabled_p ())
10190 dump_printf_loc (MSG_NOTE, vect_location,
10191 "Create new block %d to sink vect cond expr",
10192 store_bb->index);
10193 /* Create vector comparison with boolean result. */
10194 vectype = TREE_TYPE (mask);
10195 zero_vector = build_zero_cst (vectype);
10196
/* combined_mask = mask | mask2: STORE_BB is bypassed only when both
   masks are entirely zero.  */
10197 tree combined_mask = create_tmp_var(TREE_TYPE(zero_vector), "combined_mask");
10198
10199 gimple *combine_stmt1 = gimple_build_assign(combined_mask, BIT_IOR_EXPR, mask, mask2);
10200
10201 gsi = gsi_for_stmt (mask_def);
10202 gsi_next(&gsi);
10203 gsi_insert_after (&gsi, combine_stmt1, GSI_SAME_STMT);
10204
10205 /* vec<constructor_elt, va_gc> *ret_ctor_elts_tmp = NULL;
10206 vec_alloc (ret_ctor_elts_tmp, 2);
10207 CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask2); // append the second mask
10208 CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask); // append the first mask
10209
10210 // tree signed_boolean_type = build_nonstandard_integer_type(64, 1);
10211 tree signed_boolean_type = build_nonstandard_boolean_type(64);
10212
10213 tree vect_type = build_vector_type(signed_boolean_type, 4);
10214 tree constructor = build_constructor(vect_type, ret_ctor_elts_tmp);
10215
10216 tree new_var_constru = create_tmp_var(vect_type, "mask_array");
10217 gimple *new_stmt_construc = gimple_build_assign(make_ssa_name(new_var_constru), constructor);
10218 gsi_next(&gsi);
10219 gsi_insert_after (&gsi, new_stmt_construc, GSI_SAME_STMT);*/
10220
/* The branch condition: skip STORE_BB when combined_mask == {0,...}.  */
10221 gimple *gcond = gimple_build_cond(EQ_EXPR, combined_mask, zero_vector, NULL, NULL);
10222 gsi_next(&gsi);
10223 gsi_insert_after(&gsi, gcond, GSI_NEW_STMT);
10224
10225
10226 /* Put all masked stores with the same mask to STORE_BB if possible. */
10227 // while (true)
10228 // {
10229 gimple_stmt_iterator gsi_from;
10230 gimple *stmt1 = NULL;
10231
10232 /* Move vec_cond second var def to STORE_BB. */
10233 last_store = stmt_def;
10234 gsi = gsi_for_stmt (stmt_def);
10235 gsi_from = gsi;
10236 /* Shift GSI to the previous stmt for further traversal. */
10237 gsi_prev (&gsi);
10238 gsi_to = gsi_start_bb (store_bb);
10239 gsi_move_before (&gsi_from, &gsi_to);
10240 /* Setup GSI_TO to the non-empty block start. */
10241 gsi_to = gsi_start_bb (store_bb);
10242 if (dump_enabled_p ())
10243 dump_printf_loc (MSG_NOTE, vect_location,
10244 "Move stmt to created bb\n%G", last);
10245 /* Move all stored value producers if possible. */
10246 while (!gsi_end_p (gsi))
10247 {
10248 tree lhs;
10249 imm_use_iterator imm_iter;
10250 use_operand_p use_p;
10251 bool res;
10252
10253 /* Skip debug statements. */
10254 if (is_gimple_debug (gsi_stmt (gsi)))
10255 {
10256 gsi_prev (&gsi);
10257 continue;
10258 }
10259 stmt1 = gsi_stmt (gsi);
10260 /* Do not consider statements writing to memory or having
10261 volatile operand. */
10262 if (gimple_vdef (stmt1)
10263 || gimple_has_volatile_ops (stmt1))
10264 break;
10265 gsi_from = gsi;
10266 gsi_prev (&gsi);
10267 lhs = gimple_get_lhs (stmt1);
10268 if (!lhs)
10269 break;
10270
10271 /* LHS of vectorized stmt must be SSA_NAME. */
10272 if (TREE_CODE (lhs) != SSA_NAME)
10273 break;
10274
10275 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10276 {
10277 /* Remove dead scalar statement. */
10278 if (has_zero_uses (lhs))
10279 {
10280 gsi_remove (&gsi_from, true);
10281 continue;
10282 }
10283 }
10284
10285 /* Check that LHS does not have uses outside of STORE_BB. */
10286 res = true;
10287 // FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10288 gimple *use_lhs;
/* If LHS is used outside STORE_BB (and outside the VEC_COND's own
   block), re-materialize STMT1's computation right before the outside
   use instead of adding a PHI node; see the notes elsewhere in this
   document for the PHI-based alternative.  */
10289 FOR_EACH_IMM_USE_STMT (use_lhs, imm_iter, lhs)
10290 {
10291 gimple *use_stmt;
10292 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) {
10293
10294 // gimple *use_stmt;
10295 use_stmt = USE_STMT (use_p);
10296 if (is_gimple_debug (use_stmt))
10297 continue;
10298 if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last))
10299 {
10300 // res = false;
10301
10302 if (dump_enabled_p ())
10303 dump_printf_loc (MSG_NOTE, vect_location,
10304 "LHS have use outside of store_BB\n%G", stmt1);
10305 tree new_lhs,new_lhs1,new_lhs2;
10306 tree new_lhs_phi;
10307 gphi *phi;
10308 tree vectype;
10309 tree zero;
10310 gimple *zero_def;
10311
10312 gimple *new_assign_stmt;
10313
/* NOTE(review): only binary assigns are handled here (rhs1/rhs2);
   unary or ternary RHS forms would be copied incorrectly -- confirm.  */
10314 if (is_gimple_assign(stmt1) && is_gimple_assign(use_lhs)) {
10315 for (unsigned int i = 1; i < gimple_num_ops(use_stmt); i++) {
10316 tree rhs = gimple_op(use_stmt, i);
10317 if(TREE_CODE (rhs) == SSA_NAME && (rhs == lhs)) {
10318
10319 if (dump_enabled_p ())
10320 dump_printf_loc (MSG_NOTE, vect_location,
10321 "insert new stmt to use out of BB\n");
10322 new_lhs = create_tmp_var(TREE_TYPE(lhs), "new_tmp_var");
10323 new_lhs1 = make_ssa_name(new_lhs,NULL);
10324 tree rhs1 = gimple_assign_rhs1(stmt1);
10325 tree rhs2 = gimple_assign_rhs2(stmt1);
10326 new_assign_stmt = gimple_build_assign(new_lhs1, gimple_assign_rhs_code(stmt1), rhs1, rhs2);
10327
10328 gimple_stmt_iterator gsi_temp = gsi_for_stmt(use_stmt);
10329 gsi_insert_before (&gsi_temp,new_assign_stmt,GSI_SAME_STMT);
10330 update_stmt(new_assign_stmt);
10331
10332 if( i == 1) {
10333
10334 gimple_assign_set_rhs1(use_stmt, new_lhs1);
10335 // update_stmt(use_stmt);
10336 }
10337 else if (i == 2) {
10338 gimple_assign_set_rhs2(use_stmt, new_lhs1);
10339 // update_stmt(use_stmt);
10340 }
10341
10342 // update_stmt(use_stmt);
10343 }
10344 }
10345 }
10346 }
10347 }
10348
10349 update_stmt(use_stmt);
10350 }
10351
10352 /* Can move STMT1 to STORE_BB. */
10353 /* if (dump_enabled_p ())
10354 dump_printf_loc (MSG_NOTE, vect_location,
10355 "Move stmt to created bb\n%G", stmt1);*/
10356 gsi_move_before (&gsi_from, &gsi_to);
10357 /* Shift GSI_TO for further insertion. */
10358 gsi_prev (&gsi_to);
10359 }
10360 if (!worklist.is_empty ())
10361 last = worklist.pop ();
10362 }
10363
10364 }
|
对 if continue的分块
10161 /* if(worklist.length()== 1) {
10162 if (dump_enabled_p ())
10163 dump_printf_loc (MSG_NOTE, vect_location,
10164 " if-continue split bb\n");
10165 tree mask_tmp2 = gimple_assign_rhs2(stmt_mask2);
10166 tree mask_tmp1 = gimple_assign_rhs2(stmt_mask1);
10167
10168 gimple *mask_temp2_def = SSA_NAME_DEF_STMT (mask_tmp2);
10169 gimple *mask_temp1_def = SSA_NAME_DEF_STMT (mask_tmp1);
10170
10171 gassign *stmt_mask_tmp2 = dyn_cast <gassign *> (mask_temp2_def);
10172 gassign *stmt_mask_tmp1 = dyn_cast <gassign *> (mask_temp1_def);
10173
10174 tree temp2_rhs1 = gimple_assign_rhs1(stmt_mask_tmp2);
10175 tree temp1_rhs1 = gimple_assign_rhs1(stmt_mask_tmp1);
10176
10177 tree target_mask3 = gimple_assign_lhs(stmt_mask_tmp2);
10178 tree target_mask4 = gimple_assign_lhs(stmt_mask_tmp1);
10179
10180 tree temp2_rhs2 = gimple_assign_rhs2(stmt_mask_tmp2);
10181 tree temp1_rhs2 = gimple_assign_rhs2(stmt_mask_tmp1);
10182
10183 gimple *target_stmt1 = SSA_NAME_DEF_STMT (temp2_rhs1);
10184 gimple *target_stmt2 = SSA_NAME_DEF_STMT (temp1_rhs1);
10185
10186 gassign *stmt_target_stmt1 = dyn_cast <gassign *> (target_stmt1);
10187 gassign *stmt_target_stmt2 = dyn_cast <gassign *> (target_stmt2);
10188
10189 tree target_mask1 = gimple_assign_lhs(stmt_target_stmt1);
10190 tree target_mask2 = gimple_assign_lhs(stmt_target_stmt2);
10191
10192
10193 gimple *target_stmt3 = SSA_NAME_DEF_STMT (temp2_rhs2);
10194 gimple *target_stmt4 = SSA_NAME_DEF_STMT (temp1_rhs2);
10195
10196 basic_block bb_tmp = gimple_bb (target_stmt1);
10197 basic_block bb_tmp_next = gimple_bb (target_stmt4);
10198 edge e_tmp;
10199 gimple_stmt_iterator target_stmt4_gsi = gsi_for_stmt(mask_temp1_def);
10200 gsi_next(&target_stmt4_gsi);
10201 gimple *target_stmt4_next = gsi_stmt(target_stmt4_gsi);
10202
10203 gimple_stmt_iterator target_stmt2_gsi = gsi_for_stmt(target_stmt2);
10204 gsi_next(&target_stmt2_gsi);
10205 gimple *target_stmt2_next = gsi_stmt(target_stmt2_gsi);
10206
10207 e_tmp = split_block (bb_tmp, target_stmt4_next);
10208 class loop *bb_loop_tmp = bb_tmp->loop_father;
10209 gcc_assert (loop == bb_loop_tmp || flow_loop_nested_p (loop, bb_loop_tmp));
10210
10211 basic_block bb_last_tmp = gimple_bb(last);
10212 basic_block join_bb_tmp;
10213 gimple *last_stmt_tmp = last_stmt(bb_last_tmp);
10214 if (last_stmt_tmp && gimple_code(last_stmt_tmp) == GIMPLE_COND) {
10215
10216 edge e_tmp2;
10217 edge_iterator ei_tmp2;
10218 basic_block true_bb;
10219
10220 FOR_EACH_EDGE(e_tmp2, ei_tmp2, bb_last_tmp->succs) {
10221 // 检查是否为 true 分支
10222 if (e_tmp2->flags & EDGE_TRUE_VALUE) {
10223 true_bb = e_tmp2->dest;
10224 }
10225 }
10226 join_bb_tmp = e_tmp->dest;
10227 basic_block store_bb_tmp = create_empty_bb (bb_tmp);
10228 add_bb_to_loop (store_bb_tmp, bb_loop_tmp);
10229 // e_tmp->flags = EDGE_TRUE_VALUE;
10230
10231 edge efalse_tmp_true = make_edge (bb_tmp, bb_last_tmp, EDGE_TRUE_VALUE);
10232 /* Put STORE_BB to likely part. */
10233 /* efalse_tmp_true->probability = profile_probability::likely ();
10234 store_bb_tmp->count = efalse_tmp_true->count ();
10235
10236 edge efalse_tmp = make_edge (bb_tmp, store_bb_tmp, EDGE_FALSE_VALUE);
10237 /* Put STORE_BB to likely part. */
10238 /* efalse_tmp->probability = profile_probability::unlikely ();
10239 store_bb_tmp->count = efalse_tmp->count ();
10240 // make_single_succ_edge (store_bb_tmp, join_bb_tmp, EDGE_FALLTHRU);
10241
10242 edge efalse_tmp_next = make_edge (store_bb_tmp, join_bb_tmp, EDGE_FALSE_VALUE);
10243 efalse_tmp_next->probability = profile_probability::unlikely ();
10244 // store_bb_tmp->count = efalse_tmp_true->count ();
10245
10246 edge etrue_tmp_next = make_edge (store_bb_tmp, bb_last_tmp, EDGE_TRUE_VALUE);
10247 etrue_tmp_next->probability = profile_probability::likely ();
10248 store_bb_tmp->count = efalse_tmp_true->count ();
10249 // true_bb = e_tmp->dest;
10250
10251 // e_tmp->dest = NULL;
10252 // e_tmp->flags = EDGE_TRUE_VALUE;
10253
10254 edge e_dele = find_edge(bb_tmp, join_bb_tmp);
10255 if (e_dele) {
10256 remove_edge(e_dele); // 删除这条边
10257 }
10258
10259 // true_bb->preds = chainon(true_bb->preds, e_tmp);
10260 add_to_dominance_info(CDI_DOMINATORS,join_bb_tmp);
10261
10262 if (dom_info_available_p (CDI_DOMINATORS)) {
10263 set_immediate_dominator (CDI_DOMINATORS, store_bb_tmp, bb_tmp);
10264 set_immediate_dominator (CDI_DOMINATORS, join_bb_tmp, store_bb_tmp);
10265 set_immediate_dominator (CDI_DOMINATORS, bb_last_tmp, bb_tmp);
10266 // free_dominance_info(CDI_DOMINATORS);
10267 calculate_dominance_info(CDI_DOMINATORS);
10268 }
10269
10270 // free_dominance_info(CDI_DOMINATORS);
10271 // calculate_dominance_info(CDI_DOMINATORS);
10272
10273 tree vectype_tmp = TREE_TYPE (mask_tmp1);
10274 tree zero_vector_tmp = build_zero_cst (vectype_tmp);
10275
10276 tree combined_mask_tmp = create_tmp_var(TREE_TYPE(zero_vector_tmp), "combined_mask_ifconti");
10277
10278 tree combined_mask_tmp2 = create_tmp_var(TREE_TYPE(zero_vector_tmp), "combined_mask_ifconti2");
10279
10280 gimple *combine_stmt1_tmp = gimple_build_assign(combined_mask_tmp, BIT_IOR_EXPR, target_mask1, target_mask2);
10281
10282 gimple *combine_stmt1_tmp2 = gimple_build_assign(combined_mask_tmp2, BIT_IOR_EXPR, target_mask3, target_mask4);
10283
10284 gimple_stmt_iterator gsi_tmp = gsi_for_stmt (target_stmt2);
10285 gsi_next(&gsi_tmp);
10286 gsi_insert_after (&gsi_tmp, combine_stmt1_tmp, GSI_SAME_STMT);
10287
10288 gimple_stmt_iterator gsi_tmp_next_if = gsi_last_bb (store_bb_tmp);
10289 // gsi_prev(&gsi_tmp_next_if);
10290 gsi_insert_before (&gsi_tmp_next_if, combine_stmt1_tmp2, GSI_SAME_STMT);
10291
10292 gimple *gcond_tmp = gimple_build_cond(EQ_EXPR, combined_mask_tmp, zero_vector_tmp, NULL, NULL);
10293 gsi_next(&gsi_tmp);
10294 gsi_insert_after(&gsi_tmp, gcond_tmp, GSI_NEW_STMT);
10295
10296 gimple *gcond_tmp_next = gimple_build_cond(EQ_EXPR, combined_mask_tmp2, zero_vector_tmp, NULL, NULL);
10297 // gsi_next(&gsi_tmp_next_if);
10298 gsi_insert_before(&gsi_tmp_next_if, gcond_tmp_next, GSI_NEW_STMT);
10299
10300 // calculate_dominance_info(CDI_DOMINATORS);
10301
10302 gimple_stmt_iterator gsi_from_tmp;
10303 gimple *stmt1 = NULL;
10304
10305 /* Move vec_cond second var def to STORE_BB. */
10306 /* gimple *last_store = target_stmt4_next;
10307 gimple_stmt_iterator gsi_tmp4 = gsi_for_stmt (target_stmt4_next);
10308 gsi_from_tmp = gsi_tmp4;
10309 /* Shift GSI to the previous stmt for further traversal. */
10310 /* gsi_prev (&gsi_tmp4);
10311 gimple_stmt_iterator gsi_to_tmp = gsi_start_bb (store_bb_tmp);
10312 gsi_move_before (&gsi_from_tmp, &gsi_to_tmp);
10313 /* Setup GSI_TO to the non-empty block start. */
10314 /* gsi_to_tmp = gsi_start_bb (store_bb_tmp);
10315 if (dump_enabled_p ())
10316 dump_printf_loc (MSG_NOTE, vect_location,
10317 "Move if-continue stmt to created bb\n%G", last);
10318 /* Move all stored value producers if possible. */
10319 /* while (!gsi_end_p (gsi_tmp4)) {
10320
10321 tree lhs;
10322 imm_use_iterator imm_iter;
10323 use_operand_p use_p;
10324 bool res;
10325
10326 /* Skip debug statements. */
10327 /* if (is_gimple_debug (gsi_stmt (gsi_tmp4)))
10328 {
10329 gsi_prev (&gsi_tmp4);
10330 continue;
10331 }
10332 stmt1 = gsi_stmt (gsi_tmp4);
10333 /* Do not consider statements writing to memory or having
10334 volatile operand. */
10335 /* if (gimple_vdef (stmt1) || gimple_has_volatile_ops (stmt1))
10336 break;
10337 gsi_from_tmp = gsi_tmp4;
10338 gsi_prev (&gsi_tmp4);
10339 lhs = gimple_get_lhs (stmt1);
10340 if (!lhs)
10341 break;
10342
10343 /* LHS of vectorized stmt must be SSA_NAME. */
10344 /* if (TREE_CODE (lhs) != SSA_NAME)
10345 break;
10346
10347 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10348 {
10349 /* Remove dead scalar statement. */
10350 /* if (has_zero_uses (lhs))
10351 {
10352 gsi_remove (&gsi_from_tmp, true);
10353 continue;
10354 }
10355 }
10356
10357 gsi_move_before (&gsi_from_tmp, &gsi_to_tmp);
10358 /* Shift GSI_TO for further insertion. */
10359 /* gsi_prev (&gsi_to_tmp);
10360 }
10361 }
10362 }*/ |
当vf 是4的时候,进行mask的合并,以及将合并后的mask加入到数学函数里面
mask合并代码
10410 vec<constructor_elt, va_gc> *ret_ctor_elts_tmp = NULL; 10411 vec_alloc (ret_ctor_elts_tmp, 2); 10412 CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask2); // 添加第二个左子树 10413 CONSTRUCTOR_APPEND_ELT(ret_ctor_elts_tmp, NULL_TREE, mask); // 添加第一个左子树 10414 10415 // tree signed_boolean_type = build_nonstandard_integer_type(64, 1); 10416 tree signed_boolean_type = build_nonstandard_boolean_type(64); 10417 10418 tree vect_type = build_vector_type(signed_boolean_type, 4); 10419 tree constructor = build_constructor(vect_type, ret_ctor_elts_tmp); 10420 10421 tree new_var_constru = create_tmp_var(vect_type, "mask_array"); 10422 gimple *new_stmt_construc = gimple_build_assign(make_ssa_name(new_var_constru), constructor); 10423 gsi_next(&gsi); 10424 gsi_insert_after (&gsi, new_stmt_construc, GSI_SAME_STMT); |
将合并后的mask加入到数学函数里面
195 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mask_operand)
196 {
197 gimple *use_stmt;
198 use_stmt = USE_STMT (use_p);
199 if(is_gimple_assign(use_stmt)) {
200 tree rhs1_tmp1 = gimple_assign_rhs1(use_stmt);
201 if (TREE_CODE(rhs1_tmp1) == CONSTRUCTOR) {
202 tree lhs_tmp1 = gimple_assign_lhs(use_stmt);
203 if(stmt_vecmath)
204 add_mask_to_call(stmt_vecmath,lhs_tmp1);
205 }
206 }
207 }
|
oneapi的cfg图

在移动语句的过程中,如果 store bb 中的 LHS 在 store bb 之外的其他 bb 中被使用,则需要在使用处重新计算该值
10490 /* Check that LHS does not have uses outside of STORE_BB. */
10491 res = true;
10492 // FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10493 gimple *use_lhs;
10494 FOR_EACH_IMM_USE_STMT (use_lhs, imm_iter, lhs)
10495 {
10496 gimple *use_stmt;
10497 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) {
10498
10499 // gimple *use_stmt;
10500 use_stmt = USE_STMT (use_p);
10501 if (is_gimple_debug (use_stmt))
10502 continue;
10503 if (gimple_bb (use_stmt) != store_bb && gimple_bb (use_stmt) != gimple_bb (last))
10504 {
10505 // res = false;
10506
10507 if (dump_enabled_p ())
10508 dump_printf_loc (MSG_NOTE, vect_location,
10509 "LHS have use outside of store_BB\n%G", stmt1);
10510 tree new_lhs,new_lhs1,new_lhs2;
10511 tree new_lhs_phi;
10512 gphi *phi;
10513 tree vectype;
10514 tree zero;
10515 gimple *zero_def;
10516
10517 gimple *new_assign_stmt;
10518
10519 if (is_gimple_assign(stmt1) && is_gimple_assign(use_lhs)) {
10520 for (unsigned int i = 1; i < gimple_num_ops(use_stmt); i++) {
10521 tree rhs = gimple_op(use_stmt, i);
10522 if(TREE_CODE (rhs) == SSA_NAME && (rhs == lhs)) {
10523
10524 if (dump_enabled_p ())
10525 dump_printf_loc (MSG_NOTE, vect_location,
10526 "insert new stmt to use out of BB\n");
10527 new_lhs = create_tmp_var(TREE_TYPE(lhs), "new_tmp_var");
10528 new_lhs1 = make_ssa_name(new_lhs,NULL);
10529 tree rhs1 = gimple_assign_rhs1(stmt1);
10530 tree rhs2 = gimple_assign_rhs2(stmt1);
10531 new_assign_stmt = gimple_build_assign(new_lhs1, gimple_assign_rhs_code(stmt1), rhs1, rhs2);
10532
10533 gimple_stmt_iterator gsi_temp = gsi_for_stmt(use_stmt);
10534 gsi_insert_before (&gsi_temp,new_assign_stmt,GSI_SAME_STMT);
10535 update_stmt(new_assign_stmt);
10536
10537 if( i == 1) {
10538
10539 gimple_assign_set_rhs1(use_stmt, new_lhs1);
10540 // update_stmt(use_stmt);
10541 }
10542 else if (i == 2) {
10543 gimple_assign_set_rhs2(use_stmt, new_lhs1);
10544 // update_stmt(use_stmt);
10545 }
10546
10547 // update_stmt(use_stmt);
10548 }
10549 }
10550 }
10551 }
10552 }
10553
10554 update_stmt(use_stmt);
10555 } */
|
消除同一个reduction 在loop 中使用多次
# temp_value.920_2824 = PHI <tmp_var.921_2823(234), 0.0(279)>
48420 # temp_value.923_2821 = PHI <tmp_var.924_2820(234), 0.0(279)>
48421 # temp_value.926_2814 = PHI <tmp_var.927_2813(234), 0.0(279)>
48422 # temp_value.929_2807 = PHI <tmp_var.930_2806(234), 0.0(279)>
48423 # temp_value.932_2800 = PHI <tmp_var.933_2798(234), 0.0(279)>
_ifc__2843 = _3089 ? _2132 : 0.0;
48574 tmp_var.927_2813 = _ifc__2843 + temp_value.926_2814;
48575 _ifc__2842 = _3084 ? _2145 : 0.0;
48576 tmp_var.930_2806 = _ifc__2842 + temp_value.929_2807;
48577 _ifc__2841 = _3192 ? _2085 : 0.0;
48578 tmp_var.921_2823 = _ifc__2841 + temp_value.920_2824;
48579 _ifc__2840 = _3172 ? _2101 : 0.0;
48580 tmp_var.933_2798 = _ifc__2840 + temp_value.932_2800;
48581 _ifc__2839 = _3161 ? _2113 : 0.0;
48582 tmp_var.924_2820 = _ifc__2839 + temp_value.923_2821;
# tmp_sumi.922_2822 = PHI <tmp_var.921_2823(83), 0.0(81), 0.0(276)>
48880 # tmp_sumi.925_2816 = PHI <tmp_var.924_2820(83), 0.0(81), 0.0(276)>
48881 # tmp_sumi.928_2809 = PHI <tmp_var.927_2813(83), 0.0(81), 0.0(276)>
48882 # tmp_sumi.931_2805 = PHI <tmp_var.930_2806(83), 0.0(81), 0.0(276)>
48883 # tmp_sumi.934_2793 = PHI <tmp_var.933_2798(83), 0.0(81), 0.0(276)>
_2752 = tmp_sumi.922_2822 + tmp_sumi.925_2816;
48885 _2750 = _2752 + tmp_sumi.928_2809;
48886 _2747 = _2750 + tmp_sumi.931_2805;
48887 _2746 = _2747 + tmp_sumi.934_2793;
_2156 = ri1i_2025 + _2746;
48931 _2163 = _2160 * _2746;
|
1761 for (k = 0; k < lpears[i] + upears[i]; k++) {
1762
1763 if (pearlist[i] == NULL) {
1764 fprintf(nabout,
1765 "NULL pair list entry in egb loop 1, taskid = %d\n",
1766 mytaskid);
1767 fflush(nabout);
1768 }
1769 j = pearlist[i][k];
1770
1771 xij = xi - x[dim * j];
1772 yij = yi - x[dim * j + 1];
1773 zij = zi - x[dim * j + 2];
1774 r2 = xij * xij + yij * yij + zij * zij;
1775
1776 if (dim == 4) { // delete
1777 wij = wi - x[dim * j + 3];
1778 r2 += wij * wij;
1779 }
1780
1781 if (r2 > rgbmaxpsmax2) // %hir.cmp.4310 ule
1782 continue;
1783 dij1i = 1.0 / sqrt(r2);
1784 dij = r2 * dij1i;
1785 sj = fs[j] * (rborn[j] - BOFFSET); // select fast
1786 sj2 = sj * sj;
1787
1788 /*
1789 * ---following are from the Appendix of Schaefer and Froemmel,
1790 * JMB 216:1045-1066, 1990; Taylor series expansion for d>>s
1791 * is by Andreas Svrcek-Seiler; smooth rgbmax idea is from
1792 * Andreas Svrcek-Seiler and Alexey Onufriev.
1793 */
1794
1795 if (dij > rgbmax + sj) // rgbmax = 20; %hir.cmp.4333 ule
1796 continue;
1797
1798 if ((dij > rgbmax - sj)) { // %hir.cmp.4349 ogt
1799 uij = 1. / (dij - sj);
1800 sumi -= 0.125 * dij1i * (1.0 + 2.0 * dij * uij +
1801 rgbmax2i * (r2 -
1802 4.0 * rgbmax *
1803 dij - sj2) +
1804 2.0 * log((dij - sj) * rgbmax1i));
1805
1806 } else if (dij > 4.0 * sj) {
1807 dij2i = dij1i * dij1i;
1808 tmpsd = sj2 * dij2i;
1809 dumbo =
1810 TA + tmpsd * (TB +
1811 tmpsd * (TC +
1812 tmpsd * (TD + tmpsd * TDD)));
1813 sumi -= sj * tmpsd * dij2i * dumbo;
1814
1815 } else if (dij > ri + sj) {
1816 sumi -= 0.5 * (sj / (r2 - sj2) +
1817 0.5 * dij1i * log((dij - sj) / (dij + sj)));
1818
1819 } else if (dij > fabs(ri - sj)) {
1820 theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2);
1821 uij = 1. / (dij + sj);
1822 sumi -= 0.25 * (ri1i * (2. - theta) - uij +
1823 dij1i * log(ri * uij));
1824
1825 } else if (ri < sj) {
1826 sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i +
1827 0.5 * dij1i * log((sj - dij) / (sj + dij)));
1828
1829 }
1830
1831 } |

1:if fprintf 分析不出内存关系,无法ifcvt。(lim pass 其无法外提也是因为fprintf中内存关系无法分析)
解决:将其外提到最内层循环外面。
2 : dim常量传播 (ipa-cp pass)
mme → mme34 → egb
dim 作为全局变量无法常量传播,作为函数参数的时候可以传播到。
解决:新建一个pass,识别全局变量(当其没有作为函数传参时)和函数调用关系,在函数调用的地方将变量替换为常量值。(pass 的位置?是否有参数能解决)



根据inline pass debug的信息,发现mme34无法inline进mme 原因是--param early-inlining-insns= 值过小,将此值调大,可以成功inline。
inline 过后
;; basic block 2, loop depth 0, count 27580514 (estimated locally), maybe hot
74798 ;; prev block 0, next block 3, flags: (NEW, REACHABLE, VISITED)
74799 ;; pred: ENTRY [always] count:27580514 (estimated locally) (FALLTHRU,EXECUTABLE)
74800 # .MEM_2325 = VDEF <.MEM_2324(D)>
74801 dim.lto_priv.0D.4751 = 3;
74802 # VUSE <.MEM_2325>
basic block 96, loop depth 2, count 954868629 (estimated locally), maybe hot
77095 ;; prev block 95, next block 97, flags: (NEW, REACHABLE, VISITED)
77096 ;; pred: 94 [82.6% (guessed)] count:788435027 (estimated locally) (FALSE_VALUE,EXECUTABLE)
77097 ;; 95 [always] count:166433602 (estimated locally) (FALLTHRU,EXECUTABLE)
_698 = dim.lto_priv.0D.4751;
77112 _699 = j_697 * _698;
if (_698 == 4)
77146 goto <bb 97>; [34.00%]
77147 else
77148 goto <bb 98>; [66.00%]
|
怀疑是mme34函数中其他部分的代码影响了dim的常量传播分析。注释掉mme34函数中的部分代码后,发现其能够将 dim = 3 作为常量进行传播:
_77 = j_76 * 3; |
但是需要同时注释掉的内容较多,无法准确找到哪部分代码影响了传播,以及这部分代码的特性。
写了一个例子发现其静态全局变量可以成功作为常量计算,怀疑是mme34函数中的其他部分,影响到dim的常量传播。
1 #include<stdio.h>
2 #include<math.h>
3 #include<stdlib.h>
4
5
6 static int threshold = 5;
7
8 static inline int check_value1(int x) {
9 if(threshold < 20)
10 return x*threshold;
11 else return threshold;
12 }
13
14 static inline int check_value2(int x) {
15 if(threshold < 5)
16 return x+threshold;
17 else return threshold;
18 }
19 static inline int check_value3(int x) {
20 threshold = 10;
21 return check_value1(x);
22 }
23 static inline int check_value4(int x) {
24 threshold = 50;
25 return check_value2(x);
26 }
27
/* Deliberately shadows the global `threshold` with its parameter:
 * checks whether the global propagates as a constant when it is passed
 * as a function argument instead of being read directly. */
int use_threshold(int threshold) {
    const int base = 10;
    return base + threshold;
}
32 int main()
33 {
34 int num = 30;
35 int num2 = 5;
36 int ans3 = use_threshold(threshold);
37 int ans1 = check_value3(num);
38 int ans2 = check_value4(num2);
39 int ans = ans1 + ans2 +ans3;
40 printf("ans is %d\n",ans);
41 return 0;
42 }
|
查看ccp pass 中的debug的信息
39040 Visiting statement: 39041 # VUSE <.MEM_2279> 39042 _698 = dim.lto_priv.0D.4751; 39043 which is likely CONSTANT 39044 Lattice value changed to VARYING. Adding SSA edges to worklist. |
在这里进行gdb 调试,
69046 Substituting values and folding statements 69048 Folding statement: dim = 3; 69049 Not folded |
1761 for (k = 0; k < lpears[i] + upears[i]; k++) {
1762
1763 if (pearlist[i] == NULL) {
1764 fprintf(nabout,
1765 "NULL pair list entry in egb loop 1, taskid = %d\n",
1766 mytaskid);
1767 fflush(nabout);
abort();
1768 }
1769 j = pearlist[i][k];
1770
|
在 ifcvt pass 中查看,该 if 并没有被外提,因此无法进行 ifcvt。
插入 abort 后需要识别的 pattern:
14044 <bb 148> [local count: 919275880]: 14045 _2044 = _127 + _2039; 14046 _2045 = *_2044; 14047 if (_2045 == 0B) 14048 goto <bb 149>; [17.43%] 14049 else 14050 goto <bb 150>; [82.57%] 14051 14052 <bb 149> [local count: 160229786]: 14053 _2046 = 0; 14054 _2047 = nabout; 14055 fprintf (_2047, "NULL pair list entry in egb loop 1, taskid = %d\n", _2046); 14056 _2048 = nabout; 14057 fflush (_2048); 14058 14059 <bb 150> [local count: 919275880]: 14060 _2049 = *_2044; 14061 _2051 = (long unsigned int) k_2050; 14062 _2052 = _2051 * 4; 14063 _2053 = _2049 + _2052; 14064 j_2054 = *_2053; |
Eff.c:3282
build_base_HygonGCC_Spec2017_rate_perf-test.cfg-64.0000
build_base_HygonGCC_Spec2017_rate_perf.cfg-64.0001
加上一个参数使 mme34 内联进 mme 中,但 dim = 3 的常量传播仍然无法完成。写了一个静态全局变量的例子,发现常量能够成功传播,因此怀疑是函数中的其他代码影响了对常量的分析、导致无法传播;通过注释源程序中的代码来定位。
加上if -continue 107
不加 106
Base 99.6
