/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
                                 slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
static tree vect_create_addr_base_for_vector_ref
  (gimple, gimple_seq *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
static void vect_finish_stmt_generation
  (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction
  (tree, gimple, int, enum tree_code, gimple);
static tree get_initial_def_for_reduction (gimple, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (gimple stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }
  /* Requires loop versioning tests to handle misalignment.  */
  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning tests to handle possible aliasing.  */
  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }
  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */
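
  /* For instance, with vf = 4 and unknown peel counts, both the prologue
     and the epilogue are charged vf/2 = 2 peeled scalar iterations each,
     rather than the worst case of vf - 1 = 3.  */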
  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but number of scalar loop
             iterations are unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
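
  /* For example, with niters = 100, vf = 4, and a known prologue peel of 2
     iterations, the epilogue gets (100 - 2) % 4 = 2 peeled iterations.  */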
  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */
  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
                                   + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }

  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
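
  /* Rearranging for the smallest profitable NITERS (multiply the condition
     above through by VF and collect the niters terms):

       niters * (SIC * VF - VIC) > (VOC - SOC) * VF
                                   - VIC * (PL_ITERS + EP_ITERS)

     which is exactly the division computed below when SIC * VF > VIC.  */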
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is greater than or equal to the scalar iteration cost = %d "
                 "times the vectorization factor = %d.",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }

  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  stmt = STMT_VINFO_STMT (stmt_info);

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;
  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
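
          /* For example, with nelements = 8 the shift-based scheme costs
             exact_log2 (8) * 2 = 6 vector stmts plus one extract, whereas
             the extract-based fallback costs 8 + 7 = 15 vector stmts.  */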
        }
    }

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d.", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}


/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d.", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmts.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d.", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */
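
/* For example, in a group of four interleaved stores the call for the first
   store returns 4, charging the whole permutation overhead there, and the
   calls for the remaining three stores return 1.  */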
static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;
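
      /* E.g. for group_size = 4 this is exact_log2 (4) = 2 interleave
         stages of 4 high/low interleave stmts each, i.e. 8 permute stmts
         per copy.  */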
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d.",
                 group_size);
    }

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d.", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */
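
/* In outline (matching the switch below): an aligned load costs one vector
   load per copy; a hardware-supported unaligned load costs one unaligned
   vector load per copy; explicit realignment costs two loads plus a permute
   per copy; and the software-pipelined realignment scheme costs one load
   plus a permute per copy, with the priming loads/mask set up outside the
   loop.  */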
void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  gimple first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses an even and odd extract operations for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d.",
                 group_size);
    }

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2*TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d.", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:  Specify relative to which loop-nest should the address be computed.
          For example, when the dataref is in an inner-loop nested in an
          outer-loop that is now being vectorized, LOOP can be either the
          outer-loop, or the inner-loop.  The first memory location accessed
          by the following dataref ('in' points to short):

              for (i=0; i<N; i++)
                for (j=0; j<M; j++)
                  s += in[i+j]

          is as follows:
          if LOOP=i_loop: &in      (relative to i_loop)
          if LOOP=j_loop: &in+i*2B (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
static tree
vect_create_addr_base_for_vector_ref (gimple stmt,
                                      gimple_seq *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest;
  gimple_seq seq = NULL;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base.  */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
                                        data_ref_base_var);
  gimple_seq_add_seq (new_stmt_list, seq);

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR,
                            fold_convert (sizetype, base_offset),
                            fold_convert (sizetype, init));
  dest = create_tmp_var (sizetype, "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, sizetype,
                            fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
      gimple_seq_add_seq (new_stmt_list, seq);
    }

  /* base + base_offset.  */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
static tree
vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, gimple *ptr_incr,
                          bool only_init, bool *inv_p, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  gimple vec_stmt;
  gimple_seq new_stmt_list = NULL;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);

  if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
      && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
    vect_ptr_type = build_qualified_type (vect_ptr_type, TYPE_QUAL_RESTRICT);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
      && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
    {
      get_alias_set (base_name);
      DECL_POINTER_ALIAS_SET (vect_ptr)
        = DECL_POINTER_ALIAS_SET (SSA_NAME_VAR (DR_BASE_ADDRESS (dr)));
    }

  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);
  mark_sym_for_renaming (tag);
  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
   vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
   def-use update cycles for the pointer: One relative to the outer-loop
   (LOOP), which is what steps (3) and (4) below do.  The other is relative
   to the inner-loop (which is the inner-most loop containing the dataref),
   and this is done by step (5) below.

   When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
   inner-most loop, and so steps (3),(4) work the same, and step (5) is
   redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
                ...
                vp2 = vp1 + step
                goto LOOP

   If there is an inner-loop nested in loop, then step (5) will also be
   applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
        inner:     vp3 = phi(vp1,vp4)
                   vp4 = vp3 + inner_step
                   if () goto inner
                ...
                vp2 = vp1 + step
                if () goto LOOP   */
  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  if (new_stmt_list)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    }

  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = gimple_build_assign (vect_ptr,
                                  fold_convert (vect_ptr_type, new_temp));
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop vectorization).
   **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 vect_ptr, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;

  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists: **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), vect_ptr,
                 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  gimple incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
                                            dataref_ptr, update);
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
                  gimple_stmt_iterator *gsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  gimple init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = gimple_build_assign (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  gimple_assign_set_lhs (init_stmt, new_temp);

  if (gsi)
    vect_finish_stmt_generation (stmt, init_stmt, gsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
    }

  vec_oprnd = gimple_assign_lhs (init_stmt);
  return vec_oprnd;
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.  */
static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num, unsigned int number_of_vectors)
{
  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = VEC_index (gimple, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits;
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop;
  int group_size = VEC_length (gimple, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p, is_store;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    {
      is_store = true;
      op = gimple_assign_rhs1 (stmt);
    }
  else
    {
      is_store = false;
      op = gimple_op (stmt, op_num + 1);
    }

  if (CONSTANT_CLASS_P (op))
    {
      vector_type = vectype;
      constant_p = true;
    }
  else
    {
      vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
      gcc_assert (vector_type);
      constant_p = false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vector_type);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     is two.)

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
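
  /* For the examples above: least_common_multiple (4, 2) / 2 = 2 copies of
     {s1, s2}, and least_common_multiple (4, 8) / 8 = 1 copy spread over two
     vectors.  */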
  number_of_places_left_in_vector = nunits;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
            op = gimple_op (stmt, op_num + 1);

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type, NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  gimple vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = gimple_get_lhs (vec_def_stmt);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  gimple first_stmt;
  enum tree_code code;
  int number_of_vects;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;

  first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  /* The number of vector defs is determined by the number of vector statements
     in the node from which we get those statements.  */
  if (SLP_TREE_LEFT (slp_node))
    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
  else
    {
      number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      /* Number of vector stmts was calculated according to LHS in
         vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
         necessary.  See vect_get_smallest_scalar_type() for details.  */
      vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                     &rhs_size_unit);
      if (rhs_size_unit != lhs_size_unit)
        {
          number_of_vects *= rhs_size_unit;
          number_of_vects /= lhs_size_unit;
        }
    }

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);

  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  code = gimple_assign_rhs_code (first_stmt);
  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
    return;

  /* The number of vector defs is determined by the number of vector statements
     in the node from which we get those statements.  */
  if (SLP_TREE_RIGHT (slp_node))
    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
  else
    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);

  *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
static tree
get_initial_def_for_induction (gimple iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  gimple init_stmt, induction_phi, new_stmt;
  tree induc_def, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  gimple_seq stmts = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (iv_phi);

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);

  if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);
  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr  */
          enum tree_code code = POINTER_TYPE_P (scalar_type)
                                ? POINTER_PLUS_EXPR : PLUS_EXPR;
          init_stmt = gimple_build_assign_with_ops (code, new_var,
                                                    new_name, step_expr);
          new_name = make_ssa_name (new_var, init_stmt);
          gimple_assign_set_lhs (init_stmt, new_name);

          new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }
  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  vec = build_vector (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
                      new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                           induc_def, vec_step);
  vec_def = make_ssa_name (vec_dest, new_stmt);
  gimple_assign_set_lhs (new_stmt, vec_def);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e. - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
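
  /* For example, with VF = 8 and nunits = 4 we have ncopies = 2: the loop
     below emits one extra vector add per copy, stepping each subsequent
     copy by [nunits*S, nunits*S, nunits*S, nunits*S] = [4*S, 4*S, 4*S, 4*S].  */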
  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW.  This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      gcc_assert (CONSTANT_CLASS_P (new_name));
      vec = build_vector (vectype, t);
      vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
          new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                                   vec_def, vec_step);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_def);

          gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }
  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
            {
              exit_phi = USE_STMT (use_p);
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW. Currently not supporting the case that an inner-loop
             induction is not used in the outer-loop (i.e. only outside the
             outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vector of inductions after inner-loop:");
              print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
            }
        }
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle: ");
      print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
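
/* Illustrative example (an assumption, not part of the original source):
   for a scalar induction "i = i + S" starting at X, with VF = 4 and
   nunits = 4, the transformation above ends up with

       vec_init = {X, X+S, X+2*S, X+3*S}     (built in the loop preheader)
       vec_step = {4*S, 4*S, 4*S, 4*S}       (VF*S in every lane)

   and the def-use cycle

       vec_iv   = PHI <vec_init (preheader), vec_loop (latch)>
       vec_loop = vec_iv + vec_step;

   so each vector iteration advances all four lanes by VF*S at once.  */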
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
static tree
vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
{
  tree vec_oprnd;
  gimple vec_stmt;
  gimple def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, " def_stmt = ");
          print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
        }
    }
  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'.  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          t = tree_cons (NULL_TREE, op, t);
        vec_cst = build_vector (vectype, t);
        return vect_init_vector (stmt, vec_cst, vectype, NULL);
      }
    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        gcc_assert (vector_type);
        nunits = TYPE_VECTOR_SUBPARTS (vector_type);

        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'.  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          t = tree_cons (NULL_TREE, def, t);

        /* FIXME: use build_constructor directly.  */
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type, NULL);
      }
    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = NULL /* FIXME tuples: def_stmt */;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        if (gimple_code (vec_stmt) == GIMPLE_PHI)
          vec_oprnd = PHI_RESULT (vec_stmt);
        else if (is_gimple_call (vec_stmt))
          vec_oprnd = gimple_call_lhs (vec_stmt);
        else
          vec_oprnd = gimple_assign_lhs (vec_stmt);
        return vec_oprnd;
      }
    /* Case 4: operand is defined by a loop-header phi - reduction.  */
    case vect_reduction_def:
      {
        struct loop *loop;

        gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
        loop = (gimple_bb (def_stmt))->loop_father;

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }
    /* Case 5: operand is defined by a loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
        vec_oprnd = PHI_RESULT (vec_stmt);
        return vec_oprnd;
      }

    default:
      gcc_unreachable ();
    }
}
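
/* Illustrative note (an assumption, not part of the original source): for a
   constant operand such as the 5 in "a[i] = b[i] + 5" with a V4SI vectype,
   case 1 above materializes the vector {5,5,5,5} once via vect_init_vector;
   since constants and invariants are loop-independent, every unrolled copy
   of the vectorized stmt can reuse that same def (see
   vect_get_vec_def_for_stmt_copy below).  */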
/* Function vect_get_vec_def_for_stmt_copy

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:          STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0:  vx.0 = memref0        VS1.1
                        VS1.1:  vx.1 = memref1        VS1.2
                        VS1.2:  vx.2 = memref2        VS1.3
                        VS1.3:  vx.3 = memref3

   S2: z = x + ...      VSnew.0:  vz0 = vx.0 + ...    VSnew.1
                        VSnew.1:  vz1 = vx.1 + ...    VSnew.2
                        VSnew.2:  vz2 = vx.2 + ...    VSnew.3
                        VSnew.3:  vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  gimple vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; we can reuse the same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
    vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
  else
    vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
  return vec_oprnd;
}
/* Get vectorized definitions for the operands to create a copy of an original
   stmt.  See vect_get_vec_def_for_stmt_copy () for details.  */

static void
vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
                                 VEC(tree,heap) **vec_oprnds0,
                                 VEC(tree,heap) **vec_oprnds1)
{
  tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);

  vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
  VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);

  if (vec_oprnds1 && *vec_oprnds1)
    {
      vec_oprnd = VEC_pop (tree, *vec_oprnds1);
      vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
      VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
    }
}
/* Get vectorized definitions for OP0 and OP1, or from SLP_NODE if it is not
   NULL.  */

static void
vect_get_vec_defs (tree op0, tree op1, gimple stmt,
                   VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
                   slp_tree slp_node)
{
  if (slp_node)
    vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
  else
    {
      tree vec_oprnd;

      *vec_oprnds0 = VEC_alloc (tree, heap, 1);
      vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
      VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);

      if (op1)
        {
          *vec_oprnds1 = VEC_alloc (tree, heap, 1);
          vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
          VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
        }
    }
}
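
/* Illustrative sketch (an assumption, not part of the original source): the
   vectorizable_* routines below typically combine the two helpers above in
   a loop over the VF/nunits copies, along the lines of:

       for (j = 0; j < ncopies; j++)
         {
           if (j == 0)
             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
                                slp_node);
           else
             vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
           ... build the j-th copy of the vector stmt ...
         }

   so copy 0 gets fresh defs while each later copy follows the
   STMT_VINFO_RELATED_STMT chain of the defining stmts.  */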
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
                             gimple_stmt_iterator *gsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);

  gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);

  set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
    }

  gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
}
/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

     s = init_val;
     for (i = 0; i < n; i++)
       s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries).
   A cost model should help decide between these two schemes.  */
static tree
get_initial_def_for_reduction (gimple stmt, tree init_val,
                               tree *adjustment_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree scalar_type = TREE_TYPE (vectype);
  enum tree_code code = gimple_assign_rhs_code (stmt);
  tree type = TREE_TYPE (init_val);
  tree vecdef;
  tree def_for_init;
  tree init_def;
  tree t = NULL_TREE;
  int i;
  bool nested_in_vect_loop = false;

  gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
              || SCALAR_FLOAT_TYPE_P (type));
  if (nested_in_vect_loop_p (loop, stmt))
    nested_in_vect_loop = true;
  else
    gcc_assert (loop == (gimple_bb (stmt))->loop_father);

  vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (nested_in_vect_loop)
        *adjustment_def = vecdef;
      else
        *adjustment_def = init_val;
      /* Create a vector of zeros for init_def.  */
      if (SCALAR_FLOAT_TYPE_P (scalar_type))
        def_for_init = build_real (scalar_type, dconst0);
      else
        def_for_init = build_int_cst (scalar_type, 0);
      for (i = nunits - 1; i >= 0; --i)
        t = tree_cons (NULL_TREE, def_for_init, t);
      init_def = build_vector (vectype, t);
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      *adjustment_def = NULL_TREE;
      init_def = vecdef;
      break;

    default:
      gcc_unreachable ();
    }

  return init_def;
}
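
/* Worked example (an assumption, not part of the original source): for
   "s = s + a[i]" with init_val = 5 and a V4SI vectype, the function above
   returns init_def = {0,0,0,0} and sets *adjustment_def = 5; the epilog
   then adds 5 back after the four partial sums are combined, which is the
   'adjust in epilog' scheme described in the comment above.  */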
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
     number of elements that we can fit in a vectype (nunits).  In this case
     we have to generate more than one vector stmt - i.e. - we need to "unroll"
     the vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

     The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

     The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */
static void
vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
                                  int ncopies,
                                  enum tree_code reduc_code,
                                  gimple reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  stmt_vec_info prev_phi_info;
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  gimple new_phi = NULL, phi;
  gimple_stmt_iterator exit_gsi;
  tree vec_dest;
  tree new_temp = NULL_TREE;
  tree new_name;
  gimple epilog_stmt = NULL;
  tree new_scalar_dest, new_dest;
  gimple exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = gimple_assign_rhs_code (stmt);
  tree adjustment_def;
  tree vec_initial_def, def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result = false;
  tree reduction_op, expr;
  gimple orig_stmt;
  gimple use_stmt;
  bool nested_in_vect_loop = false;
  VEC(gimple,heap) *phis = NULL;
  enum vect_def_type dt = vect_unknown_def_type;
  int j, i;

  if (nested_in_vect_loop_p (loop, stmt))
    {
      loop = loop->inner;
      nested_in_vect_loop = true;
    }
  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
                  == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  gcc_assert (vectype);
  mode = TYPE_MODE (vectype);
  /*** 1. Create the reduction def-use cycle  ***/

  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &adjustment_def);

  phi = reduction_phi;
  def = vect_def;
  for (j = 0; j < ncopies; j++)
    {
      /* 1.1 set the loop-entry arg of the reduction-phi:  */
      add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));

      /* 1.2 set the loop-latch arg for the reduction-phi:  */
      if (j > 0)
        def = vect_get_vec_def_for_stmt_copy (dt, def);
      add_phi_arg (phi, def, loop_latch_edge (loop));

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "transform reduction: created def-use cycle: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
        }

      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
    }
  /*** 2. Create epilog code
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  ***/
  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  def = vect_def;
  prev_phi_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
      set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
      if (j == 0)
        new_phi = phi;
      else
        {
          def = vect_get_vec_def_for_stmt_copy (dt, def);
          STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
        }
      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
      prev_phi_info = vinfo_for_stmt (phi);
    }
  exit_gsi = gsi_after_labels (exit_bb);
  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction.  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern.  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = gimple_assign_rhs_code (orig_stmt);
  scalar_dest = gimple_assign_lhs (orig_stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);
  /* In case this is a reduction in an inner-loop while vectorizing an outer
     loop - we don't need to extract a single scalar result at the end of the
     inner-loop.  The final vector of partial results will be used in the
     vectorized outer-loop, or reduced to a scalar result at the end of the
     outer-loop.  */
  if (nested_in_vect_loop)
    goto vect_finalize_reduction;

  /* FORNOW */
  gcc_assert (ncopies == 1);
  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      tree tmp;

      /*** Case 1: Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
      epilog_stmt = gimple_build_assign (vec_dest, tmp);
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype, optab_default);
          if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }
      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset /= 2)
               {
                 Create:  va' = vec_shift <va, offset>
                 Create:  va = vop <va, va'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits / 2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);
              epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
                                                          new_temp, bitpos);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_name);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

              epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
                                                          new_name, new_temp);
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_temp);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;
          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code.");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          gimple_assign_set_lhs (epilog_stmt, new_temp);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_name);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

              epilog_stmt = gimple_build_assign_with_ops (code,
                                                          new_scalar_dest,
                                                          new_name, new_temp);
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_temp);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
            }

          extract_scalar_result = false;
        }
    }
  /* 2.4  Extract the final scalar result.  Create:
          s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      gcc_assert (!nested_in_vect_loop);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    }
vect_finalize_reduction:

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'adjustment_def' is zero).  For example, if code is PLUS we create:
         new_temp = loop_exit_def + adjustment_def  */

  if (adjustment_def)
    {
      if (nested_in_vect_loop)
        {
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
          expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, vectype);
        }
      else
        {
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
          expr = build2 (code, scalar_type, new_temp, adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, scalar_type);
        }
      epilog_stmt = gimple_build_assign (new_dest, expr);
      new_temp = make_ssa_name (new_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    }
  /* 2.6 Handle the loop-exit phi.  */

  /* Replace uses of s_out0 with uses of s_out3:
     Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  phis = VEC_alloc (gimple, heap, 10);
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          VEC_quick_push (gimple, phis, exit_phi);
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (!VEC_empty (gimple, phis));
  for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
    {
      if (nested_in_vect_loop)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);

          /* FORNOW. Currently not supporting the case that an inner-loop
             reduction is not used in the outer-loop (but only outside the
             outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
          STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
          set_vinfo_for_stmt (epilog_stmt,
                              new_stmt_vec_info (epilog_stmt, loop_vinfo));
          if (adjustment_def)
            STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
          continue;
        }

      /* Replace the uses:  */
      orig_name = PHI_RESULT (exit_phi);
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
        FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
          SET_USE (use_p, new_temp);
    }
  VEC_free (gimple, heap, phis);
}
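
/* Illustrative sketch (an assumption, not part of the original source): for
   a V4SI vector of partial sums {a0,a1,a2,a3}, scheme 2 above emits a
   log2(nunits) sequence of shift/op rounds:

       v' = vec_shift <v, 64>     // lanes {a2, a3, _, _ }
       v  = vop <v, v'>           // lanes {a0+a2, a1+a3, _, _}
       v' = vec_shift <v, 32>     // lanes {a1+a3, _, _, _}
       v  = vop <v, v'>           // lanes {a0+a1+a2+a3, _, _, _}

   after which step 2.4 extracts the surviving lane (lane 0, or the last
   lane on big-endian targets) as the scalar result.  */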
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
        get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
        STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
bool
vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                        gimple *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum tree_code code, orig_code, epilog_reduc_code = 0;
  enum machine_mode vec_mode;
  int op_type;
  optab optab, reduc_optab;
  tree new_temp = NULL_TREE;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt;
  gimple new_phi = NULL;
  tree scalar_type;
  bool is_simple_use;
  gimple orig_stmt;
  stmt_vec_info orig_stmt_info;
  tree expr = NULL_TREE;
  int i;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  int epilog_copies;
  stmt_vec_info prev_stmt_info, prev_phi_info;
  gimple first_phi = NULL;
  bool single_defuse_cycle = false;
  int j;
  tree ops[3];
  tree reduc_def;
  gimple new_stmt = NULL;

  if (nested_in_vect_loop_p (loop, stmt))
    loop = loop->inner;

  gcc_assert (ncopies >= 1);
  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  /* 1. Is vectorizable reduction?  */

  /* Not supportable if the reduction variable is used in the loop.  */
  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
    return false;

  /* Reductions that are not used even in an enclosing outer-loop
     are expected to be "live" (used out of the loop).  */
  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }
  /* 3. Check the operands of the operation.  The first operands are defined
        inside the loop body.  The last operand is the reduction variable,
        which is defined by the loop-header-phi.  */

  gcc_assert (is_gimple_assign (stmt));

  /* Flatten the RHS into the OPS array.  */
  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
      if (op_type == ternary_op)
        {
          tree rhs = gimple_assign_rhs1 (stmt);
          ops[0] = TREE_OPERAND (rhs, 0);
          ops[1] = TREE_OPERAND (rhs, 1);
          ops[2] = TREE_OPERAND (rhs, 2);
          code = TREE_CODE (rhs);
        }
      else
        return false;
      break;

    case GIMPLE_BINARY_RHS:
      code = gimple_assign_rhs_code (stmt);
      op_type = TREE_CODE_LENGTH (code);
      gcc_assert (op_type == binary_op);
      ops[0] = gimple_assign_rhs1 (stmt);
      ops[1] = gimple_assign_rhs2 (stmt);
      break;

    case GIMPLE_UNARY_RHS:
      return false;

    default:
      gcc_unreachable ();
    }
  scalar_dest = gimple_assign_lhs (stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
      && !SCALAR_FLOAT_TYPE_P (scalar_type))
    return false;

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  */
  for (i = 0; i < op_type - 1; i++)
    {
      is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
                                          &def, &dt);
      gcc_assert (is_simple_use);
      if (dt != vect_loop_def
          && dt != vect_invariant_def
          && dt != vect_constant_def
          && dt != vect_induction_def)
        return false;
    }

  is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def,
                                      &dt);
  gcc_assert (is_simple_use);
  gcc_assert (dt == vect_reduction_def);
  gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
  if (orig_stmt)
    gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
  else
    gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));

  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
    return false;
2968 /* 4. Supportable by target? */
2970 /* 4.1. check support for the operation in the loop */
2971 optab = optab_for_tree_code (code, vectype, optab_default);
2974 if (vect_print_dump_info (REPORT_DETAILS))
2975 fprintf (vect_dump, "no optab.");
2978 vec_mode = TYPE_MODE (vectype);
2979 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2981 if (vect_print_dump_info (REPORT_DETAILS))
2982 fprintf (vect_dump, "op not supported by target.");
2983 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2984 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2985 < vect_min_worthwhile_factor (code))
2987 if (vect_print_dump_info (REPORT_DETAILS))
2988 fprintf (vect_dump, "proceeding using word mode.");
2991 /* Worthwhile without SIMD support? */
2992 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2993 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2994 < vect_min_worthwhile_factor (code))
2996 if (vect_print_dump_info (REPORT_DETAILS))
2997 fprintf (vect_dump, "not worthwhile without SIMD support.");
  /* 4.2. Check support for the epilog operation.

          If STMT represents a reduction pattern, then the type of the
          reduction variable may be different than the type of the rest
          of the arguments.  For example, consider the case of accumulation
          of shorts into an int accumulator.  The original code:
                        S1: int_a = (int) short_a;
          orig_stmt->   S2: int_acc = plus <int_a, int_acc>;

          was replaced with:
                        STMT: int_acc = widen_sum <short_a, int_acc>

          This means that:
          1. The tree-code that is used to create the vector operation in the
             epilog code (that reduces the partial results) is not the
             tree-code of STMT, but is rather the tree-code of the original
             stmt from the pattern that STMT is replacing.  I.e., in the
             example above we want to use 'widen_sum' in the loop, but 'plus'
             in the epilog.
          2. The type (mode) we use to check available target support
             for the vector operation to be created in the *epilog*, is
             determined by the type of the reduction variable (in the example
             above we'd check this: plus_optab[vect_int_mode]).
             However the type (mode) we use to check available target support
             for the vector operation to be created *inside the loop*, is
             determined by the type of the other arguments to STMT (in the
             example we'd check this: widen_sum_optab[vect_short_mode]).

          This is contrary to "regular" reductions, in which the types of all
          the arguments are the same as the type of the reduction variable.
          For "regular" reductions we can therefore use the same vector type
          (and also the same tree-code) when generating the epilog code and
          when generating the code inside the loop.  */
  if (orig_stmt)
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = gimple_assign_rhs_code (orig_stmt);
      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
      if (!vectype)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "unsupported data-type ");
              print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
            }
          return false;
        }

      vec_mode = TYPE_MODE (vectype);
    }
  else
    {
      /* Regular reduction: the same vectype and tree-code that are used for
         the vector code inside the loop can be used for the epilog code.  */
      orig_code = code;
    }

  if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
    return false;
  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
                                     optab_default);
  if (!reduc_optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab for reduction.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  else if (optab_handler (reduc_optab, vec_mode)->insn_code
           == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduc op not supported by target.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
        return false;
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform reduction.");
  /* Create the destination vector.  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e. - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
        r0 = phi (init, r0)
        r1 = phi (init, r1)
        r0 = x0 + r0;
        r1 = x1 + r1;
    (i.e. we generate VF results in 2 registers).
   In this case we have a separate def-use cycle for each copy, and therefore
   for each copy we get the vector def for the reduction variable from the
   respective phi node created for this copy.

   Otherwise (the reduction is unused in the loop nest), we can combine
   together intermediate results, like so (e.g. for ncopies=2):
        r = phi (init, r)
        r = x0 + r;
        r = x1 + r;
   (i.e. we generate VF/2 results in a single register).
   In this case for each copy we get the vector def for the reduction variable
   from the vectorized reduction operation generated in the previous iteration.
  */

  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
    {
      single_defuse_cycle = true;
      epilog_copies = 1;
    }
  else
    epilog_copies = ncopies;
  prev_stmt_info = NULL;
  prev_phi_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      if (j == 0 || !single_defuse_cycle)
        {
          /* Create the reduction-phi that defines the reduction-operand.  */
          new_phi = create_phi_node (vec_dest, loop->header);
          set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi,
                                                          loop_vinfo));
        }

      /* Handle uses.  */
      if (j == 0)
        {
          loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
          if (op_type == ternary_op)
            {
              loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
                                                            NULL);
            }

          /* Get the vector def for the reduction variable from the phi
             node.  */
          reduc_def = PHI_RESULT (new_phi);
          first_phi = new_phi;
        }
      else
        {
          enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
          loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
                                                            loop_vec_def1);

          if (single_defuse_cycle)
            reduc_def = gimple_assign_lhs (new_stmt);
          else
            reduc_def = PHI_RESULT (new_phi);

          STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
        }

      /* Arguments are ready.  Create the new vector stmt.  */
      if (op_type == binary_op)
        expr = build2 (code, vectype, loop_vec_def0, reduc_def);
      else
        expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
                       reduc_def);
      new_stmt = gimple_build_assign (vec_dest, expr);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      vect_finish_stmt_generation (stmt, new_stmt, gsi);

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
      prev_stmt_info = vinfo_for_stmt (new_stmt);
      prev_phi_info = vinfo_for_stmt (new_phi);
    }
  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  if (!single_defuse_cycle)
    new_temp = gimple_assign_lhs (*vec_stmt);
  vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
                                    epilog_reduc_code, first_phi);
  return true;
}
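
/* Worked example (an assumption, not part of the original source): for
   "s = s + a[i]" over ints with VF = 8 and a V4SI vectype, nunits = 4 and
   ncopies = VF/nunits = 2.  If 's' is unused inside the loop nest, the
   function above uses a single def-use cycle (epilog_copies = 1) whose two
   copies chain through one reduction phi, and the epilog then reduces one
   final V4SI vector of partial sums via
   vect_create_epilog_for_reduction.  */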
/* Checks if CALL can be vectorized in type VECTYPE.  Returns
   a function declaration if the target has a vectorized version
   of the function, or NULL_TREE if the function cannot be vectorized.  */

tree
vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
{
  tree fndecl = gimple_call_fndecl (call);
  enum built_in_function code;

  /* We only handle functions that do not read or clobber memory -- i.e.
     const or novops ones.  */
  if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
    return NULL_TREE;

  if (!fndecl
      || TREE_CODE (fndecl) != FUNCTION_DECL
      || !DECL_BUILT_IN (fndecl))
    return NULL_TREE;

  code = DECL_FUNCTION_CODE (fndecl);
  return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
                                                        vectype_in);
}
/* Function vectorizable_call.

   Check if STMT performs a function call that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */

bool
vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree op, type;
  tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
  tree vectype_out, vectype_in;
  int nunits_in;
  int nunits_out;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  tree fndecl, new_temp, def, rhs_type, lhs_type;
  gimple def_stmt;
  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
  gimple new_stmt;
  int ncopies, j;
  VEC(tree, heap) *vargs = NULL;
  enum { NARROW, NONE, WIDEN } modifier;
  size_t i, nargs;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
    return false;

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  /* Is STMT a vectorizable call?  */
  if (!is_gimple_call (stmt))
    return false;

  if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
    return false;

  /* Process function arguments.  */
  rhs_type = NULL_TREE;
  nargs = gimple_call_num_args (stmt);

  /* Bail out if the function has more than two arguments; we do not have
     interesting builtin functions to vectorize with more than two arguments.
     No arguments is also not good.  */
  if (nargs == 0 || nargs > 2)
    return false;
  for (i = 0; i < nargs; i++)
    {
      op = gimple_call_arg (stmt, i);

      /* We can only handle calls with arguments of the same type.  */
      if (rhs_type
          && rhs_type != TREE_TYPE (op))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "argument types differ.");
          return false;
        }
      rhs_type = TREE_TYPE (op);

      if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "use not simple.");
          return false;
        }
    }

  vectype_in = get_vectype_for_scalar_type (rhs_type);
  if (!vectype_in)
    return false;
  nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);

  lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
  vectype_out = get_vectype_for_scalar_type (lhs_type);
  if (!vectype_out)
    return false;
  nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);

  /* FORNOW */
  if (nunits_in == nunits_out / 2)
    modifier = NARROW;
  else if (nunits_out == nunits_in)
    modifier = NONE;
  else if (nunits_out == nunits_in / 2)
    modifier = WIDEN;
  else
    return false;
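
/* For illustration (an assumption, not part of the original source): a call
   taking V2DF operands (nunits_in = 2) and producing a V4SI result
   (nunits_out = 4) satisfies nunits_in == nunits_out / 2 and is classified
   as NARROW; V4SF operands with a V4SI result give NONE; V4SI operands with
   a V2DI result give nunits_out == nunits_in / 2, i.e. WIDEN.  */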
  /* For now, we only vectorize functions if a target specific builtin
     is available.  TODO -- in some cases, it might be profitable to
     insert the calls for pieces of the vector, in order to be able
     to vectorize other operations in the loop.  */
  fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
  if (fndecl == NULL_TREE)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "function is not vectorizable.");

      return false;
    }

  gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));

  if (modifier == NARROW)
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
  else
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;

  /* Sanity check: make sure that at least one copy of the vectorized stmt
     needs to be generated.  */
  gcc_assert (ncopies >= 1);

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "=== vectorizable_call ===");
      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
      return true;
    }

  /** Transform.  **/
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform call.");

  /* Handle def.  */
  scalar_dest = gimple_call_lhs (stmt);
  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  prev_stmt_info = NULL;
  switch (modifier)
    {
    case NONE:
      for (j = 0; j < ncopies; ++j)
        {
          /* Build argument list for the vectorized call.  */
          if (j == 0)
            vargs = VEC_alloc (tree, heap, nargs);
          else
            VEC_truncate (tree, vargs, 0);

          for (i = 0; i < nargs; i++)
            {
              op = gimple_call_arg (stmt, i);
              if (j == 0)
                vec_oprnd0
                  = vect_get_vec_def_for_operand (op, stmt, NULL);
              else
                vec_oprnd0
                  = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);

              VEC_quick_push (tree, vargs, vec_oprnd0);
            }

          new_stmt = gimple_build_call_vec (fndecl, vargs);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_call_set_lhs (new_stmt, new_temp);

          vect_finish_stmt_generation (stmt, new_stmt, gsi);
          mark_symbols_for_renaming (new_stmt);

          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;

          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }

      break;

    case NARROW:
      for (j = 0; j < ncopies; ++j)
        {
          /* Build argument list for the vectorized call.  */
          if (j == 0)
            vargs = VEC_alloc (tree, heap, nargs * 2);
          else
            VEC_truncate (tree, vargs, 0);

          for (i = 0; i < nargs; i++)
            {
              op = gimple_call_arg (stmt, i);
              if (j == 0)
                {
                  vec_oprnd0
                    = vect_get_vec_def_for_operand (op, stmt, NULL);
                  vec_oprnd1
                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
                }
              else
                {
                  vec_oprnd0
                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd1);
                  vec_oprnd1
                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
                }

              VEC_quick_push (tree, vargs, vec_oprnd0);
              VEC_quick_push (tree, vargs, vec_oprnd1);
            }

          new_stmt = gimple_build_call_vec (fndecl, vargs);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_call_set_lhs (new_stmt, new_temp);

          vect_finish_stmt_generation (stmt, new_stmt, gsi);
          mark_symbols_for_renaming (new_stmt);

          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;

          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }

      *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);

      break;

    case WIDEN:
      /* No current target implements this case.  */
      return false;
    }
  VEC_free (tree, heap, vargs);

  /* Update the exception handling table with the vector stmt if necessary.  */
  if (maybe_clean_or_replace_eh_stmt (stmt, *vec_stmt))
    gimple_purge_dead_eh_edges (gimple_bb (stmt));

  /* The call in STMT might prevent it from being removed in dce.
     We however cannot remove it here, due to the way the ssa name
     it defines is mapped to the new definition.  So just replace
     rhs of the statement with something harmless.  */

  type = TREE_TYPE (scalar_dest);
  new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
                                  fold_convert (type, integer_zero_node));
  set_vinfo_for_stmt (new_stmt, stmt_info);
  set_vinfo_for_stmt (stmt, NULL);
  STMT_VINFO_STMT (stmt_info) = new_stmt;
  gsi_replace (gsi, new_stmt, false);
  SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;

  return true;
}
/* Function vect_gen_widened_results_half

   Create a vector stmt whose code, type, number of arguments, and result
   variable are CODE, OP_TYPE, and VEC_DEST, and whose arguments are
   VEC_OPRND0 and VEC_OPRND1.  The new vector stmt is to be inserted at GSI.
   In the case that CODE is a CALL_EXPR, this means that a call to DECL
   needs to be created (DECL is a function-decl of a target-builtin).
   STMT is the original scalar stmt that we are vectorizing.  */

static gimple
vect_gen_widened_results_half (enum tree_code code,
                               tree decl,
                               tree vec_oprnd0, tree vec_oprnd1, int op_type,
                               tree vec_dest, gimple_stmt_iterator *gsi,
                               gimple stmt)
{
  gimple new_stmt;
  tree new_temp;
  tree sym;
  ssa_op_iter iter;

  /* Generate half of the widened result:  */
  if (code == CALL_EXPR)
    {
      /* Target specific support.  */
      if (op_type == binary_op)
        new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
      else
        new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);
    }
  else
    {
      /* Generic support.  */
      gcc_assert (op_type == TREE_CODE_LENGTH (code));
      if (op_type != binary_op)
        vec_oprnd1 = NULL;
      new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
                                               vec_oprnd1);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
    }
  vect_finish_stmt_generation (stmt, new_stmt, gsi);

  if (code == CALL_EXPR)
    {
      FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
        {
          if (TREE_CODE (sym) == SSA_NAME)
            sym = SSA_NAME_VAR (sym);
          mark_sym_for_renaming (sym);
        }
    }

  return new_stmt;
}
/* Function vectorizable_conversion.

   Check if STMT performs a conversion operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
bool
vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
                         gimple *vec_stmt, slp_tree slp_node)
{
  tree vec_dest;
  tree scalar_dest;
  tree op0;
  tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
  tree decl1 = NULL_TREE, decl2 = NULL_TREE;
  tree new_temp;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
  gimple new_stmt = NULL;
  stmt_vec_info prev_stmt_info;
  int nunits_in;
  int nunits_out;
  tree vectype_out, vectype_in;
  int ncopies, j;
  tree rhs_type, lhs_type;
  tree builtin_decl;
  tree integral_type;
  enum { NARROW, NONE, WIDEN } modifier;
  int i;
  VEC(tree,heap) *vec_oprnds0 = NULL;
  tree vop0;
  tree sym;
  ssa_op_iter iter;
  VEC(tree,heap) *dummy = NULL;
  int dummy_int;
  /* Is STMT a vectorizable conversion?  */

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
    return false;

  if (!is_gimple_assign (stmt))
    return false;

  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
    return false;

  code = gimple_assign_rhs_code (stmt);
  if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
    return false;

  /* Check types of lhs and rhs.  */
  op0 = gimple_assign_rhs1 (stmt);
  rhs_type = TREE_TYPE (op0);
  vectype_in = get_vectype_for_scalar_type (rhs_type);
  if (!vectype_in)
    return false;
  nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);

  scalar_dest = gimple_assign_lhs (stmt);
  lhs_type = TREE_TYPE (scalar_dest);
  vectype_out = get_vectype_for_scalar_type (lhs_type);
  if (!vectype_out)
    return false;
  nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);

  /* FORNOW */
  if (nunits_in == nunits_out / 2)
    modifier = NARROW;
  else if (nunits_out == nunits_in)
    modifier = NONE;
  else if (nunits_out == nunits_in / 2)
    modifier = WIDEN;
  else
    return false;

  if (modifier == NONE)
    gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);

  /* Bail out if the types are both integral or non-integral.  */
  if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
      || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
    return false;

  integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;

  if (modifier == NARROW)
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
  else
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;

  /* FORNOW: SLP with multiple types is not supported.  The SLP analysis
     verifies this, so we can safely override NCOPIES with 1 here.  */
  if (slp_node)
    ncopies = 1;

  /* Sanity check: make sure that at least one copy of the vectorized stmt
     needs to be generated.  */
  gcc_assert (ncopies >= 1);
  /* Check the operands of the operation.  */
  if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "use not simple.");
      return false;
    }

  /* Supportable by target?  */
  if ((modifier == NONE
       && !targetm.vectorize.builtin_conversion (code, integral_type))
      || (modifier == WIDEN
          && !supportable_widening_operation (code, stmt, vectype_in,
                                              &decl1, &decl2,
                                              &code1, &code2,
                                              &dummy_int, &dummy))
      || (modifier == NARROW
          && !supportable_narrowing_operation (code, stmt, vectype_in,
                                               &code1, &dummy_int, &dummy)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "conversion not supported by target.");
      return false;
    }
  if (modifier != NONE)
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
      /* FORNOW: SLP not supported.  */
      if (STMT_SLP_TYPE (stmt_info))
        return false;
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
      return true;
    }

  /** Transform.  **/
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform conversion.");

  /* Handle def.  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  if (modifier == NONE && !slp_node)
    vec_oprnds0 = VEC_alloc (tree, heap, 1);

  prev_stmt_info = NULL;
  switch (modifier)
    {
    case NONE:
      for (j = 0; j < ncopies; j++)
        {
          /* Handle uses.  */
          if (j == 0)
            vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
          else
            vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);

          builtin_decl =
            targetm.vectorize.builtin_conversion (code, integral_type);
          for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
            {
              /* Arguments are ready.  Create the new vector stmt.  */
              new_stmt = gimple_build_call (builtin_decl, 1, vop0);
              new_temp = make_ssa_name (vec_dest, new_stmt);
              gimple_call_set_lhs (new_stmt, new_temp);
              vect_finish_stmt_generation (stmt, new_stmt, gsi);
              FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
                                         SSA_OP_ALL_VIRTUALS)
                {
                  if (TREE_CODE (sym) == SSA_NAME)
                    sym = SSA_NAME_VAR (sym);
                  mark_sym_for_renaming (sym);
                }
              if (slp_node)
                VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
                                new_stmt);
            }

          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
      break;

    case WIDEN:
      /* In case the vectorization factor (VF) is bigger than the number
         of elements that we can fit in a vectype (nunits), we have to
         generate more than one vector stmt - i.e. - we need to "unroll"
         the vector stmt by a factor VF/nunits.  */
      for (j = 0; j < ncopies; j++)
        {
          if (j == 0)
            vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
          else
            vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);

          STMT_VINFO_VECTYPE (stmt_info) = vectype_in;

          /* Generate first half of the widened result:  */
          new_stmt
            = vect_gen_widened_results_half (code1, decl1,
                                             vec_oprnd0, vec_oprnd1,
                                             unary_op, vec_dest, gsi, stmt);
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);

          /* Generate second half of the widened result:  */
          new_stmt
            = vect_gen_widened_results_half (code2, decl2,
                                             vec_oprnd0, vec_oprnd1,
                                             unary_op, vec_dest, gsi, stmt);
          STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
      break;

    case NARROW:
      /* In case the vectorization factor (VF) is bigger than the number
         of elements that we can fit in a vectype (nunits), we have to
         generate more than one vector stmt - i.e. - we need to "unroll"
         the vector stmt by a factor VF/nunits.  */
      for (j = 0; j < ncopies; j++)
        {
          /* Handle uses.  */
          if (j == 0)
            {
              vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
              vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
            }
          else
            {
              vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
              vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
            }

          /* Arguments are ready.  Create the new vector stmt.  */
          new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
                                                   vec_oprnd1);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, new_temp);
          vect_finish_stmt_generation (stmt, new_stmt, gsi);

          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;

          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }

      *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
      break;
    }

  if (vec_oprnds0)
    VEC_free (tree, heap, vec_oprnds0);

  return true;
}
3815 /* Function vectorizable_assignment.
3817 Check if STMT performs an assignment (copy) that can be vectorized.
3818 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3819 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3820 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3823 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3824 gimple *vec_stmt, slp_tree slp_node)
3829 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3830 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3831 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3835 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3836 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3839 VEC(tree,heap) *vec_oprnds = NULL;
3842 /* Multiple types in SLP are handled by creating the appropriate number of
3843 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
3848 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3850 gcc_assert (ncopies >= 1);
3852 return false; /* FORNOW */
3854 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3857 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3860 /* Is vectorizable assignment? */
3861 if (!is_gimple_assign (stmt))
3864 scalar_dest = gimple_assign_lhs (stmt);
3865 if (TREE_CODE (scalar_dest) != SSA_NAME)
3868 if (gimple_assign_single_p (stmt)
3869 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3870 op = gimple_assign_rhs1 (stmt);
3874 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3876 if (vect_print_dump_info (REPORT_DETAILS))
3877 fprintf (vect_dump, "use not simple.");
3881 if (!vec_stmt) /* transformation not required. */
3883 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3884 if (vect_print_dump_info (REPORT_DETAILS))
3885 fprintf (vect_dump, "=== vectorizable_assignment ===");
3886 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3891 if (vect_print_dump_info (REPORT_DETAILS))
3892 fprintf (vect_dump, "transform assignment.");
3895 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3898 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3900 /* Arguments are ready. Create the new vector stmt. */
3901 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3903 *vec_stmt = gimple_build_assign (vec_dest, vop);
3904 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3905 gimple_assign_set_lhs (*vec_stmt, new_temp);
3906 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3907 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3910 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3913 VEC_free (tree, heap, vec_oprnds);
3918 /* Function vect_min_worthwhile_factor.
3920 For a loop where we could vectorize the operation indicated by CODE,
3921 return the minimum vectorization factor that makes it worthwhile
3922 to use generic vectors. */
3924 vect_min_worthwhile_factor (enum tree_code code)
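/* A minimal sketch of the kind of mapping this function implements
   (illustrative values only): cheap arithmetic codes map to a small
   factor, everything else to "never worthwhile", e.g.:

     switch (code)
       {
       case PLUS_EXPR:
       case MINUS_EXPR:
       case NEGATE_EXPR:
         return 4;
       case BIT_AND_EXPR:
       case BIT_IOR_EXPR:
       case BIT_XOR_EXPR:
       case BIT_NOT_EXPR:
         return 2;
       default:
         return INT_MAX;
       }  */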
3945 /* Function vectorizable_induction
3947 Check if PHI performs an induction computation that can be vectorized.
3948 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3949 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3950 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3953 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3956 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3957 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3958 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3959 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3960 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3961 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3964 gcc_assert (ncopies >= 1);
3965 /* FORNOW. This restriction should be relaxed. */
3966 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3968 if (vect_print_dump_info (REPORT_DETAILS))
3969 fprintf (vect_dump, "multiple types in nested loop.");
3973 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3976 /* FORNOW: SLP not supported. */
3977 if (STMT_SLP_TYPE (stmt_info))
3980 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3982 if (gimple_code (phi) != GIMPLE_PHI)
3985 if (!vec_stmt) /* transformation not required. */
3987 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3988 if (vect_print_dump_info (REPORT_DETAILS))
3989 fprintf (vect_dump, "=== vectorizable_induction ===");
3990 vect_model_induction_cost (stmt_info, ncopies);
3996 if (vect_print_dump_info (REPORT_DETAILS))
3997 fprintf (vect_dump, "transform induction phi.");
3999 vec_def = get_initial_def_for_induction (phi);
4000 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4005 /* Function vectorizable_operation.
4007 Check if STMT performs a binary or unary operation that can be vectorized.
4008 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4009 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4010 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4013 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
4014 gimple *vec_stmt, slp_tree slp_node)
4018 tree op0, op1 = NULL;
4019 tree vec_oprnd1 = NULL_TREE;
4020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4021 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4023 enum tree_code code;
4024 enum machine_mode vec_mode;
4029 enum machine_mode optab_op2_mode;
4032 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4033 gimple new_stmt = NULL;
4034 stmt_vec_info prev_stmt_info;
4035 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4040 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4043 bool shift_p = false;
4044 bool scalar_shift_arg = false;
4046 /* Multiple types in SLP are handled by creating the appropriate number of
4047 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4052 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4054 gcc_assert (ncopies >= 1);
4056 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4059 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4062 /* Is STMT a vectorizable binary/unary operation? */
4063 if (!is_gimple_assign (stmt))
4066 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4069 scalar_dest = gimple_assign_lhs (stmt);
4070 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4073 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4074 if (nunits_out != nunits_in)
4077 code = gimple_assign_rhs_code (stmt);
4079 /* For pointer addition, we should use the normal plus for
4080 the vector addition. */
4081 if (code == POINTER_PLUS_EXPR)
4084 /* Support only unary or binary operations. */
4085 op_type = TREE_CODE_LENGTH (code);
4086 if (op_type != unary_op && op_type != binary_op)
4088 if (vect_print_dump_info (REPORT_DETAILS))
4089 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4093 op0 = gimple_assign_rhs1 (stmt);
4094 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4096 if (vect_print_dump_info (REPORT_DETAILS))
4097 fprintf (vect_dump, "use not simple.");
4101 if (op_type == binary_op)
4103 op1 = gimple_assign_rhs2 (stmt);
4104 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4106 if (vect_print_dump_info (REPORT_DETAILS))
4107 fprintf (vect_dump, "use not simple.");
4112 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4113 or scalar. If the shift/rotate amount is a vector, use the vector/vector shift optabs. */
4115 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4116 || code == RROTATE_EXPR)
4120 /* Vector shifted by vector. */
4121 if (dt[1] == vect_loop_def)
4123 optab = optab_for_tree_code (code, vectype, optab_vector);
4124 if (vect_print_dump_info (REPORT_DETAILS))
4125 fprintf (vect_dump, "vector/vector shift/rotate found.");
4128 /* See if the machine has a vector shifted by scalar insn and if not
4129 then see if it has a vector shifted by vector insn. */
4130 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4132 optab = optab_for_tree_code (code, vectype, optab_scalar);
4134 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4135 != CODE_FOR_nothing))
4137 scalar_shift_arg = true;
4138 if (vect_print_dump_info (REPORT_DETAILS))
4139 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4143 optab = optab_for_tree_code (code, vectype, optab_vector);
4144 if (vect_print_dump_info (REPORT_DETAILS)
4146 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4147 != CODE_FOR_nothing))
4148 fprintf (vect_dump, "vector/vector shift/rotate found.");
4154 if (vect_print_dump_info (REPORT_DETAILS))
4155 fprintf (vect_dump, "operand mode requires invariant argument.");
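/* Illustrative example (not from the original sources) of the two shift
   flavors distinguished above:

     for (i = 0; i < n; i++) a[i] = b[i] << c[i];   vector/vector shift
     for (i = 0; i < n; i++) a[i] = b[i] << 3;      vector/scalar shift

   The first loop needs an insn whose shift count is itself a vector;
   the second can use an insn taking a scalar (or immediate) count,
   which is what the optab_scalar query above checks for.  */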
4160 optab = optab_for_tree_code (code, vectype, optab_default);
4162 /* Supportable by target? */
4165 if (vect_print_dump_info (REPORT_DETAILS))
4166 fprintf (vect_dump, "no optab.");
4169 vec_mode = TYPE_MODE (vectype);
4170 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4171 if (icode == CODE_FOR_nothing)
4173 if (vect_print_dump_info (REPORT_DETAILS))
4174 fprintf (vect_dump, "op not supported by target.");
4175 /* Check only during analysis. */
4176 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4177 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4178 < vect_min_worthwhile_factor (code)
4181 if (vect_print_dump_info (REPORT_DETAILS))
4182 fprintf (vect_dump, "proceeding using word mode.");
4185 /* Worthwhile without SIMD support? Check only during analysis. */
4186 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4187 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4188 < vect_min_worthwhile_factor (code)
4191 if (vect_print_dump_info (REPORT_DETAILS))
4192 fprintf (vect_dump, "not worthwhile without SIMD support.");
4196 if (!vec_stmt) /* transformation not required. */
4198 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4199 if (vect_print_dump_info (REPORT_DETAILS))
4200 fprintf (vect_dump, "=== vectorizable_operation ===");
4201 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4207 if (vect_print_dump_info (REPORT_DETAILS))
4208 fprintf (vect_dump, "transform binary/unary operation.");
4211 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4213 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4214 created in the previous stages of the recursion, so no allocation is
4215 needed, except for the case of shift with scalar shift argument. In that
4216 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4217 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4218 In case of loop-based vectorization we allocate VECs of size 1. We
4219 allocate VEC_OPRNDS1 only in case of binary operation. */
4222 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4223 if (op_type == binary_op)
4224 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4226 else if (scalar_shift_arg)
4227 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4229 /* In case the vectorization factor (VF) is bigger than the number
4230 of elements that we can fit in a vectype (nunits), we have to generate
4231 more than one vector stmt - i.e - we need to "unroll" the
4232 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4233 from one copy of the vector stmt to the next, in the field
4234 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4235 stages to find the correct vector defs to be used when vectorizing
4236 stmts that use the defs of the current stmt. The example below illustrates
4237 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4238 4 vectorized stmts):
4240 before vectorization:
4241 RELATED_STMT VEC_STMT
4245 step 1: vectorize stmt S1 (done in vectorizable_load. See more details in vect_get_vec_def_for_copy_stmt):
4247 RELATED_STMT VEC_STMT
4248 VS1_0: vx0 = memref0 VS1_1 -
4249 VS1_1: vx1 = memref1 VS1_2 -
4250 VS1_2: vx2 = memref2 VS1_3 -
4251 VS1_3: vx3 = memref3 - -
4252 S1: x = load - VS1_0
4255 step2: vectorize stmt S2 (done here):
4256 To vectorize stmt S2 we first need to find the relevant vector
4257 def for the first operand 'x'. This is, as usual, obtained from
4258 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4259 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4260 relevant vector def 'vx0'. Having found 'vx0' we can generate
4261 the vector stmt VS2_0, and as usual, record it in the
4262 STMT_VINFO_VEC_STMT of stmt S2.
4263 When creating the second copy (VS2_1), we obtain the relevant vector
4264 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4265 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4266 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4267 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4268 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4269 chain of stmts and pointers:
4270 RELATED_STMT VEC_STMT
4271 VS1_0: vx0 = memref0 VS1_1 -
4272 VS1_1: vx1 = memref1 VS1_2 -
4273 VS1_2: vx2 = memref2 VS1_3 -
4274 VS1_3: vx3 = memref3 - -
4275 S1: x = load - VS1_0
4276 VS2_0: vz0 = vx0 + v1 VS2_1 -
4277 VS2_1: vz1 = vx1 + v1 VS2_2 -
4278 VS2_2: vz2 = vx2 + v1 VS2_3 -
4279 VS2_3: vz3 = vx3 + v1 - -
4280 S2: z = x + 1 - VS2_0 */
4282 prev_stmt_info = NULL;
4283 for (j = 0; j < ncopies; j++)
4288 if (op_type == binary_op && scalar_shift_arg)
4290 /* Vector shl and shr insn patterns can be defined with scalar
4291 operand 2 (shift operand). In this case, use constant or loop
4292 invariant op1 directly, without extending it to vector mode first. */
4294 optab_op2_mode = insn_data[icode].operand[2].mode;
4295 if (!VECTOR_MODE_P (optab_op2_mode))
4297 if (vect_print_dump_info (REPORT_DETAILS))
4298 fprintf (vect_dump, "operand 1 using scalar mode.");
4300 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4303 /* Store vec_oprnd1 for every vector stmt to be created
4304 for SLP_NODE. We check during the analysis that all the
4305 shift arguments are the same.
4306 TODO: Allow different constants for different vector
4307 stmts generated for an SLP instance. */
4308 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4309 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4314 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4315 (a special case for certain kinds of vector shifts); otherwise,
4316 operand 1 should be of a vector type (the usual case). */
4317 if (op_type == binary_op && !vec_oprnd1)
4318 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4321 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4325 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4327 /* Arguments are ready. Create the new vector stmt. */
4328 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4330 vop1 = ((op_type == binary_op)
4331 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4332 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4333 new_temp = make_ssa_name (vec_dest, new_stmt);
4334 gimple_assign_set_lhs (new_stmt, new_temp);
4335 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4337 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4344 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4346 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4347 prev_stmt_info = vinfo_for_stmt (new_stmt);
4350 VEC_free (tree, heap, vec_oprnds0);
4352 VEC_free (tree, heap, vec_oprnds1);
4358 /* Get vectorized definitions for loop-based vectorization. For the first
4359 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4360 scalar operand), and for the rest we get a copy with
4361 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4362 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4363 The vectors are collected into VEC_OPRNDS. */
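/* For example (following the caller in vectorizable_type_demotion below):
   for a two-step conversion the caller passes
   MULTI_STEP_CVT == vect_pow2 (1) - 1 == 1, so two rounds collect four
   vector defs in VEC_OPRNDS: the four inputs that two demotion stages
   then narrow down to a single result vector.  */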
4366 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4367 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4371 /* Get first vector operand. */
4372 /* All the vector operands except the very first one (that is a scalar oprnd) are obtained from the previous vector definition passed in OPRND. */
4374 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4375 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4377 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4379 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4381 /* Get second vector operand. */
4382 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4383 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4387 /* For conversion in multiple steps, continue to get operands recursively. */
4390 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4394 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4395 For multi-step conversions store the resulting vectors and call the function recursively. */
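/* Illustrative sketch (assuming a pack-truncate style demotion from int
   to short with four-element int vectors; pack_trunc is a pseudo-op
   standing for the narrowing operation):

     vi0 = {a0, a1, a2, a3}
     vi1 = {a4, a5, a6, a7}
     vs  = pack_trunc <vi0, vi1>     one vector of 8 shorts

   Each statement created below consumes a pair of entries from
   VEC_OPRNDS and yields one narrower vector.  */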
4399 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4400 int multi_step_cvt, gimple stmt,
4401 VEC (tree, heap) *vec_dsts,
4402 gimple_stmt_iterator *gsi,
4403 slp_tree slp_node, enum tree_code code,
4404 stmt_vec_info *prev_stmt_info)
4407 tree vop0, vop1, new_tmp, vec_dest;
4409 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4411 vec_dest = VEC_pop (tree, vec_dsts);
4413 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4415 /* Create demotion operation. */
4416 vop0 = VEC_index (tree, *vec_oprnds, i);
4417 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4418 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4419 new_tmp = make_ssa_name (vec_dest, new_stmt);
4420 gimple_assign_set_lhs (new_stmt, new_tmp);
4421 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4424 /* Store the resulting vector for next recursive call. */
4425 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4428 /* This is the last step of the conversion sequence. Store the
4429 vectors in SLP_NODE or in vector info of the scalar statement
4430 (or in STMT_VINFO_RELATED_STMT chain). */
4432 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4435 if (!*prev_stmt_info)
4436 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4438 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4440 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4445 /* For multi-step demotion operations we first generate demotion operations
4446 from the source type to the intermediate types, and then combine the
4447 results (stored in VEC_OPRNDS) in a demotion operation to the destination type. */
4451 /* At each level of recursion we have half of the operands we had at the previous level. */
4453 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4454 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4455 stmt, vec_dsts, gsi, slp_node,
4456 code, prev_stmt_info);
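/* Concrete example (illustrative): demoting int to char with 128-bit
   vectors takes two stages:

     stage 1:  4 int vectors   ->  2 short vectors
     stage 2:  2 short vectors ->  1 char vector

   which is why each level of recursion above halves VEC_OPRNDS.  */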
4461 /* Function vectorizable_type_demotion
4463 Check if STMT performs a binary or unary operation that involves
4464 type demotion, and if it can be vectorized.
4465 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4466 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4467 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4470 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4471 gimple *vec_stmt, slp_tree slp_node)
4476 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4477 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4478 enum tree_code code, code1 = ERROR_MARK;
4481 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4482 stmt_vec_info prev_stmt_info;
4489 int multi_step_cvt = 0;
4490 VEC (tree, heap) *vec_oprnds0 = NULL;
4491 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4492 tree last_oprnd, intermediate_type;
4494 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4497 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4500 /* Is STMT a vectorizable type-demotion operation? */
4501 if (!is_gimple_assign (stmt))
4504 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4507 code = gimple_assign_rhs_code (stmt);
4508 if (!CONVERT_EXPR_CODE_P (code))
4511 op0 = gimple_assign_rhs1 (stmt);
4512 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4515 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4517 scalar_dest = gimple_assign_lhs (stmt);
4518 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4521 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4522 if (nunits_in >= nunits_out)
4525 /* Multiple types in SLP are handled by creating the appropriate number of
4526 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4531 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4533 gcc_assert (ncopies >= 1);
4535 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4536 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4537 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4538 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4539 && CONVERT_EXPR_CODE_P (code))))
4542 /* Check the operands of the operation. */
4543 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4545 if (vect_print_dump_info (REPORT_DETAILS))
4546 fprintf (vect_dump, "use not simple.");
4550 /* Supportable by target? */
4551 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4552 &multi_step_cvt, &interm_types))
4555 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4557 if (!vec_stmt) /* transformation not required. */
4559 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4560 if (vect_print_dump_info (REPORT_DETAILS))
4561 fprintf (vect_dump, "=== vectorizable_demotion ===");
4562 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4567 if (vect_print_dump_info (REPORT_DETAILS))
4568 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4571 /* In case of multi-step demotion, we first generate demotion operations to
4572 the intermediate types, and then from those types to the final one.
4573 We create vector destinations for the intermediate types (TYPES) received
4574 from supportable_narrowing_operation, and store them in the correct order
4575 for future use in vect_create_vectorized_demotion_stmts(). */
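/* For example (sketch): when demoting int to char via short, the vector
   destinations are pushed as

     push (char dest)       final type first
     push (short dest)      intermediate type last

   so that VEC_pop in vect_create_vectorized_demotion_stmts() hands out
   the short destination for stage 1 and the char destination for the
   final stage.  */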
4577 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4579 vec_dsts = VEC_alloc (tree, heap, 1);
4581 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4582 VEC_quick_push (tree, vec_dsts, vec_dest);
4586 for (i = VEC_length (tree, interm_types) - 1;
4587 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4589 vec_dest = vect_create_destination_var (scalar_dest,
4591 VEC_quick_push (tree, vec_dsts, vec_dest);
4595 /* In case the vectorization factor (VF) is bigger than the number
4596 of elements that we can fit in a vectype (nunits), we have to generate
4597 more than one vector stmt - i.e - we need to "unroll" the
4598 vector stmt by a factor VF/nunits. */
4600 prev_stmt_info = NULL;
4601 for (j = 0; j < ncopies; j++)
4605 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4608 VEC_free (tree, heap, vec_oprnds0);
4609 vec_oprnds0 = VEC_alloc (tree, heap,
4610 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4611 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4612 vect_pow2 (multi_step_cvt) - 1);
4615 /* Arguments are ready. Create the new vector stmts. */
4616 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4617 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4618 multi_step_cvt, stmt, tmp_vec_dsts,
4619 gsi, slp_node, code1,
4623 VEC_free (tree, heap, vec_oprnds0);
4624 VEC_free (tree, heap, vec_dsts);
4625 VEC_free (tree, heap, tmp_vec_dsts);
4626 VEC_free (tree, heap, interm_types);
4628 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4633 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4634 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4635 the resulting vectors and call the function recursively. */
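/* Sketch (illustrative): promoting char to int with 128-bit vectors,
   each stage doubles the number of vectors:

     stage 1:  1 char vector   ->  2 short vectors
     stage 2:  2 short vectors ->  4 int vectors

   hence VEC_TMP below is allocated at twice the length of
   *VEC_OPRNDS0.  */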
4638 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4639 VEC (tree, heap) **vec_oprnds1,
4640 int multi_step_cvt, gimple stmt,
4641 VEC (tree, heap) *vec_dsts,
4642 gimple_stmt_iterator *gsi,
4643 slp_tree slp_node, enum tree_code code1,
4644 enum tree_code code2, tree decl1,
4645 tree decl2, int op_type,
4646 stmt_vec_info *prev_stmt_info)
4649 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4650 gimple new_stmt1, new_stmt2;
4651 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4652 VEC (tree, heap) *vec_tmp;
4654 vec_dest = VEC_pop (tree, vec_dsts);
4655 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4657 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4659 if (op_type == binary_op)
4660 vop1 = VEC_index (tree, *vec_oprnds1, i);
4664 /* Generate the two halves of promotion operation. */
4665 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4666 op_type, vec_dest, gsi, stmt);
4667 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4668 op_type, vec_dest, gsi, stmt);
4669 if (is_gimple_call (new_stmt1))
4671 new_tmp1 = gimple_call_lhs (new_stmt1);
4672 new_tmp2 = gimple_call_lhs (new_stmt2);
4676 new_tmp1 = gimple_assign_lhs (new_stmt1);
4677 new_tmp2 = gimple_assign_lhs (new_stmt2);
4682 /* Store the results for the recursive call. */
4683 VEC_quick_push (tree, vec_tmp, new_tmp1);
4684 VEC_quick_push (tree, vec_tmp, new_tmp2);
4688 /* Last step of the promotion sequence - store the results. */
4691 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4692 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4696 if (!*prev_stmt_info)
4697 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4699 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4701 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4702 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4703 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4710 /* For a multi-step promotion operation we call the function
4711 recursively for every stage. We start from the input type,
4712 create promotion operations to the intermediate types, and then
4713 create promotions to the output type. */
4714 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4715 VEC_free (tree, heap, vec_tmp);
4716 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4717 multi_step_cvt - 1, stmt,
4718 vec_dsts, gsi, slp_node, code1,
4719 code2, decl1, decl2, op_type,
4725 /* Function vectorizable_type_promotion
4727 Check if STMT performs a binary or unary operation that involves
4728 type promotion, and if it can be vectorized.
4729 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4730 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4731 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4734 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4735 gimple *vec_stmt, slp_tree slp_node)
4739 tree op0, op1 = NULL;
4740 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4741 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4742 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4743 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4744 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4748 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4749 stmt_vec_info prev_stmt_info;
4756 tree intermediate_type = NULL_TREE;
4757 int multi_step_cvt = 0;
4758 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4759 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4761 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4764 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4767 /* Is STMT a vectorizable type-promotion operation? */
4768 if (!is_gimple_assign (stmt))
4771 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4774 code = gimple_assign_rhs_code (stmt);
4775 if (!CONVERT_EXPR_CODE_P (code)
4776 && code != WIDEN_MULT_EXPR)
4779 op0 = gimple_assign_rhs1 (stmt);
4780 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4783 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4785 scalar_dest = gimple_assign_lhs (stmt);
4786 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4789 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4790 if (nunits_in <= nunits_out)
4793 /* Multiple types in SLP are handled by creating the appropriate number of
4794 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4799 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4801 gcc_assert (ncopies >= 1);
4803 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4804 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4805 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4806 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4807 && CONVERT_EXPR_CODE_P (code))))
4810 /* Check the operands of the operation. */
4811 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4813 if (vect_print_dump_info (REPORT_DETAILS))
4814 fprintf (vect_dump, "use not simple.");
4818 op_type = TREE_CODE_LENGTH (code);
4819 if (op_type == binary_op)
4821 op1 = gimple_assign_rhs2 (stmt);
4822 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4824 if (vect_print_dump_info (REPORT_DETAILS))
4825 fprintf (vect_dump, "use not simple.");
4830 /* Supportable by target? */
4831 if (!supportable_widening_operation (code, stmt, vectype_in,
4832 &decl1, &decl2, &code1, &code2,
4833 &multi_step_cvt, &interm_types))
4836 /* Binary widening operation can only be supported directly by the architecture. */
4838 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4840 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4842 if (!vec_stmt) /* transformation not required. */
4844 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4845 if (vect_print_dump_info (REPORT_DETAILS))
4846 fprintf (vect_dump, "=== vectorizable_promotion ===");
4847 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4853 if (vect_print_dump_info (REPORT_DETAILS))
4854 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4858 /* In case of multi-step promotion, we first generate promotion operations
4859 to the intermediate types, and then from those types to the final one.
4860 We store vector destination in VEC_DSTS in the correct order for
4861 recursive creation of promotion operations in
4862 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4863 according to TYPES received from supportable_widening_operation(). */
4865 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4867 vec_dsts = VEC_alloc (tree, heap, 1);
4869 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4870 VEC_quick_push (tree, vec_dsts, vec_dest);
4874 for (i = VEC_length (tree, interm_types) - 1;
4875 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4877 vec_dest = vect_create_destination_var (scalar_dest,
4879 VEC_quick_push (tree, vec_dsts, vec_dest);
4885 vec_oprnds0 = VEC_alloc (tree, heap,
4886 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4887 if (op_type == binary_op)
4888 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4891 /* In case the vectorization factor (VF) is bigger than the number
4892 of elements that we can fit in a vectype (nunits), we have to generate
4893 more than one vector stmt - i.e - we need to "unroll" the
4894 vector stmt by a factor VF/nunits. */
4896 prev_stmt_info = NULL;
4897 for (j = 0; j < ncopies; j++)
4903 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4906 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4907 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4908 if (op_type == binary_op)
4910 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4911 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4917 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4918 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4919 if (op_type == binary_op)
4921 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4922 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4926 /* Arguments are ready. Create the new vector stmts. */
4927 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4928 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4929 multi_step_cvt, stmt,
4931 gsi, slp_node, code1, code2,
4932 decl1, decl2, op_type,
4936 VEC_free (tree, heap, vec_dsts);
4937 VEC_free (tree, heap, tmp_vec_dsts);
4938 VEC_free (tree, heap, interm_types);
4939 VEC_free (tree, heap, vec_oprnds0);
4940 VEC_free (tree, heap, vec_oprnds1);
4942 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4947 /* Function vect_strided_store_supported.
4949 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4950 and FALSE otherwise. */
4953 vect_strided_store_supported (tree vectype)
4955 optab interleave_high_optab, interleave_low_optab;
4958 mode = (int) TYPE_MODE (vectype);
4960 /* Check that the operation is supported. */
4961 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4962 vectype, optab_default);
4963 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4964 vectype, optab_default);
4965 if (!interleave_high_optab || !interleave_low_optab)
4967 if (vect_print_dump_info (REPORT_DETAILS))
4968 fprintf (vect_dump, "no optab for interleave.");
4972 if (optab_handler (interleave_high_optab, mode)->insn_code
4974 || optab_handler (interleave_low_optab, mode)->insn_code
4975 == CODE_FOR_nothing)
4977 if (vect_print_dump_info (REPORT_DETAILS))
4978 fprintf (vect_dump, "interleave op not supported by target.");
4986 /* Function vect_permute_store_chain.
4988 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4989 a power of 2, generate interleave_high/low stmts to reorder the data
4990 correctly for the stores. Return the final references for stores in
4993 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4994 The input is 4 vectors each containing 8 elements. We assign a number to each
4995 element; the input sequence is:
4997 1st vec: 0 1 2 3 4 5 6 7
4998 2nd vec: 8 9 10 11 12 13 14 15
4999 3rd vec: 16 17 18 19 20 21 22 23
5000 4th vec: 24 25 26 27 28 29 30 31
5002 The output sequence should be:
5004 1st vec: 0 8 16 24 1 9 17 25
5005 2nd vec: 2 10 18 26 3 11 19 27
5006 3rd vec: 4 12 20 28 5 13 21 29
5007 4th vec: 6 14 22 30 7 15 23 31
5009 i.e., we interleave the contents of the four vectors in their order.
5011 We use interleave_high/low instructions to create such output. The input of
5012 each interleave_high/low operation is two vectors:
5015 the even elements of the result vector are obtained left-to-right from the
5016 high/low elements of the first vector. The odd elements of the result are
5017 obtained left-to-right from the high/low elements of the second vector.
5018 The output of interleave_high will be: 0 4 1 5
5019 and of interleave_low: 2 6 3 7
5022 The permutation is done in log LENGTH stages. In each stage interleave_high
5023 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5024 where the first argument is taken from the first half of DR_CHAIN and the
5025 second argument from its second half.
5028 I1: interleave_high (1st vec, 3rd vec)
5029 I2: interleave_low (1st vec, 3rd vec)
5030 I3: interleave_high (2nd vec, 4th vec)
5031 I4: interleave_low (2nd vec, 4th vec)
5033 The output for the first stage is:
5035 I1: 0 16 1 17 2 18 3 19
5036 I2: 4 20 5 21 6 22 7 23
5037 I3: 8 24 9 25 10 26 11 27
5038 I4: 12 28 13 29 14 30 15 31
5040 The output of the second stage, i.e. the final result is:
5042 I1: 0 8 16 24 1 9 17 25
5043 I2: 2 10 18 26 3 11 19 27
5044 I3: 4 12 20 28 5 13 21 29
5045 I4: 6 14 22 30 7 15 23 31. */
5048 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5049 unsigned int length,
5051 gimple_stmt_iterator *gsi,
5052 VEC(tree,heap) **result_chain)
5054 tree perm_dest, vect1, vect2, high, low;
5056 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5060 enum tree_code high_code, low_code;
5062 scalar_dest = gimple_assign_lhs (stmt);
5064 /* Check that the operation is supported. */
5065 if (!vect_strided_store_supported (vectype))
5068 *result_chain = VEC_copy (tree, heap, dr_chain);
5070 for (i = 0; i < exact_log2 (length); i++)
5072 for (j = 0; j < length/2; j++)
5074 vect1 = VEC_index (tree, dr_chain, j);
5075 vect2 = VEC_index (tree, dr_chain, j+length/2);
5077 /* Create interleaving stmt:
5078 in the case of big endian:
5079 high = interleave_high (vect1, vect2)
5080 and in the case of little endian:
5081 high = interleave_low (vect1, vect2). */
5082 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5083 DECL_GIMPLE_REG_P (perm_dest) = 1;
5084 add_referenced_var (perm_dest);
5085 if (BYTES_BIG_ENDIAN)
5087 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5088 low_code = VEC_INTERLEAVE_LOW_EXPR;
5092 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5093 high_code = VEC_INTERLEAVE_LOW_EXPR;
5095 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
5097 high = make_ssa_name (perm_dest, perm_stmt);
5098 gimple_assign_set_lhs (perm_stmt, high);
5099 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5100 VEC_replace (tree, *result_chain, 2*j, high);
5102 /* Create interleaving stmt:
5103 in the case of big endian:
5104 low = interleave_low (vect1, vect2)
5105 and in the case of little endian:
5106 low = interleave_high (vect1, vect2). */
5107 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5108 DECL_GIMPLE_REG_P (perm_dest) = 1;
5109 add_referenced_var (perm_dest);
5110 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
5112 low = make_ssa_name (perm_dest, perm_stmt);
5113 gimple_assign_set_lhs (perm_stmt, low);
5114 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5115 VEC_replace (tree, *result_chain, 2*j+1, low);
5117 dr_chain = VEC_copy (tree, heap, *result_chain);
5123 /* Function vectorizable_store.
5125 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that can be vectorized.
5127 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5128 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5129 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5132 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5138 tree vec_oprnd = NULL_TREE;
5139 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5140 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5141 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5142 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5143 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5144 enum machine_mode vec_mode;
5146 enum dr_alignment_support alignment_support_scheme;
5149 enum vect_def_type dt;
5150 stmt_vec_info prev_stmt_info = NULL;
5151 tree dataref_ptr = NULL_TREE;
5152 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5155 gimple next_stmt, first_stmt = NULL;
5156 bool strided_store = false;
5157 unsigned int group_size, i;
5158 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5160 VEC(tree,heap) *vec_oprnds = NULL;
5161 bool slp = (slp_node != NULL);
5162 stmt_vec_info first_stmt_vinfo;
5163 unsigned int vec_num;
5165 /* Multiple types in SLP are handled by creating the appropriate number of
5166 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
5171 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5173 gcc_assert (ncopies >= 1);
5175 /* FORNOW. This restriction should be relaxed. */
5176 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5178 if (vect_print_dump_info (REPORT_DETAILS))
5179 fprintf (vect_dump, "multiple types in nested loop.");
5183 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5186 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5189 /* Is vectorizable store? */
5191 if (!is_gimple_assign (stmt))
5194 scalar_dest = gimple_assign_lhs (stmt);
5195 if (TREE_CODE (scalar_dest) != ARRAY_REF
5196 && TREE_CODE (scalar_dest) != INDIRECT_REF
5197 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5200 gcc_assert (gimple_assign_single_p (stmt));
5201 op = gimple_assign_rhs1 (stmt);
5202 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5204 if (vect_print_dump_info (REPORT_DETAILS))
5205 fprintf (vect_dump, "use not simple.");
5209 /* The scalar rhs type needs to be trivially convertible to the vector
5210 component type. This should always be the case. */
5211 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op)))
5213 if (vect_print_dump_info (REPORT_DETAILS))
5214 fprintf (vect_dump, "??? operands of different types");
5218 vec_mode = TYPE_MODE (vectype);
5219 /* FORNOW. In some cases can vectorize even if data-type not supported
5220 (e.g. - array initialization with 0). */
5221 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5224 if (!STMT_VINFO_DATA_REF (stmt_info))
5227 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5229 strided_store = true;
5230 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5231 if (!vect_strided_store_supported (vectype)
5232 && !PURE_SLP_STMT (stmt_info) && !slp)
5235 if (first_stmt == stmt)
5237 /* STMT is the leader of the group. Check the operands of all the
5238 stmts of the group. */
5239 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5242 gcc_assert (gimple_assign_single_p (next_stmt));
5243 op = gimple_assign_rhs1 (next_stmt);
5244 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5246 if (vect_print_dump_info (REPORT_DETAILS))
5247 fprintf (vect_dump, "use not simple.");
5250 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5255 if (!vec_stmt) /* transformation not required. */
5257 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5258 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5266 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5267 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5269 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5272 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5274 /* We vectorize all the stmts of the interleaving group when we
5275 reach the last stmt in the group. */
5276 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5277 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5285 strided_store = false;
5287 /* VEC_NUM is the number of vect stmts to be created for this group. */
5289 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5291 vec_num = group_size;
5297 group_size = vec_num = 1;
5298 first_stmt_vinfo = stmt_info;
5301 if (vect_print_dump_info (REPORT_DETAILS))
5302 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5304 dr_chain = VEC_alloc (tree, heap, group_size);
5305 oprnds = VEC_alloc (tree, heap, group_size);
5307 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5308 gcc_assert (alignment_support_scheme);
5309 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5311 /* In case the vectorization factor (VF) is bigger than the number
5312 of elements that we can fit in a vectype (nunits), we have to generate
5313 more than one vector stmt - i.e - we need to "unroll" the
5314 vector stmt by a factor VF/nunits. For more details see documentation in
5315 vect_get_vec_def_for_copy_stmt. */
5317 /* In case of interleaving (non-unit strided access):
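e.g., given a chain of four scalar stores to consecutive locations,
such as:

S1: &base + 2 = x2
S2: &base = x0
S3: &base + 1 = x1
S4: &base + 3 = x3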
5324 We create vectorized stores starting from base address (the access of the
5325 first stmt in the chain (S2 in the above example)), when the last store stmt
5326 of the chain (S4) is reached:
5329 VS2: &base + vec_size*1 = vx0
5330 VS3: &base + vec_size*2 = vx1
5331 VS4: &base + vec_size*3 = vx3
5333 Then permutation statements are generated:
5335 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5336 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5339 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5340 (the order of the data-refs in the output of vect_permute_store_chain
5341 corresponds to the order of scalar stmts in the interleaving chain - see
5342 the documentation of vect_permute_store_chain()).
5344 In case of both multiple types and interleaving, above vector stores and
5345 permutation stmts are created for every copy. The result vector stmts are
5346 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5347 STMT_VINFO_RELATED_STMT for the next copies.
5350 prev_stmt_info = NULL;
5351 for (j = 0; j < ncopies; j++)
5360 /* Get vectorized arguments for SLP_NODE. */
5361 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5363 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5367 /* For interleaved stores we collect vectorized defs for all the
5368 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5369 used as an input to vect_permute_store_chain(), and OPRNDS as
5370 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5372 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5373 OPRNDS are of size 1. */
5374 next_stmt = first_stmt;
5375 for (i = 0; i < group_size; i++)
5377 /* Since gaps are not supported for interleaved stores,
5378 GROUP_SIZE is the exact number of stmts in the chain.
5379 Therefore, NEXT_STMT can't be NULL_TREE. In case that
5380 there is no interleaving, GROUP_SIZE is 1, and only one
5381 iteration of the loop will be executed. */
5382 gcc_assert (next_stmt
5383 && gimple_assign_single_p (next_stmt));
5384 op = gimple_assign_rhs1 (next_stmt);
5386 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5388 VEC_quick_push(tree, dr_chain, vec_oprnd);
5389 VEC_quick_push(tree, oprnds, vec_oprnd);
5390 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5394 /* We should have caught mismatched types earlier. */
5395 gcc_assert (useless_type_conversion_p (vectype,
5396 TREE_TYPE (vec_oprnd)));
5397 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5398 &dummy, &ptr_incr, false,
5400 gcc_assert (!inv_p);
5404 /* For interleaved stores we created vectorized defs for all the
5405 defs stored in OPRNDS in the previous iteration (previous copy).
5406 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5407 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5409 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5410 OPRNDS are of size 1. */
5411 for (i = 0; i < group_size; i++)
5413 op = VEC_index (tree, oprnds, i);
5414 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5415 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5416 VEC_replace(tree, dr_chain, i, vec_oprnd);
5417 VEC_replace(tree, oprnds, i, vec_oprnd);
5420 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5425 result_chain = VEC_alloc (tree, heap, group_size);
5427 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5432 next_stmt = first_stmt;
5433 for (i = 0; i < vec_num; i++)
5436 /* Bump the vector pointer. */
5437 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5441 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5442 else if (strided_store)
5443 /* For strided stores vectorized defs are interleaved in
5444 vect_permute_store_chain(). */
5445 vec_oprnd = VEC_index (tree, result_chain, i);
5447 data_ref = build_fold_indirect_ref (dataref_ptr);
5449 /* Arguments are ready. Create the new vector stmt. */
5450 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5451 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5452 mark_symbols_for_renaming (new_stmt);
5458 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5460 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5462 prev_stmt_info = vinfo_for_stmt (new_stmt);
5463 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5469 VEC_free (tree, heap, dr_chain);
5470 VEC_free (tree, heap, oprnds);
5472 VEC_free (tree, heap, result_chain);
5478 /* Function vect_setup_realignment
5480 This function is called when vectorizing an unaligned load using
5481 the dr_explicit_realign[_optimized] scheme.
5482 This function generates the following code at the loop prolog:
5485 x msq_init = *(floor(p)); # prolog load
5486 realignment_token = call target_builtin;
5488 x msq = phi (msq_init, ---)
5490 The stmts marked with x are generated only for the case of
5491 dr_explicit_realign_optimized.
5493 The code above sets up a new (vector) pointer, pointing to the first
5494 location accessed by STMT, and a "floor-aligned" load using that pointer.
5495 It also generates code to compute the "realignment-token" (if the relevant
5496 target hook was defined), and creates a phi-node at the loop-header bb
5497 whose arguments are the result of the prolog-load (created by this
5498 function) and the result of a load that takes place in the loop (to be
5499 created by the caller to this function).
5501 For the case of dr_explicit_realign_optimized:
5502 The caller to this function uses the phi-result (msq) to create the
5503 realignment code inside the loop, and sets up the missing phi argument,
5506 msq = phi (msq_init, lsq)
5507 lsq = *(floor(p')); # load in loop
5508 result = realign_load (msq, lsq, realignment_token);
5510 For the case of dr_explicit_realign:
5512 msq = *(floor(p)); # load in loop
5514 lsq = *(floor(p')); # load in loop
5515 result = realign_load (msq, lsq, realignment_token);
5518 STMT - (scalar) load stmt to be vectorized. This load accesses
5519 a memory location that may be unaligned.
5520 BSI - place where new code is to be inserted.
5521 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5525 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5526 target hook, if defined.
5527 Return value - the result of the loop-header phi node. */
5530 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5531 tree *realignment_token,
5532 enum dr_alignment_support alignment_support_scheme,
5534 struct loop **at_loop)
5536 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5537 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5538 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5539 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5541 tree scalar_dest = gimple_assign_lhs (stmt);
5548 tree msq_init = NULL_TREE;
5551 tree msq = NULL_TREE;
5552 gimple_seq stmts = NULL;
5554 bool compute_in_loop = false;
5555 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5556 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5557 struct loop *loop_for_initial_load;
5559 gcc_assert (alignment_support_scheme == dr_explicit_realign
5560 || alignment_support_scheme == dr_explicit_realign_optimized);
5562 /* We need to generate three things:
5563 1. the misalignment computation
5564 2. the extra vector load (for the optimized realignment scheme).
5565 3. the phi node for the two vectors from which the realignment is
5566 done (for the optimized realignment scheme).
5569 /* 1. Determine where to generate the misalignment computation.
5571 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5572 calculation will be generated by this function, outside the loop (in the
5573 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5574 caller, inside the loop.
5576 Background: If the misalignment remains fixed throughout the iterations of
5577 the loop, then both realignment schemes are applicable, and also the
5578 misalignment computation can be done outside LOOP. This is because we are
5579 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5580 are a multiple of VS (the Vector Size), and therefore the misalignment in
5581 different vectorized LOOP iterations is always the same.
5582 The problem arises only if the memory access is in an inner-loop nested
5583 inside LOOP, which is now being vectorized using outer-loop vectorization.
5584 This is the only case when the misalignment of the memory access may not
5585 remain fixed throughout the iterations of the inner-loop (as explained in
5586 detail in vect_supportable_dr_alignment). In this case, not only is the
5587 optimized realignment scheme not applicable, but also the misalignment
5588 computation (and generation of the realignment token that is passed to
5589 REALIGN_LOAD) have to be done inside the loop.
5591 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5592 or not, which in turn determines if the misalignment is computed inside
5593 the inner-loop, or outside LOOP. */
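/* Illustrative example (not from the original sources):

     for (i = 0; i < N; i++)        outer loop: LOOP, being vectorized
       for (j = 0; j < M; j++)      inner loop
         ... = *(p + i + j*N);

   Relative to LOOP this access advances by a single element per
   iteration of i (not a multiple of VS), so its misalignment changes
   between outer-loop iterations and the realignment token must be
   recomputed inside the loop.  */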
5595 if (init_addr != NULL_TREE)
5597 compute_in_loop = true;
5598 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5602 /* 2. Determine where to generate the extra vector load.
5604 For the optimized realignment scheme, instead of generating two vector
5605 loads in each iteration, we generate a single extra vector load in the
5606 preheader of the loop, and in each iteration reuse the result of the
5607 vector load from the previous iteration. In case the memory access is in
5608 an inner-loop nested inside LOOP, which is now being vectorized using
5609 outer-loop vectorization, we need to determine whether this initial vector
5610 load should be generated at the preheader of the inner-loop, or can be
5611 generated at the preheader of LOOP. If the memory access has no evolution
5612 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5613 to be generated inside LOOP (in the preheader of the inner-loop). */
5615 if (nested_in_vect_loop)
5617 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5618 bool invariant_in_outerloop =
5619 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5620 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5623 loop_for_initial_load = loop;
5625 *at_loop = loop_for_initial_load;
5627 /* 3. For the case of the optimized realignment, create the first vector
5628 load at the loop preheader. */
5630 if (alignment_support_scheme == dr_explicit_realign_optimized)
5632 /* Create msq_init = *(floor(p1)) in the loop preheader */
5634 gcc_assert (!compute_in_loop);
5635 pe = loop_preheader_edge (loop_for_initial_load);
5636 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5637 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5638 &init_addr, &inc, true, &inv_p, NULL_TREE);
5639 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5640 new_stmt = gimple_build_assign (vec_dest, data_ref);
5641 new_temp = make_ssa_name (vec_dest, new_stmt);
5642 gimple_assign_set_lhs (new_stmt, new_temp);
5643 mark_symbols_for_renaming (new_stmt);
5644 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5645 gcc_assert (!new_bb);
5646 msq_init = gimple_assign_lhs (new_stmt);
5649 /* 4. Create realignment token using a target builtin, if available.
5650 It is done either inside the containing loop, or before LOOP (as
5651 determined above). */
5653 if (targetm.vectorize.builtin_mask_for_load)
5657 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5658 if (compute_in_loop)
5659 gcc_assert (init_addr); /* already computed by the caller. */
5662 /* Generate the INIT_ADDR computation outside LOOP. */
5663 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5665 pe = loop_preheader_edge (loop);
5666 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5667 gcc_assert (!new_bb);
5670 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5671 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5673 vect_create_destination_var (scalar_dest,
5674 gimple_call_return_type (new_stmt));
5675 new_temp = make_ssa_name (vec_dest, new_stmt);
5676 gimple_call_set_lhs (new_stmt, new_temp);
5678 if (compute_in_loop)
5679 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5682 /* Generate the misalignment computation outside LOOP. */
5683 pe = loop_preheader_edge (loop);
5684 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5685 gcc_assert (!new_bb);
5688 *realignment_token = gimple_call_lhs (new_stmt);
5690 /* The result of the CALL_EXPR to this builtin is determined from
5691 the value of the parameter and no global variables are touched
5692 which makes the builtin a "const" function. Requiring the
5693 builtin to have the "const" attribute makes it unnecessary
5694 to call mark_call_clobbered. */
5695 gcc_assert (TREE_READONLY (builtin_decl));
5698 if (alignment_support_scheme == dr_explicit_realign)
5701 gcc_assert (!compute_in_loop);
5702 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5705 /* 5. Create msq = phi <msq_init, lsq> in loop. */
5707 pe = loop_preheader_edge (containing_loop);
5708 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5709 msq = make_ssa_name (vec_dest, NULL);
5710 phi_stmt = create_phi_node (msq, containing_loop->header);
5711 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5712 add_phi_arg (phi_stmt, msq_init, pe);
5714 return msq;
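/* As a rough illustration of the dr_explicit_realign_optimized scheme set up
   above, the sketch below emulates the msq/lsq/realign_load sequence in
   scalar C.  It is illustrative only and never compiled in: the function
   name, the 4-element int "vectors" and the 16-byte alignment boundary are
   assumptions made for the example, not part of the vectorizer.  */
#if 0
static void
realign_load_sketch (const int *addr, int n, int out[][4])
{
  /* floor (addr): round down to the 16-byte boundary.  */
  const int *p = (const int *) ((unsigned long) addr & ~0xfUL);
  int shift = (int) (addr - p);   /* realignment token: element offset.  */
  int msq[4], lsq[4];
  int i, j, k;

  /* msq_init = *(floor (p1)), emitted on the preheader edge.  */
  for (j = 0; j < 4; j++)
    msq[j] = p[j];

  for (i = 0; i < n / 4; i++)
    {
      p += 4;
      for (j = 0; j < 4; j++)     /* lsq = *(floor (p2))  */
        lsq[j] = p[j];
      for (j = 0; j < 4; j++)     /* realign_load (msq, lsq, token)  */
        {
          k = j + shift;
          out[i][j] = k < 4 ? msq[k] : lsq[k - 4];
        }
      for (j = 0; j < 4; j++)     /* msq = lsq: reuse last load.  */
        msq[j] = lsq[j];
    }
}
#endif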
5718 /* Function vect_strided_load_supported.
5720 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5721 and FALSE otherwise. */
5723 bool
5724 vect_strided_load_supported (tree vectype)
5726 optab perm_even_optab, perm_odd_optab;
5727 int mode;
5729 mode = (int) TYPE_MODE (vectype);
5731 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5732 optab_default);
5733 if (!perm_even_optab)
5735 if (vect_print_dump_info (REPORT_DETAILS))
5736 fprintf (vect_dump, "no optab for perm_even.");
5740 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5742 if (vect_print_dump_info (REPORT_DETAILS))
5743 fprintf (vect_dump, "perm_even op not supported by target.");
5747 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5748 optab_default);
5749 if (!perm_odd_optab)
5751 if (vect_print_dump_info (REPORT_DETAILS))
5752 fprintf (vect_dump, "no optab for perm_odd.");
5756 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5758 if (vect_print_dump_info (REPORT_DETAILS))
5759 fprintf (vect_dump, "perm_odd op not supported by target.");
5760 return false;
5763 return true;
5766 /* Function vect_permute_load_chain.
5768 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5769 a power of 2, generate extract_even/odd stmts to reorder the input data
5770 correctly. Return the final references for loads in RESULT_CHAIN.
5772 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5773 The input is 4 vectors each containing 8 elements. We assign a number to each
5774 element, the input sequence is:
5776 1st vec: 0 1 2 3 4 5 6 7
5777 2nd vec: 8 9 10 11 12 13 14 15
5778 3rd vec: 16 17 18 19 20 21 22 23
5779 4th vec: 24 25 26 27 28 29 30 31
5781 The output sequence should be:
5783 1st vec: 0 4 8 12 16 20 24 28
5784 2nd vec: 1 5 9 13 17 21 25 29
5785 3rd vec: 2 6 10 14 18 22 26 30
5786 4th vec: 3 7 11 15 19 23 27 31
5788 i.e., the first output vector should contain the first elements of each
5789 interleaving group, etc.
5791 We use extract_even/odd instructions to create such output. The input of each
5792 extract_even/odd operation is two vectors
5793 1st vec 2nd vec
5794 0 1 2 3 4 5 6 7
5796 and the output is the vector of extracted even/odd elements. The output of
5797 extract_even will be: 0 2 4 6
5798 and of extract_odd: 1 3 5 7
5801 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
5802 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5803 order. In our example,
5805 E1: extract_even (1st vec, 2nd vec)
5806 E2: extract_odd (1st vec, 2nd vec)
5807 E3: extract_even (3rd vec, 4th vec)
5808 E4: extract_odd (3rd vec, 4th vec)
5810 The output for the first stage will be:
5812 E1: 0 2 4 6 8 10 12 14
5813 E2: 1 3 5 7 9 11 13 15
5814 E3: 16 18 20 22 24 26 28 30
5815 E4: 17 19 21 23 25 27 29 31
5817 In order to proceed and create the correct sequence for the next stage (or
5818 for the correct output, if the second stage is the last one, as in our
5819 example), we first put the output of extract_even operation and then the
5820 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5821 The input for the second stage is:
5823 1st vec (E1): 0 2 4 6 8 10 12 14
5824 2nd vec (E3): 16 18 20 22 24 26 28 30
5825 3rd vec (E2): 1 3 5 7 9 11 13 15
5826 4th vec (E4): 17 19 21 23 25 27 29 31
5828 The output of the second stage:
5830 E1: 0 4 8 12 16 20 24 28
5831 E2: 2 6 10 14 18 22 26 30
5832 E3: 1 5 9 13 17 21 25 29
5833 E4: 3 7 11 15 19 23 27 31
5835 And RESULT_CHAIN after reordering:
5837 1st vec (E1): 0 4 8 12 16 20 24 28
5838 2nd vec (E3): 1 5 9 13 17 21 25 29
5839 3rd vec (E2): 2 6 10 14 18 22 26 30
5840 4th vec (E4): 3 7 11 15 19 23 27 31. */
5842 static bool
5843 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5844 unsigned int length,
5845 gimple stmt,
5846 gimple_stmt_iterator *gsi,
5847 VEC(tree,heap) **result_chain)
5849 tree perm_dest, data_ref, first_vect, second_vect;
5850 gimple perm_stmt;
5851 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5852 int i;
5853 unsigned int j;
5855 /* Check that the operation is supported. */
5856 if (!vect_strided_load_supported (vectype))
5857 return false;
5859 *result_chain = VEC_copy (tree, heap, dr_chain);
5860 for (i = 0; i < exact_log2 (length); i++)
5862 for (j = 0; j < length; j +=2)
5864 first_vect = VEC_index (tree, dr_chain, j);
5865 second_vect = VEC_index (tree, dr_chain, j+1);
5867 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5868 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5869 DECL_GIMPLE_REG_P (perm_dest) = 1;
5870 add_referenced_var (perm_dest);
5872 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5873 perm_dest, first_vect,
5874 second_vect);
5876 data_ref = make_ssa_name (perm_dest, perm_stmt);
5877 gimple_assign_set_lhs (perm_stmt, data_ref);
5878 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5879 mark_symbols_for_renaming (perm_stmt);
5881 VEC_replace (tree, *result_chain, j/2, data_ref);
5883 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5884 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5885 DECL_GIMPLE_REG_P (perm_dest) = 1;
5886 add_referenced_var (perm_dest);
5888 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5889 perm_dest, first_vect,
5890 second_vect);
5891 data_ref = make_ssa_name (perm_dest, perm_stmt);
5892 gimple_assign_set_lhs (perm_stmt, data_ref);
5893 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5894 mark_symbols_for_renaming (perm_stmt);
5896 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5898 dr_chain = VEC_copy (tree, heap, *result_chain);
5901 return true;
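/* For intuition, the stand-alone sketch below models the same
   extract_even/extract_odd network on plain int arrays.  It is illustrative
   only and never compiled in; NELT_SKETCH and the function name are invented
   for the example.  CHAIN holds LENGTH "vectors" of NELT_SKETCH ints each,
   LENGTH is a power of two, and RESULT receives the reordered vectors.  */
#if 0
#define NELT_SKETCH 8
static void
permute_load_chain_sketch (int chain[][NELT_SKETCH],
                           int result[][NELT_SKETCH], int length)
{
  int stages, i, j, k;

  for (stages = 0, i = length; i > 1; i >>= 1)
    stages++;                           /* log2 (LENGTH) stages.  */

  for (i = 0; i < stages; i++)
    {
      for (j = 0; j < length; j += 2)
        {
          const int *v1 = chain[j], *v2 = chain[j + 1];
          for (k = 0; k < NELT_SKETCH; k++)
            {
              /* extract_even: element 2k of the concatenation v1|v2.  */
              result[j / 2][k]
                = 2 * k < NELT_SKETCH ? v1[2 * k] : v2[2 * k - NELT_SKETCH];
              /* extract_odd: element 2k+1 of the concatenation v1|v2.  */
              result[j / 2 + length / 2][k]
                = 2 * k + 1 < NELT_SKETCH
                  ? v1[2 * k + 1] : v2[2 * k + 1 - NELT_SKETCH];
            }
        }
      /* DR_CHAIN <- RESULT_CHAIN before the next stage.  */
      for (j = 0; j < length; j++)
        for (k = 0; k < NELT_SKETCH; k++)
          chain[j][k] = result[j][k];
    }
}
#endif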
5904 /* Function vect_transform_strided_load.
5906 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5907 to perform their permutation and ascribe the result vectorized statements to
5908 the scalar statements. */
5911 static bool
5912 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5913 gimple_stmt_iterator *gsi)
5915 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5916 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5917 gimple next_stmt, new_stmt;
5918 VEC(tree,heap) *result_chain = NULL;
5919 unsigned int i, gap_count;
5920 tree tmp_data_ref;
5922 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5923 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5924 vectors, that are ready for vector computation. */
5925 result_chain = VEC_alloc (tree, heap, size);
5927 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5928 return false;
5930 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5931 Since we scan the chain starting from its first node, their order
5932 corresponds to the order of data-refs in RESULT_CHAIN. */
5933 next_stmt = first_stmt;
5934 gap_count = 1;
5935 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5940 /* Skip the gaps. Loads created for the gaps will be removed by dead
5941 code elimination pass later. No need to check for the first stmt in
5942 the group, since it always exists.
5943 DR_GROUP_GAP is the number of steps in elements from the previous
5944 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5945 correspond to the gaps. */
5947 if (next_stmt != first_stmt
5948 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5950 gap_count++;
5951 continue;
5956 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5957 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5958 copies, and we put the new vector statement in the first available
5959 RELATED_STMT field. */
5960 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5961 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5962 else
5964 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5966 gimple prev_stmt =
5967 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5968 gimple rel_stmt =
5969 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5970 while (rel_stmt)
5972 prev_stmt = rel_stmt;
5973 rel_stmt =
5974 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5977 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5978 new_stmt;
5982 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5983 gap_count = 1;
5984 /* If NEXT_STMT accesses the same DR as the previous statement,
5985 put the same TMP_DATA_REF as its vectorized statement; otherwise
5986 get the next data-ref from RESULT_CHAIN. */
5987 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5988 break;
5992 VEC_free (tree, heap, result_chain);
5993 return true;
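/* Worked example of the gap skipping above (illustrative, not from the
   sources): for a group accessing a[4i] and a[4i+2] with stride 4,
   DR_GROUP_GAP (a[4i+2]) is 2 and vect_permute_load_chain still yields
   four permuted vectors.  The loop hands the first vector to a[4i], skips
   the second as a gap, hands the third to a[4i+2], and never uses the
   fourth, which is later removed as dead code.  */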
5997 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
5998 building a vector of type MASK_TYPE from it) and two input vectors placed in
5999 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
6000 shifting by STRIDE elements of DR_CHAIN for every copy.
6001 (STRIDE is the number of vectorized stmts for NODE divided by the number of
6003 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
6004 the created stmts must be inserted. */
6006 static void
6007 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
6008 int *mask_array, int mask_nunits,
6009 tree mask_element_type, tree mask_type,
6010 int first_vec_indx, int second_vec_indx,
6011 gimple_stmt_iterator *gsi, slp_tree node,
6012 tree builtin_decl, tree vectype,
6013 VEC(tree,heap) *dr_chain,
6014 int ncopies, int vect_stmts_counter)
6016 tree t = NULL_TREE, mask_vec, mask, perm_dest;
6017 gimple perm_stmt = NULL;
6018 stmt_vec_info next_stmt_info;
6019 int i, group_size, stride, dr_chain_size;
6020 tree first_vec, second_vec, data_ref;
6023 VEC (tree, heap) *params = NULL;
6025 /* Create a vector mask. */
6026 for (i = mask_nunits - 1; i >= 0; --i)
6027 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
6028 t);
6029 mask_vec = build_vector (mask_type, t);
6030 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6032 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6033 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6034 dr_chain_size = VEC_length (tree, dr_chain);
6036 /* Initialize the vect stmts of NODE to properly insert the generated
6037 stmts later. */
6038 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6039 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6040 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6042 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6043 for (i = 0; i < ncopies; i++)
6045 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6046 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6048 /* Build argument list for the vectorized call. */
6049 VEC_free (tree, heap, params);
6050 params = VEC_alloc (tree, heap, 3);
6051 VEC_quick_push (tree, params, first_vec);
6052 VEC_quick_push (tree, params, second_vec);
6053 VEC_quick_push (tree, params, mask);
6055 /* Generate the permute statement. */
6056 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6057 data_ref = make_ssa_name (perm_dest, perm_stmt);
6058 gimple_call_set_lhs (perm_stmt, data_ref);
6059 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6060 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6062 if (TREE_CODE (sym) == SSA_NAME)
6063 sym = SSA_NAME_VAR (sym);
6064 mark_sym_for_renaming (sym);
6067 /* Store the vector statement in NODE. */
6068 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6069 stride * i + vect_stmts_counter, perm_stmt);
6071 first_vec_indx += stride;
6072 second_vec_indx += stride;
6075 /* Mark the scalar stmt as vectorized. */
6076 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6077 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
6081 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6082 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6083 representation. Check that the mask is valid and return FALSE if not.
6084 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
6085 the next vector, i.e., the current first vector is not needed. */
6087 static bool
6088 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6089 int mask_nunits, bool only_one_vec, int index,
6090 int *mask, int *current_mask_element,
6091 bool *need_next_vector)
6094 static int number_of_mask_fixes = 1;
6095 static bool mask_fixed = false;
6096 static bool needs_first_vector = false;
6098 /* Convert to target specific representation. */
6099 *current_mask_element = first_mask_element + m;
6100 /* Adjust the value in case it's a mask for second and third vectors. */
6101 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6103 if (*current_mask_element < mask_nunits)
6104 needs_first_vector = true;
6106 /* We have only one input vector to permute but the mask accesses values in
6107 the next vector as well. */
6108 if (only_one_vec && *current_mask_element >= mask_nunits)
6110 if (vect_print_dump_info (REPORT_DETAILS))
6112 fprintf (vect_dump, "permutation requires at least two vectors ");
6113 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6119 /* The mask requires the next vector. */
6120 if (*current_mask_element >= mask_nunits * 2)
6122 if (needs_first_vector || mask_fixed)
6124 /* We either need the first vector too or have already moved to the
6125 next vector. In both cases, this permutation needs three
6126 vectors. */
6127 if (vect_print_dump_info (REPORT_DETAILS))
6129 fprintf (vect_dump, "permutation requires at "
6130 "least three vectors ");
6131 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6133 return false;
6137 /* We move to the next vector, dropping the first one and working with
6138 the second and the third - we need to adjust the values of the mask
6139 accordingly. */
6140 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6142 for (i = 0; i < index; i++)
6143 mask[i] -= mask_nunits * number_of_mask_fixes;
6145 number_of_mask_fixes++;
6146 mask_fixed = true;
6149 *need_next_vector = mask_fixed;
6151 /* This was the last element of this mask. Start a new one. */
6152 if (index == mask_nunits - 1)
6154 number_of_mask_fixes = 1;
6155 mask_fixed = false;
6156 needs_first_vector = false;
6159 return true;
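/* Worked example (illustrative): with mask_nunits 4, the mask {6,9,9,9}
   from the comment before vect_transform_slp_perm_load below trips the
   *current_mask_element >= mask_nunits * 2 test at value 9.  The code then
   drops the first vector: 9 becomes 9 - 4 = 5, the already emitted element
   6 becomes 2, giving {2,5,5,5}, and MASK_FIXED makes subsequent calls
   report NEED_NEXT_VECTOR so the caller shifts to the next vector pair.  */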
6163 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6164 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6165 permute statements for SLP_NODE_INSTANCE. */
6166 bool
6167 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6168 gimple_stmt_iterator *gsi, int vf,
6169 slp_instance slp_node_instance, bool analyze_only)
6171 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6172 tree mask_element_type = NULL_TREE, mask_type;
6173 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6175 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6176 gimple next_scalar_stmt;
6177 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6178 int first_mask_element;
6179 int index, unroll_factor, *mask, current_mask_element, ncopies;
6180 bool only_one_vec = false, need_next_vector = false;
6181 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6183 if (!targetm.vectorize.builtin_vec_perm)
6185 if (vect_print_dump_info (REPORT_DETAILS))
6187 fprintf (vect_dump, "no builtin for vect permute for ");
6188 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6194 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6195 &mask_element_type);
6196 if (!builtin_decl || !mask_element_type)
6198 if (vect_print_dump_info (REPORT_DETAILS))
6200 fprintf (vect_dump, "no builtin for vect permute for ");
6201 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6207 mask_type = get_vectype_for_scalar_type (mask_element_type);
6208 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6209 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6210 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6211 scale = mask_nunits / nunits;
6212 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6214 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6215 unrolling factor. */
6216 orig_vec_stmts_num = group_size *
6217 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6218 if (orig_vec_stmts_num == 1)
6219 only_one_vec = true;
6221 /* Number of copies is determined by the final vectorization factor
6222 relatively to SLP_NODE_INSTANCE unrolling factor. */
6223 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6225 /* Generate permutation masks for every NODE. Number of masks for each NODE
6226 is equal to GROUP_SIZE.
6227 E.g., we have a group of three nodes with three loads from the same
6228 location in each node, and the vector size is 4. I.e., we have an
6229 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6230 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6231 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6232 for c's: c0c0c0c1 c1c1c2c2 c2b3c3c3
6234 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
6235 specific type, e.g., in bytes for Altivec).
6236 The last mask is illegal since we assume two operands for permute
6237 operation, and the mask element values can't be outside that range. Hence,
6238 the last mask must be converted into {2,5,5,5}.
6239 For the first two permutations we need the first and the second input
6240 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6241 we need the second and the third vectors: {b1,c1,a2,b2} and
6242 {c2,a3,b3,c3}. */
6244 for (i = 0;
6245 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6246 i, node);
6247 i++)
6249 scalar_index = 0;
6250 index = 0;
6251 vect_stmts_counter = 0;
6252 vec_index = 0;
6253 first_vec_index = vec_index++;
6254 if (only_one_vec)
6255 second_vec_index = first_vec_index;
6256 else
6257 second_vec_index = vec_index++;
6259 for (j = 0; j < unroll_factor; j++)
6261 for (k = 0; k < group_size; k++)
6263 first_mask_element = (i + j * group_size) * scale;
6264 for (m = 0; m < scale; m++)
6266 if (!vect_get_mask_element (stmt, first_mask_element, m,
6267 mask_nunits, only_one_vec, index, mask,
6268 &current_mask_element, &need_next_vector))
6269 return false;
6271 mask[index++] = current_mask_element;
6274 if (index == mask_nunits)
6276 index = 0;
6279 if (need_next_vector)
6281 first_vec_index = second_vec_index;
6282 second_vec_index = vec_index;
6285 next_scalar_stmt = VEC_index (gimple,
6286 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6288 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6289 mask, mask_nunits, mask_element_type, mask_type,
6290 first_vec_index, second_vec_index, gsi, node,
6291 builtin_decl, vectype, dr_chain, ncopies,
6292 vect_stmts_counter++);
6298 free (mask);
6299 return true;
6303 /* vectorizable_load.
6305 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
6306 can be vectorized.
6307 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6308 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6309 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6311 bool
6312 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6313 slp_tree slp_node, slp_instance slp_node_instance)
6316 tree vec_dest = NULL;
6317 tree data_ref = NULL;
6318 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6319 stmt_vec_info prev_stmt_info;
6320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6321 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6322 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6323 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6324 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6325 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6328 gimple new_stmt = NULL;
6330 enum dr_alignment_support alignment_support_scheme;
6331 tree dataref_ptr = NULL_TREE;
6333 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6335 int i, j, group_size;
6336 tree msq = NULL_TREE, lsq;
6337 tree offset = NULL_TREE;
6338 tree realignment_token = NULL_TREE;
6340 VEC(tree,heap) *dr_chain = NULL;
6341 bool strided_load = false;
6345 bool compute_in_loop = false;
6346 struct loop *at_loop;
6348 bool slp = (slp_node != NULL);
6349 bool slp_perm = false;
6350 enum tree_code code;
6352 /* Multiple types in SLP are handled by creating the appropriate number of
6353 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6354 case of SLP. */
6355 if (slp)
6356 ncopies = 1;
6357 else
6358 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6360 gcc_assert (ncopies >= 1);
6362 /* FORNOW. This restriction should be relaxed. */
6363 if (nested_in_vect_loop && ncopies > 1)
6365 if (vect_print_dump_info (REPORT_DETAILS))
6366 fprintf (vect_dump, "multiple types in nested loop.");
6367 return false;
6370 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
6371 slp_perm = true;
6373 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6374 return false;
6376 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6377 return false;
6379 /* Is vectorizable load? */
6380 if (!is_gimple_assign (stmt))
6381 return false;
6383 scalar_dest = gimple_assign_lhs (stmt);
6384 if (TREE_CODE (scalar_dest) != SSA_NAME)
6385 return false;
6387 code = gimple_assign_rhs_code (stmt);
6388 if (code != ARRAY_REF
6389 && code != INDIRECT_REF
6390 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6391 return false;
6393 if (!STMT_VINFO_DATA_REF (stmt_info))
6394 return false;
6396 scalar_type = TREE_TYPE (DR_REF (dr));
6397 mode = (int) TYPE_MODE (vectype);
6399 /* FORNOW. In some cases can vectorize even if data-type not supported
6400 (e.g. - data copies). */
6401 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6403 if (vect_print_dump_info (REPORT_DETAILS))
6404 fprintf (vect_dump, "Aligned load, but unsupported type.");
6408 /* The vector component type needs to be trivially convertible to the
6409 scalar lhs. This should always be the case. */
6410 if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype)))
6412 if (vect_print_dump_info (REPORT_DETAILS))
6413 fprintf (vect_dump, "??? operands of different types");
6414 return false;
6417 /* Check if the load is a part of an interleaving chain. */
6418 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6420 strided_load = true;
6422 gcc_assert (! nested_in_vect_loop);
6424 /* Check if interleaving is supported. */
6425 if (!vect_strided_load_supported (vectype)
6426 && !PURE_SLP_STMT (stmt_info) && !slp)
6427 return false;
6430 if (!vec_stmt) /* transformation not required. */
6432 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6433 vect_model_load_cost (stmt_info, ncopies, NULL);
6434 return true;
6437 if (vect_print_dump_info (REPORT_DETAILS))
6438 fprintf (vect_dump, "transform load.");
6442 if (strided_load)
6444 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6445 /* Check if the chain of loads is already vectorized. */
6446 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6448 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6449 return true;
6451 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6452 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6454 /* VEC_NUM is the number of vect stmts to be created for this group. */
6455 if (slp)
6457 strided_load = false;
6458 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6460 else
6461 vec_num = group_size;
6463 dr_chain = VEC_alloc (tree, heap, vec_num);
6465 else
6467 first_stmt = stmt;
6468 first_dr = dr;
6469 group_size = vec_num = 1;
6472 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6473 gcc_assert (alignment_support_scheme);
6475 /* In case the vectorization factor (VF) is bigger than the number
6476 of elements that we can fit in a vectype (nunits), we have to generate
6477 more than one vector stmt - i.e - we need to "unroll" the
6478 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6479 from one copy of the vector stmt to the next, in the field
6480 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6481 stages to find the correct vector defs to be used when vectorizing
6482 stmts that use the defs of the current stmt. The example below illustrates
6483 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6484 4 vectorized stmts):
6486 before vectorization:
6487 RELATED_STMT VEC_STMT
6488 S1: x = load - -
6489 S2: z = x + 1 - -
6491 step 1: vectorize stmt S1:
6492 We first create the vector stmt VS1_0, and, as usual, record a
6493 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6494 Next, we create the vector stmt VS1_1, and record a pointer to
6495 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6496 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6497 stmts:
6498 RELATED_STMT VEC_STMT
6499 VS1_0: vx0 = memref0 VS1_1 -
6500 VS1_1: vx1 = memref1 VS1_2 -
6501 VS1_2: vx2 = memref2 VS1_3 -
6502 VS1_3: vx3 = memref3 - -
6503 S1: x = load - VS1_0
6504 S2: z = x + 1 - -
6506 See the documentation of vect_get_vec_def_for_stmt_copy for how the
6507 information we recorded in the RELATED_STMT field is used to vectorize
6508 stmts that use the defs of the current stmt. */
6510 /* In case of interleaving (non-unit strided access):
6512 S1: x2 = &base + 2
6513 S2: x0 = &base
6514 S3: x1 = &base + 1
6515 S4: x3 = &base + 3
6517 Vectorized loads are created in the order of memory accesses
6518 starting from the access of the first stmt of the chain:
6520 VS1: vx0 = &base
6521 VS2: vx1 = &base + vec_size*1
6522 VS3: vx3 = &base + vec_size*2
6523 VS4: vx4 = &base + vec_size*3
6525 Then permutation statements are generated:
6527 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6528 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6529 VS7: vx7 = VEC_EXTRACT_EVEN_EXPR < vx2, vx3 >
6530 VS8: vx8 = VEC_EXTRACT_ODD_EXPR < vx2, vx3 >
6531 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6532 (the order of the data-refs in the output of vect_permute_load_chain
6533 corresponds to the order of scalar stmts in the interleaving chain - see
6534 the documentation of vect_permute_load_chain()).
6535 The generation of permutation stmts and recording them in
6536 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6538 In case of both multiple types and interleaving, the vector loads and
6539 permutation stmts above are created for every copy. The result vector stmts
6540 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6541 STMT_VINFO_RELATED_STMT for the next copies. */
6543 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6544 on a target that supports unaligned accesses (dr_unaligned_supported)
6545 we generate the following code:
6546 p = initial_addr;
6547 indx = 0;
6548 loop {
6549 p = p + indx * vectype_size;
6550 vec_dest = *(p);
6551 indx = indx + 1;
6552 }
6554 Otherwise, the data reference is potentially unaligned on a target that
6555 does not support unaligned accesses (dr_explicit_realign_optimized) -
6556 then generate the following code, in which the data in each iteration is
6557 obtained by two vector loads, one from the previous iteration, and one
6558 from the current iteration:
6559 p1 = initial_addr;
6560 msq_init = *(floor(p1))
6561 p2 = initial_addr + VS - 1;
6562 realignment_token = call target_builtin;
6563 indx = 0;
6564 loop {
6565 p2 = p2 + indx * vectype_size
6566 lsq = *(floor(p2))
6567 vec_dest = realign_load (msq, lsq, realignment_token)
6568 indx = indx + 1;
6569 msq = lsq;
6570 }
6572 /* If the misalignment remains the same throughout the execution of the
6573 loop, we can create the init_addr and permutation mask at the loop
6574 preheader. Otherwise, it needs to be created inside the loop.
6575 This can only occur when vectorizing memory accesses in the inner-loop
6576 nested within an outer-loop that is being vectorized. */
6578 if (nested_in_vect_loop_p (loop, stmt)
6579 && (TREE_INT_CST_LOW (DR_STEP (dr))
6580 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6582 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6583 compute_in_loop = true;
6586 if ((alignment_support_scheme == dr_explicit_realign_optimized
6587 || alignment_support_scheme == dr_explicit_realign)
6588 && !compute_in_loop)
6590 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6591 alignment_support_scheme, NULL_TREE,
6592 &at_loop);
6593 if (alignment_support_scheme == dr_explicit_realign_optimized)
6595 phi = SSA_NAME_DEF_STMT (msq);
6596 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6602 prev_stmt_info = NULL;
6603 for (j = 0; j < ncopies; j++)
6605 /* 1. Create the vector pointer update chain. */
6606 if (j == 0)
6607 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6608 at_loop, offset,
6609 &dummy, &ptr_incr, false,
6610 &inv_p, NULL_TREE);
6611 else
6612 dataref_ptr =
6613 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6615 for (i = 0; i < vec_num; i++)
6617 if (i > 0)
6618 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6619 NULL_TREE);
6621 /* 2. Create the vector-load in the loop. */
6622 switch (alignment_support_scheme)
6624 case dr_aligned:
6625 gcc_assert (aligned_access_p (first_dr));
6626 data_ref = build_fold_indirect_ref (dataref_ptr);
6627 break;
6628 case dr_unaligned_supported:
6630 int mis = DR_MISALIGNMENT (first_dr);
6631 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6633 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
6634 data_ref =
6635 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6636 break;
6638 case dr_explicit_realign:
6640 tree ptr, bump;
6641 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6643 if (compute_in_loop)
6644 msq = vect_setup_realignment (first_stmt, gsi,
6645 &realignment_token,
6646 dr_explicit_realign,
6647 dataref_ptr, NULL);
6649 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6650 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6651 new_stmt = gimple_build_assign (vec_dest, data_ref);
6652 new_temp = make_ssa_name (vec_dest, new_stmt);
6653 gimple_assign_set_lhs (new_stmt, new_temp);
6654 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6655 copy_virtual_operands (new_stmt, stmt);
6656 mark_symbols_for_renaming (new_stmt);
6657 msq = new_temp;
6659 bump = size_binop (MULT_EXPR, vs_minus_1,
6660 TYPE_SIZE_UNIT (scalar_type));
6661 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6662 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6663 break;
6665 case dr_explicit_realign_optimized:
6666 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6667 break;
6668 default:
6669 gcc_unreachable ();
6671 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6672 new_stmt = gimple_build_assign (vec_dest, data_ref);
6673 new_temp = make_ssa_name (vec_dest, new_stmt);
6674 gimple_assign_set_lhs (new_stmt, new_temp);
6675 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6676 mark_symbols_for_renaming (new_stmt);
6678 /* 3. Handle explicit realignment if necessary/supported. Create in
6679 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6680 if (alignment_support_scheme == dr_explicit_realign_optimized
6681 || alignment_support_scheme == dr_explicit_realign)
6683 tree tmp;
6685 lsq = gimple_assign_lhs (new_stmt);
6686 if (!realignment_token)
6687 realignment_token = dataref_ptr;
6688 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6689 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6690 realignment_token);
6691 new_stmt = gimple_build_assign (vec_dest, tmp);
6692 new_temp = make_ssa_name (vec_dest, new_stmt);
6693 gimple_assign_set_lhs (new_stmt, new_temp);
6694 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6696 if (alignment_support_scheme == dr_explicit_realign_optimized)
6699 if (i == vec_num - 1 && j == ncopies - 1)
6700 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6701 msq = lsq;
6705 /* 4. Handle invariant-load. */
6706 if (inv_p)
6708 gcc_assert (!strided_load);
6709 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6710 if (j == 0)
6712 int k;
6713 tree t = NULL_TREE;
6714 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6716 /* CHECKME: bitpos depends on endianess? */
6717 bitpos = bitsize_zero_node;
6718 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6719 bitsize, bitpos);
6720 vec_dest =
6721 vect_create_destination_var (scalar_dest, NULL_TREE);
6722 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6723 new_temp = make_ssa_name (vec_dest, new_stmt);
6724 gimple_assign_set_lhs (new_stmt, new_temp);
6725 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6727 for (k = nunits - 1; k >= 0; --k)
6728 t = tree_cons (NULL_TREE, new_temp, t);
6729 /* FIXME: use build_constructor directly. */
6730 vec_inv = build_constructor_from_list (vectype, t);
6731 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6732 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6734 else
6735 gcc_unreachable (); /* FORNOW. */
6738 /* Collect vector loads and later create their permutation in
6739 vect_transform_strided_load (). */
6740 if (strided_load || slp_perm)
6741 VEC_quick_push (tree, dr_chain, new_temp);
6743 /* Store vector loads in the corresponding SLP_NODE. */
6744 if (slp && !slp_perm)
6745 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6748 if (slp && !slp_perm)
6749 continue;
6751 if (slp_perm)
6753 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6754 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6755 slp_node_instance, false))
6757 VEC_free (tree, heap, dr_chain);
6758 return false;
6761 else
6763 if (strided_load)
6765 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6766 return false;
6768 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6769 VEC_free (tree, heap, dr_chain);
6770 dr_chain = VEC_alloc (tree, heap, group_size);
6772 else
6774 if (j == 0)
6775 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6776 else
6777 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6778 prev_stmt_info = vinfo_for_stmt (new_stmt);
6783 if (dr_chain)
6784 VEC_free (tree, heap, dr_chain);
6786 return true;
6790 /* Function vectorizable_live_operation.
6792 STMT computes a value that is used outside the loop. Check if
6793 it can be supported. */
6795 bool
6796 vectorizable_live_operation (gimple stmt,
6797 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6798 gimple *vec_stmt ATTRIBUTE_UNUSED)
6800 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6801 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6802 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6808 enum vect_def_type dt;
6809 enum tree_code code;
6810 enum gimple_rhs_class rhs_class;
6812 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6814 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6815 return false;
6817 if (!is_gimple_assign (stmt))
6818 return false;
6820 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6821 return false;
6823 /* FORNOW. CHECKME. */
6824 if (nested_in_vect_loop_p (loop, stmt))
6825 return false;
6827 code = gimple_assign_rhs_code (stmt);
6828 op_type = TREE_CODE_LENGTH (code);
6829 rhs_class = get_gimple_rhs_class (code);
6830 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6831 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6833 /* FORNOW: support only if all uses are invariant. This means
6834 that the scalar operations can remain in place, unvectorized.
6835 The original last scalar value that they compute will be used. */
6837 for (i = 0; i < op_type; i++)
6839 if (rhs_class == GIMPLE_SINGLE_RHS)
6840 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6842 op = gimple_op (stmt, i + 1);
6843 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6845 if (vect_print_dump_info (REPORT_DETAILS))
6846 fprintf (vect_dump, "use not simple.");
6847 return false;
6850 if (dt != vect_invariant_def && dt != vect_constant_def)
6851 return false;
6854 /* No transformation is required for the cases we currently support. */
6855 return true;
6859 /* Function vect_is_simple_cond.
6862 LOOP - the loop that is being vectorized.
6863 COND - Condition that is checked for simple use.
6865 Returns whether a COND can be vectorized. Checks whether
6866 condition operands are supportable using vect_is_simple_use. */
6868 static bool
6869 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6873 enum vect_def_type dt;
6875 if (!COMPARISON_CLASS_P (cond))
6876 return false;
6878 lhs = TREE_OPERAND (cond, 0);
6879 rhs = TREE_OPERAND (cond, 1);
6881 if (TREE_CODE (lhs) == SSA_NAME)
6883 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6884 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6885 return false;
6887 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6888 && TREE_CODE (lhs) != FIXED_CST)
6889 return false;
6891 if (TREE_CODE (rhs) == SSA_NAME)
6893 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6894 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6895 return false;
6897 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6898 && TREE_CODE (rhs) != FIXED_CST)
6899 return false;
6901 return true;
6904 /* vectorizable_condition.
6906 Check if STMT is conditional modify expression that can be vectorized.
6907 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6908 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6911 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6913 bool
6914 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6915 gimple *vec_stmt)
6917 tree scalar_dest = NULL_TREE;
6918 tree vec_dest = NULL_TREE;
6919 tree op = NULL_TREE;
6920 tree cond_expr, then_clause, else_clause;
6921 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6922 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6923 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6924 tree vec_compare, vec_cond_expr;
6925 tree new_temp;
6926 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6927 enum machine_mode vec_mode;
6929 enum vect_def_type dt;
6930 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6931 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6932 enum tree_code code;
6934 gcc_assert (ncopies >= 1);
6935 if (ncopies > 1)
6936 return false; /* FORNOW */
6938 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6939 return false;
6941 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6942 return false;
6944 /* FORNOW: SLP not supported. */
6945 if (STMT_SLP_TYPE (stmt_info))
6946 return false;
6948 /* FORNOW: not yet supported. */
6949 if (STMT_VINFO_LIVE_P (stmt_info))
6951 if (vect_print_dump_info (REPORT_DETAILS))
6952 fprintf (vect_dump, "value used after loop.");
6953 return false;
6956 /* Is vectorizable conditional operation? */
6957 if (!is_gimple_assign (stmt))
6958 return false;
6960 code = gimple_assign_rhs_code (stmt);
6962 if (code != COND_EXPR)
6963 return false;
6965 gcc_assert (gimple_assign_single_p (stmt));
6966 op = gimple_assign_rhs1 (stmt);
6967 cond_expr = TREE_OPERAND (op, 0);
6968 then_clause = TREE_OPERAND (op, 1);
6969 else_clause = TREE_OPERAND (op, 2);
6971 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6972 return false;
6974 /* We do not handle two different vector types for the condition
6975 and the values. */
6976 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6977 return false;
6979 if (TREE_CODE (then_clause) == SSA_NAME)
6981 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6982 if (!vect_is_simple_use (then_clause, loop_vinfo,
6983 &then_def_stmt, &def, &dt))
6984 return false;
6986 else if (TREE_CODE (then_clause) != INTEGER_CST
6987 && TREE_CODE (then_clause) != REAL_CST
6988 && TREE_CODE (then_clause) != FIXED_CST)
6989 return false;
6991 if (TREE_CODE (else_clause) == SSA_NAME)
6993 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6994 if (!vect_is_simple_use (else_clause, loop_vinfo,
6995 &else_def_stmt, &def, &dt))
6996 return false;
6998 else if (TREE_CODE (else_clause) != INTEGER_CST
6999 && TREE_CODE (else_clause) != REAL_CST
7000 && TREE_CODE (else_clause) != FIXED_CST)
7001 return false;
7004 vec_mode = TYPE_MODE (vectype);
7006 if (!vec_stmt)
7008 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
7009 return expand_vec_cond_expr_p (op, vec_mode);
7015 scalar_dest = gimple_assign_lhs (stmt);
7016 vec_dest = vect_create_destination_var (scalar_dest, vectype);
7018 /* Handle cond expr. */
7019 vec_cond_lhs =
7020 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
7021 vec_cond_rhs =
7022 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
7023 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
7024 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7026 /* Arguments are ready. Create the new vector stmt. */
7027 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7028 vec_cond_lhs, vec_cond_rhs);
7029 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7030 vec_compare, vec_then_clause, vec_else_clause);
7032 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7033 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7034 gimple_assign_set_lhs (*vec_stmt, new_temp);
7035 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
7037 return true;
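/* For instance (illustrative, not from the sources), the scalar statement
   x = a < b ? c : d is replaced, for V4SI operands, by
   vx = VEC_COND_EXPR <va < vb, vc, vd>, where each lane of vx receives the
   corresponding lane of vc or vd depending on the lane-wise comparison.  */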
7041 /* Function vect_transform_stmt.
7043 Create a vectorized stmt to replace STMT, and insert it at BSI. */
7045 static bool
7046 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7047 bool *strided_store, slp_tree slp_node,
7048 slp_instance slp_node_instance)
7050 bool is_store = false;
7051 gimple vec_stmt = NULL;
7052 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7053 gimple orig_stmt_in_pattern;
7054 bool done;
7055 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7056 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7058 switch (STMT_VINFO_TYPE (stmt_info))
7060 case type_demotion_vec_info_type:
7061 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7065 case type_promotion_vec_info_type:
7066 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7070 case type_conversion_vec_info_type:
7071 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7075 case induc_vec_info_type:
7076 gcc_assert (!slp_node);
7077 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7081 case op_vec_info_type:
7082 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7086 case assignment_vec_info_type:
7087 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7091 case load_vec_info_type:
7092 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7097 case store_vec_info_type:
7098 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7100 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7102 /* In case of interleaving, the whole chain is vectorized when the
7103 last store in the chain is reached. Store stmts before the last
7104 one are skipped, and their vec_stmt_info shouldn't be freed
7105 meanwhile. */
7106 *strided_store = true;
7107 if (STMT_VINFO_VEC_STMT (stmt_info))
7108 is_store = true;
7110 else
7111 is_store = true;
7112 break;
7114 case condition_vec_info_type:
7115 gcc_assert (!slp_node);
7116 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7120 case call_vec_info_type:
7121 gcc_assert (!slp_node);
7122 done = vectorizable_call (stmt, gsi, &vec_stmt);
7125 case reduc_vec_info_type:
7126 gcc_assert (!slp_node);
7127 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7130 default:
7132 if (!STMT_VINFO_LIVE_P (stmt_info))
7134 if (vect_print_dump_info (REPORT_DETAILS))
7135 fprintf (vect_dump, "stmt not supported.");
7136 gcc_unreachable ();
7140 /* Handle inner-loop stmts whose DEF is used in the loop-nest that
7141 is being vectorized, but outside the immediately enclosing loop. */
7142 if (vec_stmt
7143 && nested_in_vect_loop_p (loop, stmt)
7144 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type
7145 && (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer
7146 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer_by_reduction))
7148 struct loop *innerloop = loop->inner;
7149 imm_use_iterator imm_iter;
7150 use_operand_p use_p;
7154 if (vect_print_dump_info (REPORT_DETAILS))
7155 fprintf (vect_dump, "Record the vdef for outer-loop vectorization.");
7157 /* Find the relevant loop-exit phi-node, and record the vec_stmt there
7158 (to be used when vectorizing outer-loop stmts that use the DEF of
7159 STMT). */
7160 if (gimple_code (stmt) == GIMPLE_PHI)
7161 scalar_dest = PHI_RESULT (stmt);
7163 scalar_dest = gimple_assign_lhs (stmt);
7165 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
7167 if (!flow_bb_inside_loop_p (innerloop, gimple_bb (USE_STMT (use_p))))
7169 exit_phi = USE_STMT (use_p);
7170 STMT_VINFO_VEC_STMT (vinfo_for_stmt (exit_phi)) = vec_stmt;
7175 /* Handle stmts whose DEF is used outside the loop-nest that is
7176 being vectorized. */
7177 if (STMT_VINFO_LIVE_P (stmt_info)
7178 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7180 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7181 gcc_assert (done);
7184 if (vec_stmt)
7186 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7187 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7188 if (orig_stmt_in_pattern)
7190 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7191 /* STMT was inserted by the vectorizer to replace a computation idiom.
7192 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7193 computed this idiom. We need to record a pointer to VEC_STMT in
7194 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7195 documentation of vect_pattern_recog. */
7196 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7198 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7199 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7204 return is_store;
7208 /* This function builds ni_name = number of iterations loop executes
7209 on the loop preheader. */
7211 static tree
7212 vect_build_loop_niters (loop_vec_info loop_vinfo)
7215 gimple_seq stmts = NULL;
7217 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7218 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7220 var = create_tmp_var (TREE_TYPE (ni), "niters");
7221 add_referenced_var (var);
7222 ni_name = force_gimple_operand (ni, &stmts, false, var);
7224 pe = loop_preheader_edge (loop);
7225 if (stmts)
7227 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7228 gcc_assert (!new_bb);
7231 return ni_name;
7235 /* This function generates the following statements:
7237 ni_name = number of iterations loop executes
7238 ratio = ni_name / vf
7239 ratio_mult_vf_name = ratio * vf
7241 and places them at the loop preheader edge. */
7243 static void
7244 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7245 tree *ni_name_ptr,
7246 tree *ratio_mult_vf_name_ptr,
7247 tree *ratio_name_ptr)
7256 tree ratio_mult_vf_name;
7257 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7258 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7259 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7262 pe = loop_preheader_edge (loop);
7264 /* Generate temporary variable that contains
7265 number of iterations loop executes. */
7267 ni_name = vect_build_loop_niters (loop_vinfo);
7268 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7270 /* Create: ratio = ni >> log2(vf) */
7272 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7273 if (!is_gimple_val (ratio_name))
7275 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7276 add_referenced_var (var);
7279 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7280 pe = loop_preheader_edge (loop);
7281 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7282 gcc_assert (!new_bb);
7285 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7287 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7288 ratio_name, log_vf);
7289 if (!is_gimple_val (ratio_mult_vf_name))
7291 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7292 add_referenced_var (var);
7294 stmts = NULL;
7295 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7296 true, var);
7297 pe = loop_preheader_edge (loop);
7298 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7299 gcc_assert (!new_bb);
7302 *ni_name_ptr = ni_name;
7303 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7304 *ratio_name_ptr = ratio_name;
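/* Worked example (illustrative): for ni_name = 103 and vf = 4,
   log2 (vf) = 2, so ratio = 103 >> 2 = 25 vector iterations and
   ratio_mult_vf_name = 25 << 2 = 100; the epilog loop created by
   vect_do_peeling_for_loop_bound picks up the remaining 3 iterations.  */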
7310 /* Function vect_update_ivs_after_vectorizer.
7312 "Advance" the induction variables of LOOP to the value they should take
7313 after the execution of LOOP. This is currently necessary because the
7314 vectorizer does not handle induction variables that are used after the
7315 loop. Such a situation occurs when the last iterations of LOOP are
7317 1. We introduced new uses after LOOP for IVs that were not originally used
7318 after LOOP: the IVs of LOOP are now used by an epilog loop.
7319 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7320 times, whereas the loop IVs should be bumped N times.
7323 - LOOP - a loop that is going to be vectorized. The last few iterations
7324 of LOOP were peeled.
7325 - NITERS - the number of iterations that LOOP executes (before it is
7326 vectorized). i.e, the number of times the ivs should be bumped.
7327 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7328 coming out from LOOP on which there are uses of the LOOP ivs
7329 (this is the path from LOOP->exit to epilog_loop->preheader).
7331 The new definitions of the ivs are placed in LOOP->exit.
7332 The phi args associated with the edge UPDATE_E in the bb
7333 UPDATE_E->dest are updated accordingly.
7335 Assumption 1: Like the rest of the vectorizer, this function assumes
7336 a single loop exit that has a single predecessor.
7338 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7339 organized in the same order.
7341 Assumption 3: The access function of the ivs is simple enough (see
7342 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7344 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7345 coming out of LOOP on which the ivs of LOOP are used (this is the path
7346 that leads to the epilog loop; other paths skip the epilog loop). This
7347 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7348 needs to have its phis updated.
7351 static void
7352 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7353 edge update_e)
7355 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7356 basic_block exit_bb = single_exit (loop)->dest;
7358 gimple_stmt_iterator gsi, gsi1;
7359 basic_block update_bb = update_e->dest;
7361 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7363 /* Make sure there exists a single-predecessor exit bb: */
7364 gcc_assert (single_pred_p (exit_bb));
7366 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7367 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7368 gsi_next (&gsi), gsi_next (&gsi1))
7370 tree access_fn = NULL;
7371 tree evolution_part;
7374 tree var, ni, ni_name;
7375 gimple_stmt_iterator last_gsi;
7377 phi = gsi_stmt (gsi);
7378 phi1 = gsi_stmt (gsi1);
7379 if (vect_print_dump_info (REPORT_DETAILS))
7381 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7382 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7385 /* Skip virtual phi's. */
7386 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7388 if (vect_print_dump_info (REPORT_DETAILS))
7389 fprintf (vect_dump, "virtual phi. skip.");
7390 continue;
7393 /* Skip reduction phis. */
7394 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7396 if (vect_print_dump_info (REPORT_DETAILS))
7397 fprintf (vect_dump, "reduc phi. skip.");
7398 continue;
7401 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7402 gcc_assert (access_fn);
7403 STRIP_NOPS (access_fn);
7404 evolution_part =
7405 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7406 gcc_assert (evolution_part != NULL_TREE);
7408 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7409 of degree >= 2 or exponential. */
7410 gcc_assert (!tree_is_chrec (evolution_part));
7412 step_expr = evolution_part;
7413 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7414 loop->num));
7416 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7417 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7418 init_expr,
7419 fold_convert (sizetype,
7420 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7421 niters, step_expr)));
7422 else
7423 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7424 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7425 fold_convert (TREE_TYPE (init_expr),
7426 niters),
7427 step_expr),
7428 init_expr);
7432 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7433 add_referenced_var (var);
7435 last_gsi = gsi_last_bb (exit_bb);
7436 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7437 true, GSI_SAME_STMT);
7439 /* Fix phi expressions in the successor bb. */
7440 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
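/* Worked example (illustrative): for an IV with initial condition 0 and
   step 4 in a loop peeled after NITERS = n original iterations, the code
   above emits ni = 0 + n * 4 in the exit block and points the epilog-loop
   phi at it, so the epilog continues counting where the vector loop
   stopped.  */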
7444 /* Return the more conservative threshold between the
7445 min_profitable_iters returned by the cost model and the user
7446 specified threshold, if provided. */
7448 static unsigned int
7449 conservative_cost_threshold (loop_vec_info loop_vinfo,
7450 int min_profitable_iters)
7453 int min_scalar_loop_bound;
7455 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7456 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7458 /* Use the cost model only if it is more conservative than the user
7459 specified threshold. */
7460 th = (unsigned) min_scalar_loop_bound;
7461 if (min_profitable_iters
7462 && (!min_scalar_loop_bound
7463 || min_profitable_iters > min_scalar_loop_bound))
7464 th = (unsigned) min_profitable_iters;
7466 if (th && vect_print_dump_info (REPORT_COST))
7467 fprintf (vect_dump, "Vectorization may not be profitable.");
7469 return th;
7472 /* Function vect_do_peeling_for_loop_bound
7474 Peel the last iterations of the loop represented by LOOP_VINFO.
7475 The peeled iterations form a new epilog loop. Given that the loop now
7476 iterates NITERS times, the new epilog loop iterates
7477 NITERS % VECTORIZATION_FACTOR times.
7479 The original loop will later be made to iterate
7480 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
7482 static void
7483 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7485 tree ni_name, ratio_mult_vf_name;
7486 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7487 struct loop *new_loop;
7489 basic_block preheader;
7491 bool check_profitability = false;
7492 unsigned int th = 0;
7493 int min_profitable_iters;
7495 if (vect_print_dump_info (REPORT_DETAILS))
7496 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7498 initialize_original_copy_tables ();
7500 /* Generate the following variables on the preheader of original loop:
7502 ni_name = number of iteration the original loop executes
7503 ratio = ni_name / vf
7504 ratio_mult_vf_name = ratio * vf */
7505 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7506 &ratio_mult_vf_name, ratio);
7508 loop_num = loop->num;
7510 /* If the cost-model check was not done during versioning or during
7511 peeling for alignment, do it now. */
7512 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7513 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7514 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7516 check_profitability = true;
7518 /* Get profitability threshold for vectorized loop. */
7519 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7521 th = conservative_cost_threshold (loop_vinfo,
7522 min_profitable_iters);
7525 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7526 ratio_mult_vf_name, ni_name, false,
7527 th, check_profitability);
7528 gcc_assert (new_loop);
7529 gcc_assert (loop_num == loop->num);
7530 #ifdef ENABLE_CHECKING
7531 slpeel_verify_cfg_after_peeling (loop, new_loop);
7532 #endif
7534 /* A guard that controls whether the new_loop is to be executed or skipped
7535 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7536 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7537 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7538 is on the path where the LOOP IVs are used and need to be updated. */
7540 preheader = loop_preheader_edge (new_loop)->src;
7541 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7542 update_e = EDGE_PRED (preheader, 0);
7544 update_e = EDGE_PRED (preheader, 1);
7546 /* Update IVs of original loop as if they were advanced
7547 by ratio_mult_vf_name steps. */
7548 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7550 /* After peeling we have to reset scalar evolution analyzer. */
7551 scev_reset ();
7553 free_original_copy_tables ();
7557 /* Function vect_gen_niters_for_prolog_loop
7559 Set the number of iterations for the loop represented by LOOP_VINFO
7560 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7561 and the misalignment of DR - the data reference recorded in
7562 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7563 this loop, the data reference DR will refer to an aligned location.
7565 The following computation is generated:
7567 If the misalignment of DR is known at compile time:
7568 addr_mis = DR_MISALIGNMENT (dr);
7569 Else, compute address misalignment in bytes:
7570 addr_mis = addr & (vectype_size - 1)
7572 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7574 (elem_size = element type size; an element is the scalar element whose type
7575 is the inner type of the vectype)
7577 When the step of the data-ref in the loop is not 1 (as in interleaved data
7578 and SLP), the number of iterations of the prolog must be divided by the step
7579 (which is equal to the size of interleaved group).
7581 The above formulas assume that VF == number of elements in the vector. This
7582 may not hold when there are multiple-types in the loop.
7583 In this case, for some data-references in the loop the VF does not represent
7584 the number of elements that fit in the vector. Therefore, instead of VF we
7585 use TYPE_VECTOR_SUBPARTS. */
7587 static tree
7588 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7590 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7591 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7594 tree iters, iters_name;
7597 gimple dr_stmt = DR_STMT (dr);
7598 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7599 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7600 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7601 tree niters_type = TREE_TYPE (loop_niters);
7603 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7604 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7606 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7607 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7609 pe = loop_preheader_edge (loop);
7611 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7613 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7614 int elem_misalign = byte_misalign / element_size;
7616 if (vect_print_dump_info (REPORT_DETAILS))
7617 fprintf (vect_dump, "known misalignment = %d.", byte_misalign);
7619 iters = build_int_cst (niters_type,
7620 (((nelements - elem_misalign) & (nelements - 1)) / step));
7624 gimple_seq new_stmts = NULL;
7625 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7626 &new_stmts, NULL_TREE, loop);
7627 tree ptr_type = TREE_TYPE (start_addr);
7628 tree size = TYPE_SIZE (ptr_type);
7629 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7630 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7631 tree elem_size_log =
7632 build_int_cst (type, exact_log2 (vectype_align/nelements));
7633 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7634 tree nelements_tree = build_int_cst (type, nelements);
7638 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7639 gcc_assert (!new_bb);
7641 /* Create: byte_misalign = addr & (vectype_size - 1) */
7642 byte_misalign =
7643 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7645 /* Create: elem_misalign = byte_misalign / element_size */
7646 elem_misalign =
7647 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7649 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7650 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7651 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7652 iters = fold_convert (niters_type, iters);
7655 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7656 /* If the loop bound is known at compile time we already verified that it is
7657 greater than vf; since the misalignment ('iters') is at most vf, there's
7658 no need to generate the MIN_EXPR in this case. */
7659 if (TREE_CODE (loop_niters) != INTEGER_CST)
7660 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
7662 if (vect_print_dump_info (REPORT_DETAILS))
7664 fprintf (vect_dump, "niters for prolog loop: ");
7665 print_generic_expr (vect_dump, iters, TDF_SLIM);
7668 var = create_tmp_var (niters_type, "prolog_loop_niters");
7669 add_referenced_var (var);
7671 iters_name = force_gimple_operand (iters, &stmts, false, var);
7673 /* Insert stmt on loop preheader edge. */
7676 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7677 gcc_assert (!new_bb);
/* Function vect_update_init_of_dr

   NITERS iterations were peeled from LOOP.  DR represents a data reference
   in LOOP.  This function updates the information recorded in DR to
   account for the fact that the first NITERS iterations had already been
   executed.  Specifically, it updates the OFFSET field of DR.  */

static void
vect_update_init_of_dr (struct data_reference *dr, tree niters)
{
  tree offset = DR_OFFSET (dr);

  niters = fold_build2 (MULT_EXPR, sizetype,
                        fold_convert (sizetype, niters),
                        fold_convert (sizetype, DR_STEP (dr)));
  offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters);
  DR_OFFSET (dr) = offset;
}
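
/* For instance (illustrative values): for a data-ref with a DR_STEP of 4
   bytes (an 'int' accessed with unit stride), peeling NITERS = 3 iterations
   adds 3 * 4 = 12 bytes to DR_OFFSET, so subsequent analysis sees the
   data-ref starting where the prolog loop left off.  */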
/* Function vect_update_inits_of_drs

   NITERS iterations were peeled from the loop represented by LOOP_VINFO.
   This function updates the information recorded for the data references in
   the loop to account for the fact that the first NITERS iterations had
   already been executed.  Specifically, it updates the initial_condition of
   the access_function of all the data_references in the loop.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
{
  unsigned int i;
  VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_update_inits_of_drs ===");

  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
    vect_update_init_of_dr (dr, niters);
}
/* Function vect_do_peeling_for_alignment

   Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
   'niters' is set to the misalignment of one of the data references in the
   loop, thereby forcing it to refer to an aligned location at the beginning
   of the execution of this loop.  The data reference for which we are
   peeling is recorded in LOOP_VINFO_UNALIGNED_DR.  */

static void
vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree niters_of_prolog_loop, ni_name;
  tree n_iters;
  struct loop *new_loop;
  bool check_profitability = false;
  unsigned int th = 0;
  int min_profitable_iters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");

  initialize_original_copy_tables ();

  ni_name = vect_build_loop_niters (loop_vinfo);
  niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo,
                                                           ni_name);

  /* If the cost model check was not done during versioning, do it here.  */
  if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      check_profitability = true;

      /* Get profitability threshold for vectorized loop.  */
      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

      th = conservative_cost_threshold (loop_vinfo,
                                        min_profitable_iters);
    }

  /* Peel the prolog loop and iterate it niters_of_prolog_loop times.  */
  new_loop =
    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
                                   niters_of_prolog_loop, ni_name, true,
                                   th, check_profitability);

  gcc_assert (new_loop);
#ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (new_loop, loop);
#endif

  /* Update number of times loop executes.  */
  n_iters = LOOP_VINFO_NITERS (loop_vinfo);
  LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
        TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);

  /* Update the init conditions of the access functions of all data refs.  */
  vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);

  /* After peeling we have to reset scalar evolution analyzer.  */
  scev_reset ();

  free_original_copy_tables ();
}
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment
   must be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, so this
        test can be done as a&(n-1) == 0.  For example, for 16
        byte vectors the test is a&0xf == 0.  */
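
/* A small worked example of why ORing the addresses suffices (illustrative
   addresses): with 16-byte vectors the mask is 0xf.  For addresses 0x1000
   and 0x1008, or_tmp = 0x1000 | 0x1008 = 0x1008 and and_tmp = 0x1008 & 0xf
   = 0x8 != 0, so the scalar version runs.  A low bit survives the AND iff
   at least one address has it set, which is exactly the "some address is
   unaligned" condition.  */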
static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC(gimple,heap) *may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  gimple ref_stmt;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree psize;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp, and_tmp_name;
  gimple and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask + 1)) == 0));

  /* CHECKME: what is the best integer or unsigned type to use to hold a
     cast from a pointer value?  */
  psize = TYPE_SIZE (ptr_type_node);
  int_ptrsize_type
    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);

  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the
     address of the first vector of the i'th data reference.  */

  for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp, addr_tmp_name;
      tree or_tmp, new_or_tmp_name;
      gimple addr_stmt, or_stmt;

      /* create: addr_tmp = (int)(address_of_first_vector)  */
      addr_base =
        vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
                                              NULL_TREE, loop);
      if (new_stmt_list != NULL)
        gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "%s%d", "addr2int", i);
      addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
      add_referenced_var (addr_tmp);
      addr_tmp_name = make_ssa_name (addr_tmp, NULL);
      addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
                                                addr_base, NULL_TREE);
      SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */

      if (or_tmp_name != NULL_TREE)
        {
          /* create: or_tmp = or_tmp | addr_tmp  */
          sprintf (tmp_name, "%s%d", "orptrs", i);
          or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
          add_referenced_var (or_tmp);
          new_or_tmp_name = make_ssa_name (or_tmp, NULL);
          or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
                                                  new_or_tmp_name,
                                                  or_tmp_name, addr_tmp_name);
          SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
          gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
          or_tmp_name = new_or_tmp_name;
        }
      else
        or_tmp_name = addr_tmp_name;
    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
  add_referenced_var (and_tmp);
  and_tmp_name = make_ssa_name (and_tmp, NULL);

  and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
                                           or_tmp_name, mask_cst);
  SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
                                and_tmp_name, ptrsize_zero);
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                              *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment that will be
   accessed by a data reference.  The function takes into account that
   realignment loads may access one additional vector.

   Input:
     DR: The data reference.
     VECT_FACTOR: vectorization factor.

   Return an expression whose value is the size of the segment which will be
   accessed by DR.  */

static tree
vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
{
  tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
                                     DR_STEP (dr), vect_factor);

  if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
                          (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
                                    segment_length, vector_size);
    }

  return fold_convert (sizetype, segment_length);
}
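
/* For example (illustrative values): a data-ref with a DR_STEP of 4 bytes
   in a loop with VECT_FACTOR 4 accesses a 16-byte segment; if it is handled
   by the dr_explicit_realign_optimized scheme, the segment grows by one
   vector size (e.g. to 32 bytes for V4SF), since that scheme may read one
   vector beyond the last element.  */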
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
                to be checked.

   Output:
   COND_EXPR - conditional expression.
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.

   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC (ddr_p, heap) *may_alias_ddrs =
    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  tree vect_factor =
    build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));

  ddr_p ddr;
  unsigned int i;
  tree part_cond_expr;

  /* Create expression
     ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
     || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
     &&
     ...
     &&
     ((store_ptr_n + store_segment_length_n) < load_ptr_n)
     || (load_ptr_n + load_segment_length_n) < store_ptr_n))  */

  if (VEC_empty (ddr_p, may_alias_ddrs))
    return;

  for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
    {
      struct data_reference *dr_a, *dr_b;
      gimple dr_group_first_a, dr_group_first_b;
      tree addr_base_a, addr_base_b;
      tree segment_length_a, segment_length_b;
      gimple stmt_a, stmt_b;

      dr_a = DDR_A (ddr);
      stmt_a = DR_STMT (DDR_A (ddr));
      dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
      if (dr_group_first_a)
        {
          stmt_a = dr_group_first_a;
          dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
        }

      dr_b = DDR_B (ddr);
      stmt_b = DR_STMT (DDR_B (ddr));
      dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
      if (dr_group_first_b)
        {
          stmt_b = dr_group_first_b;
          dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
        }

      addr_base_a =
        vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
                                              NULL_TREE, loop);
      addr_base_b =
        vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
                                              NULL_TREE, loop);

      segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
      segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);

      if (vect_print_dump_info (REPORT_DR_DETAILS))
        {
          fprintf (vect_dump,
                   "create runtime check for data references ");
          print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
          fprintf (vect_dump, " and ");
          print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
        }

      part_cond_expr =
        fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
              addr_base_a,
              segment_length_a),
            addr_base_b),
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
              addr_base_b,
              segment_length_b),
            addr_base_a));

      if (*cond_expr)
        *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                  *cond_expr, part_cond_expr);
      else
        *cond_expr = part_cond_expr;
    }

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "created %u versioning for alias checks.\n",
             VEC_length (ddr_p, may_alias_ddrs));
}
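
/* For example (illustrative names): for a loop with one store through 'pa'
   and one load through 'pb', both with 4-byte steps and VECT_FACTOR 4, the
   generated test is roughly

     (pa + 16 < pb) || (pb + 16 < pa)

   and the vectorized version runs only when the two 16-byte segments are
   disjoint.  */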
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and alignment tests pass.

   The test generated to check which version of the loop is executed
   is modified to also check for profitability as indicated by the
   cost model.  */
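
/* Schematically (the names below are illustrative, not generated
   identifiers), the versioning test that ends up guarding the two loop
   copies has the form

     if (scalar_loop_iters > th && all_addrs_aligned && no_ranges_overlap)
       vectorized loop;
     else
       scalar loop;

   where the alignment and alias sub-conditions are emitted only when the
   corresponding may-misalign / may-alias lists are non-empty.  */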
static void
vect_loop_versioning (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *nloop;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  basic_block condition_bb;
  gimple_stmt_iterator gsi, cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gimple orig_phi, new_phi;
  tree arg;
  unsigned prob = 4 * REG_BR_PROB_BASE / 5;
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
  int min_profitable_iters = 0;
  unsigned int th;

  /* Get profitability threshold for vectorized loop.  */
  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

  th = conservative_cost_threshold (loop_vinfo,
                                    min_profitable_iters);

  cond_expr =
    fold_build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
                 build_int_cst (TREE_TYPE (scalar_loop_iters), th));

  cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
                                    false, NULL_TREE);

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  cond_expr =
    fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
  cond_expr =
    force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  initialize_original_copy_tables ();
  nloop = loop_version (loop, cond_expr, &condition_bb,
                        prob, prob, REG_BR_PROB_BASE - prob, true);
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.  */

  merge_bb = single_exit (loop)->dest;
  gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
  new_exit_bb = split_edge (single_exit (loop));
  new_exit_e = single_exit (loop);
  e = EDGE_SUCC (new_exit_bb, 0);

  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      orig_phi = gsi_stmt (gsi);
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_exit_bb);
      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      add_phi_arg (new_phi, arg, new_exit_e);
      SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
    }

  /* End loop-exit-fixes after versioning.  */

  update_ssa (TODO_update_ssa);
  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                             GSI_SAME_STMT);
    }
}
/* Remove a group of stores (for SLP or interleaving), free their
   stmt_vec_info.  */

static void
vect_remove_stores (gimple first_stmt)
{
  gimple next = first_stmt;
  gimple tmp;
  gimple_stmt_iterator next_si;

  while (next)
    {
      /* Free the attached stmt_vec_info and remove the stmt.  */
      next_si = gsi_for_stmt (next);
      gsi_remove (&next_si, true);
      tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
      free_stmt_vec_info (next);
      next = tmp;
    }
}
/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool strided_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree loads_node;

  if (!node)
    return false;

  vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
                              vectorization_factor);
  vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
                              vectorization_factor);

  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  The number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
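  /* E.g. (illustrative numbers): for GROUP_SIZE 2, VF 8 and 4 elements per
     vector, vec_stmts_size = (8 * 2) / 4 = 4 vector stmts per node.  */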
  /* In case of load permutation we have to allocate vectorized statements
     for all the nodes that participate in that permutation.  */
  if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
    {
      for (i = 0;
           VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i,
                        loads_node);
           i++)
        {
          if (!SLP_TREE_VEC_STMTS (loads_node))
            {
              SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
                                                           vec_stmts_size);
              SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
            }
        }
    }

  if (!SLP_TREE_VEC_STMTS (node))
    {
      SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_STRIDED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else
    si = gsi_for_stmt (stmt);

  is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
  if (is_store)
    {
      if (DR_GROUP_FIRST_DR (stmt_info))
        /* If IS_STORE is TRUE, the vectorization of the
           interleaving chain was completed - free all the stores in
           the chain.  */
        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
      else
        /* FORNOW: SLP originates only from strided stores.  */
        gcc_unreachable ();

      return true;
    }

  /* FORNOW: SLP originates only from strided stores.  */
  return false;
}
/* Schedule all the SLP instances of the loop represented by LOOP_VINFO.
   Return true if any of the scheduled stmts was a store.  */

static bool
vect_schedule_slp (loop_vec_info loop_vinfo)
{
  VEC (slp_instance, heap) *slp_instances =
    LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                          instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo));

      if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
          || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
        fprintf (vect_dump, "vectorizing stmts using SLP.");
    }

  return is_store;
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_loop_versioning (loop_vinfo);

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown alignment is allowed.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that doesn't divide by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor).  */
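  /* E.g. (illustrative numbers): for n = 103 and VF = 4, the vectorized
     loop executes floor (103/4) = 25 iterations and the scalar epilog loop
     executes the remaining 103 % 4 = 3 iterations.  */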
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer will
     support more involved loop forms, the order by which the BBs are
     traversed needs to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      gimple phi;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
            continue;

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
                != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "transform phi.");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          gimple stmt = gsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);

          /* Vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              gsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits = (unsigned int)
            TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, and not
               to the vector size, hence for SLP this print is not valid.  */
            fprintf (vect_dump, "multiple-types.");

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  vect_schedule_slp (loop_vinfo);
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
                {
                  gsi_next (&si);
                  continue;
                }
            }

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store,
                                          NULL, NULL);
          if (is_store)
            {
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain was completed - free all the
                     stores in the chain.  */
                  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                  gsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  free_stmt_vec_info (stmt);
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}