/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
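
/* Editorial overview (an illustrative sketch, not part of the original
   sources): basic block SLP looks for groups of isomorphic scalar
   statements, typically rooted at a grouped store such as

       a[0] = b[0] + c[0];
       a[1] = b[1] + c[1];
       a[2] = b[2] + c[2];
       a[3] = b[3] + c[3];

   and, when the definitions of all group members match up, replaces
   the group with vector statements, conceptually

       a[0:3] = b[0:3] + c[0:3];

   The routines below build the SLP tree for such a group, verify the
   required load permutations and estimate the cost of the vector
   form.  */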

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "hashtab.h"
#include "rtl.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"              /* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the location of the first stmt in BB that has one, and
   UNKNOWN_LOCATION if no such stmt is found.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}
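
/* Editorial note: for a COND_EXPR assignment such as a = (b < c) ? x : y,
   gimple_num_ops - 1 gives 3 (condition, then-arm, else-arm), but the
   embedded comparison contributes two operands (b and c), hence the
   extra slot reserved by nops++ above.  */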


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
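
/* Editorial example: if the chain starting at FIRST_STMT accesses
   a[0], a[1], a[2] in that order, the stmt accessing a[2] is at
   place 2; a stmt that belongs to a different chain yields -1.  */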


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error could be corrected by swapping the operands of the operation,
   and 0 if everything is OK.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}
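
/* Editorial example of the swapping above: given the group

       x0 = a0 + b0;
       x1 = b1 + a1;

   the defs of the second stmt do not match the first as written, but
   since PLUS_EXPR is commutative the operands of the second stmt can
   be swapped so that both stmts read their operands in the same
   order, letting the SLP build succeed.  */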


/* Verify whether the scalar stmts STMTS are isomorphic, whether they
   require a data permutation, and whether they use supported types of
   operation.  Return true if they can form an SLP group; otherwise
   return false and indicate in *MATCHES which stmts are not isomorphic
   to the first one.  If MATCHES[0] is false, the comparison could not
   be carried out or the stmts will never be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                        && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                       /* If the group is split up then GROUP_GAP
                          isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
                       || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}
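
/* Editorial illustration of the need_same_oprnds check above: when
   only a vector/scalar shift is available, the shift amount becomes a
   single scalar operand of the vector stmt, so a group like

       y0 = x0 << 3;
       y1 = x1 << 5;

   is rejected as a mismatch because the amounts (3 vs. 5) differ.  */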

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if the def-stmts are not isomorphic, require
   data permutation or are of unsupported types of operation.  Otherwise
   return TRUE, recording in MATCHES which stmts of the group matched
   (a false MATCHES[0] signals a fatal mismatch).  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, (i == 0), &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operands zero
         and one can be commuted, try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of unsuccessful permutes was not
             more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again ... */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
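
/* Editorial example: with PERMUTATION = {2, 0, 1} the stmt at index 0
   moves to position 2, index 1 to position 0 and index 2 to position 1,
   i.e. tmp_stmts[permutation[i]] = stmts[i] as coded above.  */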


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
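  /* Editorial example: with GROUP_SIZE == 2 and two load nodes, the
     per-node permutations {0, 0} and {1, 1} fit this 0..01..1..
     pattern, whereas {0, 1} within a single node does not.  */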
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}
1496
1497 /* Compute the cost for the SLP instance INSTANCE.  */
1498
1499 static void
1500 vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1501                        slp_instance instance, unsigned nunits)
1502 {
1503   stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1504   unsigned ncopies_for_cost;
1505   stmt_info_for_cost *si;
1506   unsigned i;
1507
1508   /* Calculate the number of vector stmts to create based on the unrolling
1509      factor: the number of vectors is LCM (GROUP_SIZE, NUNITS) / NUNITS,
1510      which is 1 when GROUP_SIZE divides NUNITS.  */
1511   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1512   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
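       /* A worked example with assumed numbers (not tied to any particular
          target): NUNITS == 4 and GROUP_SIZE == 6 give
          least_common_multiple (4, 6) == 12, so NCOPIES_FOR_COST
          == 12 / 4 == 3, i.e. three vector stmts cover two unrolled
          copies of the six-stmt group.  */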
1513
1514   prologue_cost_vec.create (10);
1515   body_cost_vec.create (10);
1516   SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1517   vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1518                            instance, SLP_INSTANCE_TREE (instance),
1519                            &prologue_cost_vec, ncopies_for_cost);
1520
1521   /* Record the prologue costs, which were delayed until we were
1522      sure that SLP was successful.  Unlike the body costs, we know
1523      the final values now regardless of the loop vectorization factor.  */
1524   void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1525                 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1526   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1527     {
1528       struct _stmt_vec_info *stmt_info
1529         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1530       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1531                             si->misalign, vect_prologue);
1532     }
1533
1534   prologue_cost_vec.release ();
1535 }
1536
1537 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1538    vect_build_slp_tree to build a tree of packed stmts if possible.
1539    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1540
1541 static bool
1542 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1543                            gimple stmt, unsigned max_tree_size)
1544 {
1545   slp_instance new_instance;
1546   slp_tree node;
1547   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1548   unsigned int unrolling_factor = 1, nunits;
1549   tree vectype, scalar_type = NULL_TREE;
1550   gimple next;
1551   unsigned int vectorization_factor = 0;
1552   int i;
1553   unsigned int max_nunits = 0;
1554   vec<slp_tree> loads;
1555   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1556   vec<gimple> scalar_stmts;
1557
1558   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1559     {
1560       if (dr)
1561         {
1562           scalar_type = TREE_TYPE (DR_REF (dr));
1563           vectype = get_vectype_for_scalar_type (scalar_type);
1564         }
1565       else
1566         {
1567           gcc_assert (loop_vinfo);
1568           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1569         }
1570
1571       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1572     }
1573   else
1574     {
1575       gcc_assert (loop_vinfo);
1576       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1577       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1578     }
1579
1580   if (!vectype)
1581     {
1582       if (dump_enabled_p ())
1583         {
1584           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1585                            "Build SLP failed: unsupported data-type ");
1586           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1587           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1588         }
1589
1590       return false;
1591     }
1592
1593   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1594   if (loop_vinfo)
1595     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1596   else
1597     vectorization_factor = nunits;
1598
1599   /* Calculate the unrolling factor.  */
1600   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
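       /* For illustration (assumed numbers): NUNITS == 4 and GROUP_SIZE == 6
          give least_common_multiple (4, 6) / 6 == 2, i.e. the group has to
          be unrolled twice before the scalars fill whole vectors; the check
          below rejects this for basic-block SLP, where no unrolling is
          possible.  */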
1601   if (unrolling_factor != 1 && !loop_vinfo)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605                          "Build SLP failed: unrolling required in basic"
1606                          " block SLP\n");
1607
1608       return false;
1609     }
1610
1611   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1612   scalar_stmts.create (group_size);
1613   next = stmt;
1614   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1615     {
1616       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1617       while (next)
1618         {
1619           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1620               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1621             scalar_stmts.safe_push (
1622                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1623           else
1624             scalar_stmts.safe_push (next);
1625           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1626         }
1627     }
1628   else
1629     {
1630       /* Collect reduction statements.  */
1631       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1632       for (i = 0; reductions.iterate (i, &next); i++)
1633         scalar_stmts.safe_push (next);
1634     }
1635
1636   node = vect_create_new_slp_node (scalar_stmts);
1637
1638   loads.create (group_size);
1639
1640   /* Build the tree for the SLP instance.  */
1641   bool *matches = XALLOCAVEC (bool, group_size);
1642   unsigned npermutes = 0;
1643   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1644                            &max_nunits, &loads,
1645                            vectorization_factor, matches, &npermutes, NULL,
1646                            max_tree_size))
1647     {
1648       /* Calculate the unrolling factor based on the smallest type.  */
1649       if (max_nunits > nunits)
1650         unrolling_factor = least_common_multiple (max_nunits, group_size)
1651                            / group_size;
1652
1653       if (unrolling_factor != 1 && !loop_vinfo)
1654         {
1655           if (dump_enabled_p ())
1656             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657                              "Build SLP failed: unrolling required in basic"
1658                              " block SLP\n");
1659           vect_free_slp_tree (node);
1660           loads.release ();
1661           return false;
1662         }
1663
1664       /* Create a new SLP instance.  */
1665       new_instance = XNEW (struct _slp_instance);
1666       SLP_INSTANCE_TREE (new_instance) = node;
1667       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1668       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1669       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1670       SLP_INSTANCE_LOADS (new_instance) = loads;
1671       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1672
1673       /* Compute the load permutation.  */
1674       slp_tree load_node;
1675       bool loads_permuted = false;
1676       FOR_EACH_VEC_ELT (loads, i, load_node)
1677         {
1678           vec<unsigned> load_permutation;
1679           int j;
1680           gimple load, first_stmt;
1681           bool this_load_permuted = false;
1682           load_permutation.create (group_size);
1683           first_stmt = GROUP_FIRST_ELEMENT
1684               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1685           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1686             {
1687               int load_place
1688                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1689               gcc_assert (load_place != -1);
1690               if (load_place != j)
1691                 this_load_permuted = true;
1692               load_permutation.safe_push (load_place);
1693             }
1694           if (!this_load_permuted)
1695             {
1696               load_permutation.release ();
1697               continue;
1698             }
1699           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1700           loads_permuted = true;
1701         }
1702
1703       if (loads_permuted)
1704         {
1705           if (!vect_supported_load_permutation_p (new_instance))
1706             {
1707               if (dump_enabled_p ())
1708                 {
1709                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710                                    "Build SLP failed: unsupported load "
1711                                    "permutation ");
1712                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1713                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1714                 }
1715               vect_free_slp_instance (new_instance);
1716               return false;
1717             }
1718
1719           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1720             = vect_find_first_load_in_slp_instance (new_instance);
1721         }
1722
1723       /* Compute the costs of this SLP instance.  */
1724       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1725                              new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1726
1727       if (loop_vinfo)
1728         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1729       else
1730         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1731
1732       if (dump_enabled_p ())
1733         vect_print_slp_tree (MSG_NOTE, node);
1734
1735       return true;
1736     }
1737
1738   /* Failed to SLP.  */
1739   /* Free the allocated memory.  */
1740   vect_free_slp_tree (node);
1741   loads.release ();
1742
1743   return false;
1744 }
1745
1746
1747 /* Check if there are stmts in the loop that can be vectorized using SLP.
1748    Build SLP trees of packed scalar stmts if SLP is possible.  */
1749
1750 bool
1751 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1752                   unsigned max_tree_size)
1753 {
1754   unsigned int i;
1755   vec<gimple> grouped_stores;
1756   vec<gimple> reductions = vNULL;
1757   vec<gimple> reduc_chains = vNULL;
1758   gimple first_element;
1759   bool ok = false;
1760
1761   if (dump_enabled_p ())
1762     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1763
1764   if (loop_vinfo)
1765     {
1766       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1767       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1768       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1769     }
1770   else
1771     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1772
1773   /* Find SLP sequences starting from groups of grouped stores.  */
1774   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1775     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1776                                    max_tree_size))
1777       ok = true;
1778
1779   if (bb_vinfo && !ok)
1780     {
1781       if (dump_enabled_p ())
1782         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1783                          "Failed to SLP the basic block.\n");
1784
1785       return false;
1786     }
1787
1788   if (loop_vinfo
1789       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1790     {
1791       /* Find SLP sequences starting from reduction chains.  */
1792       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1793         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1794                                        max_tree_size))
1795           ok = true;
1796         else
1797           return false;
1798
1799       /* Don't try to vectorize SLP reductions if a reduction chain was
1800          detected.  */
1801       return ok;
1802     }
1803
1804   /* Find SLP sequences starting from groups of reductions.  */
1805   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1806       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1807                                     max_tree_size))
1808     ok = true;
1809
1810   return true;
1811 }
1812
1813
1814 /* For each possible SLP instance decide whether to SLP it and calculate the
1815    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
1816    to SLP at least one instance.  */
1817
1818 bool
1819 vect_make_slp_decision (loop_vec_info loop_vinfo)
1820 {
1821   unsigned int i, unrolling_factor = 1;
1822   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1823   slp_instance instance;
1824   int decided_to_slp = 0;
1825
1826   if (dump_enabled_p ())
1827     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1828                      "\n");
1829
1830   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1831     {
1832       /* FORNOW: SLP if you can.  */
1833       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1834         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1835
1836       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1837          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1838          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1839       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1840       decided_to_slp++;
1841     }
1842
1843   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1844
1845   if (decided_to_slp && dump_enabled_p ())
1846     dump_printf_loc (MSG_NOTE, vect_location,
1847                      "Decided to SLP %d instances. Unrolling factor %d\n",
1848                      decided_to_slp, unrolling_factor);
1849
1850   return (decided_to_slp > 0);
1851 }
1852
1853
1854 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1855    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
1856
1857 static void
1858 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1859 {
1860   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1861   imm_use_iterator imm_iter;
1862   gimple use_stmt;
1863   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1864   slp_tree child;
1865   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1866   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1867   int j;
1868
1869   /* Propagate hybrid down the SLP tree.  */
1870   if (stype == hybrid)
1871     ;
1872   else if (HYBRID_SLP_STMT (stmt_vinfo))
1873     stype = hybrid;
1874   else
1875     {
1876       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1877       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1878       /* We always get the pattern stmt here, but for immediate
1879          uses we have to use the LHS of the original stmt.  */
1880       gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1881       if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
1882         stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
1883       if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1884         FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1885           {
1886             if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1887               continue;
1888             use_vinfo = vinfo_for_stmt (use_stmt);
1889             if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1890                 && STMT_VINFO_RELATED_STMT (use_vinfo))
1891               use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
1892             if (!STMT_SLP_TYPE (use_vinfo)
1893                 && (STMT_VINFO_RELEVANT (use_vinfo)
1894                     || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
1895                 && !(gimple_code (use_stmt) == GIMPLE_PHI
1896                      && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1897               stype = hybrid;
1898           }
1899     }
1900
1901   if (stype == hybrid)
1902     STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1903
1904   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1905     vect_detect_hybrid_slp_stmts (child, i, stype);
1906 }
1907
1908 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1909
1910 static tree
1911 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1912 {
1913   walk_stmt_info *wi = (walk_stmt_info *)data;
1914   struct loop *loopp = (struct loop *)wi->info;
1915
1916   if (wi->is_lhs)
1917     return NULL_TREE;
1918
1919   if (TREE_CODE (*tp) == SSA_NAME
1920       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1921     {
1922       gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1923       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1924           && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1925         STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1926     }
1927
1928   return NULL_TREE;
1929 }
1930
1931 static tree
1932 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1933                           walk_stmt_info *)
1934 {
1935   /* If the stmt is in a SLP instance then this isn't a reason
1936      to mark use definitions in other SLP instances as hybrid.  */
1937   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1938     *handled = true;
1939   return NULL_TREE;
1940 }
1941
1942 /* Find stmts that must be both vectorized and SLPed.  */
1943
1944 void
1945 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1946 {
1947   unsigned int i;
1948   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1949   slp_instance instance;
1950
1951   if (dump_enabled_p ())
1952     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1953                      "\n");
1954
1955   /* First walk all pattern stmts in the loop and mark the defs of their
1956      uses as hybrid, since immediate uses in pattern stmts are not recorded.  */
1957   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1958     {
1959       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1960       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1961            gsi_next (&gsi))
1962         {
1963           gimple stmt = gsi_stmt (gsi);
1964           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1965           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1966             {
1967               walk_stmt_info wi;
1968               memset (&wi, 0, sizeof (wi));
1969               wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1970               gimple_stmt_iterator gsi2
1971                 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1972               walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1973                                 vect_detect_hybrid_slp_1, &wi);
1974               walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1975                                vect_detect_hybrid_slp_2,
1976                                vect_detect_hybrid_slp_1, &wi);
1977             }
1978         }
1979     }
1980
1981   /* Then walk the SLP instance trees marking stmts with uses in
1982      non-SLP stmts as hybrid, also propagating hybrid down the
1983      SLP tree, collecting the above info on-the-fly.  */
1984   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1985     {
1986       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
1987         vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
1988                                       i, pure_slp);
1989     }
1990 }
1991
1992
1993 /* Create and initialize a new bb_vec_info struct for BB, as well as
1994    stmt_vec_info structs for all the stmts in it.  */
1995
1996 static bb_vec_info
1997 new_bb_vec_info (basic_block bb)
1998 {
1999   bb_vec_info res = NULL;
2000   gimple_stmt_iterator gsi;
2001
2002   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2003   BB_VINFO_BB (res) = bb;
2004
2005   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2006     {
2007       gimple stmt = gsi_stmt (gsi);
2008       gimple_set_uid (stmt, 0);
2009       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2010     }
2011
2012   BB_VINFO_GROUPED_STORES (res).create (10);
2013   BB_VINFO_SLP_INSTANCES (res).create (2);
2014   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2015
2016   bb->aux = res;
2017   return res;
2018 }
2019
2020
2021 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2022    stmts in the basic block.  */
2023
2024 static void
2025 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2026 {
2027   vec<slp_instance> slp_instances;
2028   slp_instance instance;
2029   basic_block bb;
2030   gimple_stmt_iterator si;
2031   unsigned i;
2032
2033   if (!bb_vinfo)
2034     return;
2035
2036   bb = BB_VINFO_BB (bb_vinfo);
2037
2038   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2039     {
2040       gimple stmt = gsi_stmt (si);
2041       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2042
2043       if (stmt_info)
2044         /* Free stmt_vec_info.  */
2045         free_stmt_vec_info (stmt);
2046     }
2047
2048   vect_destroy_datarefs (NULL, bb_vinfo);
2049   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2050   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2051   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2052   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2053     vect_free_slp_instance (instance);
2054   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2055   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2056   free (bb_vinfo);
2057   bb->aux = NULL;
2058 }
2059
2060
2061 /* Analyze statements contained in the SLP tree node after recursively
2062    analyzing the subtree.  Return TRUE if the operations are supported.  */
2063
2064 static bool
2065 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2066 {
2067   bool dummy;
2068   int i;
2069   gimple stmt;
2070   slp_tree child;
2071
2072   if (!node)
2073     return true;
2074
2075   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2076     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2077       return false;
2078
2079   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2080     {
2081       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2082       gcc_assert (stmt_info);
2083       gcc_assert (PURE_SLP_STMT (stmt_info));
2084
2085       if (!vect_analyze_stmt (stmt, &dummy, node))
2086         return false;
2087     }
2088
2089   return true;
2090 }
2091
2092
2093 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2094    operations are supported.  */
2095
2096 static bool
2097 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2098 {
2099   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2100   slp_instance instance;
2101   int i;
2102
2103   for (i = 0; slp_instances.iterate (i, &instance); )
2104     {
2105       if (!vect_slp_analyze_node_operations (bb_vinfo,
2106                                              SLP_INSTANCE_TREE (instance)))
2107         {
2108           vect_free_slp_instance (instance);
2109           slp_instances.ordered_remove (i);
2110         }
2111       else
2112         i++;
2113     }
2114
2115   if (!slp_instances.length ())
2116     return false;
2117
2118   return true;
2119 }
2120
2121
2122 /* Compute the scalar cost of the SLP node NODE and its children
2123    and return it.  Do not account defs that are marked in LIFE, and
2124    update LIFE according to the uses of NODE.  */
2125
2126 static unsigned
2127 vect_bb_slp_scalar_cost (basic_block bb,
2128                          slp_tree node, vec<bool, va_heap> *life)
2129 {
2130   unsigned scalar_cost = 0;
2131   unsigned i;
2132   gimple stmt;
2133   slp_tree child;
2134
2135   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2136     {
2137       unsigned stmt_cost;
2138       ssa_op_iter op_iter;
2139       def_operand_p def_p;
2140       stmt_vec_info stmt_info;
2141
2142       if ((*life)[i])
2143         continue;
2144
2145       /* If there is a non-vectorized use of the defs then the scalar
2146          stmt is kept live, in which case we do not account it (or any
2147          required defs in the SLP children) in the scalar cost.  This
2148          way we make the vectorization more costly when compared to
2149          the scalar cost.  */
2150       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2151         {
2152           imm_use_iterator use_iter;
2153           gimple use_stmt;
2154           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2155             if (!is_gimple_debug (use_stmt)
2156                 && (gimple_code (use_stmt) == GIMPLE_PHI
2157                     || gimple_bb (use_stmt) != bb
2158                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2159               {
2160                 (*life)[i] = true;
2161                 BREAK_FROM_IMM_USE_STMT (use_iter);
2162               }
2163         }
2164       if ((*life)[i])
2165         continue;
2166
2167       stmt_info = vinfo_for_stmt (stmt);
2168       if (STMT_VINFO_DATA_REF (stmt_info))
2169         {
2170           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2171             stmt_cost = vect_get_stmt_cost (scalar_load);
2172           else
2173             stmt_cost = vect_get_stmt_cost (scalar_store);
2174         }
2175       else
2176         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2177
2178       scalar_cost += stmt_cost;
2179     }
2180
2181   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2182     scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2183
2184   return scalar_cost;
2185 }
2186
2187 /* Check if vectorization of the basic block is profitable.  */
2188
2189 static bool
2190 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2191 {
2192   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2193   slp_instance instance;
2194   int i, j;
2195   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2196   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2197   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2198   stmt_vec_info stmt_info = NULL;
2199   stmt_vector_for_cost body_cost_vec;
2200   stmt_info_for_cost *ci;
2201
2202   /* Calculate vector costs.  */
2203   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2204     {
2205       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2206
2207       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2208         {
2209           stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2210           (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2211                                 stmt_info, ci->misalign, vect_body);
2212         }
2213     }
2214
2215   /* Calculate scalar cost.  */
2216   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2217     {
2218       auto_vec<bool, 20> life;
2219       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2220       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2221                                               SLP_INSTANCE_TREE (instance),
2222                                               &life);
2223     }
2224
2225   /* Complete the target-specific cost calculation.  */
2226   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2227                &vec_inside_cost, &vec_epilogue_cost);
2228
2229   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2230
2231   if (dump_enabled_p ())
2232     {
2233       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2234       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2235                    vec_inside_cost);
2236       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2237       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2238       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2239     }
2240
2241   /* Vectorization is profitable if its cost is less than the cost of the
2242      scalar version.  */
2243   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2244     return false;
2245
2246   return true;
2247 }
2248
2249 /* Check if the basic block can be vectorized.  */
2250
2251 static bb_vec_info
2252 vect_slp_analyze_bb_1 (basic_block bb)
2253 {
2254   bb_vec_info bb_vinfo;
2255   vec<slp_instance> slp_instances;
2256   slp_instance instance;
2257   int i;
2258   int min_vf = 2;
2259   unsigned n_stmts = 0;
2260
2261   bb_vinfo = new_bb_vec_info (bb);
2262   if (!bb_vinfo)
2263     return NULL;
2264
2265   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2266     {
2267       if (dump_enabled_p ())
2268         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269                          "not vectorized: unhandled data-ref in basic "
2270                          "block.\n");
2271
2272       destroy_bb_vec_info (bb_vinfo);
2273       return NULL;
2274     }
2275
2276   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2277     {
2278       if (dump_enabled_p ())
2279         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280                          "not vectorized: not enough data-refs in "
2281                          "basic block.\n");
2282
2283       destroy_bb_vec_info (bb_vinfo);
2284       return NULL;
2285     }
2286
2287   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2288     {
2289      if (dump_enabled_p ())
2290        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291                         "not vectorized: unhandled data access in "
2292                         "basic block.\n");
2293
2294       destroy_bb_vec_info (bb_vinfo);
2295       return NULL;
2296     }
2297
2298   vect_pattern_recog (NULL, bb_vinfo);
2299
2300   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2301     {
2302       if (dump_enabled_p ())
2303         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304                          "not vectorized: bad data alignment in basic "
2305                          "block.\n");
2306
2307       destroy_bb_vec_info (bb_vinfo);
2308       return NULL;
2309     }
2310
2311   /* Check the SLP opportunities in the basic block, analyze and build SLP
2312      trees.  */
2313   if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2314     {
2315       if (dump_enabled_p ())
2316         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
2317                          "not vectorized: failed to find SLP opportunities "
2318                          "in basic block.\n");
2319
2320       destroy_bb_vec_info (bb_vinfo);
2321       return NULL;
2322     }
2323
2324   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2325
2326   /* Mark all the statements that we want to vectorize as pure SLP and
2327      relevant.  */
2328   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2329     {
2330       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2331       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2332     }
2333
2334   /* Mark all the statements that we do not want to vectorize.  */
2335   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2336        !gsi_end_p (gsi); gsi_next (&gsi))
2337     {
2338       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2339       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2340         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2341     }
2342
2343   /* Analyze dependences.  At this point all stmts not participating in
2344      vectorization have to be marked.  Dependence analysis assumes
2345      that we either vectorize all SLP instances or none at all.  */
2346   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2347      {
2348        if (dump_enabled_p ())
2349          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350                           "not vectorized: unhandled data dependence "
2351                           "in basic block.\n");
2352
2353        destroy_bb_vec_info (bb_vinfo);
2354        return NULL;
2355      }
2356
2357   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2358     {
2359       if (dump_enabled_p ())
2360         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                          "not vectorized: unsupported alignment in basic "
2362                          "block.\n");
2363       destroy_bb_vec_info (bb_vinfo);
2364       return NULL;
2365     }
2366
2367   if (!vect_slp_analyze_operations (bb_vinfo))
2368     {
2369       if (dump_enabled_p ())
2370         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371                          "not vectorized: bad operation in basic block.\n");
2372
2373       destroy_bb_vec_info (bb_vinfo);
2374       return NULL;
2375     }
2376
2377   /* Cost model: check if the vectorization is worthwhile.  */
2378   if (!unlimited_cost_model (NULL)
2379       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2380     {
2381       if (dump_enabled_p ())
2382         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383                          "not vectorized: vectorization is not "
2384                          "profitable.\n");
2385
2386       destroy_bb_vec_info (bb_vinfo);
2387       return NULL;
2388     }
2389
2390   if (dump_enabled_p ())
2391     dump_printf_loc (MSG_NOTE, vect_location,
2392                      "Basic block will be vectorized using SLP\n");
2393
2394   return bb_vinfo;
2395 }
2396
2397
2398 bb_vec_info
2399 vect_slp_analyze_bb (basic_block bb)
2400 {
2401   bb_vec_info bb_vinfo;
2402   int insns = 0;
2403   gimple_stmt_iterator gsi;
2404   unsigned int vector_sizes;
2405
2406   if (dump_enabled_p ())
2407     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2408
2409   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2410     {
2411       gimple stmt = gsi_stmt (gsi);
2412       if (!is_gimple_debug (stmt)
2413           && !gimple_nop_p (stmt)
2414           && gimple_code (stmt) != GIMPLE_LABEL)
2415         insns++;
2416     }
2417
2418   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2419     {
2420       if (dump_enabled_p ())
2421         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422                          "not vectorized: too many instructions in "
2423                          "basic block.\n");
2424
2425       return NULL;
2426     }
2427
2428   /* Autodetect the first vector size we try.  */
2429   current_vector_size = 0;
2430   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2431
2432   while (1)
2433     {
2434       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2435       if (bb_vinfo)
2436         return bb_vinfo;
2437
2438       destroy_bb_vec_info (bb_vinfo);
2439
2440       vector_sizes &= ~current_vector_size;
2441       if (vector_sizes == 0
2442           || current_vector_size == 0)
2443         return NULL;
2444
2445       /* Try the next biggest vector size.  */
2446       current_vector_size = 1 << floor_log2 (vector_sizes);
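           /* A sketch of the iteration with assumed sizes: if the target
              advertises the size bitmask 32 | 16 and the 32-byte attempt
              failed, clearing bit 32 above leaves 16, and
              1 << floor_log2 (16) == 16 selects the next smaller size to
              retry.  */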
2447       if (dump_enabled_p ())
2448         dump_printf_loc (MSG_NOTE, vect_location,
2449                          "***** Re-trying analysis with "
2450                          "vector size %d\n", current_vector_size);
2451     }
2452 }
2453
2454
2455 /* SLP costs are calculated according to the SLP instance unrolling factor
2456    (i.e., the number of created vector stmts depends on the unrolling factor).
2457    However, the actual number of vector stmts for every SLP node depends on
2458    the VF, which is set later in vect_analyze_operations ().  Hence, SLP costs
2459    should be updated.  In this function we assume that the inside costs
2460    calculated in vect_model_xxx_cost are linear in ncopies.  */
2461
2462 void
2463 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2464 {
2465   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2466   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2467   slp_instance instance;
2468   stmt_vector_for_cost body_cost_vec;
2469   stmt_info_for_cost *si;
2470   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2471
2472   if (dump_enabled_p ())
2473     dump_printf_loc (MSG_NOTE, vect_location,
2474                      "=== vect_update_slp_costs_according_to_vf ===\n");
2475
2476   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2477     {
2478       /* We assume that costs are linear in ncopies.  */
2479       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
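           /* E.g. (assumed numbers): VF == 8 with an instance unrolling
              factor of 2 gives NCOPIES == 4, so every recorded body cost is
              scaled by 4 when it is added to the target cost model below.  */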
2480
2481       /* Record the instance's instructions in the target cost model.
2482          This was delayed until here because the count of instructions
2483          isn't known beforehand.  */
2484       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2485
2486       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2487         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2488                               vinfo_for_stmt (si->stmt), si->misalign,
2489                               vect_body);
2490     }
2491 }
2492
2493
2494 /* For constant and loop invariant defs of SLP_NODE this function returns
2495    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2496    OP_NUM determines whether we gather defs for operand 0 or operand 1 of the
2497    RHS of the scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to
2498    create.  REDUC_INDEX is the index of the reduction operand in the
2499    statements, unless it is -1.  */
2500
2501 static void
2502 vect_get_constant_vectors (tree op, slp_tree slp_node,
2503                            vec<tree> *vec_oprnds,
2504                            unsigned int op_num, unsigned int number_of_vectors,
2505                            int reduc_index)
2506 {
2507   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2508   gimple stmt = stmts[0];
2509   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2510   unsigned nunits;
2511   tree vec_cst;
2512   tree *elts;
2513   unsigned j, number_of_places_left_in_vector;
2514   tree vector_type;
2515   tree vop;
2516   int group_size = stmts.length ();
2517   unsigned int vec_num, i;
2518   unsigned number_of_copies = 1;
2519   vec<tree> voprnds;
2520   voprnds.create (number_of_vectors);
2521   bool constant_p, is_store;
2522   tree neutral_op = NULL;
2523   enum tree_code code = gimple_expr_code (stmt);
2524   gimple def_stmt;
2525   struct loop *loop;
2526   gimple_seq ctor_seq = NULL;
2527
2528   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2529       && reduc_index != -1)
2530     {
2531       op_num = reduc_index - 1;
2532       op = gimple_op (stmt, reduc_index);
2533       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2534          we need either neutral operands or the original operands.  See
2535          get_initial_def_for_reduction() for details.  */
2536       switch (code)
2537         {
2538           case WIDEN_SUM_EXPR:
2539           case DOT_PROD_EXPR:
2540           case PLUS_EXPR:
2541           case MINUS_EXPR:
2542           case BIT_IOR_EXPR:
2543           case BIT_XOR_EXPR:
2544              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2545                neutral_op = build_real (TREE_TYPE (op), dconst0);
2546              else
2547                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2548
2549              break;
2550
2551           case MULT_EXPR:
2552              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2553                neutral_op = build_real (TREE_TYPE (op), dconst1);
2554              else
2555                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2556
2557              break;
2558
2559           case BIT_AND_EXPR:
2560             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2561             break;
2562
2563           /* For MIN/MAX we don't have an easy neutral operand but
2564              the initial values can be used fine here.  Only for
2565              a reduction chain we have to force a neutral element.  */
2566           case MAX_EXPR:
2567           case MIN_EXPR:
2568             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2569               neutral_op = NULL;
2570             else
2571               {
2572                 def_stmt = SSA_NAME_DEF_STMT (op);
2573                 loop = (gimple_bb (stmt))->loop_father;
2574                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2575                                                     loop_preheader_edge (loop));
2576               }
2577             break;
2578
2579           default:
2580             neutral_op = NULL;
2581         }
2582     }
2583
2584   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2585     {
2586       is_store = true;
2587       op = gimple_assign_rhs1 (stmt);
2588     }
2589   else
2590     is_store = false;
2591
2592   gcc_assert (op);
2593
2594   if (CONSTANT_CLASS_P (op))
2595     constant_p = true;
2596   else
2597     constant_p = false;
2598
2599   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2600   gcc_assert (vector_type);
2601   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2602
2603   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2604      created vectors. It is greater than 1 if unrolling is performed.
2605
2606      For example, we have two scalar operands, s1 and s2 (e.g., group of
2607      strided accesses of size two), while NUNITS is four (i.e., four scalars
2608      of this type can be packed in a vector).  The output vector will contain
2609      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2610      will be 2).
2611
2612      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2613      containing the operands.
2614
2615      For example, NUNITS is four as before, and the group size is 8
2616      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2617      {s5, s6, s7, s8}.  */
2618
2619   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
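       /* With the first example above (GROUP_SIZE == 2, NUNITS == 4) this is
          least_common_multiple (4, 2) / 2 == 2 copies; with the second
          (GROUP_SIZE == 8, NUNITS == 4) it is 8 / 8 == 1 copy split across
          two vectors.  */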
2620
2621   number_of_places_left_in_vector = nunits;
2622   elts = XALLOCAVEC (tree, nunits);
2623   for (j = 0; j < number_of_copies; j++)
2624     {
2625       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2626         {
2627           if (is_store)
2628             op = gimple_assign_rhs1 (stmt);
2629           else
2630             {
2631               switch (code)
2632                 {
2633                   case COND_EXPR:
2634                     if (op_num == 0 || op_num == 1)
2635                       {
2636                         tree cond = gimple_assign_rhs1 (stmt);
2637                         op = TREE_OPERAND (cond, op_num);
2638                       }
2639                     else
2640                       {
2641                         if (op_num == 2)
2642                           op = gimple_assign_rhs2 (stmt);
2643                         else
2644                           op = gimple_assign_rhs3 (stmt);
2645                       }
2646                     break;
2647
2648                   case CALL_EXPR:
2649                     op = gimple_call_arg (stmt, op_num);
2650                     break;
2651
2652                   case LSHIFT_EXPR:
2653                   case RSHIFT_EXPR:
2654                   case LROTATE_EXPR:
2655                   case RROTATE_EXPR:
2656                     op = gimple_op (stmt, op_num + 1);
2657                     /* Unlike the other binary operators, shifts/rotates have
2658                        the shift count being int, instead of the same type as
2659                        the lhs, so make sure the scalar is the right type if
2660                        we are dealing with vectors of
2661                        long long/long/short/char.  */
2662                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2663                       op = fold_convert (TREE_TYPE (vector_type), op);
2664                     break;
2665
2666                   default:
2667                     op = gimple_op (stmt, op_num + 1);
2668                     break;
2669                 }
2670             }
2671
2672           if (reduc_index != -1)
2673             {
2674               loop = (gimple_bb (stmt))->loop_father;
2675               def_stmt = SSA_NAME_DEF_STMT (op);
2676
2677               gcc_assert (loop);
2678
2679               /* Get the def before the loop.  In reduction chain we have only
2680                  one initial value.  */
2681               if ((j != (number_of_copies - 1)
2682                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2683                        && i != 0))
2684                   && neutral_op)
2685                 op = neutral_op;
2686               else
2687                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2688                                             loop_preheader_edge (loop));
2689             }
2690
2691           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2692           number_of_places_left_in_vector--;
2693           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2694             {
2695               if (CONSTANT_CLASS_P (op))
2696                 {
2697                   op = fold_unary (VIEW_CONVERT_EXPR,
2698                                    TREE_TYPE (vector_type), op);
2699                   gcc_assert (op && CONSTANT_CLASS_P (op));
2700                 }
2701               else
2702                 {
2703                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2704                   gimple init_stmt;
2705                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2706                   init_stmt
2707                     = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2708                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2709                   op = new_temp;
2710                 }
2711             }
2712           elts[number_of_places_left_in_vector] = op;
2713           if (!CONSTANT_CLASS_P (op))
2714             constant_p = false;
2715
2716           if (number_of_places_left_in_vector == 0)
2717             {
2718               number_of_places_left_in_vector = nunits;
2719
2720               if (constant_p)
2721                 vec_cst = build_vector (vector_type, elts);
2722               else
2723                 {
2724                   vec<constructor_elt, va_gc> *v;
2725                   unsigned k;
2726                   vec_alloc (v, nunits);
2727                   for (k = 0; k < nunits; ++k)
2728                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2729                   vec_cst = build_constructor (vector_type, v);
2730                 }
2731               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2732                                                     vector_type, NULL));
2733               if (ctor_seq != NULL)
2734                 {
2735                   gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2736                   gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2737                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2738                                                         GSI_SAME_STMT);
2739                   ctor_seq = NULL;
2740                 }
2741             }
2742         }
2743     }
2744
2745   /* Since the vectors are created in reverse order, we should invert
2746      them.  */
2747   vec_num = voprnds.length ();
2748   for (j = vec_num; j != 0; j--)
2749     {
2750       vop = voprnds[j - 1];
2751       vec_oprnds->quick_push (vop);
2752     }
2753
2754   voprnds.release ();
2755
2756   /* If the VF is greater than the unrolling factor needed for the SLP group
2757      of stmts, the NUMBER_OF_VECTORS to be created is greater than
2758      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2759      to replicate the vectors.  */
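       /* Illustration (assumed counts): if NUMBER_OF_VECTORS == 4 but only
          two vectors were created above, the loop below pushes either the
          neutral vector (for reductions with a neutral element) or the two
          existing vectors again until four operands are available.  */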
2760   while (number_of_vectors > vec_oprnds->length ())
2761     {
2762       tree neutral_vec = NULL;
2763
2764       if (neutral_op)
2765         {
2766           if (!neutral_vec)
2767             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2768
2769           vec_oprnds->quick_push (neutral_vec);
2770         }
2771       else
2772         {
2773           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2774             vec_oprnds->quick_push (vop);
2775         }
2776     }
2777 }
2778
2779
2780 /* Get vectorized definitions from SLP_NODE that contains corresponding
2781    vectorized def-stmts.  */
2782
2783 static void
2784 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2785 {
2786   tree vec_oprnd;
2787   gimple vec_def_stmt;
2788   unsigned int i;
2789
2790   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2791
2792   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2793     {
2794       gcc_assert (vec_def_stmt);
2795       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2796       vec_oprnds->quick_push (vec_oprnd);
2797     }
2798 }
2799
2800
2801 /* Get vectorized definitions for SLP_NODE.
2802    If the scalar definitions are loop invariants or constants, collect them and
2803    call vect_get_constant_vectors() to create vector stmts.
2804    Otherwise, the def-stmts must already be vectorized and the vectorized stmts
2805    must be stored in the corresponding child of SLP_NODE, and we call
2806    vect_get_slp_vect_defs () to retrieve them.  */
2807
2808 void
2809 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2810                    vec<vec<tree> > *vec_oprnds, int reduc_index)
2811 {
2812   gimple first_stmt;
2813   int number_of_vects = 0, i;
2814   unsigned int child_index = 0;
2815   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2816   slp_tree child = NULL;
2817   vec<tree> vec_defs;
2818   tree oprnd;
2819   bool vectorized_defs;
2820
2821   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2822   FOR_EACH_VEC_ELT (ops, i, oprnd)
2823     {
2824       /* For each operand we check if it has vectorized definitions in a child
2825          node or whether we need to create them (for invariants and constants).
2826          We check if the LHS of the first stmt of the next child matches OPRND.
2827          If it does, we found the correct child.  Otherwise, we call
2828          vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
2829          this child node is checked again for the next operand.  */
2830       vectorized_defs = false;
2831       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2832         {
2833           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2834
2835           /* We have to check both pattern and original def, if available.  */
2836           gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2837           gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2838
2839           if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2840               || (related
2841                   && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2842             {
2843               /* The number of vector defs is determined by the number of
2844                  vector statements in the node from which we get those
2845                  statements.  */
2846               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2847               vectorized_defs = true;
2848               child_index++;
2849             }
2850         }
2851
2852       if (!vectorized_defs)
2853         {
2854           if (i == 0)
2855             {
2856               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2857               /* Number of vector stmts was calculated according to LHS in
2858                  vect_schedule_slp_instance (), fix it by replacing LHS with
2859                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
2860                  details.  */
2861               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2862                                              &rhs_size_unit);
2863               if (rhs_size_unit != lhs_size_unit)
2864                 {
2865                   number_of_vects *= rhs_size_unit;
2866                   number_of_vects /= lhs_size_unit;
2867                 }
2868             }
2869         }
2870
2871       /* Allocate memory for vectorized defs.  */
2872       vec_defs = vNULL;
2873       vec_defs.create (number_of_vects);
2874
2875       /* For reduction defs we call vect_get_constant_vectors (), since we are
2876          looking for initial loop invariant values.  */
2877       if (vectorized_defs && reduc_index == -1)
2878         /* The defs are already vectorized.  */
2879         vect_get_slp_vect_defs (child, &vec_defs);
2880       else
2881         /* Build vectors from scalar defs.  */
2882         vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2883                                    number_of_vects, reduc_index);
2884
2885       vec_oprnds->quick_push (vec_defs);
2886
2887       /* For reductions, we only need initial values.  */
2888       if (reduc_index != -1)
2889         return;
2890     }
2891 }
2892
2893
2894 /* Create NCOPIES permutation statements using the vector mask MASK and two
2895    input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for
2896    the first copy, shifting by STRIDE elements of DR_CHAIN for every copy.
2897    (STRIDE is the number of vectorized stmts for NODE divided by the number of
2898    copies.)
2899    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE at
2900    which the created stmts must be inserted.  */
2902
2903 static inline void
2904 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2905                            tree mask, int first_vec_indx, int second_vec_indx,
2906                            gimple_stmt_iterator *gsi, slp_tree node,
2907                            tree vectype, vec<tree> dr_chain,
2908                            int ncopies, int vect_stmts_counter)
2909 {
2910   tree perm_dest;
2911   gimple perm_stmt = NULL;
2912   stmt_vec_info next_stmt_info;
2913   int i, stride;
2914   tree first_vec, second_vec, data_ref;
2915
2916   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
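       /* E.g. (numbers assumed for illustration): with 4 vector stmts in
          NODE and NCOPIES == 2, STRIDE == 2, so copy I of this permutation
          is stored at slot 2 * I + VECT_STMTS_COUNTER below.  */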
2917
2918   /* Initialize the vect stmts of NODE to properly insert the generated
2919      stmts later.  */
2920   for (i = SLP_TREE_VEC_STMTS (node).length ();
2921        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2922     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2923
2924   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2925   for (i = 0; i < ncopies; i++)
2926     {
2927       first_vec = dr_chain[first_vec_indx];
2928       second_vec = dr_chain[second_vec_indx];
2929
2930       /* Generate the permute statement.  */
2931       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
2932                                        first_vec, second_vec, mask);
2933       data_ref = make_ssa_name (perm_dest, perm_stmt);
2934       gimple_set_lhs (perm_stmt, data_ref);
2935       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2936
2937       /* Store the vector statement in NODE.  */
2938       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2939
2940       first_vec_indx += stride;
2941       second_vec_indx += stride;
2942     }
2943
2944   /* Mark the scalar stmt as vectorized.  */
2945   next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2946   STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2947 }
2948
2949
2950 /* Given FIRST_MASK_ELEMENT - the mask element in element representation -
2951    return in CURRENT_MASK_ELEMENT its equivalent in target specific
2952    representation.  Check that the mask is valid and return FALSE if not.
2953    Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2954    the next vector, i.e., the current first vector is not needed.  */
2955
2956 static bool
2957 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2958                        int mask_nunits, bool only_one_vec, int index,
2959                        unsigned char *mask, int *current_mask_element,
2960                        bool *need_next_vector, int *number_of_mask_fixes,
2961                        bool *mask_fixed, bool *needs_first_vector)
2962 {
2963   int i;
2964
2965   /* Convert to target specific representation.  */
2966   *current_mask_element = first_mask_element + m;
2967   /* Adjust the value in case it's a mask for second and third vectors.  */
2968   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2969
2970   if (*current_mask_element < mask_nunits)
2971     *needs_first_vector = true;
2972
2973   /* We have only one input vector to permute but the mask accesses values in
2974      the next vector as well.  */
2975   if (only_one_vec && *current_mask_element >= mask_nunits)
2976     {
2977       if (dump_enabled_p ())
2978         {
2979           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2980                            "permutation requires at least two vectors ");
2981           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2982           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2983         }
2984
2985       return false;
2986     }
2987
2988   /* The mask requires the next vector.  */
2989   while (*current_mask_element >= mask_nunits * 2)
2990     {
2991       if (*needs_first_vector || *mask_fixed)
2992         {
2993           /* We either need the first vector too or have already moved to the
2994              next vector. In both cases, this permutation needs three
2995              vectors.  */
2996           if (dump_enabled_p ())
2997             {
2998               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2999                                "permutation requires at "
3000                                "least three vectors ");
3001               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3002               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3003             }
3004
3005           return false;
3006         }
3007
3008       /* We move to the next vector, dropping the first one and working with
3009          the second and the third - we need to adjust the values of the mask
3010          accordingly.  */
3011       *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3012
3013       for (i = 0; i < index; i++)
3014         mask[i] -= mask_nunits * *number_of_mask_fixes;
3015
3016       (*number_of_mask_fixes)++;
3017       *mask_fixed = true;
3018     }
3019
3020   *need_next_vector = *mask_fixed;
3021
3022   /* This was the last element of this mask. Start a new one.  */
3023   if (index == mask_nunits - 1)
3024     {
3025       *number_of_mask_fixes = 1;
3026       *mask_fixed = false;
3027       *needs_first_vector = false;
3028     }
3029
3030   return true;
3031 }
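
/* A simplified standalone trace (hypothetical driver, not built) of the
   fix-up loop above: raw indices {6,9,9,9} with MASK_NUNITS == 4 exceed
   the two-operand range [0, 7], so the first input vector is dropped and
   every element rebased, yielding {2,5,5,5}.  This is exactly the case
   that arises in the interleaving example described before
   vect_transform_slp_perm_load below.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int raw[4] = { 6, 9, 9, 9 };
  unsigned char mask[4];
  int mask_nunits = 4, number_of_mask_fixes = 1;

  for (int index = 0; index < 4; index++)
    {
      int elt = raw[index] - mask_nunits * (number_of_mask_fixes - 1);
      while (elt >= mask_nunits * 2)
        {
          /* Rebase the current element and everything emitted so far.  */
          elt -= mask_nunits * number_of_mask_fixes;
          for (int i = 0; i < index; i++)
            mask[i] -= mask_nunits * number_of_mask_fixes;
          number_of_mask_fixes++;
        }
      mask[index] = elt;
    }

  for (int i = 0; i < 4; i++)
    printf ("%d ", mask[i]);    /* Prints: 2 5 5 5  */
  printf ("\n");
  return 0;
}
#endif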


/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, nunits, vec_index = 0, scalar_index;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, current_mask_element, ncopies;
  unsigned char *mask;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  int number_of_mask_fixes = 1;
  bool mask_fixed = false;
  bool needs_first_vector = false;
  machine_mode mode;

  mode = TYPE_MODE (vectype);

  if (!can_vec_perm_p (mode, false, NULL))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "no vect permute for ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
                (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  mask = XALLOCAVEC (unsigned char, nunits);
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on the
     SLP_NODE_INSTANCE unrolling factor.  */
  orig_vec_stmts_num = group_size *
                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* The number of copies is determined by the final vectorization factor
     relative to the SLP_NODE_INSTANCE unrolling factor.  */
  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE.  The number of masks for
     each NODE is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for the permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  (A standalone sketch of this index arithmetic follows
     the function.)  */

  scalar_index = 0;
  index = 0;
  vect_stmts_counter = 0;
  vec_index = 0;
  first_vec_index = vec_index++;
  if (only_one_vec)
    second_vec_index = first_vec_index;
  else
    second_vec_index = vec_index++;

  for (j = 0; j < unroll_factor; j++)
    {
      for (k = 0; k < group_size; k++)
        {
          i = SLP_TREE_LOAD_PERMUTATION (node)[k];
          first_mask_element = i + j * group_size;
          if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                      nunits, only_one_vec, index,
                                      mask, &current_mask_element,
                                      &need_next_vector,
                                      &number_of_mask_fixes, &mask_fixed,
                                      &needs_first_vector))
            return false;
          gcc_assert (current_mask_element < 2 * nunits);
          mask[index++] = current_mask_element;

          if (index == nunits)
            {
              index = 0;
              if (!can_vec_perm_p (mode, false, mask))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported vect permute { ");
                      for (i = 0; i < nunits; ++i)
                        dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
                                     mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                    }
                  return false;
                }

              if (!analyze_only)
                {
                  int l;
                  tree mask_vec, *mask_elts;
                  mask_elts = XALLOCAVEC (tree, nunits);
                  for (l = 0; l < nunits; ++l)
                    mask_elts[l] = build_int_cst (mask_element_type,
                                                  mask[l]);
                  mask_vec = build_vector (mask_type, mask_elts);

                  if (need_next_vector)
                    {
                      first_vec_index = second_vec_index;
                      second_vec_index = vec_index;
                    }

                  next_scalar_stmt
                      = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                  vect_create_mask_and_perm (stmt, next_scalar_stmt,
                           mask_vec, first_vec_index, second_vec_index,
                           gsi, node, vectype, dr_chain,
                           ncopies, vect_stmts_counter++);
                }
            }
        }
    }

  return true;
}
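
/* A standalone sketch (hypothetical values, not built) of the index
   arithmetic from the a0b0c0... example above: assuming a load
   permutation of {0,0,0} for the a's node, GROUP_SIZE == 3, NUNITS == 4
   and an unrolling factor of 4, the raw mask elements come out as
   0,0,0,3  3,3,6,6  6,9,9,9 - the last mask is then rebased to
   {2,5,5,5} by vect_get_mask_element as traced earlier.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int load_permutation[3] = { 0, 0, 0 };  /* a's come first in the group */
  int group_size = 3, nunits = 4, unroll_factor = 4, index = 0;

  for (int j = 0; j < unroll_factor; j++)
    for (int k = 0; k < group_size; k++)
      {
        printf ("%d ", load_permutation[k] + j * group_size);
        if (++index == nunits)
          {
            printf ("| ");      /* one full mask collected */
            index = 0;
          }
      }
  printf ("\n");                /* Prints: 0 0 0 3 | 3 3 6 6 | 6 9 9 9 |  */
  return 0;
}
#endif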


/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  The number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
     size.  (A worked example follows this function.)  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
        last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}
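
/* A worked example (hypothetical numbers, not built) of the
   VEC_STMTS_SIZE computation above, referenced from the comment in
   vect_schedule_slp_instance.  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* Two scalar stmts per SLP group, vectorization factor 4, vectors of
     four elements: every scalar stmt runs VF times per vector iteration,
     so the node needs (4 * 2) / 4 == 2 vector stmts.  */
  unsigned int vectorization_factor = 4, group_size = 2, nunits = 4;

  printf ("vec_stmts_size = %u\n",
          (vectorization_factor * group_size) / nunits);
  return 0;
}
#endif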

/* Replace the scalar calls in SLP node NODE by assignments that set their
   lhs to zero.  For loop vectorization this is done in vectorizable_call,
   but for SLP it needs to be deferred until the end of vect_schedule_slp,
   because multiple SLP instances may refer to the same scalar stmt.  (An
   illustration follows the function.)  */

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}
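
/* A hypothetical before/after illustration of the replacement performed
   above (gimple shown informally; the names are made up):

     before:   x_1 = sqrtf (a_2);   <-- scalar call, fully SLP-vectorized
     after:    x_1 = 0.0f;          <-- lhs preserved, call dropped

   Keeping a valid definition of x_1 around until dead code is cleaned up
   is what lets several SLP instances safely refer to the same scalar
   stmt.  */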

/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}


/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}