/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "hashtab.h"
#include "rtl.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"              /* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if one is found, and UNKNOWN_LOCATION
   otherwise.  */
source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
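      /* A COND_EXPR embeds its comparison as the first operand; the
         comparison's two operands are handled as separate SLP operands,
         hence the extra slot.  */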
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
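/* For example, if FIRST_STMT heads the chain a[0], a[1], a[2], the stmt
   that accesses a[2] is at place 2 in the chain.  */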

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error could be corrected by swapping the operands of the operation,
   and 0 if everything is OK.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
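      /* When SWAPPED, "swapped ? !i : i" exchanges operand indices 0 and 1,
         re-trying the match with the two operands commutated.  */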
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}


/* Verify whether the scalar stmts STMTS are isomorphic, require no data
   permutation, and are of supported operation types.  Return true if they
   are, otherwise return false and indicate in *MATCHES which stmts are not
   isomorphic to the first one.  If MATCHES[0] is false then this indicates
   that the comparison could not be carried out or the stmts will never be
   vectorized by SLP.  */
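/* For example, with GROUP_SIZE 4, MATCHES = { true, true, false, true }
   reports that only STMTS[2] failed to match STMTS[0].  */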

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
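              /* E.g., a group of 2 loads with MAX_NUNITS 4 gives an
                 unrolling factor of 2: one vector covers two adjacent
                 groups, so any gap between the groups would be loaded
                 as well.  */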
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                        && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                       /* If the group is split up then GROUP_GAP
                          isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
                       || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation; in that case
   *MATCHES indicates which scalar stmts failed to match the first one.
   Otherwise, return TRUE.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, (i == 0), &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again ... */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

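  /* Scatter: the stmt at index I moves to position PERMUTATION[I]; e.g.
     permutation { 2, 0, 1 } sends stmt 0 to slot 2, stmt 1 to slot 0 and
     stmt 2 to slot 1.  */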
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */
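  /* E.g., if every load node has load permutation { 1, 0 }, the scalar
     stmts of all nodes are rearranged once and the recorded permutations
     are dropped.  */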

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
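  /* E.g., for GROUP_SIZE 2 this accepts two load nodes with permutations
     { 0, 0 } and { 1, 1 }: each node reads a single dr GROUP_SIZE times.  */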
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

1508   /* Calculate the number of vector stmts to create based on the unrolling
1509      factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1510      GROUP_SIZE / NUNITS otherwise).  */
1511   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
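  /* E.g., with NUNITS == 4 and GROUP_SIZE == 6 the least common multiple
     is 12, so NCOPIES_FOR_COST is 12 / 4 == 3.  */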
1512   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
1513
1514   prologue_cost_vec.create (10);
1515   body_cost_vec.create (10);
1516   SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1517   vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1518                            instance, SLP_INSTANCE_TREE (instance),
1519                            &prologue_cost_vec, ncopies_for_cost);
1520
1521   /* Record the prologue costs, which were delayed until we were
1522      sure that SLP was successful.  Unlike the body costs, we know
1523      the final values now regardless of the loop vectorization factor.  */
1524   void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1525                 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1526   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1527     {
1528       struct _stmt_vec_info *stmt_info
1529         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1530       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1531                             si->misalign, vect_prologue);
1532     }
1533
1534   prologue_cost_vec.release ();
1535 }
1536
1537 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1538    vect_build_slp_tree to build a tree of packed stmts if possible.
1539    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1540
1541 static bool
1542 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1543                            gimple stmt, unsigned max_tree_size)
1544 {
1545   slp_instance new_instance;
1546   slp_tree node;
1547   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1548   unsigned int unrolling_factor = 1, nunits;
1549   tree vectype, scalar_type = NULL_TREE;
1550   gimple next;
1551   unsigned int vectorization_factor = 0;
1552   int i;
1553   unsigned int max_nunits = 0;
1554   vec<slp_tree> loads;
1555   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1556   vec<gimple> scalar_stmts;
1557
1558   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1559     {
1560       if (dr)
1561         {
1562           scalar_type = TREE_TYPE (DR_REF (dr));
1563           vectype = get_vectype_for_scalar_type (scalar_type);
1564         }
1565       else
1566         {
1567           gcc_assert (loop_vinfo);
1568           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1569         }
1570
1571       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1572     }
1573   else
1574     {
1575       gcc_assert (loop_vinfo);
1576       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1577       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1578     }
1579
1580   if (!vectype)
1581     {
1582       if (dump_enabled_p ())
1583         {
1584           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1585                            "Build SLP failed: unsupported data-type ");
1586           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1587           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1588         }
1589
1590       return false;
1591     }
1592
1593   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1594   if (loop_vinfo)
1595     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1596   else
1597     vectorization_factor = nunits;
1598
1599   /* Calculate the unrolling factor.  */
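  /* E.g., 4-element vectors and a group of 3 stmts give an unrolling
     factor of least_common_multiple (4, 3) / 3 == 4, i.e. 12 scalar
     stmts filling exactly 3 vectors.  */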
1600   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1601   if (unrolling_factor != 1 && !loop_vinfo)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605                          "Build SLP failed: unrolling required in basic"
1606                          " block SLP\n");
1607
1608       return false;
1609     }
1610
1611   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1612   scalar_stmts.create (group_size);
1613   next = stmt;
1614   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1615     {
1616       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
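      /* If a store was replaced during pattern recognition, record the
         pattern stmt instead of the original stmt.  */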
1617       while (next)
1618         {
1619           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1620               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1621             scalar_stmts.safe_push (
1622                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1623           else
1624             scalar_stmts.safe_push (next);
1625           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1626         }
1627     }
1628   else
1629     {
1630       /* Collect reduction statements.  */
1631       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1632       for (i = 0; reductions.iterate (i, &next); i++)
1633         scalar_stmts.safe_push (next);
1634     }
1635
1636   node = vect_create_new_slp_node (scalar_stmts);
1637
1638   loads.create (group_size);
1639
1640   /* Build the tree for the SLP instance.  */
1641   bool *matches = XALLOCAVEC (bool, group_size);
1642   unsigned npermutes = 0;
1643   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1644                            &max_nunits, &loads,
1645                            vectorization_factor, matches, &npermutes, NULL,
1646                            max_tree_size))
1647     {
1648       /* Calculate the unrolling factor based on the smallest type.  */
1649       if (max_nunits > nunits)
1650         unrolling_factor = least_common_multiple (max_nunits, group_size)
1651                            / group_size;
1652
1653       if (unrolling_factor != 1 && !loop_vinfo)
1654         {
1655           if (dump_enabled_p ())
1656             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657                              "Build SLP failed: unrolling required in basic"
1658                              " block SLP\n");
1659           vect_free_slp_tree (node);
1660           loads.release ();
1661           return false;
1662         }
1663
1664       /* Create a new SLP instance.  */
1665       new_instance = XNEW (struct _slp_instance);
1666       SLP_INSTANCE_TREE (new_instance) = node;
1667       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1668       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1669       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1670       SLP_INSTANCE_LOADS (new_instance) = loads;
1671       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1672
1673       /* Compute the load permutation.  */
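      /* E.g., loads {a[1], a[0], a[3], a[2]} from the chain starting at
         a[0] yield the permutation {1, 0, 3, 2} recorded below.  */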
1674       slp_tree load_node;
1675       bool loads_permuted = false;
1676       FOR_EACH_VEC_ELT (loads, i, load_node)
1677         {
1678           vec<unsigned> load_permutation;
1679           int j;
1680           gimple load, first_stmt;
1681           bool this_load_permuted = false;
1682           load_permutation.create (group_size);
1683           first_stmt = GROUP_FIRST_ELEMENT
1684               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1685           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1686             {
1687               int load_place
1688                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1689               gcc_assert (load_place != -1);
1690               if (load_place != j)
1691                 this_load_permuted = true;
1692               load_permutation.safe_push (load_place);
1693             }
1694           if (!this_load_permuted)
1695             {
1696               load_permutation.release ();
1697               continue;
1698             }
1699           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1700           loads_permuted = true;
1701         }
1702
1703       if (loads_permuted)
1704         {
1705           if (!vect_supported_load_permutation_p (new_instance))
1706             {
1707               if (dump_enabled_p ())
1708                 {
1709                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710                                    "Build SLP failed: unsupported load "
1711                                    "permutation ");
1712                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1713                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1714                 }
1715               vect_free_slp_instance (new_instance);
1716               return false;
1717             }
1718
1719           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1720             = vect_find_first_load_in_slp_instance (new_instance);
1721         }
1722
1723       /* Compute the costs of this SLP instance.  */
1724       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1725                              new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1726
1727       if (loop_vinfo)
1728         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1729       else
1730         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1731
1732       if (dump_enabled_p ())
1733         vect_print_slp_tree (MSG_NOTE, node);
1734
1735       return true;
1736     }
1737
1738   /* Failed to SLP.  */
1739   /* Free the allocated memory.  */
1740   vect_free_slp_tree (node);
1741   loads.release ();
1742
1743   return false;
1744 }
1745
1746
1747 /* Check if there are stmts in the loop that can be vectorized using SLP.
1748    Build SLP trees of packed scalar stmts if SLP is possible.  */
1749
1750 bool
1751 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1752                   unsigned max_tree_size)
1753 {
1754   unsigned int i;
1755   vec<gimple> grouped_stores;
1756   vec<gimple> reductions = vNULL;
1757   vec<gimple> reduc_chains = vNULL;
1758   gimple first_element;
1759   bool ok = false;
1760
1761   if (dump_enabled_p ())
1762     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1763
1764   if (loop_vinfo)
1765     {
1766       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1767       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1768       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1769     }
1770   else
1771     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1772
1773   /* Find SLP sequences starting from groups of grouped stores.  */
1774   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1775     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1776                                    max_tree_size))
1777       ok = true;
1778
1779   if (bb_vinfo && !ok)
1780     {
1781       if (dump_enabled_p ())
1782         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1783                          "Failed to SLP the basic block.\n");
1784
1785       return false;
1786     }
1787
1788   if (loop_vinfo
1789       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1790     {
1791       /* Find SLP sequences starting from reduction chains.  */
1792       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1793         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1794                                        max_tree_size))
1795           ok = true;
1796         else
1797           return false;
1798
1799       /* Don't try to vectorize SLP reductions if reduction chain was
1800          detected.  */
1801       return ok;
1802     }
1803
1804   /* Find SLP sequences starting from groups of reductions.  */
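  /* A single call starting from the first reduction suffices, since the
     instance built for it collects all of LOOP_VINFO_REDUCTIONS (see the
     reduction branch of vect_analyze_slp_instance).  */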
1805   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1806       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1807                                     max_tree_size))
1808     ok = true;
1809
1810   return true;
1811 }
1812
1813
1814 /* For each possible SLP instance decide whether to SLP it and calculate the
1815    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
1816    to SLP at least one instance.  */
1817
1818 bool
1819 vect_make_slp_decision (loop_vec_info loop_vinfo)
1820 {
1821   unsigned int i, unrolling_factor = 1;
1822   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1823   slp_instance instance;
1824   int decided_to_slp = 0;
1825
1826   if (dump_enabled_p ())
1827     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1828                      "\n");
1829
1830   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1831     {
1832       /* FORNOW: SLP if you can.  */
1833       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1834         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1835
1836       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1837          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1838          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1839       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1840       decided_to_slp++;
1841     }
1842
1843   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1844
1845   if (decided_to_slp && dump_enabled_p ())
1846     dump_printf_loc (MSG_NOTE, vect_location,
1847                      "Decided to SLP %d instances. Unrolling factor %d\n",
1848                      decided_to_slp, unrolling_factor);
1849
1850   return (decided_to_slp > 0);
1851 }
1852
1853
1854 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1855    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
1856
1857 static void
1858 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1859 {
1860   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1861   imm_use_iterator imm_iter;
1862   gimple use_stmt;
1863   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1864   slp_tree child;
1865   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1866   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1867   int j;
1868
1869   /* Propagate hybrid down the SLP tree.  */
1870   if (stype == hybrid)
1871     ;
1872   else if (HYBRID_SLP_STMT (stmt_vinfo))
1873     stype = hybrid;
1874   else
1875     {
1876       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1877       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
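      /* A use makes STMT hybrid if the use stmt is not itself SLPed but
         still needs the scalar result: it is relevant, part of a
         vectorizable cycle, or belongs to a pattern whose pattern stmt
         is not SLPed.  Uses in reduction PHIs do not force hybrid.  */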
1878       if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1879         FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1880           if (gimple_bb (use_stmt)
1881               && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1882               && (use_vinfo = vinfo_for_stmt (use_stmt))
1883               && !STMT_SLP_TYPE (use_vinfo)
1884               && (STMT_VINFO_RELEVANT (use_vinfo)
1885                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))
1886                   || (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1887                       && STMT_VINFO_RELATED_STMT (use_vinfo)
1888                       && !STMT_SLP_TYPE (vinfo_for_stmt
1889                             (STMT_VINFO_RELATED_STMT (use_vinfo)))))
1890               && !(gimple_code (use_stmt) == GIMPLE_PHI
1891                    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1892             stype = hybrid;
1893     }
1894
1895   if (stype == hybrid)
1896     STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1897
1898   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1899     vect_detect_hybrid_slp_stmts (child, i, stype);
1900 }
1901
1902 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1903
1904 static tree
1905 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1906 {
1907   walk_stmt_info *wi = (walk_stmt_info *)data;
1908   struct loop *loopp = (struct loop *)wi->info;
1909
1910   if (wi->is_lhs)
1911     return NULL_TREE;
1912
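  /* Uses inside pattern stmts do not show up in the immediate-use lists
     of the original stmts, so conservatively mark a pure-SLP def inside
     the loop as hybrid when it feeds such a use.  */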
1913   if (TREE_CODE (*tp) == SSA_NAME
1914       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1915     {
1916       gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1917       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1918           && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1919         STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1920     }
1921
1922   return NULL_TREE;
1923 }
1924
1925 static tree
1926 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1927                           walk_stmt_info *)
1928 {
1929   /* If the stmt is in an SLP instance then this isn't a reason
1930      to mark use definitions in other SLP instances as hybrid.  */
1931   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1932     *handled = true;
1933   return NULL_TREE;
1934 }
1935
1936 /* Find stmts that must be both vectorized and SLPed.  */
1937
1938 void
1939 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1940 {
1941   unsigned int i;
1942   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1943   slp_instance instance;
1944
1945   if (dump_enabled_p ())
1946     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1947                      "\n");
1948
1949   /* First walk all pattern stmts in the loop and mark defs of uses as
1950      hybrid, because immediate uses in pattern stmts are not recorded.  */
1951   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1952     {
1953       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1954       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1955            gsi_next (&gsi))
1956         {
1957           gimple stmt = gsi_stmt (gsi);
1958           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1959           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1960             {
1961               walk_stmt_info wi;
1962               memset (&wi, 0, sizeof (wi));
1963               wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1964               gimple_stmt_iterator gsi2
1965                 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1966               walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1967                                 vect_detect_hybrid_slp_1, &wi);
1968               walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1969                                vect_detect_hybrid_slp_2,
1970                                vect_detect_hybrid_slp_1, &wi);
1971             }
1972         }
1973     }
1974
1975   /* Then walk the SLP instance trees marking stmts with uses in
1976      non-SLP stmts as hybrid, also propagating hybrid down the
1977      SLP tree, collecting the above info on-the-fly.  */
1978   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1979     {
1980       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
1981         vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
1982                                       i, pure_slp);
1983     }
1984 }
1985
1986
1987 /* Create and initialize a new bb_vec_info struct for BB, as well as
1988    stmt_vec_info structs for all the stmts in it.  */
1989
1990 static bb_vec_info
1991 new_bb_vec_info (basic_block bb)
1992 {
1993   bb_vec_info res = NULL;
1994   gimple_stmt_iterator gsi;
1995
1996   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1997   BB_VINFO_BB (res) = bb;
1998
1999   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2000     {
2001       gimple stmt = gsi_stmt (gsi);
2002       gimple_set_uid (stmt, 0);
2003       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2004     }
2005
2006   BB_VINFO_GROUPED_STORES (res).create (10);
2007   BB_VINFO_SLP_INSTANCES (res).create (2);
2008   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2009
2010   bb->aux = res;
2011   return res;
2012 }
2013
2014
2015 /* Free the BB_VINFO struct, as well as all the stmt_vec_info structs of
2016    all the stmts in the basic block.  */
2017
2018 static void
2019 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2020 {
2021   vec<slp_instance> slp_instances;
2022   slp_instance instance;
2023   basic_block bb;
2024   gimple_stmt_iterator si;
2025   unsigned i;
2026
2027   if (!bb_vinfo)
2028     return;
2029
2030   bb = BB_VINFO_BB (bb_vinfo);
2031
2032   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2033     {
2034       gimple stmt = gsi_stmt (si);
2035       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2036
2037       if (stmt_info)
2038         /* Free stmt_vec_info.  */
2039         free_stmt_vec_info (stmt);
2040     }
2041
2042   vect_destroy_datarefs (NULL, bb_vinfo);
2043   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2044   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2045   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2046   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2047     vect_free_slp_instance (instance);
2048   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2049   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2050   free (bb_vinfo);
2051   bb->aux = NULL;
2052 }
2053
2054
2055 /* Analyze statements contained in the SLP tree NODE after recursively
2056    analyzing the subtree.  Return TRUE if the operations are supported.  */
2057
2058 static bool
2059 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2060 {
2061   bool dummy;
2062   int i;
2063   gimple stmt;
2064   slp_tree child;
2065
2066   if (!node)
2067     return true;
2068
2069   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2070     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2071       return false;
2072
2073   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2074     {
2075       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2076       gcc_assert (stmt_info);
2077       gcc_assert (PURE_SLP_STMT (stmt_info));
2078
2079       if (!vect_analyze_stmt (stmt, &dummy, node))
2080         return false;
2081     }
2082
2083   return true;
2084 }
2085
2086
2087 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2088    operations are supported.  */
2089
2090 static bool
2091 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2092 {
2093   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2094   slp_instance instance;
2095   int i;
2096
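  /* Walk without a post-increment: when an instance is removed the next
     instance slides into slot I, so I only advances on success.  */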
2097   for (i = 0; slp_instances.iterate (i, &instance); )
2098     {
2099       if (!vect_slp_analyze_node_operations (bb_vinfo,
2100                                              SLP_INSTANCE_TREE (instance)))
2101         {
2102           vect_free_slp_instance (instance);
2103           slp_instances.ordered_remove (i);
2104         }
2105       else
2106         i++;
2107     }
2108
2109   if (!slp_instances.length ())
2110     return false;
2111
2112   return true;
2113 }
2114
2115
2116 /* Compute the scalar cost of the SLP node NODE and its children
2117    and return it.  Do not account defs that are marked in LIFE and
2118    update LIFE according to uses of NODE.  */
2119
2120 static unsigned
2121 vect_bb_slp_scalar_cost (basic_block bb,
2122                          slp_tree node, vec<bool, va_heap> *life)
2123 {
2124   unsigned scalar_cost = 0;
2125   unsigned i;
2126   gimple stmt;
2127   slp_tree child;
2128
2129   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2130     {
2131       unsigned stmt_cost;
2132       ssa_op_iter op_iter;
2133       def_operand_p def_p;
2134       stmt_vec_info stmt_info;
2135
2136       if ((*life)[i])
2137         continue;
2138
2139       /* If there is a non-vectorized use of the defs then the scalar
2140          stmt is kept live in which case we do not account it or any
2141          required defs in the SLP children in the scalar cost.  This
2142          way we make the vectorization more costly when compared to
2143          the scalar cost.  */
2144       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2145         {
2146           imm_use_iterator use_iter;
2147           gimple use_stmt;
2148           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2149             if (!is_gimple_debug (use_stmt)
2150                 && (gimple_code (use_stmt) == GIMPLE_PHI
2151                     || gimple_bb (use_stmt) != bb
2152                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2153               {
2154                 (*life)[i] = true;
2155                 BREAK_FROM_IMM_USE_STMT (use_iter);
2156               }
2157         }
2158       if ((*life)[i])
2159         continue;
2160
2161       stmt_info = vinfo_for_stmt (stmt);
2162       if (STMT_VINFO_DATA_REF (stmt_info))
2163         {
2164           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2165             stmt_cost = vect_get_stmt_cost (scalar_load);
2166           else
2167             stmt_cost = vect_get_stmt_cost (scalar_store);
2168         }
2169       else
2170         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2171
2172       scalar_cost += stmt_cost;
2173     }
2174
2175   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2176     scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2177
2178   return scalar_cost;
2179 }
2180
2181 /* Check if vectorization of the basic block is profitable.  */
2182
2183 static bool
2184 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2185 {
2186   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2187   slp_instance instance;
2188   int i, j;
2189   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2190   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2191   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2192   stmt_vec_info stmt_info = NULL;
2193   stmt_vector_for_cost body_cost_vec;
2194   stmt_info_for_cost *ci;
2195
2196   /* Calculate vector costs.  */
2197   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2198     {
2199       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2200
2201       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2202         {
2203           stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2204           (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2205                                 stmt_info, ci->misalign, vect_body);
2206         }
2207     }
2208
2209   /* Calculate scalar cost.  */
2210   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2211     {
2212       auto_vec<bool, 20> life;
2213       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2214       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2215                                               SLP_INSTANCE_TREE (instance),
2216                                               &life);
2217     }
2218
2219   /* Complete the target-specific cost calculation.  */
2220   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2221                &vec_inside_cost, &vec_epilogue_cost);
2222
2223   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2224
2225   if (dump_enabled_p ())
2226     {
2227       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2228       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2229                    vec_inside_cost);
2230       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2231       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2232       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2233     }
2234
2235   /* Vectorization is profitable if its cost is less than the cost of the
2236      scalar version.  */
2237   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2238     return false;
2239
2240   return true;
2241 }
2242
2243 /* Check if the basic block can be vectorized.  */
2244
2245 static bb_vec_info
2246 vect_slp_analyze_bb_1 (basic_block bb)
2247 {
2248   bb_vec_info bb_vinfo;
2249   vec<slp_instance> slp_instances;
2250   slp_instance instance;
2251   int i;
2252   int min_vf = 2;
2253   unsigned n_stmts = 0;
2254
2255   bb_vinfo = new_bb_vec_info (bb);
2256   if (!bb_vinfo)
2257     return NULL;
2258
2259   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2260     {
2261       if (dump_enabled_p ())
2262         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263                          "not vectorized: unhandled data-ref in basic "
2264                          "block.\n");
2265
2266       destroy_bb_vec_info (bb_vinfo);
2267       return NULL;
2268     }
2269
2270   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2271     {
2272       if (dump_enabled_p ())
2273         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274                          "not vectorized: not enough data-refs in "
2275                          "basic block.\n");
2276
2277       destroy_bb_vec_info (bb_vinfo);
2278       return NULL;
2279     }
2280
2281   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2282     {
2283       if (dump_enabled_p ())
2284         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285                          "not vectorized: unhandled data access in "
2286                          "basic block.\n");
2287
2288       destroy_bb_vec_info (bb_vinfo);
2289       return NULL;
2290     }
2291
2292   vect_pattern_recog (NULL, bb_vinfo);
2293
2294   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2295     {
2296       if (dump_enabled_p ())
2297         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298                          "not vectorized: bad data alignment in basic "
2299                          "block.\n");
2300
2301       destroy_bb_vec_info (bb_vinfo);
2302       return NULL;
2303     }
2304
2305   /* Check the SLP opportunities in the basic block, analyze and build SLP
2306      trees.  */
2307   if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2308     {
2309       if (dump_enabled_p ())
2310         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
2311                          "not vectorized: failed to find SLP opportunities "
2312                          "in basic block.\n");
2313
2314       destroy_bb_vec_info (bb_vinfo);
2315       return NULL;
2316     }
2317
2318   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2319
2320   /* Mark all the statements that we want to vectorize as pure SLP and
2321      relevant.  */
2322   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2323     {
2324       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2325       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2326     }
2327
2328   /* Mark all the statements that we do not want to vectorize.  */
2329   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2330        !gsi_end_p (gsi); gsi_next (&gsi))
2331     {
2332       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2333       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2334         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2335     }
2336
2337   /* Analyze dependences.  At this point all stmts not participating in
2338      vectorization have to be marked.  Dependence analysis assumes
2339      that we either vectorize all SLP instances or none at all.  */
2340   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2341     {
2342       if (dump_enabled_p ())
2343         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344                          "not vectorized: unhandled data dependence "
2345                          "in basic block.\n");
2346
2347       destroy_bb_vec_info (bb_vinfo);
2348       return NULL;
2349     }
2350
2351   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2352     {
2353       if (dump_enabled_p ())
2354         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2355                          "not vectorized: unsupported alignment in basic "
2356                          "block.\n");
2357       destroy_bb_vec_info (bb_vinfo);
2358       return NULL;
2359     }
2360
2361   if (!vect_slp_analyze_operations (bb_vinfo))
2362     {
2363       if (dump_enabled_p ())
2364         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2365                          "not vectorized: bad operation in basic block.\n");
2366
2367       destroy_bb_vec_info (bb_vinfo);
2368       return NULL;
2369     }
2370
2371   /* Cost model: check if the vectorization is worthwhile.  */
2372   if (!unlimited_cost_model (NULL)
2373       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2374     {
2375       if (dump_enabled_p ())
2376         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377                          "not vectorized: vectorization is not "
2378                          "profitable.\n");
2379
2380       destroy_bb_vec_info (bb_vinfo);
2381       return NULL;
2382     }
2383
2384   if (dump_enabled_p ())
2385     dump_printf_loc (MSG_NOTE, vect_location,
2386                      "Basic block will be vectorized using SLP\n");
2387
2388   return bb_vinfo;
2389 }
2390
2391
2392 bb_vec_info
2393 vect_slp_analyze_bb (basic_block bb)
2394 {
2395   bb_vec_info bb_vinfo;
2396   int insns = 0;
2397   gimple_stmt_iterator gsi;
2398   unsigned int vector_sizes;
2399
2400   if (dump_enabled_p ())
2401     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2402
2403   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2404     {
2405       gimple stmt = gsi_stmt (gsi);
2406       if (!is_gimple_debug (stmt)
2407           && !gimple_nop_p (stmt)
2408           && gimple_code (stmt) != GIMPLE_LABEL)
2409         insns++;
2410     }
2411
2412   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2413     {
2414       if (dump_enabled_p ())
2415         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2416                          "not vectorized: too many instructions in "
2417                          "basic block.\n");
2418
2419       return NULL;
2420     }
2421
2422   /* Autodetect the first vector size we try.  */
2423   current_vector_size = 0;
2424   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2425
2426   while (1)
2427     {
2428       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2429       if (bb_vinfo)
2430         return bb_vinfo;
2431
2432       destroy_bb_vec_info (bb_vinfo);
2433
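      /* The candidate sizes form a bitmask of powers of two; drop the
         size that just failed and pick the largest remaining one.  */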
2434       vector_sizes &= ~current_vector_size;
2435       if (vector_sizes == 0
2436           || current_vector_size == 0)
2437         return NULL;
2438
2439       /* Try the next biggest vector size.  */
2440       current_vector_size = 1 << floor_log2 (vector_sizes);
2441       if (dump_enabled_p ())
2442         dump_printf_loc (MSG_NOTE, vect_location,
2443                          "***** Re-trying analysis with "
2444                          "vector size %d\n", current_vector_size);
2445     }
2446 }
2447
2448
2449 /* SLP costs are calculated according to the SLP instance unrolling factor
2450    (i.e., the number of created vector stmts depends on the unrolling factor).
2451    However, the actual number of vector stmts for every SLP node depends on
2452    VF, which is set later in vect_analyze_operations ().  Hence, SLP costs
2453    should be updated.  In this function we assume that the inside costs
2454    calculated in vect_model_xxx_cost are linear in ncopies.  */
2455
2456 void
2457 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2458 {
2459   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2460   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2461   slp_instance instance;
2462   stmt_vector_for_cost body_cost_vec;
2463   stmt_info_for_cost *si;
2464   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2465
2466   if (dump_enabled_p ())
2467     dump_printf_loc (MSG_NOTE, vect_location,
2468                      "=== vect_update_slp_costs_according_to_vf ===\n");
2469
2470   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2471     {
2472       /* We assume that costs are linear in ncopies.  */
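      /* E.g., with VF == 8 and an instance unrolling factor of 2, each
         recorded body cost is accounted 4 times below.  */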
2473       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2474
2475       /* Record the instance's instructions in the target cost model.
2476          This was delayed until here because the count of instructions
2477          isn't known beforehand.  */
2478       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2479
2480       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2481         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2482                               vinfo_for_stmt (si->stmt), si->misalign,
2483                               vect_body);
2484     }
2485 }
2486
2487
2488 /* For constant and loop invariant defs of SLP_NODE this function returns
2489    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2490    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2491    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2492    REDUC_INDEX is the index of the reduction operand in the statements, unless
2493    it is -1.  */
2494
2495 static void
2496 vect_get_constant_vectors (tree op, slp_tree slp_node,
2497                            vec<tree> *vec_oprnds,
2498                            unsigned int op_num, unsigned int number_of_vectors,
2499                            int reduc_index)
2500 {
2501   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2502   gimple stmt = stmts[0];
2503   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2504   unsigned nunits;
2505   tree vec_cst;
2506   tree *elts;
2507   unsigned j, number_of_places_left_in_vector;
2508   tree vector_type;
2509   tree vop;
2510   int group_size = stmts.length ();
2511   unsigned int vec_num, i;
2512   unsigned number_of_copies = 1;
2513   vec<tree> voprnds;
2514   voprnds.create (number_of_vectors);
2515   bool constant_p, is_store;
2516   tree neutral_op = NULL;
2517   enum tree_code code = gimple_expr_code (stmt);
2518   gimple def_stmt;
2519   struct loop *loop;
2520   gimple_seq ctor_seq = NULL;
2521
2522   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2523       && reduc_index != -1)
2524     {
2525       op_num = reduc_index - 1;
2526       op = gimple_op (stmt, reduc_index);
2527       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2528          we need either neutral operands or the original operands.  See
2529          get_initial_def_for_reduction() for details.  */
2530       switch (code)
2531         {
2532           case WIDEN_SUM_EXPR:
2533           case DOT_PROD_EXPR:
2534           case PLUS_EXPR:
2535           case MINUS_EXPR:
2536           case BIT_IOR_EXPR:
2537           case BIT_XOR_EXPR:
2538              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2539                neutral_op = build_real (TREE_TYPE (op), dconst0);
2540              else
2541                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2542
2543              break;
2544
2545           case MULT_EXPR:
2546              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2547                neutral_op = build_real (TREE_TYPE (op), dconst1);
2548              else
2549                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2550
2551              break;
2552
2553           case BIT_AND_EXPR:
2554             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2555             break;
2556
2557           /* For MIN/MAX we don't have an easy neutral operand but
2558              the initial values can be used fine here.  Only for
2559              a reduction chain we have to force a neutral element.  */
2560           case MAX_EXPR:
2561           case MIN_EXPR:
2562             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2563               neutral_op = NULL;
2564             else
2565               {
2566                 def_stmt = SSA_NAME_DEF_STMT (op);
2567                 loop = (gimple_bb (stmt))->loop_father;
2568                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2569                                                     loop_preheader_edge (loop));
2570               }
2571             break;
2572
2573           default:
2574             neutral_op = NULL;
2575         }
2576     }
2577
2578   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2579     {
2580       is_store = true;
2581       op = gimple_assign_rhs1 (stmt);
2582     }
2583   else
2584     is_store = false;
2585
2586   gcc_assert (op);
2587
2588   if (CONSTANT_CLASS_P (op))
2589     constant_p = true;
2590   else
2591     constant_p = false;
2592
2593   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2594   gcc_assert (vector_type);
2595   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2596
2597   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2598      created vectors. It is greater than 1 if unrolling is performed.
2599
2600      For example, we have two scalar operands, s1 and s2 (e.g., group of
2601      strided accesses of size two), while NUNITS is four (i.e., four scalars
2602      of this type can be packed in a vector).  The output vector will contain
2603      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2604      will be 2).
2605
2606      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2607      containing the operands.
2608
2609      For example, NUNITS is four as before, and the group size is 8
2610      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2611      {s5, s6, s7, s8}.  */
2612
2613   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2614
2615   number_of_places_left_in_vector = nunits;
2616   elts = XALLOCAVEC (tree, nunits);
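  /* The vector elements are filled back to front while walking the
     scalar stmts in reverse, so element 0 ends up holding the operand
     of the group's first scalar stmt.  */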
2617   for (j = 0; j < number_of_copies; j++)
2618     {
2619       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2620         {
2621           if (is_store)
2622             op = gimple_assign_rhs1 (stmt);
2623           else
2624             {
2625               switch (code)
2626                 {
2627                   case COND_EXPR:
2628                     if (op_num == 0 || op_num == 1)
2629                       {
2630                         tree cond = gimple_assign_rhs1 (stmt);
2631                         op = TREE_OPERAND (cond, op_num);
2632                       }
2633                     else
2634                       {
2635                         if (op_num == 2)
2636                           op = gimple_assign_rhs2 (stmt);
2637                         else
2638                           op = gimple_assign_rhs3 (stmt);
2639                       }
2640                     break;
2641
2642                   case CALL_EXPR:
2643                     op = gimple_call_arg (stmt, op_num);
2644                     break;
2645
2646                   case LSHIFT_EXPR:
2647                   case RSHIFT_EXPR:
2648                   case LROTATE_EXPR:
2649                   case RROTATE_EXPR:
2650                     op = gimple_op (stmt, op_num + 1);
2651                     /* Unlike the other binary operators, shifts/rotates have
2652                        the shift count being int, instead of the same type as
2653                        the lhs, so make sure the scalar is the right type if
2654                        we are dealing with vectors of
2655                        long long/long/short/char.  */
2656                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2657                       op = fold_convert (TREE_TYPE (vector_type), op);
2658                     break;
2659
2660                   default:
2661                     op = gimple_op (stmt, op_num + 1);
2662                     break;
2663                 }
2664             }
2665
2666           if (reduc_index != -1)
2667             {
2668               loop = (gimple_bb (stmt))->loop_father;
2669               def_stmt = SSA_NAME_DEF_STMT (op);
2670
2671               gcc_assert (loop);
2672
2673               /* Get the def before the loop.  In reduction chain we have only
2674                  one initial value.  */
2675               if ((j != (number_of_copies - 1)
2676                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2677                        && i != 0))
2678                   && neutral_op)
2679                 op = neutral_op;
2680               else
2681                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2682                                             loop_preheader_edge (loop));
2683             }
2684
2685           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2686           number_of_places_left_in_vector--;
2687           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2688             {
2689               if (CONSTANT_CLASS_P (op))
2690                 {
2691                   op = fold_unary (VIEW_CONVERT_EXPR,
2692                                    TREE_TYPE (vector_type), op);
2693                   gcc_assert (op && CONSTANT_CLASS_P (op));
2694                 }
2695               else
2696                 {
2697                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2698                   gimple init_stmt;
2699                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2700                   init_stmt
2701                     = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2702                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2703                   op = new_temp;
2704                 }
2705             }
2706           elts[number_of_places_left_in_vector] = op;
2707           if (!CONSTANT_CLASS_P (op))
2708             constant_p = false;
2709
2710           if (number_of_places_left_in_vector == 0)
2711             {
2712               number_of_places_left_in_vector = nunits;
2713
2714               if (constant_p)
2715                 vec_cst = build_vector (vector_type, elts);
2716               else
2717                 {
2718                   vec<constructor_elt, va_gc> *v;
2719                   unsigned k;
2720                   vec_alloc (v, nunits);
2721                   for (k = 0; k < nunits; ++k)
2722                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2723                   vec_cst = build_constructor (vector_type, v);
2724                 }
2725               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2726                                                     vector_type, NULL));
2727               if (ctor_seq != NULL)
2728                 {
2729                   gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2730                   gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2731                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2732                                                         GSI_SAME_STMT);
2733                   ctor_seq = NULL;
2734                 }
2735             }
2736         }
2737     }
2738
2739   /* The vectors were created in reverse order, so reverse them back
2740      into the original order here.  */
2741   vec_num = voprnds.length ();
2742   for (j = vec_num; j != 0; j--)
2743     {
2744       vop = voprnds[j - 1];
2745       vec_oprnds->quick_push (vop);
2746     }
2747
2748   voprnds.release ();
2749
2750   /* In case VF is greater than the unrolling factor needed for the SLP
2751      group of stmts, the NUMBER_OF_VECTORS to be created is greater than
2752      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2753      to replicate the vectors.  */
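  /* E.g., if four vector defs are needed but only two were created above,
     the two are pushed again (or neutral vectors are pushed for
     reductions with a neutral operand).  */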
2754   while (number_of_vectors > vec_oprnds->length ())
2755     {
2756       tree neutral_vec = NULL;
2757
2758       if (neutral_op)
2759         {
2760           if (!neutral_vec)
2761             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2762
2763           vec_oprnds->quick_push (neutral_vec);
2764         }
2765       else
2766         {
2767           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2768             vec_oprnds->quick_push (vop);
2769         }
2770     }
2771 }
2772
2773
2774 /* Get vectorized definitions from SLP_NODE that contains corresponding
2775    vectorized def-stmts.  */
2776
2777 static void
2778 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2779 {
2780   tree vec_oprnd;
2781   gimple vec_def_stmt;
2782   unsigned int i;
2783
2784   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2785
2786   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2787     {
2788       gcc_assert (vec_def_stmt);
2789       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2790       vec_oprnds->quick_push (vec_oprnd);
2791     }
2792 }
2793
2794
2795 /* Get vectorized definitions for SLP_NODE.
2796    If the scalar definitions are loop invariants or constants, collect them and
2797    call vect_get_constant_vectors() to create vector stmts.
2798    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2799    must be stored in the corresponding child of SLP_NODE, and we call
2800    vect_get_slp_vect_defs () to retrieve them.  */
2801
2802 void
2803 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2804                    vec<vec<tree> > *vec_oprnds, int reduc_index)
2805 {
2806   gimple first_stmt;
2807   int number_of_vects = 0, i;
2808   unsigned int child_index = 0;
2809   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2810   slp_tree child = NULL;
2811   vec<tree> vec_defs;
2812   tree oprnd;
2813   bool vectorized_defs;
2814
2815   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2816   FOR_EACH_VEC_ELT (ops, i, oprnd)
2817     {
2818       /* For each operand we check if it has vectorized definitions in a child
2819          node or we need to create them (for invariants and constants).  We
2820          check if the LHS of the first stmt of the next child matches OPRND.
2821          If it does, we found the correct child.  Otherwise, we call
2822          vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
2823          to check this child node for the next operand.  */
2824       vectorized_defs = false;
2825       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2826         {
2827           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2828
2829           /* We have to check both pattern and original def, if available.  */
2830           gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2831           gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2832
2833           if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2834               || (related
2835                   && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2836             {
2837               /* The number of vector defs is determined by the number of
2838                  vector statements in the node from which we get those
2839                  statements.  */
2840               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2841               vectorized_defs = true;
2842               child_index++;
2843             }
2844         }
2845
2846       if (!vectorized_defs)
2847         {
2848           if (i == 0)
2849             {
2850               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2851               /* Number of vector stmts was calculated according to LHS in
2852                  vect_schedule_slp_instance (), fix it by replacing LHS with
2853                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
2854                  details.  */
2855               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2856                                              &rhs_size_unit);
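              /* E.g., if the RHS type is four times wider than the LHS
                 type, four times as many vector defs are needed.  */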
2857               if (rhs_size_unit != lhs_size_unit)
2858                 {
2859                   number_of_vects *= rhs_size_unit;
2860                   number_of_vects /= lhs_size_unit;
2861                 }
2862             }
2863         }
2864
2865       /* Allocate memory for vectorized defs.  */
2866       vec_defs = vNULL;
2867       vec_defs.create (number_of_vects);
2868
2869       /* For reduction defs we call vect_get_constant_vectors (), since we are
2870          looking for initial loop invariant values.  */
2871       if (vectorized_defs && reduc_index == -1)
2872         /* The defs are already vectorized.  */
2873         vect_get_slp_vect_defs (child, &vec_defs);
2874       else
2875         /* Build vectors from scalar defs.  */
2876         vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2877                                    number_of_vects, reduc_index);
2878
2879       vec_oprnds->quick_push (vec_defs);
2880
2881       /* For reductions, we only need initial values.  */
2882       if (reduc_index != -1)
2883         return;
2884     }
2885 }
2886
2887
2888 /* Create NCOPIES permutation statements using the permute mask MASK and
2889    two input vectors placed in DR_CHAIN at FIRST_VEC_INDX and
2890    SECOND_VEC_INDX for the first copy and shifting by STRIDE elements of
2891    DR_CHAIN for every copy.
2892    (STRIDE is the number of vectorized stmts for NODE divided by the number
2893    of copies.)
2894    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2895    where the created stmts must be inserted.  */
2896
2897 static inline void
2898 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2899                            tree mask, int first_vec_indx, int second_vec_indx,
2900                            gimple_stmt_iterator *gsi, slp_tree node,
2901                            tree vectype, vec<tree> dr_chain,
2902                            int ncopies, int vect_stmts_counter)
2903 {
2904   tree perm_dest;
2905   gimple perm_stmt = NULL;
2906   stmt_vec_info next_stmt_info;
2907   int i, stride;
2908   tree first_vec, second_vec, data_ref;
2909
2910   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2911
2912   /* Initialize the vect stmts of NODE to properly insert the generated
2913      stmts later.  */
2914   for (i = SLP_TREE_VEC_STMTS (node).length ();
2915        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2916     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2917
2918   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2919   for (i = 0; i < ncopies; i++)
2920     {
2921       first_vec = dr_chain[first_vec_indx];
2922       second_vec = dr_chain[second_vec_indx];
2923
2924       /* Generate the permute statement.  */
2925       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
2926                                        first_vec, second_vec, mask);
2927       data_ref = make_ssa_name (perm_dest, perm_stmt);
2928       gimple_set_lhs (perm_stmt, data_ref);
2929       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2930
2931       /* Store the vector statement in NODE.  */
2932       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2933
2934       first_vec_indx += stride;
2935       second_vec_indx += stride;
2936     }
2937
2938   /* Mark the scalar stmt as vectorized.  */
2939   next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2940   STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2941 }
2942
2943
2944 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2945    return in CURRENT_MASK_ELEMENT its equivalent in target specific
2946    representation.  Check that the mask is valid and return FALSE if not.
2947    Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
2948    the next vector, i.e., the current first vector is not needed.  */
2949
2950 static bool
2951 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2952                        int mask_nunits, bool only_one_vec, int index,
2953                        unsigned char *mask, int *current_mask_element,
2954                        bool *need_next_vector, int *number_of_mask_fixes,
2955                        bool *mask_fixed, bool *needs_first_vector)
2956 {
2957   int i;
2958
2959   /* Convert to target specific representation.  */
2960   *current_mask_element = first_mask_element + m;
2961   /* Adjust the value in case it's a mask for the second and third vectors.  */
2962   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2963
2964   if (*current_mask_element < mask_nunits)
2965     *needs_first_vector = true;
2966
2967   /* We have only one input vector to permute but the mask accesses values in
2968      the next vector as well.  */
2969   if (only_one_vec && *current_mask_element >= mask_nunits)
2970     {
2971       if (dump_enabled_p ())
2972         {
2973           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974                            "permutation requires at least two vectors ");
2975           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2976           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2977         }
2978
2979       return false;
2980     }
2981
2982   /* The mask requires the next vector.  */
2983   while (*current_mask_element >= mask_nunits * 2)
2984     {
2985       if (*needs_first_vector || *mask_fixed)
2986         {
2987           /* We either need the first vector too or have already moved to the
2988              next vector. In both cases, this permutation needs three
2989              vectors.  */
2990           if (dump_enabled_p ())
2991             {
2992               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2993                                "permutation requires at "
2994                                "least three vectors ");
2995               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2996               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2997             }
2998
2999           return false;
3000         }
3001
3002       /* We move to the next vector, dropping the first one and working with
3003          the second and the third - we need to adjust the values of the mask
3004          accordingly.  */
3005       *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3006
3007       for (i = 0; i < index; i++)
3008         mask[i] -= mask_nunits * *number_of_mask_fixes;
3009
3010       (*number_of_mask_fixes)++;
3011       *mask_fixed = true;
3012     }
3013
3014   *need_next_vector = *mask_fixed;
3015
3016   /* This was the last element of this mask. Start a new one.  */
3017   if (index == mask_nunits - 1)
3018     {
3019       *number_of_mask_fixes = 1;
3020       *mask_fixed = false;
3021       *needs_first_vector = false;
3022     }
3023
3024   return true;
3025 }
3026
3027
3028 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3029    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3030    permute statements for the SLP node NODE of the SLP instance
3031    SLP_NODE_INSTANCE.  */
3032
3033 bool
3034 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3035                               gimple_stmt_iterator *gsi, int vf,
3036                               slp_instance slp_node_instance, bool analyze_only)
3037 {
3038   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3039   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3040   tree mask_element_type = NULL_TREE, mask_type;
3041   int i, j, k, nunits, vec_index = 0, scalar_index;
3042   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3043   gimple next_scalar_stmt;
3044   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3045   int first_mask_element;
3046   int index, unroll_factor, current_mask_element, ncopies;
3047   unsigned char *mask;
3048   bool only_one_vec = false, need_next_vector = false;
3049   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3050   int number_of_mask_fixes = 1;
3051   bool mask_fixed = false;
3052   bool needs_first_vector = false;
3053   machine_mode mode;
3054
3055   mode = TYPE_MODE (vectype);
3056
3057   if (!can_vec_perm_p (mode, false, NULL))
3058     {
3059       if (dump_enabled_p ())
3060         {
3061           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3062                            "no vect permute for ");
3063           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3064           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3065         }
3066       return false;
3067     }
3068
3069   /* The generic VEC_PERM_EXPR code always uses an integral type of the
3070      same size as the vector element being permuted.  */
3071   mask_element_type = lang_hooks.types.type_for_mode
3072                 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3073   mask_type = get_vectype_for_scalar_type (mask_element_type);
3074   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3075   mask = XALLOCAVEC (unsigned char, nunits);
3076   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3077
3078   /* The number of vector stmts to generate based only on the
3079      SLP_NODE_INSTANCE unrolling factor.  */
3080   orig_vec_stmts_num
3081     = group_size * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3082   if (orig_vec_stmts_num == 1)
3083     only_one_vec = true;
3084
3085   /* The number of copies is determined by the final vectorization factor
3086      relative to the SLP_NODE_INSTANCE unrolling factor.  */
3087   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
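  /* An illustrative example (the numbers are not from any particular
     testcase): with GROUP_SIZE == 3, an unrolling factor of 4 and
     NUNITS == 4, ORIG_VEC_STMTS_NUM is 3 * 4 / 4 == 3; if the final
     VF is 8, NCOPIES is 8 / 4 == 2.  */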
3088
3089   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3090     return false;
3091
3092   /* Generate permutation masks for every NODE.  The number of masks for each
3093      NODE is equal to GROUP_SIZE.
3094      E.g., we have a group of three nodes with three loads from the same
3095      location in each node, and the vector size is 4.  I.e., we have an
3096      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3097      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3098      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3099      ...
3100
3101      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3102      The last mask is illegal since we assume two operands for the permute
3103      operation, so the mask element values must be smaller than 2 * NUNITS.
3104      Hence, the last mask must be converted into {2,5,5,5}.
3105      For the first two permutations we need the first and the second input
3106      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3107      we need the second and the third vectors: {b1,c1,a2,b2} and
3108      {c2,a3,b3,c3}.  */
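  /* To spell out the rebasing arithmetic for the example above (NUNITS == 4):
     once element 9 of {6,9,9,9} exceeds 2 * NUNITS - 1, NUNITS is subtracted
     from every element, giving {6-4, 9-4, 9-4, 9-4} == {2,5,5,5}, and the
     pair of input vectors advances from (first, second) to (second, third).  */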
3109
3110     {
3111       scalar_index = 0;
3112       index = 0;
3113       vect_stmts_counter = 0;
3114       vec_index = 0;
3115       first_vec_index = vec_index++;
3116       if (only_one_vec)
3117         second_vec_index = first_vec_index;
3118       else
3119        second_vec_index = vec_index++;
3120
3121       for (j = 0; j < unroll_factor; j++)
3122         {
3123           for (k = 0; k < group_size; k++)
3124             {
3125               i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3126               first_mask_element = i + j * group_size;
3127               if (!vect_get_mask_element (stmt, first_mask_element, 0,
3128                                           nunits, only_one_vec, index,
3129                                           mask, &current_mask_element,
3130                                           &need_next_vector,
3131                                           &number_of_mask_fixes, &mask_fixed,
3132                                           &needs_first_vector))
3133                 return false;
3134               gcc_assert (current_mask_element < 2 * nunits);
3135               mask[index++] = current_mask_element;
3136
3137               if (index == nunits)
3138                 {
3139                   index = 0;
3140                   if (!can_vec_perm_p (mode, false, mask))
3141                     {
3142                       if (dump_enabled_p ())
3143                         {
3144                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3145                                            vect_location,
3146                                            "unsupported vect permute { ");
3147                           for (i = 0; i < nunits; ++i)
3148                             dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3149                                          mask[i]);
3150                           dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3151                         }
3152                       return false;
3153                     }
3154
3155                   if (!analyze_only)
3156                     {
3157                       int l;
3158                       tree mask_vec, *mask_elts;
3159                       mask_elts = XALLOCAVEC (tree, nunits);
3160                       for (l = 0; l < nunits; ++l)
3161                         mask_elts[l] = build_int_cst (mask_element_type,
3162                                                       mask[l]);
3163                       mask_vec = build_vector (mask_type, mask_elts);
3164
3165                       if (need_next_vector)
3166                         {
3167                           first_vec_index = second_vec_index;
3168                           second_vec_index = vec_index;
3169                         }
3170
3171                       next_scalar_stmt
3172                           = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3173
3174                       vect_create_mask_and_perm (stmt, next_scalar_stmt,
3175                                mask_vec, first_vec_index, second_vec_index,
3176                                gsi, node, vectype, dr_chain,
3177                                ncopies, vect_stmts_counter++);
3178                     }
3179                 }
3180             }
3181         }
3182     }
3183
3184   return true;
3185 }
3186
3187
3188
3189 /* Vectorize SLP instance tree in postorder.  */
3190
3191 static bool
3192 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3193                             unsigned int vectorization_factor)
3194 {
3195   gimple stmt;
3196   bool grouped_store, is_store;
3197   gimple_stmt_iterator si;
3198   stmt_vec_info stmt_info;
3199   unsigned int vec_stmts_size, nunits, group_size;
3200   tree vectype;
3201   int i;
3202   slp_tree child;
3203
3204   if (!node)
3205     return false;
3206
3207   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3208     vect_schedule_slp_instance (child, instance, vectorization_factor);
3209
3210   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3211   stmt_info = vinfo_for_stmt (stmt);
3212
3213   /* VECTYPE is the type of the destination.  */
3214   vectype = STMT_VINFO_VECTYPE (stmt_info);
3215   nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3216   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3217
3218   /* For each SLP instance calculate the number of vector stmts to be
3219      created for the scalar stmts in each node of the SLP tree.  It is the
3220      number of scalar elements in one scalar iteration (GROUP_SIZE)
3221      multiplied by VF and divided by the number of elements in a vector
3222      (NUNITS).  */
3223   vec_stmts_size = (vectorization_factor * group_size) / nunits;
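  /* E.g. (illustrative numbers only): with VF == 4, GROUP_SIZE == 2 and
     NUNITS == 4, each node gets (4 * 2) / 4 == 2 vector stmts.  */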
3224
3225   if (!SLP_TREE_VEC_STMTS (node).exists ())
3226     {
3227       SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3228       SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3229     }
3230
3231   if (dump_enabled_p ())
3232     {
3233       dump_printf_loc (MSG_NOTE, vect_location,
3234                        "------>vectorizing SLP node starting from: ");
3235       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3236       dump_printf (MSG_NOTE, "\n");
3237     }
3238
3239   /* Loads should be inserted before the first load.  */
3240   if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3241       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3242       && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3243       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
3244     si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3245   else if (is_pattern_stmt_p (stmt_info))
3246     si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3247   else
3248     si = gsi_for_stmt (stmt);
3249
3250   /* Stores should be inserted just before the last store.  */
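  /* Anchoring at the last scalar store ensures that every scalar operand
     feeding the group is already defined at the insertion point.  */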
3251   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3252       && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3253     {
3254       gimple last_store = vect_find_last_store_in_slp_instance (instance);
3255       if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3256         last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3257       si = gsi_for_stmt (last_store);
3258     }
3259
3260   /* Mark the first element of the reduction chain as reduction to properly
3261      transform the node.  In the analysis phase only the last element of the
3262      chain is marked as reduction.  */
3263   if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3264       && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3265     {
3266       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3267       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3268     }
3269
3270   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3271   return is_store;
3272 }
3273
3274 /* Replace the scalar calls in SLP node NODE by setting their lhs to zero.
3275    For loop vectorization this is done in vectorizable_call, but for SLP
3276    it needs to be deferred until the end of vect_schedule_slp, because
3277    multiple SLP instances may refer to the same scalar stmt.  */
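/* For instance (a sketch with invented names): a scalar call
   "x_1 = powf (a_2, b_3);" whose result has been vectorized is rewritten
   as "x_1 = 0.0;", a trivially dead assignment that later cleanup passes
   can remove.  */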
3278
3279 static void
3280 vect_remove_slp_scalar_calls (slp_tree node)
3281 {
3282   gimple stmt, new_stmt;
3283   gimple_stmt_iterator gsi;
3284   int i;
3285   slp_tree child;
3286   tree lhs;
3287   stmt_vec_info stmt_info;
3288
3289   if (!node)
3290     return;
3291
3292   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3293     vect_remove_slp_scalar_calls (child);
3294
3295   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3296     {
3297       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3298         continue;
3299       stmt_info = vinfo_for_stmt (stmt);
3300       if (stmt_info == NULL
3301           || is_pattern_stmt_p (stmt_info)
3302           || !PURE_SLP_STMT (stmt_info))
3303         continue;
3304       lhs = gimple_call_lhs (stmt);
3305       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3306       set_vinfo_for_stmt (new_stmt, stmt_info);
3307       set_vinfo_for_stmt (stmt, NULL);
3308       STMT_VINFO_STMT (stmt_info) = new_stmt;
3309       gsi = gsi_for_stmt (stmt);
3310       gsi_replace (&gsi, new_stmt, false);
3311       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3312     }
3313 }
3314
3315 /* Generate vector code for all SLP instances in the loop/basic block.  */
3316
3317 bool
3318 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3319 {
3320   vec<slp_instance> slp_instances;
3321   slp_instance instance;
3322   unsigned int i, vf;
3323   bool is_store = false;
3324
3325   if (loop_vinfo)
3326     {
3327       slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3328       vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3329     }
3330   else
3331     {
3332       slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
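      /* Basic-block SLP has no enclosing loop to unroll, so the
         vectorization factor is trivially 1.  */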
3333       vf = 1;
3334     }
3335
3336   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3337     {
3338       /* Schedule the tree of INSTANCE.  */
3339       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3340                                              instance, vf);
3341       if (dump_enabled_p ())
3342         dump_printf_loc (MSG_NOTE, vect_location,
3343                          "vectorizing stmts using SLP.\n");
3344     }
3345
3346   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3347     {
3348       slp_tree root = SLP_INSTANCE_TREE (instance);
3349       gimple store;
3350       unsigned int j;
3351       gimple_stmt_iterator gsi;
3352
3353       /* Remove scalar call stmts.  Do not do this for basic-block
3354          vectorization as not all uses may be vectorized.
3355          ???  Why should this be necessary?  DCE should be able to
3356          remove the stmts itself.
3357          ???  For BB vectorization we can as well remove scalar
3358          stmts starting from the SLP tree root if they have no
3359          uses.  */
3360       if (loop_vinfo)
3361         vect_remove_slp_scalar_calls (root);
3362
3363       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3364                   && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3365         {
3366           if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3367             break;
3368
3369           if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3370             store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3371           /* Free the attached stmt_vec_info and remove the stmt.  */
3372           gsi = gsi_for_stmt (store);
3373           unlink_stmt_vdef (store);
3374           gsi_remove (&gsi, true);
3375           release_defs (store);
3376           free_stmt_vec_info (store);
3377         }
3378     }
3379
3380   return is_store;
3381 }
3382
3383
3384 /* Vectorize the basic block.  */
3385
3386 void
3387 vect_slp_transform_bb (basic_block bb)
3388 {
3389   bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3390   gimple_stmt_iterator si;
3391
3392   gcc_assert (bb_vinfo);
3393
3394   if (dump_enabled_p ())
3395     dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3396
3397   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3398     {
3399       gimple stmt = gsi_stmt (si);
3400       stmt_vec_info stmt_info;
3401
3402       if (dump_enabled_p ())
3403         {
3404           dump_printf_loc (MSG_NOTE, vect_location,
3405                            "------>SLPing statement: ");
3406           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3407           dump_printf (MSG_NOTE, "\n");
3408         }
3409
3410       stmt_info = vinfo_for_stmt (stmt);
3411       gcc_assert (stmt_info);
3412
3413       /* Schedule all the SLP instances when the first SLP stmt is reached.  */
3414       if (STMT_SLP_TYPE (stmt_info))
3415         {
3416           vect_schedule_slp (NULL, bb_vinfo);
3417           break;
3418         }
3419     }
3420
3421   if (dump_enabled_p ())
3422     dump_printf_loc (MSG_NOTE, vect_location,
3423                      "BASIC BLOCK VECTORIZED\n");
3424
3425   destroy_bb_vec_info (bb_vinfo);
3426 }