1 /* Loop unrolling and peeling.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
23 #include "coretypes.h"
26 #include "hard-reg-set.h"
27 #include "basic-block.h"
29 #include "cfglayout.h"
33 /* We need to use the macro exact_log2. */
36 /* This pass performs loop unrolling and peeling. We only perform these
37 optimizations on innermost loops (with single exception) because
38 the impact on performance is greatest here, and we want to avoid
39 unnecessary code size growth. The gain is caused by greater sequentiality
40 of code, better code to optimize for further passes and in some cases
41 by fewer testings of exit conditions. The main problem is code growth,
42 that impacts performance negatively due to effect of caches.
46 -- complete peeling of once-rolling loops; this is the above mentioned
47 exception, as this causes loop to be cancelled completely and
48 does not cause code growth
49 -- complete peeling of loops that roll (small) constant times.
50 -- simple peeling of first iterations of loops that do not roll much
51 (according to profile feedback)
52 -- unrolling of loops that roll constant times; this is almost always
53 win, as we get rid of exit condition tests.
54 -- unrolling of loops that roll number of times that we can compute
55 in runtime; we also get rid of exit condition tests here, but there
56 is the extra expense for calculating the number of iterations
57 -- simple unrolling of remaining loops; this is performed only if we
58 are asked to, as the gain is questionable in this case and often
59 it may even slow down the code
60 For more detailed descriptions of each of those, see comments at
61 appropriate function below.
63 There is a lot of parameters (defined and described in params.def) that
64 control how much we unroll/peel.
66 ??? A great problem is that we don't have a good way how to determine
67 how many times we should unroll the loop; the experiments I have made
68 showed that this choice may affect performance in order of several %.
71 static void decide_unrolling_and_peeling (struct loops *, int);
72 static void peel_loops_completely (struct loops *, int);
73 static void decide_peel_simple (struct loop *, int);
74 static void decide_peel_once_rolling (struct loop *, int);
75 static void decide_peel_completely (struct loop *, int);
76 static void decide_unroll_stupid (struct loop *, int);
77 static void decide_unroll_constant_iterations (struct loop *, int);
78 static void decide_unroll_runtime_iterations (struct loop *, int);
79 static void peel_loop_simple (struct loops *, struct loop *);
80 static void peel_loop_completely (struct loops *, struct loop *);
81 static void unroll_loop_stupid (struct loops *, struct loop *);
82 static void unroll_loop_constant_iterations (struct loops *, struct loop *);
83 static void unroll_loop_runtime_iterations (struct loops *, struct loop *);
84 static void expand_bct (edge, int);
85 static bool discard_increment (struct loop *);
87 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
89 unroll_and_peel_loops (struct loops *loops, int flags)
91 struct loop *loop, *next;
94 /* First perform complete loop peeling (it is almost surely a win,
95 and affects parameters for further decision a lot). */
96 peel_loops_completely (loops, flags);
98 /* Now decide rest of unrolling and peeling. */
99 decide_unrolling_and_peeling (loops, flags);
101 loop = loops->tree_root;
105 /* Scan the loops, inner ones first. */
106 while (loop != loops->tree_root)
118 /* And perform the appropriate transformations. */
119 switch (loop->lpt_decision.decision)
121 case LPT_PEEL_COMPLETELY:
124 case LPT_PEEL_SIMPLE:
125 peel_loop_simple (loops, loop);
127 case LPT_UNROLL_CONSTANT:
128 unroll_loop_constant_iterations (loops, loop);
130 case LPT_UNROLL_RUNTIME:
131 unroll_loop_runtime_iterations (loops, loop);
133 case LPT_UNROLL_STUPID:
134 unroll_loop_stupid (loops, loop);
144 #ifdef ENABLE_CHECKING
145 verify_dominators (CDI_DOMINATORS);
146 verify_loop_structure (loops);
153 /* Check whether to peel LOOPS (depending on FLAGS) completely and do so. */
155 peel_loops_completely (struct loops *loops, int flags)
157 struct loop *loop, *next;
159 loop = loops->tree_root;
163 while (loop != loops->tree_root)
174 loop->lpt_decision.decision = LPT_NONE;
178 fprintf (rtl_dump_file, ";; Considering loop %d for complete peeling\n",
181 loop->ninsns = num_loop_insns (loop);
183 decide_peel_once_rolling (loop, flags);
184 if (loop->lpt_decision.decision == LPT_NONE)
185 decide_peel_completely (loop, flags);
187 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
189 peel_loop_completely (loops, loop);
190 #ifdef ENABLE_CHECKING
191 verify_dominators (CDI_DOMINATORS);
192 verify_loop_structure (loops);
199 /* Decide whether unroll or peel LOOPS (depending on FLAGS) and how much. */
201 decide_unrolling_and_peeling (struct loops *loops, int flags)
203 struct loop *loop = loops->tree_root, *next;
208 /* Scan the loops, inner ones first. */
209 while (loop != loops->tree_root)
220 loop->lpt_decision.decision = LPT_NONE;
223 fprintf (rtl_dump_file, ";; Considering loop %d\n", loop->num);
225 /* Do not peel cold areas. */
226 if (!maybe_hot_bb_p (loop->header))
229 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
234 /* Can the loop be manipulated? */
235 if (!can_duplicate_loop_p (loop))
238 fprintf (rtl_dump_file,
239 ";; Not considering loop, cannot duplicate\n");
244 /* Skip non-innermost loops. */
248 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
253 loop->ninsns = num_loop_insns (loop);
254 loop->av_ninsns = average_num_loop_insns (loop);
256 /* Try transformations one by one in decreasing order of
259 decide_unroll_constant_iterations (loop, flags);
260 if (loop->lpt_decision.decision == LPT_NONE)
261 decide_unroll_runtime_iterations (loop, flags);
262 if (loop->lpt_decision.decision == LPT_NONE)
263 decide_unroll_stupid (loop, flags);
264 if (loop->lpt_decision.decision == LPT_NONE)
265 decide_peel_simple (loop, flags);
271 /* Decide whether the LOOP is once rolling and suitable for complete
274 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
277 fprintf (rtl_dump_file, ";; Considering peeling once rolling loop\n");
279 /* Is the loop small enough? */
280 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
283 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
287 /* Check for simple loops. */
288 loop->simple = simple_loop_p (loop, &loop->desc);
291 /* Check number of iterations. */
292 if (!loop->simple || !loop->desc.const_iter || loop->desc.niter != 0)
295 fprintf (rtl_dump_file, ";; Unable to prove that the loop rolls exactly once\n");
301 fprintf (rtl_dump_file, ";; Decided to peel exactly once rolling loop\n");
302 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
305 /* Decide whether the LOOP is suitable for complete peeling. */
307 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
312 fprintf (rtl_dump_file, ";; Considering peeling completely\n");
314 /* Skip non-innermost loops. */
318 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
322 /* Do not peel cold areas. */
323 if (!maybe_hot_bb_p (loop->header))
326 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
330 /* Can the loop be manipulated? */
331 if (!can_duplicate_loop_p (loop))
334 fprintf (rtl_dump_file,
335 ";; Not considering loop, cannot duplicate\n");
339 /* npeel = number of iterations to peel. */
340 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
341 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
342 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
344 /* Is the loop small enough? */
348 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
352 /* Check for simple loops. */
355 loop->simple = simple_loop_p (loop, &loop->desc);
359 /* Check number of iterations. */
360 if (!loop->simple || !loop->desc.const_iter)
363 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
367 if (loop->desc.niter > npeel - 1)
371 fprintf (rtl_dump_file, ";; Not peeling loop completely, rolls too much (");
372 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC,(HOST_WIDEST_INT) loop->desc.niter);
373 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
380 fprintf (rtl_dump_file, ";; Decided to peel loop completely\n");
381 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
384 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
385 completely. The transformation done:
387 for (i = 0; i < 4; i++)
399 peel_loop_completely (struct loops *loops, struct loop *loop)
402 unsigned HOST_WIDE_INT npeel;
403 unsigned n_remove_edges, i;
405 struct loop_desc *desc = &loop->desc;
406 bool discard_inc = false;
409 if ((is_bct = is_bct_cond (BB_END (loop->desc.out_edge->src))))
410 discard_inc = discard_increment (loop);
416 wont_exit = sbitmap_alloc (npeel + 1);
417 sbitmap_ones (wont_exit);
418 RESET_BIT (wont_exit, 0);
419 if (desc->may_be_zero)
420 RESET_BIT (wont_exit, 1);
422 remove_edges = xcalloc (npeel, sizeof (edge));
425 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
427 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
428 DLTHE_FLAG_UPDATE_FREQ))
433 /* Expand the branch and count. */
435 for (i = 0; i < n_remove_edges; i++)
436 expand_bct (remove_edges[i], discard_inc);
438 /* Remove the exit edges. */
439 for (i = 0; i < n_remove_edges; i++)
440 remove_path (loops, remove_edges[i]);
444 /* Expand the branch and count. */
446 expand_bct (desc->in_edge, discard_inc);
448 /* Now remove the unreachable part of the last iteration and cancel
450 remove_path (loops, desc->in_edge);
453 fprintf (rtl_dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
456 /* Decide whether to unroll LOOP iterating constant number of times and how much. */
458 decide_unroll_constant_iterations (struct loop *loop, int flags)
460 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = -1, n_copies, i;
462 if (!(flags & UAP_UNROLL))
464 /* We were not asked to, just return back silently. */
469 fprintf (rtl_dump_file, ";; Considering unrolling loop with constant number of iterations\n");
471 /* nunroll = total number of copies of the original loop body in
472 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
473 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
474 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
475 if (nunroll > nunroll_by_av)
476 nunroll = nunroll_by_av;
477 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
478 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
480 /* Skip big loops. */
484 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
488 /* Check for simple loops. */
491 loop->simple = simple_loop_p (loop, &loop->desc);
495 /* Check number of iterations. */
496 if (!loop->simple || !loop->desc.const_iter)
499 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
503 /* Check whether the loop rolls enough to consider. */
504 if (loop->desc.niter < 2 * nunroll)
507 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
511 /* Success; now compute number of iterations to unroll. We alter
512 nunroll so that as few as possible copies of loop body are
513 necessary, while still not decreasing the number of unrollings
514 too much (at most by 1). */
515 best_copies = 2 * nunroll + 10;
518 if ((unsigned) i - 1 >= loop->desc.niter)
519 i = loop->desc.niter - 2;
521 for (; i >= nunroll - 1; i--)
523 unsigned exit_mod = loop->desc.niter % (i + 1);
525 if (loop->desc.postincr)
526 n_copies = exit_mod + i + 1;
527 else if (exit_mod != (unsigned) i || loop->desc.may_be_zero)
528 n_copies = exit_mod + i + 2;
532 if (n_copies < best_copies)
534 best_copies = n_copies;
540 fprintf (rtl_dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
541 best_unroll + 1, best_copies, nunroll);
543 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
544 loop->lpt_decision.times = best_unroll;
547 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
548 times. The transformation does this:
550 for (i = 0; i < 102; i++)
567 unroll_loop_constant_iterations (struct loops *loops, struct loop *loop)
569 unsigned HOST_WIDE_INT niter;
572 unsigned n_remove_edges, i;
574 unsigned max_unroll = loop->lpt_decision.times;
575 struct loop_desc *desc = &loop->desc;
576 bool discard_inc = false;
581 if (niter <= (unsigned) max_unroll + 1)
582 abort (); /* Should not get here (such loop should be peeled instead). */
584 exit_mod = niter % (max_unroll + 1);
586 wont_exit = sbitmap_alloc (max_unroll + 1);
587 sbitmap_ones (wont_exit);
589 remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));
592 /* For a loop ending with a branch and count for which the increment
593 of the count register will be discarded, adjust the initialization of
594 the count register. */
595 if ((is_bct = is_bct_cond (BB_END (desc->out_edge->src)))
596 && (discard_inc = discard_increment (loop)))
601 int n_peel, new_bct_value;
603 /* Get expression for number of iterations. */
606 n_peel = (niter+1) % (max_unroll+1);
607 new_bct_value = (niter+1 - n_peel) / (max_unroll+1) ;
608 ini_var = GEN_INT (new_bct_value);
610 emit_move_insn (desc->var, ini_var);
611 init_code = get_insns ();
614 loop_split_edge_with (loop_preheader_edge (loop), init_code);
619 /* Counter is incremented after the exit test; leave exit test
620 in the first copy, so that the loops that start with test
621 of exit condition have continuous body after unrolling. */
624 fprintf (rtl_dump_file, ";; Condition on beginning of loop.\n");
626 /* Peel exit_mod iterations. */
627 RESET_BIT (wont_exit, 0);
628 if (desc->may_be_zero)
629 RESET_BIT (wont_exit, 1);
632 && !duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
634 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
635 DLTHE_FLAG_UPDATE_FREQ))
638 SET_BIT (wont_exit, 1);
642 /* Leave exit test in last copy, for the same reason as above if
643 the loop tests the condition at the end of loop body. */
646 fprintf (rtl_dump_file, ";; Condition on end of loop.\n");
648 /* We know that niter >= max_unroll + 2; so we do not need to care of
649 case when we would exit before reaching the loop. So just peel
650 exit_mod + 1 iterations.
652 if (exit_mod != (unsigned) max_unroll || desc->may_be_zero)
654 RESET_BIT (wont_exit, 0);
655 if (desc->may_be_zero)
656 RESET_BIT (wont_exit, 1);
658 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
660 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
661 DLTHE_FLAG_UPDATE_FREQ))
664 SET_BIT (wont_exit, 0);
665 SET_BIT (wont_exit, 1);
668 RESET_BIT (wont_exit, max_unroll);
671 /* Now unroll the loop. */
672 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
674 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
675 DLTHE_FLAG_UPDATE_FREQ))
680 /* Expand the branch and count. */
682 for (i = 0; i < n_remove_edges; i++)
683 expand_bct (remove_edges[i], discard_inc);
685 /* Remove the edges. */
686 for (i = 0; i < n_remove_edges; i++)
687 remove_path (loops, remove_edges[i]);
691 fprintf (rtl_dump_file, ";; Unrolled loop %d times, constant # of iterations %i insns\n",max_unroll, num_loop_insns (loop));
694 /* Decide whether to unroll LOOP iterating runtime computable number of times
697 decide_unroll_runtime_iterations (struct loop *loop, int flags)
699 unsigned nunroll, nunroll_by_av, i;
701 if (!(flags & UAP_UNROLL))
703 /* We were not asked to, just return back silently. */
708 fprintf (rtl_dump_file, ";; Considering unrolling loop with runtime computable number of iterations\n");
710 /* nunroll = total number of copies of the original loop body in
711 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
712 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
713 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
714 if (nunroll > nunroll_by_av)
715 nunroll = nunroll_by_av;
716 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
717 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
719 /* Skip big loops. */
723 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
727 /* Check for simple loops. */
730 loop->simple = simple_loop_p (loop, &loop->desc);
734 /* Check simpleness. */
738 fprintf (rtl_dump_file, ";; Unable to prove that the number of iterations can be counted in runtime\n");
742 if (loop->desc.const_iter)
745 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
749 /* If we have profile feedback, check whether the loop rolls. */
750 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
753 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
757 /* Success; now force nunroll to be power of 2, as we are unable to
758 cope with overflows in computation of number of iterations. */
759 for (i = 1; 2 * i <= nunroll; i *= 2);
761 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
762 loop->lpt_decision.times = i - 1;
765 /* Unroll LOOP for that we are able to count number of iterations in runtime
766 LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some
767 extra care for case n < 0):
769 for (i = 0; i < n; i++)
797 unroll_loop_runtime_iterations (struct loops *loops, struct loop *loop)
799 rtx niter, init_code, branch_code, jump, label;
801 basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch;
805 unsigned n_peel, n_remove_edges;
806 edge *remove_edges, e;
807 bool extra_zero_check, last_may_exit;
808 unsigned max_unroll = loop->lpt_decision.times;
809 struct loop_desc *desc = &loop->desc;
810 bool discard_inc = false;
813 /* Remember blocks whose dominators will have to be updated. */
814 dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
817 body = get_loop_body (loop);
818 for (i = 0; i < loop->num_nodes; i++)
823 nldom = get_dominated_by (CDI_DOMINATORS, body[i], &ldom);
824 for (j = 0; j < nldom; j++)
825 if (!flow_bb_inside_loop_p (loop, ldom[j]))
826 dom_bbs[n_dom_bbs++] = ldom[j];
834 /* Leave exit in first copy (for explanation why see comment in
835 unroll_loop_constant_iterations). */
837 n_peel = max_unroll - 1;
838 extra_zero_check = true;
839 last_may_exit = false;
843 /* Leave exit in last copy (for explanation why see comment in
844 unroll_loop_constant_iterations). */
845 may_exit_copy = max_unroll;
847 extra_zero_check = false;
848 last_may_exit = true;
851 /* Get expression for number of iterations. */
853 niter = count_loop_iterations (desc, NULL, NULL);
856 niter = force_operand (niter, NULL);
858 /* Count modulo by ANDing it with max_unroll; we use the fact that
859 the number of unrollings is a power of two, and thus this is correct
860 even if there is overflow in the computation. */
861 niter = expand_simple_binop (GET_MODE (desc->var), AND,
863 GEN_INT (max_unroll),
864 NULL_RTX, 0, OPTAB_LIB_WIDEN);
866 /* For a loop ending with a branch and count for which the increment
867 of the count register will be discarded, adjust the initialization of
868 the count register. */
869 if ((is_bct = is_bct_cond (BB_END (desc->out_edge->src)))
870 && (discard_inc = discard_increment (loop)))
872 rtx count, count2, count_unroll_mod;
875 /* start_sequence (); */
877 count = count_loop_iterations (desc, NULL, NULL);
879 count_unroll = loop->lpt_decision.times+1;
883 count_unroll_mod = GEN_INT (exact_log2 (count_unroll));
884 count = expand_simple_binop (GET_MODE (desc->var), LSHIFTRT,
885 count, count_unroll_mod,
886 0, 0, OPTAB_LIB_WIDEN);
888 count2 = expand_simple_binop (GET_MODE (desc->var), PLUS,
890 0, 0, OPTAB_LIB_WIDEN);
892 emit_move_insn (desc->var, count2);
895 init_code = get_insns ();
898 /* Precondition the loop. */
899 loop_split_edge_with (loop_preheader_edge (loop), init_code);
901 remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge));
904 wont_exit = sbitmap_alloc (max_unroll + 2);
906 /* Peel the first copy of loop body (almost always we must leave exit test
907 here; the only exception is when we have extra zero check and the number
908 of iterations is reliable (i.e. comes out of NE condition). Also record
909 the place of (possible) extra zero check. */
910 sbitmap_zero (wont_exit);
911 if (extra_zero_check && desc->cond == NE)
912 SET_BIT (wont_exit, 1);
913 ezc_swtch = loop_preheader_edge (loop)->src;
914 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
916 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
917 DLTHE_FLAG_UPDATE_FREQ))
920 /* Record the place where switch will be built for preconditioning. */
921 swtch = loop_split_edge_with (loop_preheader_edge (loop),
924 for (i = 0; i < n_peel; i++)
927 sbitmap_zero (wont_exit);
928 if (i != n_peel - 1 || !last_may_exit)
929 SET_BIT (wont_exit, 1);
930 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
932 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
933 DLTHE_FLAG_UPDATE_FREQ))
936 /* Create item for switch. */
937 j = n_peel - i - (extra_zero_check ? 0 : 1);
938 p = REG_BR_PROB_BASE / (i + 2);
940 /* If modulo is zero do not jumo to the header of the unrolled loops.
941 Jump instead to the last branch and count that precedes it. */
942 if (is_bct && discard_inc && (j == 0))
944 basic_block lastbb = loop_preheader_edge(loop)->src;
947 /* Skip dummy basic blocks generated during the unrolling. */
948 while (!is_bct_cond (BB_END (lastbb)))
949 lastbb = lastbb->pred->src;
951 split_after = PREV_INSN (BB_END (lastbb));
953 preheader = split_loop_bb (lastbb , split_after)->dest;
956 preheader = loop_split_edge_with (loop_preheader_edge (loop),
958 label = block_label (preheader);
960 do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
961 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
963 jump = get_last_insn ();
964 JUMP_LABEL (jump) = label;
966 = gen_rtx_EXPR_LIST (REG_BR_PROB,
967 GEN_INT (p), REG_NOTES (jump));
969 LABEL_NUSES (label)++;
970 branch_code = get_insns ();
973 swtch = loop_split_edge_with (swtch->pred, branch_code);
974 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
975 swtch->succ->probability = REG_BR_PROB_BASE - p;
976 e = make_edge (swtch, preheader,
977 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
981 if (extra_zero_check)
983 /* Add branch for zero iterations. */
984 p = REG_BR_PROB_BASE / (max_unroll + 1);
986 preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
987 label = block_label (preheader);
989 do_compare_rtx_and_jump (copy_rtx (niter), const0_rtx, EQ, 0,
990 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
992 jump = get_last_insn ();
993 JUMP_LABEL (jump) = label;
995 = gen_rtx_EXPR_LIST (REG_BR_PROB,
996 GEN_INT (p), REG_NOTES (jump));
998 LABEL_NUSES (label)++;
999 branch_code = get_insns ();
1002 swtch = loop_split_edge_with (swtch->succ, branch_code);
1003 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1004 swtch->succ->probability = REG_BR_PROB_BASE - p;
1005 e = make_edge (swtch, preheader,
1006 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
1010 /* Recount dominators for outer blocks. */
1011 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, n_dom_bbs);
1013 /* And unroll loop. */
1015 sbitmap_ones (wont_exit);
1016 RESET_BIT (wont_exit, may_exit_copy);
1018 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1020 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
1021 DLTHE_FLAG_UPDATE_FREQ))
1026 /* Expand the branch and count. */
1028 for (i = 0; i < n_remove_edges; i++)
1029 expand_bct (remove_edges[i], discard_inc);
1031 /* Remove the edges. */
1032 for (i = 0; i < n_remove_edges; i++)
1033 remove_path (loops, remove_edges[i]);
1034 free (remove_edges);
1037 fprintf (rtl_dump_file,
1038 ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n",
1039 max_unroll, num_loop_insns (loop));
1042 /* Decide whether to simply peel LOOP and how much. */
1044 decide_peel_simple (struct loop *loop, int flags)
1048 if (!(flags & UAP_PEEL))
1050 /* We were not asked to, just return back silently. */
1055 fprintf (rtl_dump_file, ";; Considering simply peeling loop\n");
1057 /* npeel = number of iterations to peel. */
1058 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1059 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1060 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1062 /* Skip big loops. */
1066 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1070 /* Check for simple loops. */
1071 if (!loop->has_desc)
1073 loop->simple = simple_loop_p (loop, &loop->desc);
1077 /* Check number of iterations. */
1078 if (loop->simple && loop->desc.const_iter)
1081 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
1085 /* Do not simply peel loops with branches inside -- it increases number
1087 if (loop->desc.n_branches > 1)
1090 fprintf (rtl_dump_file, ";; Not peeling, contains branches\n");
1094 if (loop->header->count)
1096 unsigned niter = expected_loop_iterations (loop);
1097 if (niter + 1 > npeel)
1101 fprintf (rtl_dump_file, ";; Not peeling loop, rolls too much (");
1102 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, (HOST_WIDEST_INT) (niter + 1));
1103 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
1111 /* For now we have no good heuristics to decide whether loop peeling
1112 will be effective, so disable it. */
1114 fprintf (rtl_dump_file,
1115 ";; Not peeling loop, no evidence it will be profitable\n");
1120 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1121 loop->lpt_decision.times = npeel;
1124 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1130 if (!cond) goto end;
1132 if (!cond) goto end;
1139 peel_loop_simple (struct loops *loops, struct loop *loop)
1142 unsigned npeel = loop->lpt_decision.times;
1144 wont_exit = sbitmap_alloc (npeel + 1);
1145 sbitmap_zero (wont_exit);
1147 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1148 loops, npeel, wont_exit, NULL, NULL, NULL,
1149 DLTHE_FLAG_UPDATE_FREQ))
1155 fprintf (rtl_dump_file, ";; Peeling loop %d times\n", npeel);
1158 /* Decide whether to unroll LOOP stupidly and how much. */
1160 decide_unroll_stupid (struct loop *loop, int flags)
1162 unsigned nunroll, nunroll_by_av, i;
1164 if (!(flags & UAP_UNROLL_ALL))
1166 /* We were not asked to, just return back silently. */
1171 fprintf (rtl_dump_file, ";; Considering unrolling loop stupidly\n");
1173 /* nunroll = total number of copies of the original loop body in
1174 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1175 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1176 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1177 if (nunroll > nunroll_by_av)
1178 nunroll = nunroll_by_av;
1179 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1180 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1182 /* Skip big loops. */
1186 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1190 /* Check for simple loops. */
1191 if (!loop->has_desc)
1193 loop->simple = simple_loop_p (loop, &loop->desc);
1197 /* Check simpleness. */
1201 fprintf (rtl_dump_file, ";; The loop is simple\n");
1205 /* Do not unroll loops with branches inside -- it increases number
1207 if (loop->desc.n_branches > 1)
1210 fprintf (rtl_dump_file, ";; Not unrolling, contains branches\n");
1214 /* If we have profile feedback, check whether the loop rolls. */
1215 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
1218 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
1222 /* Success. Now force nunroll to be power of 2, as it seems that this
1223 improves results (partially because of better alignments, partially
1224 because of some dark magic). */
1225 for (i = 1; 2 * i <= nunroll; i *= 2);
1227 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1228 loop->lpt_decision.times = i - 1;
1231 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1249 unroll_loop_stupid (struct loops *loops, struct loop *loop)
1252 unsigned nunroll = loop->lpt_decision.times;
1254 wont_exit = sbitmap_alloc (nunroll + 1);
1255 sbitmap_zero (wont_exit);
1257 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1258 loops, nunroll, wont_exit, NULL, NULL, NULL,
1259 DLTHE_FLAG_UPDATE_FREQ))
1265 fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n",
1266 nunroll, num_loop_insns (loop));
1269 /* Expand a bct instruction in a branch and an increment.
1270 If flag_inc is set, the induction variable does not need to be
1274 expand_bct (edge e, int flag_inc)
1276 rtx bct_insn = BB_END (e->src);
1285 rtx pattern = PATTERN (bct_insn);
1287 if (!is_bct_cond (bct_insn))
1290 inc = get_var_set_from_bct (bct_insn);
1291 cmp = XVECEXP (pattern, 0, 0);
1292 reg = SET_DEST (inc);
1297 tgt = force_operand (XEXP (inc, 1), XEXP (inc, 0));
1298 if (tgt != XEXP (inc, 0))
1299 emit_move_insn (XEXP (inc, 0), tgt);
1302 condition = XEXP (SET_SRC (cmp), 0);
1303 labelref = XEXP (SET_SRC (cmp), 1);
1305 do_compare_rtx_and_jump (copy_rtx (reg), XEXP (condition, 1),
1306 GET_CODE (condition), 0,
1307 GET_MODE (reg), NULL_RTX, NULL_RTX,
1308 XEXP (labelref, 0));
1311 emit_insn_after (seq, bct_insn);
1313 delete_insn (bct_insn);
1318 /* Check that the increment of the count register can be discarded. */
1320 discard_increment (struct loop *loop)
1322 struct loop_desc *desc = &loop->desc;
1323 rtx inc, set_src, reg;
1328 bct_insn = BB_END (desc->out_edge->src);
1329 if (!is_bct_cond (bct_insn))
1332 inc = get_var_set_from_bct (bct_insn);
1334 /* Check that inc is of the form reg = reg - 1. */
1335 reg = SET_DEST (inc);
1336 set_src = SET_SRC (inc);
1338 if (GET_CODE (set_src) != PLUS)
1341 if (!rtx_equal_p (XEXP (set_src, 0), reg))
1344 if (!CONSTANT_P (XEXP (set_src, 1)))
1347 if (INTVAL (XEXP (set_src, 1)) != -1)
1350 /* We need to check that the register has no other uses beside the branch and
1352 body = get_loop_body (loop);
1353 for(i=0; i < loop->num_nodes; i++)
1355 if (reg_mentioned_p (desc->var, BB_HEAD (body[i])))
1358 if (body[i] != desc->out_edge->src)
1359 if (reg_mentioned_p (desc->var, BB_END (body[i])))
1362 if (reg_used_between_p (desc->var, BB_HEAD (body[i]), BB_END (body[i])))
1366 /* Check that the branch and count ends the latch. */
1367 if (desc->out_edge->src != loop->latch)
1371 /* Latch is a dummy block generated by loop-init. */
1372 if (BRANCH_EDGE(desc->out_edge->src)->dest != loop->latch)
1375 for (insn = BB_HEAD (loop->latch); insn != NEXT_INSN (BB_END (loop->latch));
1376 insn = NEXT_INSN (insn))
1377 if (INSN_P (insn)) return false;