1 /* Swing Modulo Scheduling implementation.
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
26 #include "coretypes.h"
31 #include "hard-reg-set.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
40 #include "sched-int.h"
42 #include "cfglayout.h"
51 #ifdef INSN_SCHEDULING
53 /* This file contains the implementation of the Swing Modulo Scheduler,
54 described in the following references:
55 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
56 Lifetime--sensitive modulo scheduling in a production environment.
57 IEEE Trans. on Comps., 50(3), March 2001
58 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
59 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
60 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
62 The basic structure is:
63 1. Build a data-dependence graph (DDG) for each loop.
64 2. Use the DDG to order the insns of a loop (not in topological order
65 necessarily, but rather) trying to place each insn after all its
66 predecessors _or_ after all its successors.
67 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
68 4. Use the ordering to perform list-scheduling of the loop:
69 1. Set II = MII. We will try to schedule the loop within II cycles.
70 2. Try to schedule the insns one by one according to the ordering.
71 For each insn compute an interval of cycles by considering already-
72 scheduled preds and succs (and associated latencies); try to place
73 the insn in the cycles of this window checking for potential
74 resource conflicts (using the DFA interface).
75 Note: this is different from the cycle-scheduling of schedule_insns;
76 here the insns are not scheduled monotonically top-down (nor bottom-
78 3. If failed in scheduling all insns - bump II++ and try again, unless
79 II reaches an upper bound MaxII, in which case report failure.
80 5. If we succeeded in scheduling the loop within II cycles, we now
81 generate prolog and epilog, decrease the counter of the loop, and
82 perform modulo variable expansion for live ranges that span more than
83 II cycles (i.e. use register copies to prevent a def from overwriting
84 itself before reaching the use).
/* NOTE(review): this file is a numbered extraction with lines missing; the
   leading integers on each line are original source line numbers, not code.  */
88 /* This page defines partial-schedule structures and functions for
91 typedef struct partial_schedule *partial_schedule_ptr;
92 typedef struct ps_insn *ps_insn_ptr;
94 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
95 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
97 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
98 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
100 /* Perform signed modulo, always returning a non-negative value. */
/* NOTE(review): SMODULO evaluates both arguments more than once; do not pass
   expressions with side effects.  */
101 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
103 /* The number of different iterations the nodes in ps span, assuming
104 the stage boundaries are placed efficiently. */
105 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
106 + 1 + (ps)->ii - 1) / (ps)->ii)
108 #define CFG_HOOKS cfg_layout_rtl_cfg_hooks
/* NOTE(review): both struct definitions below are truncated in this listing
   (opening/closing braces and several field declarations are missing); only
   a subset of the members is visible.  */
110 /* A single instruction in the partial schedule. */
113 /* The corresponding DDG_NODE. */
116 /* The (absolute) cycle in which the PS instruction is scheduled.
117 Same as SCHED_TIME (node). */
120 /* The next/prev PS_INSN in the same row. */
121 ps_insn_ptr next_in_row,
124 /* The number of nodes in the same row that come after this node. */
128 /* Holds the partial schedule as an array of II rows. Each entry of the
129 array points to a linked list of PS_INSNs, which represents the
130 instructions that are scheduled for that row. */
131 struct partial_schedule
133 int ii; /* Number of rows in the partial schedule. */
134 int history; /* Threshold for conflict checking using DFA. */
136 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
139 /* The earliest absolute cycle of an insn in the partial schedule. */
142 /* The latest absolute cycle of an insn in the partial schedule. */
145 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
/* Forward declarations of the partial-schedule API (creation, reset,
   printing, conflict-checked insertion, rotation), defined later in
   the file (outside this listing).  */
149 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
150 static void free_partial_schedule (partial_schedule_ptr);
151 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
152 void print_partial_schedule (partial_schedule_ptr, FILE *);
153 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
154 ddg_node_ptr node, int cycle,
155 sbitmap must_precede,
156 sbitmap must_follow);
157 static void rotate_partial_schedule (partial_schedule_ptr, int);
158 void set_row_column_for_ps (partial_schedule_ptr);
161 /* This page defines constants and structures for the modulo scheduling
164 /* As in haifa-sched.c: */
165 /* issue_rate is the number of insns that can be scheduled in the same
166 machine cycle. It can be defined in the config/mach/mach.h file,
167 otherwise we set it to 1. */
169 static int issue_rate;
171 /* For printing statistics. */
172 static FILE *stats_file;
174 static int sms_order_nodes (ddg_ptr, int, int * result);
175 static void set_node_sched_params (ddg_ptr);
176 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
178 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
179 static void generate_prolog_epilog (partial_schedule_ptr, rtx, rtx, int);
180 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
181 int from_stage, int to_stage,
/* Accessors for the per-node scheduling parameters, reached through the
   node's aux.info pointer (installed by set_node_sched_params).  */
185 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
186 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
187 #define SCHED_FIRST_REG_MOVE(x) \
188 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
189 #define SCHED_NREG_MOVES(x) \
190 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
191 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
192 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
193 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
195 /* The scheduling parameters held for each node. */
/* NOTE(review): this typedef is truncated in the listing (the struct's
   opening brace and some field declarations are missing).  */
196 typedef struct node_sched_params
198 int asap; /* A lower-bound on the absolute scheduling cycle. */
199 int time; /* The absolute scheduling cycle (time >= asap). */
201 /* The following field (first_reg_move) is a pointer to the first
202 register-move instruction added to handle the modulo-variable-expansion
203 of the register defined by this node. This register-move copies the
204 original register defined by the node. */
207 /* The number of register-move instructions added, immediately preceding
211 int row; /* Holds time % ii. */
212 int stage; /* Holds time / ii. */
214 /* The column of a node inside the ps. If nodes u, v are on the same row,
215 u will precede v if column (u) < column (v). */
217 } *node_sched_params_ptr;
220 /* The following three functions are copied from the current scheduler
221 code in order to use sched_analyze() for computing the dependencies.
222 They are used when initializing the sched_info structure. */
/* Format INSN as "i<uid>" into a buffer for scheduler dumps; ALIGNED is
   unused (return statement not visible in this listing).  */
224 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
228 sprintf (tmp, "i%4d", INSN_UID (insn));
/* NEXT contributes to INSN's priority only when both are in the same
   basic block.  */
233 contributes_to_priority (rtx next, rtx insn)
235 return BLOCK_NUM (next) == BLOCK_NUM (insn);
/* All parameters unused -- presumably a no-op callback; body not visible
   in this listing.  */
239 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
240 regset cond_exec ATTRIBUTE_UNUSED,
241 regset used ATTRIBUTE_UNUSED,
242 regset set ATTRIBUTE_UNUSED)
/* Callback table handed to the haifa scheduler infrastructure; the listing
   omits several of the initializer entries.  */
246 static struct sched_info sms_sched_info =
254 contributes_to_priority,
255 compute_jump_reg_dependencies,
262 /* Return the register decremented and tested or zero if it is not a decrement
263 and branch jump insn (similar to doloop_condition_get). */
265 doloop_register_get (rtx insn, rtx *comp)
267 rtx pattern, cmp, inc, reg, condition;
271 pattern = PATTERN (insn);
273 /* The canonical doloop pattern we expect is:
275 (parallel [(set (pc) (if_then_else (condition)
278 (set (reg) (plus (reg) (const_int -1)))
279 (additional clobbers and uses)])
281 where condition is further restricted to be
282 (ne (reg) (const_int 1)). */
284 if (GET_CODE (pattern) != PARALLEL)
287 cmp = XVECEXP (pattern, 0, 0);
288 inc = XVECEXP (pattern, 0, 1);
289 /* Return the compare rtx. */
292 /* Check for (set (reg) (something)). */
293 if (GET_CODE (inc) != SET || ! REG_P (SET_DEST (inc)))
296 /* Extract loop counter register. */
297 reg = SET_DEST (inc);
299 /* Check if something = (plus (reg) (const_int -1)). */
300 if (GET_CODE (SET_SRC (inc)) != PLUS
301 || XEXP (SET_SRC (inc), 0) != reg
302 || XEXP (SET_SRC (inc), 1) != constm1_rtx)
305 /* Check for (set (pc) (if_then_else (condition)
308 if (GET_CODE (cmp) != SET
309 || SET_DEST (cmp) != pc_rtx
310 || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
311 || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
312 || XEXP (SET_SRC (cmp), 2) != pc_rtx)
315 /* Extract loop termination condition. */
316 condition = XEXP (SET_SRC (cmp), 0);
318 /* Check if condition = (ne (reg) (const_int 1)), which is more
319 restrictive than the check in doloop_condition_get:
320 if ((GET_CODE (condition) != GE && GET_CODE (condition) != NE)
321 || GET_CODE (XEXP (condition, 1)) != CONST_INT). */
322 if (GET_CODE (condition) != NE
323 || XEXP (condition, 1) != const1_rtx)
326 if (XEXP (condition, 0) == reg)
332 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
333 that the number of iterations is a compile-time constant. If so,
334 return the rtx that sets COUNT_REG to a constant, and set COUNT to
335 this constant. Otherwise return 0. */
337 const_iteration_count (rtx count_reg, basic_block pre_header,
338 HOST_WIDEST_INT * count)
346 get_block_head_tail (pre_header->index, &head, &tail);
348 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
349 if (INSN_P (insn) && single_set (insn) &&
350 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
352 rtx pat = single_set (insn);
354 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
356 *count = INTVAL (SET_SRC (pat));
366 /* A very simple resource-based lower bound on the initiation interval.
367 ??? Improve the accuracy of this bound by considering the
368 utilization of various units. */
372 return (g->num_nodes / issue_rate);
376 /* Points to the array that contains the sched data for each node. */
377 static node_sched_params_ptr node_sched_params;
379 /* Allocate sched_params for each node and initialize it. Assumes that
380 the aux field of each node contain the asap bound (computed earlier),
381 and copies it into the sched_params field. */
383 set_node_sched_params (ddg_ptr g)
387 /* Allocate for each node in the DDG a place to hold the "sched_data". */
388 /* Initialize ASAP/ALAP/HIGHT to zero. */
389 node_sched_params = (node_sched_params_ptr)
390 xcalloc (g->num_nodes,
391 sizeof (struct node_sched_params));
393 /* Set the pointer of the general data of the node to point to the
394 appropriate sched_params structure. */
395 for (i = 0; i < g->num_nodes; i++)
397 /* Watch out for aliasing problems? */
398 node_sched_params[i].asap = g->nodes[i].aux.count;
399 g->nodes[i].aux.info = &node_sched_params[i];
404 print_node_sched_params (FILE * dump_file, int num_nodes)
410 for (i = 0; i < num_nodes; i++)
412 node_sched_params_ptr nsp = &node_sched_params[i];
413 rtx reg_move = nsp->first_reg_move;
416 fprintf (dump_file, "Node %d:\n", i);
417 fprintf (dump_file, " asap = %d:\n", nsp->asap);
418 fprintf (dump_file, " time = %d:\n", nsp->time);
419 fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
420 for (j = 0; j < nsp->nreg_moves; j++)
422 fprintf (dump_file, " reg_move = ");
423 print_rtl_single (dump_file, reg_move);
424 reg_move = PREV_INSN (reg_move);
429 /* Calculate an upper bound for II. SMS should not schedule the loop if it
430 requires more cycles than this bound. Currently set to the sum of the
431 longest latency edge for each node. Reset based on experiments. */
433 calculate_maxii (ddg_ptr g)
438 for (i = 0; i < g->num_nodes; i++)
440 ddg_node_ptr u = &g->nodes[i];
442 int max_edge_latency = 0;
444 for (e = u->out; e; e = e->next_out)
445 max_edge_latency = MAX (max_edge_latency, e->latency);
447 maxii += max_edge_latency;
/* NOTE(review): many lines of this function are missing from the listing
   (local declarations, braces, the start of the enclosing comment, and the
   release of uses_of_defs); the code below is preserved as extracted.  */
453 Breaking intra-loop register anti-dependences:
454 Each intra-loop register anti-dependence implies a cross-iteration true
455 dependence of distance 1. Therefore, we can remove such false dependencies
456 and figure out if the partial schedule broke them by checking if (for a
457 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
458 if so generate a register move. The number of such moves is equal to:
459 SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
460 nreg_moves = ----------------------------------- + 1 - { dependence.
464 generate_reg_moves (partial_schedule_ptr ps)
470 for (i = 0; i < g->num_nodes; i++)
472 ddg_node_ptr u = &g->nodes[i];
474 int nreg_moves = 0, i_reg_move;
475 sbitmap *uses_of_defs;
477 rtx prev_reg, old_reg;
479 /* Compute the number of reg_moves needed for u, by looking at life
480 ranges started at u (excluding self-loops). */
481 for (e = u->out; e; e = e->next_out)
482 if (e->type == TRUE_DEP && e->dest != e->src)
484 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
486 if (e->distance == 1)
487 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
489 /* If dest precedes src in the schedule of the kernel, then dest
490 will read before src writes and we can save one reg_copy. */
491 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
492 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
495 nreg_moves = MAX (nreg_moves, nreg_moves4e);
501 /* Every use of the register defined by node may require a different
502 copy of this register, depending on the time the use is scheduled.
503 Set a bitmap vector, telling which nodes use each copy of this
505 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
506 sbitmap_vector_zero (uses_of_defs, nreg_moves);
507 for (e = u->out; e; e = e->next_out)
508 if (e->type == TRUE_DEP && e->dest != e->src)
510 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
512 if (e->distance == 1)
513 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
515 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
516 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
520 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
523 /* Now generate the reg_moves, attaching relevant uses to them. */
524 SCHED_NREG_MOVES (u) = nreg_moves;
525 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
526 last_reg_move = u->insn;
528 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
531 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
532 rtx reg_move = gen_move_insn (new_reg, prev_reg);
534 add_insn_before (reg_move, last_reg_move);
535 last_reg_move = reg_move;
537 if (!SCHED_FIRST_REG_MOVE (u))
538 SCHED_FIRST_REG_MOVE (u) = reg_move;
540 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use,
541 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg));
548 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
549 of SCHED_ROW and SCHED_STAGE. */
551 normalize_sched_times (partial_schedule_ptr ps)
555 int amount = PS_MIN_CYCLE (ps);
558 /* Don't include the closing branch assuming that it is the last node. */
559 for (i = 0; i < g->num_nodes - 1; i++)
561 ddg_node_ptr u = &g->nodes[i];
562 int normalized_time = SCHED_TIME (u) - amount;
564 if (normalized_time < 0)
567 SCHED_TIME (u) = normalized_time;
568 SCHED_ROW (u) = normalized_time % ii;
569 SCHED_STAGE (u) = normalized_time / ii;
573 /* Set SCHED_COLUMN of each node according to its position in PS. */
575 set_columns_for_ps (partial_schedule_ptr ps)
579 for (row = 0; row < ps->ii; row++)
581 ps_insn_ptr cur_insn = ps->rows[row];
584 for (; cur_insn; cur_insn = cur_insn->next_in_row)
585 SCHED_COLUMN (cur_insn->node) = column++;
589 /* Permute the insns according to their order in PS, from row 0 to
590 row ii-1, and position them right before LAST. This schedules
591 the insns of the loop kernel. */
593 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
599 for (row = 0; row < ii ; row++)
600 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
601 if (PREV_INSN (last) != ps_ij->node->insn)
602 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
/* NOTE(review): several lines of this function are missing from the listing
   (signature line, local declarations such as i_reg_moves/j, and braces);
   the code below is preserved as extracted.  */
606 /* Used to generate the prologue & epilogue. Duplicate the subset of
607 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
608 of both), together with a prefix/suffix of their reg_moves. */
610 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
611 int to_stage, int for_prolog)
616 for (row = 0; row < ps->ii; row++)
617 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
619 ddg_node_ptr u_node = ps_ij->node;
621 rtx reg_move = NULL_RTX;
625 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
626 number of reg_moves starting with the second occurrence of
627 u_node, which is generated if its SCHED_STAGE <= to_stage. */
628 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
629 i_reg_moves = MAX (i_reg_moves, 0);
630 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
632 /* The reg_moves start from the *first* reg_move backwards. */
635 reg_move = SCHED_FIRST_REG_MOVE (u_node);
636 for (j = 1; j < i_reg_moves; j++)
637 reg_move = PREV_INSN (reg_move);
640 else /* It's for the epilog. */
642 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
643 starting to decrease one stage after u_node no longer occurs;
644 that is, generate all reg_moves until
645 SCHED_STAGE (u_node) == from_stage - 1. */
646 i_reg_moves = SCHED_NREG_MOVES (u_node)
647 - (from_stage - SCHED_STAGE (u_node) - 1);
648 i_reg_moves = MAX (i_reg_moves, 0);
649 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
651 /* The reg_moves start from the *last* reg_move forwards. */
654 reg_move = SCHED_FIRST_REG_MOVE (u_node);
655 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
656 reg_move = PREV_INSN (reg_move);
660 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
661 emit_insn (copy_rtx (PATTERN (reg_move)));
663 if (SCHED_STAGE (u_node) >= from_stage
664 && SCHED_STAGE (u_node) <= to_stage)
665 duplicate_insn_chain (u_node->first_note, u_node->insn);
/* NOTE(review): this function is heavily truncated in the listing (signature
   line, several declarations -- e.g. i, e, cmp, label, cond -- start_sequence/
   end_sequence pairs, braces, and some argument lines are missing); the code
   below is preserved as extracted.  */
670 /* Generate the instructions (including reg_moves) for prolog & epilog. */
672 generate_prolog_epilog (partial_schedule_ptr ps, rtx orig_loop_beg,
673 rtx orig_loop_end, int unknown_count)
676 int last_stage = PS_STAGE_COUNT (ps) - 1;
678 rtx c_reg = NULL_RTX;
680 rtx precond_jump = NULL_RTX;
681 rtx precond_exit_label = NULL_RTX;
682 rtx precond_exit_label_insn = NULL_RTX;
683 rtx last_epilog_insn = NULL_RTX;
684 rtx loop_exit_label = NULL_RTX;
685 rtx loop_exit_label_insn = NULL_RTX;
686 rtx orig_loop_bct = NULL_RTX;
688 /* Loop header edge. */
689 e = EDGE_PRED (ps->g->bb, 0);
690 if (e->src == ps->g->bb)
691 e = EDGE_PRED (ps->g->bb, 1);
693 /* Generate the prolog, inserting its insns on the loop-entry edge. */
696 /* This is the place where we want to insert the precondition. */
698 precond_jump = emit_note (NOTE_INSN_DELETED);
700 for (i = 0; i < last_stage; i++)
701 duplicate_insns_of_cycles (ps, 0, i, 1);
703 /* No need to call insert_insn_on_edge; we prepared the sequence. */
704 e->insns.r = get_insns ();
707 /* Generate the epilog, inserting its insns on the loop-exit edge. */
710 for (i = 0; i < last_stage; i++)
711 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
713 last_epilog_insn = emit_note (NOTE_INSN_DELETED);
715 /* Emit the label where to put the original loop code. */
720 precond_exit_label = gen_label_rtx ();
721 precond_exit_label_insn = emit_label (precond_exit_label);
723 /* Put the original loop code. */
724 reorder_insns_nobb (orig_loop_beg, orig_loop_end, precond_exit_label_insn);
726 /* Change the label of the BCT to be the PRECOND_EXIT_LABEL. */
727 orig_loop_bct = get_last_insn ();
728 c_reg = doloop_register_get (orig_loop_bct, &cmp);
729 label = XEXP (SET_SRC (cmp), 1);
730 cond = XEXP (SET_SRC (cmp), 0);
732 if (! c_reg || GET_CODE (cond) != NE)
735 XEXP (label, 0) = precond_exit_label;
736 JUMP_LABEL (orig_loop_bct) = precond_exit_label_insn;
737 LABEL_NUSES (precond_exit_label_insn)++;
739 /* Generate the loop exit label. */
740 loop_exit_label = gen_label_rtx ();
741 loop_exit_label_insn = emit_label (loop_exit_label);
744 e = EDGE_SUCC (ps->g->bb, 0);
745 if (e->dest == ps->g->bb)
746 e = EDGE_SUCC (ps->g->bb, 1);
748 e->insns.r = get_insns ();
751 commit_edge_insertions ();
755 rtx precond_insns, epilog_jump, insert_after_insn;
756 basic_block loop_exit_bb = BLOCK_FOR_INSN (loop_exit_label_insn);
757 basic_block epilog_bb = BLOCK_FOR_INSN (last_epilog_insn);
758 basic_block precond_bb = BLOCK_FOR_INSN (precond_jump);
759 basic_block orig_loop_bb = BLOCK_FOR_INSN (precond_exit_label_insn);
760 edge epilog_exit_edge = EDGE_SUCC (epilog_bb, 0);
762 /* Do loop preconditioning to take care of cases where the loop count is
763 less than the stage count. Update the CFG properly. */
764 insert_after_insn = precond_jump;
766 c_reg = doloop_register_get (ps->g->closing_branch->insn, &cmp);
767 emit_cmp_and_jump_insns (c_reg, GEN_INT (PS_STAGE_COUNT (ps)), LT, NULL,
768 GET_MODE (c_reg), 1, precond_exit_label);
769 precond_insns = get_insns ();
770 precond_jump = get_last_insn ();
772 reorder_insns (precond_insns, precond_jump, insert_after_insn);
774 /* Generate a subtract instruction at the beginning of the prolog to
775 adjust the loop count by STAGE_COUNT. */
776 emit_insn_after (gen_sub2_insn (c_reg, GEN_INT (PS_STAGE_COUNT (ps) - 1)),
778 update_bb_for_insn (precond_bb);
779 delete_insn (insert_after_insn);
781 /* Update label info for the precondition jump. */
782 JUMP_LABEL (precond_jump) = precond_exit_label_insn;
783 LABEL_NUSES (precond_exit_label_insn)++;
785 /* Update the CFG. */
786 split_block (precond_bb, precond_jump);
787 make_edge (precond_bb, orig_loop_bb, 0);
789 /* Add a jump at end of the epilog to the LOOP_EXIT_LABEL to jump over the
790 original loop copy and update the CFG. */
791 epilog_jump = emit_jump_insn_after (gen_jump (loop_exit_label),
793 delete_insn (last_epilog_insn);
794 JUMP_LABEL (epilog_jump) = loop_exit_label_insn;
795 LABEL_NUSES (loop_exit_label_insn)++;
797 redirect_edge_succ (epilog_exit_edge, loop_exit_bb);
798 epilog_exit_edge->flags &= ~EDGE_FALLTHRU;
799 emit_barrier_after (BB_END (epilog_bb));
803 /* Return the line note insn preceding INSN, for debugging. Taken from
806 find_line_note (rtx insn)
808 for (; insn; insn = PREV_INSN (insn))
810 && NOTE_LINE_NUMBER (insn) >= 0)
/* NOTE(review): this driver is heavily truncated in the listing (signature
   line, many local declarations -- g, head, tail, insn, node_order, mii,
   maxii, stage_count, etc. -- loop headers such as FOR_EACH_BB, `continue'
   statements after the early filters, braces, sched_init/sched_finish and
   df setup/release calls are missing); the code below is preserved as
   extracted.  */
816 /* Main entry point, perform SMS scheduling on the loops of the function
817 that consist of single basic blocks. */
819 sms_schedule (FILE *dump_file)
821 static int passes = 0;
824 basic_block bb, pre_header = NULL;
828 partial_schedule_ptr ps;
829 int max_bb_index = last_basic_block;
832 stats_file = dump_file;
834 /* Initialize issue_rate. */
835 if (targetm.sched.issue_rate)
/* Temporarily pretend reload has run so the target hook reports the
   post-reload issue rate.  */
837 int temp = reload_completed;
839 reload_completed = 1;
840 issue_rate = (*targetm.sched.issue_rate) ();
841 reload_completed = temp;
846 /* Initialize the scheduler. */
847 current_sched_info = &sms_sched_info;
850 /* Init Data Flow analysis, to be used in interloop dep calculation. */
852 df_analyze (df, 0, DF_ALL);
854 /* Allocate memory to hold the DDG array. */
855 g_arr = xcalloc (max_bb_index, sizeof (ddg_ptr));
857 /* Build DDGs for all the relevant loops and hold them in G_ARR
858 indexed by the loop BB index. */
863 edge e, pre_header_edge;
868 /* Check if bb has two successors, one being itself. */
869 if (EDGE_COUNT (bb->succs) != 2)
872 if (EDGE_SUCC (bb, 0)->dest != bb && EDGE_SUCC (bb, 1)->dest != bb)
875 if ((EDGE_SUCC (bb, 0)->flags & EDGE_COMPLEX)
876 || (EDGE_SUCC (bb, 1)->flags & EDGE_COMPLEX))
879 /* Check if bb has two predecessors, one being itself. */
880 if (EDGE_COUNT (bb->preds) != 2)
883 if (EDGE_PRED (bb, 0)->src != bb && EDGE_PRED (bb, 1)->src != bb)
886 if ((EDGE_PRED (bb, 0)->flags & EDGE_COMPLEX)
887 || (EDGE_PRED (bb, 1)->flags & EDGE_COMPLEX))
891 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
894 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
898 get_block_head_tail (bb->index, &head, &tail);
899 pre_header_edge = EDGE_PRED (bb, 0);
900 if (EDGE_PRED (bb, 0)->src != bb)
901 pre_header_edge = EDGE_PRED (bb, 1);
903 /* Perform SMS only on loops whose average count is above threshold. */
904 if (bb->count < pre_header_edge->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD)
908 rtx line_note = find_line_note (tail);
912 expanded_location xloc;
913 NOTE_EXPANDED_LOCATION (xloc, line_note);
914 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
915 xloc.file, xloc.line);
917 fprintf (stats_file, "SMS single-bb-loop\n");
918 if (profile_info && flag_branch_probabilities)
920 fprintf (stats_file, "SMS loop-count ");
921 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
922 (HOST_WIDEST_INT) bb->count);
923 fprintf (stats_file, "\n");
924 fprintf (stats_file, "SMS preheader-count ");
925 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
926 (HOST_WIDEST_INT) pre_header_edge->count);
927 fprintf (stats_file, "\n");
928 fprintf (stats_file, "SMS profile-sum-max ");
929 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
930 (HOST_WIDEST_INT) profile_info->sum_max);
931 fprintf (stats_file, "\n");
937 /* Make sure this is a doloop. */
938 if ( !(count_reg = doloop_register_get (tail, &comp)))
941 e = EDGE_PRED (bb, 0);
943 pre_header = EDGE_PRED (bb, 1)->src;
947 /* Don't handle BBs with calls or barriers, or !single_set insns. */
948 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
951 || (INSN_P (insn) && !JUMP_P (insn)
952 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
/* The scan stopped early iff some insn was rejected above.  */
955 if (insn != NEXT_INSN (tail))
960 fprintf (stats_file, "SMS loop-with-call\n");
961 else if (BARRIER_P (insn))
962 fprintf (stats_file, "SMS loop-with-barrier\n");
964 fprintf (stats_file, "SMS loop-with-not-single-set\n");
965 print_rtl_single (stats_file, insn);
971 if (! (g = create_ddg (bb, df, 0)))
974 fprintf (stats_file, "SMS doloop\n");
978 g_arr[bb->index] = g;
981 /* Release Data Flow analysis data structures. */
984 /* Go over the built DDGs and perform SMS for each one of them. */
985 for (i = 0; i < max_bb_index; i++)
988 rtx count_reg, count_init, comp;
989 edge pre_header_edge;
992 HOST_WIDEST_INT loop_count = 0;
994 if (! (g = g_arr[i]))
998 print_ddg (dump_file, g);
1000 get_block_head_tail (g->bb->index, &head, &tail);
1002 pre_header_edge = EDGE_PRED (g->bb, 0);
1003 if (EDGE_PRED (g->bb, 0)->src != g->bb)
1004 pre_header_edge = EDGE_PRED (g->bb, 1);
1008 rtx line_note = find_line_note (tail);
1012 expanded_location xloc;
1013 NOTE_EXPANDED_LOCATION (xloc, line_note);
1014 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1015 xloc.file, xloc.line);
1017 fprintf (stats_file, "SMS single-bb-loop\n");
1018 if (profile_info && flag_branch_probabilities)
1020 fprintf (stats_file, "SMS loop-count ");
1021 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1022 (HOST_WIDEST_INT) bb->count);
1023 fprintf (stats_file, "\n");
1024 fprintf (stats_file, "SMS preheader-count ");
1025 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1026 (HOST_WIDEST_INT) pre_header_edge->count);
1027 fprintf (stats_file, "\n");
1028 fprintf (stats_file, "SMS profile-sum-max ");
1029 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1030 (HOST_WIDEST_INT) profile_info->sum_max);
1031 fprintf (stats_file, "\n");
1033 fprintf (stats_file, "SMS doloop\n");
1034 fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
1035 fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
1036 fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
1039 /* Make sure this is a doloop. */
1040 if ( !(count_reg = doloop_register_get (tail, &comp)))
1043 /* This should be NULL_RTX if the count is unknown at compile time. */
1044 count_init = const_iteration_count (count_reg, pre_header, &loop_count);
1046 if (stats_file && count_init)
1048 fprintf (stats_file, "SMS const-doloop ");
1049 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1050 fprintf (stats_file, "\n");
1053 node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);
1055 mii = 1; /* Need to pass some estimate of mii. */
1056 rec_mii = sms_order_nodes (g, mii, node_order);
1057 mii = MAX (res_MII (g), rec_mii);
1058 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1061 fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1062 rec_mii, mii, maxii);
1064 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1066 set_node_sched_params (g);
1068 ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);
1071 stage_count = PS_STAGE_COUNT (ps);
/* Reject schedules needing more stages than the loop iterates.  */
1073 if (stage_count == 0 || (count_init && (stage_count > loop_count)))
1076 fprintf (dump_file, "SMS failed... \n");
1078 fprintf (stats_file, "SMS sched-failed %d\n", stage_count);
1082 rtx orig_loop_beg = NULL_RTX;
1083 rtx orig_loop_end = NULL_RTX;
1087 fprintf (stats_file,
1088 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1090 print_partial_schedule (ps, dump_file);
1092 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1093 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1096 /* Save the original loop if we want to do loop preconditioning in
1097 case the BCT count is not known. */
1103 /* Copy the original loop code before modifying it -
1104 so we can use it later. */
1105 for (i = 0; i < ps->g->num_nodes; i++)
1106 duplicate_insn_chain (ps->g->nodes[i].first_note,
1107 ps->g->nodes[i].insn);
1109 orig_loop_beg = get_insns ();
1110 orig_loop_end = get_last_insn ();
1113 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1114 the closing_branch was scheduled and should appear in the last (ii-1)
1115 row. Otherwise, we are free to schedule the branch, and we let nodes
1116 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1117 row; this should reduce stage_count to minimum. */
1118 normalize_sched_times (ps);
1119 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1120 set_columns_for_ps (ps);
1122 permute_partial_schedule (ps, g->closing_branch->first_note);
1124 /* Mark this loop as software pipelined so the later
1125 scheduling passes don't touch it. */
1126 if (! flag_resched_modulo_sched)
1127 g->bb->flags |= BB_DISABLE_SCHEDULE;
1128 /* The life-info is not valid any more. */
1129 g->bb->flags |= BB_DIRTY;
1131 generate_reg_moves (ps);
1133 print_node_sched_params (dump_file, g->num_nodes);
1135 /* Set new iteration count of loop kernel. */
1137 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1140 /* Generate prolog and epilog. */
1141 generate_prolog_epilog (ps, orig_loop_beg, orig_loop_end,
1142 count_init ? 0 : 1);
1144 free_partial_schedule (ps);
1145 free (node_sched_params);
1150 /* Release scheduler data, needed until now because of DFA. */
1154 /* The SMS scheduling algorithm itself
1155 -----------------------------------
1156 Input: 'O' an ordered list of insns of a loop.
1157 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1159 'Q' is the empty Set
1160 'PS' is the partial schedule; it holds the currently scheduled nodes with
1162 'PSP' previously scheduled predecessors.
1163 'PSS' previously scheduled successors.
1164 't(u)' the cycle where u is scheduled.
1165 'l(u)' is the latency of u.
1166 'd(v,u)' is the dependence distance from v to u.
1167 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1168 the node ordering phase.
1169 'check_hardware_resources_conflicts(u, PS, c)'
1170 run a trace around cycle/slot through DFA model
1171 to check resource conflicts involving instruction u
1172 at cycle c given the partial schedule PS.
1173 'add_to_partial_schedule_at_time(u, PS, c)'
1174 Add the node/instruction u to the partial schedule
1176 'calculate_register_pressure(PS)'
1177 Given a schedule of instructions, calculate the register
1178 pressure it implies. One implementation could be the
1179 maximum number of overlapping live ranges.
1180 'maxRP' The maximum allowed register pressure, it is usually derived from the number
1181 registers available in the hardware.
1185 3. for each node u in O in pre-computed order
1186 4. if (PSP(u) != Q && PSS(u) == Q) then
1187 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1188 6. start = Early_start; end = Early_start + II - 1; step = 1
1189 11. else if (PSP(u) == Q && PSS(u) != Q) then
1190 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1191 13. start = Late_start; end = Late_start - II + 1; step = -1
1192 14. else if (PSP(u) != Q && PSS(u) != Q) then
1193 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1194 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1195 17. start = Early_start;
1196 18. end = min(Early_start + II - 1 , Late_start);
1198 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1199 21. start = ASAP(u); end = start + II - 1; step = 1
1203 24. for (c = start ; c != end ; c += step)
1204 25. if check_hardware_resources_conflicts(u, PS, c) then
1205 26. add_to_partial_schedule_at_time(u, PS, c)
1210 31. if (success == false) then
1212 33. if (II > maxII) then
1213 34. finish - failed to schedule
1218 39. if (calculate_register_pressure(PS) > maxRP) then
1221 42. compute epilogue & prologue
1222 43. finish - succeeded to schedule
1225 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1226 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1227 set to 0 to save compile time. */
1228 #define DFA_HISTORY SMS_DFA_HISTORY

/* List-schedule the nodes of G in the precomputed NODES_ORDER, trying
   successive initiation intervals starting at MII and giving up once the
   interval reaches MAXII.  Returns the resulting partial schedule; dump
   output goes to DUMP_FILE.  This implements step 4 of the SMS algorithm
   described at the top of the file.  */
1230 static partial_schedule_ptr
1231 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order, FILE *dump_file)
1235 int try_again_with_larger_ii = true;
1236 int num_nodes = g->num_nodes;
1238 int start, end, step; /* Place together into one struct? */
1239 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1240 sbitmap must_precede = sbitmap_alloc (num_nodes);
1241 sbitmap must_follow = sbitmap_alloc (num_nodes);
1243 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
/* Outer loop: retry with a larger II whenever some node could not be
   placed in its scheduling window at the current II.  */
1245 while (try_again_with_larger_ii && ii < maxii)
1248 fprintf(dump_file, "Starting with ii=%d\n", ii);
1249 try_again_with_larger_ii = false;
1250 sbitmap_zero (sched_nodes);
1252 for (i = 0; i < num_nodes; i++)
1254 int u = nodes_order[i];
1255 ddg_node_ptr u_node = &g->nodes[u];
1256 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1257 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1260 rtx insn = u_node->insn;
1265 if (JUMP_P (insn)) /* Closing branch handled later. */
1268 /* 1. compute sched window for u (start, end, step). */
1269 psp_not_empty = sbitmap_any_common_bits (u_node_preds, sched_nodes);
1270 pss_not_empty = sbitmap_any_common_bits (u_node_succs, sched_nodes);
/* Case 1: only predecessors already scheduled -- schedule u as early
   as possible after them, scanning the window forward.  */
1272 if (psp_not_empty && !pss_not_empty)
1274 int early_start = 0;
1277 for (e = u_node->in; e != 0; e = e->next_in)
1279 ddg_node_ptr v_node = e->src;
1280 if (TEST_BIT (sched_nodes, v_node->cuid)
1282 int node_st = SCHED_TIME (v_node)
1283 + e->latency - (e->distance * ii);
1285 early_start = MAX (early_start, node_st);
/* Memory dependences limit the window to less than one full II.  */
1287 if (e->data_type == MEM_DEP)
1288 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1291 start = early_start;
1292 end = MIN (end, early_start + ii);
/* Case 2: only successors already scheduled -- schedule u as late as
   possible before them, scanning the window backward.  */
1296 else if (!psp_not_empty && pss_not_empty)
1298 int late_start = INT_MAX;
1301 for (e = u_node->out; e != 0; e = e->next_out)
1303 ddg_node_ptr v_node = e->dest;
1304 if (TEST_BIT (sched_nodes, v_node->cuid))
1306 late_start = MIN (late_start,
1307 SCHED_TIME (v_node) - e->latency
1308 + (e->distance * ii));
1309 if (e->data_type == MEM_DEP)
1310 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1314 end = MAX (end, late_start - ii);
/* Case 3: both predecessors and successors scheduled -- the window is
   bounded on both sides.  */
1318 else if (psp_not_empty && pss_not_empty)
1320 int early_start = 0;
1321 int late_start = INT_MAX;
1325 for (e = u_node->in; e != 0; e = e->next_in)
1327 ddg_node_ptr v_node = e->src;
1329 if (TEST_BIT (sched_nodes, v_node->cuid))
1331 early_start = MAX (early_start,
1332 SCHED_TIME (v_node) + e->latency
1333 - (e->distance * ii));
1334 if (e->data_type == MEM_DEP)
1335 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1338 for (e = u_node->out; e != 0; e = e->next_out)
1340 ddg_node_ptr v_node = e->dest;
1342 if (TEST_BIT (sched_nodes, v_node->cuid))
1344 late_start = MIN (late_start,
1345 SCHED_TIME (v_node) - e->latency
1346 + (e->distance * ii));
1347 if (e->data_type == MEM_DEP)
1348 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1351 start = MAX (start, early_start);
1352 end = MIN (end, MIN (early_start + ii, late_start + 1));
/* Case 4: no scheduled neighbors -- start from the node's ASAP time.  */
1355 else /* psp is empty && pss is empty. */
1357 start = SCHED_ASAP (u_node);
1362 /* 2. Try scheduling u in window. */
1364 fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
1365 u, start, end, step);
1367 /* use must_follow & must_precede bitmaps to determine order
1368 of nodes within the cycle. */
/* An edge with latency == ii * distance is "tight": if its other end
   is scheduled exactly at the window boundary cycle, it must issue in
   the same cycle as u, before (must_precede) or after (must_follow).  */
1369 sbitmap_zero (must_precede);
1370 sbitmap_zero (must_follow);
1371 for (e = u_node->in; e != 0; e = e->next_in)
1372 if (TEST_BIT (sched_nodes, e->src->cuid)
1373 && e->latency == (ii * e->distance)
1374 && start == SCHED_TIME (e->src))
1375 SET_BIT (must_precede, e->src->cuid);
1377 for (e = u_node->out; e != 0; e = e->next_out)
1378 if (TEST_BIT (sched_nodes, e->dest->cuid)
1379 && e->latency == (ii * e->distance)
1380 && end == SCHED_TIME (e->dest))
1381 SET_BIT (must_follow, e->dest->cuid);
/* Probe each cycle of the (half-open) window until one is free of
   resource conflicts.  */
1384 if ((step > 0 && start < end) || (step < 0 && start > end))
1385 for (c = start; c != end; c += step)
1389 psi = ps_add_node_check_conflicts (ps, u_node, c,
1395 SCHED_TIME (u_node) = c;
1396 SET_BIT (sched_nodes, u);
1399 fprintf(dump_file, "Schedule in %d\n", c);
1405 /* ??? Try backtracking instead of immediately ii++? */
/* Scheduling u failed at this II: discard the partial schedule and
   restart the whole ordering with a larger II.  */
1407 try_again_with_larger_ii = true;
1408 reset_partial_schedule (ps, ii);
1411 /* ??? If (success), check register pressure estimates. */
1412 } /* Continue with next node. */
1413 } /* While try_again_with_larger_ii. */
1415 sbitmap_free (sched_nodes);
1419 free_partial_schedule (ps);
1426 /* This page implements the algorithm for ordering the nodes of a DDG
1427 for modulo scheduling, activated through the
1428 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */

/* Accessors for the per-node ordering parameters, reached through each
   DDG node's aux.info pointer (set up by calculate_order_params).  */
1430 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1431 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1432 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1433 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: slack between the latest and earliest start times.  */
1434 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
/* Depth of a node is taken to be its ASAP time.  */
1435 #define DEPTH(x) (ASAP ((x)))
1437 typedef struct node_order_params * nopa;

/* Forward declarations of the node-ordering helpers defined below.  */
1439 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1440 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1441 static nopa calculate_order_params (ddg_ptr, int mii);
1442 static int find_max_asap (ddg_ptr, sbitmap);
1443 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1444 static int find_max_dv_min_mob (ddg_ptr, sbitmap);

/* Direction in which a subset of SCC nodes is currently being ordered.  */
1446 enum sms_direction {BOTTOMUP, TOPDOWN};

/* Per-node ordering parameters; accessed via the macros above.  */
1448 struct node_order_params
1455 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1457 check_nodes_order (int *node_order, int num_nodes)
1460 sbitmap tmp = sbitmap_alloc (num_nodes);
1464 for (i = 0; i < num_nodes; i++)
1466 int u = node_order[i];
/* Reject an out-of-range entry, or one already seen (a duplicate).  */
1468 if (u >= num_nodes || u < 0 || TEST_BIT (tmp, u))
1477 /* Order the nodes of G for scheduling and pass the result in
1478 NODE_ORDER. Also set aux.count of each node to ASAP.
1479 Return the recMII for the given DDG. */
1481 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
/* Decompose the DDG into strongly connected components, sorted by
   decreasing recurrence length.  */
1485 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
/* Compute ASAP/ALAP/HEIGHT for every node (stored via aux.info).  */
1487 nopa nops = calculate_order_params (g, mii);
1489 order_nodes_of_sccs (sccs, node_order);
1491 if (sccs->num_sccs > 0)
1492 /* First SCC has the largest recurrence_length. */
1493 rec_mii = sccs->sccs[0]->recurrence_length;
1495 /* Save ASAP before destroying node_order_params. */
1496 for (i = 0; i < g->num_nodes; i++)
1498 ddg_node_ptr v = &g->nodes[i];
1499 v->aux.count = ASAP (v);
1503 free_ddg_all_sccs (sccs);
/* Sanity-check that the produced order is a valid permutation.  */
1504 check_nodes_order (node_order, g->num_nodes);
/* Order all nodes of the DDG into NODE_ORDER, one extended SCC at a
   time, starting from the SCC with the highest recurrence MII.  */
1510 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1513 ddg_ptr g = all_sccs->ddg;
1514 int num_nodes = g->num_nodes;
1515 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1516 sbitmap on_path = sbitmap_alloc (num_nodes);
1517 sbitmap tmp = sbitmap_alloc (num_nodes);
1518 sbitmap ones = sbitmap_alloc (num_nodes);
1520 sbitmap_zero (prev_sccs);
1521 sbitmap_ones (ones);
1523 /* Perform the node ordering starting from the SCC with the highest recMII.
1524 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1525 for (i = 0; i < all_sccs->num_sccs; i++)
1527 ddg_scc_ptr scc = all_sccs->sccs[i];
1529 /* Add nodes on paths from previous SCCs to the current SCC. */
1530 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1531 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1533 /* Add nodes on paths from the current SCC to previous SCCs. */
1534 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1535 sbitmap_a_or_b (tmp, tmp, on_path);
1537 /* Remove nodes of previous SCCs from current extended SCC. */
1538 sbitmap_difference (tmp, tmp, prev_sccs);
1540 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1541 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1544 /* Handle the remaining nodes that do not belong to any scc. Each call
1545 to order_nodes_in_scc handles a single connected component. */
1546 while (pos < g->num_nodes)
1548 sbitmap_difference (tmp, ones, prev_sccs);
1549 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1551 sbitmap_free (prev_sccs);
1552 sbitmap_free (on_path);
1554 sbitmap_free (ones);
1557 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
/* Compute ASAP, ALAP and HEIGHT for every node of G, treating the DDG
   (with backarcs of recurrences disregarded) as a DAG.  Returns the
   freshly allocated array of per-node parameters; each node's aux.info
   is pointed at its entry.  Caller frees the array.  */
1558 static struct node_order_params *
1559 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1563 int num_nodes = g->num_nodes;
1565 /* Allocate a place to hold ordering params for each node in the DDG. */
1566 nopa node_order_params_arr;
1568 /* Initialize ASAP/ALAP/HEIGHT to zero (xcalloc zeroes the array). */
1569 node_order_params_arr = (nopa) xcalloc (num_nodes,
1570 sizeof (struct node_order_params));
1572 /* Set the aux pointer of each node to point to its order_params structure. */
1573 for (u = 0; u < num_nodes; u++)
1574 g->nodes[u].aux.info = &node_order_params_arr[u];
1576 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1577 calculate ASAP, ALAP, mobility, distance, and height for each node
1578 in the dependence (direct acyclic) graph. */
1580 /* We assume that the nodes in the array are in topological order. */
/* Forward pass: ASAP over intra-iteration (distance 0) incoming edges.  */
1583 for (u = 0; u < num_nodes; u++)
1585 ddg_node_ptr u_node = &g->nodes[u];
1588 for (e = u_node->in; e; e = e->next_in)
1589 if (e->distance == 0)
1590 ASAP (u_node) = MAX (ASAP (u_node),
1591 ASAP (e->src) + e->latency);
1592 max_asap = MAX (max_asap, ASAP (u_node));
/* Backward pass: ALAP and HEIGHT over distance-0 outgoing edges.  */
1595 for (u = num_nodes - 1; u > -1; u--)
1597 ddg_node_ptr u_node = &g->nodes[u];
1599 ALAP (u_node) = max_asap;
1600 HEIGHT (u_node) = 0;
1601 for (e = u_node->out; e; e = e->next_out)
1602 if (e->distance == 0)
1604 ALAP (u_node) = MIN (ALAP (u_node),
1605 ALAP (e->dest) - e->latency);
1606 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1607 HEIGHT (e->dest) + e->latency);
1611 return node_order_params_arr;
/* Return the cuid of a node in NODES with the maximal ASAP time.  */
1615 find_max_asap (ddg_ptr g, sbitmap nodes)
1621 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1623 ddg_node_ptr u_node = &g->nodes[u];
1625 if (max_asap < ASAP (u_node))
1627 max_asap = ASAP (u_node);
/* Return the cuid of a node in NODES with maximal HEIGHT, breaking
   ties by minimal mobility (MOB).  Used for BOTTOMUP ordering.  */
1635 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1639 int min_mob = INT_MAX;
1642 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1644 ddg_node_ptr u_node = &g->nodes[u];
1646 if (max_hv < HEIGHT (u_node))
1648 max_hv = HEIGHT (u_node);
1649 min_mob = MOB (u_node);
/* Equal height: prefer the node with the smaller mobility.  */
1652 else if ((max_hv == HEIGHT (u_node))
1653 && (min_mob > MOB (u_node)))
1655 min_mob = MOB (u_node);
/* Return the cuid of a node in NODES with maximal DEPTH, breaking
   ties by minimal mobility (MOB).  Used for TOPDOWN ordering.  */
1663 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1667 int min_mob = INT_MAX;
1670 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1672 ddg_node_ptr u_node = &g->nodes[u];
1674 if (max_dv < DEPTH (u_node))
1676 max_dv = DEPTH (u_node);
1677 min_mob = MOB (u_node);
/* Equal depth: prefer the node with the smaller mobility.  */
1680 else if ((max_dv == DEPTH (u_node))
1681 && (min_mob > MOB (u_node)))
1683 min_mob = MOB (u_node);
1690 /* Places the nodes of SCC into the NODE_ORDER array starting
1691 at position POS, according to the SMS ordering algorithm.
1692 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1693 the NODE_ORDER array, starting from position zero. */
1695 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1696 int * node_order, int pos)
1698 enum sms_direction dir;
1699 int num_nodes = g->num_nodes;
1700 sbitmap workset = sbitmap_alloc (num_nodes);
1701 sbitmap tmp = sbitmap_alloc (num_nodes);
1702 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1703 sbitmap predecessors = sbitmap_alloc (num_nodes);
1704 sbitmap successors = sbitmap_alloc (num_nodes);
1706 sbitmap_zero (predecessors);
1707 find_predecessors (predecessors, g, nodes_ordered);
1709 sbitmap_zero (successors);
1710 find_successors (successors, g, nodes_ordered);
/* Seed the workset: prefer nodes of SCC that are predecessors of the
   already-ordered set, then successors of it; presumably DIR is set
   accordingly on elided lines -- confirm against full source.  */
1713 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1715 sbitmap_copy (workset, tmp);
1718 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1720 sbitmap_copy (workset, tmp);
/* Neither: start from the single node of SCC with maximal ASAP.  */
1727 sbitmap_zero (workset);
1728 if ((u = find_max_asap (g, scc)) >= 0)
1729 SET_BIT (workset, u);
1733 sbitmap_zero (zero_bitmap);
/* Alternate direction until the whole (extended) SCC is ordered.  */
1734 while (!sbitmap_equal (workset, zero_bitmap))
1737 ddg_node_ptr v_node;
1738 sbitmap v_node_preds;
1739 sbitmap v_node_succs;
/* Top-down sweep: repeatedly pick the max-height/min-mobility node,
   append it, and extend the workset with its unordered successors.  */
1743 while (!sbitmap_equal (workset, zero_bitmap))
1745 v = find_max_hv_min_mob (g, workset);
1746 v_node = &g->nodes[v];
1747 node_order[pos++] = v;
1748 v_node_succs = NODE_SUCCESSORS (v_node);
1749 sbitmap_a_and_b (tmp, v_node_succs, scc);
1751 /* Don't consider the already ordered successors again. */
1752 sbitmap_difference (tmp, tmp, nodes_ordered);
1753 sbitmap_a_or_b (workset, workset, tmp);
1754 RESET_BIT (workset, v);
1755 SET_BIT (nodes_ordered, v);
/* Switch direction: restart from predecessors of the ordered set.  */
1758 sbitmap_zero (predecessors);
1759 find_predecessors (predecessors, g, nodes_ordered);
1760 sbitmap_a_and_b (workset, predecessors, scc);
/* Bottom-up sweep: pick max-depth/min-mobility nodes and extend the
   workset with their unordered predecessors.  */
1764 while (!sbitmap_equal (workset, zero_bitmap))
1766 v = find_max_dv_min_mob (g, workset);
1767 v_node = &g->nodes[v];
1768 node_order[pos++] = v;
1769 v_node_preds = NODE_PREDECESSORS (v_node);
1770 sbitmap_a_and_b (tmp, v_node_preds, scc);
1772 /* Don't consider the already ordered predecessors again. */
1773 sbitmap_difference (tmp, tmp, nodes_ordered);
1774 sbitmap_a_or_b (workset, workset, tmp);
1775 RESET_BIT (workset, v);
1776 SET_BIT (nodes_ordered, v);
/* Switch direction again: successors of the ordered set.  */
1779 sbitmap_zero (successors);
1780 find_successors (successors, g, nodes_ordered);
1781 sbitmap_a_and_b (workset, successors, scc);
1785 sbitmap_free (workset);
1786 sbitmap_free (zero_bitmap);
1787 sbitmap_free (predecessors);
1788 sbitmap_free (successors);
1793 /* This page contains functions for manipulating partial-schedules during
1794 modulo scheduling. */
1796 /* Create a partial schedule and allocate a memory to hold II rows. */
1797 static partial_schedule_ptr
1798 create_partial_schedule (int ii, ddg_ptr g, int history)
1800 partial_schedule_ptr ps = (partial_schedule_ptr)
1801 xmalloc (sizeof (struct partial_schedule));
/* One (initially empty) list of scheduled insns per modulo row.  */
1802 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
1804 ps->history = history;
/* min/max cycle start inverted so the first insn added sets them.  */
1805 ps->min_cycle = INT_MAX;
1806 ps->max_cycle = INT_MIN;
1812 /* Free the PS_INSNs in rows array of the given partial schedule.
1813 ??? Consider caching the PS_INSN's. */
1815 free_ps_insns (partial_schedule_ptr ps)
1819 for (i = 0; i < ps->ii; i++)
/* Walk each row's linked list, freeing node by node.  */
1823 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
1826 ps->rows[i] = ps_insn;
1832 /* Free all the memory allocated to the partial schedule. */
1834 free_partial_schedule (partial_schedule_ptr ps)
1843 /* Clear the rows array with its PS_INSNs, and create a new one with
1846 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
/* Same II: the rows array is already the right size.  */
1851 if (new_ii == ps->ii)
1853 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
1854 * sizeof (ps_insn_ptr));
1855 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
/* Reset cycle bounds to their "empty schedule" sentinels.  */
1857 ps->min_cycle = INT_MAX;
1858 ps->max_cycle = INT_MIN;
1861 /* Prints the partial schedule as an ii rows array, for each rows
1862 print the ids of the insns in it. */
1864 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
1868 for (i = 0; i < ps->ii; i++)
1870 ps_insn_ptr ps_i = ps->rows[i];
1872 fprintf (dump, "\n[CYCLE %d ]: ", i);
/* Print the insn UIDs scheduled in this row, in issue order.  */
1875 fprintf (dump, "%d, ",
1876 INSN_UID (ps_i->node->insn));
1877 ps_i = ps_i->next_in_row;
1882 /* Creates an object of PS_INSN and initializes it to the given parameters. */
1884 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
1886 ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));
/* The insn starts unlinked; it is inserted into a row's doubly linked
   list by ps_insn_find_column.  */
1889 ps_i->next_in_row = NULL;
1890 ps_i->prev_in_row = NULL;
1891 ps_i->row_rest_count = rest_count;
1892 ps_i->cycle = cycle;
1898 /* Removes the given PS_INSN from the partial schedule. Returns false if the
1899 node is not found in the partial schedule, else returns true. */
1901 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
/* The row is the insn's cycle modulo II (SMODULO handles negatives).  */
1908 row = SMODULO (ps_i->cycle, ps->ii);
1909 if (! ps_i->prev_in_row)
/* No predecessor: ps_i must be the head of its row's list.  */
1911 if (ps_i != ps->rows[row])
1914 ps->rows[row] = ps_i->next_in_row;
1916 ps->rows[row]->prev_in_row = NULL;
/* Unlink ps_i from the middle/end of the doubly linked list.  */
1920 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
1921 if (ps_i->next_in_row)
1922 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
1928 /* Unlike what literature describes for modulo scheduling (which focuses
1929 on VLIW machines) the order of the instructions inside a cycle is
1930 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
1931 where the current instruction should go relative to the already
1932 scheduled instructions in the given cycle. Go over these
1933 instructions and find the first possible column to put it in. */
1935 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
1936 sbitmap must_precede, sbitmap must_follow)
1938 ps_insn_ptr next_ps_i;
1939 ps_insn_ptr first_must_follow = NULL;
1940 ps_insn_ptr last_must_precede = NULL;
1946 row = SMODULO (ps_i->cycle, ps->ii);
1948 /* Find the first must follow and the last must precede
1949 and insert the node immediately after the must precede
1950 but make sure that there is no must follow after it. */
1951 for (next_ps_i = ps->rows[row];
1953 next_ps_i = next_ps_i->next_in_row)
1955 if (TEST_BIT (must_follow, next_ps_i->node->cuid)
1956 && ! first_must_follow)
1957 first_must_follow = next_ps_i;
1958 if (TEST_BIT (must_precede, next_ps_i->node->cuid))
1960 /* If we have already met a node that must follow, then
1961 there is no possible column. */
1962 if (first_must_follow)
1965 last_must_precede = next_ps_i;
1969 /* Now insert the node after INSERT_AFTER_PSI. */
/* No must-precede insn found: insert at the head of the row.  */
1971 if (! last_must_precede)
1973 ps_i->next_in_row = ps->rows[row];
1974 ps_i->prev_in_row = NULL;
1975 if (ps_i->next_in_row)
1976 ps_i->next_in_row->prev_in_row = ps_i;
1977 ps->rows[row] = ps_i;
/* Otherwise splice ps_i right after the last must-precede insn.  */
1981 ps_i->next_in_row = last_must_precede->next_in_row;
1982 last_must_precede->next_in_row = ps_i;
1983 ps_i->prev_in_row = last_must_precede;
1984 if (ps_i->next_in_row)
1985 ps_i->next_in_row->prev_in_row = ps_i;
1991 /* Advances the PS_INSN one column in its current row; returns false
1992 in failure and true in success. Bit N is set in MUST_FOLLOW if
1993 the node with cuid N must come after the node pointed to by
1994 PS_I when scheduled in the same cycle. */
1996 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
1997 sbitmap must_follow)
1999 ps_insn_ptr prev, next;
2001 ddg_node_ptr next_node;
2006 row = SMODULO (ps_i->cycle, ps->ii);
/* Already last in the row: cannot advance further.  */
2008 if (! ps_i->next_in_row)
2011 next_node = ps_i->next_in_row->node;
2013 /* Check if next_in_row is dependent on ps_i, both having same sched
2014 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
2015 if (TEST_BIT (must_follow, next_node->cuid))
2018 /* Advance PS_I over its next_in_row in the doubly linked list. */
2019 prev = ps_i->prev_in_row;
2020 next = ps_i->next_in_row;
2022 if (ps_i == ps->rows[row])
2023 ps->rows[row] = next;
/* Swap ps_i and next: ps_i takes next's successor, then is linked in
   after next.  */
2025 ps_i->next_in_row = next->next_in_row;
2027 if (next->next_in_row)
2028 next->next_in_row->prev_in_row = ps_i;
2030 next->next_in_row = ps_i;
2031 ps_i->prev_in_row = next;
2033 next->prev_in_row = prev;
2035 prev->next_in_row = next;
2040 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2041 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2042 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
2043 before/after (respectively) the node pointed to by PS_I when scheduled
2044 in the same cycle. */
2046 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2047 sbitmap must_precede, sbitmap must_follow)
2051 int row = SMODULO (cycle, ps->ii);
/* Refuse to add when the row has already consumed all issue slots.  */
2054 && ps->rows[row]->row_rest_count >= issue_rate)
2058 rest_count += ps->rows[row]->row_rest_count;
2060 ps_i = create_ps_insn (node, rest_count, cycle);
2062 /* Finds and inserts PS_I according to MUST_FOLLOW and
/* On failure to find a legal column, the caller gets 0.  */
2064 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2073 /* Advance time one cycle. Assumes DFA is being used. */
2075 advance_one_cycle (void)
/* Feed the target's optional pre-cycle insn to the DFA, advance the
   state by one cycle, then feed the optional post-cycle insn.  */
2077 if (targetm.sched.dfa_pre_cycle_insn)
2078 state_transition (curr_state,
2079 (*targetm.sched.dfa_pre_cycle_insn) ());
2081 state_transition (curr_state, NULL);
2083 if (targetm.sched.dfa_post_cycle_insn)
2084 state_transition (curr_state,
2085 (*targetm.sched.dfa_post_cycle_insn) ());
2088 /* Checks if PS has resource conflicts according to DFA, starting from
2089 FROM cycle to TO cycle; returns true if there are conflicts and false
2090 if there are no conflicts. Assumes DFA is being used. */
2092 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2096 state_reset (curr_state);
2098 for (cycle = from; cycle <= to; cycle++)
2100 ps_insn_ptr crr_insn;
2101 /* Holds the remaining issue slots in the current row. */
2102 int can_issue_more = issue_rate;
2104 /* Walk through the DFA for the current row. */
2105 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2107 crr_insn = crr_insn->next_in_row)
2109 rtx insn = crr_insn->node->insn;
2114 /* Check if there is room for the current insn. */
2115 if (!can_issue_more || state_dead_lock_p (curr_state))
2118 /* Update the DFA state and return with failure if the DFA found
2119 resource conflicts. */
2120 if (state_transition (curr_state, insn) >= 0)
/* Let the target adjust the remaining issue slots if it can.  */
2123 if (targetm.sched.variable_issue)
2125 (*targetm.sched.variable_issue) (sched_dump, sched_verbose,
2126 insn, can_issue_more);
2127 /* A naked CLOBBER or USE generates no instruction, so don't
2128 let them consume issue slots. */
2129 else if (GET_CODE (PATTERN (insn)) != USE
2130 && GET_CODE (PATTERN (insn)) != CLOBBER)
2134 /* Advance the DFA to the next cycle. */
2135 advance_one_cycle ();
2140 /* Checks if the given node causes resource conflicts when added to PS at
2141 cycle C. If not the node is added to PS and returned; otherwise zero
2142 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2143 cuid N must come before/after (respectively) the node pointed to by
2144 PS_I when scheduled in the same cycle. */
2146 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2147 int c, sbitmap must_precede,
2148 sbitmap must_follow)
2150 int has_conflicts = 0;
2153 /* First add the node to the PS, if this succeeds check for
2154 conflicts, trying different issue slots in the same row. */
2155 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2156 return NULL; /* Failed to insert the node at the given cycle. */
/* Check cycle C itself, and (when DFA_HISTORY > 0) the surrounding
   window of cycles that conflicts could span.  */
2158 has_conflicts = ps_has_conflicts (ps, c, c)
2160 && ps_has_conflicts (ps,
2164 /* Try different issue slots to find one that the given node can be
2165 scheduled in without conflicts. */
2166 while (has_conflicts)
/* No further column in this row: remove the node and fail.  */
2168 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2170 has_conflicts = ps_has_conflicts (ps, c, c)
2172 && ps_has_conflicts (ps,
2179 remove_node_from_ps (ps, ps_i);
/* Success: widen the schedule's cycle span to include C.  */
2183 ps->min_cycle = MIN (ps->min_cycle, c);
2184 ps->max_cycle = MAX (ps->max_cycle, c);
2188 /* Rotate the rows of PS such that insns scheduled at time
2189 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2191 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2193 int i, row, backward_rotates;
2194 int last_row = ps->ii - 1;
/* Nothing to do if row 0 is already the start cycle.  */
2196 if (start_cycle == 0)
2199 backward_rotates = SMODULO (start_cycle, ps->ii);
2201 /* Revisit later and optimize this into a single loop. */
2202 for (i = 0; i < backward_rotates; i++)
/* Rotate all rows one position toward row 0, wrapping the old row 0
   around to the last row.  */
2204 ps_insn_ptr first_row = ps->rows[0];
2206 for (row = 0; row < last_row; row++)
2207 ps->rows[row] = ps->rows[row+1];
2209 ps->rows[last_row] = first_row;
/* Shift the recorded cycle bounds to the new time origin.  */
2212 ps->max_cycle -= start_cycle;
2213 ps->min_cycle -= start_cycle;
2216 #endif /* INSN_SCHEDULING*/