Merge from vendor branch GCC:
[dragonfly.git] / contrib / gcc-4.1 / gcc / config / i386 / ppro.md
1 ;; Scheduling for the Intel P6 family of processors
2 ;; Copyright (C) 2004, 2005 Free Software Foundation, Inc.
3 ;;
4 ;; This file is part of GCC.
5 ;;
6 ;; GCC is free software; you can redistribute it and/or modify
7 ;; it under the terms of the GNU General Public License as published by
8 ;; the Free Software Foundation; either version 2, or (at your option)
9 ;; any later version.
10 ;;
11 ;; GCC is distributed in the hope that it will be useful,
12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 ;; GNU General Public License for more details.
15 ;;
16 ;; You should have received a copy of the GNU General Public License
17 ;; along with GCC; see the file COPYING.  If not, write to
18 ;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
19 ;; Boston, MA 02110-1301, USA.  */
20
21 ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
22 ;; and Xeon lines of CPUs.  The DFA scheduler description in this file is
23 ;; based on information that can be found in the following three documents:
24 ;;
25 ;;    "P6 Family of Processors Hardware Developer's Manual",
26 ;;    Intel, September 1999.
27 ;;
28 ;;    "Intel Architecture Optimization Manual",
29 ;;    Intel, 1999 (Order Number: 245127-001).
30 ;;
31 ;;    "How to optimize for the Pentium family of microprocessors",
32 ;;    by Agner Fog, PhD.
33 ;;
34 ;; The P6 pipeline has three major components:
35 ;;   1) the FETCH/DECODE unit, an in-order issue front-end
36 ;;   2) the DISPATCH/EXECUTE unit, which is the out-of-order core
37 ;;   3) the RETIRE unit, an in-order retirement unit
38 ;;
39 ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
40 ;; retirement unit are naturally in-order.
41 ;;
42 ;;                       BUS INTERFACE UNIT
43 ;;                     /                   \
44 ;;                L1 ICACHE             L1 DCACHE
45 ;;              /     |     \              |     \
46 ;;       DECODER0  DECODER1  DECODER2  DISP/EXEC  RETIRE
47 ;;              \     |     /              |        |
48 ;;            INSTRUCTION POOL   __________|_______/
49 ;;          (inc. reorder buffer)
50 ;;
51 ;; Since the P6 CPUs execute instructions out-of-order, the most important
52 ;; consideration in performance tuning is making sure enough micro-ops are
53 ;; ready for execution in the out-of-order core, while not stalling the
54 ;; decoder.
55 ;;
56 ;; TODO:
57 ;; - Find a less crude way to model complex instructions, in
58 ;;   particular how many cycles they take to be decoded.
59 ;; - Include decoder latencies in the total reservation latencies.
60 ;;   This isn't necessary right now because we assume for every
61 ;;   instruction that it never blocks a decoder.
62 ;; - Figure out where the p0 and p1 reservations come from.  These
63 ;;   appear not to be in the manual (e.g. why is cld "(p0+p1)*2"
64 ;;   better than "(p0|p1)*4" ???)
65 ;; - Lots more because I'm sure this is still far from optimal :-)
66
67 ;; The ppro_idiv and ppro_fdiv automata are used to model issue
68 ;; latencies of idiv and fdiv type insns.
69 (define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")  ;; declares six automata; each define_cpu_unit in this file names the one it belongs to
70
71 ;; Simple instructions of the register-register form have only one uop.
72 ;; Load instructions are also only one uop.  Store instructions decode to
73 ;; two uops, and simple read-modify instructions also take two uops.
74 ;; Simple instructions of the register-memory form have two to three uops.
75 ;; Simple read-modify-write instructions have four uops.  The rules for
76 ;; the decoder are simple:
77 ;;  - an instruction with 1 uop can be decoded by any of the three
78 ;;    decoders in one cycle.
79 ;;  - an instruction with 2 to 4 uops can be decoded only by decoder 0
80 ;;    but still in only one cycle.
81 ;;  - a complex (microcode) instruction can also only be decoded by
82 ;;    decoder 0, and this takes an unspecified number of cycles.
83 ;;    
84 ;; The goal is to schedule such that we have a few-one-one uops sequence
85 ;; in each cycle, to decode as many instructions per cycle as possible.
86 (define_cpu_unit "decoder0" "ppro_decoder")  ;; the decoder that multi-uop and complex reservations name explicitly
87 (define_cpu_unit "decoder1" "ppro_decoder")  ;; in the reservations visible here, decoder1 and decoder2 are used only via "decodern"
88 (define_cpu_unit "decoder2" "ppro_decoder")
89
90 ;; We first wish to find an instruction for decoder0, so exclude
91 ;; decoder1 and decoder2 from being reserved until decoder 0 is
92 ;; reserved.
93 (presence_set "decoder1" "decoder0")  ;; decoder1 can be reserved only when decoder0 is reserved
94 (presence_set "decoder2" "decoder0")  ;; same constraint for decoder2
95
96 ;; Most instructions can be decoded on any of the three decoders.
97 (define_reservation "decodern" "(decoder0|decoder1|decoder2)")  ;; shorthand: any one of the three decoders
98
99 ;; The out-of-order core has five pipelines.  During each cycle, the core
100 ;; may dispatch zero or one uop on the port of any of the five pipelines
101 ;; so the maximum number of dispatched uops per cycle is 5.  In practice,
102 ;; 3 uops per cycle is more realistic.
103 ;;
104 ;; Two of the five pipelines contain several execution units:
105 ;;
106 ;; Port 0       Port 1          Port 2          Port 3          Port 4
107 ;; ALU          ALU             LOAD            SAC             SDA
108 ;; FPU          JUE
109 ;; AGU          MMX
110 ;; MMX          P3FPU
111 ;; P3FPU
112 ;;
113 ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
114 ;;  JUE = Jump Execution Unit, AGU = Address Generation Unit)
115 ;;
116 (define_cpu_unit "p0,p1" "ppro_core")  ;; dispatch ports 0 and 1 (see the port table above)
117 (define_cpu_unit "p2" "ppro_load")  ;; port 2, the load port
118 (define_cpu_unit "p3,p4" "ppro_store")  ;; ports 3 and 4, the store ports
119 (define_cpu_unit "idiv" "ppro_idiv")  ;; unit used to model the issue latency of integer divides
120 (define_cpu_unit "fdiv" "ppro_fdiv")  ;; unit used to model the issue latency of FP divides
121
122 ;; Only the irregular instructions have to be modeled here.  A load
123 ;; increases the latency by 2 or 3, or by nothing if the manual gives
124 ;; a latency already.  Store latencies are not accounted for.
125 ;;
126 ;; The simple instructions follow a very regular pattern of 1 uop per
127 ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
128 ;; on port 4 and port 3.  These instructions are modelled at the bottom
129 ;; of this file.
130 ;;
131 ;; For microcoded instructions we don't know how many uops are produced.
132 ;; These instructions are the "complex" ones in the Intel manuals.  All
133 ;; we _do_ know is that they typically produce four or more uops, so
134 ;; they can only be decoded on decoder0.  Modelling their latencies
135 ;; doesn't make sense because we don't know how these instructions are
136 ;; executed in the core.  So we just model that they can only be decoded
137 ;; on decoder 0, and say that it takes a little while before the result
138 ;; is available.
139 (define_insn_reservation "ppro_complex_insn" 6  ;; catch-all for the microcoded insn types listed below; nominal latency 6
140                          (and (eq_attr "cpu" "pentiumpro")
141                               (eq_attr "type" "other,multi,call,callv,str"))
142                          "decoder0")  ;; reserves decoder0 only; no execution port is modelled
143
144 ;; imov with memory operands does not use the integer units.
145 (define_insn_reservation "ppro_imov" 1  ;; move with no memory operand: any decoder, then p0 or p1 in the next cycle
146                          (and (eq_attr "cpu" "pentiumpro")
147                               (and (eq_attr "memory" "none")
148                                    (eq_attr "type" "imov")))
149                          "decodern,(p0|p1)")
150
151 (define_insn_reservation "ppro_imov_load" 4  ;; load form: any decoder, then the load port p2; latency 4
152                          (and (eq_attr "cpu" "pentiumpro")
153                               (and (eq_attr "memory" "load")
154                                    (eq_attr "type" "imov")))
155                          "decodern,p2")
156
157 (define_insn_reservation "ppro_imov_store" 1  ;; store form: decoder0 only, then p4 and p3 in the same cycle
158                          (and (eq_attr "cpu" "pentiumpro")
159                               (and (eq_attr "memory" "store")
160                                    (eq_attr "type" "imov")))
161                          "decoder0,p4+p3")
162
163 ;; imovx always decodes to one uop, and also doesn't use the integer
164 ;; units if it has memory operands.
165 (define_insn_reservation "ppro_imovx" 1  ;; extending move with no memory operand: any decoder, then p0 or p1
166                          (and (eq_attr "cpu" "pentiumpro")
167                               (and (eq_attr "memory" "none")
168                                    (eq_attr "type" "imovx")))
169                          "decodern,(p0|p1)")
170
171 (define_insn_reservation "ppro_imovx_load" 4  ;; load form: any decoder, then the load port p2; latency 4
172                          (and (eq_attr "cpu" "pentiumpro")
173                               (and (eq_attr "memory" "load")
174                                    (eq_attr "type" "imovx")))
175                          "decodern,p2")
176
177 ;; lea executes on port 0 with latency one and throughput 1.
178 (define_insn_reservation "ppro_lea" 1  ;; lea: any decoder, then p0 for one cycle
179                          (and (eq_attr "cpu" "pentiumpro")
180                               (and (eq_attr "memory" "none")
181                                    (eq_attr "type" "lea")))
182                          "decodern,p0")
183
184 ;; Shift and rotate execute on port 0 with latency and throughput 1.
185 ;; The load and store units need to be reserved when memory operands
186 ;; are involved.
187 (define_insn_reservation "ppro_shift_rotate" 1  ;; register shift/rotate: any decoder, then p0
188                          (and (eq_attr "cpu" "pentiumpro")
189                               (and (eq_attr "memory" "none")
190                                    (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
191                          "decodern,p0")
192
193 (define_insn_reservation "ppro_shift_rotate_mem" 4  ;; any "memory" value other than none; latency 4
194                          (and (eq_attr "cpu" "pentiumpro")
195                               (and (eq_attr "memory" "!none")
196                                    (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
197                          "decoder0,p2+p0,p4+p3")  ;; decoder0; then p2 and p0 together; then p4 and p3 together
198
199 (define_insn_reservation "ppro_cld" 2  ;; cld: latency 2
200                          (and (eq_attr "cpu" "pentiumpro")
201                               (eq_attr "type" "cld"))
202                          "decoder0,(p0+p1)*2")  ;; decoder0, then p0 and p1 both reserved for two consecutive cycles
203
204 ;; The P6 has a sophisticated branch prediction mechanism to minimize
205 ;; latencies due to branching.  In particular, it has a fast way to
206 ;; execute branches that are taken multiple times (such as in loops).
207 ;; Branches not taken suffer no penalty, and correctly predicted
208 ;; branches cost only one fetch cycle.  Mispredicted branches are very
209 ;; costly: typically 15 cycles and possibly as many as 26 cycles.
210 ;;
211 ;; Unfortunately all this makes it quite difficult to properly model
212 ;; the latencies for the compiler.  Here I've made the choice to be
213 ;; optimistic and assume branches are often predicted correctly, so
214 ;; they have latency 1, and the decoders are not blocked.
215 ;;
216 ;; In addition, the model assumes a branch always decodes to only 1 uop,
217 ;; which is not exactly true because there are a few instructions that
218 ;; decode to 2 uops or microcode.  But this probably gives the best
219 ;; results because we can assume these instructions can decode on all
220 ;; decoders.
221 (define_insn_reservation "ppro_branch" 1  ;; branch with no memory operand: any decoder, then p1
222                          (and (eq_attr "cpu" "pentiumpro")
223                               (and (eq_attr "memory" "none")
224                                    (eq_attr "type" "ibr")))
225                          "decodern,p1")
226
227 ;; ??? Indirect branches probably have worse latency than this.
228 (define_insn_reservation "ppro_indirect_branch" 6  ;; branch with a memory operand: latency 6
229                          (and (eq_attr "cpu" "pentiumpro")
230                               (and (eq_attr "memory" "!none")
231                                    (eq_attr "type" "ibr")))
232                          "decoder0,p2+p1")  ;; decoder0, then p2 and p1 in the same cycle
233
234 (define_insn_reservation "ppro_leave" 4  ;; leave: latency 4
235                          (and (eq_attr "cpu" "pentiumpro")
236                               (eq_attr "type" "leave"))
237                          "decoder0,p2+(p0|p1),(p0|p1)")  ;; decoder0; then p2 plus one of p0/p1; then one of p0/p1 again
238
239 ;; imul has throughput one, but latency 4, and can only execute on port 0.
240 (define_insn_reservation "ppro_imul" 4  ;; register multiply: any decoder, then p0 for one cycle; latency 4
241                          (and (eq_attr "cpu" "pentiumpro")
242                               (and (eq_attr "memory" "none")
243                                    (eq_attr "type" "imul")))
244                          "decodern,p0")
245
246 (define_insn_reservation "ppro_imul_mem" 4  ;; multiply with a memory operand: same latency 4 as the register form
247                          (and (eq_attr "cpu" "pentiumpro")
248                               (and (eq_attr "memory" "!none")
249                                    (eq_attr "type" "imul")))
250                          "decoder0,p2+p0")  ;; decoder0, then p2 and p0 in the same cycle
251
252 ;; div and idiv are very similar, so we model them the same.
253 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
254 ;; These issue latencies are modelled via the ppro_idiv automaton.
255 (define_insn_reservation "ppro_idiv_QI" 19  ;; QImode divide, no memory operand; latency 19
256                          (and (eq_attr "cpu" "pentiumpro")
257                               (and (eq_attr "memory" "none")
258                                    (and (eq_attr "mode" "QI")
259                                         (eq_attr "type" "idiv"))))
260                          "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")  ;; idiv is reserved for 2+1+9 = 12 cycles
261
262 (define_insn_reservation "ppro_idiv_QI_load" 19  ;; QImode divide with a load; p2 joins the first execution cycle
263                          (and (eq_attr "cpu" "pentiumpro")
264                               (and (eq_attr "memory" "load")
265                                    (and (eq_attr "mode" "QI")
266                                         (eq_attr "type" "idiv"))))
267                          "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")  ;; idiv is reserved for 1+1+1+9 = 12 cycles
268
269 (define_insn_reservation "ppro_idiv_HI" 23  ;; HImode divide, no memory operand; latency 23
270                          (and (eq_attr "cpu" "pentiumpro")
271                               (and (eq_attr "memory" "none")
272                                    (and (eq_attr "mode" "HI")
273                                         (eq_attr "type" "idiv"))))
274                          "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")  ;; idiv is reserved for 3+1+17 = 21 cycles
275
276 (define_insn_reservation "ppro_idiv_HI_load" 23  ;; HImode divide with a load
277                          (and (eq_attr "cpu" "pentiumpro")
278                               (and (eq_attr "memory" "load")
279                                    (and (eq_attr "mode" "HI")
280                                         (eq_attr "type" "idiv"))))
281                          "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")  ;; idiv is reserved for 1+1+1+18 = 21 cycles
282
283 (define_insn_reservation "ppro_idiv_SI" 39  ;; SImode divide, no memory operand; latency 39
284                          (and (eq_attr "cpu" "pentiumpro")
285                               (and (eq_attr "memory" "none")
286                                    (and (eq_attr "mode" "SI")
287                                         (eq_attr "type" "idiv"))))
288                          "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")  ;; idiv is reserved for 3+1+33 = 37 cycles
289
290 (define_insn_reservation "ppro_idiv_SI_load" 39  ;; SImode divide with a load
291                          (and (eq_attr "cpu" "pentiumpro")
292                               (and (eq_attr "memory" "load")
293                                    (and (eq_attr "mode" "SI")
294                                         (eq_attr "type" "idiv"))))
295                          "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")  ;; idiv is reserved for 1+1+1+34 = 37 cycles
296
297 ;; Floating point operations always execute on port 0.
298 ;; ??? where do these latencies come from? fadd has latency 3 and
299 ;;     has throughput "1/cycle (align with FADD)".  What do they
300 ;;     mean and how can we model that?
301 (define_insn_reservation "ppro_fop" 3  ;; FP op with memory none or unknown: any decoder, then p0; latency 3
302                          (and (eq_attr "cpu" "pentiumpro")
303                               (and (eq_attr "memory" "none,unknown")
304                                    (eq_attr "type" "fop")))
305                          "decodern,p0")
306
307 (define_insn_reservation "ppro_fop_load" 5  ;; FP op with a load: latency 5
308                          (and (eq_attr "cpu" "pentiumpro")
309                               (and (eq_attr "memory" "load")
310                                    (eq_attr "type" "fop")))
311                          "decoder0,p2+p0,p0")  ;; decoder0; then p2 and p0 together; then p0 alone
312
313 (define_insn_reservation "ppro_fop_store" 3  ;; FP op with a store: latency 3
314                          (and (eq_attr "cpu" "pentiumpro")
315                               (and (eq_attr "memory" "store")
316                                    (eq_attr "type" "fop")))
317                          "decoder0,p0,p0,p0+p4+p3")  ;; decoder0; p0 for two cycles; then p0 with both store ports
318
319 (define_insn_reservation "ppro_fop_both" 5  ;; FP op that both loads and stores: latency 5
320                          (and (eq_attr "cpu" "pentiumpro")
321                               (and (eq_attr "memory" "both")
322                                    (eq_attr "type" "fop")))
323                          "decoder0,p2+p0,p0+p4+p3")  ;; decoder0; then p2 and p0; then p0 with both store ports
324
325 (define_insn_reservation "ppro_fsgn" 1  ;; fsgn type (no memory test): any decoder, then p0
326                          (and (eq_attr "cpu" "pentiumpro")
327                               (eq_attr "type" "fsgn"))
328                          "decodern,p0")
329
330 (define_insn_reservation "ppro_fistp" 5  ;; fistp type: latency 5
331                          (and (eq_attr "cpu" "pentiumpro")
332                               (eq_attr "type" "fistp"))
333                          "decoder0,p0*2,p4+p3")  ;; decoder0; p0 for two cycles; then both store ports
334
335 (define_insn_reservation "ppro_fcmov" 2  ;; fcmov type: latency 2
336                          (and (eq_attr "cpu" "pentiumpro")
337                               (eq_attr "type" "fcmov"))
338                          "decoder0,p0*2")  ;; decoder0, then p0 for two consecutive cycles
339
340 (define_insn_reservation "ppro_fcmp" 1  ;; FP compare, no memory operand: any decoder, then p0
341                          (and (eq_attr "cpu" "pentiumpro")
342                               (and (eq_attr "memory" "none")
343                                    (eq_attr "type" "fcmp")))
344                          "decodern,p0")
345
346 (define_insn_reservation "ppro_fcmp_load" 4  ;; FP compare with a load: decoder0, then p2 and p0 together; latency 4
347                          (and (eq_attr "cpu" "pentiumpro")
348                               (and (eq_attr "memory" "load")
349                                    (eq_attr "type" "fcmp")))
350                          "decoder0,p2+p0")
351
352 (define_insn_reservation "ppro_fmov" 1  ;; FP move, no memory operand: any decoder, then p0
353                          (and (eq_attr "cpu" "pentiumpro")
354                               (and (eq_attr "memory" "none")
355                                    (eq_attr "type" "fmov")))
356                          "decodern,p0")
357
358 (define_insn_reservation "ppro_fmov_load" 1  ;; FP load in any mode except XF: any decoder, then p2
359                          (and (eq_attr "cpu" "pentiumpro")
360                               (and (eq_attr "memory" "load")
361                                    (and (eq_attr "mode" "!XF")
362                                         (eq_attr "type" "fmov"))))
363                          "decodern,p2")
364
365 (define_insn_reservation "ppro_fmov_XF_load" 3  ;; XFmode FP load: latency 3
366                          (and (eq_attr "cpu" "pentiumpro")
367                               (and (eq_attr "memory" "load")
368                                    (and (eq_attr "mode" "XF")
369                                         (eq_attr "type" "fmov"))))
370                          "decoder0,(p2+p0)*2")  ;; decoder0, then p2 and p0 together for two cycles
371
372 (define_insn_reservation "ppro_fmov_store" 1  ;; FP store in any mode except XF
373                          (and (eq_attr "cpu" "pentiumpro")
374                               (and (eq_attr "memory" "store")
375                                    (and (eq_attr "mode" "!XF")
376                                         (eq_attr "type" "fmov"))))
377                          "decodern,p0")  ;; NOTE(review): reserves p0 only, no p3/p4 as in the other store reservations -- confirm intended
378
379 (define_insn_reservation "ppro_fmov_XF_store" 3  ;; XFmode FP store: latency 3
380                          (and (eq_attr "cpu" "pentiumpro")
381                               (and (eq_attr "memory" "store")
382                                    (and (eq_attr "mode" "XF")
383                                         (eq_attr "type" "fmov"))))
384                          "decoder0,(p0+p4),(p0+p3)")  ;; decoder0; then p0 with p4; then p0 with p3
385
386 ;; fmul executes on port 0 with latency 5.  It has issue latency 2,
387 ;; but we don't model this.
388 (define_insn_reservation "ppro_fmul" 5  ;; FP multiply, no memory operand: latency 5
389                          (and (eq_attr "cpu" "pentiumpro")
390                               (and (eq_attr "memory" "none")
391                                    (eq_attr "type" "fmul")))
392                          "decoder0,p0*2")  ;; NOTE(review): p0 is held for two cycles here, although the comment above says issue latency 2 is not modelled -- confirm
393
394 (define_insn_reservation "ppro_fmul_load" 6  ;; FP multiply with a load: latency 6
395                          (and (eq_attr "cpu" "pentiumpro")
396                               (and (eq_attr "memory" "load")
397                                    (eq_attr "type" "fmul")))
398                          "decoder0,p2+p0,p0")  ;; decoder0; then p2 and p0 together; then p0 alone
399
400 ;; fdiv latencies depend on the mode of the operands.  XFmode gives
401 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
402 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
403 ;; that.  Throughput is equal to latency - 1, which we model using the
404 ;; ppro_fdiv automaton.
405 (define_insn_reservation "ppro_fdiv_SF" 18  ;; SFmode fdiv/fpspc, no memory operand; latency 18
406                          (and (eq_attr "cpu" "pentiumpro")
407                               (and (eq_attr "memory" "none")
408                                    (and (eq_attr "mode" "SF")
409                                         (eq_attr "type" "fdiv,fpspc"))))
410                          "decodern,p0+fdiv,fdiv*16")  ;; fdiv is reserved for 1+16 = 17 cycles
411
412 (define_insn_reservation "ppro_fdiv_SF_load" 19  ;; SFmode with a load: decoder0 only, p2 joins the first cycle; latency 19
413                          (and (eq_attr "cpu" "pentiumpro")
414                               (and (eq_attr "memory" "load")
415                                    (and (eq_attr "mode" "SF")
416                                         (eq_attr "type" "fdiv,fpspc"))))
417                          "decoder0,p2+p0+fdiv,fdiv*16")  ;; fdiv is reserved for 1+16 = 17 cycles
418
419 (define_insn_reservation "ppro_fdiv_DF" 32  ;; DFmode fdiv/fpspc, no memory operand; latency 32
420                          (and (eq_attr "cpu" "pentiumpro")
421                               (and (eq_attr "memory" "none")
422                                    (and (eq_attr "mode" "DF")
423                                         (eq_attr "type" "fdiv,fpspc"))))
424                          "decodern,p0+fdiv,fdiv*30")  ;; fdiv is reserved for 1+30 = 31 cycles
425
426 (define_insn_reservation "ppro_fdiv_DF_load" 33  ;; DFmode with a load; latency 33
427                          (and (eq_attr "cpu" "pentiumpro")
428                               (and (eq_attr "memory" "load")
429                                    (and (eq_attr "mode" "DF")
430                                         (eq_attr "type" "fdiv,fpspc"))))
431                          "decoder0,p2+p0+fdiv,fdiv*30")  ;; fdiv is reserved for 1+30 = 31 cycles
432
433 (define_insn_reservation "ppro_fdiv_XF" 38  ;; XFmode fdiv/fpspc, no memory operand; latency 38
434                          (and (eq_attr "cpu" "pentiumpro")
435                               (and (eq_attr "memory" "none")
436                                    (and (eq_attr "mode" "XF")
437                                         (eq_attr "type" "fdiv,fpspc"))))
438                          "decodern,p0+fdiv,fdiv*36")  ;; fdiv is reserved for 1+36 = 37 cycles
439
440 (define_insn_reservation "ppro_fdiv_XF_load" 39  ;; XFmode with a load; latency 39
441                          (and (eq_attr "cpu" "pentiumpro")
442                               (and (eq_attr "memory" "load")
443                                    (and (eq_attr "mode" "XF")
444                                         (eq_attr "type" "fdiv,fpspc"))))
445                          "decoder0,p2+p0+fdiv,fdiv*36")  ;; fdiv is reserved for 1+36 = 37 cycles
446
447 ;; MMX instructions can execute on either port 0 or port 1 with a
448 ;; throughput of 1/cycle.
449 ;;   on port 0: - ALU (latency 1)
450 ;;              - Multiplier Unit (latency 3)
451 ;;   on port 1: - ALU (latency 1)
452 ;;              - Shift Unit (latency 1)
453 ;;
454 ;; MMX instructions are either of the type reg-reg, or read-modify, and
455 ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
456 ;; so they behave as "simple" instructions that need no special modelling.
457 ;; We only have to model mmxshft and mmxmul.
458 (define_insn_reservation "ppro_mmx_shft" 1  ;; MMX shift, no memory operand: any decoder, then p1
459                          (and (eq_attr "cpu" "pentiumpro")
460                               (and (eq_attr "memory" "none")
461                                    (eq_attr "type" "mmxshft")))
462                          "decodern,p1")
463
464 (define_insn_reservation "ppro_mmx_shft_load" 2  ;; MMX shift with a load: latency 2
465                          (and (eq_attr "cpu" "pentiumpro")
466                               (and (eq_attr "memory" "load")  ;; must be "load": testing "none" makes this condition identical to ppro_mmx_shft's, so this reservation is shadowed
467                                    (eq_attr "type" "mmxshft")))
468                          "decoder0,p2+p1")  ;; decoder0, then the load port p2 and p1 in the same cycle
469
470 (define_insn_reservation "ppro_mmx_mul" 3  ;; MMX multiply, no memory operand: any decoder, then p0; latency 3
471                          (and (eq_attr "cpu" "pentiumpro")
472                               (and (eq_attr "memory" "none")
473                                    (eq_attr "type" "mmxmul")))
474                          "decodern,p0")
475
476 (define_insn_reservation "ppro_mmx_mul_load" 3  ;; MMX multiply with a load: latency 3
477                          (and (eq_attr "cpu" "pentiumpro")
478                               (and (eq_attr "memory" "load")  ;; must be "load": testing "none" makes this condition identical to ppro_mmx_mul's, so this reservation is shadowed
479                                    (eq_attr "type" "mmxmul")))
480                          "decoder0,p2+p0")  ;; decoder0, then the load port p2 and p0 in the same cycle
481
482 (define_insn_reservation "ppro_sse_mmxcvt" 4  ;; DImode mmxcvt (no memory test): any decoder, then p1; latency 4
483                          (and (eq_attr "cpu" "pentiumpro")
484                               (and (eq_attr "mode" "DI")
485                                    (eq_attr "type" "mmxcvt")))
486                          "decodern,p1")
487
488 ;; FIXME: These are Pentium III only, but we cannot tell here if
489 ;; we're generating code for PentiumPro/Pentium II or Pentium III
490 ;; (define_insn_reservation "ppro_sse_mmxshft" 2
491 ;;                       (and (eq_attr "cpu" "pentiumpro")
492 ;;                            (and (eq_attr "mode" "DI")
493 ;;                                 (eq_attr "type" "mmxshft")))
494 ;;                       "decodern,p0")
495
496 ;; SSE is very complicated, and takes a bit more effort.
497 ;; ??? I assumed that all SSE instructions decode on decoder0,
498 ;;     but is this correct?
499
500 ;; The sfence instruction.
501 (define_insn_reservation "ppro_sse_sfence" 3  ;; selected by type "sse" with memory "unknown"; latency 3
502                          (and (eq_attr "cpu" "pentiumpro")
503                               (and (eq_attr "memory" "unknown")
504                                    (eq_attr "type" "sse")))
505                          "decoder0,p4+p3")  ;; decoder0, then both store ports in the same cycle
506
507 ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
508 (define_insn_reservation "ppro_sse_SF" 3  ;; SFmode insns of type "sse" (no memory test): any decoder, then p0; latency 3
509                          (and (eq_attr "cpu" "pentiumpro")
510                               (and (eq_attr "mode" "SF")
511                                    (eq_attr "type" "sse")))
512                          "decodern,p0")
513
514 (define_insn_reservation "ppro_sse_add_SF" 3  ;; scalar SSE add, no memory operand: any decoder, then p1
515                          (and (eq_attr "cpu" "pentiumpro")
516                               (and (eq_attr "memory" "none")
517                                    (and (eq_attr "mode" "SF")
518                                         (eq_attr "type" "sseadd"))))
519                          "decodern,p1")
520
521 (define_insn_reservation "ppro_sse_add_SF_load" 3  ;; scalar SSE add with a load: decoder0, then p2 and p1 together
522                          (and (eq_attr "cpu" "pentiumpro")
523                               (and (eq_attr "memory" "load")
524                                    (and (eq_attr "mode" "SF")
525                                         (eq_attr "type" "sseadd"))))
526                          "decoder0,p2+p1")
527
528 (define_insn_reservation "ppro_sse_cmp_SF" 3  ;; scalar SSE compare, no memory operand: decoder0 only, then p1
529                          (and (eq_attr "cpu" "pentiumpro")
530                               (and (eq_attr "memory" "none")
531                                    (and (eq_attr "mode" "SF")
532                                         (eq_attr "type" "ssecmp"))))
533                          "decoder0,p1")
534
535 (define_insn_reservation "ppro_sse_cmp_SF_load" 3  ;; scalar SSE compare with a load: decoder0, then p2 and p1 together
536                          (and (eq_attr "cpu" "pentiumpro")
537                               (and (eq_attr "memory" "load")
538                                    (and (eq_attr "mode" "SF")
539                                         (eq_attr "type" "ssecmp"))))
540                          "decoder0,p2+p1")
541
542 (define_insn_reservation "ppro_sse_comi_SF" 1  ;; scalar SSE comi, no memory operand: any decoder, then p0
543                          (and (eq_attr "cpu" "pentiumpro")
544                               (and (eq_attr "memory" "none")
545                                    (and (eq_attr "mode" "SF")
546                                         (eq_attr "type" "ssecomi"))))
547                          "decodern,p0")
548
549 (define_insn_reservation "ppro_sse_comi_SF_load" 1  ;; scalar SSE comi with a load: decoder0, then p2 and p0 together; latency stays 1
550                          (and (eq_attr "cpu" "pentiumpro")
551                               (and (eq_attr "memory" "load")
552                                    (and (eq_attr "mode" "SF")
553                                         (eq_attr "type" "ssecomi"))))
554                          "decoder0,p2+p0")
555
556 (define_insn_reservation "ppro_sse_mul_SF" 4  ;; scalar SSE multiply, no memory operand: any decoder, then p0; latency 4
557                          (and (eq_attr "cpu" "pentiumpro")
558                               (and (eq_attr "memory" "none")
559                                    (and (eq_attr "mode" "SF")
560                                         (eq_attr "type" "ssemul"))))
561                         "decodern,p0")
562
563 (define_insn_reservation "ppro_sse_mul_SF_load" 4  ;; scalar SSE multiply with a load: decoder0, then p2 and p0 together
564                          (and (eq_attr "cpu" "pentiumpro")
565                               (and (eq_attr "memory" "load")
566                                    (and (eq_attr "mode" "SF")
567                                         (eq_attr "type" "ssemul"))))
568                         "decoder0,p2+p0")
569
570 ;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
571 (define_insn_reservation "ppro_sse_div_SF" 18  ;; scalar SSE divide, no memory operand; latency 18
572                          (and (eq_attr "cpu" "pentiumpro")
573                               (and (eq_attr "memory" "none")
574                                    (and (eq_attr "mode" "SF")
575                                         (eq_attr "type" "ssediv"))))
576                          "decoder0,p0*17")  ;; decoder0, then p0 itself is reserved for 17 cycles (no separate divider unit is used)
577
578 (define_insn_reservation "ppro_sse_div_SF_load" 18  ;; scalar SSE divide with a load; latency 18
579                          (and (eq_attr "cpu" "pentiumpro")
580                               (and (eq_attr "memory" "load")  ;; must be "load": testing "none" makes this condition identical to ppro_sse_div_SF's, so this reservation is shadowed
581                                    (and (eq_attr "mode" "SF")
582                                         (eq_attr "type" "ssediv"))))
583                          "decoder0,(p2+p0),p0*16")  ;; decoder0; p2 and p0 together; then p0 for 16 more cycles (17 in total)
584
585 (define_insn_reservation "ppro_sse_icvt_SF" 4  ;; SFmode sseicvt (no memory test); latency 4
586                          (and (eq_attr "cpu" "pentiumpro")
587                               (and (eq_attr "mode" "SF")
588                                    (eq_attr "type" "sseicvt")))
589                          "decoder0,(p2+p1)*2")  ;; decoder0, then p2 and p1 together for two cycles
590
591 (define_insn_reservation "ppro_sse_icvt_SI" 3  ;; SImode sseicvt (no memory test); latency 3
592                          (and (eq_attr "cpu" "pentiumpro")
593                               (and (eq_attr "mode" "SI")
594                                    (eq_attr "type" "sseicvt")))
595                          "decoder0,(p2+p1)")  ;; decoder0, then p2 and p1 together for one cycle
596
597 (define_insn_reservation "ppro_sse_mov_SF" 3  ;; scalar SSE move, no memory operand: decoder0, then p0 or p1
598                          (and (eq_attr "cpu" "pentiumpro")
599                               (and (eq_attr "memory" "none")
600                                    (and (eq_attr "mode" "SF")
601                                         (eq_attr "type" "ssemov"))))
602                          "decoder0,(p0|p1)")
603
604 (define_insn_reservation "ppro_sse_mov_SF_load" 3  ;; scalar SSE load: decoder0, then p2 together with p0 or p1
605                          (and (eq_attr "cpu" "pentiumpro")
606                               (and (eq_attr "memory" "load")
607                                    (and (eq_attr "mode" "SF")
608                                         (eq_attr "type" "ssemov"))))
609                          "decoder0,p2+(p0|p1)")
610
611 (define_insn_reservation "ppro_sse_mov_SF_store" 3  ;; scalar SSE store: decoder0, then both store ports together
612                          (and (eq_attr "cpu" "pentiumpro")
613                               (and (eq_attr "memory" "store")
614                                    (and (eq_attr "mode" "SF")
615                                         (eq_attr "type" "ssemov"))))
616                          "decoder0,p4+p3")
617
618 (define_insn_reservation "ppro_sse_V4SF" 4  ;; V4SFmode insns of type "sse" (no memory test); latency 4
619                          (and (eq_attr "cpu" "pentiumpro")
620                               (and (eq_attr "mode" "V4SF")
621                                    (eq_attr "type" "sse")))
622                          "decoder0,p1*2")  ;; decoder0, then p1 for two consecutive cycles
623
;; V4SF-mode "sseadd" insns with memory attribute "none" on pentiumpro:
;; default latency 3.  Reserves decoder0, then p1 for two consecutive
;; cycles.
624 (define_insn_reservation "ppro_sse_add_V4SF" 3
625                          (and (eq_attr "cpu" "pentiumpro")
626                               (and (eq_attr "memory" "none")
627                                    (and (eq_attr "mode" "V4SF")
628                                         (eq_attr "type" "sseadd"))))
629                          "decoder0,p1*2")
630
;; V4SF-mode "sseadd" insns with memory attribute "load" on pentiumpro:
;; default latency 3.  Reserves decoder0, then p2 and p1 together for two
;; consecutive cycles.
631 (define_insn_reservation "ppro_sse_add_V4SF_load" 3
632                          (and (eq_attr "cpu" "pentiumpro")
633                               (and (eq_attr "memory" "load")
634                                    (and (eq_attr "mode" "V4SF")
635                                         (eq_attr "type" "sseadd"))))
636                          "decoder0,(p2+p1)*2")
637
;; V4SF-mode "ssecmp" insns with memory attribute "none" on pentiumpro:
;; default latency 3.  Reserves decoder0, then p1 for two consecutive
;; cycles.
638 (define_insn_reservation "ppro_sse_cmp_V4SF" 3
639                          (and (eq_attr "cpu" "pentiumpro")
640                               (and (eq_attr "memory" "none")
641                                    (and (eq_attr "mode" "V4SF")
642                                         (eq_attr "type" "ssecmp"))))
643                          "decoder0,p1*2")
644
;; V4SF-mode "ssecmp" insns with memory attribute "load" on pentiumpro:
;; default latency 3.  Reserves decoder0, then p2 and p1 together for two
;; consecutive cycles.
645 (define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
646                          (and (eq_attr "cpu" "pentiumpro")
647                               (and (eq_attr "memory" "load")
648                                    (and (eq_attr "mode" "V4SF")
649                                         (eq_attr "type" "ssecmp"))))
650                          "decoder0,(p2+p1)*2")
651
;; V4SF-mode "ssecvt" insns whose memory attribute is "none" or "unknown"
;; on pentiumpro: default latency 3.  Reserves decoder0, then p1 for two
;; consecutive cycles.
652 (define_insn_reservation "ppro_sse_cvt_V4SF" 3
653                          (and (eq_attr "cpu" "pentiumpro")
654                               (and (eq_attr "memory" "none,unknown")
655                                    (and (eq_attr "mode" "V4SF")
656                                         (eq_attr "type" "ssecvt"))))
657                          "decoder0,p1*2")
658
;; V4SF-mode "ssecvt" insns on pentiumpro whose memory attribute is anything
;; other than "none" or "unknown" (the leading "!" negates the whole list):
;; default latency 4.  Reserves decoder0, then p1, then p4 and p3 together.
;; The type test is "ssecvt", as the reservation name says: with "ssecmp"
;; the condition overlapped "ppro_sse_cmp_V4SF_load" for V4SF compare loads
;; (which reservation is used for an insn matching two conditions is
;; undefined), and V4SF conversions with a memory operand matched none of
;; the ppro_sse_cvt_V4SF* reservations.
659 (define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
660                          (and (eq_attr "cpu" "pentiumpro")
661                               (and (eq_attr "memory" "!none,unknown")
662                                    (and (eq_attr "mode" "V4SF")
663                                         (eq_attr "type" "ssecvt"))))
664                          "decoder0,p1,p4+p3")
665
;; V4SF-mode "ssemul" insns with memory attribute "none" on pentiumpro:
;; default latency 5.  Reserves decoder0, then p0 for two consecutive
;; cycles.
666 (define_insn_reservation "ppro_sse_mul_V4SF" 5
667                          (and (eq_attr "cpu" "pentiumpro")
668                               (and (eq_attr "memory" "none")
669                                    (and (eq_attr "mode" "V4SF")
670                                         (eq_attr "type" "ssemul"))))
671                         "decoder0,p0*2")
672
;; V4SF-mode "ssemul" insns with memory attribute "load" on pentiumpro:
;; default latency 5.  Reserves decoder0, then p2 and p0 together for two
;; consecutive cycles.
673 (define_insn_reservation "ppro_sse_mul_V4SF_load" 5
674                          (and (eq_attr "cpu" "pentiumpro")
675                               (and (eq_attr "memory" "load")
676                                    (and (eq_attr "mode" "V4SF")
677                                         (eq_attr "type" "ssemul"))))
678                         "decoder0,(p2+p0)*2")
679
;; V4SF-mode "ssediv" insns with memory attribute "none" on pentiumpro:
;; default latency 48.  Reserves decoder0, then keeps p0 reserved for 34
;; consecutive cycles, which is what the FIXME below questions.
680 ;; FIXME: p0 really closed this long???
681 (define_insn_reservation "ppro_sse_div_V4SF" 48
682                          (and (eq_attr "cpu" "pentiumpro")
683                               (and (eq_attr "memory" "none")
684                                    (and (eq_attr "mode" "V4SF")
685                                         (eq_attr "type" "ssediv"))))
686                          "decoder0,p0*34")
687
;; V4SF-mode "ssediv" insns with memory attribute "load" on pentiumpro:
;; default latency 48.  Reserves decoder0, then p2 and p0 together for two
;; cycles, then p0 alone for 32 more cycles (34 cycles of p0 in total).
688 (define_insn_reservation "ppro_sse_div_V4SF_load" 48
689                          (and (eq_attr "cpu" "pentiumpro")
690                               (and (eq_attr "memory" "load")
691                                    (and (eq_attr "mode" "V4SF")
692                                         (eq_attr "type" "ssediv"))))
693                          "decoder0,(p2+p0)*2,p0*32")
694
;; V4SF-mode insns of type "sselog" or "sselog1" with memory attribute
;; "none" on pentiumpro: default latency 2.  Reserves decodern, then p1
;; for one cycle.
;; NOTE(review): the other register-only V4SF reservations in this file
;; section use decoder0 and a two-cycle unit reservation; this one uses
;; decodern and a single p1 cycle -- confirm that is intended.
695 (define_insn_reservation "ppro_sse_log_V4SF" 2
696                          (and (eq_attr "cpu" "pentiumpro")
697                               (and (eq_attr "memory" "none")
698                                    (and (eq_attr "mode" "V4SF")
699                                         (eq_attr "type" "sselog,sselog1"))))
700                          "decodern,p1")
701
;; V4SF-mode insns of type "sselog" or "sselog1" with memory attribute
;; "load" on pentiumpro: default latency 2.  Reserves decoder0, then p2 and
;; p1 together for one cycle.
702 (define_insn_reservation "ppro_sse_log_V4SF_load" 2
703                          (and (eq_attr "cpu" "pentiumpro")
704                               (and (eq_attr "memory" "load")
705                                    (and (eq_attr "mode" "V4SF")
706                                         (eq_attr "type" "sselog,sselog1"))))
707                          "decoder0,(p2+p1)")
708
;; V4SF-mode "ssemov" insns with memory attribute "none" on pentiumpro:
;; default latency 1.  Reserves decoder0, then two consecutive cycles each
;; on either p0 or p1.
709 (define_insn_reservation "ppro_sse_mov_V4SF" 1
710                          (and (eq_attr "cpu" "pentiumpro")
711                               (and (eq_attr "memory" "none")
712                                    (and (eq_attr "mode" "V4SF")
713                                         (eq_attr "type" "ssemov"))))
714                          "decoder0,(p0|p1)*2")
715
;; V4SF-mode "ssemov" insns with memory attribute "load" on pentiumpro:
;; default latency 2.  Reserves decoder0, then p2 for two consecutive
;; cycles.
716 (define_insn_reservation "ppro_sse_mov_V4SF_load" 2
717                          (and (eq_attr "cpu" "pentiumpro")
718                               (and (eq_attr "memory" "load")
719                                    (and (eq_attr "mode" "V4SF")
720                                         (eq_attr "type" "ssemov"))))
721                          "decoder0,p2*2")
722
;; V4SF-mode "ssemov" insns with memory attribute "store" on pentiumpro:
;; default latency 3.  Reserves decoder0, then p4 and p3 together for two
;; consecutive cycles.
723 (define_insn_reservation "ppro_sse_mov_V4SF_store" 3
724                          (and (eq_attr "cpu" "pentiumpro")
725                               (and (eq_attr "memory" "store")
726                                    (and (eq_attr "mode" "V4SF")
727                                         (eq_attr "type" "ssemov"))))
728                          "decoder0,(p4+p3)*2")
729
730 ;; All other instructions are modelled as simple instructions.
731 ;; We have already modelled all i387 floating point instructions, so all
732 ;; other instructions execute on either port 0 or port 1.  This includes
733 ;; the ALU units, and the MMX units.
734 ;;
735 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
736 ;; the three decoders.
;; Catch-all for the listed integer/MMX/SSE-integer insn types when the
;; memory attribute is "none" or "unknown": default latency 1.  Reserves
;; decodern, then either p0 or p1 for one cycle.
737 (define_insn_reservation "ppro_insn" 1
738                          (and (eq_attr "cpu" "pentiumpro")
739                               (and (eq_attr "memory" "none,unknown")
740                                    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
741                          "decodern,(p0|p1)")
742
743 ;; read-modify and register-memory instructions have two or three uops,
744 ;; so they have to be decoded on decoder0.
;; Same insn type list as "ppro_insn", for memory attribute "load":
;; default latency 3.  Reserves decoder0, then p2 together with either p0
;; or p1 for one cycle.
745 (define_insn_reservation "ppro_insn_load" 3
746                          (and (eq_attr "cpu" "pentiumpro")
747                               (and (eq_attr "memory" "load")
748                                    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
749                          "decoder0,p2+(p0|p1)")
750
;; Same insn type list as "ppro_insn", for memory attribute "store":
;; default latency 1.  Reserves decoder0, then either p0 or p1 for one
;; cycle, then p4 and p3 together in the cycle after that.
751 (define_insn_reservation "ppro_insn_store" 1
752                          (and (eq_attr "cpu" "pentiumpro")
753                               (and (eq_attr "memory" "store")
754                                    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
755                          "decoder0,(p0|p1),p4+p3")
756
757 ;; read-modify-store instructions produce 4 uops so they have to be
758 ;; decoded on decoder0 as well.
;; Same insn type list as "ppro_insn", for memory attribute "both":
;; default latency 4.  Reserves decoder0, then p2 together with either p0
;; or p1 for one cycle, then p4 and p3 together in the following cycle.
759 (define_insn_reservation "ppro_insn_both" 4
760                          (and (eq_attr "cpu" "pentiumpro")
761                               (and (eq_attr "memory" "both")
762                                    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
763                          "decoder0,p2+(p0|p1),p4+p3")
764