/* Subroutines used for code generation on IA-32.
   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "tree.h"
#include "tm_p.h"
#include "regs.h"
#include "hard-reg-set.h"
#include "insn-config.h"
#include "conditions.h"
#include "output.h"
#include "insn-codes.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "function.h"
#include "recog.h"
#include "expr.h"
#include "optabs.h"
#include "diagnostic-core.h"
#include "toplev.h"
#include "basic-block.h"
#include "ggc.h"
#include "target.h"
#include "target-def.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "cgraph.h"
#include "gimple.h"
#include "dwarf2.h"
#include "df.h"
#include "tm-constrs.h"
#include "params.h"
#include "cselib.h"
#include "debug.h"
#include "sched-int.h"
#include "sbitmap.h"
#include "fibheap.h"
#include "opts.h"
#include "diagnostic.h"

enum upper_128bits_state
{
  unknown = 0,
  unused,
  used
};

typedef struct block_info_def
{
  /* State of the upper 128bits of AVX registers at exit.  */
  enum upper_128bits_state state;
  /* TRUE if the state of the upper 128bits of AVX registers is unchanged
     in this block.  */
  bool unchanged;
  /* TRUE if block has been processed.  */
  bool processed;
  /* TRUE if block has been scanned.  */
  bool scanned;
  /* Previous state of the upper 128bits of AVX registers at entry.  */
  enum upper_128bits_state prev;
} *block_info;

#define BLOCK_INFO(B) ((block_info) (B)->aux)
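
/* The per-block info lives in each basic block's AUX field; it is
   allocated by alloc_aux_for_blocks and released by free_aux_for_blocks
   in move_or_delete_vzeroupper below, so BLOCK_INFO is only meaningful
   while that pass is running.  */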
88
89enum call_avx256_state
90{
91 /* Callee returns 256bit AVX register. */
92 callee_return_avx256 = -1,
93 /* Callee returns and passes 256bit AVX register. */
94 callee_return_pass_avx256,
95 /* Callee passes 256bit AVX register. */
96 callee_pass_avx256,
97 /* Callee doesn't return nor passe 256bit AVX register, or no
98 256bit AVX register in function return. */
99 call_no_avx256,
100 /* vzeroupper intrinsic. */
101 vzeroupper_intrinsic
102};
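
/* These values travel as the operand of the vzeroupper UNSPEC_VOLATILE
   pattern and are read back below with INTVAL (XVECEXP (pat, 0, 0)).
   For example, a vzeroupper emitted before a call that neither passes
   nor returns a 256bit AVX value would carry call_no_avx256.  */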

/* Check if a 256bit AVX register is referenced in stores.  */

static void
check_avx256_stores (rtx dest, const_rtx set, void *data)
{
  if ((REG_P (dest)
       && VALID_AVX256_REG_MODE (GET_MODE (dest)))
      || (GET_CODE (set) == SET
          && REG_P (SET_SRC (set))
          && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
    {
      enum upper_128bits_state *state
        = (enum upper_128bits_state *) data;
      *state = used;
    }
}
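
/* A minimal usage sketch (illustrative; this is how the scan in
   move_or_delete_vzeroupper_2 below drives the callback):

     enum upper_128bits_state state = unknown;
     note_stores (PATTERN (insn), check_avx256_stores, &state);

   STATE is promoted to `used' if any store destination (or the source
   of a SET) is a register in a 256bit AVX mode; it is never reset.  */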

/* Helper function for move_or_delete_vzeroupper_1.  Look for vzeroupper
   in basic block BB.  Delete it if upper 128bit AVX registers are
   unused.  If it isn't deleted, move it to just before a jump insn.

   STATE is the state of the upper 128bits of AVX registers at entry.  */

static void
move_or_delete_vzeroupper_2 (basic_block bb,
                             enum upper_128bits_state state)
{
  rtx insn, bb_end;
  rtx vzeroupper_insn = NULL_RTX;
  rtx pat;
  int avx256;
  bool unchanged;

  if (BLOCK_INFO (bb)->unchanged)
    {
      if (dump_file)
        fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
                 bb->index, state);

      BLOCK_INFO (bb)->state = state;
      return;
    }

  if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
    {
      if (dump_file)
        fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
                 bb->index, BLOCK_INFO (bb)->state);
      return;
    }

  BLOCK_INFO (bb)->prev = state;

  if (dump_file)
    fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
             bb->index, state);

  unchanged = true;

  /* BB_END changes when it is deleted.  */
  bb_end = BB_END (bb);
  insn = BB_HEAD (bb);
  while (insn != bb_end)
    {
      insn = NEXT_INSN (insn);

      if (!NONDEBUG_INSN_P (insn))
        continue;

      /* Move vzeroupper before jump/call.  */
      if (JUMP_P (insn) || CALL_P (insn))
        {
          if (!vzeroupper_insn)
            continue;

          if (PREV_INSN (insn) != vzeroupper_insn)
            {
              if (dump_file)
                {
                  fprintf (dump_file, "Move vzeroupper after:\n");
                  print_rtl_single (dump_file, PREV_INSN (insn));
                  fprintf (dump_file, "before:\n");
                  print_rtl_single (dump_file, insn);
                }
              reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
                                  PREV_INSN (insn));
            }
          vzeroupper_insn = NULL_RTX;
          continue;
        }

      pat = PATTERN (insn);

      /* Check insn for vzeroupper intrinsic.  */
      if (GET_CODE (pat) == UNSPEC_VOLATILE
          && XINT (pat, 1) == UNSPECV_VZEROUPPER)
        {
          if (dump_file)
            {
              /* Found vzeroupper intrinsic.  */
              fprintf (dump_file, "Found vzeroupper:\n");
              print_rtl_single (dump_file, insn);
            }
        }
      else
        {
          /* Check insn for vzeroall intrinsic.  */
          if (GET_CODE (pat) == PARALLEL
              && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
              && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
            {
              state = unused;
              unchanged = false;

              /* Delete pending vzeroupper insertion.  */
              if (vzeroupper_insn)
                {
                  delete_insn (vzeroupper_insn);
                  vzeroupper_insn = NULL_RTX;
                }
            }
          else if (state != used)
            {
              note_stores (pat, check_avx256_stores, &state);
              if (state == used)
                unchanged = false;
            }
          continue;
        }

      /* Process vzeroupper intrinsic.  */
      avx256 = INTVAL (XVECEXP (pat, 0, 0));

      if (state == unused)
        {
          /* Since the upper 128bits are cleared, callee must not pass
             256bit AVX register.  We only need to check if callee
             returns 256bit AVX register.  */
          if (avx256 == callee_return_avx256)
            {
              state = used;
              unchanged = false;
            }

          /* Remove unnecessary vzeroupper since upper 128bits are
             cleared.  */
          if (dump_file)
            {
              fprintf (dump_file, "Delete redundant vzeroupper:\n");
              print_rtl_single (dump_file, insn);
            }
          delete_insn (insn);
        }
      else
        {
          /* Set state to UNUSED if callee doesn't return 256bit AVX
             register.  */
          if (avx256 != callee_return_pass_avx256)
            state = unused;

          if (avx256 == callee_return_pass_avx256
              || avx256 == callee_pass_avx256)
            {
              /* Must remove vzeroupper since callee passes in 256bit
                 AVX register.  */
              if (dump_file)
                {
                  fprintf (dump_file, "Delete callee pass vzeroupper:\n");
                  print_rtl_single (dump_file, insn);
                }
              delete_insn (insn);
            }
          else
            {
              vzeroupper_insn = insn;
              unchanged = false;
            }
        }
    }

  BLOCK_INFO (bb)->state = state;
  BLOCK_INFO (bb)->unchanged = unchanged;
  BLOCK_INFO (bb)->scanned = true;

  if (dump_file)
    fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
             bb->index, unchanged ? "unchanged" : "changed",
             state);
}

/* Helper function for move_or_delete_vzeroupper.  Process vzeroupper
   in BLOCK and check its predecessor blocks.  Treat UNKNOWN state
   as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
   state is changed.  */

static bool
move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
{
  edge e;
  edge_iterator ei;
  enum upper_128bits_state state, old_state, new_state;
  bool seen_unknown;

  if (dump_file)
    fprintf (dump_file, " Process [bb %i]: status: %d\n",
             block->index, BLOCK_INFO (block)->processed);

  if (BLOCK_INFO (block)->processed)
    return false;

  state = unused;

  /* Check all predecessor edges of this block.  */
  seen_unknown = false;
  FOR_EACH_EDGE (e, ei, block->preds)
    {
      if (e->src == block)
        continue;
      switch (BLOCK_INFO (e->src)->state)
        {
        case unknown:
          if (!unknown_is_unused)
            seen_unknown = true;
          /* FALLTHRU */
        case unused:
          break;
        case used:
          state = used;
          goto done;
        }
    }

  if (seen_unknown)
    state = unknown;

done:
  old_state = BLOCK_INFO (block)->state;
  move_or_delete_vzeroupper_2 (block, state);
  new_state = BLOCK_INFO (block)->state;

  if (state != unknown || new_state == used)
    BLOCK_INFO (block)->processed = true;

  /* Need to rescan if the upper 128bits of AVX registers are changed
     to USED at exit.  */
  if (new_state != old_state)
    {
      if (new_state == used)
        cfun->machine->rescan_vzeroupper_p = 1;
      return true;
    }
  else
    return false;
}
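
/* The predecessor scan above is a meet over a three-value lattice.
   In outline, the entry state it computes is:

     used      if any predecessor exits with `used'
     unknown   else if some predecessor exits with `unknown' and
               UNKNOWN_IS_UNUSED is false
     unused    otherwise

   Self-loop edges (e->src == block) are skipped.  */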

/* Go through the instruction stream looking for vzeroupper.  Delete
   it if upper 128bit AVX registers are unused.  If it isn't deleted,
   move it to just before a jump insn.  */

static void
move_or_delete_vzeroupper (void)
{
  edge e;
  edge_iterator ei;
  basic_block bb;
  fibheap_t worklist, pending, fibheap_swap;
  sbitmap visited, in_worklist, in_pending, sbitmap_swap;
  int *bb_order;
  int *rc_order;
  int i;

  /* Set up block info for each basic block.  */
  alloc_aux_for_blocks (sizeof (struct block_info_def));

  /* Process outgoing edges of entry point.  */
  if (dump_file)
    fprintf (dump_file, "Process outgoing edges of entry point\n");

  FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
    {
      move_or_delete_vzeroupper_2 (e->dest,
                                   cfun->machine->caller_pass_avx256_p
                                   ? used : unused);
      BLOCK_INFO (e->dest)->processed = true;
    }

  /* Compute reverse completion order of depth first search of the CFG
     so that the data-flow runs faster.  */
  rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
  bb_order = XNEWVEC (int, last_basic_block);
  pre_and_rev_post_order_compute (NULL, rc_order, false);
  for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
    bb_order[rc_order[i]] = i;
  free (rc_order);

  worklist = fibheap_new ();
  pending = fibheap_new ();
  visited = sbitmap_alloc (last_basic_block);
  in_worklist = sbitmap_alloc (last_basic_block);
  in_pending = sbitmap_alloc (last_basic_block);
  sbitmap_zero (in_worklist);

  /* Don't check outgoing edges of entry point.  */
  sbitmap_ones (in_pending);
  FOR_EACH_BB (bb)
    if (BLOCK_INFO (bb)->processed)
      RESET_BIT (in_pending, bb->index);
    else
      {
        move_or_delete_vzeroupper_1 (bb, false);
        fibheap_insert (pending, bb_order[bb->index], bb);
      }

  if (dump_file)
    fprintf (dump_file, "Check remaining basic blocks\n");

  while (!fibheap_empty (pending))
    {
      fibheap_swap = pending;
      pending = worklist;
      worklist = fibheap_swap;
      sbitmap_swap = in_pending;
      in_pending = in_worklist;
      in_worklist = sbitmap_swap;

      sbitmap_zero (visited);

      cfun->machine->rescan_vzeroupper_p = 0;

      while (!fibheap_empty (worklist))
        {
          bb = (basic_block) fibheap_extract_min (worklist);
          RESET_BIT (in_worklist, bb->index);
          gcc_assert (!TEST_BIT (visited, bb->index));
          if (!TEST_BIT (visited, bb->index))
            {
              edge_iterator ei;

              SET_BIT (visited, bb->index);

              if (move_or_delete_vzeroupper_1 (bb, false))
                FOR_EACH_EDGE (e, ei, bb->succs)
                  {
                    if (e->dest == EXIT_BLOCK_PTR
                        || BLOCK_INFO (e->dest)->processed)
                      continue;

                    if (TEST_BIT (visited, e->dest->index))
                      {
                        if (!TEST_BIT (in_pending, e->dest->index))
                          {
                            /* Send E->DEST to next round.  */
                            SET_BIT (in_pending, e->dest->index);
                            fibheap_insert (pending,
                                            bb_order[e->dest->index],
                                            e->dest);
                          }
                      }
                    else if (!TEST_BIT (in_worklist, e->dest->index))
                      {
                        /* Add E->DEST to current round.  */
                        SET_BIT (in_worklist, e->dest->index);
                        fibheap_insert (worklist, bb_order[e->dest->index],
                                        e->dest);
                      }
                  }
            }
        }

      if (!cfun->machine->rescan_vzeroupper_p)
        break;
    }

  free (bb_order);
  fibheap_delete (worklist);
  fibheap_delete (pending);
  sbitmap_free (visited);
  sbitmap_free (in_worklist);
  sbitmap_free (in_pending);

  if (dump_file)
    fprintf (dump_file, "Process remaining basic blocks\n");

  FOR_EACH_BB (bb)
    move_or_delete_vzeroupper_1 (bb, true);

  free_aux_for_blocks ();
}
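
/* Note on the iteration above: WORKLIST holds the current round and
   PENDING the next one, both keyed by BB_ORDER (reverse completion
   order) so each round sweeps the CFG roughly top-down.  A changed
   block's successor goes to PENDING if it was already visited this
   round, otherwise to WORKLIST; a new round starts only while
   rescan_vzeroupper_p reports that some block's exit state became
   `used'.  */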

static rtx legitimize_dllimport_symbol (rtx, bool);

#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif

/* Return index of given mode in mult and division cost tables.  */
#define MODE_INDEX(mode) \
  ((mode) == QImode ? 0 \
   : (mode) == HImode ? 1 \
   : (mode) == SImode ? 2 \
   : (mode) == DImode ? 3 \
   : 4)
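
/* For example, MODE_INDEX (SImode) is 2, selecting the SImode entry of
   the five-element {QI, HI, SI, DI, other} multiply and divide cost
   arrays in the processor_costs tables below; any other mode falls
   into the "other" slot, index 4.  */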

/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
#define COSTS_N_BYTES(N) ((N) * 2)
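
/* Under that assumption the two scales agree: a two-byte add costs
   COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so byte-based size
   costs stay directly comparable with insn-based speed costs.  */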

#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}

const
struct processor_costs ix86_size_cost = {/* costs for tuning for size */
  COSTS_N_BYTES (2),                    /* cost of an add instruction */
  COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  COSTS_N_BYTES (2),                    /* variable shift costs */
  COSTS_N_BYTES (3),                    /* constant shift costs */
  {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
   COSTS_N_BYTES (3),                   /* HI */
   COSTS_N_BYTES (3),                   /* SI */
   COSTS_N_BYTES (3),                   /* DI */
   COSTS_N_BYTES (5)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
   COSTS_N_BYTES (3),                   /* HI */
   COSTS_N_BYTES (3),                   /* SI */
   COSTS_N_BYTES (3),                   /* DI */
   COSTS_N_BYTES (5)},                  /* other */
  COSTS_N_BYTES (3),                    /* cost of movsx */
  COSTS_N_BYTES (3),                    /* cost of movzx */
  0,                                    /* "large" insn */
  2,                                    /* MOVE_RATIO */
  2,                                    /* cost for loading QImode using movzbl */
  {2, 2, 2},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {2, 2, 2},                            /* cost of storing integer registers */
  2,                                    /* cost of reg,reg fld/fst */
  {2, 2, 2},                            /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {2, 2, 2},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  3,                                    /* cost of moving MMX register */
  {3, 3},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {3, 3},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  3,                                    /* cost of moving SSE register */
  {3, 3, 3},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {3, 3, 3},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  3,                                    /* MMX or SSE register to integer */
  0,                                    /* size of l1 cache */
  0,                                    /* size of l2 cache */
  0,                                    /* size of prefetch block */
  0,                                    /* number of parallel prefetches */
  2,                                    /* Branch cost */
  COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
  COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
  COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
  COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
  COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar_load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  1,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  1,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};
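
/* How to read the two stringop tables above (a sketch; the types are
   declared in i386.h): the first table describes memcpy expansion and
   the second memset, each with one stringop_algs entry for 32bit and
   one for 64bit code (32bit-only CPUs below use DUMMY_STRINGOP_ALGS
   for the second slot).  An entry gives the algorithm for variable or
   unknown block sizes, then {max_size, algorithm} pairs for known
   sizes, where a max_size of -1 means "everything larger".  Here,
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}} simply says: always
   use a rep-prefixed byte string operation when tuning for size.  */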

/* Processor costs (relative to an add) */
static const
struct processor_costs i386_cost = {    /* 386 specific costs */
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (1),                    /* cost of a lea instruction */
  COSTS_N_INSNS (3),                    /* variable shift costs */
  COSTS_N_INSNS (2),                    /* constant shift costs */
  {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (6),                   /* HI */
   COSTS_N_INSNS (6),                   /* SI */
   COSTS_N_INSNS (6),                   /* DI */
   COSTS_N_INSNS (6)},                  /* other */
  COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (23),                  /* HI */
   COSTS_N_INSNS (23),                  /* SI */
   COSTS_N_INSNS (23),                  /* DI */
   COSTS_N_INSNS (23)},                 /* other */
  COSTS_N_INSNS (3),                    /* cost of movsx */
  COSTS_N_INSNS (2),                    /* cost of movzx */
  15,                                   /* "large" insn */
  3,                                    /* MOVE_RATIO */
  4,                                    /* cost for loading QImode using movzbl */
  {2, 4, 2},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {2, 4, 2},                            /* cost of storing integer registers */
  2,                                    /* cost of reg,reg fld/fst */
  {8, 8, 8},                            /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {8, 8, 8},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {4, 8},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {4, 8},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 8, 16},                           /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 8, 16},                           /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  3,                                    /* MMX or SSE register to integer */
  0,                                    /* size of l1 cache */
  0,                                    /* size of l2 cache */
  0,                                    /* size of prefetch block */
  0,                                    /* number of parallel prefetches */
  1,                                    /* Branch cost */
  COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
  COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
  COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
  COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar_load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs i486_cost = {    /* 486 specific costs */
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (1),                    /* cost of a lea instruction */
  COSTS_N_INSNS (3),                    /* variable shift costs */
  COSTS_N_INSNS (2),                    /* constant shift costs */
  {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
   COSTS_N_INSNS (12),                  /* HI */
   COSTS_N_INSNS (12),                  /* SI */
   COSTS_N_INSNS (12),                  /* DI */
   COSTS_N_INSNS (12)},                 /* other */
  1,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (40),                  /* HI */
   COSTS_N_INSNS (40),                  /* SI */
   COSTS_N_INSNS (40),                  /* DI */
   COSTS_N_INSNS (40)},                 /* other */
  COSTS_N_INSNS (3),                    /* cost of movsx */
  COSTS_N_INSNS (2),                    /* cost of movzx */
  15,                                   /* "large" insn */
  3,                                    /* MOVE_RATIO */
  4,                                    /* cost for loading QImode using movzbl */
  {2, 4, 2},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {2, 4, 2},                            /* cost of storing integer registers */
  2,                                    /* cost of reg,reg fld/fst */
  {8, 8, 8},                            /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {8, 8, 8},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {4, 8},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {4, 8},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 8, 16},                           /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 8, 16},                           /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  3,                                    /* MMX or SSE register to integer */
  4,                                    /* size of l1 cache.  486 has 8kB cache
                                           shared for code and data, so 4kB is
                                           not really precise.  */
  4,                                    /* size of l2 cache */
  0,                                    /* size of prefetch block */
  0,                                    /* number of parallel prefetches */
  1,                                    /* Branch cost */
  COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
  COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar_load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentium_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (1),                    /* cost of a lea instruction */
  COSTS_N_INSNS (4),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
   COSTS_N_INSNS (11),                  /* HI */
   COSTS_N_INSNS (11),                  /* SI */
   COSTS_N_INSNS (11),                  /* DI */
   COSTS_N_INSNS (11)},                 /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (25),                  /* HI */
   COSTS_N_INSNS (25),                  /* SI */
   COSTS_N_INSNS (25),                  /* DI */
   COSTS_N_INSNS (25)},                 /* other */
  COSTS_N_INSNS (3),                    /* cost of movsx */
  COSTS_N_INSNS (2),                    /* cost of movzx */
  8,                                    /* "large" insn */
  6,                                    /* MOVE_RATIO */
  6,                                    /* cost for loading QImode using movzbl */
  {2, 4, 2},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {2, 4, 2},                            /* cost of storing integer registers */
  2,                                    /* cost of reg,reg fld/fst */
  {2, 2, 6},                            /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {4, 4, 6},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  8,                                    /* cost of moving MMX register */
  {8, 8},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {8, 8},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 8, 16},                           /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 8, 16},                           /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  3,                                    /* MMX or SSE register to integer */
  8,                                    /* size of l1 cache.  */
  8,                                    /* size of l2 cache */
  0,                                    /* size of prefetch block */
  0,                                    /* number of parallel prefetches */
  2,                                    /* Branch cost */
  COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar_load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentiumpro_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (1),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (4),                   /* HI */
   COSTS_N_INSNS (4),                   /* SI */
   COSTS_N_INSNS (4),                   /* DI */
   COSTS_N_INSNS (4)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (17),                  /* HI */
   COSTS_N_INSNS (17),                  /* SI */
   COSTS_N_INSNS (17),                  /* DI */
   COSTS_N_INSNS (17)},                 /* other */
  COSTS_N_INSNS (1),                    /* cost of movsx */
  COSTS_N_INSNS (1),                    /* cost of movzx */
  8,                                    /* "large" insn */
  6,                                    /* MOVE_RATIO */
  2,                                    /* cost for loading QImode using movzbl */
  {4, 4, 4},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {2, 2, 2},                            /* cost of storing integer registers */
  2,                                    /* cost of reg,reg fld/fst */
  {2, 2, 6},                            /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {4, 4, 6},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {2, 2},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {2, 2},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {2, 2, 8},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {2, 2, 8},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  3,                                    /* MMX or SSE register to integer */
  8,                                    /* size of l1 cache.  */
  256,                                  /* size of l2 cache */
  32,                                   /* size of prefetch block */
  6,                                    /* number of parallel prefetches */
  2,                                    /* Branch cost */
  COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
     (we ensure the alignment).  For small blocks an inline loop is still a
     noticeable win; for bigger blocks either rep movsl or rep movsb is the
     way to go.  Rep movsb apparently has a more expensive startup time in the
     CPU, but after 4K the difference is down in the noise.  */
  {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
                        {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{1024, unrolled_loop},
                        {8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar_load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs geode_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (1),                    /* cost of a lea instruction */
  COSTS_N_INSNS (2),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (4),                   /* HI */
   COSTS_N_INSNS (7),                   /* SI */
   COSTS_N_INSNS (7),                   /* DI */
   COSTS_N_INSNS (7)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (23),                  /* HI */
   COSTS_N_INSNS (39),                  /* SI */
   COSTS_N_INSNS (39),                  /* DI */
   COSTS_N_INSNS (39)},                 /* other */
  COSTS_N_INSNS (1),                    /* cost of movsx */
  COSTS_N_INSNS (1),                    /* cost of movzx */
  8,                                    /* "large" insn */
  4,                                    /* MOVE_RATIO */
  1,                                    /* cost for loading QImode using movzbl */
  {1, 1, 1},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {1, 1, 1},                            /* cost of storing integer registers */
  1,                                    /* cost of reg,reg fld/fst */
  {1, 1, 1},                            /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {4, 6, 6},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  1,                                    /* cost of moving MMX register */
  {1, 1},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {1, 1},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  1,                                    /* cost of moving SSE register */
  {1, 1, 1},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {1, 1, 1},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  1,                                    /* MMX or SSE register to integer */
  64,                                   /* size of l1 cache.  */
  128,                                  /* size of l2 cache.  */
  32,                                   /* size of prefetch block */
  1,                                    /* number of parallel prefetches */
  1,                                    /* Branch cost */
  COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
  COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar_load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs k6_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (2),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (3),                   /* HI */
   COSTS_N_INSNS (3),                   /* SI */
   COSTS_N_INSNS (3),                   /* DI */
   COSTS_N_INSNS (3)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (18),                  /* HI */
   COSTS_N_INSNS (18),                  /* SI */
   COSTS_N_INSNS (18),                  /* DI */
   COSTS_N_INSNS (18)},                 /* other */
  COSTS_N_INSNS (2),                    /* cost of movsx */
  COSTS_N_INSNS (2),                    /* cost of movzx */
  8,                                    /* "large" insn */
  4,                                    /* MOVE_RATIO */
  3,                                    /* cost for loading QImode using movzbl */
  {4, 5, 4},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {2, 3, 2},                            /* cost of storing integer registers */
  4,                                    /* cost of reg,reg fld/fst */
  {6, 6, 6},                            /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {4, 4, 4},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {2, 2},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {2, 2},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {2, 2, 8},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {2, 2, 8},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  6,                                    /* MMX or SSE register to integer */
  32,                                   /* size of l1 cache.  */
  32,                                   /* size of l2 cache.  Some models
                                           have integrated l2 cache, but
                                           optimizing for k6 is not important
                                           enough to worry about that.  */
  32,                                   /* size of prefetch block */
  1,                                    /* number of parallel prefetches */
  1,                                    /* Branch cost */
  COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar_load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs athlon_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (2),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (5),                   /* HI */
   COSTS_N_INSNS (5),                   /* SI */
   COSTS_N_INSNS (5),                   /* DI */
   COSTS_N_INSNS (5)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),                  /* HI */
   COSTS_N_INSNS (42),                  /* SI */
   COSTS_N_INSNS (74),                  /* DI */
   COSTS_N_INSNS (74)},                 /* other */
  COSTS_N_INSNS (1),                    /* cost of movsx */
  COSTS_N_INSNS (1),                    /* cost of movzx */
  8,                                    /* "large" insn */
  9,                                    /* MOVE_RATIO */
  4,                                    /* cost for loading QImode using movzbl */
  {3, 4, 3},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {3, 4, 3},                            /* cost of storing integer registers */
  4,                                    /* cost of reg,reg fld/fst */
  {4, 4, 12},                           /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {6, 6, 8},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {4, 4},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {4, 4},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 4, 6},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 4, 5},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  5,                                    /* MMX or SSE register to integer */
  64,                                   /* size of l1 cache.  */
  256,                                  /* size of l2 cache.  */
  64,                                   /* size of prefetch block */
  6,                                    /* number of parallel prefetches */
  5,                                    /* Branch cost */
  COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
  /* For some reason, Athlon deals better with REP prefix (relative to loops)
     compared to K8.  Alignment becomes important after 8 bytes for memcpy and
     128 bytes for memset.  */
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar_load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs k8_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (2),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (4),                   /* HI */
   COSTS_N_INSNS (3),                   /* SI */
   COSTS_N_INSNS (4),                   /* DI */
   COSTS_N_INSNS (5)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),                  /* HI */
   COSTS_N_INSNS (42),                  /* SI */
   COSTS_N_INSNS (74),                  /* DI */
   COSTS_N_INSNS (74)},                 /* other */
  COSTS_N_INSNS (1),                    /* cost of movsx */
  COSTS_N_INSNS (1),                    /* cost of movzx */
  8,                                    /* "large" insn */
  9,                                    /* MOVE_RATIO */
  4,                                    /* cost for loading QImode using movzbl */
  {3, 4, 3},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {3, 4, 3},                            /* cost of storing integer registers */
  4,                                    /* cost of reg,reg fld/fst */
  {4, 4, 12},                           /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {6, 6, 8},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {3, 3},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {4, 4},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 3, 6},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 4, 5},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  5,                                    /* MMX or SSE register to integer */
  64,                                   /* size of l1 cache.  */
  512,                                  /* size of l2 cache.  */
  64,                                   /* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set the number of simultaneous
     prefetches to a large constant to reflect this (it is probably not a
     good idea to leave the number of prefetches entirely unlimited, as
     their execution also takes some time).  */
  100,                                  /* number of parallel prefetches */
  3,                                    /* Branch cost */
  COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
  /* K8 has an optimized REP instruction for medium-sized blocks, but for
     very small blocks it is better to use a loop.  For large blocks, libcall
     can do non-temporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4,                                    /* scalar_stmt_cost.  */
  2,                                    /* scalar_load_cost.  */
  2,                                    /* scalar_store_cost.  */
  5,                                    /* vec_stmt_cost.  */
  0,                                    /* vec_to_scalar_cost.  */
  2,                                    /* scalar_to_vec_cost.  */
  2,                                    /* vec_align_load_cost.  */
  3,                                    /* vec_unalign_load_cost.  */
  3,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  2,                                    /* cond_not_taken_branch_cost.  */
};

struct processor_costs amdfam10_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (2),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (4),                   /* HI */
   COSTS_N_INSNS (3),                   /* SI */
   COSTS_N_INSNS (4),                   /* DI */
   COSTS_N_INSNS (5)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35),                  /* HI */
   COSTS_N_INSNS (51),                  /* SI */
   COSTS_N_INSNS (83),                  /* DI */
   COSTS_N_INSNS (83)},                 /* other */
  COSTS_N_INSNS (1),                    /* cost of movsx */
  COSTS_N_INSNS (1),                    /* cost of movzx */
  8,                                    /* "large" insn */
  9,                                    /* MOVE_RATIO */
  4,                                    /* cost for loading QImode using movzbl */
  {3, 4, 3},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {3, 4, 3},                            /* cost of storing integer registers */
  4,                                    /* cost of reg,reg fld/fst */
  {4, 4, 12},                           /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {6, 6, 8},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {3, 3},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {4, 4},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 4, 3},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 4, 5},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  3,                                    /* MMX or SSE register to integer */
  /* On K8:
       MOVD reg64, xmmreg  Double  FSTORE 4
       MOVD reg32, xmmreg  Double  FSTORE 4
     On AMDFAM10:
       MOVD reg64, xmmreg  Double  FADD 3
                           1/1  1/1
       MOVD reg32, xmmreg  Double  FADD 3
                           1/1  1/1 */
  64,                                   /* size of l1 cache.  */
  512,                                  /* size of l2 cache.  */
  64,                                   /* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set the number of simultaneous
     prefetches to a large constant to reflect this (it is probably not a
     good idea to leave the number of prefetches entirely unlimited, as
     their execution also takes some time).  */
  100,                                  /* number of parallel prefetches */
  2,                                    /* Branch cost */
  COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */

  /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
     for very small blocks it is better to use a loop.  For large blocks,
     libcall can do non-temporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4,                                    /* scalar_stmt_cost.  */
  2,                                    /* scalar_load_cost.  */
  2,                                    /* scalar_store_cost.  */
  6,                                    /* vec_stmt_cost.  */
  0,                                    /* vec_to_scalar_cost.  */
  2,                                    /* scalar_to_vec_cost.  */
  2,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  2,                                    /* vec_store_cost.  */
  2,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

struct processor_costs bdver1_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (1),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (4),                   /* HI */
   COSTS_N_INSNS (4),                   /* SI */
   COSTS_N_INSNS (6),                   /* DI */
   COSTS_N_INSNS (6)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35),                  /* HI */
   COSTS_N_INSNS (51),                  /* SI */
   COSTS_N_INSNS (83),                  /* DI */
   COSTS_N_INSNS (83)},                 /* other */
  COSTS_N_INSNS (1),                    /* cost of movsx */
  COSTS_N_INSNS (1),                    /* cost of movzx */
  8,                                    /* "large" insn */
  9,                                    /* MOVE_RATIO */
  4,                                    /* cost for loading QImode using movzbl */
  {5, 5, 4},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {4, 4, 4},                            /* cost of storing integer registers */
  2,                                    /* cost of reg,reg fld/fst */
  {5, 5, 12},                           /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {4, 4, 8},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {4, 4},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {4, 4},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 4, 4},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 4, 4},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  2,                                    /* MMX or SSE register to integer */
  /* On K8:
       MOVD reg64, xmmreg  Double  FSTORE 4
       MOVD reg32, xmmreg  Double  FSTORE 4
     On AMDFAM10:
       MOVD reg64, xmmreg  Double  FADD 3
                           1/1  1/1
       MOVD reg32, xmmreg  Double  FADD 3
                           1/1  1/1 */
  16,                                   /* size of l1 cache.  */
  2048,                                 /* size of l2 cache.  */
  64,                                   /* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set the number of simultaneous
     prefetches to a large constant to reflect this (it is probably not a
     good idea to leave the number of prefetches entirely unlimited, as
     their execution also takes some time).  */
  100,                                  /* number of parallel prefetches */
  2,                                    /* Branch cost */
  COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */

  /* BDVER1 has an optimized REP instruction for medium-sized blocks, but
     for very small blocks it is better to use a loop.  For large blocks,
     libcall can do non-temporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  6,                                    /* scalar_stmt_cost.  */
  4,                                    /* scalar_load_cost.  */
  4,                                    /* scalar_store_cost.  */
  6,                                    /* vec_stmt_cost.  */
  0,                                    /* vec_to_scalar_cost.  */
  2,                                    /* scalar_to_vec_cost.  */
  4,                                    /* vec_align_load_cost.  */
  4,                                    /* vec_unalign_load_cost.  */
  4,                                    /* vec_store_cost.  */
  2,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

struct processor_costs bdver2_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (1),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (4),                   /* HI */
   COSTS_N_INSNS (4),                   /* SI */
   COSTS_N_INSNS (6),                   /* DI */
   COSTS_N_INSNS (6)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35),                  /* HI */
   COSTS_N_INSNS (51),                  /* SI */
   COSTS_N_INSNS (83),                  /* DI */
   COSTS_N_INSNS (83)},                 /* other */
  COSTS_N_INSNS (1),                    /* cost of movsx */
  COSTS_N_INSNS (1),                    /* cost of movzx */
  8,                                    /* "large" insn */
  9,                                    /* MOVE_RATIO */
  4,                                    /* cost for loading QImode using movzbl */
  {5, 5, 4},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {4, 4, 4},                            /* cost of storing integer registers */
  2,                                    /* cost of reg,reg fld/fst */
  {5, 5, 12},                           /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {4, 4, 8},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {4, 4},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {4, 4},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 4, 4},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 4, 4},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  2,                                    /* MMX or SSE register to integer */
  /* On K8:
       MOVD reg64, xmmreg  Double  FSTORE 4
       MOVD reg32, xmmreg  Double  FSTORE 4
     On AMDFAM10:
       MOVD reg64, xmmreg  Double  FADD 3
                           1/1  1/1
       MOVD reg32, xmmreg  Double  FADD 3
                           1/1  1/1 */
  16,                                   /* size of l1 cache.  */
  2048,                                 /* size of l2 cache.  */
  64,                                   /* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set the number of simultaneous
     prefetches to a large constant to reflect this (it is probably not a
     good idea to leave the number of prefetches entirely unlimited, as
     their execution also takes some time).  */
  100,                                  /* number of parallel prefetches */
  2,                                    /* Branch cost */
  COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */

  /* BDVER2 has an optimized REP instruction for medium-sized blocks, but
     for very small blocks it is better to use a loop.  For large blocks,
     libcall can do non-temporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  6,                                    /* scalar_stmt_cost.  */
  4,                                    /* scalar_load_cost.  */
  4,                                    /* scalar_store_cost.  */
  6,                                    /* vec_stmt_cost.  */
  0,                                    /* vec_to_scalar_cost.  */
  2,                                    /* scalar_to_vec_cost.  */
  4,                                    /* vec_align_load_cost.  */
  4,                                    /* vec_unalign_load_cost.  */
  4,                                    /* vec_store_cost.  */
  2,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

struct processor_costs btver1_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (2),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (4),                   /* HI */
   COSTS_N_INSNS (3),                   /* SI */
   COSTS_N_INSNS (4),                   /* DI */
   COSTS_N_INSNS (5)},                  /* other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35),                  /* HI */
   COSTS_N_INSNS (51),                  /* SI */
   COSTS_N_INSNS (83),                  /* DI */
   COSTS_N_INSNS (83)},                 /* other */
  COSTS_N_INSNS (1),                    /* cost of movsx */
  COSTS_N_INSNS (1),                    /* cost of movzx */
  8,                                    /* "large" insn */
  9,                                    /* MOVE_RATIO */
  4,                                    /* cost for loading QImode using movzbl */
  {3, 4, 3},                            /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
  {3, 4, 3},                            /* cost of storing integer registers */
  4,                                    /* cost of reg,reg fld/fst */
  {4, 4, 12},                           /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
  {6, 6, 8},                            /* cost of storing fp registers
                                           in SFmode, DFmode and XFmode */
  2,                                    /* cost of moving MMX register */
  {3, 3},                               /* cost of loading MMX registers
                                           in SImode and DImode */
  {4, 4},                               /* cost of storing MMX registers
                                           in SImode and DImode */
  2,                                    /* cost of moving SSE register */
  {4, 4, 3},                            /* cost of loading SSE registers
                                           in SImode, DImode and TImode */
  {4, 4, 5},                            /* cost of storing SSE registers
                                           in SImode, DImode and TImode */
  3,                                    /* MMX or SSE register to integer */
  /* On K8:
       MOVD reg64, xmmreg  Double  FSTORE 4
       MOVD reg32, xmmreg  Double  FSTORE 4
     On AMDFAM10:
       MOVD reg64, xmmreg  Double  FADD 3
                           1/1  1/1
       MOVD reg32, xmmreg  Double  FADD 3
                           1/1  1/1 */
  32,                                   /* size of l1 cache.  */
  512,                                  /* size of l2 cache.  */
  64,                                   /* size of prefetch block */
  100,                                  /* number of parallel prefetches */
  2,                                    /* Branch cost */
  COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
  COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
  COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */

  /* BTVER1 has an optimized REP instruction for medium-sized blocks, but
     for very small blocks it is better to use a loop.  For large blocks,
     libcall can do non-temporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4,                                    /* scalar_stmt_cost.  */
  2,                                    /* scalar_load_cost.  */
  2,                                    /* scalar_store_cost.  */
  6,                                    /* vec_stmt_cost.  */
  0,                                    /* vec_to_scalar_cost.  */
  2,                                    /* scalar_to_vec_cost.  */
  2,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  2,                                    /* vec_store_cost.  */
  2,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};
1510
1511static const
1512struct processor_costs pentium4_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (3), /* cost of a lea instruction */
1515 COSTS_N_INSNS (4), /* variable shift costs */
1516 COSTS_N_INSNS (4), /* constant shift costs */
1517 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (15), /* HI */
1519 COSTS_N_INSNS (15), /* SI */
1520 COSTS_N_INSNS (15), /* DI */
1521 COSTS_N_INSNS (15)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (56), /* HI */
1525 COSTS_N_INSNS (56), /* SI */
1526 COSTS_N_INSNS (56), /* DI */
1527 COSTS_N_INSNS (56)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 16, /* "large" insn */
1531 6, /* MOVE_RATIO */
1532 2, /* cost for loading QImode using movzbl */
1533 {4, 5, 4}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {2, 3, 2}, /* cost of storing integer registers */
1537 2, /* cost of reg,reg fld/fst */
1538 {2, 2, 6}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {4, 4, 6}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {2, 2}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {2, 2}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 12, /* cost of moving SSE register */
1548 {12, 12, 12}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {2, 2, 8}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 10, /* MMX or SSE register to integer */
1553 8, /* size of l1 cache. */
1554 256, /* size of l2 cache. */
1555 64, /* size of prefetch block */
1556 6, /* number of parallel prefetches */
1557 2, /* Branch cost */
1558 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1559 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1560 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1561 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1562 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1563 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1564 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565 DUMMY_STRINGOP_ALGS},
1566 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1567 {-1, libcall}}},
1568 DUMMY_STRINGOP_ALGS},
1569 1, /* scalar_stmt_cost. */
1570 1, /* scalar load_cost. */
1571 1, /* scalar_store_cost. */
1572 1, /* vec_stmt_cost. */
1573 1, /* vec_to_scalar_cost. */
1574 1, /* scalar_to_vec_cost. */
1575 1, /* vec_align_load_cost. */
1576 2, /* vec_unalign_load_cost. */
1577 1, /* vec_store_cost. */
1578 3, /* cond_taken_branch_cost. */
1579 1, /* cond_not_taken_branch_cost. */
1580};
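/* A note on units: COSTS_N_INSNS (N) expands to (N) * 4 (see rtl.h),
   so the instruction costs above are quarters of one "cheap" insn,
   while the bare integers (register move and load/store costs) are on
   the relative scale where a reg-reg move costs 2, as the table
   comments state.  */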
1581
1582static const
1583struct processor_costs nocona_cost = {
1584 COSTS_N_INSNS (1), /* cost of an add instruction */
1585 COSTS_N_INSNS (1), /* cost of a lea instruction */
1586 COSTS_N_INSNS (1), /* variable shift costs */
1587 COSTS_N_INSNS (1), /* constant shift costs */
1588 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1589 COSTS_N_INSNS (10), /* HI */
1590 COSTS_N_INSNS (10), /* SI */
1591 COSTS_N_INSNS (10), /* DI */
1592 COSTS_N_INSNS (10)}, /* other */
1593 0, /* cost of multiply per each bit set */
1594 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1595 COSTS_N_INSNS (66), /* HI */
1596 COSTS_N_INSNS (66), /* SI */
1597 COSTS_N_INSNS (66), /* DI */
1598 COSTS_N_INSNS (66)}, /* other */
1599 COSTS_N_INSNS (1), /* cost of movsx */
1600 COSTS_N_INSNS (1), /* cost of movzx */
1601 16, /* "large" insn */
1602 17, /* MOVE_RATIO */
1603 4, /* cost for loading QImode using movzbl */
1604 {4, 4, 4}, /* cost of loading integer registers
1605 in QImode, HImode and SImode.
1606 Relative to reg-reg move (2). */
1607 {4, 4, 4}, /* cost of storing integer registers */
1608 3, /* cost of reg,reg fld/fst */
1609 {12, 12, 12}, /* cost of loading fp registers
1610 in SFmode, DFmode and XFmode */
1611 {4, 4, 4}, /* cost of storing fp registers
1612 in SFmode, DFmode and XFmode */
1613 6, /* cost of moving MMX register */
1614 {12, 12}, /* cost of loading MMX registers
1615 in SImode and DImode */
1616 {12, 12}, /* cost of storing MMX registers
1617 in SImode and DImode */
1618 6, /* cost of moving SSE register */
1619 {12, 12, 12}, /* cost of loading SSE registers
1620 in SImode, DImode and TImode */
1621 {12, 12, 12}, /* cost of storing SSE registers
1622 in SImode, DImode and TImode */
1623 8, /* MMX or SSE register to integer */
1624 8, /* size of l1 cache. */
1625 1024, /* size of l2 cache. */
1626 64, /* size of prefetch block */
1627 8, /* number of parallel prefetches */
1628 1, /* Branch cost */
1629 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1630 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1631 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1632 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1633 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1634 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1635 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 {100000, unrolled_loop}, {-1, libcall}}}},
1638 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1639 {-1, libcall}}},
1640 {libcall, {{24, loop}, {64, unrolled_loop},
1641 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642 1, /* scalar_stmt_cost. */
1643 1, /* scalar load_cost. */
1644 1, /* scalar_store_cost. */
1645 1, /* vec_stmt_cost. */
1646 1, /* vec_to_scalar_cost. */
1647 1, /* scalar_to_vec_cost. */
1648 1, /* vec_align_load_cost. */
1649 2, /* vec_unalign_load_cost. */
1650 1, /* vec_store_cost. */
1651 3, /* cond_taken_branch_cost. */
1652 1, /* cond_not_taken_branch_cost. */
1653};
1654
1655static const
1656struct processor_costs atom_cost = {
1657 COSTS_N_INSNS (1), /* cost of an add instruction */
1658 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1659 COSTS_N_INSNS (1), /* variable shift costs */
1660 COSTS_N_INSNS (1), /* constant shift costs */
1661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1662 COSTS_N_INSNS (4), /* HI */
1663 COSTS_N_INSNS (3), /* SI */
1664 COSTS_N_INSNS (4), /* DI */
1665 COSTS_N_INSNS (2)}, /* other */
1666 0, /* cost of multiply per each bit set */
1667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1668 COSTS_N_INSNS (26), /* HI */
1669 COSTS_N_INSNS (42), /* SI */
1670 COSTS_N_INSNS (74), /* DI */
1671 COSTS_N_INSNS (74)}, /* other */
1672 COSTS_N_INSNS (1), /* cost of movsx */
1673 COSTS_N_INSNS (1), /* cost of movzx */
1674 8, /* "large" insn */
1675 17, /* MOVE_RATIO */
1676 4, /* cost for loading QImode using movzbl */
1677 {4, 4, 4}, /* cost of loading integer registers
1678 in QImode, HImode and SImode.
1679 Relative to reg-reg move (2). */
1680 {4, 4, 4}, /* cost of storing integer registers */
1681 4, /* cost of reg,reg fld/fst */
1682 {12, 12, 12}, /* cost of loading fp registers
1683 in SFmode, DFmode and XFmode */
1684 {6, 6, 8}, /* cost of storing fp registers
1685 in SFmode, DFmode and XFmode */
1686 2, /* cost of moving MMX register */
1687 {8, 8}, /* cost of loading MMX registers
1688 in SImode and DImode */
1689 {8, 8}, /* cost of storing MMX registers
1690 in SImode and DImode */
1691 2, /* cost of moving SSE register */
1692 {8, 8, 8}, /* cost of loading SSE registers
1693 in SImode, DImode and TImode */
1694 {8, 8, 8}, /* cost of storing SSE registers
1695 in SImode, DImode and TImode */
1696 5, /* MMX or SSE register to integer */
1697 32, /* size of l1 cache. */
1698 256, /* size of l2 cache. */
1699 64, /* size of prefetch block */
1700 6, /* number of parallel prefetches */
1701 3, /* Branch cost */
1702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1703 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1704 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1705 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1706 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1707 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1708 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711 {{libcall, {{8, loop}, {15, unrolled_loop},
1712 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713 {libcall, {{24, loop}, {32, unrolled_loop},
1714 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715 1, /* scalar_stmt_cost. */
1716 1, /* scalar load_cost. */
1717 1, /* scalar_store_cost. */
1718 1, /* vec_stmt_cost. */
1719 1, /* vec_to_scalar_cost. */
1720 1, /* scalar_to_vec_cost. */
1721 1, /* vec_align_load_cost. */
1722 2, /* vec_unalign_load_cost. */
1723 1, /* vec_store_cost. */
1724 3, /* cond_taken_branch_cost. */
1725 1, /* cond_not_taken_branch_cost. */
1726};
1727
1728/* Generic64 should produce code tuned for Nocona and K8. */
1729static const
1730struct processor_costs generic64_cost = {
1731 COSTS_N_INSNS (1), /* cost of an add instruction */
1732 /* On all chips taken into consideration, lea is 2 cycles or more.  With
1733 this cost, however, our current implementation of synth_mult results in
1734 the use of unnecessary temporary registers, causing regressions on several
1735 SPECfp benchmarks. */
1736 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1737 COSTS_N_INSNS (1), /* variable shift costs */
1738 COSTS_N_INSNS (1), /* constant shift costs */
1739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1740 COSTS_N_INSNS (4), /* HI */
1741 COSTS_N_INSNS (3), /* SI */
1742 COSTS_N_INSNS (4), /* DI */
1743 COSTS_N_INSNS (2)}, /* other */
1744 0, /* cost of multiply per each bit set */
1745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1746 COSTS_N_INSNS (26), /* HI */
1747 COSTS_N_INSNS (42), /* SI */
1748 COSTS_N_INSNS (74), /* DI */
1749 COSTS_N_INSNS (74)}, /* other */
1750 COSTS_N_INSNS (1), /* cost of movsx */
1751 COSTS_N_INSNS (1), /* cost of movzx */
1752 8, /* "large" insn */
1753 17, /* MOVE_RATIO */
1754 4, /* cost for loading QImode using movzbl */
1755 {4, 4, 4}, /* cost of loading integer registers
1756 in QImode, HImode and SImode.
1757 Relative to reg-reg move (2). */
1758 {4, 4, 4}, /* cost of storing integer registers */
1759 4, /* cost of reg,reg fld/fst */
1760 {12, 12, 12}, /* cost of loading fp registers
1761 in SFmode, DFmode and XFmode */
1762 {6, 6, 8}, /* cost of storing fp registers
1763 in SFmode, DFmode and XFmode */
1764 2, /* cost of moving MMX register */
1765 {8, 8}, /* cost of loading MMX registers
1766 in SImode and DImode */
1767 {8, 8}, /* cost of storing MMX registers
1768 in SImode and DImode */
1769 2, /* cost of moving SSE register */
1770 {8, 8, 8}, /* cost of loading SSE registers
1771 in SImode, DImode and TImode */
1772 {8, 8, 8}, /* cost of storing SSE registers
1773 in SImode, DImode and TImode */
1774 5, /* MMX or SSE register to integer */
1775 32, /* size of l1 cache. */
1776 512, /* size of l2 cache. */
1777 64, /* size of prefetch block */
1778 6, /* number of parallel prefetches */
1779 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1780 value is increased to the perhaps more appropriate value of 5. */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {DUMMY_STRINGOP_ALGS,
1789 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790 {DUMMY_STRINGOP_ALGS,
1791 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792 1, /* scalar_stmt_cost. */
1793 1, /* scalar load_cost. */
1794 1, /* scalar_store_cost. */
1795 1, /* vec_stmt_cost. */
1796 1, /* vec_to_scalar_cost. */
1797 1, /* scalar_to_vec_cost. */
1798 1, /* vec_align_load_cost. */
1799 2, /* vec_unalign_load_cost. */
1800 1, /* vec_store_cost. */
1801 3, /* cond_taken_branch_cost. */
1802 1, /* cond_not_taken_branch_cost. */
1803};
1804
1805/* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1806 Athlon and K8. */
1807static const
1808struct processor_costs generic32_cost = {
1809 COSTS_N_INSNS (1), /* cost of an add instruction */
1810 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1811 COSTS_N_INSNS (1), /* variable shift costs */
1812 COSTS_N_INSNS (1), /* constant shift costs */
1813 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1814 COSTS_N_INSNS (4), /* HI */
1815 COSTS_N_INSNS (3), /* SI */
1816 COSTS_N_INSNS (4), /* DI */
1817 COSTS_N_INSNS (2)}, /* other */
1818 0, /* cost of multiply per each bit set */
1819 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1820 COSTS_N_INSNS (26), /* HI */
1821 COSTS_N_INSNS (42), /* SI */
1822 COSTS_N_INSNS (74), /* DI */
1823 COSTS_N_INSNS (74)}, /* other */
1824 COSTS_N_INSNS (1), /* cost of movsx */
1825 COSTS_N_INSNS (1), /* cost of movzx */
1826 8, /* "large" insn */
1827 17, /* MOVE_RATIO */
1828 4, /* cost for loading QImode using movzbl */
1829 {4, 4, 4}, /* cost of loading integer registers
1830 in QImode, HImode and SImode.
1831 Relative to reg-reg move (2). */
1832 {4, 4, 4}, /* cost of storing integer registers */
1833 4, /* cost of reg,reg fld/fst */
1834 {12, 12, 12}, /* cost of loading fp registers
1835 in SFmode, DFmode and XFmode */
1836 {6, 6, 8}, /* cost of storing fp registers
1837 in SFmode, DFmode and XFmode */
1838 2, /* cost of moving MMX register */
1839 {8, 8}, /* cost of loading MMX registers
1840 in SImode and DImode */
1841 {8, 8}, /* cost of storing MMX registers
1842 in SImode and DImode */
1843 2, /* cost of moving SSE register */
1844 {8, 8, 8}, /* cost of loading SSE registers
1845 in SImode, DImode and TImode */
1846 {8, 8, 8}, /* cost of storing SSE registers
1847 in SImode, DImode and TImode */
1848 5, /* MMX or SSE register to integer */
1849 32, /* size of l1 cache. */
1850 256, /* size of l2 cache. */
1851 64, /* size of prefetch block */
1852 6, /* number of parallel prefetches */
1853 3, /* Branch cost */
1854 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1855 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1856 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1857 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1858 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1859 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1860 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861 DUMMY_STRINGOP_ALGS},
1862 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863 DUMMY_STRINGOP_ALGS},
1864 1, /* scalar_stmt_cost. */
1865 1, /* scalar load_cost. */
1866 1, /* scalar_store_cost. */
1867 1, /* vec_stmt_cost. */
1868 1, /* vec_to_scalar_cost. */
1869 1, /* scalar_to_vec_cost. */
1870 1, /* vec_align_load_cost. */
1871 2, /* vec_unalign_load_cost. */
1872 1, /* vec_store_cost. */
1873 3, /* cond_taken_branch_cost. */
1874 1, /* cond_not_taken_branch_cost. */
1875};
1876
1877const struct processor_costs *ix86_cost = &pentium_cost;
1878
1879/* Processor feature/optimization bitmasks. */
1880#define m_386 (1<<PROCESSOR_I386)
1881#define m_486 (1<<PROCESSOR_I486)
1882#define m_PENT (1<<PROCESSOR_PENTIUM)
1883#define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1884#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1885#define m_NOCONA (1<<PROCESSOR_NOCONA)
1886#define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1887#define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1888#define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1889#define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1890#define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1891#define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1892#define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1893#define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1894#define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1895#define m_ATOM (1<<PROCESSOR_ATOM)
1896
1897#define m_GEODE (1<<PROCESSOR_GEODE)
1898#define m_K6 (1<<PROCESSOR_K6)
1899#define m_K6_GEODE (m_K6 | m_GEODE)
1900#define m_K8 (1<<PROCESSOR_K8)
1901#define m_ATHLON (1<<PROCESSOR_ATHLON)
1902#define m_ATHLON_K8 (m_K8 | m_ATHLON)
1903#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1904#define m_BDVER1 (1<<PROCESSOR_BDVER1)
1905#define m_BDVER2 (1<<PROCESSOR_BDVER2)
1906#define m_BDVER (m_BDVER1 | m_BDVER2)
1907#define m_BTVER1 (1<<PROCESSOR_BTVER1)
1908#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1909
1910#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1911#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1912
1913/* Generic instruction choice should be common subset of supported CPUs
1914 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1915#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1916
1917/* Feature tests against the various tunings. */
1918unsigned char ix86_tune_features[X86_TUNE_LAST];
1919
1920/* Feature tests against the various tunings used to create ix86_tune_features
1921 based on the processor mask. */
1922static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1923 /* X86_TUNE_USE_LEAVE: LEAVE does not affect Nocona SPEC2000 results
1924 negatively, so enabling it for Generic64 seems like a good code size
1925 tradeoff.  We can't enable it for 32bit generic because it does not
1926 work well with PPro based chips. */
1927 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1928
1929 /* X86_TUNE_PUSH_MEMORY */
1930 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1931
1932 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1933 m_486 | m_PENT,
1934
1935 /* X86_TUNE_UNROLL_STRLEN */
1936 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1937
1938 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in the P4 based
1939 on simulation results.  But after the P4 shipped, no performance benefit
1940 was observed from branch hints.  They also increase the code size.
1941 As a result, icc never generates branch hints. */
1942 0,
1943
1944 /* X86_TUNE_DOUBLE_WITH_ADD */
1945 ~m_386,
1946
1947 /* X86_TUNE_USE_SAHF */
1948 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1949
1950 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1951 partial dependencies. */
1952 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1953
1954 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1955 register stalls on Generic32 compilation setting as well. However
1956 in current implementation the partial register stalls are not eliminated
1957 very well - they can be introduced via subregs synthesized by combine
1958 and can happen in caller/callee saving sequences. Because this option
1959 pays back little on PPro based chips and is in conflict with partial reg
1960 dependencies used by Athlon/P4 based chips, it is better to leave it off
1961 for generic32 for now. */
1962 m_PPRO,
1963
1964 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1965 m_CORE2I7 | m_GENERIC,
1966
1967 /* X86_TUNE_USE_HIMODE_FIOP */
1968 m_386 | m_486 | m_K6_GEODE,
1969
1970 /* X86_TUNE_USE_SIMODE_FIOP */
1971 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1972
1973 /* X86_TUNE_USE_MOV0 */
1974 m_K6,
1975
1976 /* X86_TUNE_USE_CLTD */
1977 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1978
1979 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1980 m_PENT4,
1981
1982 /* X86_TUNE_SPLIT_LONG_MOVES */
1983 m_PPRO,
1984
1985 /* X86_TUNE_READ_MODIFY_WRITE */
1986 ~m_PENT,
1987
1988 /* X86_TUNE_READ_MODIFY */
1989 ~(m_PENT | m_PPRO),
1990
1991 /* X86_TUNE_PROMOTE_QIMODE */
1992 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1993
1994 /* X86_TUNE_FAST_PREFIX */
1995 ~(m_386 | m_486 | m_PENT),
1996
1997 /* X86_TUNE_SINGLE_STRINGOP */
1998 m_386 | m_P4_NOCONA,
1999
2000 /* X86_TUNE_QIMODE_MATH */
2001 ~0,
2002
2003 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2004 register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL, this option
2005 might be considered for Generic32 if our scheme for avoiding partial
2006 stalls were more effective. */
2007 ~m_PPRO,
2008
2009 /* X86_TUNE_PROMOTE_QI_REGS */
2010 0,
2011
2012 /* X86_TUNE_PROMOTE_HI_REGS */
2013 m_PPRO,
2014
2015 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2016 over esp addition. */
2017 m_386 | m_486 | m_PENT | m_PPRO,
2018
2019 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2020 over esp addition. */
2021 m_PENT,
2022
2023 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2024 over esp subtraction. */
2025 m_386 | m_486 | m_PENT | m_K6_GEODE,
2026
2027 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2028 over esp subtraction. */
2029 m_PENT | m_K6_GEODE,
2030
2031 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2032 for DFmode copies */
2033 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2034
2035 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2036 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2037
2038 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2039 conflict between PPro/Pentium4 based chips that treat 128bit
2040 SSE registers as single units and K8 based chips that divide SSE
2041 registers into two 64bit halves.  This knob promotes all store destinations
2042 to be 128bit to allow register renaming on 128bit SSE units, but usually
2043 results in one extra microop on 64bit SSE units.  Experimental results
2044 show that disabling this option on P4 brings over a 20% SPECfp regression,
2045 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2046 masked by careful scheduling of moves. */
2047 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2048
2049 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2050 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2051
2052 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2053 m_COREI7 | m_BDVER,
2054
2055 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2056 m_BDVER,
2057
2058 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2059 are resolved on SSE register parts instead of whole registers, so we may
2060 maintain just lower part of scalar values in proper format leaving the
2061 upper part undefined. */
2062 m_ATHLON_K8,
2063
2064 /* X86_TUNE_SSE_TYPELESS_STORES */
2065 m_AMD_MULTIPLE,
2066
2067 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2068 m_PPRO | m_P4_NOCONA,
2069
2070 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2071 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2072
2073 /* X86_TUNE_PROLOGUE_USING_MOVE */
2074 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2075
2076 /* X86_TUNE_EPILOGUE_USING_MOVE */
2077 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2078
2079 /* X86_TUNE_SHIFT1 */
2080 ~m_486,
2081
2082 /* X86_TUNE_USE_FFREEP */
2083 m_AMD_MULTIPLE,
2084
2085 /* X86_TUNE_INTER_UNIT_MOVES */
2086 ~(m_AMD_MULTIPLE | m_GENERIC),
2087
2088 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2089 ~(m_AMDFAM10 | m_BDVER),
2090
2091 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2092 than 4 branch instructions in the 16 byte window. */
2093 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2094
2095 /* X86_TUNE_SCHEDULE */
2096 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2097
2098 /* X86_TUNE_USE_BT */
2099 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2100
2101 /* X86_TUNE_USE_INCDEC */
2102 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2103
2104 /* X86_TUNE_PAD_RETURNS */
2105 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2106
2107 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2108 m_ATOM,
2109
2110 /* X86_TUNE_EXT_80387_CONSTANTS */
2111 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2112
2113 /* X86_TUNE_SHORTEN_X87_SSE */
2114 ~m_K8,
2115
2116 /* X86_TUNE_AVOID_VECTOR_DECODE */
2117 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2118
2119 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
2120 HImode and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2121 ~(m_386 | m_486),
2122
2123 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2124 vector path on AMD machines. */
2125 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2126
2127 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2128 machines. */
2129 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2130
2131 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2132 than a MOV. */
2133 m_PENT,
2134
2135 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2136 but one byte longer. */
2137 m_PENT,
2138
2139 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2140 operand that cannot be represented using a modRM byte. The XOR
2141 replacement is long decoded, so this split helps here as well. */
2142 m_K6,
2143
2144 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2145 from FP to FP. */
2146 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2147
2148 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2149 from integer to FP. */
2150 m_AMDFAM10,
2151
2152 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2153 with a subsequent conditional jump instruction into a single
2154 compare-and-branch uop. */
2155 m_BDVER,
2156
2157 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2158 will impact LEA instruction selection. */
2159 m_ATOM,
2160
2161 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2162 instructions. */
2163 ~m_ATOM,
2164
2165 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2166 at -O3.  For the moment, the prefetching seems badly tuned for Intel
2167 chips. */
2168 m_K6_GEODE | m_AMD_MULTIPLE,
2169
2170 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2171 the auto-vectorizer. */
2172 m_BDVER,
2173
2174 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2175 during reassociation of integer computation. */
2176 m_ATOM,
2177
2178 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2179 during reassociation of fp computation. */
2180 m_ATOM
2181};
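/* Worked example of how this table is consumed (set up later in
   ix86_option_override_internal):
     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
   so with -mtune=bdver1, X86_TUNE_AVX128_OPTIMAL (m_BDVER) comes out
   enabled while X86_TUNE_OPT_AGU (m_ATOM) stays disabled.  */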
2182
2183/* Feature tests against the various architecture variations. */
2184unsigned char ix86_arch_features[X86_ARCH_LAST];
2185
2186/* Feature tests against the various architecture variations, used to create
2187 ix86_arch_features based on the processor mask. */
2188static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2189 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2190 ~(m_386 | m_486 | m_PENT | m_K6),
2191
2192 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2193 ~m_386,
2194
2195 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2196 ~(m_386 | m_486),
2197
2198 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2199 ~m_386,
2200
2201 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2202 ~m_386,
2203};
2204
2205static const unsigned int x86_accumulate_outgoing_args
2206 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2207
2208static const unsigned int x86_arch_always_fancy_math_387
2209 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2210
2211static const unsigned int x86_avx256_split_unaligned_load
2212 = m_COREI7 | m_GENERIC;
2213
2214static const unsigned int x86_avx256_split_unaligned_store
2215 = m_COREI7 | m_BDVER | m_GENERIC;
2216
2217 /* In case the average insn count for a single function invocation is
2218 lower than this constant, emit fast (but longer) prologue and
2219 epilogue code. */
2220#define FAST_PROLOGUE_INSN_COUNT 20
2221
2222 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2223static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2224static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2225static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2226
2227/* Array of the smallest class containing reg number REGNO, indexed by
2228 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2229
2230enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2231{
2232 /* ax, dx, cx, bx */
2233 AREG, DREG, CREG, BREG,
2234 /* si, di, bp, sp */
2235 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2236 /* FP registers */
2237 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2238 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2239 /* arg pointer */
2240 NON_Q_REGS,
2241 /* flags, fpsr, fpcr, frame */
2242 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2243 /* SSE registers */
2244 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2245 SSE_REGS, SSE_REGS,
2246 /* MMX registers */
2247 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2248 MMX_REGS, MMX_REGS,
2249 /* REX registers */
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2252 /* SSE REX registers */
2253 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2254 SSE_REGS, SSE_REGS,
2255};
2256
2257/* The "default" register map used in 32bit mode. */
2258
2259int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2260{
2261 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2262 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2263 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2264 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2265 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2267 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2268};
2269
2270/* The "default" register map used in 64bit mode. */
2271
2272int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2273{
2274 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2275 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2276 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2277 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2278 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2279 8,9,10,11,12,13,14,15, /* extended integer registers */
2280 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2281};
2282
2283/* Define the register numbers to be used in Dwarf debugging information.
2284 The SVR4 reference port C compiler uses the following register numbers
2285 in its Dwarf output code:
2286 0 for %eax (gcc regno = 0)
2287 1 for %ecx (gcc regno = 2)
2288 2 for %edx (gcc regno = 1)
2289 3 for %ebx (gcc regno = 3)
2290 4 for %esp (gcc regno = 7)
2291 5 for %ebp (gcc regno = 6)
2292 6 for %esi (gcc regno = 4)
2293 7 for %edi (gcc regno = 5)
2294 The following three DWARF register numbers are never generated by
2295 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2296 believes these numbers have these meanings.
2297 8 for %eip (no gcc equivalent)
2298 9 for %eflags (gcc regno = 17)
2299 10 for %trapno (no gcc equivalent)
2300 It is not at all clear how we should number the FP stack registers
2301 for the x86 architecture. If the version of SDB on x86/svr4 were
2302 a bit less brain dead with respect to floating-point then we would
2303 have a precedent to follow with respect to DWARF register numbers
2304 for x86 FP registers, but the SDB on x86/svr4 is so completely
2305 broken with respect to FP registers that it is hardly worth thinking
2306 of it as something to strive for compatibility with.
2307 The version of x86/svr4 SDB I have at the moment does (partially)
2308 seem to believe that DWARF register number 11 is associated with
2309 the x86 register %st(0), but that's about all. Higher DWARF
2310 register numbers don't seem to be associated with anything in
2311 particular, and even for DWARF regno 11, SDB only seems to under-
2312 stand that it should say that a variable lives in %st(0) (when
2313 asked via an `=' command) if we said it was in DWARF regno 11,
2314 but SDB still prints garbage when asked for the value of the
2315 variable in question (via a `/' command).
2316 (Also note that the labels SDB prints for various FP stack regs
2317 when doing an `x' command are all wrong.)
2318 Note that these problems generally don't affect the native SVR4
2319 C compiler because it doesn't allow the use of -O with -g and
2320 because when it is *not* optimizing, it allocates a memory
2321 location for each floating-point variable, and the memory
2322 location is what gets described in the DWARF AT_location
2323 attribute for the variable in question.
2324 Regardless of the severe mental illness of the x86/svr4 SDB, we
2325 do something sensible here and we use the following DWARF
2326 register numbers. Note that these are all stack-top-relative
2327 numbers.
2328 11 for %st(0) (gcc regno = 8)
2329 12 for %st(1) (gcc regno = 9)
2330 13 for %st(2) (gcc regno = 10)
2331 14 for %st(3) (gcc regno = 11)
2332 15 for %st(4) (gcc regno = 12)
2333 16 for %st(5) (gcc regno = 13)
2334 17 for %st(6) (gcc regno = 14)
2335 18 for %st(7) (gcc regno = 15)
2336*/
2337int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2338{
2339 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2340 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2341 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2342 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2343 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2345 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2346};
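/* Illustration: both maps above send %esi (gcc regno 4) to DWARF
   register 6, but they differ for %ebp and %esp (gcc regnos 6 and 7),
   where dbx_register_map yields 4 and 5 while svr4_dbx_register_map
   yields 5 and 4, matching the SVR4 numbering documented above.  */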
2347
2348/* Define parameter passing and return registers. */
2349
2350static int const x86_64_int_parameter_registers[6] =
2351{
2352 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2353};
2354
2355static int const x86_64_ms_abi_int_parameter_registers[4] =
2356{
2357 CX_REG, DX_REG, R8_REG, R9_REG
2358};
2359
2360static int const x86_64_int_return_registers[4] =
2361{
2362 AX_REG, DX_REG, DI_REG, SI_REG
2363};
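/* Example: for a SysV x86-64 call f (int a, int b), A arrives in %rdi
   and B in %rsi per x86_64_int_parameter_registers, whereas the MS ABI
   uses %rcx and %rdx; an integer return value comes back in %rax
   (AX_REG), the first entry of x86_64_int_return_registers.  */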
2364
2365/* Define the structure for the machine field in struct function. */
2366
2367struct GTY(()) stack_local_entry {
2368 unsigned short mode;
2369 unsigned short n;
2370 rtx rtl;
2371 struct stack_local_entry *next;
2372};
2373
2374/* Structure describing stack frame layout.
2375 Stack grows downward:
2376
2377 [arguments]
2378 <- ARG_POINTER
2379 saved pc
2380
2381 saved static chain if ix86_static_chain_on_stack
2382
2383 saved frame pointer if frame_pointer_needed
2384 <- HARD_FRAME_POINTER
2385 [saved regs]
2386 <- regs_save_offset
2387 [padding0]
2388
2389 [saved SSE regs]
2390 <- sse_regs_save_offset
2391 [padding1] |
2392 | <- FRAME_POINTER
2393 [va_arg registers] |
2394 |
2395 [frame] |
2396 |
2397 [padding2] | = to_allocate
2398 <- STACK_POINTER
2399 */
2400struct ix86_frame
2401{
2402 int nsseregs;
2403 int nregs;
2404 int va_arg_size;
2405 int red_zone_size;
2406 int outgoing_arguments_size;
2407 HOST_WIDE_INT frame;
2408
2409 /* The offsets relative to ARG_POINTER. */
2410 HOST_WIDE_INT frame_pointer_offset;
2411 HOST_WIDE_INT hard_frame_pointer_offset;
2412 HOST_WIDE_INT stack_pointer_offset;
2413 HOST_WIDE_INT hfp_save_offset;
2414 HOST_WIDE_INT reg_save_offset;
2415 HOST_WIDE_INT sse_reg_save_offset;
2416
2417 /* When save_regs_using_mov is set, emit prologue using
2418 move instead of push instructions. */
2419 bool save_regs_using_mov;
2420};
2421
2422/* Which cpu are we scheduling for. */
2423enum attr_cpu ix86_schedule;
2424
2425/* Which cpu are we optimizing for. */
2426enum processor_type ix86_tune;
2427
2428/* Which instruction set architecture to use. */
2429enum processor_type ix86_arch;
2430
2431/* True if processor has SSE prefetch instruction. */
2432int x86_prefetch_sse;
2433
2434/* True if processor has prefetchw instruction. */
2435int x86_prefetchw;
2436
2437/* -mstackrealign option */
2438static const char ix86_force_align_arg_pointer_string[]
2439 = "force_align_arg_pointer";
2440
2441static rtx (*ix86_gen_leave) (void);
2442static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2443static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2444static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2445static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2446static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2447static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2448static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2449static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2450static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2451
2452/* Preferred alignment for stack boundary in bits. */
2453unsigned int ix86_preferred_stack_boundary;
2454
2455/* Alignment for incoming stack boundary in bits specified at
2456 command line. */
2457static unsigned int ix86_user_incoming_stack_boundary;
2458
2459/* Default alignment for incoming stack boundary in bits. */
2460static unsigned int ix86_default_incoming_stack_boundary;
2461
2462/* Alignment for incoming stack boundary in bits. */
2463unsigned int ix86_incoming_stack_boundary;
2464
2465/* Calling abi specific va_list type nodes. */
2466static GTY(()) tree sysv_va_list_type_node;
2467static GTY(()) tree ms_va_list_type_node;
2468
2469/* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2470char internal_label_prefix[16];
2471int internal_label_prefix_len;
2472
2473/* Fence to use after loop using movnt. */
2474tree x86_mfence;
2475
2476 /* Register class used for passing a given 64bit part of the argument.
2477 These represent classes as documented by the psABI, with the exception
2478 of the SSESF and SSEDF classes, which are basically the SSE class, except
2479 that gcc uses SF or DFmode moves instead of DImode to avoid reformatting
2480 penalties.
2481 Similarly, we play games with INTEGERSI_CLASS to use cheaper SImode moves
2482 whenever possible (the upper half is then just padding). */
2483enum x86_64_reg_class
2484 {
2485 X86_64_NO_CLASS,
2486 X86_64_INTEGER_CLASS,
2487 X86_64_INTEGERSI_CLASS,
2488 X86_64_SSE_CLASS,
2489 X86_64_SSESF_CLASS,
2490 X86_64_SSEDF_CLASS,
2491 X86_64_SSEUP_CLASS,
2492 X86_64_X87_CLASS,
2493 X86_64_X87UP_CLASS,
2494 X86_64_COMPLEX_X87_CLASS,
2495 X86_64_MEMORY_CLASS
2496 };
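/* Classification sketch: under these rules a struct { double d; int i; }
   passed by value occupies two eightbytes, classified as
   X86_64_SSEDF_CLASS (the double, moved in DFmode through an SSE
   register) and X86_64_INTEGERSI_CLASS (the int, moved in SImode since
   the upper half of that eightbyte is padding).  */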
2497
2498#define MAX_CLASSES 4
2499
2500/* Table of constants used by fldpi, fldln2, etc.... */
2501static REAL_VALUE_TYPE ext_80387_constants_table [5];
2502static bool ext_80387_constants_init = 0;
2503
2504\f
2505static struct machine_function * ix86_init_machine_status (void);
2506static rtx ix86_function_value (const_tree, const_tree, bool);
2507static bool ix86_function_value_regno_p (const unsigned int);
2508static unsigned int ix86_function_arg_boundary (enum machine_mode,
2509 const_tree);
2510static rtx ix86_static_chain (const_tree, bool);
2511static int ix86_function_regparm (const_tree, const_tree);
2512static void ix86_compute_frame_layout (struct ix86_frame *);
2513static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2514 rtx, rtx, int);
2515static void ix86_add_new_builtins (HOST_WIDE_INT);
2516static tree ix86_canonical_va_list_type (tree);
2517static void predict_jump (int);
2518static unsigned int split_stack_prologue_scratch_regno (void);
2519static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2520
2521enum ix86_function_specific_strings
2522{
2523 IX86_FUNCTION_SPECIFIC_ARCH,
2524 IX86_FUNCTION_SPECIFIC_TUNE,
2525 IX86_FUNCTION_SPECIFIC_MAX
2526};
2527
2528static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2529 const char *, enum fpmath_unit, bool);
2530static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2531static void ix86_function_specific_save (struct cl_target_option *);
2532static void ix86_function_specific_restore (struct cl_target_option *);
2533static void ix86_function_specific_print (FILE *, int,
2534 struct cl_target_option *);
2535static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2536static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2537 struct gcc_options *);
2538static bool ix86_can_inline_p (tree, tree);
2539static void ix86_set_current_function (tree);
2540static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2541
2542static enum calling_abi ix86_function_abi (const_tree);
2543
2544\f
2545#ifndef SUBTARGET32_DEFAULT_CPU
2546#define SUBTARGET32_DEFAULT_CPU "i386"
2547#endif
2548
2549/* The svr4 ABI for the i386 says that records and unions are returned
2550 in memory. */
2551#ifndef DEFAULT_PCC_STRUCT_RETURN
2552#define DEFAULT_PCC_STRUCT_RETURN 1
2553#endif
2554
2555/* Whether -mtune= or -march= were specified */
2556static int ix86_tune_defaulted;
2557static int ix86_arch_specified;
2558
2559/* Vectorization library interface and handlers. */
2560static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2561
2562static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2563static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2564
2565/* Processor target table, indexed by processor number */
2566struct ptt
2567{
2568 const struct processor_costs *cost; /* Processor costs */
2569 const int align_loop; /* Default alignments. */
2570 const int align_loop_max_skip;
2571 const int align_jump;
2572 const int align_jump_max_skip;
2573 const int align_func;
2574};
2575
2576static const struct ptt processor_target_table[PROCESSOR_max] =
2577{
2578 {&i386_cost, 4, 3, 4, 3, 4},
2579 {&i486_cost, 16, 15, 16, 15, 16},
2580 {&pentium_cost, 16, 7, 16, 7, 16},
2581 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2582 {&geode_cost, 0, 0, 0, 0, 0},
2583 {&k6_cost, 32, 7, 32, 7, 32},
2584 {&athlon_cost, 16, 7, 16, 7, 16},
2585 {&pentium4_cost, 0, 0, 0, 0, 0},
2586 {&k8_cost, 16, 7, 16, 7, 16},
2587 {&nocona_cost, 0, 0, 0, 0, 0},
2588 /* Core 2 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core 2 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 /* Core i7 32-bit. */
2593 {&generic32_cost, 16, 10, 16, 10, 16},
2594 /* Core i7 64-bit. */
2595 {&generic64_cost, 16, 10, 16, 10, 16},
2596 {&generic32_cost, 16, 7, 16, 7, 16},
2597 {&generic64_cost, 16, 10, 16, 10, 16},
2598 {&amdfam10_cost, 32, 24, 32, 7, 32},
2599 {&bdver1_cost, 32, 24, 32, 7, 32},
2600 {&bdver2_cost, 32, 24, 32, 7, 32},
2601 {&btver1_cost, 32, 24, 32, 7, 32},
2602 {&atom_cost, 16, 15, 16, 7, 16}
2603};
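/* Reading the table: the fields after the cost pointer are align_loop,
   align_loop_max_skip, align_jump, align_jump_max_skip and align_func,
   so e.g. the K8 entry {&k8_cost, 16, 7, 16, 7, 16} requests 16-byte
   loop, jump and function alignment, skipping at most 7 padding
   bytes.  */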
2604
2605static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2606{
2607 "generic",
2608 "i386",
2609 "i486",
2610 "pentium",
2611 "pentium-mmx",
2612 "pentiumpro",
2613 "pentium2",
2614 "pentium3",
2615 "pentium4",
2616 "pentium-m",
2617 "prescott",
2618 "nocona",
2619 "core2",
2620 "corei7",
2621 "atom",
2622 "geode",
2623 "k6",
2624 "k6-2",
2625 "k6-3",
2626 "athlon",
2627 "athlon-4",
2628 "k8",
2629 "amdfam10",
2630 "bdver1",
2631 "bdver2",
2632 "btver1"
2633};
2634\f
2635/* Return true if a red-zone is in use. */
2636
2637static inline bool
2638ix86_using_red_zone (void)
2639{
2640 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2641}
2642\f
2643/* Return a string that documents the current -m options. The caller is
2644 responsible for freeing the string. */
2645
2646static char *
2647ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2648 const char *tune, enum fpmath_unit fpmath,
2649 bool add_nl_p)
2650{
2651 struct ix86_target_opts
2652 {
2653 const char *option; /* option string */
2654 HOST_WIDE_INT mask; /* isa mask options */
2655 };
2656
2657 /* This table is ordered so that options like -msse4.2 that imply
2658 other options are matched first. */
2659 static struct ix86_target_opts isa_opts[] =
2660 {
2661 { "-m64", OPTION_MASK_ISA_64BIT },
2662 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2663 { "-mfma", OPTION_MASK_ISA_FMA },
2664 { "-mxop", OPTION_MASK_ISA_XOP },
2665 { "-mlwp", OPTION_MASK_ISA_LWP },
2666 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2667 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2668 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2669 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2670 { "-msse3", OPTION_MASK_ISA_SSE3 },
2671 { "-msse2", OPTION_MASK_ISA_SSE2 },
2672 { "-msse", OPTION_MASK_ISA_SSE },
2673 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2674 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2675 { "-mmmx", OPTION_MASK_ISA_MMX },
2676 { "-mabm", OPTION_MASK_ISA_ABM },
2677 { "-mbmi", OPTION_MASK_ISA_BMI },
2678 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2679 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2680 { "-mtbm", OPTION_MASK_ISA_TBM },
2681 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2682 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2683 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2684 { "-maes", OPTION_MASK_ISA_AES },
2685 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2686 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2687 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2688 { "-mf16c", OPTION_MASK_ISA_F16C },
2689 };
2690
2691 /* Flag options. */
2692 static struct ix86_target_opts flag_opts[] =
2693 {
2694 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2695 { "-m80387", MASK_80387 },
2696 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2697 { "-malign-double", MASK_ALIGN_DOUBLE },
2698 { "-mcld", MASK_CLD },
2699 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2700 { "-mieee-fp", MASK_IEEE_FP },
2701 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2702 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2703 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2704 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2705 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2706 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2707 { "-mno-red-zone", MASK_NO_RED_ZONE },
2708 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2709 { "-mrecip", MASK_RECIP },
2710 { "-mrtd", MASK_RTD },
2711 { "-msseregparm", MASK_SSEREGPARM },
2712 { "-mstack-arg-probe", MASK_STACK_PROBE },
2713 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2714 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2715 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2716 { "-mvzeroupper", MASK_VZEROUPPER },
2717 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2718 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2719 { "-mprefer-avx128", MASK_PREFER_AVX128},
2720 };
2721
2722 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2723
2724 char isa_other[40];
2725 char target_other[40];
2726 unsigned num = 0;
2727 unsigned i, j;
2728 char *ret;
2729 char *ptr;
2730 size_t len;
2731 size_t line_len;
2732 size_t sep_len;
2733
2734 memset (opts, '\0', sizeof (opts));
2735
2736 /* Add -march= option. */
2737 if (arch)
2738 {
2739 opts[num][0] = "-march=";
2740 opts[num++][1] = arch;
2741 }
2742
2743 /* Add -mtune= option. */
2744 if (tune)
2745 {
2746 opts[num][0] = "-mtune=";
2747 opts[num++][1] = tune;
2748 }
2749
2750 /* Pick out the ISA options. */
2751 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2752 {
2753 if ((isa & isa_opts[i].mask) != 0)
2754 {
2755 opts[num++][0] = isa_opts[i].option;
2756 isa &= ~ isa_opts[i].mask;
2757 }
2758 }
2759
2760 if (isa && add_nl_p)
2761 {
2762 opts[num++][0] = isa_other;
2763 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2764 isa);
2765 }
2766
2767 /* Add flag options. */
2768 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2769 {
2770 if ((flags & flag_opts[i].mask) != 0)
2771 {
2772 opts[num++][0] = flag_opts[i].option;
2773 flags &= ~ flag_opts[i].mask;
2774 }
2775 }
2776
2777 if (flags && add_nl_p)
2778 {
2779 opts[num++][0] = target_other;
2780 sprintf (target_other, "(other flags: %#x)", flags);
2781 }
2782
2783 /* Add -mfpmath= option. */
2784 if (fpmath)
2785 {
2786 opts[num][0] = "-mfpmath=";
2787 switch ((int) fpmath)
2788 {
2789 case FPMATH_387:
2790 opts[num++][1] = "387";
2791 break;
2792
2793 case FPMATH_SSE:
2794 opts[num++][1] = "sse";
2795 break;
2796
2797 case FPMATH_387 | FPMATH_SSE:
2798 opts[num++][1] = "sse+387";
2799 break;
2800
2801 default:
2802 gcc_unreachable ();
2803 }
2804 }
2805
2806 /* Any options? */
2807 if (num == 0)
2808 return NULL;
2809
2810 gcc_assert (num < ARRAY_SIZE (opts));
2811
2812 /* Size the string. */
2813 len = 0;
2814 sep_len = (add_nl_p) ? 3 : 1;
2815 for (i = 0; i < num; i++)
2816 {
2817 len += sep_len;
2818 for (j = 0; j < 2; j++)
2819 if (opts[i][j])
2820 len += strlen (opts[i][j]);
2821 }
2822
2823 /* Build the string. */
2824 ret = ptr = (char *) xmalloc (len);
2825 line_len = 0;
2826
2827 for (i = 0; i < num; i++)
2828 {
2829 size_t len2[2];
2830
2831 for (j = 0; j < 2; j++)
2832 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2833
2834 if (i != 0)
2835 {
2836 *ptr++ = ' ';
2837 line_len++;
2838
2839 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2840 {
2841 *ptr++ = '\\';
2842 *ptr++ = '\n';
2843 line_len = 0;
2844 }
2845 }
2846
2847 for (j = 0; j < 2; j++)
2848 if (opts[i][j])
2849 {
2850 memcpy (ptr, opts[i][j], len2[j]);
2851 ptr += len2[j];
2852 line_len += len2[j];
2853 }
2854 }
2855
2856 *ptr = '\0';
2857 gcc_assert (ret + len >= ptr);
2858
2859 return ret;
2860}
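/* Usage sketch (the exact output depends on the options in effect):
     char *s = ix86_target_string (ix86_isa_flags, target_flags,
				   "corei7", "generic", FPMATH_SSE,
				   false);
   might yield "-march=corei7 -mtune=generic -m64 -msse4.2 ...", since
   -march=/-mtune= are emitted first, then ISA options, then flag
   options, then -mfpmath=; the caller must free the result.
   ix86_debug_options below is one such caller.  */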
2861
2862 /* Return true if profiling code should be emitted before the prologue,
2863 and false otherwise.
2864 Note: for x86, the "hotfix" case is rejected with a sorry () diagnostic. */
2865static bool
2866ix86_profile_before_prologue (void)
2867{
2868 return flag_fentry != 0;
2869}
2870
2871/* Function that is callable from the debugger to print the current
2872 options. */
2873void
2874ix86_debug_options (void)
2875{
2876 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2877 ix86_arch_string, ix86_tune_string,
2878 ix86_fpmath, true);
2879
2880 if (opts)
2881 {
2882 fprintf (stderr, "%s\n\n", opts);
2883 free (opts);
2884 }
2885 else
2886 fputs ("<no options>\n\n", stderr);
2887
2888 return;
2889}
2890\f
2891/* Override various settings based on options. If MAIN_ARGS_P, the
2892 options are from the command line, otherwise they are from
2893 attributes. */
2894
2895static void
2896ix86_option_override_internal (bool main_args_p)
2897{
2898 int i;
2899 unsigned int ix86_arch_mask, ix86_tune_mask;
2900 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2901 const char *prefix;
2902 const char *suffix;
2903 const char *sw;
2904
2905#define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2906#define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2907#define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2908#define PTA_ABM (HOST_WIDE_INT_1 << 3)
2909#define PTA_AES (HOST_WIDE_INT_1 << 4)
2910#define PTA_AVX (HOST_WIDE_INT_1 << 5)
2911#define PTA_BMI (HOST_WIDE_INT_1 << 6)
2912#define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2913#define PTA_F16C (HOST_WIDE_INT_1 << 8)
2914#define PTA_FMA (HOST_WIDE_INT_1 << 9)
2915#define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2916#define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2917#define PTA_LWP (HOST_WIDE_INT_1 << 12)
2918#define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2919#define PTA_MMX (HOST_WIDE_INT_1 << 14)
2920#define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2921#define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2922#define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2923#define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2924#define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2925#define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2926#define PTA_SSE (HOST_WIDE_INT_1 << 21)
2927#define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2928#define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2929#define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2930#define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2931#define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2932#define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2933#define PTA_TBM (HOST_WIDE_INT_1 << 28)
2934#define PTA_XOP (HOST_WIDE_INT_1 << 29)
2935#define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2936#define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2937#define PTA_PREFETCHW (HOST_WIDE_INT_1 << 32)
2938
2939 /* If this reaches 64, the flags field of struct pta below needs to be widened. */
2940
2941 static struct pta
2942 {
2943 const char *const name; /* processor name or nickname. */
2944 const enum processor_type processor;
2945 const enum attr_cpu schedule;
2946 const unsigned HOST_WIDE_INT flags;
2947 }
2948 const processor_alias_table[] =
2949 {
2950 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2951 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2952 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2953 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2954 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2955 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2956 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2957 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2958 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2959 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2960 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2961 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2962 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2963 PTA_MMX | PTA_SSE},
2964 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2965 PTA_MMX | PTA_SSE},
2966 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2967 PTA_MMX | PTA_SSE | PTA_SSE2},
2968 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2969 PTA_MMX | PTA_SSE | PTA_SSE2},
2970 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2971 PTA_MMX | PTA_SSE | PTA_SSE2},
2972 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2973 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2974 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_CX16 | PTA_NO_SAHF},
2977 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_CX16},
2980 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2981 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2982 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT},
2983 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2984 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2985 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2986 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2987 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2988 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2989 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2990 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2991 | PTA_RDRND | PTA_F16C},
2992 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2995 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2996 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2997 | PTA_FMA | PTA_MOVBE},
2998 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2999 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3000 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3001 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3002 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3003 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3004 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3005 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3006 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3008 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3010 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3011 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3012 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3013 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3014 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3015 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3016 {"x86-64", PROCESSOR_K8, CPU_K8,
3017 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3018 {"k8", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"opteron", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon64", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3036 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_NO_SAHF},
3039 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3040 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3041 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3042 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3043 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3044 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3045 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3046 PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3047 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
3048 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3049 | PTA_FMA4 | PTA_XOP | PTA_LWP},
3050 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3051 PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3052 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
3053 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3054 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3055 | PTA_FMA},
3056 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3057 PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3058 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3059 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3060 0 /* flags are only used for -march switch. */ },
3061 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3062 PTA_64BIT /* flags are only used for -march switch. */ },
3063 };
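/* Sketch of how the alias table is consumed (the scan itself happens
   further down in this function): -march=NAME is looked up by strcmp
   in processor_alias_table; the matching entry supplies ix86_arch and
   ix86_schedule, and each PTA_* bit is translated into the matching
   OPTION_MASK_ISA_* flag unless that ISA was explicitly disabled on
   the command line.  */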
3064
3065 /* -mrecip options. */
3066 static struct
3067 {
3068 const char *string; /* option name */
3069 unsigned int mask; /* mask bits to set */
3070 }
3071 const recip_options[] =
3072 {
3073 { "all", RECIP_MASK_ALL },
3074 { "none", RECIP_MASK_NONE },
3075 { "div", RECIP_MASK_DIV },
3076 { "sqrt", RECIP_MASK_SQRT },
3077 { "vec-div", RECIP_MASK_VEC_DIV },
3078 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3079 };
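/* Example: a later loop in this function splits the -mrecip= argument
   on commas and looks each piece up in recip_options, so
   -mrecip=div,vec-sqrt would OR RECIP_MASK_DIV and RECIP_MASK_VEC_SQRT
   into recip_mask.  */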
3080
3081 int const pta_size = ARRAY_SIZE (processor_alias_table);
3082
3083 /* Set up prefix/suffix so the error messages refer to either the command
3084 line argument, or the attribute(target). */
3085 if (main_args_p)
3086 {
3087 prefix = "-m";
3088 suffix = "";
3089 sw = "switch";
3090 }
3091 else
3092 {
3093 prefix = "option(\"";
3094 suffix = "\")";
3095 sw = "attribute";
3096 }

#ifdef SUBTARGET_OVERRIDE_OPTIONS
  SUBTARGET_OVERRIDE_OPTIONS;
#endif

#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
  SUBSUBTARGET_OVERRIDE_OPTIONS;
#endif

  if (TARGET_X32)
    ix86_isa_flags |= OPTION_MASK_ISA_64BIT;

  /* PIC is the default for 64-bit Mach-O (Darwin) code.  */
  if (TARGET_MACHO && TARGET_64BIT)
    flag_pic = 2;

  /* Need to check -mtune=generic first.  */
  if (ix86_tune_string)
    {
      if (!strcmp (ix86_tune_string, "generic")
          || !strcmp (ix86_tune_string, "i686")
          /* As special support for cross compilers, we treat
             -mtune=native as -mtune=generic.  A native compiler never
             sees -mtune=native here, because the driver has already
             rewritten it.  */
          || !strcmp (ix86_tune_string, "native"))
        {
          if (TARGET_64BIT)
            ix86_tune_string = "generic64";
          else
            ix86_tune_string = "generic32";
        }
      /* If this call is for setting the option attribute, allow the
         generic32/generic64 that was previously set.  */
      else if (!main_args_p
               && (!strcmp (ix86_tune_string, "generic32")
                   || !strcmp (ix86_tune_string, "generic64")))
        ;
      else if (!strncmp (ix86_tune_string, "generic", 7))
        error ("bad value (%s) for %stune=%s %s",
               ix86_tune_string, prefix, suffix, sw);
      else if (!strcmp (ix86_tune_string, "x86-64"))
        warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
                 "%stune=k8%s or %stune=generic%s instead as appropriate",
                 prefix, suffix, prefix, suffix, prefix, suffix);
    }
  else
    {
      if (ix86_arch_string)
        ix86_tune_string = ix86_arch_string;
      if (!ix86_tune_string)
        {
          ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
          ix86_tune_defaulted = 1;
        }

      /* ix86_tune_string is set to ix86_arch_string or defaulted.  We
         need to use a sensible tune option.  */
      if (!strcmp (ix86_tune_string, "generic")
          || !strcmp (ix86_tune_string, "x86-64")
          || !strcmp (ix86_tune_string, "i686"))
        {
          if (TARGET_64BIT)
            ix86_tune_string = "generic64";
          else
            ix86_tune_string = "generic32";
        }
    }
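  /* Past this point ix86_tune_string names a concrete table entry: the
     aliases "generic", "i686", "native" and a defaulted "x86-64" have
     all been rewritten to generic32 or generic64 above.  */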

  if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
    {
      /* rep; movq isn't available in 32-bit code.  */
      error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
      ix86_stringop_alg = no_stringop;
    }

  if (!ix86_arch_string)
    ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
  else
    ix86_arch_specified = 1;

  if (!global_options_set.x_ix86_abi)
    ix86_abi = DEFAULT_ABI;

  if (global_options_set.x_ix86_cmodel)
    {
      switch (ix86_cmodel)
        {
        case CM_SMALL:
        case CM_SMALL_PIC:
          if (flag_pic)
            ix86_cmodel = CM_SMALL_PIC;
          if (!TARGET_64BIT)
            error ("code model %qs not supported in the %s bit mode",
                   "small", "32");
          break;

        case CM_MEDIUM:
        case CM_MEDIUM_PIC:
          if (flag_pic)
            ix86_cmodel = CM_MEDIUM_PIC;
          if (!TARGET_64BIT)
            error ("code model %qs not supported in the %s bit mode",
                   "medium", "32");
          else if (TARGET_X32)
            error ("code model %qs not supported in x32 mode",
                   "medium");
          break;

        case CM_LARGE:
        case CM_LARGE_PIC:
          if (flag_pic)
            ix86_cmodel = CM_LARGE_PIC;
          if (!TARGET_64BIT)
            error ("code model %qs not supported in the %s bit mode",
                   "large", "32");
          else if (TARGET_X32)
            error ("code model %qs not supported in x32 mode",
                   "large");
          break;

        case CM_32:
          if (flag_pic)
            error ("code model %s does not support PIC mode", "32");
          if (TARGET_64BIT)
            error ("code model %qs not supported in the %s bit mode",
                   "32", "64");
          break;

        case CM_KERNEL:
          if (flag_pic)
            {
              error ("code model %s does not support PIC mode", "kernel");
              ix86_cmodel = CM_32;
            }
          if (!TARGET_64BIT)
            error ("code model %qs not supported in the %s bit mode",
                   "kernel", "32");
          break;

        default:
          gcc_unreachable ();
        }
    }
  else
    {
      /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
         use of rip-relative addressing.  This eliminates fixups that
         would otherwise be needed if this object is to be placed in a
         DLL, and is essentially just as efficient as direct addressing.  */
      if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
        ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
      else if (TARGET_64BIT)
        ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
      else
        ix86_cmodel = CM_32;
    }
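  /* Net effect of the defaulting above: a 64-bit MS-ABI target gets
     CM_SMALL_PIC with PIC forced on; other 64-bit targets get CM_SMALL
     or CM_SMALL_PIC depending on flag_pic; everything else falls back
     to CM_32.  */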
  if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
    {
      error ("-masm=intel not supported in this configuration");
      ix86_asm_dialect = ASM_ATT;
    }
  if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
    sorry ("%i-bit mode not compiled in",
           (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);

  for (i = 0; i < pta_size; i++)
    if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
      {
        ix86_schedule = processor_alias_table[i].schedule;
        ix86_arch = processor_alias_table[i].processor;
        /* Default cpu tuning to the architecture.  */
        ix86_tune = ix86_arch;

        if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
          error ("CPU you selected does not support x86-64 "
                 "instruction set");

        if (processor_alias_table[i].flags & PTA_MMX
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
          ix86_isa_flags |= OPTION_MASK_ISA_MMX;
        if (processor_alias_table[i].flags & PTA_3DNOW
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
          ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
        if (processor_alias_table[i].flags & PTA_3DNOW_A
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
          ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
        if (processor_alias_table[i].flags & PTA_SSE
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
          ix86_isa_flags |= OPTION_MASK_ISA_SSE;
        if (processor_alias_table[i].flags & PTA_SSE2
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
          ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
        if (processor_alias_table[i].flags & PTA_SSE3
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
          ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
        if (processor_alias_table[i].flags & PTA_SSSE3
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
          ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
        if (processor_alias_table[i].flags & PTA_SSE4_1
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
          ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
        if (processor_alias_table[i].flags & PTA_SSE4_2
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
          ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
        if (processor_alias_table[i].flags & PTA_AVX
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
          ix86_isa_flags |= OPTION_MASK_ISA_AVX;
        if (processor_alias_table[i].flags & PTA_AVX2
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
          ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
        if (processor_alias_table[i].flags & PTA_FMA
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
          ix86_isa_flags |= OPTION_MASK_ISA_FMA;
        if (processor_alias_table[i].flags & PTA_SSE4A
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
          ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
        if (processor_alias_table[i].flags & PTA_FMA4
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
          ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
        if (processor_alias_table[i].flags & PTA_XOP
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
          ix86_isa_flags |= OPTION_MASK_ISA_XOP;
        if (processor_alias_table[i].flags & PTA_LWP
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
          ix86_isa_flags |= OPTION_MASK_ISA_LWP;
        if (processor_alias_table[i].flags & PTA_ABM
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
          ix86_isa_flags |= OPTION_MASK_ISA_ABM;
        if (processor_alias_table[i].flags & PTA_BMI
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
          ix86_isa_flags |= OPTION_MASK_ISA_BMI;
        if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
          ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
        if (processor_alias_table[i].flags & PTA_TBM
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
          ix86_isa_flags |= OPTION_MASK_ISA_TBM;
        if (processor_alias_table[i].flags & PTA_BMI2
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
          ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
        if (processor_alias_table[i].flags & PTA_CX16
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
          ix86_isa_flags |= OPTION_MASK_ISA_CX16;
        if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
          ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
        if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
          ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
        if (processor_alias_table[i].flags & PTA_MOVBE
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
          ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
        if (processor_alias_table[i].flags & PTA_AES
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
          ix86_isa_flags |= OPTION_MASK_ISA_AES;
        if (processor_alias_table[i].flags & PTA_PCLMUL
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
          ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
        if (processor_alias_table[i].flags & PTA_FSGSBASE
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
          ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
        if (processor_alias_table[i].flags & PTA_RDRND
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
          ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
        if (processor_alias_table[i].flags & PTA_F16C
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
          ix86_isa_flags |= OPTION_MASK_ISA_F16C;
        if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
          x86_prefetch_sse = true;
        if (processor_alias_table[i].flags & PTA_PREFETCHW)
          x86_prefetchw = true;

        break;
      }
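  /* Two implications are encoded in the loop above: PTA_ABM turns on
     both LZCNT and POPCNT (via the (PTA_LZCNT | PTA_ABM) and
     (PTA_POPCNT | PTA_ABM) tests), and SAHF is assumed available
     unless the CPU is a 64-bit part flagged PTA_NO_SAHF.  */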

  if (!strcmp (ix86_arch_string, "generic"))
    error ("generic CPU can be used only for %stune=%s %s",
           prefix, suffix, sw);
  else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
    error ("bad value (%s) for %sarch=%s %s",
           ix86_arch_string, prefix, suffix, sw);

  ix86_arch_mask = 1u << ix86_arch;
  for (i = 0; i < X86_ARCH_LAST; ++i)
    ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
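  /* Illustration of the bit-mask scheme: each
     initial_ix86_arch_features[i] is a bitmap indexed by the processor
     enum, so with ix86_arch set to, say, PROCESSOR_K8, feature i is
     enabled iff the PROCESSOR_K8 bit is set in that feature's
     initializer.  The tune-feature loop below works the same way.  */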

  for (i = 0; i < pta_size; i++)
    if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
      {
        ix86_schedule = processor_alias_table[i].schedule;
        ix86_tune = processor_alias_table[i].processor;
        if (TARGET_64BIT)
          {
            if (!(processor_alias_table[i].flags & PTA_64BIT))
              {
                if (ix86_tune_defaulted)
                  {
                    ix86_tune_string = "x86-64";
                    for (i = 0; i < pta_size; i++)
                      if (! strcmp (ix86_tune_string,
                                    processor_alias_table[i].name))
                        break;
                    ix86_schedule = processor_alias_table[i].schedule;
                    ix86_tune = processor_alias_table[i].processor;
                  }
                else
                  error ("CPU you selected does not support x86-64 "
                         "instruction set");
              }
          }
        else
          {
            /* Adjust tuning when compiling for 32-bit ABI.  */
            switch (ix86_tune)
              {
              case PROCESSOR_GENERIC64:
                ix86_tune = PROCESSOR_GENERIC32;
                ix86_schedule = CPU_PENTIUMPRO;
                break;

              case PROCESSOR_CORE2_64:
                ix86_tune = PROCESSOR_CORE2_32;
                break;

              case PROCESSOR_COREI7_64:
                ix86_tune = PROCESSOR_COREI7_32;
                break;

              default:
                break;
              }
          }
        /* Intel CPUs have always interpreted SSE prefetch instructions as
           NOPs; so, we can enable SSE prefetch instructions even when
           -mtune (rather than -march) points us to a processor that has them.
           However, the VIA C3 gives a SIGILL, so we only do that for i686 and
           higher processors.  */
        if (TARGET_CMOV
            && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
          x86_prefetch_sse = true;
        break;
      }

  if (ix86_tune_specified && i == pta_size)
    error ("bad value (%s) for %stune=%s %s",
           ix86_tune_string, prefix, suffix, sw);

  ix86_tune_mask = 1u << ix86_tune;
  for (i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

#ifndef USE_IX86_FRAME_POINTER
#define USE_IX86_FRAME_POINTER 0
#endif

#ifndef USE_X86_64_FRAME_POINTER
#define USE_X86_64_FRAME_POINTER 0
#endif

  /* Set the default values for switches whose default depends on TARGET_64BIT
     in case they weren't overwritten by command line options.  */
  if (TARGET_64BIT)
    {
      if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
        flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
      if (flag_asynchronous_unwind_tables == 2)
        flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
      if (flag_pcc_struct_return == 2)
        flag_pcc_struct_return = 0;
    }
  else
    {
      if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
        flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
      if (flag_asynchronous_unwind_tables == 2)
        flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
      if (flag_pcc_struct_return == 2)
        flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
    }
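  /* Concretely, with the zero fallback defines above: a 64-bit compile
     at -O1 or higher omits the frame pointer and enables asynchronous
     unwind tables by default, while a 32-bit compile omits the frame
     pointer only when USE_IX86_FRAME_POINTER is 0 and we are not
     optimizing for size.  */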

  if (optimize_size)
    ix86_cost = &ix86_size_cost;
  else
    ix86_cost = processor_target_table[ix86_tune].cost;

  /* Arrange to set up i386_stack_locals for all functions.  */
  init_machine_status = ix86_init_machine_status;

  /* Validate -mregparm= value.  */
  if (global_options_set.x_ix86_regparm)
    {
      if (TARGET_64BIT)
        warning (0, "-mregparm is ignored in 64-bit mode");
      if (ix86_regparm > REGPARM_MAX)
        {
          error ("-mregparm=%d is not between 0 and %d",
                 ix86_regparm, REGPARM_MAX);
          ix86_regparm = 0;
        }
    }
  if (TARGET_64BIT)
    ix86_regparm = REGPARM_MAX;
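  /* For reference: on ia32, -mregparm=3 (the usual REGPARM_MAX) asks
     for the first three integer arguments to be passed in EAX, EDX and
     ECX instead of on the stack; in 64-bit mode the register-passing
     convention is fixed by the ABI, hence the unconditional clamp to
     REGPARM_MAX above.  */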

  /* Default align_* from the processor table.  */
  if (align_loops == 0)
    {
      align_loops = processor_target_table[ix86_tune].align_loop;
      align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
    }
  if (align_jumps == 0)
    {
      align_jumps = processor_target_table[ix86_tune].align_jump;
      align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
    }
  if (align_functions == 0)
    {
      align_functions = processor_target_table[ix86_tune].align_func;
    }

  /* Provide default for -mbranch-cost= value.  */
  if (!global_options_set.x_ix86_branch_cost)
    ix86_branch_cost = ix86_cost->branch_cost;

  if (TARGET_64BIT)
    {
      target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;

      /* Enable by default the SSE and MMX builtins.  Do allow the user to
         explicitly disable any of these.  In particular, disabling SSE and
         MMX for kernel code is extremely useful.  */
      if (!ix86_arch_specified)
        ix86_isa_flags
          |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
               | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);

      if (TARGET_RTD)
        warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
    }
  else
    {
      target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;

      if (!ix86_arch_specified)
        ix86_isa_flags
          |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;

      /* The i386 ABI does not specify a red zone.  It can still make
         sense to use one when the programmer takes care to keep the
         stack from being destroyed.  */
      if (!(target_flags_explicit & MASK_NO_RED_ZONE))
        target_flags |= MASK_NO_RED_ZONE;
    }

  /* Keep nonleaf frame pointers.  */
  if (flag_omit_frame_pointer)
    target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
  else if (TARGET_OMIT_LEAF_FRAME_POINTER)
    flag_omit_frame_pointer = 1;

  /* If we're doing fast math, we don't care about comparison order
     wrt NaNs.  This lets us use a shorter comparison sequence.  */
  if (flag_finite_math_only)
    target_flags &= ~MASK_IEEE_FP;

  /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
     since the insns won't need emulation.  */
  if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
    target_flags &= ~MASK_NO_FANCY_MATH_387;

  /* Likewise, if the target doesn't have a 387, or we've specified
     software floating point, don't use 387 inline intrinsics.  */
  if (!TARGET_80387)
    target_flags |= MASK_NO_FANCY_MATH_387;

  /* Turn on MMX builtins for -msse.  */
  if (TARGET_SSE)
    {
      ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
      x86_prefetch_sse = true;
    }

  /* Turn on popcnt instruction for -msse4.2 or -mabm.  */
  if (TARGET_SSE4_2 || TARGET_ABM)
    ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;

  /* Turn on lzcnt instruction for -mabm.  */
  if (TARGET_ABM)
    ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;

  /* Validate -mpreferred-stack-boundary= value or default it to
     PREFERRED_STACK_BOUNDARY_DEFAULT.  */
  ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
  if (global_options_set.x_ix86_preferred_stack_boundary_arg)
    {
      int min = (TARGET_64BIT ? 4 : 2);
      int max = (TARGET_SEH ? 4 : 12);

      if (ix86_preferred_stack_boundary_arg < min
          || ix86_preferred_stack_boundary_arg > max)
        {
          if (min == max)
            error ("-mpreferred-stack-boundary is not supported "
                   "for this target");
          else
            error ("-mpreferred-stack-boundary=%d is not between %d and %d",
                   ix86_preferred_stack_boundary_arg, min, max);
        }
      else
        ix86_preferred_stack_boundary
          = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
    }
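  /* The option argument is a log2 byte count, so e.g.
     -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
     = 128 bits, i.e. a 16-byte boundary (BITS_PER_UNIT being 8
     on this target).  */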

  /* Set the default value for -mstackrealign.  */
  if (ix86_force_align_arg_pointer == -1)
    ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;

  ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;

  /* Validate -mincoming-stack-boundary= value or default it to
     MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
  ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
  if (global_options_set.x_ix86_incoming_stack_boundary_arg)
    {
      if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
          || ix86_incoming_stack_boundary_arg > 12)
        error ("-mincoming-stack-boundary=%d is not between %d and 12",
               ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
      else
        {
          ix86_user_incoming_stack_boundary