contrib/gcc-5.0/gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
   45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
   63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "hash-set.h"
  93 #include "machmode.h"
  94 #include "vec.h"
  95 #include "double-int.h"
  96 #include "input.h"
  97 #include "alias.h"
  98 #include "symtab.h"
  99 #include "wide-int.h"
 100 #include "inchash.h"
 101 #include "tree.h"
 102 #include "fold-const.h"
 103 #include "predict.h"
 104 #include "hard-reg-set.h"
 105 #include "function.h"
 106 #include "dominance.h"
 107 #include "cfg.h"
 108 #include "basic-block.h"
 109 #include "tree-ssa-alias.h"
 110 #include "internal-fn.h"
 111 #include "gimple-fold.h"
 112 #include "gimple-expr.h"
 113 #include "is-a.h"
 114 #include "gimple.h"
 115 #include "gimple-iterator.h"
 116 #include "gimplify.h"
 117 #include "gimplify-me.h"
 118 #include "stor-layout.h"
 119 #include "gimple-ssa.h"
 120 #include "tree-cfg.h"
 121 #include "tree-phinodes.h"
 122 #include "ssa-iterators.h"
 123 #include "stringpool.h"
 124 #include "tree-ssanames.h"
 125 #include "hashtab.h"
 126 #include "rtl.h"
 127 #include "statistics.h"
 128 #include "real.h"
 129 #include "fixed-value.h"
 130 #include "insn-config.h"
 131 #include "expmed.h"
 132 #include "dojump.h"
 133 #include "explow.h"
 134 #include "calls.h"
 135 #include "emit-rtl.h"
 136 #include "varasm.h"
 137 #include "stmt.h"
 138 #include "expr.h"
 139 #include "tree-dfa.h"
 140 #include "tree-ssa.h"
 141 #include "tree-pass.h"
 142 #include "alloc-pool.h"
 143 #include "target.h"
 144 #include "gimple-pretty-print.h"
 145 #include "builtins.h"
 146
 147 /* FIXME: RTL headers have to be included here for optabs.  */
 148 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 149 #include "expr.h"               /* Because optabs.h wants sepops.  */
 150 #include "insn-codes.h"
 151 #include "optabs.h"
 152
  153 /* This structure represents one basic block that either computes a
  154    division, or is a common dominator for basic blocks that compute a
  155    division.  Instances are allocated by occ_new, which also stores a
         pointer to the new instance in BB->aux, and are linked into a subset
         of the dominator tree through the CHILDREN and NEXT fields.  */
  156 struct occurrence {
  157   /* The basic block represented by this structure.  */
  158   basic_block bb;
  159
  160   /* If non-NULL, the SSA_NAME holding a reciprocal that can be used in
  161      BB: either the one inserted in BB itself, or the one handed down
           from a dominating occurrence by insert_reciprocals.  */
  162   tree recip_def;
  163
  164   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
  165      was inserted in BB.  Unlike RECIP_DEF, it is only set on the
           occurrence in whose block the statement was actually inserted.  */
  166   gimple recip_def_stmt;
  167
  168   /* Pointer to a list of "struct occurrence"s for blocks dominated
  169      by BB.  */
  170   struct occurrence *children;
  171
  172   /* Pointer to the next "struct occurrence"s in the list of blocks
  173      sharing a common dominator.  */
  174   struct occurrence *next;
  175
  176   /* The number of divisions that are in BB before compute_merit.  The
  177      number of divisions that are in BB or post-dominate it after
  178      compute_merit.  */
  179   int num_divisions;
  180
  181   /* True if the basic block has a division, false if it is a common
  182      dominator for basic blocks that do.  If it is false and trapping
  183      math is active, BB is not a candidate for inserting a reciprocal.  */
  184   bool bb_has_division;
  185 };
 186
  187 static struct
  188 {
        /* Counters for the reciprocal pass.  They are zeroed at the start of
           pass_cse_reciprocals::execute and reported through
           statistics_counter_event at its end.  */
  189   /* Number of 1.0/X ops inserted.  Incremented by insert_reciprocals
           for every reciprocal statement it adds.  */
  190   int rdivs_inserted;
  191
  192   /* Number of 1.0/FUNC ops inserted.  Incremented each time a builtin
           call is redirected to the reciprocal builtin supplied by the
           target.  */
  193   int rfuncs_inserted;
  194 } reciprocal_stats;
 195
  196 static struct
  197 {
        /* Counter for the sincos CSE code.  */
  198   /* Number of cexpi calls inserted.  Incremented by
           execute_cse_sincos_1 once per call it inserts.  */
  199   int inserted;
  200 } sincos_stats;
 201
  202 static struct
  203 {
        /* One set of counters for nops (NOP_STATS) and one for bswaps
           (BSWAP_STATS), broken down by operand width.
           NOTE(review): neither variable is referenced in the part of the
           file visible here; presumably they are updated by the bswap
           detection code further down -- confirm.  */
  204   /* Number of hand-written 16-bit nop / bswaps found.  */
  205   int found_16bit;
  206
  207   /* Number of hand-written 32-bit nop / bswaps found.  */
  208   int found_32bit;
  209
  210   /* Number of hand-written 64-bit nop / bswaps found.  */
  211   int found_64bit;
  212 } nop_stats, bswap_stats;
 213
  214 static struct
  215 {
        /* Counters for inserted multiply-related operations.
           NOTE(review): not referenced in the part of the file visible
           here; presumably updated by a widening-multiply pass further
           down -- confirm.  */
  216   /* Number of widening multiplication ops inserted.  */
  217   int widen_mults_inserted;
  218
  219   /* Number of integer multiply-and-accumulate ops inserted.  */
  220   int maccs_inserted;
  221
  222   /* Number of fp fused multiply-add ops inserted.  */
  223   int fmas_inserted;
  224 } widen_mul_stats;
 225
  226 /* Head of the list of top-level "struct occurrence"s, i.e. those for
  227    the highest interesting blocks in the dominator tree; the entries are
         chained through their NEXT field.  The list is populated through
         register_division_in and is freed and reset to NULL at the end of
         execute_cse_reciprocals_1.  */
  228 static struct occurrence *occ_head;
  229
  230 /* Allocation pool for getting instances of "struct occurrence".  It is
         created and released by pass_cse_reciprocals::execute.  */
  231 static alloc_pool occ_pool;
 232
 233
 234
  235 /* Allocate and return a new struct occurrence for basic block BB, and
  236    whose children list is headed by CHILDREN.  The structure comes from
         OCC_POOL, and a pointer to it is also stored in BB->aux so that it can
         be found again from the block.  All fields other than BB and CHILDREN
         start out zeroed.  */
  237 static struct occurrence *
  238 occ_new (basic_block bb, struct occurrence *children)
  239 {
  240   struct occurrence *occ;
  241
        /* Publish the new occurrence in BB->aux at the same time as it is
           allocated.  */
  242   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
        /* Clear everything, so NEXT, RECIP_DEF, RECIP_DEF_STMT, NUM_DIVISIONS
           and BB_HAS_DIVISION start out as NULL / 0 / false.  */
  243   memset (occ, 0, sizeof (struct occurrence));
  244
  245   occ->bb = bb;
  246   occ->children = children;
  247   return occ;
  248 }
 249
 250
  251 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
  252    list of "struct occurrence"s, one per basic block, having IDOM as
  253    their common dominator.
  254
  255    We try to insert NEW_OCC as deep as possible in the tree, and we also
  256    insert any other block that is a common dominator for NEW_OCC's block
  257    and one block already in the tree.

         Each entry OCC of the list is compared with NEW_OCC through the
         nearest common dominator DOM of their two blocks:
           - DOM is NEW_OCC's block: OCC is moved under NEW_OCC;
           - DOM is OCC's block: NEW_OCC is inserted below OCC instead, and
             the function returns;
           - DOM is another block different from IDOM: a new occurrence for
             DOM is created with NEW_OCC and OCC as children, and it takes
             NEW_OCC's place for the rest of the scan;
           - DOM is IDOM: OCC is left where it is.
         Whatever NEW_OCC designates at the end of the scan is linked at the
         head of the list.  */
  258
  259 static void
  260 insert_bb (struct occurrence *new_occ, basic_block idom,
  261            struct occurrence **p_head)
  262 {
  263   struct occurrence *occ, **p_occ;
  264
        /* P_OCC always points at the link through which OCC was reached, so
           that OCC can be unlinked by storing into *P_OCC.  It is advanced
           only in the last branch below; the other branches either unlink
           OCC (which makes *P_OCC the next entry) or return.  */
  265   for (p_occ = p_head; (occ = *p_occ) != NULL; )
  266     {
  267       basic_block bb = new_occ->bb, occ_bb = occ->bb;
  268       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
  269       if (dom == bb)
  270         {
  271           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
  272              from its list.  */
  273           *p_occ = occ->next;
  274           occ->next = new_occ->children;
  275           new_occ->children = occ;
  276
  277           /* Try the next block (it may as well be dominated by BB).  */
  278         }
  279
  280       else if (dom == occ_bb)
  281         {
  282           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
  283           insert_bb (new_occ, dom, &occ->children);
  284           return;
  285         }
  286
  287       else if (dom != idom)
  288         {
                /* occ_new is about to overwrite DOM->aux, so DOM must not
                   have an occurrence of its own yet.  */
  289           gcc_assert (!dom->aux);
  290
  291           /* There is a dominator between IDOM and BB, add it and make
  292              two children out of NEW_OCC and OCC.  First, remove OCC from
  293              its list.  */
  294           *p_occ = occ->next;
  295           new_occ->next = occ;
  296           occ->next = NULL;
  297
  298           /* None of the previous blocks has DOM as a dominator: if we tail
  299              recursed, we would reexamine them uselessly. Just switch BB with
  300              DOM, and go on looking for blocks dominated by DOM.  */
  301           new_occ = occ_new (dom, new_occ);
  302         }
  303
  304       else
  305         {
  306           /* Nothing special, go on with the next element.  */
  307           p_occ = &occ->next;
  308         }
  309     }
  310
  311   /* No entry of the list dominates NEW_OCC's block, and every entry it
           dominates has been moved under it above: link NEW_OCC at the head of
           the list P_HEAD points to.  */
  312   new_occ->next = *p_head;
  313   *p_head = new_occ;
  314 }
 315
  316 /* Register that we found a division in BB.  If BB has no occurrence yet
         (BB->aux is NULL), one is created and inserted into the tree rooted at
         OCC_HEAD, searching from the function's entry block.  The occurrence is
         then marked as containing a division and its count is bumped.  */
  317
  318 static inline void
  319 register_division_in (basic_block bb)
  320 {
  321   struct occurrence *occ;
  322
  323   occ = (struct occurrence *) bb->aux;
  324   if (!occ)
  325     {
  326       occ = occ_new (bb, NULL);
  327       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
  328     }
  329
        /* Set the flag even when the occurrence already existed: insert_bb may
           have created it earlier as a mere common dominator, in which case
           the flag was still false.  */
  330   occ->bb_has_division = true;
  331   occ->num_divisions++;
  332 }
 333
 334
  335 /* Compute the number of divisions that postdominate each block in OCC and
  336    its children.  On return, OCC->num_divisions has been increased by the
         (already accumulated) counts of those direct children whose block
         post-dominates the reference block derived from OCC's block; the same
         has been done recursively for every descendant that has children.  */
  337
  338 static void
  339 compute_merit (struct occurrence *occ)
  340 {
  341   struct occurrence *occ_child;
  342   basic_block dom = occ->bb;
  343
  344   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
  345     {
  346       basic_block bb;
            /* Process the child's own subtree first, so that its count is
               final before it is added to OCC's.  */
  347       if (occ_child->children)
  348         compute_merit (occ_child);
  349
            /* With exceptions enabled the post-dominance test starts from
               single_noncomplex_succ (DOM) instead of DOM itself.
               NOTE(review): presumably this steps over an edge that exists
               only because a statement in DOM may throw -- confirm against
               the definition of single_noncomplex_succ.  */
  350       if (flag_exceptions)
  351         bb = single_noncomplex_succ (dom);
  352       else
  353         bb = dom;
  354
  355       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
  356         occ->num_divisions += occ_child->num_divisions;
  357     }
  358 }
 359
 360
  361 /* Return whether USE_STMT is a floating-point division by DEF, that is
         an assignment whose code is RDIV_EXPR, whose second operand is DEF and
         whose first operand is not DEF.  */
  362 static inline bool
  363 is_division_by (gimple use_stmt, tree def)
  364 {
  365   return is_gimple_assign (use_stmt)
  366          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
  367          && gimple_assign_rhs2 (use_stmt) == def
  368          /* Do not recognize x / x as valid division, as we are getting
  369             confused later by replacing all immediate uses x in such
  370             a stmt.  */
  371          && gimple_assign_rhs1 (use_stmt) != def;
  372 }
 373
  374 /* Walk the subset of the dominator tree rooted at OCC, setting the
  375    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
  376    the given basic block.  The field may be left NULL, of course,
  377    if it is not possible or profitable to do the optimization.
  378
  379    DEF_GSI is an iterator pointing at the statement defining DEF, or
  380    NULL when the caller has no such statement to offer.
  381    If RECIP_DEF is set, a dominator already has a computation that can
         be used.  THRESHOLD is the minimum value OCC->num_divisions must reach
         for a reciprocal to be inserted in OCC's block.  */
  382
  383 static void
  384 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
  385                     tree def, tree recip_def, int threshold)
  386 {
  387   tree type;
  388   gassign *new_stmt;
  389   gimple_stmt_iterator gsi;
  390   struct occurrence *occ_child;
  391
        /* Insert a new reciprocal here only if none is inherited, the block
           either already divides or trapping math is off, and enough
           divisions would benefit.  */
  392   if (!recip_def
  393       && (occ->bb_has_division || !flag_trapping_math)
  394       && occ->num_divisions >= threshold)
  395     {
  396       /* Make a variable with the replacement and substitute it.  */
  397       type = TREE_TYPE (def);
  398       recip_def = create_tmp_reg (type, "reciptmp");
  399       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
  400                                       build_one_cst (type), def);
  401
  402       if (occ->bb_has_division)
  403         {
  404           /* Case 1: insert before an existing division.  The scan stops
                   at the first statement after the labels for which
                   is_division_by holds.  */
  405           gsi = gsi_after_labels (occ->bb);
  406           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
  407             gsi_next (&gsi);
  408
  409           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
  410         }
  411       else if (def_gsi && occ->bb == def_gsi->bb)
  412         {
  413           /* Case 2: insert right after the definition.  Note that this will
  414              never happen if the definition statement can throw, because in
  415              that case the sole successor of the statement's basic block will
  416              dominate all the uses as well.  */
  417           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
  418         }
  419       else
  420         {
  421           /* Case 3: insert in a basic block not containing defs/uses.  */
  422           gsi = gsi_after_labels (occ->bb);
  423           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
  424         }
  425
  426       reciprocal_stats.rdivs_inserted++;
  427
            /* Remember the inserted statement so that replace_reciprocal can
               avoid rewriting it: it is itself a division by DEF.  */
  428       occ->recip_def_stmt = new_stmt;
  429     }
  430
        /* Record the reciprocal usable in this block (possibly NULL, possibly
           inherited) and pass it down to the dominated occurrences.  */
  431   occ->recip_def = recip_def;
  432   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
  433     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
  434 }
 435
 436
  437 /* Replace the division at USE_P with a multiplication by the reciprocal, if
  438    possible.  That is the case when the statement's block is optimized for
         speed, the occurrence attached to that block has a reciprocal
         available, and the statement is not the reciprocal computation itself.
         The block of USE_P's statement must have an occurrence in its aux
         field.  */
  439
  440 static inline void
  441 replace_reciprocal (use_operand_p use_p)
  442 {
  443   gimple use_stmt = USE_STMT (use_p);
  444   basic_block bb = gimple_bb (use_stmt);
  445   struct occurrence *occ = (struct occurrence *) bb->aux;
  446
        /* The statement inserted by insert_reciprocals is 1.0 / DEF and thus
           a division by DEF as well; it is excluded through
           RECIP_DEF_STMT.  */
  447   if (optimize_bb_for_speed_p (bb)
  448       && occ->recip_def && use_stmt != occ->recip_def_stmt)
  449     {
  450       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
            /* Turn X / DEF into X * RECIP_DEF: change the operation, then
               redirect the use of DEF to the reciprocal.  */
  451       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
  452       SET_USE (use_p, occ->recip_def);
  453       fold_stmt_inplace (&gsi);
  454       update_stmt (use_stmt);
  455     }
  456 }
 457
 458
  459 /* Free OCC and return one more "struct occurrence" to be freed.  OCC is
         returned to OCC_POOL and the aux field of its block is cleared.  When
         OCC has no children the value returned is its next sibling (possibly
         NULL).  Otherwise all the following siblings are freed here, together
         with whatever freeing them hands back, and the head of OCC's children
         list is returned.  The caller is expected to keep calling free_bb on
         the result until it is NULL.  */
  460
  461 static struct occurrence *
  462 free_bb (struct occurrence *occ)
  463 {
  464   struct occurrence *child, *next;
  465
  466   /* First get the two pointers hanging off OCC.  */
  467   next = occ->next;
  468   child = occ->children;
  469   occ->bb->aux = NULL;
  470   pool_free (occ_pool, occ);
  471
  472   /* Now ensure that we don't recurse unless it is necessary.  */
  473   if (!child)
  474     return next;
  475   else
  476     {
            /* Only one pointer can be returned, so the sibling chain is
               consumed here and the children are left to the caller.  */
  477       while (next)
  478         next = free_bb (next);
  479
  480       return child;
  481     }
  482 }
 483
 484
  485 /* Look for floating-point divisions among DEF's uses, and try to
  486    replace them by multiplications with the reciprocal.  Add
  487    as many statements computing the reciprocal as needed.
  488
  489    DEF must be a GIMPLE register of a floating-point type.  DEF_GSI
         points at the statement defining DEF, or is NULL when the caller has
         none to pass (it is forwarded unchanged to insert_reciprocals).

         On return all occurrences built for DEF have been freed and OCC_HEAD
         is NULL again.  */
  490
  491 static void
  492 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
  493 {
  494   use_operand_p use_p;
  495   imm_use_iterator use_iter;
  496   struct occurrence *occ;
  497   int count = 0, threshold;
  498
  499   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
  500
        /* Phase 1: count the divisions by DEF and build the occurrence tree
           for the blocks that contain them.  Nothing is modified yet.  */
  501   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
  502     {
  503       gimple use_stmt = USE_STMT (use_p);
  504       if (is_division_by (use_stmt, def))
  505         {
  506           register_division_in (gimple_bb (use_stmt));
  507           count++;
  508         }
  509     }
  510
  511   /* Do the expensive part only if we can hope to optimize something.
           The minimum number of divisions is asked from the target for the
           machine mode of DEF's type.  */
  512   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
  513   if (count >= threshold)
  514     {
  515       gimple use_stmt;
            /* Phase 2: for every top-level occurrence, accumulate the
               post-dominating division counts and insert reciprocals.  */
  516       for (occ = occ_head; occ; occ = occ->next)
  517         {
  518           compute_merit (occ);
  519           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
  520         }
  521
            /* Phase 3: rewrite the divisions.  replace_reciprocal redirects
               uses of DEF (SET_USE), so this walk uses the statement-based
               iterator rather than the FAST one used for counting above.  */
  522       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
  523         {
  524           if (is_division_by (use_stmt, def))
  525             {
  526               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
  527                 replace_reciprocal (use_p);
  528             }
  529         }
  530     }
  531
        /* Release the occurrence tree whether or not anything was replaced;
           free_bb hands back the next structure to free until none is
           left.  */
  532   for (occ = occ_head; occ; )
  533     occ = free_bb (occ);
  534
  535   occ_head = NULL;
  536 }
 537
 538 /* Go through all the floating-point SSA_NAMEs, and call
 539    execute_cse_reciprocals_1 on each of them.  */
 540 namespace {
 541
  542 const pass_data pass_data_cse_reciprocals =
  543 {
        /* Description of the "recip" pass; it is handed to the
           gimple_opt_pass base class by the pass_cse_reciprocals
           constructor.  The pass requires SSA form and requests
           TODO_update_ssa when it finishes.  */
  544   GIMPLE_PASS, /* type */
  545   "recip", /* name */
  546   OPTGROUP_NONE, /* optinfo_flags */
  547   TV_NONE, /* tv_id */
  548   PROP_ssa, /* properties_required */
  549   0, /* properties_provided */
  550   0, /* properties_destroyed */
  551   0, /* todo_flags_start */
  552   TODO_update_ssa, /* todo_flags_finish */
  553 };
 554
  555 class pass_cse_reciprocals : public gimple_opt_pass
  556 {
        /* Pass object for the "recip" pass described by
           pass_data_cse_reciprocals.  */
  557 public:
  558   pass_cse_reciprocals (gcc::context *ctxt)
  559     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
  560   {}
  561
  562   /* opt_pass methods: */
        /* The pass runs only when OPTIMIZE is nonzero and
           flag_reciprocal_math is set.  */
  563   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
        /* Defined out of line, after the class.  */
  564   virtual unsigned int execute (function *);
  565
  566 }; // class pass_cse_reciprocals
 567
  568 unsigned int
  569 pass_cse_reciprocals::execute (function *fun)
  570 {
        /* Run the reciprocal pass on FUN.  For every floating-point SSA name
           that is the default definition of a parameter, the result of a PHI
           node, or the single definition of a statement, try to CSE the
           divisions by it (execute_cse_reciprocals_1).  In addition, in
           blocks that are not optimized for size, rewrite a / func (b) into
           a * rfunc (b) when the target supplies a reciprocal builtin for
           FUNC.  Always returns 0.  */
  571   basic_block bb;
  572   tree arg;
  573
        /* The pool and both kinds of dominance information set up here are
           released again at the end of the function.  */
  574   occ_pool = create_alloc_pool ("dominators for recip",
  575                                 sizeof (struct occurrence),
  576                                 n_basic_blocks_for_fn (fun) / 3 + 1);
  577
  578   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
  579   calculate_dominance_info (CDI_DOMINATORS);
  580   calculate_dominance_info (CDI_POST_DOMINATORS);
  581
        /* The aux field of each block is used to find its occurrence, so
           all of them must start out NULL.  */
  582 #ifdef ENABLE_CHECKING
  583   FOR_EACH_BB_FN (bb, fun)
  584     gcc_assert (!bb->aux);
  585 #endif
  586
        /* Parameters have no defining statement, hence the NULL iterator.  */
  587   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
  588     if (FLOAT_TYPE_P (TREE_TYPE (arg))
  589         && is_gimple_reg (arg))
  590       {
  591         tree name = ssa_default_def (fun, arg);
  592         if (name)
  593           execute_cse_reciprocals_1 (NULL, name);
  594       }
  595
  596   FOR_EACH_BB_FN (bb, fun)
  597     {
  598       tree def;
  599
            /* PHI results are handled with a NULL iterator as well.  */
  600       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
  601            gsi_next (&gsi))
  602         {
  603           gphi *phi = gsi.phi ();
  604           def = PHI_RESULT (phi);
  605           if (! virtual_operand_p (def)
  606               && FLOAT_TYPE_P (TREE_TYPE (def)))
  607             execute_cse_reciprocals_1 (NULL, def);
  608         }
  609
            /* Ordinary statements: the iterator at the defining statement is
               passed along so that a reciprocal can be inserted right after
               the definition.  */
  610       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
  611            gsi_next (&gsi))
  612         {
  613           gimple stmt = gsi_stmt (gsi);
  614
  615           if (gimple_has_lhs (stmt)
  616               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
  617               && FLOAT_TYPE_P (TREE_TYPE (def))
  618               && TREE_CODE (def) == SSA_NAME)
  619             execute_cse_reciprocals_1 (&gsi, def);
  620         }
  621
            /* The function-reciprocal rewrite below is skipped for blocks
               optimized for size.  */
  622       if (optimize_bb_for_size_p (bb))
  623         continue;
  624
  625       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
  626       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
  627            gsi_next (&gsi))
  628         {
  629           gimple stmt = gsi_stmt (gsi);
  630           tree fndecl;
  631
  632           if (is_gimple_assign (stmt)
  633               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
  634             {
  635               tree arg1 = gimple_assign_rhs2 (stmt);
  636               gimple stmt1;
  637
  638               if (TREE_CODE (arg1) != SSA_NAME)
  639                 continue;
  640
  641               stmt1 = SSA_NAME_DEF_STMT (arg1);
  642
                    /* The divisor must be the result of a call to a normal
                       or machine-specific builtin.  */
  643               if (is_gimple_call (stmt1)
  644                   && gimple_call_lhs (stmt1)
  645                   && (fndecl = gimple_call_fndecl (stmt1))
  646                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
  647                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
  648                 {
  649                   enum built_in_function code;
  650                   bool md_code, fail;
  651                   imm_use_iterator ui;
  652                   use_operand_p use_p;
  653
  654                   code = DECL_FUNCTION_CODE (fndecl);
  655                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
  656
                        /* Ask the target for the reciprocal counterpart of
                           the builtin; give up on this statement if there is
                           none.  */
  657                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
  658                   if (!fndecl)
  659                     continue;
  660
  661                   /* Check that all uses of the SSA name are divisions,
  662                      otherwise replacing the defining statement will do
  663                      the wrong thing.  Debug statements are ignored, and
                           x / x does not qualify.  */
  664                   fail = false;
  665                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
  666                     {
  667                       gimple stmt2 = USE_STMT (use_p);
  668                       if (is_gimple_debug (stmt2))
  669                         continue;
  670                       if (!is_gimple_assign (stmt2)
  671                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
  672                           || gimple_assign_rhs1 (stmt2) == arg1
  673                           || gimple_assign_rhs2 (stmt2) != arg1)
  674                         {
  675                           fail = true;
  676                           break;
  677                         }
  678                     }
  679                   if (fail)
  680                     continue;
  681
                        /* Make the call compute the reciprocal function
                           instead, keeping ARG1 as its result.  */
  682                   gimple_replace_ssa_lhs (stmt1, arg1);
  683                   gimple_call_set_fndecl (stmt1, fndecl);
  684                   update_stmt (stmt1);
  685                   reciprocal_stats.rfuncs_inserted++;
  686
                        /* Turn every division by ARG1 into a multiplication.
                           Note that this loop reuses the outer STMT variable
                           and declares a GSI that shadows the iterator of the
                           enclosing statement loop.
                           NOTE(review): debug statements were skipped by the
                           check above; presumably gimple_replace_ssa_lhs has
                           already detached them from ARG1, so only the
                           divisions are visited here -- confirm.  */
  687                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
  688                     {
  689                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
  690                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
  691                       fold_stmt_inplace (&gsi);
  692                       update_stmt (stmt);
  693                     }
  694                 }
  695             }
  696         }
  697     }
  698
  699   statistics_counter_event (fun, "reciprocal divs inserted",
  700                             reciprocal_stats.rdivs_inserted);
  701   statistics_counter_event (fun, "reciprocal functions inserted",
  702                             reciprocal_stats.rfuncs_inserted);
  703
  704   free_dominance_info (CDI_DOMINATORS);
  705   free_dominance_info (CDI_POST_DOMINATORS);
  706   free_alloc_pool (occ_pool);
  707   return 0;
  708 }
 709
 710 } // anon namespace
 711
  712 gimple_opt_pass *
  713 make_pass_cse_reciprocals (gcc::context *ctxt)
  714 {
        /* Return a newly allocated pass_cse_reciprocals object constructed
           with CTXT.  */
  715   return new pass_cse_reciprocals (ctxt);
  716 }
 717
  718 /* Records an occurrence at statement USE_STMT in the vector of
  719    statements STMTS if its basic block is *TOP_BB, is dominated by
  720    *TOP_BB or dominates it, or if *TOP_BB is not yet initialized (NULL).
  721    Returns true if the occurrence was pushed on the vector, and false,
  722    leaving STMTS and *TOP_BB untouched, otherwise.  Adjusts *TOP_BB to be
         the basic block dominating all statements in the vector.  */
  723
  724 static bool
  725 maybe_record_sincos (vec<gimple> *stmts,
  726                      basic_block *top_bb, gimple use_stmt)
  727 {
  728   basic_block use_bb = gimple_bb (use_stmt);
        /* USE_STMT is in *TOP_BB or below it: *TOP_BB stays as it is.  */
  729   if (*top_bb
  730       && (*top_bb == use_bb
  731           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
  732     stmts->safe_push (use_stmt);
        /* First statement recorded, or USE_STMT's block dominates *TOP_BB:
           that block becomes the new *TOP_BB.  */
  733   else if (!*top_bb
  734            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
  735     {
  736       stmts->safe_push (use_stmt);
  737       *top_bb = use_bb;
  738     }
  739   else
  740     return false;
  741
  742   return true;
  743 }
 744
  745 /* Look for sin, cos and cexpi calls with the same argument NAME and
  746    create a single call to cexpi CSEing the result in this case.
  747    We first walk over all immediate uses of the argument collecting
  748    statements that we can CSE in a vector and in a second pass replace
  749    the statement rhs with a REALPART or IMAGPART expression on the
  750    result of the cexpi call we insert before the use statement that
  751    dominates all other candidates.

         Nothing is done unless at least two of the three kinds of call were
         recorded, and unless a cexpi builtin exists for NAME's type.

         Returns true if purging dead EH edges after one of the replacements
         changed the CFG, false otherwise (including when nothing was
         done).  */
  752
  753 static bool
  754 execute_cse_sincos_1 (tree name)
  755 {
  756   gimple_stmt_iterator gsi;
  757   imm_use_iterator use_iter;
  758   tree fndecl, res, type;
  759   gimple def_stmt, use_stmt, stmt;
  760   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
  761   auto_vec<gimple> stmts;
  762   basic_block top_bb = NULL;
  763   int i;
  764   bool cfg_changed = false;
  765
  766   type = TREE_TYPE (name);
        /* Collect the calls to normal builtins that use NAME and whose
           result is used.  Each SEEN_* flag becomes 1 once a call of that
           kind has been recorded by maybe_record_sincos.  */
  767   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
  768     {
  769       if (gimple_code (use_stmt) != GIMPLE_CALL
  770           || !gimple_call_lhs (use_stmt)
  771           || !(fndecl = gimple_call_fndecl (use_stmt))
  772           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
  773         continue;
  774
  775       switch (DECL_FUNCTION_CODE (fndecl))
  776         {
  777         CASE_FLT_FN (BUILT_IN_COS):
  778           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
  779           break;
  780
  781         CASE_FLT_FN (BUILT_IN_SIN):
  782           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
  783           break;
  784
  785         CASE_FLT_FN (BUILT_IN_CEXPI):
  786           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
  787           break;
  788
  789         default:;
  790         }
  791     }
  792
        /* Each flag is 0 or 1, so the sum counts the distinct kinds of call
           recorded; several calls of a single kind are not enough.  */
  793   if (seen_cos + seen_sin + seen_cexpi <= 1)
  794     return false;
  795
  796   /* Simply insert cexpi at the beginning of top_bb but not earlier than
  797      the name def statement.  */
  798   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
  799   if (!fndecl)
  800     return false;
  801   stmt = gimple_build_call (fndecl, 1, name);
  802   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
  803   gimple_call_set_lhs (stmt, res);
  804
  805   def_stmt = SSA_NAME_DEF_STMT (name);
        /* When NAME is defined by an ordinary statement of TOP_BB, the call
           goes right after that statement; otherwise it goes after the
           labels of TOP_BB.  */
  806   if (!SSA_NAME_IS_DEFAULT_DEF (name)
  807       && gimple_code (def_stmt) != GIMPLE_PHI
  808       && gimple_bb (def_stmt) == top_bb)
  809     {
  810       gsi = gsi_for_stmt (def_stmt);
  811       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
  812     }
  813   else
  814     {
  815       gsi = gsi_after_labels (top_bb);
  816       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
  817     }
  818   sincos_stats.inserted++;
  819
  820   /* And adjust the recorded old call sites.  */
  821   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
  822     {
  823       tree rhs = NULL;
  824       fndecl = gimple_call_fndecl (use_stmt);
  825
            /* cos takes the real part of the cexpi result, sin the imaginary
               part, and an old cexpi call the whole result.  */
  826       switch (DECL_FUNCTION_CODE (fndecl))
  827         {
  828         CASE_FLT_FN (BUILT_IN_COS):
  829           rhs = fold_build1 (REALPART_EXPR, type, res);
  830           break;
  831
  832         CASE_FLT_FN (BUILT_IN_SIN):
  833           rhs = fold_build1 (IMAGPART_EXPR, type, res);
  834           break;
  835
  836         CASE_FLT_FN (BUILT_IN_CEXPI):
  837           rhs = res;
  838           break;
  839
                /* Only the three kinds of call above are ever recorded in
                   STMTS.  */
  840         default:;
  841           gcc_unreachable ();
  842         }
  843
  844         /* Replace call with a copy.  */
  845         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
  846
  847         gsi = gsi_for_stmt (use_stmt);
  848         gsi_replace (&gsi, stmt, true);
              /* Report a CFG change when purging dead EH edges of the block
                 removed something.  */
  849         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
  850           cfg_changed = true;
  851     }
  852
  853   return cfg_changed;
  854 }
 855
 856 /* To evaluate powi(x,n), the floating point value x raised to the
 857    constant integer exponent n, we use a hybrid algorithm that
 858    combines the "window method" with look-up tables.  For an
 859    introduction to exponentiation algorithms and "addition chains",
 860    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 861    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 862    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 863    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 864
  865 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
  866    multiplications to inline before calling the system library's pow
  867    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
  868    so this default never requires calling pow, powf or powl.  The #ifndef
         guard keeps any definition of the macro made before this point.  */
  869
  870 #ifndef POWI_MAX_MULTS
  871 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
  872 #endif
  873
  874 /* The size of the "optimal power tree" lookup table.  All
  875    exponents less than this value are simply looked up in the
  876    powi_table below.  This threshold is also used to size the
  877    cache of pseudo registers that hold intermediate results.  */
  878 #define POWI_TABLE_SIZE 256
  879
  880 /* The size, in bits of the window, used in the "window method"
  881    exponentiation algorithm.  This is equivalent to a radix of
  882    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method", i.e. a
         radix of 8 for the value 3 defined here.  */
  883 #define POWI_WINDOW_SIZE 3
 884
 885 /* The following table is an efficient representation of an
 886    "optimal power tree".  For each value, i, the corresponding
 887    value, j, in the table states that an optimal evaluation
 888    sequence for calculating pow(x,i) can be found by evaluating
 889    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 890    100 integers is given in Knuth's "Seminumerical algorithms".  */
 891
 892 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 893   {
 894       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 895       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 896       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 897      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 898      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 899      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 900      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 901      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 902      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 903      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 904      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 905      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 906      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 907      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 908      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 909      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 910      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 911      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 912      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 913      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 914      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 915      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 916      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 917      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 918      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 919     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 920     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 921     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 922     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 923     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 924     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 925     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 926   };
 927
 928
 929 /* Return the number of multiplications required to calculate
 930    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 931    subroutine of powi_cost.  CACHE is an array indexed by exponent
 932    that flags those already calculated; entries visited here get set.  */
 933
 934 static int
 935 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 936 {
 937   /* If we've already calculated this exponent, then this evaluation
 938      doesn't require any additional multiplications.  */
 939   if (cache[n])
 940     return 0;
 941   /* N costs one multiply plus whatever its two table factors cost.  */
 942   cache[n] = true;
 943   return powi_lookup_cost (n - powi_table[n], cache)
 944          + powi_lookup_cost (powi_table[n], cache) + 1;
 945 }
 946
 947 /* Return the number of multiplications required to calculate
 948    powi(x,n) for an arbitrary x, given the exponent N.  This
 949    function needs to be kept in sync with powi_as_mults below.  */
 950
 951 static int
 952 powi_cost (HOST_WIDE_INT n)
 953 {
 954   bool cache[POWI_TABLE_SIZE];
 955   unsigned HOST_WIDE_INT digit;
 956   unsigned HOST_WIDE_INT val;
 957   int result;
 958
 959   if (n == 0)
 960     return 0;
 961
 962   /* Ignore the reciprocal when calculating the cost.  */
 963   val = (n < 0) ? -n : n;
 964
 965   /* Initialize the exponent cache.  */
 966   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 967   cache[1] = true;
 968
 969   result = 0;
 970
 971   while (val >= POWI_TABLE_SIZE)
 972     {
 973       if (val & 1)
 974         {
 975           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 976           result += powi_lookup_cost (digit, cache)
 977                     + POWI_WINDOW_SIZE + 1;
 978           val >>= POWI_WINDOW_SIZE;
 979         }
 980       else
 981         {
 982           val >>= 1;
 983           result++;
 984         }
 985     }
 986
 987   return result + powi_lookup_cost (val, cache);
 988 }
 989
 990 /* Recursive subroutine of powi_as_mults.  Return a tree of type TYPE
 991    for CACHE[1]**N, using CACHE to reuse already calculated exponents;
 992    the multiplies needed are inserted before GSI with location LOC.  */
 993
 994 static tree
 995 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 996                  HOST_WIDE_INT n, tree *cache)
 997 {
 998   tree op0, op1, ssa_target;
 999   unsigned HOST_WIDE_INT digit;
1000   gassign *mult_stmt;
1001
1002   if (n < POWI_TABLE_SIZE && cache[n])
1003     return cache[n];
1004   /* Create the result name first; small N caches it before recursing.  */
1005   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1006   /* Pick the two factors: table split, low window digit, or square.  */
1007   if (n < POWI_TABLE_SIZE)
1008     {
1009       cache[n] = ssa_target;
1010       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1011       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1012     }
1013   else if (n & 1)
1014     {
1015       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1016       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1017       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1018     }
1019   else
1020     {
1021       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1022       op1 = op0;
1023     }
1024   /* Emit ssa_target = op0 * op1 ahead of GSI.  */
1025   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1026   gimple_set_location (mult_stmt, loc);
1027   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1028
1029   return ssa_target;
1030 }
1031
1032 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself,
1033    inserted before GSI.  Keep in sync with powi_cost above.  */
1034 /* N must not be the most negative HOST_WIDE_INT, as -N is computed.  */
1035 static tree
1036 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1037                tree arg0, HOST_WIDE_INT n)
1038 {
1039   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1040   gassign *div_stmt;
1041   tree target;
1042   /* ARG0**0 is the constant one; nothing needs to be emitted.  */
1043   if (n == 0)
1044     return build_real (type, dconst1);
1045   /* Seed the cache: exponent 1 is ARG0 itself.  */
1046   memset (cache, 0,  sizeof (cache));
1047   cache[1] = arg0;
1048
1049   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1050   if (n >= 0)
1051     return result;
1052
1053   /* If the original exponent was negative, reciprocate the result.  */
1054   target = make_temp_ssa_name (type, NULL, "powmult");
1055   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1056                                   build_real (type, dconst1), result);
1057   gimple_set_location (div_stmt, loc);
1058   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1059
1060   return target;
1061 }
1062
1063 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1064    location info LOC.  If the arguments are appropriate, create an
1065    equivalent sequence of statements prior to GSI using an optimal
1066    number of multiplications, and return an expession holding the
1067    result.  */
1068
1069 static tree
1070 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1071                             tree arg0, HOST_WIDE_INT n)
1072 {
1073   /* Avoid largest negative number.  */
1074   if (n != -n
1075       && ((n >= -1 && n <= 2)
1076           || (optimize_function_for_speed_p (cfun)
1077               && powi_cost (n) <= POWI_MAX_MULTS)))
1078     return powi_as_mults (gsi, loc, arg0, n);
1079
1080   return NULL_TREE;
1081 }
1082
1083 /* Build a gimple call statement that calls FN with the single argument
1084    ARG and location LOC.  Set the lhs of the call to a fresh SSA name
1085    with the same type as ARG.  Insert the statement prior to GSI's
1086    current position, and return the fresh SSA name.  */
1087
1088 static tree
1089 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1090                        tree fn, tree arg)
1091 {
1092   gcall *call_stmt;
1093   tree ssa_target;
1094   /* NOTE: the lhs is given ARG's type, not FN's declared return type.  */
1095   call_stmt = gimple_build_call (fn, 1, arg);
1096   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1097   gimple_set_lhs (call_stmt, ssa_target);
1098   gimple_set_location (call_stmt, loc);
1099   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1100
1101   return ssa_target;
1102 }
1103
1104 /* Build a gimple binary operation with the given CODE and arguments
1105    ARG0, ARG1, assigning the result to a new SSA name that is based on
1106    NAME and has ARG0's type.  Give the statement location LOC, insert it
1107    prior to GSI's current position, and return the fresh SSA name.  */
1108
1109 static tree
1110 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1111                         const char *name, enum tree_code code,
1112                         tree arg0, tree arg1)
1113 {
1114   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1115   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1116   gimple_set_location (stmt, loc);
1117   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1118   return result;
1119 }
1120
1121 /* Build a gimple reference operation with the given CODE and argument
1122    ARG0, assigning the result to a new SSA name of TYPE based on NAME.
1123    Give the statement location LOC, insert it prior to GSI's current
1124    position, and return the fresh SSA name.  */
1125
1126 static inline tree
1127 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1128                       const char *name, enum tree_code code, tree arg0)
1129 {
1130   tree result = make_temp_ssa_name (type, NULL, name);
1131   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1132   gimple_set_location (stmt, loc);
1133   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1134   return result;
1135 }
1136
1137 /* Build a NOP_EXPR gimple assignment that casts VAL to TYPE, with location
1138    LOC; insert it prior to GSI and return the fresh SSA name of TYPE.  */
1139
1140 static tree
1141 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1142                        tree type, tree val)
1143 {
1144   tree result = make_ssa_name (type);
1145   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1146   gimple_set_location (stmt, loc);
1147   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1148   return result;
1149 }
1150
1151 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1152    with location info LOC.  If possible, create an equivalent and
1153    less expensive sequence of statements prior to GSI, and return an
1154    expression holding the result; otherwise return NULL_TREE.  */
1155
1156 static tree
1157 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1158                            tree arg0, tree arg1)
1159 {
1160   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1161   REAL_VALUE_TYPE c2, dconst3;
1162   HOST_WIDE_INT n;
1163   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1164   machine_mode mode;
1165   bool hw_sqrt_exists, c_is_int, c2_is_int;
1166
1167   /* If the exponent isn't a constant, there's nothing of interest
1168      to be done.  */
1169   if (TREE_CODE (arg1) != REAL_CST)
1170     return NULL_TREE;
1171
1172   /* If the exponent is equivalent to an integer, expand to an optimal
1173      multiplication sequence when profitable.  */
1174   c = TREE_REAL_CST (arg1);
1175   n = real_to_integer (&c);
1176   real_from_integer (&cint, VOIDmode, n, SIGNED);
1177   c_is_int = real_identical (&c, &cint);
1178   /* Exponents in [-1, 2] always qualify; others need the unsafe-math flag.  */
1179   if (c_is_int
1180       && ((n >= -1 && n <= 2)
1181           || (flag_unsafe_math_optimizations
1182               && optimize_bb_for_speed_p (gsi_bb (*gsi))
1183               && powi_cost (n) <= POWI_MAX_MULTS)))
1184     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1185
1186   /* Attempt various optimizations using sqrt and cbrt.  */
1187   type = TREE_TYPE (arg0);
1188   mode = TYPE_MODE (type);
1189   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1190
1191   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1192      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1193      sqrt(-0) = -0.  */
1194   if (sqrtfn
1195       && REAL_VALUES_EQUAL (c, dconsthalf)
1196       && !HONOR_SIGNED_ZEROS (mode))
1197     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1198
1199   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1200      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1201      so do this optimization even if -Os.  Don't do this optimization
1202      if we don't have a hardware sqrt insn.  */
1203   dconst1_4 = dconst1;
1204   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1205   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1206
1207   if (flag_unsafe_math_optimizations
1208       && sqrtfn
1209       && REAL_VALUES_EQUAL (c, dconst1_4)
1210       && hw_sqrt_exists)
1211     {
1212       /* sqrt(x)  */
1213       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1214
1215       /* sqrt(sqrt(x))  */
1216       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1217     }
1218
1219   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1220      optimizing for space.  Don't do this optimization if we don't have
1221      a hardware sqrt insn.  */
1222   real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1223   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1224
1225   if (flag_unsafe_math_optimizations
1226       && sqrtfn
1227       && optimize_function_for_speed_p (cfun)
1228       && REAL_VALUES_EQUAL (c, dconst3_4)
1229       && hw_sqrt_exists)
1230     {
1231       /* sqrt(x)  */
1232       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1233
1234       /* sqrt(sqrt(x))  */
1235       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1236
1237       /* sqrt(x) * sqrt(sqrt(x))  */
1238       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1239                                      sqrt_arg0, sqrt_sqrt);
1240     }
1241
1242   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1243      optimizations since 1./3. is not exactly representable.  If x
1244      is negative and finite, the correct value of pow(x,1./3.) is
1245      a NaN with the "invalid" exception raised, because the value
1246      of 1./3. actually has an even denominator.  The correct value
1247      of cbrt(x) is a negative real value.  */
1248   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1249   dconst1_3 = real_value_truncate (mode, dconst_third ());
1250
1251   if (flag_unsafe_math_optimizations
1252       && cbrtfn
1253       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1254       && REAL_VALUES_EQUAL (c, dconst1_3))
1255     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1256
1257   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1258      if we don't have a hardware sqrt insn.  */
1259   dconst1_6 = dconst1_3;
1260   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1261
1262   if (flag_unsafe_math_optimizations
1263       && sqrtfn
1264       && cbrtfn
1265       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1266       && optimize_function_for_speed_p (cfun)
1267       && hw_sqrt_exists
1268       && REAL_VALUES_EQUAL (c, dconst1_6))
1269     {
1270       /* sqrt(x)  */
1271       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1272
1273       /* cbrt(sqrt(x))  */
1274       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1275     }
1276
1277   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1278      and c not an integer, into
1279
1280        sqrt(x) * powi(x, n/2),                n > 0;
1281        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1282
1283      Do not calculate the powi factor when n/2 = 0.  */
1284   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1285   n = real_to_integer (&c2);
1286   real_from_integer (&cint, VOIDmode, n, SIGNED);
1287   c2_is_int = real_identical (&c2, &cint);
1288
1289   if (flag_unsafe_math_optimizations
1290       && sqrtfn
1291       && c2_is_int
1292       && !c_is_int
1293       && optimize_function_for_speed_p (cfun))
1294     {
1295       tree powi_x_ndiv2 = NULL_TREE;
1296
1297       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1298          possible or profitable, give up.  Skip the degenerate case when
1299          n is 1 or -1, where that powi factor is always 1.  */
1300       if (absu_hwi (n) != 1)
1301         {
1302           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1303                                                      abs_hwi (n / 2));
1304           if (!powi_x_ndiv2)
1305             return NULL_TREE;
1306         }
1307
1308       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1309          result of the optimal multiply sequence just calculated.  */
1310       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1311
1312       if (absu_hwi (n) == 1)
1313         result = sqrt_arg0;
1314       else
1315         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1316                                          sqrt_arg0, powi_x_ndiv2);
1317
1318       /* If n is negative, reciprocate the result.  */
1319       if (n < 0)
1320         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1321                                          build_real (type, dconst1), result);
1322       return result;
1323     }
1324
1325   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1326
1327      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1328      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1329
1330      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1331      different from pow(x, 1./3.) due to rounding and behavior with
1332      negative x, we need to constrain this transformation to unsafe
1333      math and positive x or finite math.  */
1334   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1335   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1336   real_round (&c2, mode, &c2);
1337   n = real_to_integer (&c2);
1338   real_from_integer (&cint, VOIDmode, n, SIGNED);
1339   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1340   real_convert (&c2, mode, &c2);
1341
1342   if (flag_unsafe_math_optimizations
1343       && cbrtfn
1344       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1345       && real_identical (&c2, &c)
1346       && !c2_is_int
1347       && optimize_function_for_speed_p (cfun)
1348       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1349     {
1350       tree powi_x_ndiv3 = NULL_TREE;
1351
1352       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1353          possible or profitable, give up.  Skip the degenerate case when
1354          abs(n) < 3, where that powi factor is always 1.  */
1355       if (absu_hwi (n) >= 3)
1356         {
1357           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1358                                                      abs_hwi (n / 3));
1359           if (!powi_x_ndiv3)
1360             return NULL_TREE;
1361         }
1362
1363       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1364          as that creates an unnecessary variable.  Instead, just produce
1365          either cbrt(x) or cbrt(x) * cbrt(x).  */
1366       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1367
1368       if (absu_hwi (n) % 3 == 1)
1369         powi_cbrt_x = cbrt_x;
1370       else
1371         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1372                                               cbrt_x, cbrt_x);
1373
1374       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1375       if (absu_hwi (n) < 3)
1376         result = powi_cbrt_x;
1377       else
1378         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1379                                          powi_x_ndiv3, powi_cbrt_x);
1380
1381       /* If n is negative, reciprocate the result.  */
1382       if (n < 0)
1383         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1384                                          build_real (type, dconst1), result);
1385
1386       return result;
1387     }
1388
1389   /* No optimizations succeeded.  */
1390   return NULL_TREE;
1391 }
1392
1393 /* ARG is the argument to a cabs builtin call in GSI with location info
1394    LOC.  Create a sequence of statements prior to GSI that calculates
1395    sqrt(R*R + I*I), where R and I are the real and imaginary components
1396    of ARG, respectively.  Return an expression holding the result.  */
1397
1398 static tree
1399 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1400 {
1401   tree real_part, imag_part, addend1, addend2, sum, result;
1402   tree type = TREE_TYPE (TREE_TYPE (arg));
1403   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1404   machine_mode mode = TYPE_MODE (type);
1405   /* Give up unless unsafe math, speed optimization and a sqrt insn apply.  */
1406   if (!flag_unsafe_math_optimizations
1407       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1408       || !sqrtfn
1409       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1410     return NULL_TREE;
1411   /* Emit R*R and I*I, add them, and take the square root of the sum.  */
1412   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1413                                     REALPART_EXPR, arg);
1414   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1415                                     real_part, real_part);
1416   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1417                                     IMAGPART_EXPR, arg);
1418   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1419                                     imag_part, imag_part);
1420   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1421   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1422
1423   return result;
1424 }
1425
1426 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1427    on the SSA_NAME argument of each of them.  Also replace pow, powi and
1428    cabs calls by cheaper statement sequences when one can be built.  */
1429
1430 namespace {
1431
1432 const pass_data pass_data_cse_sincos =
1433 {
1434   GIMPLE_PASS, /* type */
1435   "sincos", /* name */
1436   OPTGROUP_NONE, /* optinfo_flags */
1437   TV_NONE, /* tv_id */
1438   PROP_ssa, /* properties_required */
1439   0, /* properties_provided */
1440   0, /* properties_destroyed */
1441   0, /* todo_flags_start */
1442   TODO_update_ssa, /* todo_flags_finish */
1443 };
1444
1445 class pass_cse_sincos : public gimple_opt_pass
1446 {
1447 public:
1448   pass_cse_sincos (gcc::context *ctxt)
1449     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1450   {}
1451
1452   /* opt_pass methods: */
1453   virtual bool gate (function *)
1454     {
1455       /* We no longer require either sincos or cexp, since powi expansion
1456          piggybacks on this pass.  */
1457       return optimize;
1458     }
1459
1460   virtual unsigned int execute (function *);
1461
1462 }; // class pass_cse_sincos
1463
1464 unsigned int
1465 pass_cse_sincos::execute (function *fun)
1466 {
1467   basic_block bb;
1468   bool cfg_changed = false;
1469
1470   calculate_dominance_info (CDI_DOMINATORS);
1471   memset (&sincos_stats, 0, sizeof (sincos_stats));
1472
1473   FOR_EACH_BB_FN (bb, fun)
1474     {
1475       gimple_stmt_iterator gsi;
1476       bool cleanup_eh = false;
1477
1478       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1479         {
1480           gimple stmt = gsi_stmt (gsi);
1481           tree fndecl;
1482
1483           /* Only the last stmt in a bb could throw, no need to call
1484              gimple_purge_dead_eh_edges if we change something in the middle
1485              of a basic block.  */
1486           cleanup_eh = false;
1487
1488           if (is_gimple_call (stmt)
1489               && gimple_call_lhs (stmt)
1490               && (fndecl = gimple_call_fndecl (stmt))
1491               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1492             {
1493               tree arg, arg0, arg1, result;
1494               HOST_WIDE_INT n;
1495               location_t loc;
1496
1497               switch (DECL_FUNCTION_CODE (fndecl))
1498                 {
1499                 CASE_FLT_FN (BUILT_IN_COS):
1500                 CASE_FLT_FN (BUILT_IN_SIN):
1501                 CASE_FLT_FN (BUILT_IN_CEXPI):
1502                   /* Make sure we have either sincos or cexp.  */
1503                   if (!targetm.libc_has_function (function_c99_math_complex)
1504                       && !targetm.libc_has_function (function_sincos))
1505                     break;
1506
1507                   arg = gimple_call_arg (stmt, 0);
1508                   if (TREE_CODE (arg) == SSA_NAME)
1509                     cfg_changed |= execute_cse_sincos_1 (arg);
1510                   break;
1511
1512                 CASE_FLT_FN (BUILT_IN_POW):
1513                   arg0 = gimple_call_arg (stmt, 0);
1514                   arg1 = gimple_call_arg (stmt, 1);
1515
1516                   loc = gimple_location (stmt);
1517                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1518                   /* On success, turn the call into a copy from RESULT.  */
1519                   if (result)
1520                     {
1521                       tree lhs = gimple_get_lhs (stmt);
1522                       gassign *new_stmt = gimple_build_assign (lhs, result);
1523                       gimple_set_location (new_stmt, loc);
1524                       unlink_stmt_vdef (stmt);
1525                       gsi_replace (&gsi, new_stmt, true);
1526                       cleanup_eh = true;
1527                       if (gimple_vdef (stmt))
1528                         release_ssa_name (gimple_vdef (stmt));
1529                     }
1530                   break;
1531
1532                 CASE_FLT_FN (BUILT_IN_POWI):
1533                   arg0 = gimple_call_arg (stmt, 0);
1534                   arg1 = gimple_call_arg (stmt, 1);
1535                   loc = gimple_location (stmt);
1536                   /* powi (-1, n) is -1 for odd n, 1 for even n: test n & 1.  */
1537                   if (real_minus_onep (arg0))
1538                     {
1539                       tree t0, t1, cond, one, minus_one;
1540                       gassign *stmt;
1541
1542                       t0 = TREE_TYPE (arg0);
1543                       t1 = TREE_TYPE (arg1);
1544                       one = build_real (t0, dconst1);
1545                       minus_one = build_real (t0, dconstm1);
1546
1547                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1548                       stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1549                                                   arg1, build_int_cst (t1, 1));
1550                       gimple_set_location (stmt, loc);
1551                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1552
1553                       result = make_temp_ssa_name (t0, NULL, "powi");
1554                       stmt = gimple_build_assign (result, COND_EXPR, cond,
1555                                                   minus_one, one);
1556                       gimple_set_location (stmt, loc);
1557                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1558                     }
1559                   else
1560                     {
1561                       if (!tree_fits_shwi_p (arg1))
1562                         break;
1563
1564                       n = tree_to_shwi (arg1);
1565                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1566                     }
1567
1568                   if (result)
1569                     {
1570                       tree lhs = gimple_get_lhs (stmt);
1571                       gassign *new_stmt = gimple_build_assign (lhs, result);
1572                       gimple_set_location (new_stmt, loc);
1573                       unlink_stmt_vdef (stmt);
1574                       gsi_replace (&gsi, new_stmt, true);
1575                       cleanup_eh = true;
1576                       if (gimple_vdef (stmt))
1577                         release_ssa_name (gimple_vdef (stmt));
1578                     }
1579                   break;
1580
1581                 CASE_FLT_FN (BUILT_IN_CABS):
1582                   arg0 = gimple_call_arg (stmt, 0);
1583                   loc = gimple_location (stmt);
1584                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1585
1586                   if (result)
1587                     {
1588                       tree lhs = gimple_get_lhs (stmt);
1589                       gassign *new_stmt = gimple_build_assign (lhs, result);
1590                       gimple_set_location (new_stmt, loc);
1591                       unlink_stmt_vdef (stmt);
1592                       gsi_replace (&gsi, new_stmt, true);
1593                       cleanup_eh = true;
1594                       if (gimple_vdef (stmt))
1595                         release_ssa_name (gimple_vdef (stmt));
1596                     }
1597                   break;
1598
1599                 default:;
1600                 }
1601             }
1602         }
1603       if (cleanup_eh)
1604         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1605     }
1606
1607   statistics_counter_event (fun, "sincos statements inserted",
1608                             sincos_stats.inserted);
1609
1610   free_dominance_info (CDI_DOMINATORS);
1611   return cfg_changed ? TODO_cleanup_cfg : 0;
1612 }
1613
1614 } // anon namespace
 1615 /* Create a new instance of the sincos pass for context CTXT.  */
1616 gimple_opt_pass *
1617 make_pass_cse_sincos (gcc::context *ctxt)
1618 {
1619   return new pass_cse_sincos (ctxt);
1620 }
1621
1622 /* A symbolic number is used to detect byte permutation and selection
1623    patterns.  Therefore the field N contains an artificial number
1624    consisting of octet sized markers:
1625
1626    0    - target byte has the value 0
1627    FF   - target byte has an unknown value (eg. due to sign extension)
1628    1..size - marker value is the originating byte index plus one (cf. CMPNOP).
1629
1630    To detect permutations on memory sources (arrays and structures), a symbolic
1631    number is also associated with a base address (the array or structure the
1632    load is made from), an offset from the base address and a range which gives
1633    the difference between the highest and lowest accessed memory location to
1634    make such a symbolic number.  The range is thus different from size, which
1635    reflects the size of the type of the current expression.  Note that for a
1636    non-memory source, range holds the same value as size.
1637
1638    For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1639    a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1640    still have a size of 2 but this time a range of 1.  */
1641
1642 struct symbolic_number {
1643   uint64_t n;  /* The octet-sized markers described above.  */
1644   tree type;  /* Its precision gives the size in bytes (see do_shift_rotate).  */
1645   tree base_addr;  /* Base of a memory source, as described above.  */
1646   tree offset;  /* Offset from BASE_ADDR.  */
1647   HOST_WIDE_INT bytepos;
1648   tree alias_set;
1649   tree vuse;
1650   unsigned HOST_WIDE_INT range;  /* Span accessed; see the comment above.  */
1651 };
1652
1653 #define BITS_PER_MARKER 8  /* Width in bits of one marker.  */
1654 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)  /* One marker, all ones.  */
1655 #define MARKER_BYTE_UNKNOWN MARKER_MASK  /* The FF "unknown value" marker.  */
1656 #define HEAD_MARKER(n, size) \
1657   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1658 /* HEAD_MARKER yields the marker of N at byte position SIZE - 1, in place.  */
1659 /* The number which the find_bswap_or_nop_1 result should match in
1660    order to have a nop.  The number is masked according to the size of
1661    the symbolic number before using it.  */
1662 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1663   (uint64_t)0x08070605 << 32 | 0x04030201)
1664
1665 /* The number which the find_bswap_or_nop_1 result should match in
1666    order to have a byte swap.  The number is masked according to the
1667    size of the symbolic number before using it.  */
1668 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1669   (uint64_t)0x01020304 << 32 | 0x05060708)
1670
1671 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1672    number N.  Return false if the requested operation is not permitted
1673    on a symbolic number.  */
1674
1675 static inline bool
1676 do_shift_rotate (enum tree_code code,
1677                  struct symbolic_number *n,
1678                  int count)
1679 {
1680   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1681   unsigned head_marker;
1682
1683   if (count % BITS_PER_UNIT != 0)
1684     return false;
1685   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1686
1687   /* Zero out the extra bits of N in order to avoid them being shifted
1688      into the significant bits.  */
1689   if (size < 64 / BITS_PER_MARKER)
1690     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1691
1692   switch (code)
1693     {
1694     case LSHIFT_EXPR:
1695       n->n <<= count;
1696       break;
1697     case RSHIFT_EXPR:
1698       head_marker = HEAD_MARKER (n->n, size);
1699       n->n >>= count;
1700       /* Arithmetic shift of signed type: result is dependent on the value.  */
1701       if (!TYPE_UNSIGNED (n->type) && head_marker)
1702         for (i = 0; i < count / BITS_PER_MARKER; i++)
1703           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1704                   << ((size - 1 - i) * BITS_PER_MARKER);
1705       break;
1706     case LROTATE_EXPR:
1707       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1708       break;
1709     case RROTATE_EXPR:
1710       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1711       break;
1712     default:
1713       return false;
1714     }
1715   /* Zero unused bits for size.  */
1716   if (size < 64 / BITS_PER_MARKER)
1717     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1718   return true;
1719 }
1720
1721 /* Perform sanity checking for the symbolic number N and the gimple
1722    statement STMT.  */
1723
1724 static inline bool
1725 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1726 {
1727   tree lhs_type;
1728
1729   lhs_type = gimple_expr_type (stmt);
1730
1731   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1732     return false;
1733
1734   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1735     return false;
1736
1737   return true;
1738 }
1739
1740 /* Initialize the symbolic number N for the bswap pass from the base element
1741    SRC manipulated by the bitwise OR expression.  */
1742
1743 static bool
1744 init_symbolic_number (struct symbolic_number *n, tree src)
1745 {
1746   int size;
1747
1748   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1749
1750   /* Set up the symbolic number N by setting each byte to a value between 1 and
1751      the byte size of rhs1.  The highest order byte is set to n->size and the
1752      lowest order byte to 1.  */
1753   n->type = TREE_TYPE (src);
1754   size = TYPE_PRECISION (n->type);
1755   if (size % BITS_PER_UNIT != 0)
1756     return false;
1757   size /= BITS_PER_UNIT;
1758   if (size > 64 / BITS_PER_MARKER)
1759     return false;
1760   n->range = size;
1761   n->n = CMPNOP;
1762
1763   if (size < 64 / BITS_PER_MARKER)
1764     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1765
1766   return true;
1767 }
1768
1769 /* Check if STMT might be a byte swap or a nop from a memory source and returns
1770    the answer. If so, REF is that memory source and the base of the memory area
1771    accessed and the offset of the access from that base are recorded in N.  */
1772
1773 bool
1774 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1775 {
1776   /* Leaf node is an array or component ref. Memorize its base and
1777      offset from base to compare to other such leaf node.  */
1778   HOST_WIDE_INT bitsize, bitpos;
1779   machine_mode mode;
1780   int unsignedp, volatilep;
1781   tree offset, base_addr;
1782
1783   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1784     return false;
1785
1786   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1787                                    &unsignedp, &volatilep, false);
1788
1789   if (TREE_CODE (base_addr) == MEM_REF)
1790     {
1791       offset_int bit_offset = 0;
1792       tree off = TREE_OPERAND (base_addr, 1);
1793
1794       if (!integer_zerop (off))
1795         {
1796           offset_int boff, coff = mem_ref_offset (base_addr);
1797           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1798           bit_offset += boff;
1799         }
1800
1801       base_addr = TREE_OPERAND (base_addr, 0);
1802
1803       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1804       if (wi::neg_p (bit_offset))
1805         {
1806           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1807           offset_int tem = bit_offset.and_not (mask);
1808           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1809              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
1810           bit_offset -= tem;
1811           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1812           if (offset)
1813             offset = size_binop (PLUS_EXPR, offset,
1814                                     wide_int_to_tree (sizetype, tem));
1815           else
1816             offset = wide_int_to_tree (sizetype, tem);
1817         }
1818
1819       bitpos += bit_offset.to_shwi ();
1820     }
1821
1822   if (bitpos % BITS_PER_UNIT)
1823     return false;
1824   if (bitsize % BITS_PER_UNIT)
1825     return false;
1826
1827   if (!init_symbolic_number (n, ref))
1828     return false;
1829   n->base_addr = base_addr;
1830   n->offset = offset;
1831   n->bytepos = bitpos / BITS_PER_UNIT;
1832   n->alias_set = reference_alias_ptr_type (ref);
1833   n->vuse = gimple_vuse (stmt);
1834   return true;
1835 }
1836
1837 /* Compute the symbolic number N representing the result of a bitwise OR on 2
1838    symbolic number N1 and N2 whose source statements are respectively
1839    SOURCE_STMT1 and SOURCE_STMT2.  */
1840
1841 static gimple
1842 perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
1843                         gimple source_stmt2, struct symbolic_number *n2,
1844                         struct symbolic_number *n)
1845 {
1846   int i, size;
1847   uint64_t mask;
1848   gimple source_stmt;
1849   struct symbolic_number *n_start;
1850
1851   /* Sources are different, cancel bswap if they are not memory location with
1852      the same base (array, structure, ...).  */
1853   if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
1854     {
1855       int64_t inc;
1856       HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
1857       struct symbolic_number *toinc_n_ptr, *n_end;
1858
1859       if (!n1->base_addr || !n2->base_addr
1860           || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
1861         return NULL;
1862
1863       if (!n1->offset != !n2->offset ||
1864           (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
1865         return NULL;
1866
1867       if (n1->bytepos < n2->bytepos)
1868         {
1869           n_start = n1;
1870           start_sub = n2->bytepos - n1->bytepos;
1871           source_stmt = source_stmt1;
1872         }
1873       else
1874         {
1875           n_start = n2;
1876           start_sub = n1->bytepos - n2->bytepos;
1877           source_stmt = source_stmt2;
1878         }
1879
1880       /* Find the highest address at which a load is performed and
1881          compute related info.  */
1882       end1 = n1->bytepos + (n1->range - 1);
1883       end2 = n2->bytepos + (n2->range - 1);
1884       if (end1 < end2)
1885         {
1886           end = end2;
1887           end_sub = end2 - end1;
1888         }
1889       else
1890         {
1891           end = end1;
1892           end_sub = end1 - end2;
1893         }
1894       n_end = (end2 > end1) ? n2 : n1;
1895
1896       /* Find symbolic number whose lsb is the most significant.  */
1897       if (BYTES_BIG_ENDIAN)
1898         toinc_n_ptr = (n_end == n1) ? n2 : n1;
1899       else
1900         toinc_n_ptr = (n_start == n1) ? n2 : n1;
1901
1902       n->range = end - n_start->bytepos + 1;
1903
1904       /* Check that the range of memory covered can be represented by
1905          a symbolic number.  */
1906       if (n->range > 64 / BITS_PER_MARKER)
1907         return NULL;
1908
1909       /* Reinterpret byte marks in symbolic number holding the value of
1910          bigger weight according to target endianness.  */
1911       inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
1912       size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
1913       for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
1914         {
1915           unsigned marker =
1916             (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
1917           if (marker && marker != MARKER_BYTE_UNKNOWN)
1918             toinc_n_ptr->n += inc;
1919         }
1920     }
1921   else
1922     {
1923       n->range = n1->range;
1924       n_start = n1;
1925       source_stmt = source_stmt1;
1926     }
1927
1928   if (!n1->alias_set
1929       || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
1930     n->alias_set = n1->alias_set;
1931   else
1932     n->alias_set = ptr_type_node;
1933   n->vuse = n_start->vuse;
1934   n->base_addr = n_start->base_addr;
1935   n->offset = n_start->offset;
1936   n->bytepos = n_start->bytepos;
1937   n->type = n_start->type;
1938   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1939
1940   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
1941     {
1942       uint64_t masked1, masked2;
1943
1944       masked1 = n1->n & mask;
1945       masked2 = n2->n & mask;
1946       if (masked1 && masked2 && masked1 != masked2)
1947         return NULL;
1948     }
1949   n->n = n1->n | n2->n;
1950
1951   return source_stmt;
1952 }
1953
1954 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1955    the operation given by the rhs of STMT on the result.  If the operation
1956    could successfully be executed the function returns a gimple stmt whose
1957    rhs's first tree is the expression of the source operand and NULL
1958    otherwise.  */
1959
1960 static gimple
1961 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1962 {
1963   enum tree_code code;
1964   tree rhs1, rhs2 = NULL;
1965   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1966   enum gimple_rhs_class rhs_class;
1967
1968   if (!limit || !is_gimple_assign (stmt))
1969     return NULL;
1970
1971   rhs1 = gimple_assign_rhs1 (stmt);
1972
1973   if (find_bswap_or_nop_load (stmt, rhs1, n))
1974     return stmt;
1975
1976   if (TREE_CODE (rhs1) != SSA_NAME)
1977     return NULL;
1978
1979   code = gimple_assign_rhs_code (stmt);
1980   rhs_class = gimple_assign_rhs_class (stmt);
1981   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1982
1983   if (rhs_class == GIMPLE_BINARY_RHS)
1984     rhs2 = gimple_assign_rhs2 (stmt);
1985
1986   /* Handle unary rhs and binary rhs with integer constants as second
1987      operand.  */
1988
1989   if (rhs_class == GIMPLE_UNARY_RHS
1990       || (rhs_class == GIMPLE_BINARY_RHS
1991           && TREE_CODE (rhs2) == INTEGER_CST))
1992     {
1993       if (code != BIT_AND_EXPR
1994           && code != LSHIFT_EXPR
1995           && code != RSHIFT_EXPR
1996           && code != LROTATE_EXPR
1997           && code != RROTATE_EXPR
1998           && !CONVERT_EXPR_CODE_P (code))
1999         return NULL;
2000
2001       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2002
2003       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2004          we have to initialize the symbolic number.  */
2005       if (!source_stmt1)
2006         {
2007           if (gimple_assign_load_p (stmt)
2008               || !init_symbolic_number (n, rhs1))
2009             return NULL;
2010           source_stmt1 = stmt;
2011         }
2012
2013       switch (code)
2014         {
2015         case BIT_AND_EXPR:
2016           {
2017             int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2018             uint64_t val = int_cst_value (rhs2), mask = 0;
2019             uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2020
2021             /* Only constants masking full bytes are allowed.  */
2022             for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2023               if ((val & tmp) != 0 && (val & tmp) != tmp)
2024                 return NULL;
2025               else if (val & tmp)
2026                 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2027
2028             n->n &= mask;
2029           }
2030           break;
2031         case LSHIFT_EXPR:
2032         case RSHIFT_EXPR:
2033         case LROTATE_EXPR:
2034         case RROTATE_EXPR:
2035           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
2036             return NULL;
2037           break;
2038         CASE_CONVERT:
2039           {
2040             int i, type_size, old_type_size;
2041             tree type;
2042
2043             type = gimple_expr_type (stmt);
2044             type_size = TYPE_PRECISION (type);
2045             if (type_size % BITS_PER_UNIT != 0)
2046               return NULL;
2047             type_size /= BITS_PER_UNIT;
2048             if (type_size > 64 / BITS_PER_MARKER)
2049               return NULL;
2050
2051             /* Sign extension: result is dependent on the value.  */
2052             old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2053             if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2054                 && HEAD_MARKER (n->n, old_type_size))
2055               for (i = 0; i < type_size - old_type_size; i++)
2056                 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2057                         << ((type_size - 1 - i) * BITS_PER_MARKER);
2058
2059             if (type_size < 64 / BITS_PER_MARKER)
2060               {
2061                 /* If STMT casts to a smaller type mask out the bits not
2062                    belonging to the target type.  */
2063                 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2064               }
2065             n->type = type;
2066             if (!n->base_addr)
2067               n->range = type_size;
2068           }
2069           break;
2070         default:
2071           return NULL;
2072         };
2073       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2074     }
2075
2076   /* Handle binary rhs.  */
2077
2078   if (rhs_class == GIMPLE_BINARY_RHS)
2079     {
2080       struct symbolic_number n1, n2;
2081       gimple source_stmt, source_stmt2;
2082
2083       if (code != BIT_IOR_EXPR)
2084         return NULL;
2085
2086       if (TREE_CODE (rhs2) != SSA_NAME)
2087         return NULL;
2088
2089       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2090
2091       switch (code)
2092         {
2093         case BIT_IOR_EXPR:
2094           source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2095
2096           if (!source_stmt1)
2097             return NULL;
2098
2099           source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2100
2101           if (!source_stmt2)
2102             return NULL;
2103
2104           if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2105             return NULL;
2106
2107           if (!n1.vuse != !n2.vuse ||
2108           (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2109             return NULL;
2110
2111           source_stmt =
2112             perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2113
2114           if (!source_stmt)
2115             return NULL;
2116
2117           if (!verify_symbolic_number_p (n, stmt))
2118             return NULL;
2119
2120           break;
2121         default:
2122           return NULL;
2123         }
2124       return source_stmt;
2125     }
2126   return NULL;
2127 }
2128
2129 /* Check if STMT completes a bswap implementation or a read in a given
2130    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2131    accordingly.  It also sets N to represent the kind of operations
2132    performed: size of the resulting expression and whether it works on
2133    a memory source, and if so alias-set and vuse.  At last, the
2134    function returns a stmt whose rhs's first tree is the source
2135    expression.  */
2136
2137 static gimple
2138 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2139 {
2140 /* The number which the find_bswap_or_nop_1 result should match in order
2141    to have a full byte swap.  The number is shifted to the right
2142    according to the size of the symbolic number before using it.  */
2143   uint64_t cmpxchg = CMPXCHG;
2144   uint64_t cmpnop = CMPNOP;
2145
2146   gimple source_stmt;
2147   int limit;
2148
2149   /* The last parameter determines the depth search limit.  It usually
2150      correlates directly to the number n of bytes to be touched.  We
2151      increase that number by log2(n) + 1 here in order to also
2152      cover signed -> unsigned conversions of the src operand as can be seen
2153      in libgcc, and for initial shift/and operation of the src operand.  */
2154   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2155   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2156   source_stmt =  find_bswap_or_nop_1 (stmt, n, limit);
2157
2158   if (!source_stmt)
2159     return NULL;
2160
2161   /* Find real size of result (highest non zero byte).  */
2162   if (n->base_addr)
2163     {
2164       int rsize;
2165       uint64_t tmpn;
2166
2167       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2168       n->range = rsize;
2169     }
2170
2171   /* Zero out the extra bits of N and CMP*.  */
2172   if (n->range < (int) sizeof (int64_t))
2173     {
2174       uint64_t mask;
2175
2176       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2177       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2178       cmpnop &= mask;
2179     }
2180
2181   /* A complete byte swap should make the symbolic number to start with
2182      the largest digit in the highest order byte. Unchanged symbolic
2183      number indicates a read with same endianness as target architecture.  */
2184   if (n->n == cmpnop)
2185     *bswap = false;
2186   else if (n->n == cmpxchg)
2187     *bswap = true;
2188   else
2189     return NULL;
2190
2191   /* Useless bit manipulation performed by code.  */
2192   if (!n->base_addr && n->n == cmpnop)
2193     return NULL;
2194
2195   n->range *= BITS_PER_UNIT;
2196   return source_stmt;
2197 }
2198
2199 namespace {
2200
2201 const pass_data pass_data_optimize_bswap =
2202 {
2203   GIMPLE_PASS, /* type */
2204   "bswap", /* name */
2205   OPTGROUP_NONE, /* optinfo_flags */
2206   TV_NONE, /* tv_id */
2207   PROP_ssa, /* properties_required */
2208   0, /* properties_provided */
2209   0, /* properties_destroyed */
2210   0, /* todo_flags_start */
2211   0, /* todo_flags_finish */
2212 };
2213
2214 class pass_optimize_bswap : public gimple_opt_pass
2215 {
2216 public:
2217   pass_optimize_bswap (gcc::context *ctxt)
2218     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2219   {}
2220
2221   /* opt_pass methods: */
2222   virtual bool gate (function *)
2223     {
2224       return flag_expensive_optimizations && optimize;
2225     }
2226
2227   virtual unsigned int execute (function *);
2228
2229 }; // class pass_optimize_bswap
2230
2231 /* Perform the bswap optimization: replace the expression computed in the rhs
2232    of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2233    Which of these alternatives replace the rhs is given by N->base_addr (non
2234    null if a load is needed) and BSWAP.  The type, VUSE and set-alias of the
2235    load to perform are also given in N while the builtin bswap invoke is given
2236    in FNDEL.  Finally, if a load is involved, SRC_STMT refers to one of the
2237    load statements involved to construct the rhs in CUR_STMT and N->range gives
2238    the size of the rhs expression for maintaining some statistics.
2239
2240    Note that if the replacement involve a load, CUR_STMT is moved just after
2241    SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2242    changing of basic block.  */
2243
2244 static bool
2245 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2246                tree load_type, struct symbolic_number *n, bool bswap)
2247 {
2248   gimple_stmt_iterator gsi;
2249   tree src, tmp, tgt;
2250   gimple bswap_stmt;
2251
2252   gsi = gsi_for_stmt (cur_stmt);
2253   src = gimple_assign_rhs1 (src_stmt);
2254   tgt = gimple_assign_lhs (cur_stmt);
2255
2256   /* Need to load the value from memory first.  */
2257   if (n->base_addr)
2258     {
2259       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2260       tree addr_expr, addr_tmp, val_expr, val_tmp;
2261       tree load_offset_ptr, aligned_load_type;
2262       gimple addr_stmt, load_stmt;
2263       unsigned align;
2264
2265       align = get_object_alignment (src);
2266       if (bswap
2267           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2268           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2269         return false;
2270
2271       /* Move cur_stmt just before  one of the load of the original
2272          to ensure it has the same VUSE.  See PR61517 for what could
2273          go wrong.  */
2274       gsi_move_before (&gsi, &gsi_ins);
2275       gsi = gsi_for_stmt (cur_stmt);
2276
2277       /*  Compute address to load from and cast according to the size
2278           of the load.  */
2279       addr_expr = build_fold_addr_expr (unshare_expr (src));
2280       if (is_gimple_min_invariant (addr_expr))
2281         addr_tmp = addr_expr;
2282       else
2283         {
2284           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2285                                          "load_src");
2286           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2287           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2288         }
2289
2290       /* Perform the load.  */
2291       aligned_load_type = load_type;
2292       if (align < TYPE_ALIGN (load_type))
2293         aligned_load_type = build_aligned_type (load_type, align);
2294       load_offset_ptr = build_int_cst (n->alias_set, 0);
2295       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2296                               load_offset_ptr);
2297
2298       if (!bswap)
2299         {
2300           if (n->range == 16)
2301             nop_stats.found_16bit++;
2302           else if (n->range == 32)
2303             nop_stats.found_32bit++;
2304           else
2305             {
2306               gcc_assert (n->range == 64);
2307               nop_stats.found_64bit++;
2308             }
2309
2310           /* Convert the result of load if necessary.  */
2311           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2312             {
2313               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2314                                             "load_dst");
2315               load_stmt = gimple_build_assign (val_tmp, val_expr);
2316               gimple_set_vuse (load_stmt, n->vuse);
2317               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2318               gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2319             }
2320           else
2321             {
2322               gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2323               gimple_set_vuse (cur_stmt, n->vuse);
2324             }
2325           update_stmt (cur_stmt);
2326
2327           if (dump_file)
2328             {
2329               fprintf (dump_file,
2330                        "%d bit load in target endianness found at: ",
2331                        (int)n->range);
2332               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2333             }
2334           return true;
2335         }
2336       else
2337         {
2338           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2339           load_stmt = gimple_build_assign (val_tmp, val_expr);
2340           gimple_set_vuse (load_stmt, n->vuse);
2341           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2342         }
2343       src = val_tmp;
2344     }
2345
2346   if (n->range == 16)
2347     bswap_stats.found_16bit++;
2348   else if (n->range == 32)
2349     bswap_stats.found_32bit++;
2350   else
2351     {
2352       gcc_assert (n->range == 64);
2353       bswap_stats.found_64bit++;
2354     }
2355
2356   tmp = src;
2357
2358   /* Convert the src expression if necessary.  */
2359   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2360     {
2361       gimple convert_stmt;
2362
2363       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2364       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2365       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2366     }
2367
2368   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2369      are considered as rotation of 2N bit values by N bits is generally not
2370      equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
2371      gives 0x03040102 while a bswap for that value is 0x04030201.  */
2372   if (bswap && n->range == 16)
2373     {
2374       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2375       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2376       bswap_stmt = gimple_build_assign (NULL, src);
2377     }
2378   else
2379     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2380
2381   tmp = tgt;
2382
2383   /* Convert the result if necessary.  */
2384   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2385     {
2386       gimple convert_stmt;
2387
2388       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2389       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2390       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2391     }
2392
2393   gimple_set_lhs (bswap_stmt, tmp);
2394
2395   if (dump_file)
2396     {
2397       fprintf (dump_file, "%d bit bswap implementation found at: ",
2398                (int)n->range);
2399       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2400     }
2401
2402   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2403   gsi_remove (&gsi, true);
2404   return true;
2405 }
2406
2407 /* Find manual byte swap implementations as well as load in a given
2408    endianness. Byte swaps are turned into a bswap builtin invokation
2409    while endian loads are converted to bswap builtin invokation or
2410    simple load according to the target endianness.  */
2411
2412 unsigned int
2413 pass_optimize_bswap::execute (function *fun)
2414 {
2415   basic_block bb;
2416   bool bswap32_p, bswap64_p;
2417   bool changed = false;
2418   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2419
2420   if (BITS_PER_UNIT != 8)
2421     return 0;
2422
2423   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2424                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2425   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2426                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2427                    || (bswap32_p && word_mode == SImode)));
2428
2429   /* Determine the argument type of the builtins.  The code later on
2430      assumes that the return and argument type are the same.  */
2431   if (bswap32_p)
2432     {
2433       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2434       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2435     }
2436
2437   if (bswap64_p)
2438     {
2439       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2440       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2441     }
2442
2443   memset (&nop_stats, 0, sizeof (nop_stats));
2444   memset (&bswap_stats, 0, sizeof (bswap_stats));
2445
2446   FOR_EACH_BB_FN (bb, fun)
2447     {
2448       gimple_stmt_iterator gsi;
2449
2450       /* We do a reverse scan for bswap patterns to make sure we get the
2451          widest match. As bswap pattern matching doesn't handle previously
2452          inserted smaller bswap replacements as sub-patterns, the wider
2453          variant wouldn't be detected.  */
2454       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2455         {
2456           gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2457           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2458           enum tree_code code;
2459           struct symbolic_number n;
2460           bool bswap;
2461
2462           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2463              might be moved to a different basic block by bswap_replace and gsi
2464              must not points to it if that's the case.  Moving the gsi_prev
2465              there make sure that gsi points to the statement previous to
2466              cur_stmt while still making sure that all statements are
2467              considered in this basic block.  */
2468           gsi_prev (&gsi);
2469
2470           if (!is_gimple_assign (cur_stmt))
2471             continue;
2472
2473           code = gimple_assign_rhs_code (cur_stmt);
2474           switch (code)
2475             {
2476             case LROTATE_EXPR:
2477             case RROTATE_EXPR:
2478               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2479                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2480                      % BITS_PER_UNIT)
2481                 continue;
2482               /* Fall through.  */
2483             case BIT_IOR_EXPR:
2484               break;
2485             default:
2486               continue;
2487             }
2488
2489           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2490
2491           if (!src_stmt)
2492             continue;
2493
2494           switch (n.range)
2495             {
2496             case 16:
2497               /* Already in canonical form, nothing to do.  */
2498               if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2499                 continue;
2500               load_type = bswap_type = uint16_type_node;
2501               break;
2502             case 32:
2503               load_type = uint32_type_node;
2504               if (bswap32_p)
2505                 {
2506                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2507                   bswap_type = bswap32_type;
2508                 }
2509               break;
2510             case 64:
2511               load_type = uint64_type_node;
2512               if (bswap64_p)
2513                 {
2514                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2515                   bswap_type = bswap64_type;
2516                 }
2517               break;
2518             default:
2519               continue;
2520             }
2521
2522           if (bswap && !fndecl && n.range != 16)
2523             continue;
2524
2525           if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2526                              &n, bswap))
2527             changed = true;
2528         }
2529     }
2530
2531   statistics_counter_event (fun, "16-bit nop implementations found",
2532                             nop_stats.found_16bit);
2533   statistics_counter_event (fun, "32-bit nop implementations found",
2534                             nop_stats.found_32bit);
2535   statistics_counter_event (fun, "64-bit nop implementations found",
2536                             nop_stats.found_64bit);
2537   statistics_counter_event (fun, "16-bit bswap implementations found",
2538                             bswap_stats.found_16bit);
2539   statistics_counter_event (fun, "32-bit bswap implementations found",
2540                             bswap_stats.found_32bit);
2541   statistics_counter_event (fun, "64-bit bswap implementations found",
2542                             bswap_stats.found_64bit);
2543
2544   return (changed ? TODO_update_ssa : 0);
2545 }
2546
2547 } // anon namespace
2548
2549 gimple_opt_pass *
2550 make_pass_optimize_bswap (gcc::context *ctxt)
2551 {
2552   return new pass_optimize_bswap (ctxt);
2553 }
2554
2555 /* Return true if stmt is a type conversion operation that can be stripped
2556    when used in a widening multiply operation.  */
2557 static bool
2558 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2559 {
2560   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2561
2562   if (TREE_CODE (result_type) == INTEGER_TYPE)
2563     {
2564       tree op_type;
2565       tree inner_op_type;
2566
2567       if (!CONVERT_EXPR_CODE_P (rhs_code))
2568         return false;
2569
2570       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2571
2572       /* If the type of OP has the same precision as the result, then
2573          we can strip this conversion.  The multiply operation will be
2574          selected to create the correct extension as a by-product.  */
2575       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2576         return true;
2577
2578       /* We can also strip a conversion if it preserves the signed-ness of
2579          the operation and doesn't narrow the range.  */
2580       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2581
2582       /* If the inner-most type is unsigned, then we can strip any
2583          intermediate widening operation.  If it's signed, then the
2584          intermediate widening operation must also be signed.  */
2585       if ((TYPE_UNSIGNED (inner_op_type)
2586            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2587           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2588         return true;
2589
2590       return false;
2591     }
2592
2593   return rhs_code == FIXED_CONVERT_EXPR;
2594 }
2595
2596 /* Return true if RHS is a suitable operand for a widening multiplication,
2597    assuming a target type of TYPE.
2598    There are two cases:
2599
2600      - RHS makes some value at least twice as wide.  Store that value
2601        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2602
2603      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2604        but leave *TYPE_OUT untouched.  */
2605
2606 static bool
2607 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2608                         tree *new_rhs_out)
2609 {
2610   gimple stmt;
2611   tree type1, rhs1;
2612
2613   if (TREE_CODE (rhs) == SSA_NAME)
2614     {
2615       stmt = SSA_NAME_DEF_STMT (rhs);
2616       if (is_gimple_assign (stmt))
2617         {
2618           if (! widening_mult_conversion_strippable_p (type, stmt))
2619             rhs1 = rhs;
2620           else
2621             {
2622               rhs1 = gimple_assign_rhs1 (stmt);
2623
2624               if (TREE_CODE (rhs1) == INTEGER_CST)
2625                 {
2626                   *new_rhs_out = rhs1;
2627                   *type_out = NULL;
2628                   return true;
2629                 }
2630             }
2631         }
2632       else
2633         rhs1 = rhs;
2634
2635       type1 = TREE_TYPE (rhs1);
2636
2637       if (TREE_CODE (type1) != TREE_CODE (type)
2638           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2639         return false;
2640
2641       *new_rhs_out = rhs1;
2642       *type_out = type1;
2643       return true;
2644     }
2645
2646   if (TREE_CODE (rhs) == INTEGER_CST)
2647     {
2648       *new_rhs_out = rhs;
2649       *type_out = NULL;
2650       return true;
2651     }
2652
2653   return false;
2654 }
2655
2656 /* Return true if STMT performs a widening multiplication, assuming the
2657    output type is TYPE.  If so, store the unwidened types of the operands
2658    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2659    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2660    and *TYPE2_OUT would give the operands of the multiplication.  */
2661
2662 static bool
2663 is_widening_mult_p (gimple stmt,
2664                     tree *type1_out, tree *rhs1_out,
2665                     tree *type2_out, tree *rhs2_out)
2666 {
2667   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2668
2669   if (TREE_CODE (type) != INTEGER_TYPE
2670       && TREE_CODE (type) != FIXED_POINT_TYPE)
2671     return false;
2672
2673   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2674                                rhs1_out))
2675     return false;
2676
2677   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2678                                rhs2_out))
2679     return false;
2680
2681   if (*type1_out == NULL)
2682     {
2683       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2684         return false;
2685       *type1_out = *type2_out;
2686     }
2687
2688   if (*type2_out == NULL)
2689     {
2690       if (!int_fits_type_p (*rhs2_out, *type1_out))
2691         return false;
2692       *type2_out = *type1_out;
2693     }
2694
2695   /* Ensure that the larger of the two operands comes first. */
2696   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2697     {
2698       tree tmp;
2699       tmp = *type1_out;
2700       *type1_out = *type2_out;
2701       *type2_out = tmp;
2702       tmp = *rhs1_out;
2703       *rhs1_out = *rhs2_out;
2704       *rhs2_out = tmp;
2705     }
2706
2707   return true;
2708 }
2709
2710 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2711    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2712    value is true iff we converted the statement.  */
2713
2714 static bool
2715 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2716 {
2717   tree lhs, rhs1, rhs2, type, type1, type2;
2718   enum insn_code handler;
2719   machine_mode to_mode, from_mode, actual_mode;
2720   optab op;
2721   int actual_precision;
2722   location_t loc = gimple_location (stmt);
2723   bool from_unsigned1, from_unsigned2;
2724
2725   lhs = gimple_assign_lhs (stmt);
2726   type = TREE_TYPE (lhs);
2727   if (TREE_CODE (type) != INTEGER_TYPE)
2728     return false;
2729
2730   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2731     return false;
2732
2733   to_mode = TYPE_MODE (type);
2734   from_mode = TYPE_MODE (type1);
2735   from_unsigned1 = TYPE_UNSIGNED (type1);
2736   from_unsigned2 = TYPE_UNSIGNED (type2);
2737
2738   if (from_unsigned1 && from_unsigned2)
2739     op = umul_widen_optab;
2740   else if (!from_unsigned1 && !from_unsigned2)
2741     op = smul_widen_optab;
2742   else
2743     op = usmul_widen_optab;
2744
2745   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2746                                                   0, &actual_mode);
2747
2748   if (handler == CODE_FOR_nothing)
2749     {
2750       if (op != smul_widen_optab)
2751         {
2752           /* We can use a signed multiply with unsigned types as long as
2753              there is a wider mode to use, or it is the smaller of the two
2754              types that is unsigned.  Note that type1 >= type2, always.  */
2755           if ((TYPE_UNSIGNED (type1)
2756                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2757               || (TYPE_UNSIGNED (type2)
2758                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2759             {
2760               from_mode = GET_MODE_WIDER_MODE (from_mode);
2761               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2762                 return false;
2763             }
2764
2765           op = smul_widen_optab;
2766           handler = find_widening_optab_handler_and_mode (op, to_mode,
2767                                                           from_mode, 0,
2768                                                           &actual_mode);
2769
2770           if (handler == CODE_FOR_nothing)
2771             return false;
2772
2773           from_unsigned1 = from_unsigned2 = false;
2774         }
2775       else
2776         return false;
2777     }
2778
2779   /* Ensure that the inputs to the handler are in the correct precison
2780      for the opcode.  This will be the full mode size.  */
2781   actual_precision = GET_MODE_PRECISION (actual_mode);
2782   if (2 * actual_precision > TYPE_PRECISION (type))
2783     return false;
2784   if (actual_precision != TYPE_PRECISION (type1)
2785       || from_unsigned1 != TYPE_UNSIGNED (type1))
2786     rhs1 = build_and_insert_cast (gsi, loc,
2787                                   build_nonstandard_integer_type
2788                                     (actual_precision, from_unsigned1), rhs1);
2789   if (actual_precision != TYPE_PRECISION (type2)
2790       || from_unsigned2 != TYPE_UNSIGNED (type2))
2791     rhs2 = build_and_insert_cast (gsi, loc,
2792                                   build_nonstandard_integer_type
2793                                     (actual_precision, from_unsigned2), rhs2);
2794
2795   /* Handle constants.  */
2796   if (TREE_CODE (rhs1) == INTEGER_CST)
2797     rhs1 = fold_convert (type1, rhs1);
2798   if (TREE_CODE (rhs2) == INTEGER_CST)
2799     rhs2 = fold_convert (type2, rhs2);
2800
2801   gimple_assign_set_rhs1 (stmt, rhs1);
2802   gimple_assign_set_rhs2 (stmt, rhs2);
2803   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2804   update_stmt (stmt);
2805   widen_mul_stats.widen_mults_inserted++;
2806   return true;
2807 }
2808
2809 /* Process a single gimple statement STMT, which is found at the
2810    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2811    rhs (given by CODE), and try to convert it into a
2812    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2813    is true iff we converted the statement.  */
2814
2815 static bool
2816 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2817                             enum tree_code code)
2818 {
2819   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2820   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2821   tree type, type1, type2, optype;
2822   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2823   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2824   optab this_optab;
2825   enum tree_code wmult_code;
2826   enum insn_code handler;
2827   machine_mode to_mode, from_mode, actual_mode;
2828   location_t loc = gimple_location (stmt);
2829   int actual_precision;
2830   bool from_unsigned1, from_unsigned2;
2831
2832   lhs = gimple_assign_lhs (stmt);
2833   type = TREE_TYPE (lhs);
2834   if (TREE_CODE (type) != INTEGER_TYPE
2835       && TREE_CODE (type) != FIXED_POINT_TYPE)
2836     return false;
2837
2838   if (code == MINUS_EXPR)
2839     wmult_code = WIDEN_MULT_MINUS_EXPR;
2840   else
2841     wmult_code = WIDEN_MULT_PLUS_EXPR;
2842
2843   rhs1 = gimple_assign_rhs1 (stmt);
2844   rhs2 = gimple_assign_rhs2 (stmt);
2845
2846   if (TREE_CODE (rhs1) == SSA_NAME)
2847     {
2848       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2849       if (is_gimple_assign (rhs1_stmt))
2850         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2851     }
2852
2853   if (TREE_CODE (rhs2) == SSA_NAME)
2854     {
2855       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2856       if (is_gimple_assign (rhs2_stmt))
2857         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2858     }
2859
2860   /* Allow for one conversion statement between the multiply
2861      and addition/subtraction statement.  If there are more than
2862      one conversions then we assume they would invalidate this
2863      transformation.  If that's not the case then they should have
2864      been folded before now.  */
2865   if (CONVERT_EXPR_CODE_P (rhs1_code))
2866     {
2867       conv1_stmt = rhs1_stmt;
2868       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2869       if (TREE_CODE (rhs1) == SSA_NAME)
2870         {
2871           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2872           if (is_gimple_assign (rhs1_stmt))
2873             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2874         }
2875       else
2876         return false;
2877     }
2878   if (CONVERT_EXPR_CODE_P (rhs2_code))
2879     {
2880       conv2_stmt = rhs2_stmt;
2881       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2882       if (TREE_CODE (rhs2) == SSA_NAME)
2883         {
2884           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2885           if (is_gimple_assign (rhs2_stmt))
2886             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2887         }
2888       else
2889         return false;
2890     }
2891
2892   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2893      is_widening_mult_p, but we still need the rhs returns.
2894
2895      It might also appear that it would be sufficient to use the existing
2896      operands of the widening multiply, but that would limit the choice of
2897      multiply-and-accumulate instructions.
2898
2899      If the widened-multiplication result has more than one uses, it is
2900      probably wiser not to do the conversion.  */
2901   if (code == PLUS_EXPR
2902       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2903     {
2904       if (!has_single_use (rhs1)
2905           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2906                                   &type2, &mult_rhs2))
2907         return false;
2908       add_rhs = rhs2;
2909       conv_stmt = conv1_stmt;
2910     }
2911   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2912     {
2913       if (!has_single_use (rhs2)
2914           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2915                                   &type2, &mult_rhs2))
2916         return false;
2917       add_rhs = rhs1;
2918       conv_stmt = conv2_stmt;
2919     }
2920   else
2921     return false;
2922
2923   to_mode = TYPE_MODE (type);
2924   from_mode = TYPE_MODE (type1);
2925   from_unsigned1 = TYPE_UNSIGNED (type1);
2926   from_unsigned2 = TYPE_UNSIGNED (type2);
2927   optype = type1;
2928
2929   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2930   if (from_unsigned1 != from_unsigned2)
2931     {
2932       if (!INTEGRAL_TYPE_P (type))
2933         return false;
2934       /* We can use a signed multiply with unsigned types as long as
2935          there is a wider mode to use, or it is the smaller of the two
2936          types that is unsigned.  Note that type1 >= type2, always.  */
2937       if ((from_unsigned1
2938            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2939           || (from_unsigned2
2940               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2941         {
2942           from_mode = GET_MODE_WIDER_MODE (from_mode);
2943           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2944             return false;
2945         }
2946
2947       from_unsigned1 = from_unsigned2 = false;
2948       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2949                                                false);
2950     }
2951
2952   /* If there was a conversion between the multiply and addition
2953      then we need to make sure it fits a multiply-and-accumulate.
2954      The should be a single mode change which does not change the
2955      value.  */
2956   if (conv_stmt)
2957     {
2958       /* We use the original, unmodified data types for this.  */
2959       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2960       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2961       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2962       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2963
2964       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2965         {
2966           /* Conversion is a truncate.  */
2967           if (TYPE_PRECISION (to_type) < data_size)
2968             return false;
2969         }
2970       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2971         {
2972           /* Conversion is an extend.  Check it's the right sort.  */
2973           if (TYPE_UNSIGNED (from_type) != is_unsigned
2974               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2975             return false;
2976         }
2977       /* else convert is a no-op for our purposes.  */
2978     }
2979
2980   /* Verify that the machine can perform a widening multiply
2981      accumulate in this mode/signedness combination, otherwise
2982      this transformation is likely to pessimize code.  */
2983   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2984   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2985                                                   from_mode, 0, &actual_mode);
2986
2987   if (handler == CODE_FOR_nothing)
2988     return false;
2989
2990   /* Ensure that the inputs to the handler are in the correct precison
2991      for the opcode.  This will be the full mode size.  */
2992   actual_precision = GET_MODE_PRECISION (actual_mode);
2993   if (actual_precision != TYPE_PRECISION (type1)
2994       || from_unsigned1 != TYPE_UNSIGNED (type1))
2995     mult_rhs1 = build_and_insert_cast (gsi, loc,
2996                                        build_nonstandard_integer_type
2997                                          (actual_precision, from_unsigned1),
2998                                        mult_rhs1);
2999   if (actual_precision != TYPE_PRECISION (type2)
3000       || from_unsigned2 != TYPE_UNSIGNED (type2))
3001     mult_rhs2 = build_and_insert_cast (gsi, loc,
3002                                        build_nonstandard_integer_type
3003                                          (actual_precision, from_unsigned2),
3004                                        mult_rhs2);
3005
3006   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3007     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3008
3009   /* Handle constants.  */
3010   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3011     mult_rhs1 = fold_convert (type1, mult_rhs1);
3012   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3013     mult_rhs2 = fold_convert (type2, mult_rhs2);
3014
3015   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3016                                   add_rhs);
3017   update_stmt (gsi_stmt (*gsi));
3018   widen_mul_stats.maccs_inserted++;
3019   return true;
3020 }
3021
3022 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3023    with uses in additions and subtractions to form fused multiply-add
3024    operations.  Returns true if successful and MUL_STMT should be removed.  */
3025
3026 static bool
3027 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3028 {
3029   tree mul_result = gimple_get_lhs (mul_stmt);
3030   tree type = TREE_TYPE (mul_result);
3031   gimple use_stmt, neguse_stmt;
3032   gassign *fma_stmt;
3033   use_operand_p use_p;
3034   imm_use_iterator imm_iter;
3035
3036   if (FLOAT_TYPE_P (type)
3037       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3038     return false;
3039
3040   /* We don't want to do bitfield reduction ops.  */
3041   if (INTEGRAL_TYPE_P (type)
3042       && (TYPE_PRECISION (type)
3043           != GET_MODE_PRECISION (TYPE_MODE (type))))
3044     return false;
3045
3046   /* If the target doesn't support it, don't generate it.  We assume that
3047      if fma isn't available then fms, fnma or fnms are not either.  */
3048   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3049     return false;
3050
3051   /* If the multiplication has zero uses, it is kept around probably because
3052      of -fnon-call-exceptions.  Don't optimize it away in that case,
3053      it is DCE job.  */
3054   if (has_zero_uses (mul_result))
3055     return false;
3056
3057   /* Make sure that the multiplication statement becomes dead after
3058      the transformation, thus that all uses are transformed to FMAs.
3059      This means we assume that an FMA operation has the same cost
3060      as an addition.  */
3061   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3062     {
3063       enum tree_code use_code;
3064       tree result = mul_result;
3065       bool negate_p = false;
3066
3067       use_stmt = USE_STMT (use_p);
3068
3069       if (is_gimple_debug (use_stmt))
3070         continue;
3071
3072       /* For now restrict this operations to single basic blocks.  In theory
3073          we would want to support sinking the multiplication in
3074          m = a*b;
3075          if ()
3076            ma = m + c;
3077          else
3078            d = m;
3079          to form a fma in the then block and sink the multiplication to the
3080          else block.  */
3081       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3082         return false;
3083
3084       if (!is_gimple_assign (use_stmt))
3085         return false;
3086
3087       use_code = gimple_assign_rhs_code (use_stmt);
3088
3089       /* A negate on the multiplication leads to FNMA.  */
3090       if (use_code == NEGATE_EXPR)
3091         {
3092           ssa_op_iter iter;
3093           use_operand_p usep;
3094
3095           result = gimple_assign_lhs (use_stmt);
3096
3097           /* Make sure the negate statement becomes dead with this
3098              single transformation.  */
3099           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3100                                &use_p, &neguse_stmt))
3101             return false;
3102
3103           /* Make sure the multiplication isn't also used on that stmt.  */
3104           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3105             if (USE_FROM_PTR (usep) == mul_result)
3106               return false;
3107
3108           /* Re-validate.  */
3109           use_stmt = neguse_stmt;
3110           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3111             return false;
3112           if (!is_gimple_assign (use_stmt))
3113             return false;
3114
3115           use_code = gimple_assign_rhs_code (use_stmt);
3116           negate_p = true;
3117         }
3118
3119       switch (use_code)
3120         {
3121         case MINUS_EXPR:
3122           if (gimple_assign_rhs2 (use_stmt) == result)
3123             negate_p = !negate_p;
3124           break;
3125         case PLUS_EXPR:
3126           break;
3127         default:
3128           /* FMA can only be formed from PLUS and MINUS.  */
3129           return false;
3130         }
3131
3132       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3133          by a MULT_EXPR that we'll visit later, we might be able to
3134          get a more profitable match with fnma.
3135          OTOH, if we don't, a negate / fma pair has likely lower latency
3136          that a mult / subtract pair.  */
3137       if (use_code == MINUS_EXPR && !negate_p
3138           && gimple_assign_rhs1 (use_stmt) == result
3139           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3140           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3141         {
3142           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3143
3144           if (TREE_CODE (rhs2) == SSA_NAME)
3145             {
3146               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3147               if (has_single_use (rhs2)
3148                   && is_gimple_assign (stmt2)
3149                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3150               return false;
3151             }
3152         }
3153
3154       /* We can't handle a * b + a * b.  */
3155       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3156         return false;
3157
3158       /* While it is possible to validate whether or not the exact form
3159          that we've recognized is available in the backend, the assumption
3160          is that the transformation is never a loss.  For instance, suppose
3161          the target only has the plain FMA pattern available.  Consider
3162          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3163          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3164          still have 3 operations, but in the FMA form the two NEGs are
3165          independent and could be run in parallel.  */
3166     }
3167
3168   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3169     {
3170       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3171       enum tree_code use_code;
3172       tree addop, mulop1 = op1, result = mul_result;
3173       bool negate_p = false;
3174
3175       if (is_gimple_debug (use_stmt))
3176         continue;
3177
3178       use_code = gimple_assign_rhs_code (use_stmt);
3179       if (use_code == NEGATE_EXPR)
3180         {
3181           result = gimple_assign_lhs (use_stmt);
3182           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3183           gsi_remove (&gsi, true);
3184           release_defs (use_stmt);
3185
3186           use_stmt = neguse_stmt;
3187           gsi = gsi_for_stmt (use_stmt);
3188           use_code = gimple_assign_rhs_code (use_stmt);
3189           negate_p = true;
3190         }
3191
3192       if (gimple_assign_rhs1 (use_stmt) == result)
3193         {
3194           addop = gimple_assign_rhs2 (use_stmt);
3195           /* a * b - c -> a * b + (-c)  */
3196           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3197             addop = force_gimple_operand_gsi (&gsi,
3198                                               build1 (NEGATE_EXPR,
3199                                                       type, addop),
3200                                               true, NULL_TREE, true,
3201                                               GSI_SAME_STMT);
3202         }
3203       else
3204         {
3205           addop = gimple_assign_rhs1 (use_stmt);
3206           /* a - b * c -> (-b) * c + a */
3207           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3208             negate_p = !negate_p;
3209         }
3210
3211       if (negate_p)
3212         mulop1 = force_gimple_operand_gsi (&gsi,
3213                                            build1 (NEGATE_EXPR,
3214                                                    type, mulop1),
3215                                            true, NULL_TREE, true,
3216                                            GSI_SAME_STMT);
3217
3218       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3219                                       FMA_EXPR, mulop1, op2, addop);
3220       gsi_replace (&gsi, fma_stmt, true);
3221       widen_mul_stats.fmas_inserted++;
3222     }
3223
3224   return true;
3225 }
3226
3227 /* Find integer multiplications where the operands are extended from
3228    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3229    where appropriate.  */
3230
3231 namespace {
3232
3233 const pass_data pass_data_optimize_widening_mul =
3234 {
3235   GIMPLE_PASS, /* type */
3236   "widening_mul", /* name */
3237   OPTGROUP_NONE, /* optinfo_flags */
3238   TV_NONE, /* tv_id */
3239   PROP_ssa, /* properties_required */
3240   0, /* properties_provided */
3241   0, /* properties_destroyed */
3242   0, /* todo_flags_start */
3243   TODO_update_ssa, /* todo_flags_finish */
3244 };
3245
3246 class pass_optimize_widening_mul : public gimple_opt_pass
3247 {
3248 public:
3249   pass_optimize_widening_mul (gcc::context *ctxt)
3250     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3251   {}
3252
3253   /* opt_pass methods: */
3254   virtual bool gate (function *)
3255     {
3256       return flag_expensive_optimizations && optimize;
3257     }
3258
3259   virtual unsigned int execute (function *);
3260
3261 }; // class pass_optimize_widening_mul
3262
3263 unsigned int
3264 pass_optimize_widening_mul::execute (function *fun)
3265 {
3266   basic_block bb;
3267   bool cfg_changed = false;
3268
3269   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3270
3271   FOR_EACH_BB_FN (bb, fun)
3272     {
3273       gimple_stmt_iterator gsi;
3274
3275       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3276         {
3277           gimple stmt = gsi_stmt (gsi);
3278           enum tree_code code;
3279
3280           if (is_gimple_assign (stmt))
3281             {
3282               code = gimple_assign_rhs_code (stmt);
3283               switch (code)
3284                 {
3285                 case MULT_EXPR:
3286                   if (!convert_mult_to_widen (stmt, &gsi)
3287                       && convert_mult_to_fma (stmt,
3288                                               gimple_assign_rhs1 (stmt),
3289                                               gimple_assign_rhs2 (stmt)))
3290                     {
3291                       gsi_remove (&gsi, true);
3292                       release_defs (stmt);
3293                       continue;
3294                     }
3295                   break;
3296
3297                 case PLUS_EXPR:
3298                 case MINUS_EXPR:
3299                   convert_plusminus_to_widen (&gsi, stmt, code);
3300                   break;
3301
3302                 default:;
3303                 }
3304             }
3305           else if (is_gimple_call (stmt)
3306                    && gimple_call_lhs (stmt))
3307             {
3308               tree fndecl = gimple_call_fndecl (stmt);
3309               if (fndecl
3310                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3311                 {
3312                   switch (DECL_FUNCTION_CODE (fndecl))
3313                     {
3314                       case BUILT_IN_POWF:
3315                       case BUILT_IN_POW:
3316                       case BUILT_IN_POWL:
3317                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3318                             && REAL_VALUES_EQUAL
3319                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3320                                   dconst2)
3321                             && convert_mult_to_fma (stmt,
3322                                                     gimple_call_arg (stmt, 0),
3323                                                     gimple_call_arg (stmt, 0)))
3324                           {
3325                             unlink_stmt_vdef (stmt);
3326                             if (gsi_remove (&gsi, true)
3327                                 && gimple_purge_dead_eh_edges (bb))
3328                               cfg_changed = true;
3329                             release_defs (stmt);
3330                             continue;
3331                           }
3332                           break;
3333
3334                       default:;
3335                     }
3336                 }
3337             }
3338           gsi_next (&gsi);
3339         }
3340     }
3341
3342   statistics_counter_event (fun, "widening multiplications inserted",
3343                             widen_mul_stats.widen_mults_inserted);
3344   statistics_counter_event (fun, "widening maccs inserted",
3345                             widen_mul_stats.maccs_inserted);
3346   statistics_counter_event (fun, "fused multiply-adds inserted",
3347                             widen_mul_stats.fmas_inserted);
3348
3349   return cfg_changed ? TODO_cleanup_cfg : 0;
3350 }
3351
3352 } // anon namespace
3353
3354 gimple_opt_pass *
3355 make_pass_optimize_widening_mul (gcc::context *ctxt)
3356 {
3357   return new pass_optimize_widening_mul (ctxt);
3358 }