contrib/gcc-5.0/gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
  45    hy the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
  63    and walk it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "hash-set.h"
  93 #include "machmode.h"
  94 #include "vec.h"
  95 #include "double-int.h"
  96 #include "input.h"
  97 #include "alias.h"
  98 #include "symtab.h"
  99 #include "wide-int.h"
 100 #include "inchash.h"
 101 #include "tree.h"
 102 #include "fold-const.h"
 103 #include "predict.h"
 104 #include "hard-reg-set.h"
 105 #include "function.h"
 106 #include "dominance.h"
 107 #include "cfg.h"
 108 #include "basic-block.h"
 109 #include "tree-ssa-alias.h"
 110 #include "internal-fn.h"
 111 #include "gimple-fold.h"
 112 #include "gimple-expr.h"
 113 #include "is-a.h"
 114 #include "gimple.h"
 115 #include "gimple-iterator.h"
 116 #include "gimplify.h"
 117 #include "gimplify-me.h"
 118 #include "stor-layout.h"
 119 #include "gimple-ssa.h"
 120 #include "tree-cfg.h"
 121 #include "tree-phinodes.h"
 122 #include "ssa-iterators.h"
 123 #include "stringpool.h"
 124 #include "tree-ssanames.h"
 125 #include "hashtab.h"
 126 #include "rtl.h"
 127 #include "statistics.h"
 128 #include "real.h"
 129 #include "fixed-value.h"
 130 #include "insn-config.h"
 131 #include "expmed.h"
 132 #include "dojump.h"
 133 #include "explow.h"
 134 #include "calls.h"
 135 #include "emit-rtl.h"
 136 #include "varasm.h"
 137 #include "stmt.h"
 138 #include "expr.h"
 139 #include "tree-dfa.h"
 140 #include "tree-ssa.h"
 141 #include "tree-pass.h"
 142 #include "alloc-pool.h"
 143 #include "target.h"
 144 #include "gimple-pretty-print.h"
 145 #include "builtins.h"
 146
 147 /* FIXME: RTL headers have to be included here for optabs.  */
 148 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 149 #include "expr.h"               /* Because optabs.h wants sepops.  */
 150 #include "insn-codes.h"
 151 #include "optabs.h"
 152
 153 /* This structure represents one basic block that either computes a
 154    division, or is a common dominator for basic block that compute a
 155    division.  */
 156 struct occurrence {
 157   /* The basic block represented by this structure.  */
 158   basic_block bb;
 159
 160   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 161      inserted in BB.  */
 162   tree recip_def;
 163
 164   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 165      was inserted in BB.  */
 166   gimple recip_def_stmt;
 167
 168   /* Pointer to a list of "struct occurrence"s for blocks dominated
 169      by BB.  */
 170   struct occurrence *children;
 171
 172   /* Pointer to the next "struct occurrence"s in the list of blocks
 173      sharing a common dominator.  */
 174   struct occurrence *next;
 175
 176   /* The number of divisions that are in BB before compute_merit.  The
 177      number of divisions that are in BB or post-dominate it after
 178      compute_merit.  */
 179   int num_divisions;
 180
 181   /* True if the basic block has a division, false if it is a common
 182      dominator for basic blocks that do.  If it is false and trapping
 183      math is active, BB is not a candidate for inserting a reciprocal.  */
 184   bool bb_has_division;
 185 };
 186
 187 static struct
 188 {
 189   /* Number of 1.0/X ops inserted.  */
 190   int rdivs_inserted;
 191
 192   /* Number of 1.0/FUNC ops inserted.  */
 193   int rfuncs_inserted;
 194 } reciprocal_stats;
 195
 196 static struct
 197 {
 198   /* Number of cexpi calls inserted.  */
 199   int inserted;
 200 } sincos_stats;
 201
 202 static struct
 203 {
 204   /* Number of hand-written 16-bit nop / bswaps found.  */
 205   int found_16bit;
 206
 207   /* Number of hand-written 32-bit nop / bswaps found.  */
 208   int found_32bit;
 209
 210   /* Number of hand-written 64-bit nop / bswaps found.  */
 211   int found_64bit;
 212 } nop_stats, bswap_stats;
 213
 214 static struct
 215 {
 216   /* Number of widening multiplication ops inserted.  */
 217   int widen_mults_inserted;
 218
 219   /* Number of integer multiply-and-accumulate ops inserted.  */
 220   int maccs_inserted;
 221
 222   /* Number of fp fused multiply-add ops inserted.  */
 223   int fmas_inserted;
 224 } widen_mul_stats;
 225
 226 /* The instance of "struct occurrence" representing the highest
 227    interesting block in the dominator tree.  */
 228 static struct occurrence *occ_head;
 229
 230 /* Allocation pool for getting instances of "struct occurrence".  */
 231 static alloc_pool occ_pool;
 232
 233
 234
 235 /* Allocate and return a new struct occurrence for basic block BB, and
 236    whose children list is headed by CHILDREN.  */
 237 static struct occurrence *
 238 occ_new (basic_block bb, struct occurrence *children)
 239 {
 240   struct occurrence *occ;
 241
 242   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 243   memset (occ, 0, sizeof (struct occurrence));
 244
 245   occ->bb = bb;
 246   occ->children = children;
 247   return occ;
 248 }
 249
 250
 251 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 252    list of "struct occurrence"s, one per basic block, having IDOM as
 253    their common dominator.
 254
 255    We try to insert NEW_OCC as deep as possible in the tree, and we also
 256    insert any other block that is a common dominator for BB and one
 257    block already in the tree.  */
 258
 259 static void
 260 insert_bb (struct occurrence *new_occ, basic_block idom,
 261            struct occurrence **p_head)
 262 {
 263   struct occurrence *occ, **p_occ;
 264
 265   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 266     {
 267       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 268       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 269       if (dom == bb)
 270         {
 271           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 272              from its list.  */
 273           *p_occ = occ->next;
 274           occ->next = new_occ->children;
 275           new_occ->children = occ;
 276
 277           /* Try the next block (it may as well be dominated by BB).  */
 278         }
 279
 280       else if (dom == occ_bb)
 281         {
 282           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 283           insert_bb (new_occ, dom, &occ->children);
 284           return;
 285         }
 286
 287       else if (dom != idom)
 288         {
 289           gcc_assert (!dom->aux);
 290
 291           /* There is a dominator between IDOM and BB, add it and make
 292              two children out of NEW_OCC and OCC.  First, remove OCC from
 293              its list.  */
 294           *p_occ = occ->next;
 295           new_occ->next = occ;
 296           occ->next = NULL;
 297
 298           /* None of the previous blocks has DOM as a dominator: if we tail
 299              recursed, we would reexamine them uselessly. Just switch BB with
 300              DOM, and go on looking for blocks dominated by DOM.  */
 301           new_occ = occ_new (dom, new_occ);
 302         }
 303
 304       else
 305         {
 306           /* Nothing special, go on with the next element.  */
 307           p_occ = &occ->next;
 308         }
 309     }
 310
 311   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 312   new_occ->next = *p_head;
 313   *p_head = new_occ;
 314 }
 315
 316 /* Register that we found a division in BB.  */
 317
 318 static inline void
 319 register_division_in (basic_block bb)
 320 {
 321   struct occurrence *occ;
 322
 323   occ = (struct occurrence *) bb->aux;
 324   if (!occ)
 325     {
 326       occ = occ_new (bb, NULL);
 327       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 328     }
 329
 330   occ->bb_has_division = true;
 331   occ->num_divisions++;
 332 }
 333
 334
 335 /* Compute the number of divisions that postdominate each block in OCC and
 336    its children.  */
 337
 338 static void
 339 compute_merit (struct occurrence *occ)
 340 {
 341   struct occurrence *occ_child;
 342   basic_block dom = occ->bb;
 343
 344   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 345     {
 346       basic_block bb;
 347       if (occ_child->children)
 348         compute_merit (occ_child);
 349
 350       if (flag_exceptions)
 351         bb = single_noncomplex_succ (dom);
 352       else
 353         bb = dom;
 354
 355       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 356         occ->num_divisions += occ_child->num_divisions;
 357     }
 358 }
 359
 360
 361 /* Return whether USE_STMT is a floating-point division by DEF.  */
 362 static inline bool
 363 is_division_by (gimple use_stmt, tree def)
 364 {
 365   return is_gimple_assign (use_stmt)
 366          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 367          && gimple_assign_rhs2 (use_stmt) == def
 368          /* Do not recognize x / x as valid division, as we are getting
 369             confused later by replacing all immediate uses x in such
 370             a stmt.  */
 371          && gimple_assign_rhs1 (use_stmt) != def;
 372 }
 373
 374 /* Walk the subset of the dominator tree rooted at OCC, setting the
 375    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 376    the given basic block.  The field may be left NULL, of course,
 377    if it is not possible or profitable to do the optimization.
 378
 379    DEF_BSI is an iterator pointing at the statement defining DEF.
 380    If RECIP_DEF is set, a dominator already has a computation that can
 381    be used.  */
 382
 383 static void
 384 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 385                     tree def, tree recip_def, int threshold)
 386 {
 387   tree type;
 388   gassign *new_stmt;
 389   gimple_stmt_iterator gsi;
 390   struct occurrence *occ_child;
 391
 392   if (!recip_def
 393       && (occ->bb_has_division || !flag_trapping_math)
 394       && occ->num_divisions >= threshold)
 395     {
 396       /* Make a variable with the replacement and substitute it.  */
 397       type = TREE_TYPE (def);
 398       recip_def = create_tmp_reg (type, "reciptmp");
 399       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
 400                                       build_one_cst (type), def);
 401
 402       if (occ->bb_has_division)
 403         {
 404           /* Case 1: insert before an existing division.  */
 405           gsi = gsi_after_labels (occ->bb);
 406           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 407             gsi_next (&gsi);
 408
 409           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 410         }
 411       else if (def_gsi && occ->bb == def_gsi->bb)
 412         {
 413           /* Case 2: insert right after the definition.  Note that this will
 414              never happen if the definition statement can throw, because in
 415              that case the sole successor of the statement's basic block will
 416              dominate all the uses as well.  */
 417           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 418         }
 419       else
 420         {
 421           /* Case 3: insert in a basic block not containing defs/uses.  */
 422           gsi = gsi_after_labels (occ->bb);
 423           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 424         }
 425
 426       reciprocal_stats.rdivs_inserted++;
 427
 428       occ->recip_def_stmt = new_stmt;
 429     }
 430
 431   occ->recip_def = recip_def;
 432   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 433     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 434 }
 435
 436
 437 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 438    possible.  */
 439
 440 static inline void
 441 replace_reciprocal (use_operand_p use_p)
 442 {
 443   gimple use_stmt = USE_STMT (use_p);
 444   basic_block bb = gimple_bb (use_stmt);
 445   struct occurrence *occ = (struct occurrence *) bb->aux;
 446
 447   if (optimize_bb_for_speed_p (bb)
 448       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 449     {
 450       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 451       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 452       SET_USE (use_p, occ->recip_def);
 453       fold_stmt_inplace (&gsi);
 454       update_stmt (use_stmt);
 455     }
 456 }
 457
 458
 459 /* Free OCC and return one more "struct occurrence" to be freed.  */
 460
 461 static struct occurrence *
 462 free_bb (struct occurrence *occ)
 463 {
 464   struct occurrence *child, *next;
 465
 466   /* First get the two pointers hanging off OCC.  */
 467   next = occ->next;
 468   child = occ->children;
 469   occ->bb->aux = NULL;
 470   pool_free (occ_pool, occ);
 471
 472   /* Now ensure that we don't recurse unless it is necessary.  */
 473   if (!child)
 474     return next;
 475   else
 476     {
 477       while (next)
 478         next = free_bb (next);
 479
 480       return child;
 481     }
 482 }
 483
 484
 485 /* Look for floating-point divisions among DEF's uses, and try to
 486    replace them by multiplications with the reciprocal.  Add
 487    as many statements computing the reciprocal as needed.
 488
 489    DEF must be a GIMPLE register of a floating-point type.  */
 490
 491 static void
 492 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 493 {
 494   use_operand_p use_p;
 495   imm_use_iterator use_iter;
 496   struct occurrence *occ;
 497   int count = 0, threshold;
 498
 499   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 500
 501   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 502     {
 503       gimple use_stmt = USE_STMT (use_p);
 504       if (is_division_by (use_stmt, def))
 505         {
 506           register_division_in (gimple_bb (use_stmt));
 507           count++;
 508         }
 509     }
 510
 511   /* Do the expensive part only if we can hope to optimize something.  */
 512   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 513   if (count >= threshold)
 514     {
 515       gimple use_stmt;
 516       for (occ = occ_head; occ; occ = occ->next)
 517         {
 518           compute_merit (occ);
 519           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 520         }
 521
 522       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 523         {
 524           if (is_division_by (use_stmt, def))
 525             {
 526               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 527                 replace_reciprocal (use_p);
 528             }
 529         }
 530     }
 531
 532   for (occ = occ_head; occ; )
 533     occ = free_bb (occ);
 534
 535   occ_head = NULL;
 536 }
 537
 538 /* Go through all the floating-point SSA_NAMEs, and call
 539    execute_cse_reciprocals_1 on each of them.  */
 540 namespace {
 541
 542 const pass_data pass_data_cse_reciprocals =
 543 {
 544   GIMPLE_PASS, /* type */
 545   "recip", /* name */
 546   OPTGROUP_NONE, /* optinfo_flags */
 547   TV_NONE, /* tv_id */
 548   PROP_ssa, /* properties_required */
 549   0, /* properties_provided */
 550   0, /* properties_destroyed */
 551   0, /* todo_flags_start */
 552   TODO_update_ssa, /* todo_flags_finish */
 553 };
 554
 555 class pass_cse_reciprocals : public gimple_opt_pass
 556 {
 557 public:
 558   pass_cse_reciprocals (gcc::context *ctxt)
 559     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 560   {}
 561
 562   /* opt_pass methods: */
 563   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 564   virtual unsigned int execute (function *);
 565
 566 }; // class pass_cse_reciprocals
 567
 568 unsigned int
 569 pass_cse_reciprocals::execute (function *fun)
 570 {
 571   basic_block bb;
 572   tree arg;
 573
 574   occ_pool = create_alloc_pool ("dominators for recip",
 575                                 sizeof (struct occurrence),
 576                                 n_basic_blocks_for_fn (fun) / 3 + 1);
 577
 578   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 579   calculate_dominance_info (CDI_DOMINATORS);
 580   calculate_dominance_info (CDI_POST_DOMINATORS);
 581
 582 #ifdef ENABLE_CHECKING
 583   FOR_EACH_BB_FN (bb, fun)
 584     gcc_assert (!bb->aux);
 585 #endif
 586
 587   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 588     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 589         && is_gimple_reg (arg))
 590       {
 591         tree name = ssa_default_def (fun, arg);
 592         if (name)
 593           execute_cse_reciprocals_1 (NULL, name);
 594       }
 595
 596   FOR_EACH_BB_FN (bb, fun)
 597     {
 598       tree def;
 599
 600       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 601            gsi_next (&gsi))
 602         {
 603           gphi *phi = gsi.phi ();
 604           def = PHI_RESULT (phi);
 605           if (! virtual_operand_p (def)
 606               && FLOAT_TYPE_P (TREE_TYPE (def)))
 607             execute_cse_reciprocals_1 (NULL, def);
 608         }
 609
 610       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 611            gsi_next (&gsi))
 612         {
 613           gimple stmt = gsi_stmt (gsi);
 614
 615           if (gimple_has_lhs (stmt)
 616               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 617               && FLOAT_TYPE_P (TREE_TYPE (def))
 618               && TREE_CODE (def) == SSA_NAME)
 619             execute_cse_reciprocals_1 (&gsi, def);
 620         }
 621
 622       if (optimize_bb_for_size_p (bb))
 623         continue;
 624
 625       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 626       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 627            gsi_next (&gsi))
 628         {
 629           gimple stmt = gsi_stmt (gsi);
 630           tree fndecl;
 631
 632           if (is_gimple_assign (stmt)
 633               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 634             {
 635               tree arg1 = gimple_assign_rhs2 (stmt);
 636               gimple stmt1;
 637
 638               if (TREE_CODE (arg1) != SSA_NAME)
 639                 continue;
 640
 641               stmt1 = SSA_NAME_DEF_STMT (arg1);
 642
 643               if (is_gimple_call (stmt1)
 644                   && gimple_call_lhs (stmt1)
 645                   && (fndecl = gimple_call_fndecl (stmt1))
 646                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 647                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 648                 {
 649                   enum built_in_function code;
 650                   bool md_code, fail;
 651                   imm_use_iterator ui;
 652                   use_operand_p use_p;
 653
 654                   code = DECL_FUNCTION_CODE (fndecl);
 655                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 656
 657                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 658                   if (!fndecl)
 659                     continue;
 660
 661                   /* Check that all uses of the SSA name are divisions,
 662                      otherwise replacing the defining statement will do
 663                      the wrong thing.  */
 664                   fail = false;
 665                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 666                     {
 667                       gimple stmt2 = USE_STMT (use_p);
 668                       if (is_gimple_debug (stmt2))
 669                         continue;
 670                       if (!is_gimple_assign (stmt2)
 671                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 672                           || gimple_assign_rhs1 (stmt2) == arg1
 673                           || gimple_assign_rhs2 (stmt2) != arg1)
 674                         {
 675                           fail = true;
 676                           break;
 677                         }
 678                     }
 679                   if (fail)
 680                     continue;
 681
 682                   gimple_replace_ssa_lhs (stmt1, arg1);
 683                   gimple_call_set_fndecl (stmt1, fndecl);
 684                   update_stmt (stmt1);
 685                   reciprocal_stats.rfuncs_inserted++;
 686
 687                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 688                     {
 689                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 690                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 691                       fold_stmt_inplace (&gsi);
 692                       update_stmt (stmt);
 693                     }
 694                 }
 695             }
 696         }
 697     }
 698
 699   statistics_counter_event (fun, "reciprocal divs inserted",
 700                             reciprocal_stats.rdivs_inserted);
 701   statistics_counter_event (fun, "reciprocal functions inserted",
 702                             reciprocal_stats.rfuncs_inserted);
 703
 704   free_dominance_info (CDI_DOMINATORS);
 705   free_dominance_info (CDI_POST_DOMINATORS);
 706   free_alloc_pool (occ_pool);
 707   return 0;
 708 }
 709
 710 } // anon namespace
 711
 712 gimple_opt_pass *
 713 make_pass_cse_reciprocals (gcc::context *ctxt)
 714 {
 715   return new pass_cse_reciprocals (ctxt);
 716 }
 717
 718 /* Records an occurrence at statement USE_STMT in the vector of trees
 719    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 720    is not yet initialized.  Returns true if the occurrence was pushed on
 721    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 722    statements in the vector.  */
 723
 724 static bool
 725 maybe_record_sincos (vec<gimple> *stmts,
 726                      basic_block *top_bb, gimple use_stmt)
 727 {
 728   basic_block use_bb = gimple_bb (use_stmt);
 729   if (*top_bb
 730       && (*top_bb == use_bb
 731           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 732     stmts->safe_push (use_stmt);
 733   else if (!*top_bb
 734            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 735     {
 736       stmts->safe_push (use_stmt);
 737       *top_bb = use_bb;
 738     }
 739   else
 740     return false;
 741
 742   return true;
 743 }
 744
 745 /* Look for sin, cos and cexpi calls with the same argument NAME and
 746    create a single call to cexpi CSEing the result in this case.
 747    We first walk over all immediate uses of the argument collecting
 748    statements that we can CSE in a vector and in a second pass replace
 749    the statement rhs with a REALPART or IMAGPART expression on the
 750    result of the cexpi call we insert before the use statement that
 751    dominates all other candidates.  */
 752
 753 static bool
 754 execute_cse_sincos_1 (tree name)
 755 {
 756   gimple_stmt_iterator gsi;
 757   imm_use_iterator use_iter;
 758   tree fndecl, res, type;
 759   gimple def_stmt, use_stmt, stmt;
 760   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 761   auto_vec<gimple> stmts;
 762   basic_block top_bb = NULL;
 763   int i;
 764   bool cfg_changed = false;
 765
 766   type = TREE_TYPE (name);
 767   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 768     {
 769       if (gimple_code (use_stmt) != GIMPLE_CALL
 770           || !gimple_call_lhs (use_stmt)
 771           || !(fndecl = gimple_call_fndecl (use_stmt))
 772           || !gimple_call_builtin_p (use_stmt, BUILT_IN_NORMAL))
 773         continue;
 774
 775       switch (DECL_FUNCTION_CODE (fndecl))
 776         {
 777         CASE_FLT_FN (BUILT_IN_COS):
 778           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 779           break;
 780
 781         CASE_FLT_FN (BUILT_IN_SIN):
 782           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 783           break;
 784
 785         CASE_FLT_FN (BUILT_IN_CEXPI):
 786           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 787           break;
 788
 789         default:;
 790         }
 791     }
 792
 793   if (seen_cos + seen_sin + seen_cexpi <= 1)
 794     return false;
 795
 796   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 797      the name def statement.  */
 798   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 799   if (!fndecl)
 800     return false;
 801   stmt = gimple_build_call (fndecl, 1, name);
 802   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 803   gimple_call_set_lhs (stmt, res);
 804
 805   def_stmt = SSA_NAME_DEF_STMT (name);
 806   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 807       && gimple_code (def_stmt) != GIMPLE_PHI
 808       && gimple_bb (def_stmt) == top_bb)
 809     {
 810       gsi = gsi_for_stmt (def_stmt);
 811       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 812     }
 813   else
 814     {
 815       gsi = gsi_after_labels (top_bb);
 816       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 817     }
 818   sincos_stats.inserted++;
 819
 820   /* And adjust the recorded old call sites.  */
 821   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 822     {
 823       tree rhs = NULL;
 824       fndecl = gimple_call_fndecl (use_stmt);
 825
 826       switch (DECL_FUNCTION_CODE (fndecl))
 827         {
 828         CASE_FLT_FN (BUILT_IN_COS):
 829           rhs = fold_build1 (REALPART_EXPR, type, res);
 830           break;
 831
 832         CASE_FLT_FN (BUILT_IN_SIN):
 833           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 834           break;
 835
 836         CASE_FLT_FN (BUILT_IN_CEXPI):
 837           rhs = res;
 838           break;
 839
 840         default:;
 841           gcc_unreachable ();
 842         }
 843
 844         /* Replace call with a copy.  */
 845         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 846
 847         gsi = gsi_for_stmt (use_stmt);
 848         gsi_replace (&gsi, stmt, true);
 849         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 850           cfg_changed = true;
 851     }
 852
 853   return cfg_changed;
 854 }
 855
 856 /* To evaluate powi(x,n), the floating point value x raised to the
 857    constant integer exponent n, we use a hybrid algorithm that
 858    combines the "window method" with look-up tables.  For an
 859    introduction to exponentiation algorithms and "addition chains",
 860    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 861    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 862    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 863    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 864
 865 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 866    multiplications to inline before calling the system library's pow
 867    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 868    so this default never requires calling pow, powf or powl.  */
 869
 870 #ifndef POWI_MAX_MULTS
 871 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 872 #endif
 873
 874 /* The size of the "optimal power tree" lookup table.  All
 875    exponents less than this value are simply looked up in the
 876    powi_table below.  This threshold is also used to size the
 877    cache of pseudo registers that hold intermediate results.  */
 878 #define POWI_TABLE_SIZE 256
 879
 880 /* The size, in bits of the window, used in the "window method"
 881    exponentiation algorithm.  This is equivalent to a radix of
 882    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 883 #define POWI_WINDOW_SIZE 3
 884
 885 /* The following table is an efficient representation of an
 886    "optimal power tree".  For each value, i, the corresponding
 887    value, j, in the table states than an optimal evaluation
 888    sequence for calculating pow(x,i) can be found by evaluating
 889    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 890    100 integers is given in Knuth's "Seminumerical algorithms".  */
 891
 892 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 893   {
 894       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 895       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 896       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 897      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 898      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 899      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 900      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 901      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 902      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 903      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 904      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 905      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 906      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 907      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 908      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 909      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 910      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 911      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 912      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 913      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 914      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 915      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 916      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 917      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 918      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 919     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 920     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 921     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 922     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 923     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 924     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 925     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 926   };
 927
 928
 929 /* Return the number of multiplications required to calculate
 930    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 931    subroutine of powi_cost.  CACHE is an array indicating
 932    which exponents have already been calculated.  */
 933
 934 static int
 935 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 936 {
 937   /* If we've already calculated this exponent, then this evaluation
 938      doesn't require any additional multiplications.  */
 939   if (cache[n])
 940     return 0;
 941
 942   cache[n] = true;
 943   return powi_lookup_cost (n - powi_table[n], cache)
 944          + powi_lookup_cost (powi_table[n], cache) + 1;
 945 }
 946
 947 /* Return the number of multiplications required to calculate
 948    powi(x,n) for an arbitrary x, given the exponent N.  This
 949    function needs to be kept in sync with powi_as_mults below.  */
 950
 951 static int
 952 powi_cost (HOST_WIDE_INT n)
 953 {
 954   bool cache[POWI_TABLE_SIZE];
 955   unsigned HOST_WIDE_INT digit;
 956   unsigned HOST_WIDE_INT val;
 957   int result;
 958
 959   if (n == 0)
 960     return 0;
 961
 962   /* Ignore the reciprocal when calculating the cost.  */
 963   val = (n < 0) ? -n : n;
 964
 965   /* Initialize the exponent cache.  */
 966   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 967   cache[1] = true;
 968
 969   result = 0;
 970
 971   while (val >= POWI_TABLE_SIZE)
 972     {
 973       if (val & 1)
 974         {
 975           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 976           result += powi_lookup_cost (digit, cache)
 977                     + POWI_WINDOW_SIZE + 1;
 978           val >>= POWI_WINDOW_SIZE;
 979         }
 980       else
 981         {
 982           val >>= 1;
 983           result++;
 984         }
 985     }
 986
 987   return result + powi_lookup_cost (val, cache);
 988 }
 989
 990 /* Recursive subroutine of powi_as_mults.  This function takes the
 991    array, CACHE, of already calculated exponents and an exponent N and
 992    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 993
 994 static tree
 995 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 996                  HOST_WIDE_INT n, tree *cache)
 997 {
 998   tree op0, op1, ssa_target;
 999   unsigned HOST_WIDE_INT digit;
1000   gassign *mult_stmt;
1001
1002   if (n < POWI_TABLE_SIZE && cache[n])
1003     return cache[n];
1004
1005   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1006
1007   if (n < POWI_TABLE_SIZE)
1008     {
1009       cache[n] = ssa_target;
1010       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1011       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1012     }
1013   else if (n & 1)
1014     {
1015       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1016       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1017       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1018     }
1019   else
1020     {
1021       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1022       op1 = op0;
1023     }
1024
1025   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1026   gimple_set_location (mult_stmt, loc);
1027   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1028
1029   return ssa_target;
1030 }
1031
1032 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1033    This function needs to be kept in sync with powi_cost above.  */
1034
1035 static tree
1036 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1037                tree arg0, HOST_WIDE_INT n)
1038 {
1039   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1040   gassign *div_stmt;
1041   tree target;
1042
1043   if (n == 0)
1044     return build_real (type, dconst1);
1045
1046   memset (cache, 0,  sizeof (cache));
1047   cache[1] = arg0;
1048
1049   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1050   if (n >= 0)
1051     return result;
1052
1053   /* If the original exponent was negative, reciprocate the result.  */
1054   target = make_temp_ssa_name (type, NULL, "powmult");
1055   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1056                                   build_real (type, dconst1), result);
1057   gimple_set_location (div_stmt, loc);
1058   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1059
1060   return target;
1061 }
1062
1063 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1064    location info LOC.  If the arguments are appropriate, create an
1065    equivalent sequence of statements prior to GSI using an optimal
1066    number of multiplications, and return an expession holding the
1067    result.  */
1068
1069 static tree
1070 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1071                             tree arg0, HOST_WIDE_INT n)
1072 {
1073   /* Avoid largest negative number.  */
1074   if (n != -n
1075       && ((n >= -1 && n <= 2)
1076           || (optimize_function_for_speed_p (cfun)
1077               && powi_cost (n) <= POWI_MAX_MULTS)))
1078     return powi_as_mults (gsi, loc, arg0, n);
1079
1080   return NULL_TREE;
1081 }
1082
1083 /* Build a gimple call statement that calls FN with argument ARG.
1084    Set the lhs of the call statement to a fresh SSA name.  Insert the
1085    statement prior to GSI's current position, and return the fresh
1086    SSA name.  */
1087
1088 static tree
1089 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1090                        tree fn, tree arg)
1091 {
1092   gcall *call_stmt;
1093   tree ssa_target;
1094
1095   call_stmt = gimple_build_call (fn, 1, arg);
1096   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1097   gimple_set_lhs (call_stmt, ssa_target);
1098   gimple_set_location (call_stmt, loc);
1099   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1100
1101   return ssa_target;
1102 }
1103
1104 /* Build a gimple binary operation with the given CODE and arguments
1105    ARG0, ARG1, assigning the result to a new SSA name for variable
1106    TARGET.  Insert the statement prior to GSI's current position, and
1107    return the fresh SSA name.*/
1108
1109 static tree
1110 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1111                         const char *name, enum tree_code code,
1112                         tree arg0, tree arg1)
1113 {
1114   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1115   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1116   gimple_set_location (stmt, loc);
1117   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1118   return result;
1119 }
1120
1121 /* Build a gimple reference operation with the given CODE and argument
1122    ARG, assigning the result to a new SSA name of TYPE with NAME.
1123    Insert the statement prior to GSI's current position, and return
1124    the fresh SSA name.  */
1125
1126 static inline tree
1127 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1128                       const char *name, enum tree_code code, tree arg0)
1129 {
1130   tree result = make_temp_ssa_name (type, NULL, name);
1131   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1132   gimple_set_location (stmt, loc);
1133   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1134   return result;
1135 }
1136
1137 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1138    prior to GSI's current position, and return the fresh SSA name.  */
1139
1140 static tree
1141 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1142                        tree type, tree val)
1143 {
1144   tree result = make_ssa_name (type);
1145   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1146   gimple_set_location (stmt, loc);
1147   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1148   return result;
1149 }
1150
1151 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1152    with location info LOC.  If possible, create an equivalent and
1153    less expensive sequence of statements prior to GSI, and return an
1154    expession holding the result.  */
1155
1156 static tree
1157 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1158                            tree arg0, tree arg1)
1159 {
1160   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1161   REAL_VALUE_TYPE c2, dconst3;
1162   HOST_WIDE_INT n;
1163   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1164   machine_mode mode;
1165   bool hw_sqrt_exists, c_is_int, c2_is_int;
1166
1167   /* If the exponent isn't a constant, there's nothing of interest
1168      to be done.  */
1169   if (TREE_CODE (arg1) != REAL_CST)
1170     return NULL_TREE;
1171
1172   /* If the exponent is equivalent to an integer, expand to an optimal
1173      multiplication sequence when profitable.  */
1174   c = TREE_REAL_CST (arg1);
1175   n = real_to_integer (&c);
1176   real_from_integer (&cint, VOIDmode, n, SIGNED);
1177   c_is_int = real_identical (&c, &cint);
1178
1179   if (c_is_int
1180       && ((n >= -1 && n <= 2)
1181           || (flag_unsafe_math_optimizations
1182               && optimize_bb_for_speed_p (gsi_bb (*gsi))
1183               && powi_cost (n) <= POWI_MAX_MULTS)))
1184     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1185
1186   /* Attempt various optimizations using sqrt and cbrt.  */
1187   type = TREE_TYPE (arg0);
1188   mode = TYPE_MODE (type);
1189   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1190
1191   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1192      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1193      sqrt(-0) = -0.  */
1194   if (sqrtfn
1195       && REAL_VALUES_EQUAL (c, dconsthalf)
1196       && !HONOR_SIGNED_ZEROS (mode))
1197     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1198
1199   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1200      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1201      so do this optimization even if -Os.  Don't do this optimization
1202      if we don't have a hardware sqrt insn.  */
1203   dconst1_4 = dconst1;
1204   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1205   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1206
1207   if (flag_unsafe_math_optimizations
1208       && sqrtfn
1209       && REAL_VALUES_EQUAL (c, dconst1_4)
1210       && hw_sqrt_exists)
1211     {
1212       /* sqrt(x)  */
1213       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1214
1215       /* sqrt(sqrt(x))  */
1216       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1217     }
1218
1219   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1220      optimizing for space.  Don't do this optimization if we don't have
1221      a hardware sqrt insn.  */
1222   real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1223   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1224
1225   if (flag_unsafe_math_optimizations
1226       && sqrtfn
1227       && optimize_function_for_speed_p (cfun)
1228       && REAL_VALUES_EQUAL (c, dconst3_4)
1229       && hw_sqrt_exists)
1230     {
1231       /* sqrt(x)  */
1232       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1233
1234       /* sqrt(sqrt(x))  */
1235       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1236
1237       /* sqrt(x) * sqrt(sqrt(x))  */
1238       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1239                                      sqrt_arg0, sqrt_sqrt);
1240     }
1241
1242   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1243      optimizations since 1./3. is not exactly representable.  If x
1244      is negative and finite, the correct value of pow(x,1./3.) is
1245      a NaN with the "invalid" exception raised, because the value
1246      of 1./3. actually has an even denominator.  The correct value
1247      of cbrt(x) is a negative real value.  */
1248   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1249   dconst1_3 = real_value_truncate (mode, dconst_third ());
1250
1251   if (flag_unsafe_math_optimizations
1252       && cbrtfn
1253       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1254       && REAL_VALUES_EQUAL (c, dconst1_3))
1255     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1256
1257   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1258      if we don't have a hardware sqrt insn.  */
1259   dconst1_6 = dconst1_3;
1260   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1261
1262   if (flag_unsafe_math_optimizations
1263       && sqrtfn
1264       && cbrtfn
1265       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1266       && optimize_function_for_speed_p (cfun)
1267       && hw_sqrt_exists
1268       && REAL_VALUES_EQUAL (c, dconst1_6))
1269     {
1270       /* sqrt(x)  */
1271       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1272
1273       /* cbrt(sqrt(x))  */
1274       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1275     }
1276
1277   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1278      and c not an integer, into
1279
1280        sqrt(x) * powi(x, n/2),                n > 0;
1281        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1282
1283      Do not calculate the powi factor when n/2 = 0.  */
1284   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1285   n = real_to_integer (&c2);
1286   real_from_integer (&cint, VOIDmode, n, SIGNED);
1287   c2_is_int = real_identical (&c2, &cint);
1288
1289   if (flag_unsafe_math_optimizations
1290       && sqrtfn
1291       && c2_is_int
1292       && !c_is_int
1293       && optimize_function_for_speed_p (cfun))
1294     {
1295       tree powi_x_ndiv2 = NULL_TREE;
1296
1297       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1298          possible or profitable, give up.  Skip the degenerate case when
1299          n is 1 or -1, where the result is always 1.  */
1300       if (absu_hwi (n) != 1)
1301         {
1302           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1303                                                      abs_hwi (n / 2));
1304           if (!powi_x_ndiv2)
1305             return NULL_TREE;
1306         }
1307
1308       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1309          result of the optimal multiply sequence just calculated.  */
1310       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1311
1312       if (absu_hwi (n) == 1)
1313         result = sqrt_arg0;
1314       else
1315         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1316                                          sqrt_arg0, powi_x_ndiv2);
1317
1318       /* If n is negative, reciprocate the result.  */
1319       if (n < 0)
1320         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1321                                          build_real (type, dconst1), result);
1322       return result;
1323     }
1324
1325   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1326
1327      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1328      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1329
1330      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1331      different from pow(x, 1./3.) due to rounding and behavior with
1332      negative x, we need to constrain this transformation to unsafe
1333      math and positive x or finite math.  */
1334   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1335   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1336   real_round (&c2, mode, &c2);
1337   n = real_to_integer (&c2);
1338   real_from_integer (&cint, VOIDmode, n, SIGNED);
1339   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1340   real_convert (&c2, mode, &c2);
1341
1342   if (flag_unsafe_math_optimizations
1343       && cbrtfn
1344       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1345       && real_identical (&c2, &c)
1346       && !c2_is_int
1347       && optimize_function_for_speed_p (cfun)
1348       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1349     {
1350       tree powi_x_ndiv3 = NULL_TREE;
1351
1352       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1353          possible or profitable, give up.  Skip the degenerate case when
1354          abs(n) < 3, where the result is always 1.  */
1355       if (absu_hwi (n) >= 3)
1356         {
1357           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1358                                                      abs_hwi (n / 3));
1359           if (!powi_x_ndiv3)
1360             return NULL_TREE;
1361         }
1362
1363       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1364          as that creates an unnecessary variable.  Instead, just produce
1365          either cbrt(x) or cbrt(x) * cbrt(x).  */
1366       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1367
1368       if (absu_hwi (n) % 3 == 1)
1369         powi_cbrt_x = cbrt_x;
1370       else
1371         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1372                                               cbrt_x, cbrt_x);
1373
1374       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1375       if (absu_hwi (n) < 3)
1376         result = powi_cbrt_x;
1377       else
1378         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1379                                          powi_x_ndiv3, powi_cbrt_x);
1380
1381       /* If n is negative, reciprocate the result.  */
1382       if (n < 0)
1383         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1384                                          build_real (type, dconst1), result);
1385
1386       return result;
1387     }
1388
1389   /* No optimizations succeeded.  */
1390   return NULL_TREE;
1391 }
1392
1393 /* ARG is the argument to a cabs builtin call in GSI with location info
1394    LOC.  Create a sequence of statements prior to GSI that calculates
1395    sqrt(R*R + I*I), where R and I are the real and imaginary components
1396    of ARG, respectively.  Return an expression holding the result.  */
1397
1398 static tree
1399 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1400 {
1401   tree real_part, imag_part, addend1, addend2, sum, result;
1402   tree type = TREE_TYPE (TREE_TYPE (arg));
1403   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1404   machine_mode mode = TYPE_MODE (type);
1405
1406   if (!flag_unsafe_math_optimizations
1407       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1408       || !sqrtfn
1409       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1410     return NULL_TREE;
1411
1412   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1413                                     REALPART_EXPR, arg);
1414   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1415                                     real_part, real_part);
1416   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1417                                     IMAGPART_EXPR, arg);
1418   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1419                                     imag_part, imag_part);
1420   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1421   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1422
1423   return result;
1424 }
1425
1426 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1427    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1428    an optimal number of multiplies, when n is a constant.  */
1429
1430 namespace {
1431
1432 const pass_data pass_data_cse_sincos =
1433 {
1434   GIMPLE_PASS, /* type */
1435   "sincos", /* name */
1436   OPTGROUP_NONE, /* optinfo_flags */
1437   TV_NONE, /* tv_id */
1438   PROP_ssa, /* properties_required */
1439   0, /* properties_provided */
1440   0, /* properties_destroyed */
1441   0, /* todo_flags_start */
1442   TODO_update_ssa, /* todo_flags_finish */
1443 };
1444
1445 class pass_cse_sincos : public gimple_opt_pass
1446 {
1447 public:
1448   pass_cse_sincos (gcc::context *ctxt)
1449     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1450   {}
1451
1452   /* opt_pass methods: */
1453   virtual bool gate (function *)
1454     {
1455       /* We no longer require either sincos or cexp, since powi expansion
1456          piggybacks on this pass.  */
1457       return optimize;
1458     }
1459
1460   virtual unsigned int execute (function *);
1461
1462 }; // class pass_cse_sincos
1463
1464 unsigned int
1465 pass_cse_sincos::execute (function *fun)
1466 {
1467   basic_block bb;
1468   bool cfg_changed = false;
1469
1470   calculate_dominance_info (CDI_DOMINATORS);
1471   memset (&sincos_stats, 0, sizeof (sincos_stats));
1472
1473   FOR_EACH_BB_FN (bb, fun)
1474     {
1475       gimple_stmt_iterator gsi;
1476       bool cleanup_eh = false;
1477
1478       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1479         {
1480           gimple stmt = gsi_stmt (gsi);
1481           tree fndecl;
1482
1483           /* Only the last stmt in a bb could throw, no need to call
1484              gimple_purge_dead_eh_edges if we change something in the middle
1485              of a basic block.  */
1486           cleanup_eh = false;
1487
1488           if (is_gimple_call (stmt)
1489               && gimple_call_lhs (stmt)
1490               && (fndecl = gimple_call_fndecl (stmt))
1491               && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
1492             {
1493               tree arg, arg0, arg1, result;
1494               HOST_WIDE_INT n;
1495               location_t loc;
1496
1497               switch (DECL_FUNCTION_CODE (fndecl))
1498                 {
1499                 CASE_FLT_FN (BUILT_IN_COS):
1500                 CASE_FLT_FN (BUILT_IN_SIN):
1501                 CASE_FLT_FN (BUILT_IN_CEXPI):
1502                   /* Make sure we have either sincos or cexp.  */
1503                   if (!targetm.libc_has_function (function_c99_math_complex)
1504                       && !targetm.libc_has_function (function_sincos))
1505                     break;
1506
1507                   arg = gimple_call_arg (stmt, 0);
1508                   if (TREE_CODE (arg) == SSA_NAME)
1509                     cfg_changed |= execute_cse_sincos_1 (arg);
1510                   break;
1511
1512                 CASE_FLT_FN (BUILT_IN_POW):
1513                   arg0 = gimple_call_arg (stmt, 0);
1514                   arg1 = gimple_call_arg (stmt, 1);
1515
1516                   loc = gimple_location (stmt);
1517                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1518
1519                   if (result)
1520                     {
1521                       tree lhs = gimple_get_lhs (stmt);
1522                       gassign *new_stmt = gimple_build_assign (lhs, result);
1523                       gimple_set_location (new_stmt, loc);
1524                       unlink_stmt_vdef (stmt);
1525                       gsi_replace (&gsi, new_stmt, true);
1526                       cleanup_eh = true;
1527                       if (gimple_vdef (stmt))
1528                         release_ssa_name (gimple_vdef (stmt));
1529                     }
1530                   break;
1531
1532                 CASE_FLT_FN (BUILT_IN_POWI):
1533                   arg0 = gimple_call_arg (stmt, 0);
1534                   arg1 = gimple_call_arg (stmt, 1);
1535                   loc = gimple_location (stmt);
1536
1537                   if (real_minus_onep (arg0))
1538                     {
1539                       tree t0, t1, cond, one, minus_one;
1540                       gassign *stmt;
1541
1542                       t0 = TREE_TYPE (arg0);
1543                       t1 = TREE_TYPE (arg1);
1544                       one = build_real (t0, dconst1);
1545                       minus_one = build_real (t0, dconstm1);
1546
1547                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1548                       stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1549                                                   arg1, build_int_cst (t1, 1));
1550                       gimple_set_location (stmt, loc);
1551                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1552
1553                       result = make_temp_ssa_name (t0, NULL, "powi");
1554                       stmt = gimple_build_assign (result, COND_EXPR, cond,
1555                                                   minus_one, one);
1556                       gimple_set_location (stmt, loc);
1557                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1558                     }
1559                   else
1560                     {
1561                       if (!tree_fits_shwi_p (arg1))
1562                         break;
1563
1564                       n = tree_to_shwi (arg1);
1565                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1566                     }
1567
1568                   if (result)
1569                     {
1570                       tree lhs = gimple_get_lhs (stmt);
1571                       gassign *new_stmt = gimple_build_assign (lhs, result);
1572                       gimple_set_location (new_stmt, loc);
1573                       unlink_stmt_vdef (stmt);
1574                       gsi_replace (&gsi, new_stmt, true);
1575                       cleanup_eh = true;
1576                       if (gimple_vdef (stmt))
1577                         release_ssa_name (gimple_vdef (stmt));
1578                     }
1579                   break;
1580
1581                 CASE_FLT_FN (BUILT_IN_CABS):
1582                   arg0 = gimple_call_arg (stmt, 0);
1583                   loc = gimple_location (stmt);
1584                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1585
1586                   if (result)
1587                     {
1588                       tree lhs = gimple_get_lhs (stmt);
1589                       gassign *new_stmt = gimple_build_assign (lhs, result);
1590                       gimple_set_location (new_stmt, loc);
1591                       unlink_stmt_vdef (stmt);
1592                       gsi_replace (&gsi, new_stmt, true);
1593                       cleanup_eh = true;
1594                       if (gimple_vdef (stmt))
1595                         release_ssa_name (gimple_vdef (stmt));
1596                     }
1597                   break;
1598
1599                 default:;
1600                 }
1601             }
1602         }
1603       if (cleanup_eh)
1604         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1605     }
1606
1607   statistics_counter_event (fun, "sincos statements inserted",
1608                             sincos_stats.inserted);
1609
1610   free_dominance_info (CDI_DOMINATORS);
1611   return cfg_changed ? TODO_cleanup_cfg : 0;
1612 }
1613
1614 } // anon namespace
1615
1616 gimple_opt_pass *
1617 make_pass_cse_sincos (gcc::context *ctxt)
1618 {
1619   return new pass_cse_sincos (ctxt);
1620 }
1621
1622 /* A symbolic number is used to detect byte permutation and selection
1623    patterns.  Therefore the field N contains an artificial number
1624    consisting of octet sized markers:
1625
1626    0    - target byte has the value 0
1627    FF   - target byte has an unknown value (eg. due to sign extension)
1628    1..size - marker value is the target byte index minus one.
1629
1630    To detect permutations on memory sources (arrays and structures), a symbolic
1631    number is also associated a base address (the array or structure the load is
1632    made from), an offset from the base address and a range which gives the
1633    difference between the highest and lowest accessed memory location to make
1634    such a symbolic number. The range is thus different from size which reflects
1635    the size of the type of current expression. Note that for non memory source,
1636    range holds the same value as size.
1637
1638    For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1639    a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1640    still have a size of 2 but this time a range of 1.  */
1641
1642 struct symbolic_number {
1643   uint64_t n;
1644   tree type;
1645   tree base_addr;
1646   tree offset;
1647   HOST_WIDE_INT bytepos;
1648   tree alias_set;
1649   tree vuse;
1650   unsigned HOST_WIDE_INT range;
1651 };
1652
1653 #define BITS_PER_MARKER 8
1654 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1655 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1656 #define HEAD_MARKER(n, size) \
1657   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1658
1659 /* The number which the find_bswap_or_nop_1 result should match in
1660    order to have a nop.  The number is masked according to the size of
1661    the symbolic number before using it.  */
1662 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1663   (uint64_t)0x08070605 << 32 | 0x04030201)
1664
1665 /* The number which the find_bswap_or_nop_1 result should match in
1666    order to have a byte swap.  The number is masked according to the
1667    size of the symbolic number before using it.  */
1668 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1669   (uint64_t)0x01020304 << 32 | 0x05060708)
1670
1671 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1672    number N.  Return false if the requested operation is not permitted
1673    on a symbolic number.  */
1674
1675 static inline bool
1676 do_shift_rotate (enum tree_code code,
1677                  struct symbolic_number *n,
1678                  int count)
1679 {
1680   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1681   unsigned head_marker;
1682
1683   if (count % BITS_PER_UNIT != 0)
1684     return false;
1685   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1686
1687   /* Zero out the extra bits of N in order to avoid them being shifted
1688      into the significant bits.  */
1689   if (size < 64 / BITS_PER_MARKER)
1690     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1691
1692   switch (code)
1693     {
1694     case LSHIFT_EXPR:
1695       n->n <<= count;
1696       break;
1697     case RSHIFT_EXPR:
1698       head_marker = HEAD_MARKER (n->n, size);
1699       n->n >>= count;
1700       /* Arithmetic shift of signed type: result is dependent on the value.  */
1701       if (!TYPE_UNSIGNED (n->type) && head_marker)
1702         for (i = 0; i < count / BITS_PER_MARKER; i++)
1703           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1704                   << ((size - 1 - i) * BITS_PER_MARKER);
1705       break;
1706     case LROTATE_EXPR:
1707       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1708       break;
1709     case RROTATE_EXPR:
1710       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1711       break;
1712     default:
1713       return false;
1714     }
1715   /* Zero unused bits for size.  */
1716   if (size < 64 / BITS_PER_MARKER)
1717     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1718   return true;
1719 }
1720
1721 /* Perform sanity checking for the symbolic number N and the gimple
1722    statement STMT.  */
1723
1724 static inline bool
1725 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1726 {
1727   tree lhs_type;
1728
1729   lhs_type = gimple_expr_type (stmt);
1730
1731   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1732     return false;
1733
1734   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1735     return false;
1736
1737   return true;
1738 }
1739
1740 /* Initialize the symbolic number N for the bswap pass from the base element
1741    SRC manipulated by the bitwise OR expression.  */
1742
1743 static bool
1744 init_symbolic_number (struct symbolic_number *n, tree src)
1745 {
1746   int size;
1747
1748   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1749
1750   /* Set up the symbolic number N by setting each byte to a value between 1 and
1751      the byte size of rhs1.  The highest order byte is set to n->size and the
1752      lowest order byte to 1.  */
1753   n->type = TREE_TYPE (src);
1754   size = TYPE_PRECISION (n->type);
1755   if (size % BITS_PER_UNIT != 0)
1756     return false;
1757   size /= BITS_PER_UNIT;
1758   if (size > 64 / BITS_PER_MARKER)
1759     return false;
1760   n->range = size;
1761   n->n = CMPNOP;
1762
1763   if (size < 64 / BITS_PER_MARKER)
1764     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1765
1766   return true;
1767 }
1768
1769 /* Check if STMT might be a byte swap or a nop from a memory source and returns
1770    the answer. If so, REF is that memory source and the base of the memory area
1771    accessed and the offset of the access from that base are recorded in N.  */
1772
1773 bool
1774 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1775 {
1776   /* Leaf node is an array or component ref. Memorize its base and
1777      offset from base to compare to other such leaf node.  */
1778   HOST_WIDE_INT bitsize, bitpos;
1779   machine_mode mode;
1780   int unsignedp, volatilep;
1781   tree offset, base_addr;
1782
1783   /* Not prepared to handle PDP endian.  */
1784   if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
1785     return false;
1786
1787   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1788     return false;
1789
1790   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1791                                    &unsignedp, &volatilep, false);
1792
1793   if (TREE_CODE (base_addr) == MEM_REF)
1794     {
1795       offset_int bit_offset = 0;
1796       tree off = TREE_OPERAND (base_addr, 1);
1797
1798       if (!integer_zerop (off))
1799         {
1800           offset_int boff, coff = mem_ref_offset (base_addr);
1801           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1802           bit_offset += boff;
1803         }
1804
1805       base_addr = TREE_OPERAND (base_addr, 0);
1806
1807       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1808       if (wi::neg_p (bit_offset))
1809         {
1810           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1811           offset_int tem = bit_offset.and_not (mask);
1812           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1813              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
1814           bit_offset -= tem;
1815           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1816           if (offset)
1817             offset = size_binop (PLUS_EXPR, offset,
1818                                     wide_int_to_tree (sizetype, tem));
1819           else
1820             offset = wide_int_to_tree (sizetype, tem);
1821         }
1822
1823       bitpos += bit_offset.to_shwi ();
1824     }
1825
1826   if (bitpos % BITS_PER_UNIT)
1827     return false;
1828   if (bitsize % BITS_PER_UNIT)
1829     return false;
1830
1831   if (!init_symbolic_number (n, ref))
1832     return false;
1833   n->base_addr = base_addr;
1834   n->offset = offset;
1835   n->bytepos = bitpos / BITS_PER_UNIT;
1836   n->alias_set = reference_alias_ptr_type (ref);
1837   n->vuse = gimple_vuse (stmt);
1838   return true;
1839 }
1840
1841 /* Compute the symbolic number N representing the result of a bitwise OR on 2
1842    symbolic number N1 and N2 whose source statements are respectively
1843    SOURCE_STMT1 and SOURCE_STMT2.  */
1844
1845 static gimple
1846 perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
1847                         gimple source_stmt2, struct symbolic_number *n2,
1848                         struct symbolic_number *n)
1849 {
1850   int i, size;
1851   uint64_t mask;
1852   gimple source_stmt;
1853   struct symbolic_number *n_start;
1854
1855   /* Sources are different, cancel bswap if they are not memory location with
1856      the same base (array, structure, ...).  */
1857   if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
1858     {
1859       uint64_t inc;
1860       HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
1861       struct symbolic_number *toinc_n_ptr, *n_end;
1862
1863       if (!n1->base_addr || !n2->base_addr
1864           || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
1865         return NULL;
1866
1867       if (!n1->offset != !n2->offset
1868           || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
1869         return NULL;
1870
1871       if (n1->bytepos < n2->bytepos)
1872         {
1873           n_start = n1;
1874           start_sub = n2->bytepos - n1->bytepos;
1875           source_stmt = source_stmt1;
1876         }
1877       else
1878         {
1879           n_start = n2;
1880           start_sub = n1->bytepos - n2->bytepos;
1881           source_stmt = source_stmt2;
1882         }
1883
1884       /* Find the highest address at which a load is performed and
1885          compute related info.  */
1886       end1 = n1->bytepos + (n1->range - 1);
1887       end2 = n2->bytepos + (n2->range - 1);
1888       if (end1 < end2)
1889         {
1890           end = end2;
1891           end_sub = end2 - end1;
1892         }
1893       else
1894         {
1895           end = end1;
1896           end_sub = end1 - end2;
1897         }
1898       n_end = (end2 > end1) ? n2 : n1;
1899
1900       /* Find symbolic number whose lsb is the most significant.  */
1901       if (BYTES_BIG_ENDIAN)
1902         toinc_n_ptr = (n_end == n1) ? n2 : n1;
1903       else
1904         toinc_n_ptr = (n_start == n1) ? n2 : n1;
1905
1906       n->range = end - n_start->bytepos + 1;
1907
1908       /* Check that the range of memory covered can be represented by
1909          a symbolic number.  */
1910       if (n->range > 64 / BITS_PER_MARKER)
1911         return NULL;
1912
1913       /* Reinterpret byte marks in symbolic number holding the value of
1914          bigger weight according to target endianness.  */
1915       inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
1916       size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
1917       for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
1918         {
1919           unsigned marker
1920             = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
1921           if (marker && marker != MARKER_BYTE_UNKNOWN)
1922             toinc_n_ptr->n += inc;
1923         }
1924     }
1925   else
1926     {
1927       n->range = n1->range;
1928       n_start = n1;
1929       source_stmt = source_stmt1;
1930     }
1931
1932   if (!n1->alias_set
1933       || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
1934     n->alias_set = n1->alias_set;
1935   else
1936     n->alias_set = ptr_type_node;
1937   n->vuse = n_start->vuse;
1938   n->base_addr = n_start->base_addr;
1939   n->offset = n_start->offset;
1940   n->bytepos = n_start->bytepos;
1941   n->type = n_start->type;
1942   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1943
1944   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
1945     {
1946       uint64_t masked1, masked2;
1947
1948       masked1 = n1->n & mask;
1949       masked2 = n2->n & mask;
1950       if (masked1 && masked2 && masked1 != masked2)
1951         return NULL;
1952     }
1953   n->n = n1->n | n2->n;
1954
1955   return source_stmt;
1956 }
1957
1958 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1959    the operation given by the rhs of STMT on the result.  If the operation
1960    could successfully be executed the function returns a gimple stmt whose
1961    rhs's first tree is the expression of the source operand and NULL
1962    otherwise.  */
1963
1964 static gimple
1965 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1966 {
1967   enum tree_code code;
1968   tree rhs1, rhs2 = NULL;
1969   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1970   enum gimple_rhs_class rhs_class;
1971
1972   if (!limit || !is_gimple_assign (stmt))
1973     return NULL;
1974
1975   rhs1 = gimple_assign_rhs1 (stmt);
1976
1977   if (find_bswap_or_nop_load (stmt, rhs1, n))
1978     return stmt;
1979
1980   if (TREE_CODE (rhs1) != SSA_NAME)
1981     return NULL;
1982
1983   code = gimple_assign_rhs_code (stmt);
1984   rhs_class = gimple_assign_rhs_class (stmt);
1985   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1986
1987   if (rhs_class == GIMPLE_BINARY_RHS)
1988     rhs2 = gimple_assign_rhs2 (stmt);
1989
1990   /* Handle unary rhs and binary rhs with integer constants as second
1991      operand.  */
1992
1993   if (rhs_class == GIMPLE_UNARY_RHS
1994       || (rhs_class == GIMPLE_BINARY_RHS
1995           && TREE_CODE (rhs2) == INTEGER_CST))
1996     {
1997       if (code != BIT_AND_EXPR
1998           && code != LSHIFT_EXPR
1999           && code != RSHIFT_EXPR
2000           && code != LROTATE_EXPR
2001           && code != RROTATE_EXPR
2002           && !CONVERT_EXPR_CODE_P (code))
2003         return NULL;
2004
2005       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2006
2007       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2008          we have to initialize the symbolic number.  */
2009       if (!source_stmt1)
2010         {
2011           if (gimple_assign_load_p (stmt)
2012               || !init_symbolic_number (n, rhs1))
2013             return NULL;
2014           source_stmt1 = stmt;
2015         }
2016
2017       switch (code)
2018         {
2019         case BIT_AND_EXPR:
2020           {
2021             int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2022             uint64_t val = int_cst_value (rhs2), mask = 0;
2023             uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2024
2025             /* Only constants masking full bytes are allowed.  */
2026             for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2027               if ((val & tmp) != 0 && (val & tmp) != tmp)
2028                 return NULL;
2029               else if (val & tmp)
2030                 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2031
2032             n->n &= mask;
2033           }
2034           break;
2035         case LSHIFT_EXPR:
2036         case RSHIFT_EXPR:
2037         case LROTATE_EXPR:
2038         case RROTATE_EXPR:
2039           if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2040             return NULL;
2041           break;
2042         CASE_CONVERT:
2043           {
2044             int i, type_size, old_type_size;
2045             tree type;
2046
2047             type = gimple_expr_type (stmt);
2048             type_size = TYPE_PRECISION (type);
2049             if (type_size % BITS_PER_UNIT != 0)
2050               return NULL;
2051             type_size /= BITS_PER_UNIT;
2052             if (type_size > 64 / BITS_PER_MARKER)
2053               return NULL;
2054
2055             /* Sign extension: result is dependent on the value.  */
2056             old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2057             if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2058                 && HEAD_MARKER (n->n, old_type_size))
2059               for (i = 0; i < type_size - old_type_size; i++)
2060                 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2061                         << ((type_size - 1 - i) * BITS_PER_MARKER);
2062
2063             if (type_size < 64 / BITS_PER_MARKER)
2064               {
2065                 /* If STMT casts to a smaller type mask out the bits not
2066                    belonging to the target type.  */
2067                 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2068               }
2069             n->type = type;
2070             if (!n->base_addr)
2071               n->range = type_size;
2072           }
2073           break;
2074         default:
2075           return NULL;
2076         };
2077       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2078     }
2079
2080   /* Handle binary rhs.  */
2081
2082   if (rhs_class == GIMPLE_BINARY_RHS)
2083     {
2084       struct symbolic_number n1, n2;
2085       gimple source_stmt, source_stmt2;
2086
2087       if (code != BIT_IOR_EXPR)
2088         return NULL;
2089
2090       if (TREE_CODE (rhs2) != SSA_NAME)
2091         return NULL;
2092
2093       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2094
2095       switch (code)
2096         {
2097         case BIT_IOR_EXPR:
2098           source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2099
2100           if (!source_stmt1)
2101             return NULL;
2102
2103           source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2104
2105           if (!source_stmt2)
2106             return NULL;
2107
2108           if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2109             return NULL;
2110
2111           if (!n1.vuse != !n2.vuse
2112               || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2113             return NULL;
2114
2115           source_stmt
2116             = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2117
2118           if (!source_stmt)
2119             return NULL;
2120
2121           if (!verify_symbolic_number_p (n, stmt))
2122             return NULL;
2123
2124           break;
2125         default:
2126           return NULL;
2127         }
2128       return source_stmt;
2129     }
2130   return NULL;
2131 }
2132
2133 /* Check if STMT completes a bswap implementation or a read in a given
2134    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2135    accordingly.  It also sets N to represent the kind of operations
2136    performed: size of the resulting expression and whether it works on
2137    a memory source, and if so alias-set and vuse.  At last, the
2138    function returns a stmt whose rhs's first tree is the source
2139    expression.  */
2140
2141 static gimple
2142 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2143 {
2144   /* The number which the find_bswap_or_nop_1 result should match in order
2145      to have a full byte swap.  The number is shifted to the right
2146      according to the size of the symbolic number before using it.  */
2147   uint64_t cmpxchg = CMPXCHG;
2148   uint64_t cmpnop = CMPNOP;
2149
2150   gimple source_stmt;
2151   int limit;
2152
2153   /* The last parameter determines the depth search limit.  It usually
2154      correlates directly to the number n of bytes to be touched.  We
2155      increase that number by log2(n) + 1 here in order to also
2156      cover signed -> unsigned conversions of the src operand as can be seen
2157      in libgcc, and for initial shift/and operation of the src operand.  */
2158   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2159   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2160   source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2161
2162   if (!source_stmt)
2163     return NULL;
2164
2165   /* Find real size of result (highest non-zero byte).  */
2166   if (n->base_addr)
2167     {
2168       unsigned HOST_WIDE_INT rsize;
2169       uint64_t tmpn;
2170
2171       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2172       if (BYTES_BIG_ENDIAN && n->range != rsize)
2173         /* This implies an offset, which is currently not handled by
2174            bswap_replace.  */
2175         return NULL;
2176       n->range = rsize;
2177     }
2178
2179   /* Zero out the extra bits of N and CMP*.  */
2180   if (n->range < (int) sizeof (int64_t))
2181     {
2182       uint64_t mask;
2183
2184       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2185       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2186       cmpnop &= mask;
2187     }
2188
2189   /* A complete byte swap should make the symbolic number to start with
2190      the largest digit in the highest order byte. Unchanged symbolic
2191      number indicates a read with same endianness as target architecture.  */
2192   if (n->n == cmpnop)
2193     *bswap = false;
2194   else if (n->n == cmpxchg)
2195     *bswap = true;
2196   else
2197     return NULL;
2198
2199   /* Useless bit manipulation performed by code.  */
2200   if (!n->base_addr && n->n == cmpnop)
2201     return NULL;
2202
2203   n->range *= BITS_PER_UNIT;
2204   return source_stmt;
2205 }
2206
2207 namespace {
2208
2209 const pass_data pass_data_optimize_bswap =
2210 {
2211   GIMPLE_PASS, /* type */
2212   "bswap", /* name */
2213   OPTGROUP_NONE, /* optinfo_flags */
2214   TV_NONE, /* tv_id */
2215   PROP_ssa, /* properties_required */
2216   0, /* properties_provided */
2217   0, /* properties_destroyed */
2218   0, /* todo_flags_start */
2219   0, /* todo_flags_finish */
2220 };
2221
2222 class pass_optimize_bswap : public gimple_opt_pass
2223 {
2224 public:
2225   pass_optimize_bswap (gcc::context *ctxt)
2226     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2227   {}
2228
2229   /* opt_pass methods: */
2230   virtual bool gate (function *)
2231     {
2232       return flag_expensive_optimizations && optimize;
2233     }
2234
2235   virtual unsigned int execute (function *);
2236
2237 }; // class pass_optimize_bswap
2238
2239 /* Perform the bswap optimization: replace the expression computed in the rhs
2240    of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2241    Which of these alternatives replace the rhs is given by N->base_addr (non
2242    null if a load is needed) and BSWAP.  The type, VUSE and set-alias of the
2243    load to perform are also given in N while the builtin bswap invoke is given
2244    in FNDEL.  Finally, if a load is involved, SRC_STMT refers to one of the
2245    load statements involved to construct the rhs in CUR_STMT and N->range gives
2246    the size of the rhs expression for maintaining some statistics.
2247
2248    Note that if the replacement involve a load, CUR_STMT is moved just after
2249    SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2250    changing of basic block.  */
2251
2252 static bool
2253 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2254                tree load_type, struct symbolic_number *n, bool bswap)
2255 {
2256   gimple_stmt_iterator gsi;
2257   tree src, tmp, tgt;
2258   gimple bswap_stmt;
2259
2260   gsi = gsi_for_stmt (cur_stmt);
2261   src = gimple_assign_rhs1 (src_stmt);
2262   tgt = gimple_assign_lhs (cur_stmt);
2263
2264   /* Need to load the value from memory first.  */
2265   if (n->base_addr)
2266     {
2267       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2268       tree addr_expr, addr_tmp, val_expr, val_tmp;
2269       tree load_offset_ptr, aligned_load_type;
2270       gimple addr_stmt, load_stmt;
2271       unsigned align;
2272       HOST_WIDE_INT load_offset = 0;
2273
2274       align = get_object_alignment (src);
2275       /* If the new access is smaller than the original one, we need
2276          to perform big endian adjustment.  */
2277       if (BYTES_BIG_ENDIAN)
2278         {
2279           HOST_WIDE_INT bitsize, bitpos;
2280           machine_mode mode;
2281           int unsignedp, volatilep;
2282           tree offset;
2283
2284           get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2285                                &unsignedp, &volatilep, false);
2286           if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2287             {
2288               load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2289               unsigned HOST_WIDE_INT l
2290                 = (load_offset * BITS_PER_UNIT) & (align - 1);
2291               if (l)
2292                 align = l & -l;
2293             }
2294         }
2295
2296       if (bswap
2297           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2298           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2299         return false;
2300
2301       /* Move cur_stmt just before  one of the load of the original
2302          to ensure it has the same VUSE.  See PR61517 for what could
2303          go wrong.  */
2304       if (gimple_bb (cur_stmt) != gimple_bb (src_stmt))
2305         reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2306       gsi_move_before (&gsi, &gsi_ins);
2307       gsi = gsi_for_stmt (cur_stmt);
2308
2309       /* Compute address to load from and cast according to the size
2310          of the load.  */
2311       addr_expr = build_fold_addr_expr (unshare_expr (src));
2312       if (is_gimple_mem_ref_addr (addr_expr))
2313         addr_tmp = addr_expr;
2314       else
2315         {
2316           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2317                                          "load_src");
2318           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2319           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2320         }
2321
2322       /* Perform the load.  */
2323       aligned_load_type = load_type;
2324       if (align < TYPE_ALIGN (load_type))
2325         aligned_load_type = build_aligned_type (load_type, align);
2326       load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2327       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2328                               load_offset_ptr);
2329
2330       if (!bswap)
2331         {
2332           if (n->range == 16)
2333             nop_stats.found_16bit++;
2334           else if (n->range == 32)
2335             nop_stats.found_32bit++;
2336           else
2337             {
2338               gcc_assert (n->range == 64);
2339               nop_stats.found_64bit++;
2340             }
2341
2342           /* Convert the result of load if necessary.  */
2343           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2344             {
2345               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2346                                             "load_dst");
2347               load_stmt = gimple_build_assign (val_tmp, val_expr);
2348               gimple_set_vuse (load_stmt, n->vuse);
2349               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2350               gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2351             }
2352           else
2353             {
2354               gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2355               gimple_set_vuse (cur_stmt, n->vuse);
2356             }
2357           update_stmt (cur_stmt);
2358
2359           if (dump_file)
2360             {
2361               fprintf (dump_file,
2362                        "%d bit load in target endianness found at: ",
2363                        (int) n->range);
2364               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2365             }
2366           return true;
2367         }
2368       else
2369         {
2370           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2371           load_stmt = gimple_build_assign (val_tmp, val_expr);
2372           gimple_set_vuse (load_stmt, n->vuse);
2373           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2374         }
2375       src = val_tmp;
2376     }
2377
2378   if (n->range == 16)
2379     bswap_stats.found_16bit++;
2380   else if (n->range == 32)
2381     bswap_stats.found_32bit++;
2382   else
2383     {
2384       gcc_assert (n->range == 64);
2385       bswap_stats.found_64bit++;
2386     }
2387
2388   tmp = src;
2389
2390   /* Convert the src expression if necessary.  */
2391   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2392     {
2393       gimple convert_stmt;
2394
2395       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2396       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2397       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2398     }
2399
2400   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2401      are considered as rotation of 2N bit values by N bits is generally not
2402      equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
2403      gives 0x03040102 while a bswap for that value is 0x04030201.  */
2404   if (bswap && n->range == 16)
2405     {
2406       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2407       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2408       bswap_stmt = gimple_build_assign (NULL, src);
2409     }
2410   else
2411     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2412
2413   tmp = tgt;
2414
2415   /* Convert the result if necessary.  */
2416   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2417     {
2418       gimple convert_stmt;
2419
2420       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2421       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2422       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2423     }
2424
2425   gimple_set_lhs (bswap_stmt, tmp);
2426
2427   if (dump_file)
2428     {
2429       fprintf (dump_file, "%d bit bswap implementation found at: ",
2430                (int) n->range);
2431       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2432     }
2433
2434   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2435   gsi_remove (&gsi, true);
2436   return true;
2437 }
2438
2439 /* Find manual byte swap implementations as well as load in a given
2440    endianness. Byte swaps are turned into a bswap builtin invokation
2441    while endian loads are converted to bswap builtin invokation or
2442    simple load according to the target endianness.  */
2443
2444 unsigned int
2445 pass_optimize_bswap::execute (function *fun)
2446 {
2447   basic_block bb;
2448   bool bswap32_p, bswap64_p;
2449   bool changed = false;
2450   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2451
2452   if (BITS_PER_UNIT != 8)
2453     return 0;
2454
2455   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2456                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2457   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2458                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2459                    || (bswap32_p && word_mode == SImode)));
2460
2461   /* Determine the argument type of the builtins.  The code later on
2462      assumes that the return and argument type are the same.  */
2463   if (bswap32_p)
2464     {
2465       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2466       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2467     }
2468
2469   if (bswap64_p)
2470     {
2471       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2472       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2473     }
2474
2475   memset (&nop_stats, 0, sizeof (nop_stats));
2476   memset (&bswap_stats, 0, sizeof (bswap_stats));
2477
2478   FOR_EACH_BB_FN (bb, fun)
2479     {
2480       gimple_stmt_iterator gsi;
2481
2482       /* We do a reverse scan for bswap patterns to make sure we get the
2483          widest match. As bswap pattern matching doesn't handle previously
2484          inserted smaller bswap replacements as sub-patterns, the wider
2485          variant wouldn't be detected.  */
2486       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2487         {
2488           gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2489           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2490           enum tree_code code;
2491           struct symbolic_number n;
2492           bool bswap;
2493
2494           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2495              might be moved to a different basic block by bswap_replace and gsi
2496              must not points to it if that's the case.  Moving the gsi_prev
2497              there make sure that gsi points to the statement previous to
2498              cur_stmt while still making sure that all statements are
2499              considered in this basic block.  */
2500           gsi_prev (&gsi);
2501
2502           if (!is_gimple_assign (cur_stmt))
2503             continue;
2504
2505           code = gimple_assign_rhs_code (cur_stmt);
2506           switch (code)
2507             {
2508             case LROTATE_EXPR:
2509             case RROTATE_EXPR:
2510               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2511                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2512                      % BITS_PER_UNIT)
2513                 continue;
2514               /* Fall through.  */
2515             case BIT_IOR_EXPR:
2516               break;
2517             default:
2518               continue;
2519             }
2520
2521           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2522
2523           if (!src_stmt)
2524             continue;
2525
2526           switch (n.range)
2527             {
2528             case 16:
2529               /* Already in canonical form, nothing to do.  */
2530               if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2531                 continue;
2532               load_type = bswap_type = uint16_type_node;
2533               break;
2534             case 32:
2535               load_type = uint32_type_node;
2536               if (bswap32_p)
2537                 {
2538                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2539                   bswap_type = bswap32_type;
2540                 }
2541               break;
2542             case 64:
2543               load_type = uint64_type_node;
2544               if (bswap64_p)
2545                 {
2546                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2547                   bswap_type = bswap64_type;
2548                 }
2549               break;
2550             default:
2551               continue;
2552             }
2553
2554           if (bswap && !fndecl && n.range != 16)
2555             continue;
2556
2557           if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2558                              &n, bswap))
2559             changed = true;
2560         }
2561     }
2562
2563   statistics_counter_event (fun, "16-bit nop implementations found",
2564                             nop_stats.found_16bit);
2565   statistics_counter_event (fun, "32-bit nop implementations found",
2566                             nop_stats.found_32bit);
2567   statistics_counter_event (fun, "64-bit nop implementations found",
2568                             nop_stats.found_64bit);
2569   statistics_counter_event (fun, "16-bit bswap implementations found",
2570                             bswap_stats.found_16bit);
2571   statistics_counter_event (fun, "32-bit bswap implementations found",
2572                             bswap_stats.found_32bit);
2573   statistics_counter_event (fun, "64-bit bswap implementations found",
2574                             bswap_stats.found_64bit);
2575
2576   return (changed ? TODO_update_ssa : 0);
2577 }
2578
2579 } // anon namespace
2580
2581 gimple_opt_pass *
2582 make_pass_optimize_bswap (gcc::context *ctxt)
2583 {
2584   return new pass_optimize_bswap (ctxt);
2585 }
2586
2587 /* Return true if stmt is a type conversion operation that can be stripped
2588    when used in a widening multiply operation.  */
2589 static bool
2590 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2591 {
2592   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2593
2594   if (TREE_CODE (result_type) == INTEGER_TYPE)
2595     {
2596       tree op_type;
2597       tree inner_op_type;
2598
2599       if (!CONVERT_EXPR_CODE_P (rhs_code))
2600         return false;
2601
2602       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2603
2604       /* If the type of OP has the same precision as the result, then
2605          we can strip this conversion.  The multiply operation will be
2606          selected to create the correct extension as a by-product.  */
2607       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2608         return true;
2609
2610       /* We can also strip a conversion if it preserves the signed-ness of
2611          the operation and doesn't narrow the range.  */
2612       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2613
2614       /* If the inner-most type is unsigned, then we can strip any
2615          intermediate widening operation.  If it's signed, then the
2616          intermediate widening operation must also be signed.  */
2617       if ((TYPE_UNSIGNED (inner_op_type)
2618            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2619           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2620         return true;
2621
2622       return false;
2623     }
2624
2625   return rhs_code == FIXED_CONVERT_EXPR;
2626 }
2627
2628 /* Return true if RHS is a suitable operand for a widening multiplication,
2629    assuming a target type of TYPE.
2630    There are two cases:
2631
2632      - RHS makes some value at least twice as wide.  Store that value
2633        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2634
2635      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2636        but leave *TYPE_OUT untouched.  */
2637
2638 static bool
2639 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2640                         tree *new_rhs_out)
2641 {
2642   gimple stmt;
2643   tree type1, rhs1;
2644
2645   if (TREE_CODE (rhs) == SSA_NAME)
2646     {
2647       stmt = SSA_NAME_DEF_STMT (rhs);
2648       if (is_gimple_assign (stmt))
2649         {
2650           if (! widening_mult_conversion_strippable_p (type, stmt))
2651             rhs1 = rhs;
2652           else
2653             {
2654               rhs1 = gimple_assign_rhs1 (stmt);
2655
2656               if (TREE_CODE (rhs1) == INTEGER_CST)
2657                 {
2658                   *new_rhs_out = rhs1;
2659                   *type_out = NULL;
2660                   return true;
2661                 }
2662             }
2663         }
2664       else
2665         rhs1 = rhs;
2666
2667       type1 = TREE_TYPE (rhs1);
2668
2669       if (TREE_CODE (type1) != TREE_CODE (type)
2670           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2671         return false;
2672
2673       *new_rhs_out = rhs1;
2674       *type_out = type1;
2675       return true;
2676     }
2677
2678   if (TREE_CODE (rhs) == INTEGER_CST)
2679     {
2680       *new_rhs_out = rhs;
2681       *type_out = NULL;
2682       return true;
2683     }
2684
2685   return false;
2686 }
2687
2688 /* Return true if STMT performs a widening multiplication, assuming the
2689    output type is TYPE.  If so, store the unwidened types of the operands
2690    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2691    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2692    and *TYPE2_OUT would give the operands of the multiplication.  */
2693
2694 static bool
2695 is_widening_mult_p (gimple stmt,
2696                     tree *type1_out, tree *rhs1_out,
2697                     tree *type2_out, tree *rhs2_out)
2698 {
2699   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2700
2701   if (TREE_CODE (type) != INTEGER_TYPE
2702       && TREE_CODE (type) != FIXED_POINT_TYPE)
2703     return false;
2704
2705   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2706                                rhs1_out))
2707     return false;
2708
2709   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2710                                rhs2_out))
2711     return false;
2712
2713   if (*type1_out == NULL)
2714     {
2715       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2716         return false;
2717       *type1_out = *type2_out;
2718     }
2719
2720   if (*type2_out == NULL)
2721     {
2722       if (!int_fits_type_p (*rhs2_out, *type1_out))
2723         return false;
2724       *type2_out = *type1_out;
2725     }
2726
2727   /* Ensure that the larger of the two operands comes first. */
2728   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2729     {
2730       tree tmp;
2731       tmp = *type1_out;
2732       *type1_out = *type2_out;
2733       *type2_out = tmp;
2734       tmp = *rhs1_out;
2735       *rhs1_out = *rhs2_out;
2736       *rhs2_out = tmp;
2737     }
2738
2739   return true;
2740 }
2741
2742 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2743    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2744    value is true iff we converted the statement.  */
2745
2746 static bool
2747 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2748 {
2749   tree lhs, rhs1, rhs2, type, type1, type2;
2750   enum insn_code handler;
2751   machine_mode to_mode, from_mode, actual_mode;
2752   optab op;
2753   int actual_precision;
2754   location_t loc = gimple_location (stmt);
2755   bool from_unsigned1, from_unsigned2;
2756
2757   lhs = gimple_assign_lhs (stmt);
2758   type = TREE_TYPE (lhs);
2759   if (TREE_CODE (type) != INTEGER_TYPE)
2760     return false;
2761
2762   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2763     return false;
2764
2765   to_mode = TYPE_MODE (type);
2766   from_mode = TYPE_MODE (type1);
2767   from_unsigned1 = TYPE_UNSIGNED (type1);
2768   from_unsigned2 = TYPE_UNSIGNED (type2);
2769
2770   if (from_unsigned1 && from_unsigned2)
2771     op = umul_widen_optab;
2772   else if (!from_unsigned1 && !from_unsigned2)
2773     op = smul_widen_optab;
2774   else
2775     op = usmul_widen_optab;
2776
2777   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2778                                                   0, &actual_mode);
2779
2780   if (handler == CODE_FOR_nothing)
2781     {
2782       if (op != smul_widen_optab)
2783         {
2784           /* We can use a signed multiply with unsigned types as long as
2785              there is a wider mode to use, or it is the smaller of the two
2786              types that is unsigned.  Note that type1 >= type2, always.  */
2787           if ((TYPE_UNSIGNED (type1)
2788                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2789               || (TYPE_UNSIGNED (type2)
2790                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2791             {
2792               from_mode = GET_MODE_WIDER_MODE (from_mode);
2793               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2794                 return false;
2795             }
2796
2797           op = smul_widen_optab;
2798           handler = find_widening_optab_handler_and_mode (op, to_mode,
2799                                                           from_mode, 0,
2800                                                           &actual_mode);
2801
2802           if (handler == CODE_FOR_nothing)
2803             return false;
2804
2805           from_unsigned1 = from_unsigned2 = false;
2806         }
2807       else
2808         return false;
2809     }
2810
2811   /* Ensure that the inputs to the handler are in the correct precison
2812      for the opcode.  This will be the full mode size.  */
2813   actual_precision = GET_MODE_PRECISION (actual_mode);
2814   if (2 * actual_precision > TYPE_PRECISION (type))
2815     return false;
2816   if (actual_precision != TYPE_PRECISION (type1)
2817       || from_unsigned1 != TYPE_UNSIGNED (type1))
2818     rhs1 = build_and_insert_cast (gsi, loc,
2819                                   build_nonstandard_integer_type
2820                                     (actual_precision, from_unsigned1), rhs1);
2821   if (actual_precision != TYPE_PRECISION (type2)
2822       || from_unsigned2 != TYPE_UNSIGNED (type2))
2823     rhs2 = build_and_insert_cast (gsi, loc,
2824                                   build_nonstandard_integer_type
2825                                     (actual_precision, from_unsigned2), rhs2);
2826
2827   /* Handle constants.  */
2828   if (TREE_CODE (rhs1) == INTEGER_CST)
2829     rhs1 = fold_convert (type1, rhs1);
2830   if (TREE_CODE (rhs2) == INTEGER_CST)
2831     rhs2 = fold_convert (type2, rhs2);
2832
2833   gimple_assign_set_rhs1 (stmt, rhs1);
2834   gimple_assign_set_rhs2 (stmt, rhs2);
2835   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2836   update_stmt (stmt);
2837   widen_mul_stats.widen_mults_inserted++;
2838   return true;
2839 }
2840
2841 /* Process a single gimple statement STMT, which is found at the
2842    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2843    rhs (given by CODE), and try to convert it into a
2844    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2845    is true iff we converted the statement.  */
2846
2847 static bool
2848 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2849                             enum tree_code code)
2850 {
2851   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2852   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2853   tree type, type1, type2, optype;
2854   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2855   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2856   optab this_optab;
2857   enum tree_code wmult_code;
2858   enum insn_code handler;
2859   machine_mode to_mode, from_mode, actual_mode;
2860   location_t loc = gimple_location (stmt);
2861   int actual_precision;
2862   bool from_unsigned1, from_unsigned2;
2863
2864   lhs = gimple_assign_lhs (stmt);
2865   type = TREE_TYPE (lhs);
2866   if (TREE_CODE (type) != INTEGER_TYPE
2867       && TREE_CODE (type) != FIXED_POINT_TYPE)
2868     return false;
2869
2870   if (code == MINUS_EXPR)
2871     wmult_code = WIDEN_MULT_MINUS_EXPR;
2872   else
2873     wmult_code = WIDEN_MULT_PLUS_EXPR;
2874
2875   rhs1 = gimple_assign_rhs1 (stmt);
2876   rhs2 = gimple_assign_rhs2 (stmt);
2877
2878   if (TREE_CODE (rhs1) == SSA_NAME)
2879     {
2880       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2881       if (is_gimple_assign (rhs1_stmt))
2882         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2883     }
2884
2885   if (TREE_CODE (rhs2) == SSA_NAME)
2886     {
2887       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2888       if (is_gimple_assign (rhs2_stmt))
2889         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2890     }
2891
2892   /* Allow for one conversion statement between the multiply
2893      and addition/subtraction statement.  If there are more than
2894      one conversions then we assume they would invalidate this
2895      transformation.  If that's not the case then they should have
2896      been folded before now.  */
2897   if (CONVERT_EXPR_CODE_P (rhs1_code))
2898     {
2899       conv1_stmt = rhs1_stmt;
2900       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2901       if (TREE_CODE (rhs1) == SSA_NAME)
2902         {
2903           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2904           if (is_gimple_assign (rhs1_stmt))
2905             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2906         }
2907       else
2908         return false;
2909     }
2910   if (CONVERT_EXPR_CODE_P (rhs2_code))
2911     {
2912       conv2_stmt = rhs2_stmt;
2913       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2914       if (TREE_CODE (rhs2) == SSA_NAME)
2915         {
2916           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2917           if (is_gimple_assign (rhs2_stmt))
2918             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2919         }
2920       else
2921         return false;
2922     }
2923
2924   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2925      is_widening_mult_p, but we still need the rhs returns.
2926
2927      It might also appear that it would be sufficient to use the existing
2928      operands of the widening multiply, but that would limit the choice of
2929      multiply-and-accumulate instructions.
2930
2931      If the widened-multiplication result has more than one uses, it is
2932      probably wiser not to do the conversion.  */
2933   if (code == PLUS_EXPR
2934       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2935     {
2936       if (!has_single_use (rhs1)
2937           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2938                                   &type2, &mult_rhs2))
2939         return false;
2940       add_rhs = rhs2;
2941       conv_stmt = conv1_stmt;
2942     }
2943   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2944     {
2945       if (!has_single_use (rhs2)
2946           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2947                                   &type2, &mult_rhs2))
2948         return false;
2949       add_rhs = rhs1;
2950       conv_stmt = conv2_stmt;
2951     }
2952   else
2953     return false;
2954
2955   to_mode = TYPE_MODE (type);
2956   from_mode = TYPE_MODE (type1);
2957   from_unsigned1 = TYPE_UNSIGNED (type1);
2958   from_unsigned2 = TYPE_UNSIGNED (type2);
2959   optype = type1;
2960
2961   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2962   if (from_unsigned1 != from_unsigned2)
2963     {
2964       if (!INTEGRAL_TYPE_P (type))
2965         return false;
2966       /* We can use a signed multiply with unsigned types as long as
2967          there is a wider mode to use, or it is the smaller of the two
2968          types that is unsigned.  Note that type1 >= type2, always.  */
2969       if ((from_unsigned1
2970            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2971           || (from_unsigned2
2972               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2973         {
2974           from_mode = GET_MODE_WIDER_MODE (from_mode);
2975           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2976             return false;
2977         }
2978
2979       from_unsigned1 = from_unsigned2 = false;
2980       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2981                                                false);
2982     }
2983
2984   /* If there was a conversion between the multiply and addition
2985      then we need to make sure it fits a multiply-and-accumulate.
2986      The should be a single mode change which does not change the
2987      value.  */
2988   if (conv_stmt)
2989     {
2990       /* We use the original, unmodified data types for this.  */
2991       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2992       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2993       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2994       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2995
2996       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2997         {
2998           /* Conversion is a truncate.  */
2999           if (TYPE_PRECISION (to_type) < data_size)
3000             return false;
3001         }
3002       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3003         {
3004           /* Conversion is an extend.  Check it's the right sort.  */
3005           if (TYPE_UNSIGNED (from_type) != is_unsigned
3006               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3007             return false;
3008         }
3009       /* else convert is a no-op for our purposes.  */
3010     }
3011
3012   /* Verify that the machine can perform a widening multiply
3013      accumulate in this mode/signedness combination, otherwise
3014      this transformation is likely to pessimize code.  */
3015   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3016   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3017                                                   from_mode, 0, &actual_mode);
3018
3019   if (handler == CODE_FOR_nothing)
3020     return false;
3021
3022   /* Ensure that the inputs to the handler are in the correct precison
3023      for the opcode.  This will be the full mode size.  */
3024   actual_precision = GET_MODE_PRECISION (actual_mode);
3025   if (actual_precision != TYPE_PRECISION (type1)
3026       || from_unsigned1 != TYPE_UNSIGNED (type1))
3027     mult_rhs1 = build_and_insert_cast (gsi, loc,
3028                                        build_nonstandard_integer_type
3029                                          (actual_precision, from_unsigned1),
3030                                        mult_rhs1);
3031   if (actual_precision != TYPE_PRECISION (type2)
3032       || from_unsigned2 != TYPE_UNSIGNED (type2))
3033     mult_rhs2 = build_and_insert_cast (gsi, loc,
3034                                        build_nonstandard_integer_type
3035                                          (actual_precision, from_unsigned2),
3036                                        mult_rhs2);
3037
3038   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3039     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3040
3041   /* Handle constants.  */
3042   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3043     mult_rhs1 = fold_convert (type1, mult_rhs1);
3044   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3045     mult_rhs2 = fold_convert (type2, mult_rhs2);
3046
3047   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3048                                   add_rhs);
3049   update_stmt (gsi_stmt (*gsi));
3050   widen_mul_stats.maccs_inserted++;
3051   return true;
3052 }
3053
3054 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3055    with uses in additions and subtractions to form fused multiply-add
3056    operations.  Returns true if successful and MUL_STMT should be removed.  */
3057
3058 static bool
3059 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3060 {
3061   tree mul_result = gimple_get_lhs (mul_stmt);
3062   tree type = TREE_TYPE (mul_result);
3063   gimple use_stmt, neguse_stmt;
3064   gassign *fma_stmt;
3065   use_operand_p use_p;
3066   imm_use_iterator imm_iter;
3067
3068   if (FLOAT_TYPE_P (type)
3069       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3070     return false;
3071
3072   /* We don't want to do bitfield reduction ops.  */
3073   if (INTEGRAL_TYPE_P (type)
3074       && (TYPE_PRECISION (type)
3075           != GET_MODE_PRECISION (TYPE_MODE (type))))
3076     return false;
3077
3078   /* If the target doesn't support it, don't generate it.  We assume that
3079      if fma isn't available then fms, fnma or fnms are not either.  */
3080   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3081     return false;
3082
3083   /* If the multiplication has zero uses, it is kept around probably because
3084      of -fnon-call-exceptions.  Don't optimize it away in that case,
3085      it is DCE job.  */
3086   if (has_zero_uses (mul_result))
3087     return false;
3088
3089   /* Make sure that the multiplication statement becomes dead after
3090      the transformation, thus that all uses are transformed to FMAs.
3091      This means we assume that an FMA operation has the same cost
3092      as an addition.  */
3093   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3094     {
3095       enum tree_code use_code;
3096       tree result = mul_result;
3097       bool negate_p = false;
3098
3099       use_stmt = USE_STMT (use_p);
3100
3101       if (is_gimple_debug (use_stmt))
3102         continue;
3103
3104       /* For now restrict this operations to single basic blocks.  In theory
3105          we would want to support sinking the multiplication in
3106          m = a*b;
3107          if ()
3108            ma = m + c;
3109          else
3110            d = m;
3111          to form a fma in the then block and sink the multiplication to the
3112          else block.  */
3113       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3114         return false;
3115
3116       if (!is_gimple_assign (use_stmt))
3117         return false;
3118
3119       use_code = gimple_assign_rhs_code (use_stmt);
3120
3121       /* A negate on the multiplication leads to FNMA.  */
3122       if (use_code == NEGATE_EXPR)
3123         {
3124           ssa_op_iter iter;
3125           use_operand_p usep;
3126
3127           result = gimple_assign_lhs (use_stmt);
3128
3129           /* Make sure the negate statement becomes dead with this
3130              single transformation.  */
3131           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3132                                &use_p, &neguse_stmt))
3133             return false;
3134
3135           /* Make sure the multiplication isn't also used on that stmt.  */
3136           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3137             if (USE_FROM_PTR (usep) == mul_result)
3138               return false;
3139
3140           /* Re-validate.  */
3141           use_stmt = neguse_stmt;
3142           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3143             return false;
3144           if (!is_gimple_assign (use_stmt))
3145             return false;
3146
3147           use_code = gimple_assign_rhs_code (use_stmt);
3148           negate_p = true;
3149         }
3150
3151       switch (use_code)
3152         {
3153         case MINUS_EXPR:
3154           if (gimple_assign_rhs2 (use_stmt) == result)
3155             negate_p = !negate_p;
3156           break;
3157         case PLUS_EXPR:
3158           break;
3159         default:
3160           /* FMA can only be formed from PLUS and MINUS.  */
3161           return false;
3162         }
3163
3164       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3165          by a MULT_EXPR that we'll visit later, we might be able to
3166          get a more profitable match with fnma.
3167          OTOH, if we don't, a negate / fma pair has likely lower latency
3168          that a mult / subtract pair.  */
3169       if (use_code == MINUS_EXPR && !negate_p
3170           && gimple_assign_rhs1 (use_stmt) == result
3171           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3172           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3173         {
3174           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3175
3176           if (TREE_CODE (rhs2) == SSA_NAME)
3177             {
3178               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3179               if (has_single_use (rhs2)
3180                   && is_gimple_assign (stmt2)
3181                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3182               return false;
3183             }
3184         }
3185
3186       /* We can't handle a * b + a * b.  */
3187       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3188         return false;
3189
3190       /* While it is possible to validate whether or not the exact form
3191          that we've recognized is available in the backend, the assumption
3192          is that the transformation is never a loss.  For instance, suppose
3193          the target only has the plain FMA pattern available.  Consider
3194          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3195          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3196          still have 3 operations, but in the FMA form the two NEGs are
3197          independent and could be run in parallel.  */
3198     }
3199
3200   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3201     {
3202       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3203       enum tree_code use_code;
3204       tree addop, mulop1 = op1, result = mul_result;
3205       bool negate_p = false;
3206
3207       if (is_gimple_debug (use_stmt))
3208         continue;
3209
3210       use_code = gimple_assign_rhs_code (use_stmt);
3211       if (use_code == NEGATE_EXPR)
3212         {
3213           result = gimple_assign_lhs (use_stmt);
3214           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3215           gsi_remove (&gsi, true);
3216           release_defs (use_stmt);
3217
3218           use_stmt = neguse_stmt;
3219           gsi = gsi_for_stmt (use_stmt);
3220           use_code = gimple_assign_rhs_code (use_stmt);
3221           negate_p = true;
3222         }
3223
3224       if (gimple_assign_rhs1 (use_stmt) == result)
3225         {
3226           addop = gimple_assign_rhs2 (use_stmt);
3227           /* a * b - c -> a * b + (-c)  */
3228           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3229             addop = force_gimple_operand_gsi (&gsi,
3230                                               build1 (NEGATE_EXPR,
3231                                                       type, addop),
3232                                               true, NULL_TREE, true,
3233                                               GSI_SAME_STMT);
3234         }
3235       else
3236         {
3237           addop = gimple_assign_rhs1 (use_stmt);
3238           /* a - b * c -> (-b) * c + a */
3239           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3240             negate_p = !negate_p;
3241         }
3242
3243       if (negate_p)
3244         mulop1 = force_gimple_operand_gsi (&gsi,
3245                                            build1 (NEGATE_EXPR,
3246                                                    type, mulop1),
3247                                            true, NULL_TREE, true,
3248                                            GSI_SAME_STMT);
3249
3250       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3251                                       FMA_EXPR, mulop1, op2, addop);
3252       gsi_replace (&gsi, fma_stmt, true);
3253       widen_mul_stats.fmas_inserted++;
3254     }
3255
3256   return true;
3257 }
3258
3259 /* Find integer multiplications where the operands are extended from
3260    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3261    where appropriate.  */
3262
3263 namespace {
3264
3265 const pass_data pass_data_optimize_widening_mul =
3266 {
3267   GIMPLE_PASS, /* type */
3268   "widening_mul", /* name */
3269   OPTGROUP_NONE, /* optinfo_flags */
3270   TV_NONE, /* tv_id */
3271   PROP_ssa, /* properties_required */
3272   0, /* properties_provided */
3273   0, /* properties_destroyed */
3274   0, /* todo_flags_start */
3275   TODO_update_ssa, /* todo_flags_finish */
3276 };
3277
3278 class pass_optimize_widening_mul : public gimple_opt_pass
3279 {
3280 public:
3281   pass_optimize_widening_mul (gcc::context *ctxt)
3282     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3283   {}
3284
3285   /* opt_pass methods: */
3286   virtual bool gate (function *)
3287     {
3288       return flag_expensive_optimizations && optimize;
3289     }
3290
3291   virtual unsigned int execute (function *);
3292
3293 }; // class pass_optimize_widening_mul
3294
3295 unsigned int
3296 pass_optimize_widening_mul::execute (function *fun)
3297 {
3298   basic_block bb;
3299   bool cfg_changed = false;
3300
3301   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3302
3303   FOR_EACH_BB_FN (bb, fun)
3304     {
3305       gimple_stmt_iterator gsi;
3306
3307       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3308         {
3309           gimple stmt = gsi_stmt (gsi);
3310           enum tree_code code;
3311
3312           if (is_gimple_assign (stmt))
3313             {
3314               code = gimple_assign_rhs_code (stmt);
3315               switch (code)
3316                 {
3317                 case MULT_EXPR:
3318                   if (!convert_mult_to_widen (stmt, &gsi)
3319                       && convert_mult_to_fma (stmt,
3320                                               gimple_assign_rhs1 (stmt),
3321                                               gimple_assign_rhs2 (stmt)))
3322                     {
3323                       gsi_remove (&gsi, true);
3324                       release_defs (stmt);
3325                       continue;
3326                     }
3327                   break;
3328
3329                 case PLUS_EXPR:
3330                 case MINUS_EXPR:
3331                   convert_plusminus_to_widen (&gsi, stmt, code);
3332                   break;
3333
3334                 default:;
3335                 }
3336             }
3337           else if (is_gimple_call (stmt)
3338                    && gimple_call_lhs (stmt))
3339             {
3340               tree fndecl = gimple_call_fndecl (stmt);
3341               if (fndecl
3342                   && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
3343                 {
3344                   switch (DECL_FUNCTION_CODE (fndecl))
3345                     {
3346                       case BUILT_IN_POWF:
3347                       case BUILT_IN_POW:
3348                       case BUILT_IN_POWL:
3349                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3350                             && REAL_VALUES_EQUAL
3351                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3352                                   dconst2)
3353                             && convert_mult_to_fma (stmt,
3354                                                     gimple_call_arg (stmt, 0),
3355                                                     gimple_call_arg (stmt, 0)))
3356                           {
3357                             unlink_stmt_vdef (stmt);
3358                             if (gsi_remove (&gsi, true)
3359                                 && gimple_purge_dead_eh_edges (bb))
3360                               cfg_changed = true;
3361                             release_defs (stmt);
3362                             continue;
3363                           }
3364                           break;
3365
3366                       default:;
3367                     }
3368                 }
3369             }
3370           gsi_next (&gsi);
3371         }
3372     }
3373
3374   statistics_counter_event (fun, "widening multiplications inserted",
3375                             widen_mul_stats.widen_mults_inserted);
3376   statistics_counter_event (fun, "widening maccs inserted",
3377                             widen_mul_stats.maccs_inserted);
3378   statistics_counter_event (fun, "fused multiply-adds inserted",
3379                             widen_mul_stats.fmas_inserted);
3380
3381   return cfg_changed ? TODO_cleanup_cfg : 0;
3382 }
3383
3384 } // anon namespace
3385
3386 gimple_opt_pass *
3387 make_pass_optimize_widening_mul (gcc::context *ctxt)
3388 {
3389   return new pass_optimize_widening_mul (ctxt);
3390 }