Import OpenSSL-1.0.0f.
[dragonfly.git] / crypto / openssl / crypto / bn / asm / x86-mont.pl
CommitLineData
5febbddd
PA
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# This is a "teaser" code, as it can be improved in several ways...
13# First of all non-SSE2 path should be implemented (yes, for now it
14# performs Montgomery multiplication/convolution only on SSE2-capable
15# CPUs such as P4, others fall down to original code). Then inner loop
16# can be unrolled and modulo-scheduled to improve ILP and possibly
17# moved to 128-bit XMM register bank (though it would require input
18# rearrangement and/or increase bus bandwidth utilization). Dedicated
19# squaring procedure should give further performance improvement...
20# Yet, for being draft, the code improves rsa512 *sign* benchmark by
21# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23# December 2006
24#
25# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26# Integer-only code [being equipped with dedicated squaring procedure]
27# gives ~40% on rsa512 sign benchmark...
28
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30push(@INC,"${dir}","${dir}../../perlasm");
31require "x86asm.pl";
32
33&asm_init($ARGV[0],$0);
34
35$sse2=0;
36for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37
38&external_label("OPENSSL_ia32cap_P") if ($sse2);
39
40&function_begin("bn_mul_mont");
41
42$i="edx";
43$j="ecx";
44$ap="esi"; $tp="esi"; # overlapping variables!!!
45$rp="edi"; $bp="edi"; # overlapping variables!!!
46$np="ebp";
47$num="ebx";
48
49$_num=&DWP(4*0,"esp"); # stack top layout
50$_rp=&DWP(4*1,"esp");
51$_ap=&DWP(4*2,"esp");
52$_bp=&DWP(4*3,"esp");
53$_np=&DWP(4*4,"esp");
54$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
55$_sp=&DWP(4*6,"esp");
56$_bpend=&DWP(4*7,"esp");
57$frame=32; # size of above frame rounded up to 16n
58
59 &xor ("eax","eax");
60 &mov ("edi",&wparam(5)); # int num
61 &cmp ("edi",4);
62 &jl (&label("just_leave"));
63
64 &lea ("esi",&wparam(0)); # put aside pointer to argument block
65 &lea ("edx",&wparam(1)); # load ap
66 &mov ("ebp","esp"); # saved stack pointer!
67 &add ("edi",2); # extra two words on top of tp
68 &neg ("edi");
69 &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
70 &neg ("edi");
71
72 # minimize cache contention by arraning 2K window between stack
73 # pointer and ap argument [np is also position sensitive vector,
74 # but it's assumed to be near ap, as it's allocated at ~same
75 # time].
76 &mov ("eax","esp");
77 &sub ("eax","edx");
78 &and ("eax",2047);
79 &sub ("esp","eax"); # this aligns sp and ap modulo 2048
80
81 &xor ("edx","esp");
82 &and ("edx",2048);
83 &xor ("edx",2048);
84 &sub ("esp","edx"); # this splits them apart modulo 4096
85
86 &and ("esp",-64); # align to cache line
87
88 ################################# load argument block...
89 &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90 &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91 &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92 &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
93 &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94 #&mov ("edi",&DWP(5*4,"esi"));# int num
95
96 &mov ("esi",&DWP(0,"esi")); # pull n0[0]
97 &mov ($_rp,"eax"); # ... save a copy of argument block
98 &mov ($_ap,"ebx");
99 &mov ($_bp,"ecx");
100 &mov ($_np,"edx");
101 &mov ($_n0,"esi");
102 &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
103 #&mov ($_num,$num); # redundant as $num is not reused
104 &mov ($_sp,"ebp"); # saved stack pointer!
105\f
106if($sse2) {
107$acc0="mm0"; # mmx register bank layout
108$acc1="mm1";
109$car0="mm2";
110$car1="mm3";
111$mul0="mm4";
112$mul1="mm5";
113$temp="mm6";
114$mask="mm7";
115
116 &picmeup("eax","OPENSSL_ia32cap_P");
117 &bt (&DWP(0,"eax"),26);
118 &jnc (&label("non_sse2"));
119
120 &mov ("eax",-1);
121 &movd ($mask,"eax"); # mask 32 lower bits
122
123 &mov ($ap,$_ap); # load input pointers
124 &mov ($bp,$_bp);
125 &mov ($np,$_np);
126
127 &xor ($i,$i); # i=0
128 &xor ($j,$j); # j=0
129
130 &movd ($mul0,&DWP(0,$bp)); # bp[0]
131 &movd ($mul1,&DWP(0,$ap)); # ap[0]
132 &movd ($car1,&DWP(0,$np)); # np[0]
133
134 &pmuludq($mul1,$mul0); # ap[0]*bp[0]
135 &movq ($car0,$mul1);
136 &movq ($acc0,$mul1); # I wish movd worked for
137 &pand ($acc0,$mask); # inter-register transfers
138
139 &pmuludq($mul1,$_n0q); # *=n0
140
141 &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
142 &paddq ($car1,$acc0);
143
144 &movd ($acc1,&DWP(4,$np)); # np[1]
145 &movd ($acc0,&DWP(4,$ap)); # ap[1]
146
147 &psrlq ($car0,32);
148 &psrlq ($car1,32);
149
150 &inc ($j); # j++
151&set_label("1st",16);
152 &pmuludq($acc0,$mul0); # ap[j]*bp[0]
153 &pmuludq($acc1,$mul1); # np[j]*m1
154 &paddq ($car0,$acc0); # +=c0
155 &paddq ($car1,$acc1); # +=c1
156
157 &movq ($acc0,$car0);
158 &pand ($acc0,$mask);
159 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
160 &paddq ($car1,$acc0); # +=ap[j]*bp[0];
161 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
162 &psrlq ($car0,32);
163 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
164 &psrlq ($car1,32);
165
166 &lea ($j,&DWP(1,$j));
167 &cmp ($j,$num);
168 &jl (&label("1st"));
169
170 &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
171 &pmuludq($acc1,$mul1); # np[num-1]*m1
172 &paddq ($car0,$acc0); # +=c0
173 &paddq ($car1,$acc1); # +=c1
174
175 &movq ($acc0,$car0);
176 &pand ($acc0,$mask);
177 &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
178 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
179
180 &psrlq ($car0,32);
181 &psrlq ($car1,32);
182
183 &paddq ($car1,$car0);
184 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
185\f
186 &inc ($i); # i++
187&set_label("outer");
188 &xor ($j,$j); # j=0
189
190 &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
191 &movd ($mul1,&DWP(0,$ap)); # ap[0]
192 &movd ($temp,&DWP($frame,"esp")); # tp[0]
193 &movd ($car1,&DWP(0,$np)); # np[0]
194 &pmuludq($mul1,$mul0); # ap[0]*bp[i]
195
196 &paddq ($mul1,$temp); # +=tp[0]
197 &movq ($acc0,$mul1);
198 &movq ($car0,$mul1);
199 &pand ($acc0,$mask);
200
201 &pmuludq($mul1,$_n0q); # *=n0
202
203 &pmuludq($car1,$mul1);
204 &paddq ($car1,$acc0);
205
206 &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
207 &movd ($acc1,&DWP(4,$np)); # np[1]
208 &movd ($acc0,&DWP(4,$ap)); # ap[1]
209
210 &psrlq ($car0,32);
211 &psrlq ($car1,32);
212 &paddq ($car0,$temp); # +=tp[1]
213
214 &inc ($j); # j++
215 &dec ($num);
216&set_label("inner");
217 &pmuludq($acc0,$mul0); # ap[j]*bp[i]
218 &pmuludq($acc1,$mul1); # np[j]*m1
219 &paddq ($car0,$acc0); # +=c0
220 &paddq ($car1,$acc1); # +=c1
221
222 &movq ($acc0,$car0);
223 &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
224 &pand ($acc0,$mask);
225 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
226 &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
227 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
228 &psrlq ($car0,32);
229 &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
230 &psrlq ($car1,32);
231 &paddq ($car0,$temp); # +=tp[j+1]
232
233 &dec ($num);
234 &lea ($j,&DWP(1,$j)); # j++
235 &jnz (&label("inner"));
236
237 &mov ($num,$j);
238 &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
239 &pmuludq($acc1,$mul1); # np[num-1]*m1
240 &paddq ($car0,$acc0); # +=c0
241 &paddq ($car1,$acc1); # +=c1
242
243 &movq ($acc0,$car0);
244 &pand ($acc0,$mask);
245 &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
246 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
247 &psrlq ($car0,32);
248 &psrlq ($car1,32);
249
250 &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
251 &paddq ($car1,$car0);
252 &paddq ($car1,$temp);
253 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
254
255 &lea ($i,&DWP(1,$i)); # i++
256 &cmp ($i,$num);
257 &jle (&label("outer"));
258
259 &emms (); # done with mmx bank
260 &jmp (&label("common_tail"));
261
262&set_label("non_sse2",16);
263}
264\f
265if (0) {
266 &mov ("esp",$_sp);
267 &xor ("eax","eax"); # signal "not fast enough [yet]"
268 &jmp (&label("just_leave"));
269 # While the below code provides competitive performance for
270 # all key lengthes on modern Intel cores, it's still more
271 # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
272 # means compared to the original integer-only assembler.
273 # 512-bit RSA sign is better by ~40%, but that's about all
274 # one can say about all CPUs...
275} else {
276$inp="esi"; # integer path uses these registers differently
277$word="edi";
278$carry="ebp";
279
280 &mov ($inp,$_ap);
281 &lea ($carry,&DWP(1,$num));
282 &mov ($word,$_bp);
283 &xor ($j,$j); # j=0
284 &mov ("edx",$inp);
285 &and ($carry,1); # see if num is even
286 &sub ("edx",$word); # see if ap==bp
287 &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
288 &or ($carry,"edx");
289 &mov ($word,&DWP(0,$word)); # bp[0]
290 &jz (&label("bn_sqr_mont"));
291 &mov ($_bpend,"eax");
292 &mov ("eax",&DWP(0,$inp));
293 &xor ("edx","edx");
294
295&set_label("mull",16);
296 &mov ($carry,"edx");
297 &mul ($word); # ap[j]*bp[0]
298 &add ($carry,"eax");
299 &lea ($j,&DWP(1,$j));
300 &adc ("edx",0);
301 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
302 &cmp ($j,$num);
303 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
304 &jl (&label("mull"));
305
306 &mov ($carry,"edx");
307 &mul ($word); # ap[num-1]*bp[0]
308 &mov ($word,$_n0);
309 &add ("eax",$carry);
310 &mov ($inp,$_np);
311 &adc ("edx",0);
312 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
313
314 &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
315 &xor ($j,$j);
316 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
317 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
318
319 &mov ("eax",&DWP(0,$inp)); # np[0]
320 &mul ($word); # np[0]*m
321 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
322 &mov ("eax",&DWP(4,$inp)); # np[1]
323 &adc ("edx",0);
324 &inc ($j);
325
326 &jmp (&label("2ndmadd"));
327\f\f
328&set_label("1stmadd",16);
329 &mov ($carry,"edx");
330 &mul ($word); # ap[j]*bp[i]
331 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
332 &lea ($j,&DWP(1,$j));
333 &adc ("edx",0);
334 &add ($carry,"eax");
335 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
336 &adc ("edx",0);
337 &cmp ($j,$num);
338 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
339 &jl (&label("1stmadd"));
340
341 &mov ($carry,"edx");
342 &mul ($word); # ap[num-1]*bp[i]
343 &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
344 &mov ($word,$_n0);
345 &adc ("edx",0);
346 &mov ($inp,$_np);
347 &add ($carry,"eax");
348 &adc ("edx",0);
349 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
350
351 &xor ($j,$j);
352 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
353 &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
354 &adc ($j,0);
355 &mov ("eax",&DWP(0,$inp)); # np[0]
356 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
357 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
358
359 &mul ($word); # np[0]*m
360 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
361 &mov ("eax",&DWP(4,$inp)); # np[1]
362 &adc ("edx",0);
363 &mov ($j,1);
364\f
365&set_label("2ndmadd",16);
366 &mov ($carry,"edx");
367 &mul ($word); # np[j]*m
368 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
369 &lea ($j,&DWP(1,$j));
370 &adc ("edx",0);
371 &add ($carry,"eax");
372 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
373 &adc ("edx",0);
374 &cmp ($j,$num);
375 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
376 &jl (&label("2ndmadd"));
377
378 &mov ($carry,"edx");
379 &mul ($word); # np[j]*m
380 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
381 &adc ("edx",0);
382 &add ($carry,"eax");
383 &adc ("edx",0);
384 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
385
386 &xor ("eax","eax");
387 &mov ($j,$_bp); # &bp[i]
388 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
389 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
390 &lea ($j,&DWP(4,$j));
391 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
392 &cmp ($j,$_bpend);
393 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
394 &je (&label("common_tail"));
395
396 &mov ($word,&DWP(0,$j)); # bp[i+1]
397 &mov ($inp,$_ap);
398 &mov ($_bp,$j); # &bp[++i]
399 &xor ($j,$j);
400 &xor ("edx","edx");
401 &mov ("eax",&DWP(0,$inp));
402 &jmp (&label("1stmadd"));
403\f
404&set_label("bn_sqr_mont",16);
405$sbit=$num;
406 &mov ($_num,$num);
407 &mov ($_bp,$j); # i=0
408
409 &mov ("eax",$word); # ap[0]
410 &mul ($word); # ap[0]*ap[0]
411 &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
412 &mov ($sbit,"edx");
413 &shr ("edx",1);
414 &and ($sbit,1);
415 &inc ($j);
416&set_label("sqr",16);
417 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
418 &mov ($carry,"edx");
419 &mul ($word); # ap[j]*ap[0]
420 &add ("eax",$carry);
421 &lea ($j,&DWP(1,$j));
422 &adc ("edx",0);
423 &lea ($carry,&DWP(0,$sbit,"eax",2));
424 &shr ("eax",31);
425 &cmp ($j,$_num);
426 &mov ($sbit,"eax");
427 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
428 &jl (&label("sqr"));
429
430 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
431 &mov ($carry,"edx");
432 &mul ($word); # ap[num-1]*ap[0]
433 &add ("eax",$carry);
434 &mov ($word,$_n0);
435 &adc ("edx",0);
436 &mov ($inp,$_np);
437 &lea ($carry,&DWP(0,$sbit,"eax",2));
438 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
439 &shr ("eax",31);
440 &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
441
442 &lea ($carry,&DWP(0,"eax","edx",2));
443 &mov ("eax",&DWP(0,$inp)); # np[0]
444 &shr ("edx",31);
445 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
446 &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
447
448 &mul ($word); # np[0]*m
449 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
450 &mov ($num,$j);
451 &adc ("edx",0);
452 &mov ("eax",&DWP(4,$inp)); # np[1]
453 &mov ($j,1);
454\f\f
455&set_label("3rdmadd",16);
456 &mov ($carry,"edx");
457 &mul ($word); # np[j]*m
458 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
459 &adc ("edx",0);
460 &add ($carry,"eax");
461 &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
462 &adc ("edx",0);
463 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
464
465 &mov ($carry,"edx");
466 &mul ($word); # np[j+1]*m
467 &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
468 &lea ($j,&DWP(2,$j));
469 &adc ("edx",0);
470 &add ($carry,"eax");
471 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
472 &adc ("edx",0);
473 &cmp ($j,$num);
474 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
475 &jl (&label("3rdmadd"));
476
477 &mov ($carry,"edx");
478 &mul ($word); # np[j]*m
479 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
480 &adc ("edx",0);
481 &add ($carry,"eax");
482 &adc ("edx",0);
483 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
484
485 &mov ($j,$_bp); # i
486 &xor ("eax","eax");
487 &mov ($inp,$_ap);
488 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
489 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
490 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
491 &cmp ($j,$num);
492 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
493 &je (&label("common_tail"));
494\f
495 &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
496 &lea ($j,&DWP(1,$j));
497 &mov ("eax",$word);
498 &mov ($_bp,$j); # ++i
499 &mul ($word); # ap[i]*ap[i]
500 &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
501 &adc ("edx",0);
502 &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
503 &xor ($carry,$carry);
504 &cmp ($j,$num);
505 &lea ($j,&DWP(1,$j));
506 &je (&label("sqrlast"));
507
508 &mov ($sbit,"edx"); # zaps $num
509 &shr ("edx",1);
510 &and ($sbit,1);
511&set_label("sqradd",16);
512 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
513 &mov ($carry,"edx");
514 &mul ($word); # ap[j]*ap[i]
515 &add ("eax",$carry);
516 &lea ($carry,&DWP(0,"eax","eax"));
517 &adc ("edx",0);
518 &shr ("eax",31);
519 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
520 &lea ($j,&DWP(1,$j));
521 &adc ("eax",0);
522 &add ($carry,$sbit);
523 &adc ("eax",0);
524 &cmp ($j,$_num);
525 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
526 &mov ($sbit,"eax");
527 &jle (&label("sqradd"));
528
529 &mov ($carry,"edx");
9e80567f 530 &add ("edx","edx");
5febbddd 531 &shr ($carry,31);
9e80567f
PA
532 &add ("edx",$sbit);
533 &adc ($carry,0);
5febbddd
PA
534&set_label("sqrlast");
535 &mov ($word,$_n0);
536 &mov ($inp,$_np);
537 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
538
539 &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
540 &mov ("eax",&DWP(0,$inp)); # np[0]
541 &adc ($carry,0);
542 &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
543 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
544
545 &mul ($word); # np[0]*m
546 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
547 &lea ($num,&DWP(-1,$j));
548 &adc ("edx",0);
549 &mov ($j,1);
550 &mov ("eax",&DWP(4,$inp)); # np[1]
551
552 &jmp (&label("3rdmadd"));
553}
554\f
555&set_label("common_tail",16);
556 &mov ($np,$_np); # load modulus pointer
557 &mov ($rp,$_rp); # load result pointer
558 &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
559
560 &mov ("eax",&DWP(0,$tp)); # tp[0]
561 &mov ($j,$num); # j=num-1
562 &xor ($i,$i); # i=0 and clear CF!
563
564&set_label("sub",16);
565 &sbb ("eax",&DWP(0,$np,$i,4));
566 &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
567 &dec ($j); # doesn't affect CF!
568 &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
569 &lea ($i,&DWP(1,$i)); # i++
570 &jge (&label("sub"));
571
572 &sbb ("eax",0); # handle upmost overflow bit
573 &and ($tp,"eax");
574 &not ("eax");
575 &mov ($np,$rp);
576 &and ($np,"eax");
577 &or ($tp,$np); # tp=carry?tp:rp
578
579&set_label("copy",16); # copy or in-place refresh
580 &mov ("eax",&DWP(0,$tp,$num,4));
581 &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
582 &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
583 &dec ($num);
584 &jge (&label("copy"));
585
586 &mov ("esp",$_sp); # pull saved stack pointer
587 &mov ("eax",1);
588&set_label("just_leave");
589&function_end("bn_mul_mont");
590
591&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
592
593&asm_finish();