Import OpenSSL-1.0.1.
[dragonfly.git] / crypto / openssl / crypto / aes / asm / bsaes-x86_64.pl
CommitLineData
672590bc
PA
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as transliteration to "perlasm" the original code has
18# undergone following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - above was possibile thanks to mixcolumns() modification that
24# allowed to feed its output back to aesenc[last], this was
25# achieved at cost of two additional inter-registers moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28# relies on conversion of "conventional" key schedule as returned
29# by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31# to skip one shiftrows(), reduce bit-sliced key schedule and
32# speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) Comparison is not completely fair, because "this" is ECB,
45# i.e. no extra processing such as counter values calculation
46# and xor-ing input as in Emilia's CTR implementation is
47# performed. However, the CTR calculations stand for not more
48# than 1% of total time, so comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***) Slowdown on Atom is rather strange per se, because original
54# implementation has a number of 9+-bytes instructions, which
55# are bad for Atom front-end, and which I eliminated completely.
56# In attempt to address deterioration sbox() was tested in FP
57# SIMD "domain" (movaps instead of movdqa, xorps instead of
58# pxor, etc.). While it resulted in nominal 4% improvement on
59# Atom, it hurted Westmere by more than 2x factor.
60#
61# As for key schedule conversion subroutine. Interface to OpenSSL
62# relies on per-invocation on-the-fly conversion. This naturally
63# has impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# conversion conversion/8x block
68# Core 2 410 0.37
69# Nehalem 310 0.35
70# Atom 570 0.26
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 11.0
87# Nehalem 9.16
88#
89# November 2011.
90#
91# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
92# suboptimal, but XTS is meant to be used with larger blocks...
93#
94# <appro@openssl.org>
95
96$flavour = shift;
97$output = shift;
98if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
99
100$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
101
102$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
103( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
104( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
105die "can't locate x86_64-xlate.pl";
106
107open STDOUT,"| $^X $xlate $flavour $output";
108
109my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
110my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
111my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
112
113{
114my ($key,$rounds,$const)=("%rax","%r10d","%r11");
115
116sub Sbox {
117# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
118# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
119my @b=@_[0..7];
120my @t=@_[8..11];
121my @s=@_[12..15];
122 &InBasisChange (@b);
123 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
124 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
125}
126
127sub InBasisChange {
128# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
129# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
130my @b=@_[0..7];
131$code.=<<___;
132 pxor @b[6], @b[5]
133 pxor @b[1], @b[2]
134 pxor @b[0], @b[3]
135 pxor @b[2], @b[6]
136 pxor @b[0], @b[5]
137
138 pxor @b[3], @b[6]
139 pxor @b[7], @b[3]
140 pxor @b[5], @b[7]
141 pxor @b[4], @b[3]
142 pxor @b[5], @b[4]
143 pxor @b[1], @b[3]
144
145 pxor @b[7], @b[2]
146 pxor @b[5], @b[1]
147___
148}
149
150sub OutBasisChange {
151# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
152# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
153my @b=@_[0..7];
154$code.=<<___;
155 pxor @b[6], @b[0]
156 pxor @b[4], @b[1]
157 pxor @b[0], @b[2]
158 pxor @b[6], @b[4]
159 pxor @b[1], @b[6]
160
161 pxor @b[5], @b[1]
162 pxor @b[3], @b[5]
163 pxor @b[7], @b[3]
164 pxor @b[5], @b[7]
165 pxor @b[5], @b[2]
166
167 pxor @b[7], @b[4]
168___
169}
170
171sub InvSbox {
172# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
173# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
174my @b=@_[0..7];
175my @t=@_[8..11];
176my @s=@_[12..15];
177 &InvInBasisChange (@b);
178 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
179 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
180}
181
182sub InvInBasisChange { # OutBasisChange in reverse
183my @b=@_[5,1,2,6,3,7,0,4];
184$code.=<<___
185 pxor @b[7], @b[4]
186
187 pxor @b[5], @b[7]
188 pxor @b[5], @b[2]
189 pxor @b[7], @b[3]
190 pxor @b[3], @b[5]
191 pxor @b[5], @b[1]
192
193 pxor @b[1], @b[6]
194 pxor @b[0], @b[2]
195 pxor @b[6], @b[4]
196 pxor @b[6], @b[0]
197 pxor @b[4], @b[1]
198___
199}
200
201sub InvOutBasisChange { # InBasisChange in reverse
202my @b=@_[2,5,7,3,6,1,0,4];
203$code.=<<___;
204 pxor @b[5], @b[1]
205 pxor @b[7], @b[2]
206
207 pxor @b[1], @b[3]
208 pxor @b[5], @b[4]
209 pxor @b[5], @b[7]
210 pxor @b[4], @b[3]
211 pxor @b[0], @b[5]
212 pxor @b[7], @b[3]
213 pxor @b[2], @b[6]
214 pxor @b[1], @b[2]
215 pxor @b[3], @b[6]
216
217 pxor @b[0], @b[3]
218 pxor @b[6], @b[5]
219___
220}
221
222sub Mul_GF4 {
223#;*************************************************************
224#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
225#;*************************************************************
226my ($x0,$x1,$y0,$y1,$t0)=@_;
227$code.=<<___;
228 movdqa $y0, $t0
229 pxor $y1, $t0
230 pand $x0, $t0
231 pxor $x1, $x0
232 pand $y0, $x1
233 pand $y1, $x0
234 pxor $x1, $x0
235 pxor $t0, $x1
236___
237}
238
239sub Mul_GF4_N { # not used, see next subroutine
240# multiply and scale by N
241my ($x0,$x1,$y0,$y1,$t0)=@_;
242$code.=<<___;
243 movdqa $y0, $t0
244 pxor $y1, $t0
245 pand $x0, $t0
246 pxor $x1, $x0
247 pand $y0, $x1
248 pand $y1, $x0
249 pxor $x0, $x1
250 pxor $t0, $x0
251___
252}
253
254sub Mul_GF4_N_GF4 {
255# interleaved Mul_GF4_N and Mul_GF4
256my ($x0,$x1,$y0,$y1,$t0,
257 $x2,$x3,$y2,$y3,$t1)=@_;
258$code.=<<___;
259 movdqa $y0, $t0
260 movdqa $y2, $t1
261 pxor $y1, $t0
262 pxor $y3, $t1
263 pand $x0, $t0
264 pand $x2, $t1
265 pxor $x1, $x0
266 pxor $x3, $x2
267 pand $y0, $x1
268 pand $y2, $x3
269 pand $y1, $x0
270 pand $y3, $x2
271 pxor $x0, $x1
272 pxor $x3, $x2
273 pxor $t0, $x0
274 pxor $t1, $x3
275___
276}
277sub Mul_GF16_2 {
278my @x=@_[0..7];
279my @y=@_[8..11];
280my @t=@_[12..15];
281$code.=<<___;
282 movdqa @x[0], @t[0]
283 movdqa @x[1], @t[1]
284___
285 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
286$code.=<<___;
287 pxor @x[2], @t[0]
288 pxor @x[3], @t[1]
289 pxor @y[2], @y[0]
290 pxor @y[3], @y[1]
291___
292 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
293 @x[2], @x[3], @y[2], @y[3], @t[2]);
294$code.=<<___;
295 pxor @t[0], @x[0]
296 pxor @t[0], @x[2]
297 pxor @t[1], @x[1]
298 pxor @t[1], @x[3]
299
300 movdqa @x[4], @t[0]
301 movdqa @x[5], @t[1]
302 pxor @x[6], @t[0]
303 pxor @x[7], @t[1]
304___
305 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
306 @x[6], @x[7], @y[2], @y[3], @t[2]);
307$code.=<<___;
308 pxor @y[2], @y[0]
309 pxor @y[3], @y[1]
310___
311 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
312$code.=<<___;
313 pxor @t[0], @x[4]
314 pxor @t[0], @x[6]
315 pxor @t[1], @x[5]
316 pxor @t[1], @x[7]
317___
318}
319sub Inv_GF256 {
320#;********************************************************************
321#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
322#;********************************************************************
323my @x=@_[0..7];
324my @t=@_[8..11];
325my @s=@_[12..15];
326# direct optimizations from hardware
327$code.=<<___;
328 movdqa @x[4], @t[3]
329 movdqa @x[5], @t[2]
330 movdqa @x[1], @t[1]
331 movdqa @x[7], @s[1]
332 movdqa @x[0], @s[0]
333
334 pxor @x[6], @t[3]
335 pxor @x[7], @t[2]
336 pxor @x[3], @t[1]
337 movdqa @t[3], @s[2]
338 pxor @x[6], @s[1]
339 movdqa @t[2], @t[0]
340 pxor @x[2], @s[0]
341 movdqa @t[3], @s[3]
342
343 por @t[1], @t[2]
344 por @s[0], @t[3]
345 pxor @t[0], @s[3]
346 pand @s[0], @s[2]
347 pxor @t[1], @s[0]
348 pand @t[1], @t[0]
349 pand @s[0], @s[3]
350 movdqa @x[3], @s[0]
351 pxor @x[2], @s[0]
352 pand @s[0], @s[1]
353 pxor @s[1], @t[3]
354 pxor @s[1], @t[2]
355 movdqa @x[4], @s[1]
356 movdqa @x[1], @s[0]
357 pxor @x[5], @s[1]
358 pxor @x[0], @s[0]
359 movdqa @s[1], @t[1]
360 pand @s[0], @s[1]
361 por @s[0], @t[1]
362 pxor @s[1], @t[0]
363 pxor @s[3], @t[3]
364 pxor @s[2], @t[2]
365 pxor @s[3], @t[1]
366 movdqa @x[7], @s[0]
367 pxor @s[2], @t[0]
368 movdqa @x[6], @s[1]
369 pxor @s[2], @t[1]
370 movdqa @x[5], @s[2]
371 pand @x[3], @s[0]
372 movdqa @x[4], @s[3]
373 pand @x[2], @s[1]
374 pand @x[1], @s[2]
375 por @x[0], @s[3]
376 pxor @s[0], @t[3]
377 pxor @s[1], @t[2]
378 pxor @s[2], @t[1]
379 pxor @s[3], @t[0]
380
381 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
382
383 # new smaller inversion
384
385 movdqa @t[3], @s[0]
386 pand @t[1], @t[3]
387 pxor @t[2], @s[0]
388
389 movdqa @t[0], @s[2]
390 movdqa @s[0], @s[3]
391 pxor @t[3], @s[2]
392 pand @s[2], @s[3]
393
394 movdqa @t[1], @s[1]
395 pxor @t[2], @s[3]
396 pxor @t[0], @s[1]
397
398 pxor @t[2], @t[3]
399
400 pand @t[3], @s[1]
401
402 movdqa @s[2], @t[2]
403 pxor @t[0], @s[1]
404
405 pxor @s[1], @t[2]
406 pxor @s[1], @t[1]
407
408 pand @t[0], @t[2]
409
410 pxor @t[2], @s[2]
411 pxor @t[2], @t[1]
412
413 pand @s[3], @s[2]
414
415 pxor @s[0], @s[2]
416___
417# output in s3, s2, s1, t1
418
419# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
420
421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
422 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
423
424### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
425}
426
427# AES linear components
428
429sub ShiftRows {
430my @x=@_[0..7];
431my $mask=pop;
432$code.=<<___;
433 pxor 0x00($key),@x[0]
434 pxor 0x10($key),@x[1]
435 pshufb $mask,@x[0]
436 pxor 0x20($key),@x[2]
437 pshufb $mask,@x[1]
438 pxor 0x30($key),@x[3]
439 pshufb $mask,@x[2]
440 pxor 0x40($key),@x[4]
441 pshufb $mask,@x[3]
442 pxor 0x50($key),@x[5]
443 pshufb $mask,@x[4]
444 pxor 0x60($key),@x[6]
445 pshufb $mask,@x[5]
446 pxor 0x70($key),@x[7]
447 pshufb $mask,@x[6]
448 lea 0x80($key),$key
449 pshufb $mask,@x[7]
450___
451}
452
453sub MixColumns {
454# modified to emit output in order suitable for feeding back to aesenc[last]
455my @x=@_[0..7];
456my @t=@_[8..15];
457$code.=<<___;
458 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
459 pshufd \$0x93, @x[1], @t[1]
460 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
461 pshufd \$0x93, @x[2], @t[2]
462 pxor @t[1], @x[1]
463 pshufd \$0x93, @x[3], @t[3]
464 pxor @t[2], @x[2]
465 pshufd \$0x93, @x[4], @t[4]
466 pxor @t[3], @x[3]
467 pshufd \$0x93, @x[5], @t[5]
468 pxor @t[4], @x[4]
469 pshufd \$0x93, @x[6], @t[6]
470 pxor @t[5], @x[5]
471 pshufd \$0x93, @x[7], @t[7]
472 pxor @t[6], @x[6]
473 pxor @t[7], @x[7]
474
475 pxor @x[0], @t[1]
476 pxor @x[7], @t[0]
477 pxor @x[7], @t[1]
478 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
479 pxor @x[1], @t[2]
480 pshufd \$0x4E, @x[1], @x[1]
481 pxor @x[4], @t[5]
482 pxor @t[0], @x[0]
483 pxor @x[5], @t[6]
484 pxor @t[1], @x[1]
485 pxor @x[3], @t[4]
486 pshufd \$0x4E, @x[4], @t[0]
487 pxor @x[6], @t[7]
488 pshufd \$0x4E, @x[5], @t[1]
489 pxor @x[2], @t[3]
490 pshufd \$0x4E, @x[3], @x[4]
491 pxor @x[7], @t[3]
492 pshufd \$0x4E, @x[7], @x[5]
493 pxor @x[7], @t[4]
494 pshufd \$0x4E, @x[6], @x[3]
495 pxor @t[4], @t[0]
496 pshufd \$0x4E, @x[2], @x[6]
497 pxor @t[5], @t[1]
498
499 pxor @t[3], @x[4]
500 pxor @t[7], @x[5]
501 pxor @t[6], @x[3]
502 movdqa @t[0], @x[2]
503 pxor @t[2], @x[6]
504 movdqa @t[1], @x[7]
505___
506}
507
508sub InvMixColumns {
509my @x=@_[0..7];
510my @t=@_[8..15];
511
512$code.=<<___;
513 # multiplication by 0x0e
514 pshufd \$0x93, @x[7], @t[7]
515 movdqa @x[2], @t[2]
516 pxor @x[5], @x[7] # 7 5
517 pxor @x[5], @x[2] # 2 5
518 pshufd \$0x93, @x[0], @t[0]
519 movdqa @x[5], @t[5]
520 pxor @x[0], @x[5] # 5 0 [1]
521 pxor @x[1], @x[0] # 0 1
522 pshufd \$0x93, @x[1], @t[1]
523 pxor @x[2], @x[1] # 1 25
524 pxor @x[6], @x[0] # 01 6 [2]
525 pxor @x[3], @x[1] # 125 3 [4]
526 pshufd \$0x93, @x[3], @t[3]
527 pxor @x[0], @x[2] # 25 016 [3]
528 pxor @x[7], @x[3] # 3 75
529 pxor @x[6], @x[7] # 75 6 [0]
530 pshufd \$0x93, @x[6], @t[6]
531 movdqa @x[4], @t[4]
532 pxor @x[4], @x[6] # 6 4
533 pxor @x[3], @x[4] # 4 375 [6]
534 pxor @x[7], @x[3] # 375 756=36
535 pxor @t[5], @x[6] # 64 5 [7]
536 pxor @t[2], @x[3] # 36 2
537 pxor @t[4], @x[3] # 362 4 [5]
538 pshufd \$0x93, @t[5], @t[5]
539___
540 my @y = @x[7,5,0,2,1,3,4,6];
541$code.=<<___;
542 # multiplication by 0x0b
543 pxor @y[0], @y[1]
544 pxor @t[0], @y[0]
545 pxor @t[1], @y[1]
546 pshufd \$0x93, @t[2], @t[2]
547 pxor @t[5], @y[0]
548 pxor @t[6], @y[1]
549 pxor @t[7], @y[0]
550 pshufd \$0x93, @t[4], @t[4]
551 pxor @t[6], @t[7] # clobber t[7]
552 pxor @y[0], @y[1]
553
554 pxor @t[0], @y[3]
555 pshufd \$0x93, @t[0], @t[0]
556 pxor @t[1], @y[2]
557 pxor @t[1], @y[4]
558 pxor @t[2], @y[2]
559 pshufd \$0x93, @t[1], @t[1]
560 pxor @t[2], @y[3]
561 pxor @t[2], @y[5]
562 pxor @t[7], @y[2]
563 pshufd \$0x93, @t[2], @t[2]
564 pxor @t[3], @y[3]
565 pxor @t[3], @y[6]
566 pxor @t[3], @y[4]
567 pshufd \$0x93, @t[3], @t[3]
568 pxor @t[4], @y[7]
569 pxor @t[4], @y[5]
570 pxor @t[7], @y[7]
571 pxor @t[5], @y[3]
572 pxor @t[4], @y[4]
573 pxor @t[5], @t[7] # clobber t[7] even more
574
575 pxor @t[7], @y[5]
576 pshufd \$0x93, @t[4], @t[4]
577 pxor @t[7], @y[6]
578 pxor @t[7], @y[4]
579
580 pxor @t[5], @t[7]
581 pshufd \$0x93, @t[5], @t[5]
582 pxor @t[6], @t[7] # restore t[7]
583
584 # multiplication by 0x0d
585 pxor @y[7], @y[4]
586 pxor @t[4], @y[7]
587 pshufd \$0x93, @t[6], @t[6]
588 pxor @t[0], @y[2]
589 pxor @t[5], @y[7]
590 pxor @t[2], @y[2]
591 pshufd \$0x93, @t[7], @t[7]
592
593 pxor @y[1], @y[3]
594 pxor @t[1], @y[1]
595 pxor @t[0], @y[0]
596 pxor @t[0], @y[3]
597 pxor @t[5], @y[1]
598 pxor @t[5], @y[0]
599 pxor @t[7], @y[1]
600 pshufd \$0x93, @t[0], @t[0]
601 pxor @t[6], @y[0]
602 pxor @y[1], @y[3]
603 pxor @t[1], @y[4]
604 pshufd \$0x93, @t[1], @t[1]
605
606 pxor @t[7], @y[7]
607 pxor @t[2], @y[4]
608 pxor @t[2], @y[5]
609 pshufd \$0x93, @t[2], @t[2]
610 pxor @t[6], @y[2]
611 pxor @t[3], @t[6] # clobber t[6]
612 pxor @y[7], @y[4]
613 pxor @t[6], @y[3]
614
615 pxor @t[6], @y[6]
616 pxor @t[5], @y[5]
617 pxor @t[4], @y[6]
618 pshufd \$0x93, @t[4], @t[4]
619 pxor @t[6], @y[5]
620 pxor @t[7], @y[6]
621 pxor @t[3], @t[6] # restore t[6]
622
623 pshufd \$0x93, @t[5], @t[5]
624 pshufd \$0x93, @t[6], @t[6]
625 pshufd \$0x93, @t[7], @t[7]
626 pshufd \$0x93, @t[3], @t[3]
627
628 # multiplication by 0x09
629 pxor @y[1], @y[4]
630 pxor @y[1], @t[1] # t[1]=y[1]
631 pxor @t[5], @t[0] # clobber t[0]
632 pxor @t[5], @t[1]
633 pxor @t[0], @y[3]
634 pxor @y[0], @t[0] # t[0]=y[0]
635 pxor @t[6], @t[1]
636 pxor @t[7], @t[6] # clobber t[6]
637 pxor @t[1], @y[4]
638 pxor @t[4], @y[7]
639 pxor @y[4], @t[4] # t[4]=y[4]
640 pxor @t[3], @y[6]
641 pxor @y[3], @t[3] # t[3]=y[3]
642 pxor @t[2], @y[5]
643 pxor @y[2], @t[2] # t[2]=y[2]
644 pxor @t[7], @t[3]
645 pxor @y[5], @t[5] # t[5]=y[5]
646 pxor @t[6], @t[2]
647 pxor @t[6], @t[5]
648 pxor @y[6], @t[6] # t[6]=y[6]
649 pxor @y[7], @t[7] # t[7]=y[7]
650
651 movdqa @t[0],@XMM[0]
652 movdqa @t[1],@XMM[1]
653 movdqa @t[2],@XMM[2]
654 movdqa @t[3],@XMM[3]
655 movdqa @t[4],@XMM[4]
656 movdqa @t[5],@XMM[5]
657 movdqa @t[6],@XMM[6]
658 movdqa @t[7],@XMM[7]
659___
660}
661
662sub aesenc { # not used
663my @b=@_[0..7];
664my @t=@_[8..15];
665$code.=<<___;
666 movdqa 0x30($const),@t[0] # .LSR
667___
668 &ShiftRows (@b,@t[0]);
669 &Sbox (@b,@t);
670 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
671}
672
673sub aesenclast { # not used
674my @b=@_[0..7];
675my @t=@_[8..15];
676$code.=<<___;
677 movdqa 0x40($const),@t[0] # .LSRM0
678___
679 &ShiftRows (@b,@t[0]);
680 &Sbox (@b,@t);
681$code.=<<___
682 pxor 0x00($key),@b[0]
683 pxor 0x10($key),@b[1]
684 pxor 0x20($key),@b[4]
685 pxor 0x30($key),@b[6]
686 pxor 0x40($key),@b[3]
687 pxor 0x50($key),@b[7]
688 pxor 0x60($key),@b[2]
689 pxor 0x70($key),@b[5]
690___
691}
692
693sub swapmove {
694my ($a,$b,$n,$mask,$t)=@_;
695$code.=<<___;
696 movdqa $b,$t
697 psrlq \$$n,$b
698 pxor $a,$b
699 pand $mask,$b
700 pxor $b,$a
701 psllq \$$n,$b
702 pxor $t,$b
703___
704}
705sub swapmove2x {
706my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
707$code.=<<___;
708 movdqa $b0,$t0
709 psrlq \$$n,$b0
710 movdqa $b1,$t1
711 psrlq \$$n,$b1
712 pxor $a0,$b0
713 pxor $a1,$b1
714 pand $mask,$b0
715 pand $mask,$b1
716 pxor $b0,$a0
717 psllq \$$n,$b0
718 pxor $b1,$a1
719 psllq \$$n,$b1
720 pxor $t0,$b0
721 pxor $t1,$b1
722___
723}
724
725sub bitslice {
726my @x=reverse(@_[0..7]);
727my ($t0,$t1,$t2,$t3)=@_[8..11];
728$code.=<<___;
729 movdqa 0x00($const),$t0 # .LBS0
730 movdqa 0x10($const),$t1 # .LBS1
731___
732 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
733 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
734$code.=<<___;
735 movdqa 0x20($const),$t0 # .LBS2
736___
737 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
738 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
739
740 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
741 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
742}
743
744$code.=<<___;
745.text
746
747.extern asm_AES_encrypt
748.extern asm_AES_decrypt
749
750.type _bsaes_encrypt8,\@abi-omnipotent
751.align 64
752_bsaes_encrypt8:
753 lea .LBS0(%rip), $const # constants table
754
755 movdqa ($key), @XMM[9] # round 0 key
756 lea 0x10($key), $key
757 movdqa 0x60($const), @XMM[8] # .LM0SR
758 pxor @XMM[9], @XMM[0] # xor with round0 key
759 pxor @XMM[9], @XMM[1]
760 pshufb @XMM[8], @XMM[0]
761 pxor @XMM[9], @XMM[2]
762 pshufb @XMM[8], @XMM[1]
763 pxor @XMM[9], @XMM[3]
764 pshufb @XMM[8], @XMM[2]
765 pxor @XMM[9], @XMM[4]
766 pshufb @XMM[8], @XMM[3]
767 pxor @XMM[9], @XMM[5]
768 pshufb @XMM[8], @XMM[4]
769 pxor @XMM[9], @XMM[6]
770 pshufb @XMM[8], @XMM[5]
771 pxor @XMM[9], @XMM[7]
772 pshufb @XMM[8], @XMM[6]
773 pshufb @XMM[8], @XMM[7]
774_bsaes_encrypt8_bitslice:
775___
776 &bitslice (@XMM[0..7, 8..11]);
777$code.=<<___;
778 dec $rounds
779 jmp .Lenc_sbox
780.align 16
781.Lenc_loop:
782___
783 &ShiftRows (@XMM[0..7, 8]);
784$code.=".Lenc_sbox:\n";
785 &Sbox (@XMM[0..7, 8..15]);
786$code.=<<___;
787 dec $rounds
788 jl .Lenc_done
789___
790 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
791$code.=<<___;
792 movdqa 0x30($const), @XMM[8] # .LSR
793 jnz .Lenc_loop
794 movdqa 0x40($const), @XMM[8] # .LSRM0
795 jmp .Lenc_loop
796.align 16
797.Lenc_done:
798___
799 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
800 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
801$code.=<<___;
802 movdqa ($key), @XMM[8] # last round key
803 pxor @XMM[8], @XMM[4]
804 pxor @XMM[8], @XMM[6]
805 pxor @XMM[8], @XMM[3]
806 pxor @XMM[8], @XMM[7]
807 pxor @XMM[8], @XMM[2]
808 pxor @XMM[8], @XMM[5]
809 pxor @XMM[8], @XMM[0]
810 pxor @XMM[8], @XMM[1]
811 ret
812.size _bsaes_encrypt8,.-_bsaes_encrypt8
813
814.type _bsaes_decrypt8,\@abi-omnipotent
815.align 64
816_bsaes_decrypt8:
817 lea .LBS0(%rip), $const # constants table
818
819 movdqa ($key), @XMM[9] # round 0 key
820 lea 0x10($key), $key
821 movdqa -0x30($const), @XMM[8] # .LM0ISR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pshufb @XMM[8], @XMM[0]
825 pxor @XMM[9], @XMM[2]
826 pshufb @XMM[8], @XMM[1]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[2]
829 pxor @XMM[9], @XMM[4]
830 pshufb @XMM[8], @XMM[3]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[4]
833 pxor @XMM[9], @XMM[6]
834 pshufb @XMM[8], @XMM[5]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
838___
839 &bitslice (@XMM[0..7, 8..11]);
840$code.=<<___;
841 dec $rounds
842 jmp .Ldec_sbox
843.align 16
844.Ldec_loop:
845___
846 &ShiftRows (@XMM[0..7, 8]);
847$code.=".Ldec_sbox:\n";
848 &InvSbox (@XMM[0..7, 8..15]);
849$code.=<<___;
850 dec $rounds
851 jl .Ldec_done
852___
853 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
854$code.=<<___;
855 movdqa -0x10($const), @XMM[8] # .LISR
856 jnz .Ldec_loop
857 movdqa -0x20($const), @XMM[8] # .LISRM0
858 jmp .Ldec_loop
859.align 16
860.Ldec_done:
861___
862 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
863$code.=<<___;
864 movdqa ($key), @XMM[8] # last round key
865 pxor @XMM[8], @XMM[6]
866 pxor @XMM[8], @XMM[4]
867 pxor @XMM[8], @XMM[2]
868 pxor @XMM[8], @XMM[7]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[5]
871 pxor @XMM[8], @XMM[0]
872 pxor @XMM[8], @XMM[1]
873 ret
874.size _bsaes_decrypt8,.-_bsaes_decrypt8
875___
876}
877{
878my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
879
880sub bitslice_key {
881my @x=reverse(@_[0..7]);
882my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
883
884 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
885$code.=<<___;
886 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
887 movdqa @x[0], @x[2]
888 movdqa @x[1], @x[3]
889___
890 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
891
892 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
893$code.=<<___;
894 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
895 movdqa @x[0], @x[4]
896 movdqa @x[2], @x[6]
897 movdqa @x[1], @x[5]
898 movdqa @x[3], @x[7]
899___
900 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
901 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
902}
903
904$code.=<<___;
905.type _bsaes_key_convert,\@abi-omnipotent
906.align 16
907_bsaes_key_convert:
908 lea .LBS1(%rip), $const
909 movdqu ($inp), %xmm7 # load round 0 key
910 movdqa -0x10($const), %xmm8 # .LBS0
911 movdqa 0x00($const), %xmm9 # .LBS1
912 movdqa 0x10($const), %xmm10 # .LBS2
913 movdqa 0x40($const), %xmm13 # .LM0
914 movdqa 0x60($const), %xmm14 # .LNOT
915
916 movdqu 0x10($inp), %xmm6 # load round 1 key
917 lea 0x10($inp), $inp
918 movdqa %xmm7, ($out) # save round 0 key
919 lea 0x10($out), $out
920 dec $rounds
921 jmp .Lkey_loop
922.align 16
923.Lkey_loop:
924 pshufb %xmm13, %xmm6 # .LM0
925 movdqa %xmm6, %xmm7
926___
927 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
928$code.=<<___;
929 pxor %xmm14, %xmm5 # "pnot"
930 pxor %xmm14, %xmm6
931 pxor %xmm14, %xmm0
932 pxor %xmm14, %xmm1
933 lea 0x10($inp), $inp
934 movdqa %xmm0, 0x00($out) # write bit-sliced round key
935 movdqa %xmm1, 0x10($out)
936 movdqa %xmm2, 0x20($out)
937 movdqa %xmm3, 0x30($out)
938 movdqa %xmm4, 0x40($out)
939 movdqa %xmm5, 0x50($out)
940 movdqa %xmm6, 0x60($out)
941 movdqa %xmm7, 0x70($out)
942 lea 0x80($out),$out
943 movdqu ($inp), %xmm6 # load next round key
944 dec $rounds
945 jnz .Lkey_loop
946
947 movdqa 0x70($const), %xmm7 # .L63
948 #movdqa %xmm6, ($out) # don't save last round key
949 ret
950.size _bsaes_key_convert,.-_bsaes_key_convert
951___
952}
953
954if (0 && !$win64) { # following four functions are unsupported interface
955 # used for benchmarking...
956$code.=<<___;
957.globl bsaes_enc_key_convert
958.type bsaes_enc_key_convert,\@function,2
959.align 16
960bsaes_enc_key_convert:
961 mov 240($inp),%r10d # pass rounds
962 mov $inp,%rcx # pass key
963 mov $out,%rax # pass key schedule
964 call _bsaes_key_convert
965 pxor %xmm6,%xmm7 # fix up last round key
966 movdqa %xmm7,(%rax) # save last round key
967 ret
968.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
969
970.globl bsaes_encrypt_128
971.type bsaes_encrypt_128,\@function,4
972.align 16
973bsaes_encrypt_128:
974.Lenc128_loop:
975 movdqu 0x00($inp), @XMM[0] # load input
976 movdqu 0x10($inp), @XMM[1]
977 movdqu 0x20($inp), @XMM[2]
978 movdqu 0x30($inp), @XMM[3]
979 movdqu 0x40($inp), @XMM[4]
980 movdqu 0x50($inp), @XMM[5]
981 movdqu 0x60($inp), @XMM[6]
982 movdqu 0x70($inp), @XMM[7]
983 mov $key, %rax # pass the $key
984 lea 0x80($inp), $inp
985 mov \$10,%r10d
986
987 call _bsaes_encrypt8
988
989 movdqu @XMM[0], 0x00($out) # write output
990 movdqu @XMM[1], 0x10($out)
991 movdqu @XMM[4], 0x20($out)
992 movdqu @XMM[6], 0x30($out)
993 movdqu @XMM[3], 0x40($out)
994 movdqu @XMM[7], 0x50($out)
995 movdqu @XMM[2], 0x60($out)
996 movdqu @XMM[5], 0x70($out)
997 lea 0x80($out), $out
998 sub \$0x80,$len
999 ja .Lenc128_loop
1000 ret
1001.size bsaes_encrypt_128,.-bsaes_encrypt_128
1002
1003.globl bsaes_dec_key_convert
1004.type bsaes_dec_key_convert,\@function,2
1005.align 16
1006bsaes_dec_key_convert:
1007 mov 240($inp),%r10d # pass rounds
1008 mov $inp,%rcx # pass key
1009 mov $out,%rax # pass key schedule
1010 call _bsaes_key_convert
1011 pxor ($out),%xmm7 # fix up round 0 key
1012 movdqa %xmm6,(%rax) # save last round key
1013 movdqa %xmm7,($out)
1014 ret
1015.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1016
1017.globl bsaes_decrypt_128
1018.type bsaes_decrypt_128,\@function,4
1019.align 16
1020bsaes_decrypt_128:
1021.Ldec128_loop:
1022 movdqu 0x00($inp), @XMM[0] # load input
1023 movdqu 0x10($inp), @XMM[1]
1024 movdqu 0x20($inp), @XMM[2]
1025 movdqu 0x30($inp), @XMM[3]
1026 movdqu 0x40($inp), @XMM[4]
1027 movdqu 0x50($inp), @XMM[5]
1028 movdqu 0x60($inp), @XMM[6]
1029 movdqu 0x70($inp), @XMM[7]
1030 mov $key, %rax # pass the $key
1031 lea 0x80($inp), $inp
1032 mov \$10,%r10d
1033
1034 call _bsaes_decrypt8
1035
1036 movdqu @XMM[0], 0x00($out) # write output
1037 movdqu @XMM[1], 0x10($out)
1038 movdqu @XMM[6], 0x20($out)
1039 movdqu @XMM[4], 0x30($out)
1040 movdqu @XMM[2], 0x40($out)
1041 movdqu @XMM[7], 0x50($out)
1042 movdqu @XMM[3], 0x60($out)
1043 movdqu @XMM[5], 0x70($out)
1044 lea 0x80($out), $out
1045 sub \$0x80,$len
1046 ja .Ldec128_loop
1047 ret
1048.size bsaes_decrypt_128,.-bsaes_decrypt_128
1049___
1050}
1051{
1052######################################################################
1053#
1054# OpenSSL interface
1055#
1056my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1057 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1058my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1059
1060if ($ecb) {
1061$code.=<<___;
1062.globl bsaes_ecb_encrypt_blocks
1063.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1064.align 16
1065bsaes_ecb_encrypt_blocks:
1066 mov %rsp, %rax
1067.Lecb_enc_prologue:
1068 push %rbp
1069 push %rbx
1070 push %r12
1071 push %r13
1072 push %r14
1073 push %r15
1074 lea -0x48(%rsp),%rsp
1075___
1076$code.=<<___ if ($win64);
1077 lea -0xa0(%rsp), %rsp
1078 movaps %xmm6, 0x40(%rsp)
1079 movaps %xmm7, 0x50(%rsp)
1080 movaps %xmm8, 0x60(%rsp)
1081 movaps %xmm9, 0x70(%rsp)
1082 movaps %xmm10, 0x80(%rsp)
1083 movaps %xmm11, 0x90(%rsp)
1084 movaps %xmm12, 0xa0(%rsp)
1085 movaps %xmm13, 0xb0(%rsp)
1086 movaps %xmm14, 0xc0(%rsp)
1087 movaps %xmm15, 0xd0(%rsp)
1088.Lecb_enc_body:
1089___
1090$code.=<<___;
1091 mov %rsp,%rbp # backup %rsp
1092 mov 240($arg4),%eax # rounds
1093 mov $arg1,$inp # backup arguments
1094 mov $arg2,$out
1095 mov $arg3,$len
1096 mov $arg4,$key
1097 cmp \$8,$arg3
1098 jb .Lecb_enc_short
1099
1100 mov %eax,%ebx # backup rounds
1101 shl \$7,%rax # 128 bytes per inner round key
1102 sub \$`128-32`,%rax # size of bit-sliced key schedule
1103 sub %rax,%rsp
1104 mov %rsp,%rax # pass key schedule
1105 mov $key,%rcx # pass key
1106 mov %ebx,%r10d # pass rounds
1107 call _bsaes_key_convert
1108 pxor %xmm6,%xmm7 # fix up last round key
1109 movdqa %xmm7,(%rax) # save last round key
1110
1111 sub \$8,$len
1112.Lecb_enc_loop:
1113 movdqu 0x00($inp), @XMM[0] # load input
1114 movdqu 0x10($inp), @XMM[1]
1115 movdqu 0x20($inp), @XMM[2]
1116 movdqu 0x30($inp), @XMM[3]
1117 movdqu 0x40($inp), @XMM[4]
1118 movdqu 0x50($inp), @XMM[5]
1119 mov %rsp, %rax # pass key schedule
1120 movdqu 0x60($inp), @XMM[6]
1121 mov %ebx,%r10d # pass rounds
1122 movdqu 0x70($inp), @XMM[7]
1123 lea 0x80($inp), $inp
1124
1125 call _bsaes_encrypt8
1126
1127 movdqu @XMM[0], 0x00($out) # write output
1128 movdqu @XMM[1], 0x10($out)
1129 movdqu @XMM[4], 0x20($out)
1130 movdqu @XMM[6], 0x30($out)
1131 movdqu @XMM[3], 0x40($out)
1132 movdqu @XMM[7], 0x50($out)
1133 movdqu @XMM[2], 0x60($out)
1134 movdqu @XMM[5], 0x70($out)
1135 lea 0x80($out), $out
1136 sub \$8,$len
1137 jnc .Lecb_enc_loop
1138
1139 add \$8,$len
1140 jz .Lecb_enc_done
1141
1142 movdqu 0x00($inp), @XMM[0] # load input
1143 mov %rsp, %rax # pass key schedule
1144 mov %ebx,%r10d # pass rounds
1145 cmp \$2,$len
1146 jb .Lecb_enc_one
1147 movdqu 0x10($inp), @XMM[1]
1148 je .Lecb_enc_two
1149 movdqu 0x20($inp), @XMM[2]
1150 cmp \$4,$len
1151 jb .Lecb_enc_three
1152 movdqu 0x30($inp), @XMM[3]
1153 je .Lecb_enc_four
1154 movdqu 0x40($inp), @XMM[4]
1155 cmp \$6,$len
1156 jb .Lecb_enc_five
1157 movdqu 0x50($inp), @XMM[5]
1158 je .Lecb_enc_six
1159 movdqu 0x60($inp), @XMM[6]
1160 call _bsaes_encrypt8
1161 movdqu @XMM[0], 0x00($out) # write output
1162 movdqu @XMM[1], 0x10($out)
1163 movdqu @XMM[4], 0x20($out)
1164 movdqu @XMM[6], 0x30($out)
1165 movdqu @XMM[3], 0x40($out)
1166 movdqu @XMM[7], 0x50($out)
1167 movdqu @XMM[2], 0x60($out)
1168 jmp .Lecb_enc_done
1169.align 16
1170.Lecb_enc_six:
1171 call _bsaes_encrypt8
1172 movdqu @XMM[0], 0x00($out) # write output
1173 movdqu @XMM[1], 0x10($out)
1174 movdqu @XMM[4], 0x20($out)
1175 movdqu @XMM[6], 0x30($out)
1176 movdqu @XMM[3], 0x40($out)
1177 movdqu @XMM[7], 0x50($out)
1178 jmp .Lecb_enc_done
1179.align 16
1180.Lecb_enc_five:
1181 call _bsaes_encrypt8
1182 movdqu @XMM[0], 0x00($out) # write output
1183 movdqu @XMM[1], 0x10($out)
1184 movdqu @XMM[4], 0x20($out)
1185 movdqu @XMM[6], 0x30($out)
1186 movdqu @XMM[3], 0x40($out)
1187 jmp .Lecb_enc_done
1188.align 16
1189.Lecb_enc_four:
1190 call _bsaes_encrypt8
1191 movdqu @XMM[0], 0x00($out) # write output
1192 movdqu @XMM[1], 0x10($out)
1193 movdqu @XMM[4], 0x20($out)
1194 movdqu @XMM[6], 0x30($out)
1195 jmp .Lecb_enc_done
1196.align 16
1197.Lecb_enc_three:
1198 call _bsaes_encrypt8
1199 movdqu @XMM[0], 0x00($out) # write output
1200 movdqu @XMM[1], 0x10($out)
1201 movdqu @XMM[4], 0x20($out)
1202 jmp .Lecb_enc_done
1203.align 16
1204.Lecb_enc_two:
1205 call _bsaes_encrypt8
1206 movdqu @XMM[0], 0x00($out) # write output
1207 movdqu @XMM[1], 0x10($out)
1208 jmp .Lecb_enc_done
1209.align 16
1210.Lecb_enc_one:
1211 call _bsaes_encrypt8
1212 movdqu @XMM[0], 0x00($out) # write output
1213 jmp .Lecb_enc_done
1214.align 16
1215.Lecb_enc_short:
1216 lea ($inp), $arg1
1217 lea ($out), $arg2
1218 lea ($key), $arg3
1219 call asm_AES_encrypt
1220 lea 16($inp), $inp
1221 lea 16($out), $out
1222 dec $len
1223 jnz .Lecb_enc_short
1224
1225.Lecb_enc_done:
1226 lea (%rsp),%rax
1227 pxor %xmm0, %xmm0
1228.Lecb_enc_bzero: # wipe key schedule [if any]
1229 movdqa %xmm0, 0x00(%rax)
1230 movdqa %xmm0, 0x10(%rax)
1231 lea 0x20(%rax), %rax
1232 cmp %rax, %rbp
1233 jb .Lecb_enc_bzero
1234
1235 lea (%rbp),%rsp # restore %rsp
1236___
1237$code.=<<___ if ($win64);
1238 movaps 0x40(%rbp), %xmm6
1239 movaps 0x50(%rbp), %xmm7
1240 movaps 0x60(%rbp), %xmm8
1241 movaps 0x70(%rbp), %xmm9
1242 movaps 0x80(%rbp), %xmm10
1243 movaps 0x90(%rbp), %xmm11
1244 movaps 0xa0(%rbp), %xmm12
1245 movaps 0xb0(%rbp), %xmm13
1246 movaps 0xc0(%rbp), %xmm14
1247 movaps 0xd0(%rbp), %xmm15
1248 lea 0xa0(%rbp), %rsp
1249___
1250$code.=<<___;
1251 mov 0x48(%rsp), %r15
1252 mov 0x50(%rsp), %r14
1253 mov 0x58(%rsp), %r13
1254 mov 0x60(%rsp), %r12
1255 mov 0x68(%rsp), %rbx
1256 mov 0x70(%rsp), %rax
1257 lea 0x78(%rsp), %rsp
1258 mov %rax, %rbp
1259.Lecb_enc_epilogue:
1260 ret
1261.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1262
1263.globl bsaes_ecb_decrypt_blocks
1264.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1265.align 16
1266bsaes_ecb_decrypt_blocks:
1267 mov %rsp, %rax
1268.Lecb_dec_prologue:
1269 push %rbp
1270 push %rbx
1271 push %r12
1272 push %r13
1273 push %r14
1274 push %r15
1275 lea -0x48(%rsp),%rsp
1276___
1277$code.=<<___ if ($win64);
1278 lea -0xa0(%rsp), %rsp
1279 movaps %xmm6, 0x40(%rsp)
1280 movaps %xmm7, 0x50(%rsp)
1281 movaps %xmm8, 0x60(%rsp)
1282 movaps %xmm9, 0x70(%rsp)
1283 movaps %xmm10, 0x80(%rsp)
1284 movaps %xmm11, 0x90(%rsp)
1285 movaps %xmm12, 0xa0(%rsp)
1286 movaps %xmm13, 0xb0(%rsp)
1287 movaps %xmm14, 0xc0(%rsp)
1288 movaps %xmm15, 0xd0(%rsp)
1289.Lecb_dec_body:
1290___
1291$code.=<<___;
1292 mov %rsp,%rbp # backup %rsp
1293 mov 240($arg4),%eax # rounds
1294 mov $arg1,$inp # backup arguments
1295 mov $arg2,$out
1296 mov $arg3,$len
1297 mov $arg4,$key
1298 cmp \$8,$arg3
1299 jb .Lecb_dec_short
1300
1301 mov %eax,%ebx # backup rounds
1302 shl \$7,%rax # 128 bytes per inner round key
1303 sub \$`128-32`,%rax # size of bit-sliced key schedule
1304 sub %rax,%rsp
1305 mov %rsp,%rax # pass key schedule
1306 mov $key,%rcx # pass key
1307 mov %ebx,%r10d # pass rounds
1308 call _bsaes_key_convert
1309 pxor (%rsp),%xmm7 # fix up 0 round key
1310 movdqa %xmm6,(%rax) # save last round key
1311 movdqa %xmm7,(%rsp)
1312
1313 sub \$8,$len
1314.Lecb_dec_loop:
1315 movdqu 0x00($inp), @XMM[0] # load input
1316 movdqu 0x10($inp), @XMM[1]
1317 movdqu 0x20($inp), @XMM[2]
1318 movdqu 0x30($inp), @XMM[3]
1319 movdqu 0x40($inp), @XMM[4]
1320 movdqu 0x50($inp), @XMM[5]
1321 mov %rsp, %rax # pass key schedule
1322 movdqu 0x60($inp), @XMM[6]
1323 mov %ebx,%r10d # pass rounds
1324 movdqu 0x70($inp), @XMM[7]
1325 lea 0x80($inp), $inp
1326
1327 call _bsaes_decrypt8
1328
1329 movdqu @XMM[0], 0x00($out) # write output
1330 movdqu @XMM[1], 0x10($out)
1331 movdqu @XMM[6], 0x20($out)
1332 movdqu @XMM[4], 0x30($out)
1333 movdqu @XMM[2], 0x40($out)
1334 movdqu @XMM[7], 0x50($out)
1335 movdqu @XMM[3], 0x60($out)
1336 movdqu @XMM[5], 0x70($out)
1337 lea 0x80($out), $out
1338 sub \$8,$len
1339 jnc .Lecb_dec_loop
1340
1341 add \$8,$len
1342 jz .Lecb_dec_done
1343
1344 movdqu 0x00($inp), @XMM[0] # load input
1345 mov %rsp, %rax # pass key schedule
1346 mov %ebx,%r10d # pass rounds
1347 cmp \$2,$len
1348 jb .Lecb_dec_one
1349 movdqu 0x10($inp), @XMM[1]
1350 je .Lecb_dec_two
1351 movdqu 0x20($inp), @XMM[2]
1352 cmp \$4,$len
1353 jb .Lecb_dec_three
1354 movdqu 0x30($inp), @XMM[3]
1355 je .Lecb_dec_four
1356 movdqu 0x40($inp), @XMM[4]
1357 cmp \$6,$len
1358 jb .Lecb_dec_five
1359 movdqu 0x50($inp), @XMM[5]
1360 je .Lecb_dec_six
1361 movdqu 0x60($inp), @XMM[6]
1362 call _bsaes_decrypt8
1363 movdqu @XMM[0], 0x00($out) # write output
1364 movdqu @XMM[1], 0x10($out)
1365 movdqu @XMM[6], 0x20($out)
1366 movdqu @XMM[4], 0x30($out)
1367 movdqu @XMM[2], 0x40($out)
1368 movdqu @XMM[7], 0x50($out)
1369 movdqu @XMM[3], 0x60($out)
1370 jmp .Lecb_dec_done
1371.align 16
1372.Lecb_dec_six:
1373 call _bsaes_decrypt8
1374 movdqu @XMM[0], 0x00($out) # write output
1375 movdqu @XMM[1], 0x10($out)
1376 movdqu @XMM[6], 0x20($out)
1377 movdqu @XMM[4], 0x30($out)
1378 movdqu @XMM[2], 0x40($out)
1379 movdqu @XMM[7], 0x50($out)
1380 jmp .Lecb_dec_done
1381.align 16
1382.Lecb_dec_five:
1383 call _bsaes_decrypt8
1384 movdqu @XMM[0], 0x00($out) # write output
1385 movdqu @XMM[1], 0x10($out)
1386 movdqu @XMM[6], 0x20($out)
1387 movdqu @XMM[4], 0x30($out)
1388 movdqu @XMM[2], 0x40($out)
1389 jmp .Lecb_dec_done
1390.align 16
1391.Lecb_dec_four:
1392 call _bsaes_decrypt8
1393 movdqu @XMM[0], 0x00($out) # write output
1394 movdqu @XMM[1], 0x10($out)
1395 movdqu @XMM[6], 0x20($out)
1396 movdqu @XMM[4], 0x30($out)
1397 jmp .Lecb_dec_done
1398.align 16
1399.Lecb_dec_three:
1400 call _bsaes_decrypt8
1401 movdqu @XMM[0], 0x00($out) # write output
1402 movdqu @XMM[1], 0x10($out)
1403 movdqu @XMM[6], 0x20($out)
1404 jmp .Lecb_dec_done
1405.align 16
1406.Lecb_dec_two:
1407 call _bsaes_decrypt8
1408 movdqu @XMM[0], 0x00($out) # write output
1409 movdqu @XMM[1], 0x10($out)
1410 jmp .Lecb_dec_done
1411.align 16
1412.Lecb_dec_one:
1413 call _bsaes_decrypt8
1414 movdqu @XMM[0], 0x00($out) # write output
1415 jmp .Lecb_dec_done
1416.align 16
1417.Lecb_dec_short:
1418 lea ($inp), $arg1
1419 lea ($out), $arg2
1420 lea ($key), $arg3
1421 call asm_AES_decrypt
1422 lea 16($inp), $inp
1423 lea 16($out), $out
1424 dec $len
1425 jnz .Lecb_dec_short
1426
1427.Lecb_dec_done:
1428 lea (%rsp),%rax
1429 pxor %xmm0, %xmm0
1430.Lecb_dec_bzero: # wipe key schedule [if any]
1431 movdqa %xmm0, 0x00(%rax)
1432 movdqa %xmm0, 0x10(%rax)
1433 lea 0x20(%rax), %rax
1434 cmp %rax, %rbp
1435 jb .Lecb_dec_bzero
1436
1437 lea (%rbp),%rsp # restore %rsp
1438___
1439$code.=<<___ if ($win64);
1440 movaps 0x40(%rbp), %xmm6
1441 movaps 0x50(%rbp), %xmm7
1442 movaps 0x60(%rbp), %xmm8
1443 movaps 0x70(%rbp), %xmm9
1444 movaps 0x80(%rbp), %xmm10
1445 movaps 0x90(%rbp), %xmm11
1446 movaps 0xa0(%rbp), %xmm12
1447 movaps 0xb0(%rbp), %xmm13
1448 movaps 0xc0(%rbp), %xmm14
1449 movaps 0xd0(%rbp), %xmm15
1450 lea 0xa0(%rbp), %rsp
1451___
1452$code.=<<___;
1453 mov 0x48(%rsp), %r15
1454 mov 0x50(%rsp), %r14
1455 mov 0x58(%rsp), %r13
1456 mov 0x60(%rsp), %r12
1457 mov 0x68(%rsp), %rbx
1458 mov 0x70(%rsp), %rax
1459 lea 0x78(%rsp), %rsp
1460 mov %rax, %rbp
1461.Lecb_dec_epilogue:
1462 ret
1463.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1464___
1465}
1466$code.=<<___;
1467.extern asm_AES_cbc_encrypt
1468.globl bsaes_cbc_encrypt
1469.type bsaes_cbc_encrypt,\@abi-omnipotent
1470.align 16
1471bsaes_cbc_encrypt:
1472___
1473$code.=<<___ if ($win64);
1474 mov 48(%rsp),$arg6 # pull direction flag
1475___
1476$code.=<<___;
1477 cmp \$0,$arg6
1478 jne asm_AES_cbc_encrypt
1479 cmp \$128,$arg3
1480 jb asm_AES_cbc_encrypt
1481
1482 mov %rsp, %rax
1483.Lcbc_dec_prologue:
1484 push %rbp
1485 push %rbx
1486 push %r12
1487 push %r13
1488 push %r14
1489 push %r15
1490 lea -0x48(%rsp), %rsp
1491___
1492$code.=<<___ if ($win64);
1493 mov 0xa0(%rsp),$arg5 # pull ivp
1494 lea -0xa0(%rsp), %rsp
1495 movaps %xmm6, 0x40(%rsp)
1496 movaps %xmm7, 0x50(%rsp)
1497 movaps %xmm8, 0x60(%rsp)
1498 movaps %xmm9, 0x70(%rsp)
1499 movaps %xmm10, 0x80(%rsp)
1500 movaps %xmm11, 0x90(%rsp)
1501 movaps %xmm12, 0xa0(%rsp)
1502 movaps %xmm13, 0xb0(%rsp)
1503 movaps %xmm14, 0xc0(%rsp)
1504 movaps %xmm15, 0xd0(%rsp)
1505.Lcbc_dec_body:
1506___
1507$code.=<<___;
1508 mov %rsp, %rbp # backup %rsp
1509 mov 240($arg4), %eax # rounds
1510 mov $arg1, $inp # backup arguments
1511 mov $arg2, $out
1512 mov $arg3, $len
1513 mov $arg4, $key
1514 mov $arg5, %rbx
1515 shr \$4, $len # bytes to blocks
1516
1517 mov %eax, %edx # rounds
1518 shl \$7, %rax # 128 bytes per inner round key
1519 sub \$`128-32`, %rax # size of bit-sliced key schedule
1520 sub %rax, %rsp
1521
1522 mov %rsp, %rax # pass key schedule
1523 mov $key, %rcx # pass key
1524 mov %edx, %r10d # pass rounds
1525 call _bsaes_key_convert
1526 pxor (%rsp),%xmm7 # fix up 0 round key
1527 movdqa %xmm6,(%rax) # save last round key
1528 movdqa %xmm7,(%rsp)
1529
1530 movdqu (%rbx), @XMM[15] # load IV
1531 sub \$8,$len
1532.Lcbc_dec_loop:
1533 movdqu 0x00($inp), @XMM[0] # load input
1534 movdqu 0x10($inp), @XMM[1]
1535 movdqu 0x20($inp), @XMM[2]
1536 movdqu 0x30($inp), @XMM[3]
1537 movdqu 0x40($inp), @XMM[4]
1538 movdqu 0x50($inp), @XMM[5]
1539 mov %rsp, %rax # pass key schedule
1540 movdqu 0x60($inp), @XMM[6]
1541 mov %edx,%r10d # pass rounds
1542 movdqu 0x70($inp), @XMM[7]
1543 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1544
1545 call _bsaes_decrypt8
1546
1547 pxor 0x20(%rbp), @XMM[0] # ^= IV
1548 movdqu 0x00($inp), @XMM[8] # re-load input
1549 movdqu 0x10($inp), @XMM[9]
1550 pxor @XMM[8], @XMM[1]
1551 movdqu 0x20($inp), @XMM[10]
1552 pxor @XMM[9], @XMM[6]
1553 movdqu 0x30($inp), @XMM[11]
1554 pxor @XMM[10], @XMM[4]
1555 movdqu 0x40($inp), @XMM[12]
1556 pxor @XMM[11], @XMM[2]
1557 movdqu 0x50($inp), @XMM[13]
1558 pxor @XMM[12], @XMM[7]
1559 movdqu 0x60($inp), @XMM[14]
1560 pxor @XMM[13], @XMM[3]
1561 movdqu 0x70($inp), @XMM[15] # IV
1562 pxor @XMM[14], @XMM[5]
1563 movdqu @XMM[0], 0x00($out) # write output
1564 lea 0x80($inp), $inp
1565 movdqu @XMM[1], 0x10($out)
1566 movdqu @XMM[6], 0x20($out)
1567 movdqu @XMM[4], 0x30($out)
1568 movdqu @XMM[2], 0x40($out)
1569 movdqu @XMM[7], 0x50($out)
1570 movdqu @XMM[3], 0x60($out)
1571 movdqu @XMM[5], 0x70($out)
1572 lea 0x80($out), $out
1573 sub \$8,$len
1574 jnc .Lcbc_dec_loop
1575
1576 add \$8,$len
1577 jz .Lcbc_dec_done
1578
1579 movdqu 0x00($inp), @XMM[0] # load input
1580 mov %rsp, %rax # pass key schedule
1581 mov %edx, %r10d # pass rounds
1582 cmp \$2,$len
1583 jb .Lcbc_dec_one
1584 movdqu 0x10($inp), @XMM[1]
1585 je .Lcbc_dec_two
1586 movdqu 0x20($inp), @XMM[2]
1587 cmp \$4,$len
1588 jb .Lcbc_dec_three
1589 movdqu 0x30($inp), @XMM[3]
1590 je .Lcbc_dec_four
1591 movdqu 0x40($inp), @XMM[4]
1592 cmp \$6,$len
1593 jb .Lcbc_dec_five
1594 movdqu 0x50($inp), @XMM[5]
1595 je .Lcbc_dec_six
1596 movdqu 0x60($inp), @XMM[6]
1597 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1598 call _bsaes_decrypt8
1599 pxor 0x20(%rbp), @XMM[0] # ^= IV
1600 movdqu 0x00($inp), @XMM[8] # re-load input
1601 movdqu 0x10($inp), @XMM[9]
1602 pxor @XMM[8], @XMM[1]
1603 movdqu 0x20($inp), @XMM[10]
1604 pxor @XMM[9], @XMM[6]
1605 movdqu 0x30($inp), @XMM[11]
1606 pxor @XMM[10], @XMM[4]
1607 movdqu 0x40($inp), @XMM[12]
1608 pxor @XMM[11], @XMM[2]
1609 movdqu 0x50($inp), @XMM[13]
1610 pxor @XMM[12], @XMM[7]
1611 movdqu 0x60($inp), @XMM[15] # IV
1612 pxor @XMM[13], @XMM[3]
1613 movdqu @XMM[0], 0x00($out) # write output
1614 movdqu @XMM[1], 0x10($out)
1615 movdqu @XMM[6], 0x20($out)
1616 movdqu @XMM[4], 0x30($out)
1617 movdqu @XMM[2], 0x40($out)
1618 movdqu @XMM[7], 0x50($out)
1619 movdqu @XMM[3], 0x60($out)
1620 jmp .Lcbc_dec_done
1621.align 16
1622.Lcbc_dec_six:
1623 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1624 call _bsaes_decrypt8
1625 pxor 0x20(%rbp), @XMM[0] # ^= IV
1626 movdqu 0x00($inp), @XMM[8] # re-load input
1627 movdqu 0x10($inp), @XMM[9]
1628 pxor @XMM[8], @XMM[1]
1629 movdqu 0x20($inp), @XMM[10]
1630 pxor @XMM[9], @XMM[6]
1631 movdqu 0x30($inp), @XMM[11]
1632 pxor @XMM[10], @XMM[4]
1633 movdqu 0x40($inp), @XMM[12]
1634 pxor @XMM[11], @XMM[2]
1635 movdqu 0x50($inp), @XMM[15] # IV
1636 pxor @XMM[12], @XMM[7]
1637 movdqu @XMM[0], 0x00($out) # write output
1638 movdqu @XMM[1], 0x10($out)
1639 movdqu @XMM[6], 0x20($out)
1640 movdqu @XMM[4], 0x30($out)
1641 movdqu @XMM[2], 0x40($out)
1642 movdqu @XMM[7], 0x50($out)
1643 jmp .Lcbc_dec_done
1644.align 16
1645.Lcbc_dec_five:
1646 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1647 call _bsaes_decrypt8
1648 pxor 0x20(%rbp), @XMM[0] # ^= IV
1649 movdqu 0x00($inp), @XMM[8] # re-load input
1650 movdqu 0x10($inp), @XMM[9]
1651 pxor @XMM[8], @XMM[1]
1652 movdqu 0x20($inp), @XMM[10]
1653 pxor @XMM[9], @XMM[6]
1654 movdqu 0x30($inp), @XMM[11]
1655 pxor @XMM[10], @XMM[4]
1656 movdqu 0x40($inp), @XMM[15] # IV
1657 pxor @XMM[11], @XMM[2]
1658 movdqu @XMM[0], 0x00($out) # write output
1659 movdqu @XMM[1], 0x10($out)
1660 movdqu @XMM[6], 0x20($out)
1661 movdqu @XMM[4], 0x30($out)
1662 movdqu @XMM[2], 0x40($out)
1663 jmp .Lcbc_dec_done
1664.align 16
1665.Lcbc_dec_four:
1666 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1667 call _bsaes_decrypt8
1668 pxor 0x20(%rbp), @XMM[0] # ^= IV
1669 movdqu 0x00($inp), @XMM[8] # re-load input
1670 movdqu 0x10($inp), @XMM[9]
1671 pxor @XMM[8], @XMM[1]
1672 movdqu 0x20($inp), @XMM[10]
1673 pxor @XMM[9], @XMM[6]
1674 movdqu 0x30($inp), @XMM[15] # IV
1675 pxor @XMM[10], @XMM[4]
1676 movdqu @XMM[0], 0x00($out) # write output
1677 movdqu @XMM[1], 0x10($out)
1678 movdqu @XMM[6], 0x20($out)
1679 movdqu @XMM[4], 0x30($out)
1680 jmp .Lcbc_dec_done
1681.align 16
1682.Lcbc_dec_three:
1683 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1684 call _bsaes_decrypt8
1685 pxor 0x20(%rbp), @XMM[0] # ^= IV
1686 movdqu 0x00($inp), @XMM[8] # re-load input
1687 movdqu 0x10($inp), @XMM[9]
1688 pxor @XMM[8], @XMM[1]
1689 movdqu 0x20($inp), @XMM[15] # IV
1690 pxor @XMM[9], @XMM[6]
1691 movdqu @XMM[0], 0x00($out) # write output
1692 movdqu @XMM[1], 0x10($out)
1693 movdqu @XMM[6], 0x20($out)
1694 jmp .Lcbc_dec_done
1695.align 16
1696.Lcbc_dec_two:
1697 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1698 call _bsaes_decrypt8
1699 pxor 0x20(%rbp), @XMM[0] # ^= IV
1700 movdqu 0x00($inp), @XMM[8] # re-load input
1701 movdqu 0x10($inp), @XMM[15] # IV
1702 pxor @XMM[8], @XMM[1]
1703 movdqu @XMM[0], 0x00($out) # write output
1704 movdqu @XMM[1], 0x10($out)
1705 jmp .Lcbc_dec_done
1706.align 16
1707.Lcbc_dec_one:
1708 lea ($inp), $arg1
1709 lea 0x20(%rbp), $arg2 # buffer output
1710 lea ($key), $arg3
1711 call asm_AES_decrypt # doesn't touch %xmm
1712 pxor 0x20(%rbp), @XMM[15] # ^= IV
1713 movdqu @XMM[15], ($out) # write output
1714 movdqa @XMM[0], @XMM[15] # IV
1715
1716.Lcbc_dec_done:
1717 movdqu @XMM[15], (%rbx) # return IV
1718 lea (%rsp), %rax
1719 pxor %xmm0, %xmm0
1720.Lcbc_dec_bzero: # wipe key schedule [if any]
1721 movdqa %xmm0, 0x00(%rax)
1722 movdqa %xmm0, 0x10(%rax)
1723 lea 0x20(%rax), %rax
1724 cmp %rax, %rbp
1725 ja .Lcbc_dec_bzero
1726
1727 lea (%rbp),%rsp # restore %rsp
1728___
1729$code.=<<___ if ($win64);
1730 movaps 0x40(%rbp), %xmm6
1731 movaps 0x50(%rbp), %xmm7
1732 movaps 0x60(%rbp), %xmm8
1733 movaps 0x70(%rbp), %xmm9
1734 movaps 0x80(%rbp), %xmm10
1735 movaps 0x90(%rbp), %xmm11
1736 movaps 0xa0(%rbp), %xmm12
1737 movaps 0xb0(%rbp), %xmm13
1738 movaps 0xc0(%rbp), %xmm14
1739 movaps 0xd0(%rbp), %xmm15
1740 lea 0xa0(%rbp), %rsp
1741___
1742$code.=<<___;
1743 mov 0x48(%rsp), %r15
1744 mov 0x50(%rsp), %r14
1745 mov 0x58(%rsp), %r13
1746 mov 0x60(%rsp), %r12
1747 mov 0x68(%rsp), %rbx
1748 mov 0x70(%rsp), %rax
1749 lea 0x78(%rsp), %rsp
1750 mov %rax, %rbp
1751.Lcbc_dec_epilogue:
1752 ret
1753.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1754
1755.globl bsaes_ctr32_encrypt_blocks
1756.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1757.align 16
1758bsaes_ctr32_encrypt_blocks:
1759 mov %rsp, %rax
1760.Lctr_enc_prologue:
1761 push %rbp
1762 push %rbx
1763 push %r12
1764 push %r13
1765 push %r14
1766 push %r15
1767 lea -0x48(%rsp), %rsp
1768___
1769$code.=<<___ if ($win64);
1770 mov 0xa0(%rsp),$arg5 # pull ivp
1771 lea -0xa0(%rsp), %rsp
1772 movaps %xmm6, 0x40(%rsp)
1773 movaps %xmm7, 0x50(%rsp)
1774 movaps %xmm8, 0x60(%rsp)
1775 movaps %xmm9, 0x70(%rsp)
1776 movaps %xmm10, 0x80(%rsp)
1777 movaps %xmm11, 0x90(%rsp)
1778 movaps %xmm12, 0xa0(%rsp)
1779 movaps %xmm13, 0xb0(%rsp)
1780 movaps %xmm14, 0xc0(%rsp)
1781 movaps %xmm15, 0xd0(%rsp)
1782.Lctr_enc_body:
1783___
1784$code.=<<___;
1785 mov %rsp, %rbp # backup %rsp
1786 movdqu ($arg5), %xmm0 # load counter
1787 mov 240($arg4), %eax # rounds
1788 mov $arg1, $inp # backup arguments
1789 mov $arg2, $out
1790 mov $arg3, $len
1791 mov $arg4, $key
1792 movdqa %xmm0, 0x20(%rbp) # copy counter
1793 cmp \$8, $arg3
1794 jb .Lctr_enc_short
1795
1796 mov %eax, %ebx # rounds
1797 shl \$7, %rax # 128 bytes per inner round key
1798 sub \$`128-32`, %rax # size of bit-sliced key schedule
1799 sub %rax, %rsp
1800
1801 mov %rsp, %rax # pass key schedule
1802 mov $key, %rcx # pass key
1803 mov %ebx, %r10d # pass rounds
1804 call _bsaes_key_convert
1805 pxor %xmm6,%xmm7 # fix up last round key
1806 movdqa %xmm7,(%rax) # save last round key
1807
1808 movdqa (%rsp), @XMM[9] # load round0 key
1809 lea .LADD1(%rip), %r11
1810 movdqa 0x20(%rbp), @XMM[0] # counter copy
1811 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1812 pshufb @XMM[8], @XMM[9] # byte swap upper part
1813 pshufb @XMM[8], @XMM[0]
1814 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1815 jmp .Lctr_enc_loop
1816.align 16
1817.Lctr_enc_loop:
1818 movdqa @XMM[0], 0x20(%rbp) # save counter
1819 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1820 movdqa @XMM[0], @XMM[2]
1821 paddd 0x00(%r11), @XMM[1] # .LADD1
1822 movdqa @XMM[0], @XMM[3]
1823 paddd 0x10(%r11), @XMM[2] # .LADD2
1824 movdqa @XMM[0], @XMM[4]
1825 paddd 0x20(%r11), @XMM[3] # .LADD3
1826 movdqa @XMM[0], @XMM[5]
1827 paddd 0x30(%r11), @XMM[4] # .LADD4
1828 movdqa @XMM[0], @XMM[6]
1829 paddd 0x40(%r11), @XMM[5] # .LADD5
1830 movdqa @XMM[0], @XMM[7]
1831 paddd 0x50(%r11), @XMM[6] # .LADD6
1832 paddd 0x60(%r11), @XMM[7] # .LADD7
1833
1834 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1835 # to flip byte order in 32-bit counter
1836 movdqa (%rsp), @XMM[9] # round 0 key
1837 lea 0x10(%rsp), %rax # pass key schedule
1838 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1839 pxor @XMM[9], @XMM[0] # xor with round0 key
1840 pxor @XMM[9], @XMM[1]
1841 pshufb @XMM[8], @XMM[0]
1842 pxor @XMM[9], @XMM[2]
1843 pshufb @XMM[8], @XMM[1]
1844 pxor @XMM[9], @XMM[3]
1845 pshufb @XMM[8], @XMM[2]
1846 pxor @XMM[9], @XMM[4]
1847 pshufb @XMM[8], @XMM[3]
1848 pxor @XMM[9], @XMM[5]
1849 pshufb @XMM[8], @XMM[4]
1850 pxor @XMM[9], @XMM[6]
1851 pshufb @XMM[8], @XMM[5]
1852 pxor @XMM[9], @XMM[7]
1853 pshufb @XMM[8], @XMM[6]
1854 lea .LBS0(%rip), %r11 # constants table
1855 pshufb @XMM[8], @XMM[7]
1856 mov %ebx,%r10d # pass rounds
1857
1858 call _bsaes_encrypt8_bitslice
1859
1860 sub \$8,$len
1861 jc .Lctr_enc_loop_done
1862
1863 movdqu 0x00($inp), @XMM[8] # load input
1864 movdqu 0x10($inp), @XMM[9]
1865 movdqu 0x20($inp), @XMM[10]
1866 movdqu 0x30($inp), @XMM[11]
1867 movdqu 0x40($inp), @XMM[12]
1868 movdqu 0x50($inp), @XMM[13]
1869 movdqu 0x60($inp), @XMM[14]
1870 movdqu 0x70($inp), @XMM[15]
1871 lea 0x80($inp),$inp
1872 pxor @XMM[0], @XMM[8]
1873 movdqa 0x20(%rbp), @XMM[0] # load counter
1874 pxor @XMM[9], @XMM[1]
1875 movdqu @XMM[8], 0x00($out) # write output
1876 pxor @XMM[10], @XMM[4]
1877 movdqu @XMM[1], 0x10($out)
1878 pxor @XMM[11], @XMM[6]
1879 movdqu @XMM[4], 0x20($out)
1880 pxor @XMM[12], @XMM[3]
1881 movdqu @XMM[6], 0x30($out)
1882 pxor @XMM[13], @XMM[7]
1883 movdqu @XMM[3], 0x40($out)
1884 pxor @XMM[14], @XMM[2]
1885 movdqu @XMM[7], 0x50($out)
1886 pxor @XMM[15], @XMM[5]
1887 movdqu @XMM[2], 0x60($out)
1888 lea .LADD1(%rip), %r11
1889 movdqu @XMM[5], 0x70($out)
1890 lea 0x80($out), $out
1891 paddd 0x70(%r11), @XMM[0] # .LADD8
1892 jnz .Lctr_enc_loop
1893
1894 jmp .Lctr_enc_done
1895.align 16
1896.Lctr_enc_loop_done:
1897 add \$8, $len
1898 movdqu 0x00($inp), @XMM[8] # load input
1899 pxor @XMM[8], @XMM[0]
1900 movdqu @XMM[0], 0x00($out) # write output
1901 cmp \$2,$len
1902 jb .Lctr_enc_done
1903 movdqu 0x10($inp), @XMM[9]
1904 pxor @XMM[9], @XMM[1]
1905 movdqu @XMM[1], 0x10($out)
1906 je .Lctr_enc_done
1907 movdqu 0x20($inp), @XMM[10]
1908 pxor @XMM[10], @XMM[4]
1909 movdqu @XMM[4], 0x20($out)
1910 cmp \$4,$len
1911 jb .Lctr_enc_done
1912 movdqu 0x30($inp), @XMM[11]
1913 pxor @XMM[11], @XMM[6]
1914 movdqu @XMM[6], 0x30($out)
1915 je .Lctr_enc_done
1916 movdqu 0x40($inp), @XMM[12]
1917 pxor @XMM[12], @XMM[3]
1918 movdqu @XMM[3], 0x40($out)
1919 cmp \$6,$len
1920 jb .Lctr_enc_done
1921 movdqu 0x50($inp), @XMM[13]
1922 pxor @XMM[13], @XMM[7]
1923 movdqu @XMM[7], 0x50($out)
1924 je .Lctr_enc_done
1925 movdqu 0x60($inp), @XMM[14]
1926 pxor @XMM[14], @XMM[2]
1927 movdqu @XMM[2], 0x60($out)
1928 jmp .Lctr_enc_done
1929
1930.align 16
1931.Lctr_enc_short:
1932 lea 0x20(%rbp), $arg1
1933 lea 0x30(%rbp), $arg2
1934 lea ($key), $arg3
1935 call asm_AES_encrypt
1936 movdqu ($inp), @XMM[1]
1937 lea 16($inp), $inp
1938 mov 0x2c(%rbp), %eax # load 32-bit counter
1939 bswap %eax
1940 pxor 0x30(%rbp), @XMM[1]
1941 inc %eax # increment
1942 movdqu @XMM[1], ($out)
1943 bswap %eax
1944 lea 16($out), $out
1945 mov %eax, 0x2c(%rsp) # save 32-bit counter
1946 dec $len
1947 jnz .Lctr_enc_short
1948
1949.Lctr_enc_done:
1950 lea (%rsp), %rax
1951 pxor %xmm0, %xmm0
1952.Lctr_enc_bzero: # wipe key schedule [if any]
1953 movdqa %xmm0, 0x00(%rax)
1954 movdqa %xmm0, 0x10(%rax)
1955 lea 0x20(%rax), %rax
1956 cmp %rax, %rbp
1957 ja .Lctr_enc_bzero
1958
1959 lea (%rbp),%rsp # restore %rsp
1960___
1961$code.=<<___ if ($win64);
1962 movaps 0x40(%rbp), %xmm6
1963 movaps 0x50(%rbp), %xmm7
1964 movaps 0x60(%rbp), %xmm8
1965 movaps 0x70(%rbp), %xmm9
1966 movaps 0x80(%rbp), %xmm10
1967 movaps 0x90(%rbp), %xmm11
1968 movaps 0xa0(%rbp), %xmm12
1969 movaps 0xb0(%rbp), %xmm13
1970 movaps 0xc0(%rbp), %xmm14
1971 movaps 0xd0(%rbp), %xmm15
1972 lea 0xa0(%rbp), %rsp
1973___
1974$code.=<<___;
1975 mov 0x48(%rsp), %r15
1976 mov 0x50(%rsp), %r14
1977 mov 0x58(%rsp), %r13
1978 mov 0x60(%rsp), %r12
1979 mov 0x68(%rsp), %rbx
1980 mov 0x70(%rsp), %rax
1981 lea 0x78(%rsp), %rsp
1982 mov %rax, %rbp
1983.Lctr_enc_epilogue:
1984 ret
1985.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1986___
1987######################################################################
1988# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1989# const AES_KEY *key1, const AES_KEY *key2,
1990# const unsigned char iv[16]);
1991#
1992my ($twmask,$twres,$twtmp)=@XMM[13..15];
1993$code.=<<___;
1994.globl bsaes_xts_encrypt
1995.type bsaes_xts_encrypt,\@abi-omnipotent
1996.align 16
1997bsaes_xts_encrypt:
1998 mov %rsp, %rax
1999.Lxts_enc_prologue:
2000 push %rbp
2001 push %rbx
2002 push %r12
2003 push %r13
2004 push %r14
2005 push %r15
2006 lea -0x48(%rsp), %rsp
2007___
2008$code.=<<___ if ($win64);
2009 mov 0xa0(%rsp),$arg5 # pull key2
2010 mov 0xa8(%rsp),$arg6 # pull ivp
2011 lea -0xa0(%rsp), %rsp
2012 movaps %xmm6, 0x40(%rsp)
2013 movaps %xmm7, 0x50(%rsp)
2014 movaps %xmm8, 0x60(%rsp)
2015 movaps %xmm9, 0x70(%rsp)
2016 movaps %xmm10, 0x80(%rsp)
2017 movaps %xmm11, 0x90(%rsp)
2018 movaps %xmm12, 0xa0(%rsp)
2019 movaps %xmm13, 0xb0(%rsp)
2020 movaps %xmm14, 0xc0(%rsp)
2021 movaps %xmm15, 0xd0(%rsp)
2022.Lxts_enc_body:
2023___
2024$code.=<<___;
2025 mov %rsp, %rbp # backup %rsp
2026 mov $arg1, $inp # backup arguments
2027 mov $arg2, $out
2028 mov $arg3, $len
2029 mov $arg4, $key
2030
2031 lea ($arg6), $arg1
2032 lea 0x20(%rbp), $arg2
2033 lea ($arg5), $arg3
2034 call asm_AES_encrypt # generate initial tweak
2035
2036 mov 240($key), %eax # rounds
2037 mov $len, %rbx # backup $len
2038
2039 mov %eax, %edx # rounds
2040 shl \$7, %rax # 128 bytes per inner round key
2041 sub \$`128-32`, %rax # size of bit-sliced key schedule
2042 sub %rax, %rsp
2043
2044 mov %rsp, %rax # pass key schedule
2045 mov $key, %rcx # pass key
2046 mov %edx, %r10d # pass rounds
2047 call _bsaes_key_convert
2048 pxor %xmm6, %xmm7 # fix up last round key
2049 movdqa %xmm7, (%rax) # save last round key
2050
2051 and \$-16, $len
2052 sub \$0x80, %rsp # place for tweak[8]
2053 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2054
2055 pxor $twtmp, $twtmp
2056 movdqa .Lxts_magic(%rip), $twmask
2057 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2058
2059 sub \$0x80, $len
2060 jc .Lxts_enc_short
2061 jmp .Lxts_enc_loop
2062
2063.align 16
2064.Lxts_enc_loop:
2065___
2066 for ($i=0;$i<7;$i++) {
2067 $code.=<<___;
2068 pshufd \$0x13, $twtmp, $twres
2069 pxor $twtmp, $twtmp
2070 movdqa @XMM[7], @XMM[$i]
2071 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2072 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2073 pand $twmask, $twres # isolate carry and residue
2074 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2075 pxor $twres, @XMM[7]
2076___
2077 $code.=<<___ if ($i>=1);
2078 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2079___
2080 $code.=<<___ if ($i>=2);
2081 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2082___
2083 }
2084$code.=<<___;
2085 movdqu 0x60($inp), @XMM[8+6]
2086 pxor @XMM[8+5], @XMM[5]
2087 movdqu 0x70($inp), @XMM[8+7]
2088 lea 0x80($inp), $inp
2089 movdqa @XMM[7], 0x70(%rsp)
2090 pxor @XMM[8+6], @XMM[6]
2091 lea 0x80(%rsp), %rax # pass key schedule
2092 pxor @XMM[8+7], @XMM[7]
2093 mov %edx, %r10d # pass rounds
2094
2095 call _bsaes_encrypt8
2096
2097 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2098 pxor 0x10(%rsp), @XMM[1]
2099 movdqu @XMM[0], 0x00($out) # write output
2100 pxor 0x20(%rsp), @XMM[4]
2101 movdqu @XMM[1], 0x10($out)
2102 pxor 0x30(%rsp), @XMM[6]
2103 movdqu @XMM[4], 0x20($out)
2104 pxor 0x40(%rsp), @XMM[3]
2105 movdqu @XMM[6], 0x30($out)
2106 pxor 0x50(%rsp), @XMM[7]
2107 movdqu @XMM[3], 0x40($out)
2108 pxor 0x60(%rsp), @XMM[2]
2109 movdqu @XMM[7], 0x50($out)
2110 pxor 0x70(%rsp), @XMM[5]
2111 movdqu @XMM[2], 0x60($out)
2112 movdqu @XMM[5], 0x70($out)
2113 lea 0x80($out), $out
2114
2115 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2116 pxor $twtmp, $twtmp
2117 movdqa .Lxts_magic(%rip), $twmask
2118 pcmpgtd @XMM[7], $twtmp
2119 pshufd \$0x13, $twtmp, $twres
2120 pxor $twtmp, $twtmp
2121 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2122 pand $twmask, $twres # isolate carry and residue
2123 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2124 pxor $twres, @XMM[7]
2125
2126 sub \$0x80,$len
2127 jnc .Lxts_enc_loop
2128
2129.Lxts_enc_short:
2130 add \$0x80, $len
2131 jz .Lxts_enc_done
2132___
2133 for ($i=0;$i<7;$i++) {
2134 $code.=<<___;
2135 pshufd \$0x13, $twtmp, $twres
2136 pxor $twtmp, $twtmp
2137 movdqa @XMM[7], @XMM[$i]
2138 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2139 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2140 pand $twmask, $twres # isolate carry and residue
2141 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2142 pxor $twres, @XMM[7]
2143___
2144 $code.=<<___ if ($i>=1);
2145 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2146 cmp \$`0x10*$i`,$len
2147 je .Lxts_enc_$i
2148___
2149 $code.=<<___ if ($i>=2);
2150 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2151___
2152 }
2153$code.=<<___;
2154 movdqu 0x60($inp), @XMM[8+6]
2155 pxor @XMM[8+5], @XMM[5]
2156 movdqa @XMM[7], 0x70(%rsp)
2157 lea 0x70($inp), $inp
2158 pxor @XMM[8+6], @XMM[6]
2159 lea 0x80(%rsp), %rax # pass key schedule
2160 mov %edx, %r10d # pass rounds
2161
2162 call _bsaes_encrypt8
2163
2164 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2165 pxor 0x10(%rsp), @XMM[1]
2166 movdqu @XMM[0], 0x00($out) # write output
2167 pxor 0x20(%rsp), @XMM[4]
2168 movdqu @XMM[1], 0x10($out)
2169 pxor 0x30(%rsp), @XMM[6]
2170 movdqu @XMM[4], 0x20($out)
2171 pxor 0x40(%rsp), @XMM[3]
2172 movdqu @XMM[6], 0x30($out)
2173 pxor 0x50(%rsp), @XMM[7]
2174 movdqu @XMM[3], 0x40($out)
2175 pxor 0x60(%rsp), @XMM[2]
2176 movdqu @XMM[7], 0x50($out)
2177 movdqu @XMM[2], 0x60($out)
2178 lea 0x70($out), $out
2179
2180 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2181 jmp .Lxts_enc_done
2182.align 16
2183.Lxts_enc_6:
2184 pxor @XMM[8+4], @XMM[4]
2185 lea 0x60($inp), $inp
2186 pxor @XMM[8+5], @XMM[5]
2187 lea 0x80(%rsp), %rax # pass key schedule
2188 mov %edx, %r10d # pass rounds
2189
2190 call _bsaes_encrypt8
2191
2192 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2193 pxor 0x10(%rsp), @XMM[1]
2194 movdqu @XMM[0], 0x00($out) # write output
2195 pxor 0x20(%rsp), @XMM[4]
2196 movdqu @XMM[1], 0x10($out)
2197 pxor 0x30(%rsp), @XMM[6]
2198 movdqu @XMM[4], 0x20($out)
2199 pxor 0x40(%rsp), @XMM[3]
2200 movdqu @XMM[6], 0x30($out)
2201 pxor 0x50(%rsp), @XMM[7]
2202 movdqu @XMM[3], 0x40($out)
2203 movdqu @XMM[7], 0x50($out)
2204 lea 0x60($out), $out
2205
2206 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2207 jmp .Lxts_enc_done
2208.align 16
2209.Lxts_enc_5:
2210 pxor @XMM[8+3], @XMM[3]
2211 lea 0x50($inp), $inp
2212 pxor @XMM[8+4], @XMM[4]
2213 lea 0x80(%rsp), %rax # pass key schedule
2214 mov %edx, %r10d # pass rounds
2215
2216 call _bsaes_encrypt8
2217
2218 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2219 pxor 0x10(%rsp), @XMM[1]
2220 movdqu @XMM[0], 0x00($out) # write output
2221 pxor 0x20(%rsp), @XMM[4]
2222 movdqu @XMM[1], 0x10($out)
2223 pxor 0x30(%rsp), @XMM[6]
2224 movdqu @XMM[4], 0x20($out)
2225 pxor 0x40(%rsp), @XMM[3]
2226 movdqu @XMM[6], 0x30($out)
2227 movdqu @XMM[3], 0x40($out)
2228 lea 0x50($out), $out
2229
2230 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2231 jmp .Lxts_enc_done
2232.align 16
2233.Lxts_enc_4:
2234 pxor @XMM[8+2], @XMM[2]
2235 lea 0x40($inp), $inp
2236 pxor @XMM[8+3], @XMM[3]
2237 lea 0x80(%rsp), %rax # pass key schedule
2238 mov %edx, %r10d # pass rounds
2239
2240 call _bsaes_encrypt8
2241
2242 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2243 pxor 0x10(%rsp), @XMM[1]
2244 movdqu @XMM[0], 0x00($out) # write output
2245 pxor 0x20(%rsp), @XMM[4]
2246 movdqu @XMM[1], 0x10($out)
2247 pxor 0x30(%rsp), @XMM[6]
2248 movdqu @XMM[4], 0x20($out)
2249 movdqu @XMM[6], 0x30($out)
2250 lea 0x40($out), $out
2251
2252 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2253 jmp .Lxts_enc_done
2254.align 16
2255.Lxts_enc_3:
2256 pxor @XMM[8+1], @XMM[1]
2257 lea 0x30($inp), $inp
2258 pxor @XMM[8+2], @XMM[2]
2259 lea 0x80(%rsp), %rax # pass key schedule
2260 mov %edx, %r10d # pass rounds
2261
2262 call _bsaes_encrypt8
2263
2264 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2265 pxor 0x10(%rsp), @XMM[1]
2266 movdqu @XMM[0], 0x00($out) # write output
2267 pxor 0x20(%rsp), @XMM[4]
2268 movdqu @XMM[1], 0x10($out)
2269 movdqu @XMM[4], 0x20($out)
2270 lea 0x30($out), $out
2271
2272 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2273 jmp .Lxts_enc_done
2274.align 16
2275.Lxts_enc_2:
2276 pxor @XMM[8+0], @XMM[0]
2277 lea 0x20($inp), $inp
2278 pxor @XMM[8+1], @XMM[1]
2279 lea 0x80(%rsp), %rax # pass key schedule
2280 mov %edx, %r10d # pass rounds
2281
2282 call _bsaes_encrypt8
2283
2284 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2285 pxor 0x10(%rsp), @XMM[1]
2286 movdqu @XMM[0], 0x00($out) # write output
2287 movdqu @XMM[1], 0x10($out)
2288 lea 0x20($out), $out
2289
2290 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2291 jmp .Lxts_enc_done
2292.align 16
2293.Lxts_enc_1:
2294 pxor @XMM[0], @XMM[8]
2295 lea 0x10($inp), $inp
2296 movdqa @XMM[8], 0x20(%rbp)
2297 lea 0x20(%rbp), $arg1
2298 lea 0x20(%rbp), $arg2
2299 lea ($key), $arg3
2300 call asm_AES_encrypt # doesn't touch %xmm
2301 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2302 #pxor @XMM[8], @XMM[0]
2303 #lea 0x80(%rsp), %rax # pass key schedule
2304 #mov %edx, %r10d # pass rounds
2305 #call _bsaes_encrypt8
2306 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2307 movdqu @XMM[0], 0x00($out) # write output
2308 lea 0x10($out), $out
2309
2310 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2311
2312.Lxts_enc_done:
2313 and \$15, %ebx
2314 jz .Lxts_enc_ret
2315 mov $out, %rdx
2316
2317.Lxts_enc_steal:
2318 movzb ($inp), %eax
2319 movzb -16(%rdx), %ecx
2320 lea 1($inp), $inp
2321 mov %al, -16(%rdx)
2322 mov %cl, 0(%rdx)
2323 lea 1(%rdx), %rdx
2324 sub \$1,%ebx
2325 jnz .Lxts_enc_steal
2326
2327 movdqu -16($out), @XMM[0]
2328 lea 0x20(%rbp), $arg1
2329 pxor @XMM[7], @XMM[0]
2330 lea 0x20(%rbp), $arg2
2331 movdqa @XMM[0], 0x20(%rbp)
2332 lea ($key), $arg3
2333 call asm_AES_encrypt # doesn't touch %xmm
2334 pxor 0x20(%rbp), @XMM[7]
2335 movdqu @XMM[7], -16($out)
2336
2337.Lxts_enc_ret:
2338 lea (%rsp), %rax
2339 pxor %xmm0, %xmm0
2340.Lxts_enc_bzero: # wipe key schedule [if any]
2341 movdqa %xmm0, 0x00(%rax)
2342 movdqa %xmm0, 0x10(%rax)
2343 lea 0x20(%rax), %rax
2344 cmp %rax, %rbp
2345 ja .Lxts_enc_bzero
2346
2347 lea (%rbp),%rsp # restore %rsp
2348___
2349$code.=<<___ if ($win64);
2350 movaps 0x40(%rbp), %xmm6
2351 movaps 0x50(%rbp), %xmm7
2352 movaps 0x60(%rbp), %xmm8
2353 movaps 0x70(%rbp), %xmm9
2354 movaps 0x80(%rbp), %xmm10
2355 movaps 0x90(%rbp), %xmm11
2356 movaps 0xa0(%rbp), %xmm12
2357 movaps 0xb0(%rbp), %xmm13
2358 movaps 0xc0(%rbp), %xmm14
2359 movaps 0xd0(%rbp), %xmm15
2360 lea 0xa0(%rbp), %rsp
2361___
2362$code.=<<___;
2363 mov 0x48(%rsp), %r15
2364 mov 0x50(%rsp), %r14
2365 mov 0x58(%rsp), %r13
2366 mov 0x60(%rsp), %r12
2367 mov 0x68(%rsp), %rbx
2368 mov 0x70(%rsp), %rax
2369 lea 0x78(%rsp), %rsp
2370 mov %rax, %rbp
2371.Lxts_enc_epilogue:
2372 ret
2373.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2374
2375.globl bsaes_xts_decrypt
2376.type bsaes_xts_decrypt,\@abi-omnipotent
2377.align 16
2378bsaes_xts_decrypt:
2379 mov %rsp, %rax
2380.Lxts_dec_prologue:
2381 push %rbp
2382 push %rbx
2383 push %r12
2384 push %r13
2385 push %r14
2386 push %r15
2387 lea -0x48(%rsp), %rsp
2388___
2389$code.=<<___ if ($win64);
2390 mov 0xa0(%rsp),$arg5 # pull key2
2391 mov 0xa8(%rsp),$arg6 # pull ivp
2392 lea -0xa0(%rsp), %rsp
2393 movaps %xmm6, 0x40(%rsp)
2394 movaps %xmm7, 0x50(%rsp)
2395 movaps %xmm8, 0x60(%rsp)
2396 movaps %xmm9, 0x70(%rsp)
2397 movaps %xmm10, 0x80(%rsp)
2398 movaps %xmm11, 0x90(%rsp)
2399 movaps %xmm12, 0xa0(%rsp)
2400 movaps %xmm13, 0xb0(%rsp)
2401 movaps %xmm14, 0xc0(%rsp)
2402 movaps %xmm15, 0xd0(%rsp)
2403.Lxts_dec_body:
2404___
2405$code.=<<___;
2406 mov %rsp, %rbp # backup %rsp
2407 mov $arg1, $inp # backup arguments
2408 mov $arg2, $out
2409 mov $arg3, $len
2410 mov $arg4, $key
2411
2412 lea ($arg6), $arg1
2413 lea 0x20(%rbp), $arg2
2414 lea ($arg5), $arg3
2415 call asm_AES_encrypt # generate initial tweak
2416
2417 mov 240($key), %eax # rounds
2418 mov $len, %rbx # backup $len
2419
2420 mov %eax, %edx # rounds
2421 shl \$7, %rax # 128 bytes per inner round key
2422 sub \$`128-32`, %rax # size of bit-sliced key schedule
2423 sub %rax, %rsp
2424
2425 mov %rsp, %rax # pass key schedule
2426 mov $key, %rcx # pass key
2427 mov %edx, %r10d # pass rounds
2428 call _bsaes_key_convert
2429 pxor (%rsp), %xmm7 # fix up round 0 key
2430 movdqa %xmm6, (%rax) # save last round key
2431 movdqa %xmm7, (%rsp)
2432
2433 xor %eax, %eax # if ($len%16) len-=16;
2434 and \$-16, $len
2435 test \$15, %ebx
2436 setnz %al
2437 shl \$4, %rax
2438 sub %rax, $len
2439
2440 sub \$0x80, %rsp # place for tweak[8]
2441 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2442
2443 pxor $twtmp, $twtmp
2444 movdqa .Lxts_magic(%rip), $twmask
2445 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2446
2447 sub \$0x80, $len
2448 jc .Lxts_dec_short
2449 jmp .Lxts_dec_loop
2450
2451.align 16
2452.Lxts_dec_loop:
2453___
2454 for ($i=0;$i<7;$i++) {
2455 $code.=<<___;
2456 pshufd \$0x13, $twtmp, $twres
2457 pxor $twtmp, $twtmp
2458 movdqa @XMM[7], @XMM[$i]
2459 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2460 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2461 pand $twmask, $twres # isolate carry and residue
2462 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2463 pxor $twres, @XMM[7]
2464___
2465 $code.=<<___ if ($i>=1);
2466 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2467___
2468 $code.=<<___ if ($i>=2);
2469 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2470___
2471 }
2472$code.=<<___;
2473 movdqu 0x60($inp), @XMM[8+6]
2474 pxor @XMM[8+5], @XMM[5]
2475 movdqu 0x70($inp), @XMM[8+7]
2476 lea 0x80($inp), $inp
2477 movdqa @XMM[7], 0x70(%rsp)
2478 pxor @XMM[8+6], @XMM[6]
2479 lea 0x80(%rsp), %rax # pass key schedule
2480 pxor @XMM[8+7], @XMM[7]
2481 mov %edx, %r10d # pass rounds
2482
2483 call _bsaes_decrypt8
2484
2485 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2486 pxor 0x10(%rsp), @XMM[1]
2487 movdqu @XMM[0], 0x00($out) # write output
2488 pxor 0x20(%rsp), @XMM[6]
2489 movdqu @XMM[1], 0x10($out)
2490 pxor 0x30(%rsp), @XMM[4]
2491 movdqu @XMM[6], 0x20($out)
2492 pxor 0x40(%rsp), @XMM[2]
2493 movdqu @XMM[4], 0x30($out)
2494 pxor 0x50(%rsp), @XMM[7]
2495 movdqu @XMM[2], 0x40($out)
2496 pxor 0x60(%rsp), @XMM[3]
2497 movdqu @XMM[7], 0x50($out)
2498 pxor 0x70(%rsp), @XMM[5]
2499 movdqu @XMM[3], 0x60($out)
2500 movdqu @XMM[5], 0x70($out)
2501 lea 0x80($out), $out
2502
2503 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2504 pxor $twtmp, $twtmp
2505 movdqa .Lxts_magic(%rip), $twmask
2506 pcmpgtd @XMM[7], $twtmp
2507 pshufd \$0x13, $twtmp, $twres
2508 pxor $twtmp, $twtmp
2509 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2510 pand $twmask, $twres # isolate carry and residue
2511 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2512 pxor $twres, @XMM[7]
2513
2514 sub \$0x80,$len
2515 jnc .Lxts_dec_loop
2516
2517.Lxts_dec_short:
2518 add \$0x80, $len
2519 jz .Lxts_dec_done
2520___
2521 for ($i=0;$i<7;$i++) {
2522 $code.=<<___;
2523 pshufd \$0x13, $twtmp, $twres
2524 pxor $twtmp, $twtmp
2525 movdqa @XMM[7], @XMM[$i]
2526 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2527 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2528 pand $twmask, $twres # isolate carry and residue
2529 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2530 pxor $twres, @XMM[7]
2531___
2532 $code.=<<___ if ($i>=1);
2533 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2534 cmp \$`0x10*$i`,$len
2535 je .Lxts_dec_$i
2536___
2537 $code.=<<___ if ($i>=2);
2538 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2539___
2540 }
2541$code.=<<___;
2542 movdqu 0x60($inp), @XMM[8+6]
2543 pxor @XMM[8+5], @XMM[5]
2544 movdqa @XMM[7], 0x70(%rsp)
2545 lea 0x70($inp), $inp
2546 pxor @XMM[8+6], @XMM[6]
2547 lea 0x80(%rsp), %rax # pass key schedule
2548 mov %edx, %r10d # pass rounds
2549
2550 call _bsaes_decrypt8
2551
2552 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2553 pxor 0x10(%rsp), @XMM[1]
2554 movdqu @XMM[0], 0x00($out) # write output
2555 pxor 0x20(%rsp), @XMM[6]
2556 movdqu @XMM[1], 0x10($out)
2557 pxor 0x30(%rsp), @XMM[4]
2558 movdqu @XMM[6], 0x20($out)
2559 pxor 0x40(%rsp), @XMM[2]
2560 movdqu @XMM[4], 0x30($out)
2561 pxor 0x50(%rsp), @XMM[7]
2562 movdqu @XMM[2], 0x40($out)
2563 pxor 0x60(%rsp), @XMM[3]
2564 movdqu @XMM[7], 0x50($out)
2565 movdqu @XMM[3], 0x60($out)
2566 lea 0x70($out), $out
2567
2568 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2569 jmp .Lxts_dec_done
2570.align 16
2571.Lxts_dec_6:
2572 pxor @XMM[8+4], @XMM[4]
2573 lea 0x60($inp), $inp
2574 pxor @XMM[8+5], @XMM[5]
2575 lea 0x80(%rsp), %rax # pass key schedule
2576 mov %edx, %r10d # pass rounds
2577
2578 call _bsaes_decrypt8
2579
2580 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2581 pxor 0x10(%rsp), @XMM[1]
2582 movdqu @XMM[0], 0x00($out) # write output
2583 pxor 0x20(%rsp), @XMM[6]
2584 movdqu @XMM[1], 0x10($out)
2585 pxor 0x30(%rsp), @XMM[4]
2586 movdqu @XMM[6], 0x20($out)
2587 pxor 0x40(%rsp), @XMM[2]
2588 movdqu @XMM[4], 0x30($out)
2589 pxor 0x50(%rsp), @XMM[7]
2590 movdqu @XMM[2], 0x40($out)
2591 movdqu @XMM[7], 0x50($out)
2592 lea 0x60($out), $out
2593
2594 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2595 jmp .Lxts_dec_done
2596.align 16
2597.Lxts_dec_5:
2598 pxor @XMM[8+3], @XMM[3]
2599 lea 0x50($inp), $inp
2600 pxor @XMM[8+4], @XMM[4]
2601 lea 0x80(%rsp), %rax # pass key schedule
2602 mov %edx, %r10d # pass rounds
2603
2604 call _bsaes_decrypt8
2605
2606 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2607 pxor 0x10(%rsp), @XMM[1]
2608 movdqu @XMM[0], 0x00($out) # write output
2609 pxor 0x20(%rsp), @XMM[6]
2610 movdqu @XMM[1], 0x10($out)
2611 pxor 0x30(%rsp), @XMM[4]
2612 movdqu @XMM[6], 0x20($out)
2613 pxor 0x40(%rsp), @XMM[2]
2614 movdqu @XMM[4], 0x30($out)
2615 movdqu @XMM[2], 0x40($out)
2616 lea 0x50($out), $out
2617
2618 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2619 jmp .Lxts_dec_done
2620.align 16
2621.Lxts_dec_4:
2622 pxor @XMM[8+2], @XMM[2]
2623 lea 0x40($inp), $inp
2624 pxor @XMM[8+3], @XMM[3]
2625 lea 0x80(%rsp), %rax # pass key schedule
2626 mov %edx, %r10d # pass rounds
2627
2628 call _bsaes_decrypt8
2629
2630 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2631 pxor 0x10(%rsp), @XMM[1]
2632 movdqu @XMM[0], 0x00($out) # write output
2633 pxor 0x20(%rsp), @XMM[6]
2634 movdqu @XMM[1], 0x10($out)
2635 pxor 0x30(%rsp), @XMM[4]
2636 movdqu @XMM[6], 0x20($out)
2637 movdqu @XMM[4], 0x30($out)
2638 lea 0x40($out), $out
2639
2640 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2641 jmp .Lxts_dec_done
2642.align 16
2643.Lxts_dec_3:
2644 pxor @XMM[8+1], @XMM[1]
2645 lea 0x30($inp), $inp
2646 pxor @XMM[8+2], @XMM[2]
2647 lea 0x80(%rsp), %rax # pass key schedule
2648 mov %edx, %r10d # pass rounds
2649
2650 call _bsaes_decrypt8
2651
2652 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2653 pxor 0x10(%rsp), @XMM[1]
2654 movdqu @XMM[0], 0x00($out) # write output
2655 pxor 0x20(%rsp), @XMM[6]
2656 movdqu @XMM[1], 0x10($out)
2657 movdqu @XMM[6], 0x20($out)
2658 lea 0x30($out), $out
2659
2660 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2661 jmp .Lxts_dec_done
2662.align 16
2663.Lxts_dec_2:
2664 pxor @XMM[8+0], @XMM[0]
2665 lea 0x20($inp), $inp
2666 pxor @XMM[8+1], @XMM[1]
2667 lea 0x80(%rsp), %rax # pass key schedule
2668 mov %edx, %r10d # pass rounds
2669
2670 call _bsaes_decrypt8
2671
2672 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2673 pxor 0x10(%rsp), @XMM[1]
2674 movdqu @XMM[0], 0x00($out) # write output
2675 movdqu @XMM[1], 0x10($out)
2676 lea 0x20($out), $out
2677
2678 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2679 jmp .Lxts_dec_done
2680.align 16
2681.Lxts_dec_1:
2682 pxor @XMM[0], @XMM[8]
2683 lea 0x10($inp), $inp
2684 movdqa @XMM[8], 0x20(%rbp)
2685 lea 0x20(%rbp), $arg1
2686 lea 0x20(%rbp), $arg2
2687 lea ($key), $arg3
2688 call asm_AES_decrypt # doesn't touch %xmm
2689 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2690 #pxor @XMM[8], @XMM[0]
2691 #lea 0x80(%rsp), %rax # pass key schedule
2692 #mov %edx, %r10d # pass rounds
2693 #call _bsaes_decrypt8
2694 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2695 movdqu @XMM[0], 0x00($out) # write output
2696 lea 0x10($out), $out
2697
2698 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2699
2700.Lxts_dec_done:
2701 and \$15, %ebx
2702 jz .Lxts_dec_ret
2703
2704 pxor $twtmp, $twtmp
2705 movdqa .Lxts_magic(%rip), $twmask
2706 pcmpgtd @XMM[7], $twtmp
2707 pshufd \$0x13, $twtmp, $twres
2708 movdqa @XMM[7], @XMM[6]
2709 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2710 pand $twmask, $twres # isolate carry and residue
2711 movdqu ($inp), @XMM[0]
2712 pxor $twres, @XMM[7]
2713
2714 lea 0x20(%rbp), $arg1
2715 pxor @XMM[7], @XMM[0]
2716 lea 0x20(%rbp), $arg2
2717 movdqa @XMM[0], 0x20(%rbp)
2718 lea ($key), $arg3
2719 call asm_AES_decrypt # doesn't touch %xmm
2720 pxor 0x20(%rbp), @XMM[7]
2721 mov $out, %rdx
2722 movdqu @XMM[7], ($out)
2723
2724.Lxts_dec_steal:
2725 movzb 16($inp), %eax
2726 movzb (%rdx), %ecx
2727 lea 1($inp), $inp
2728 mov %al, (%rdx)
2729 mov %cl, 16(%rdx)
2730 lea 1(%rdx), %rdx
2731 sub \$1,%ebx
2732 jnz .Lxts_dec_steal
2733
2734 movdqu ($out), @XMM[0]
2735 lea 0x20(%rbp), $arg1
2736 pxor @XMM[6], @XMM[0]
2737 lea 0x20(%rbp), $arg2
2738 movdqa @XMM[0], 0x20(%rbp)
2739 lea ($key), $arg3
2740 call asm_AES_decrypt # doesn't touch %xmm
2741 pxor 0x20(%rbp), @XMM[6]
2742 movdqu @XMM[6], ($out)
2743
2744.Lxts_dec_ret:
2745 lea (%rsp), %rax
2746 pxor %xmm0, %xmm0
2747.Lxts_dec_bzero: # wipe key schedule [if any]
2748 movdqa %xmm0, 0x00(%rax)
2749 movdqa %xmm0, 0x10(%rax)
2750 lea 0x20(%rax), %rax
2751 cmp %rax, %rbp
2752 ja .Lxts_dec_bzero
2753
2754 lea (%rbp),%rsp # restore %rsp
2755___
2756$code.=<<___ if ($win64);
2757 movaps 0x40(%rbp), %xmm6
2758 movaps 0x50(%rbp), %xmm7
2759 movaps 0x60(%rbp), %xmm8
2760 movaps 0x70(%rbp), %xmm9
2761 movaps 0x80(%rbp), %xmm10
2762 movaps 0x90(%rbp), %xmm11
2763 movaps 0xa0(%rbp), %xmm12
2764 movaps 0xb0(%rbp), %xmm13
2765 movaps 0xc0(%rbp), %xmm14
2766 movaps 0xd0(%rbp), %xmm15
2767 lea 0xa0(%rbp), %rsp
2768___
2769$code.=<<___;
2770 mov 0x48(%rsp), %r15
2771 mov 0x50(%rsp), %r14
2772 mov 0x58(%rsp), %r13
2773 mov 0x60(%rsp), %r12
2774 mov 0x68(%rsp), %rbx
2775 mov 0x70(%rsp), %rax
2776 lea 0x78(%rsp), %rsp
2777 mov %rax, %rbp
2778.Lxts_dec_epilogue:
2779 ret
2780.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2781___
2782}
2783$code.=<<___;
2784.type _bsaes_const,\@object
2785.align 64
2786_bsaes_const:
2787.LM0ISR: # InvShiftRows constants
2788 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2789.LISRM0:
2790 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2791.LISR:
2792 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2793.LBS0: # bit-slice constants
2794 .quad 0x5555555555555555, 0x5555555555555555
2795.LBS1:
2796 .quad 0x3333333333333333, 0x3333333333333333
2797.LBS2:
2798 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2799.LSR: # shiftrows constants
2800 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2801.LSRM0:
2802 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2803.LM0:
2804 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2805.LM0SR:
2806 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2807.LNOT: # magic constants
2808 .quad 0xffffffffffffffff, 0xffffffffffffffff
2809.L63:
2810 .quad 0x6363636363636363, 0x6363636363636363
2811.LSWPUP: # byte-swap upper dword
2812 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2813.LSWPUPM0SR:
2814 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2815.LADD1: # counter increment constants
2816 .quad 0x0000000000000000, 0x0000000100000000
2817.LADD2:
2818 .quad 0x0000000000000000, 0x0000000200000000
2819.LADD3:
2820 .quad 0x0000000000000000, 0x0000000300000000
2821.LADD4:
2822 .quad 0x0000000000000000, 0x0000000400000000
2823.LADD5:
2824 .quad 0x0000000000000000, 0x0000000500000000
2825.LADD6:
2826 .quad 0x0000000000000000, 0x0000000600000000
2827.LADD7:
2828 .quad 0x0000000000000000, 0x0000000700000000
2829.LADD8:
2830 .quad 0x0000000000000000, 0x0000000800000000
2831.Lxts_magic:
2832 .long 0x87,0,1,0
2833.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2834.align 64
2835.size _bsaes_const,.-_bsaes_const
2836___
2837
2838# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2839# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2840if ($win64) {
2841$rec="%rcx";
2842$frame="%rdx";
2843$context="%r8";
2844$disp="%r9";
2845
2846$code.=<<___;
2847.extern __imp_RtlVirtualUnwind
2848.type se_handler,\@abi-omnipotent
2849.align 16
2850se_handler:
2851 push %rsi
2852 push %rdi
2853 push %rbx
2854 push %rbp
2855 push %r12
2856 push %r13
2857 push %r14
2858 push %r15
2859 pushfq
2860 sub \$64,%rsp
2861
2862 mov 120($context),%rax # pull context->Rax
2863 mov 248($context),%rbx # pull context->Rip
2864
2865 mov 8($disp),%rsi # disp->ImageBase
2866 mov 56($disp),%r11 # disp->HandlerData
2867
2868 mov 0(%r11),%r10d # HandlerData[0]
2869 lea (%rsi,%r10),%r10 # prologue label
2870 cmp %r10,%rbx # context->Rip<prologue label
2871 jb .Lin_prologue
2872
2873 mov 152($context),%rax # pull context->Rsp
2874
2875 mov 4(%r11),%r10d # HandlerData[1]
2876 lea (%rsi,%r10),%r10 # epilogue label
2877 cmp %r10,%rbx # context->Rip>=epilogue label
2878 jae .Lin_prologue
2879
2880 mov 160($context),%rax # pull context->Rbp
2881
2882 lea 0x40(%rax),%rsi # %xmm save area
2883 lea 512($context),%rdi # &context.Xmm6
2884 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2885 .long 0xa548f3fc # cld; rep movsq
2886 lea 0xa0(%rax),%rax # adjust stack pointer
2887
2888 mov 0x70(%rax),%rbp
2889 mov 0x68(%rax),%rbx
2890 mov 0x60(%rax),%r12
2891 mov 0x58(%rax),%r13
2892 mov 0x50(%rax),%r14
2893 mov 0x48(%rax),%r15
2894 lea 0x78(%rax),%rax # adjust stack pointer
2895 mov %rbx,144($context) # restore context->Rbx
2896 mov %rbp,160($context) # restore context->Rbp
2897 mov %r12,216($context) # restore context->R12
2898 mov %r13,224($context) # restore context->R13
2899 mov %r14,232($context) # restore context->R14
2900 mov %r15,240($context) # restore context->R15
2901
2902.Lin_prologue:
2903 mov %rax,152($context) # restore context->Rsp
2904
2905 mov 40($disp),%rdi # disp->ContextRecord
2906 mov $context,%rsi # context
2907 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2908 .long 0xa548f3fc # cld; rep movsq
2909
2910 mov $disp,%rsi
2911 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2912 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2913 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2914 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2915 mov 40(%rsi),%r10 # disp->ContextRecord
2916 lea 56(%rsi),%r11 # &disp->HandlerData
2917 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2918 mov %r10,32(%rsp) # arg5
2919 mov %r11,40(%rsp) # arg6
2920 mov %r12,48(%rsp) # arg7
2921 mov %rcx,56(%rsp) # arg8, (NULL)
2922 call *__imp_RtlVirtualUnwind(%rip)
2923
2924 mov \$1,%eax # ExceptionContinueSearch
2925 add \$64,%rsp
2926 popfq
2927 pop %r15
2928 pop %r14
2929 pop %r13
2930 pop %r12
2931 pop %rbp
2932 pop %rbx
2933 pop %rdi
2934 pop %rsi
2935 ret
2936.size se_handler,.-se_handler
2937
2938.section .pdata
2939.align 4
2940___
2941$code.=<<___ if ($ecb);
2942 .rva .Lecb_enc_prologue
2943 .rva .Lecb_enc_epilogue
2944 .rva .Lecb_enc_info
2945
2946 .rva .Lecb_dec_prologue
2947 .rva .Lecb_dec_epilogue
2948 .rva .Lecb_dec_info
2949___
2950$code.=<<___;
2951 .rva .Lcbc_dec_prologue
2952 .rva .Lcbc_dec_epilogue
2953 .rva .Lcbc_dec_info
2954
2955 .rva .Lctr_enc_prologue
2956 .rva .Lctr_enc_epilogue
2957 .rva .Lctr_enc_info
2958
2959 .rva .Lxts_enc_prologue
2960 .rva .Lxts_enc_epilogue
2961 .rva .Lxts_enc_info
2962
2963 .rva .Lxts_dec_prologue
2964 .rva .Lxts_dec_epilogue
2965 .rva .Lxts_dec_info
2966
2967.section .xdata
2968.align 8
2969___
2970$code.=<<___ if ($ecb);
2971.Lecb_enc_info:
2972 .byte 9,0,0,0
2973 .rva se_handler
2974 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
2975.Lecb_dec_info:
2976 .byte 9,0,0,0
2977 .rva se_handler
2978 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
2979___
2980$code.=<<___;
2981.Lcbc_dec_info:
2982 .byte 9,0,0,0
2983 .rva se_handler
2984 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
2985.Lctr_enc_info:
2986 .byte 9,0,0,0
2987 .rva se_handler
2988 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
2989.Lxts_enc_info:
2990 .byte 9,0,0,0
2991 .rva se_handler
2992 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
2993.Lxts_dec_info:
2994 .byte 9,0,0,0
2995 .rva se_handler
2996 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
2997___
2998}
2999
3000$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3001
3002print $code;
3003
3004close STDOUT;