Import OpenSSL-1.0.1a.
[dragonfly.git] / crypto / openssl / crypto / aes / asm / bsaes-x86_64.pl
CommitLineData
672590bc
PA
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as a transliteration to "perlasm"; the original code has since
18# undergone the following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - above was possible thanks to mixcolumns() modification that
24# allowed to feed its output back to aesenc[last], this was
25# achieved at cost of two additional inter-registers moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28# relies on conversion of "conventional" key schedule as returned
29# by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31# to skip one shiftrows(), reduce bit-sliced key schedule and
32# speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) Comparison is not completely fair, because "this" is ECB,
45# i.e. no extra processing such as counter values calculation
46# and xor-ing input as in Emilia's CTR implementation is
47# performed. However, the CTR calculations stand for not more
48# than 1% of total time, so comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***) Slowdown on Atom is rather strange per se, because original
54# implementation has a number of 9+-bytes instructions, which
55# are bad for Atom front-end, and which I eliminated completely.
56# In attempt to address deterioration sbox() was tested in FP
57# SIMD "domain" (movaps instead of movdqa, xorps instead of
58# pxor, etc.). While it resulted in nominal 4% improvement on
59# Atom, it hurt Westmere by more than a factor of 2.
60#
61# As for key schedule conversion subroutine. Interface to OpenSSL
62# relies on per-invocation on-the-fly conversion. This naturally
63# has impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# conversion conversion/8x block
503f6d4b
PA
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
672590bc
PA
71#
72# The ratio values mean that 128-byte blocks will be processed
503f6d4b 73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
672590bc
PA
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 11.0
87# Nehalem 9.16
503f6d4b 88# Atom 20.9
672590bc
PA
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
# Command-line handling: the first argument is the perlasm "flavour" and the
# second is the output file. If the first argument contains a dot it is taken
# to be the output file itself and the flavour is left undefined.
97$flavour = shift;
98$output = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
# Win64 conventions are selected for nasm/masm/mingw64 flavours or when the
# output file name ends in .asm.
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
# Locate the x86_64-xlate.pl translator: first next to this script, then in
# ../../perlasm relative to it.
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
# Everything printed to STDOUT is piped through the translator. The
# interpreter path is quoted so that a perl installed under a directory
# containing spaces still works, and a failure to create the pipe is fatal
# instead of being silently ignored.
108open STDOUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
109
# Register names used by the benchmarking entry points. Only four values are
# supplied for the five names, so $ivp is left undefined here.
110my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
# @XMM[0] is %xmm15 and @XMM[1..15] are %xmm0..%xmm14.
111my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
112my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
113
114{
115my ($key,$rounds,$const)=("%rax","%r10d","%r11");
116
# Sbox: append the bit-sliced S-box to $code as three stages run over the
# same register set: InBasisChange, Inv_GF256 and OutBasisChange.
# Arguments: [0..7] the eight bit-slice registers, [8..11] temporaries (t),
# [12..15] temporaries (s). The index permutations handed to Inv_GF256 and
# OutBasisChange follow the output order documented for the stage before.
117sub Sbox {
118# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
119# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
120my @b=@_[0..7];
121my @t=@_[8..11];
122my @s=@_[12..15];
123 &InBasisChange (@b);
124 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
125 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
126}
127
# InBasisChange: append the S-box input basis change to $code.
# Takes the eight bit-slice registers and emits 13 pxor instructions that
# transform them in place; no temporary registers are used.
128sub InBasisChange {
129# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
130# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
131my @b=@_[0..7];
132$code.=<<___;
133 pxor @b[6], @b[5]
134 pxor @b[1], @b[2]
135 pxor @b[0], @b[3]
136 pxor @b[2], @b[6]
137 pxor @b[0], @b[5]
138
139 pxor @b[3], @b[6]
140 pxor @b[7], @b[3]
141 pxor @b[5], @b[7]
142 pxor @b[4], @b[3]
143 pxor @b[5], @b[4]
144 pxor @b[1], @b[3]
145
146 pxor @b[7], @b[2]
147 pxor @b[5], @b[1]
148___
149}
150
# OutBasisChange: append the S-box output basis change to $code.
# Takes the eight bit-slice registers and emits 11 pxor instructions that
# transform them in place; no temporary registers are used.
151sub OutBasisChange {
152# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
153# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
154my @b=@_[0..7];
155$code.=<<___;
156 pxor @b[6], @b[0]
157 pxor @b[4], @b[1]
158 pxor @b[0], @b[2]
159 pxor @b[6], @b[4]
160 pxor @b[1], @b[6]
161
162 pxor @b[5], @b[1]
163 pxor @b[3], @b[5]
164 pxor @b[7], @b[3]
165 pxor @b[5], @b[7]
166 pxor @b[5], @b[2]
167
168 pxor @b[7], @b[4]
169___
170}
171
# InvSbox: append the bit-sliced inverse S-box to $code as three stages:
# InvInBasisChange, Inv_GF256 (shared with Sbox) and InvOutBasisChange.
# Arguments: [0..7] the eight bit-slice registers, [8..11] temporaries (t),
# [12..15] temporaries (s).
172sub InvSbox {
173# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
174# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
175my @b=@_[0..7];
176my @t=@_[8..11];
177my @s=@_[12..15];
178 &InvInBasisChange (@b);
179 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
180 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
181}
182
# InvInBasisChange: append the input basis change of the inverse S-box.
# The eight register arguments are first re-indexed through the permutation
# [5,1,2,6,3,7,0,4]; the 11 pxor instructions then operate in place on the
# permuted names. The heredoc assignment has no trailing semicolon, which is
# legal because it is the last statement of the sub.
183sub InvInBasisChange { # OutBasisChange in reverse
184my @b=@_[5,1,2,6,3,7,0,4];
185$code.=<<___
186 pxor @b[7], @b[4]
187
188 pxor @b[5], @b[7]
189 pxor @b[5], @b[2]
190 pxor @b[7], @b[3]
191 pxor @b[3], @b[5]
192 pxor @b[5], @b[1]
193
194 pxor @b[1], @b[6]
195 pxor @b[0], @b[2]
196 pxor @b[6], @b[4]
197 pxor @b[6], @b[0]
198 pxor @b[4], @b[1]
199___
200}
201
# InvOutBasisChange: append the output basis change of the inverse S-box.
# The eight register arguments are first re-indexed through the permutation
# [2,5,7,3,6,1,0,4]; the 13 pxor instructions then operate in place on the
# permuted names.
202sub InvOutBasisChange { # InBasisChange in reverse
203my @b=@_[2,5,7,3,6,1,0,4];
204$code.=<<___;
205 pxor @b[5], @b[1]
206 pxor @b[7], @b[2]
207
208 pxor @b[1], @b[3]
209 pxor @b[5], @b[4]
210 pxor @b[5], @b[7]
211 pxor @b[4], @b[3]
212 pxor @b[0], @b[5]
213 pxor @b[7], @b[3]
214 pxor @b[2], @b[6]
215 pxor @b[1], @b[2]
216 pxor @b[3], @b[6]
217
218 pxor @b[0], @b[3]
219 pxor @b[6], @b[5]
220___
221}
222
# Mul_GF4: append an 8-instruction multiplication sequence to $code.
# Arguments: ($x0,$x1) operand registers that also receive the result,
# ($y0,$y1) second operand registers, which are only read, and $t0, a
# scratch register that is overwritten.
223sub Mul_GF4 {
224#;*************************************************************
225#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
226#;*************************************************************
227my ($x0,$x1,$y0,$y1,$t0)=@_;
228$code.=<<___;
229 movdqa $y0, $t0
230 pxor $y1, $t0
231 pand $x0, $t0
232 pxor $x1, $x0
233 pand $y0, $x1
234 pand $y1, $x0
235 pxor $x1, $x0
236 pxor $t0, $x1
237___
238}
239
# Mul_GF4_N: same argument list and first six instructions as Mul_GF4; only
# the final two pxor instructions differ (the "scale by N" step named in the
# comment below). Kept for reference: the interleaved Mul_GF4_N_GF4 is what
# the rest of the visible code calls.
240sub Mul_GF4_N { # not used, see next subroutine
241# multiply and scale by N
242my ($x0,$x1,$y0,$y1,$t0)=@_;
243$code.=<<___;
244 movdqa $y0, $t0
245 pxor $y1, $t0
246 pand $x0, $t0
247 pxor $x1, $x0
248 pand $y0, $x1
249 pand $y1, $x0
250 pxor $x0, $x1
251 pxor $t0, $x0
252___
253}
254
# Mul_GF4_N_GF4: append two independent multiplications with their
# instructions interleaved line by line.
# Arguments 0..4 ($x0,$x1,$y0,$y1,$t0) get the Mul_GF4_N sequence.
# Arguments 5..9 ($x2,$x3,$y2,$y3,$t1) get the Mul_GF4 sequence.
# Results land in ($x0,$x1) and ($x2,$x3); $t0 and $t1 are scratch.
255sub Mul_GF4_N_GF4 {
256# interleaved Mul_GF4_N and Mul_GF4
257my ($x0,$x1,$y0,$y1,$t0,
258 $x2,$x3,$y2,$y3,$t1)=@_;
259$code.=<<___;
260 movdqa $y0, $t0
261 movdqa $y2, $t1
262 pxor $y1, $t0
263 pxor $y3, $t1
264 pand $x0, $t0
265 pand $x2, $t1
266 pxor $x1, $x0
267 pxor $x3, $x2
268 pand $y0, $x1
269 pand $y2, $x3
270 pand $y1, $x0
271 pand $y3, $x2
272 pxor $x0, $x1
273 pxor $x3, $x2
274 pxor $t0, $x0
275 pxor $t1, $x3
276___
277}
# Mul_GF16_2: append code that processes two register quadruples, x[0..3]
# and x[4..7], against the same y[0..3], built from Mul_GF4 and
# Mul_GF4_N_GF4.
# Arguments: [0..7] x registers (updated in place), [8..11] y registers,
# [12..15] temporaries t[0..3].
# y[0] and y[1] are XORed with y[2] and y[3] before the first
# Mul_GF4_N_GF4 call and XORed again after the second, which restores their
# original values for the final Mul_GF4.
278sub Mul_GF16_2 {
279my @x=@_[0..7];
280my @y=@_[8..11];
281my @t=@_[12..15];
282$code.=<<___;
283 movdqa @x[0], @t[0]
284 movdqa @x[1], @t[1]
285___
286 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
287$code.=<<___;
288 pxor @x[2], @t[0]
289 pxor @x[3], @t[1]
290 pxor @y[2], @y[0]
291 pxor @y[3], @y[1]
292___
# Called without the leading '&' here; with an explicit argument list this
# is the same call as the '&' form used below.
293 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
294 @x[2], @x[3], @y[2], @y[3], @t[2]);
295$code.=<<___;
296 pxor @t[0], @x[0]
297 pxor @t[0], @x[2]
298 pxor @t[1], @x[1]
299 pxor @t[1], @x[3]
300
301 movdqa @x[4], @t[0]
302 movdqa @x[5], @t[1]
303 pxor @x[6], @t[0]
304 pxor @x[7], @t[1]
305___
306 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
307 @x[6], @x[7], @y[2], @y[3], @t[2]);
308$code.=<<___;
309 pxor @y[2], @y[0]
310 pxor @y[3], @y[1]
311___
312 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
313$code.=<<___;
314 pxor @t[0], @x[4]
315 pxor @t[0], @x[6]
316 pxor @t[1], @x[5]
317 pxor @t[1], @x[7]
318___
319}
# Inv_GF256: append the inversion stage shared by Sbox and InvSbox.
# Arguments: [0..7] x registers (input, overwritten with the result),
# [8..11] temporaries t[0..3], [12..15] temporaries s[0..3].
# The heredoc leaves four intermediate values in s[3], s[2], s[1] and t[1]
# (see the comment after it). These are passed to Mul_GF16_2 as its y
# operands, with s[0], t[0], t[2] and t[3] as its scratch registers.
320sub Inv_GF256 {
321#;********************************************************************
322#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
323#;********************************************************************
324my @x=@_[0..7];
325my @t=@_[8..11];
326my @s=@_[12..15];
327# direct optimizations from hardware
328$code.=<<___;
329 movdqa @x[4], @t[3]
330 movdqa @x[5], @t[2]
331 movdqa @x[1], @t[1]
332 movdqa @x[7], @s[1]
333 movdqa @x[0], @s[0]
334
335 pxor @x[6], @t[3]
336 pxor @x[7], @t[2]
337 pxor @x[3], @t[1]
338 movdqa @t[3], @s[2]
339 pxor @x[6], @s[1]
340 movdqa @t[2], @t[0]
341 pxor @x[2], @s[0]
342 movdqa @t[3], @s[3]
343
344 por @t[1], @t[2]
345 por @s[0], @t[3]
346 pxor @t[0], @s[3]
347 pand @s[0], @s[2]
348 pxor @t[1], @s[0]
349 pand @t[1], @t[0]
350 pand @s[0], @s[3]
351 movdqa @x[3], @s[0]
352 pxor @x[2], @s[0]
353 pand @s[0], @s[1]
354 pxor @s[1], @t[3]
355 pxor @s[1], @t[2]
356 movdqa @x[4], @s[1]
357 movdqa @x[1], @s[0]
358 pxor @x[5], @s[1]
359 pxor @x[0], @s[0]
360 movdqa @s[1], @t[1]
361 pand @s[0], @s[1]
362 por @s[0], @t[1]
363 pxor @s[1], @t[0]
364 pxor @s[3], @t[3]
365 pxor @s[2], @t[2]
366 pxor @s[3], @t[1]
367 movdqa @x[7], @s[0]
368 pxor @s[2], @t[0]
369 movdqa @x[6], @s[1]
370 pxor @s[2], @t[1]
371 movdqa @x[5], @s[2]
372 pand @x[3], @s[0]
373 movdqa @x[4], @s[3]
374 pand @x[2], @s[1]
375 pand @x[1], @s[2]
376 por @x[0], @s[3]
377 pxor @s[0], @t[3]
378 pxor @s[1], @t[2]
379 pxor @s[2], @t[1]
380 pxor @s[3], @t[0]
381
382 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
383
384 # new smaller inversion
385
386 movdqa @t[3], @s[0]
387 pand @t[1], @t[3]
388 pxor @t[2], @s[0]
389
390 movdqa @t[0], @s[2]
391 movdqa @s[0], @s[3]
392 pxor @t[3], @s[2]
393 pand @s[2], @s[3]
394
395 movdqa @t[1], @s[1]
396 pxor @t[2], @s[3]
397 pxor @t[0], @s[1]
398
399 pxor @t[2], @t[3]
400
401 pand @t[3], @s[1]
402
403 movdqa @s[2], @t[2]
404 pxor @t[0], @s[1]
405
406 pxor @s[1], @t[2]
407 pxor @s[1], @t[1]
408
409 pand @t[0], @t[2]
410
411 pxor @t[2], @s[2]
412 pxor @t[2], @t[1]
413
414 pand @s[3], @s[2]
415
416 pxor @s[0], @s[2]
417___
418# output in s3, s2, s1, t1
419
420# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
421
422# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
423 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
424
425### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
426}
427
428# AES linear components
429
# ShiftRows: append code that XORs eight 16-byte values at
# 0x00..0x70($key) into x[0..7], applies pshufb with $mask to each
# register, and advances $key by 0x80.
# Despite the name, the round-key addition is therefore folded in here.
# Arguments: [0..7] the registers; the LAST argument is the register holding
# the shuffle mask (taken with pop, so it is argument 8 at the call sites in
# this file).
# $key is the lexical of the enclosing block ("%rax"), not a parameter.
430sub ShiftRows {
431my @x=@_[0..7];
432my $mask=pop;
433$code.=<<___;
434 pxor 0x00($key),@x[0]
435 pxor 0x10($key),@x[1]
436 pshufb $mask,@x[0]
437 pxor 0x20($key),@x[2]
438 pshufb $mask,@x[1]
439 pxor 0x30($key),@x[3]
440 pshufb $mask,@x[2]
441 pxor 0x40($key),@x[4]
442 pshufb $mask,@x[3]
443 pxor 0x50($key),@x[5]
444 pshufb $mask,@x[4]
445 pxor 0x60($key),@x[6]
446 pshufb $mask,@x[5]
447 pxor 0x70($key),@x[7]
448 pshufb $mask,@x[6]
449 lea 0x80($key),$key
450 pshufb $mask,@x[7]
451___
452}
453
# MixColumns: append the bit-sliced MixColumns step to $code.
# Arguments: [0..7] the state registers x (updated in place), [8..15] eight
# temporaries t, all of which are overwritten.
# The sequence uses pshufd 0x93 (annotated "<<< 32" below) and pshufd 0x4E
# (annotated "<<< 64") combined with pxor. The last instructions move
# results between registers so that the output order suits the next round,
# as noted in the comment on the first line of the body.
454sub MixColumns {
455# modified to emit output in order suitable for feeding back to aesenc[last]
456my @x=@_[0..7];
457my @t=@_[8..15];
458$code.=<<___;
459 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
460 pshufd \$0x93, @x[1], @t[1]
461 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
462 pshufd \$0x93, @x[2], @t[2]
463 pxor @t[1], @x[1]
464 pshufd \$0x93, @x[3], @t[3]
465 pxor @t[2], @x[2]
466 pshufd \$0x93, @x[4], @t[4]
467 pxor @t[3], @x[3]
468 pshufd \$0x93, @x[5], @t[5]
469 pxor @t[4], @x[4]
470 pshufd \$0x93, @x[6], @t[6]
471 pxor @t[5], @x[5]
472 pshufd \$0x93, @x[7], @t[7]
473 pxor @t[6], @x[6]
474 pxor @t[7], @x[7]
475
476 pxor @x[0], @t[1]
477 pxor @x[7], @t[0]
478 pxor @x[7], @t[1]
479 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
480 pxor @x[1], @t[2]
481 pshufd \$0x4E, @x[1], @x[1]
482 pxor @x[4], @t[5]
483 pxor @t[0], @x[0]
484 pxor @x[5], @t[6]
485 pxor @t[1], @x[1]
486 pxor @x[3], @t[4]
487 pshufd \$0x4E, @x[4], @t[0]
488 pxor @x[6], @t[7]
489 pshufd \$0x4E, @x[5], @t[1]
490 pxor @x[2], @t[3]
491 pshufd \$0x4E, @x[3], @x[4]
492 pxor @x[7], @t[3]
493 pshufd \$0x4E, @x[7], @x[5]
494 pxor @x[7], @t[4]
495 pshufd \$0x4E, @x[6], @x[3]
496 pxor @t[4], @t[0]
497 pshufd \$0x4E, @x[2], @x[6]
498 pxor @t[5], @t[1]
499
500 pxor @t[3], @x[4]
501 pxor @t[7], @x[5]
502 pxor @t[6], @x[3]
503 movdqa @t[0], @x[2]
504 pxor @t[2], @x[6]
505 movdqa @t[1], @x[7]
506___
507}
508
# InvMixColumns: append the bit-sliced inverse MixColumns step to $code.
# Arguments: [0..7] the state registers x, [8..15] eight temporaries t.
# The emitted code runs in four stages, labelled in the heredocs as
# multiplication by 0x0e, 0x0b, 0x0d and 0x09. Several temporaries are
# deliberately clobbered and later restored, as marked inline.
# The final result is accumulated in t[0..7] and then copied into
# @XMM[0..7], the file-level register list, regardless of which registers
# were passed as x.
509sub InvMixColumns {
510my @x=@_[0..7];
511my @t=@_[8..15];
512
513$code.=<<___;
514 # multiplication by 0x0e
515 pshufd \$0x93, @x[7], @t[7]
516 movdqa @x[2], @t[2]
517 pxor @x[5], @x[7] # 7 5
518 pxor @x[5], @x[2] # 2 5
519 pshufd \$0x93, @x[0], @t[0]
520 movdqa @x[5], @t[5]
521 pxor @x[0], @x[5] # 5 0 [1]
522 pxor @x[1], @x[0] # 0 1
523 pshufd \$0x93, @x[1], @t[1]
524 pxor @x[2], @x[1] # 1 25
525 pxor @x[6], @x[0] # 01 6 [2]
526 pxor @x[3], @x[1] # 125 3 [4]
527 pshufd \$0x93, @x[3], @t[3]
528 pxor @x[0], @x[2] # 25 016 [3]
529 pxor @x[7], @x[3] # 3 75
530 pxor @x[6], @x[7] # 75 6 [0]
531 pshufd \$0x93, @x[6], @t[6]
532 movdqa @x[4], @t[4]
533 pxor @x[4], @x[6] # 6 4
534 pxor @x[3], @x[4] # 4 375 [6]
535 pxor @x[7], @x[3] # 375 756=36
536 pxor @t[5], @x[6] # 64 5 [7]
537 pxor @t[2], @x[3] # 36 2
538 pxor @t[4], @x[3] # 362 4 [5]
539 pshufd \$0x93, @t[5], @t[5]
540___
# @y renames the x registers in the order given by the bracketed indices
# [0]..[7] annotated in the stage above.
541 my @y = @x[7,5,0,2,1,3,4,6];
542$code.=<<___;
543 # multiplication by 0x0b
544 pxor @y[0], @y[1]
545 pxor @t[0], @y[0]
546 pxor @t[1], @y[1]
547 pshufd \$0x93, @t[2], @t[2]
548 pxor @t[5], @y[0]
549 pxor @t[6], @y[1]
550 pxor @t[7], @y[0]
551 pshufd \$0x93, @t[4], @t[4]
552 pxor @t[6], @t[7] # clobber t[7]
553 pxor @y[0], @y[1]
554
555 pxor @t[0], @y[3]
556 pshufd \$0x93, @t[0], @t[0]
557 pxor @t[1], @y[2]
558 pxor @t[1], @y[4]
559 pxor @t[2], @y[2]
560 pshufd \$0x93, @t[1], @t[1]
561 pxor @t[2], @y[3]
562 pxor @t[2], @y[5]
563 pxor @t[7], @y[2]
564 pshufd \$0x93, @t[2], @t[2]
565 pxor @t[3], @y[3]
566 pxor @t[3], @y[6]
567 pxor @t[3], @y[4]
568 pshufd \$0x93, @t[3], @t[3]
569 pxor @t[4], @y[7]
570 pxor @t[4], @y[5]
571 pxor @t[7], @y[7]
572 pxor @t[5], @y[3]
573 pxor @t[4], @y[4]
574 pxor @t[5], @t[7] # clobber t[7] even more
575
576 pxor @t[7], @y[5]
577 pshufd \$0x93, @t[4], @t[4]
578 pxor @t[7], @y[6]
579 pxor @t[7], @y[4]
580
581 pxor @t[5], @t[7]
582 pshufd \$0x93, @t[5], @t[5]
583 pxor @t[6], @t[7] # restore t[7]
584
585 # multiplication by 0x0d
586 pxor @y[7], @y[4]
587 pxor @t[4], @y[7]
588 pshufd \$0x93, @t[6], @t[6]
589 pxor @t[0], @y[2]
590 pxor @t[5], @y[7]
591 pxor @t[2], @y[2]
592 pshufd \$0x93, @t[7], @t[7]
593
594 pxor @y[1], @y[3]
595 pxor @t[1], @y[1]
596 pxor @t[0], @y[0]
597 pxor @t[0], @y[3]
598 pxor @t[5], @y[1]
599 pxor @t[5], @y[0]
600 pxor @t[7], @y[1]
601 pshufd \$0x93, @t[0], @t[0]
602 pxor @t[6], @y[0]
603 pxor @y[1], @y[3]
604 pxor @t[1], @y[4]
605 pshufd \$0x93, @t[1], @t[1]
606
607 pxor @t[7], @y[7]
608 pxor @t[2], @y[4]
609 pxor @t[2], @y[5]
610 pshufd \$0x93, @t[2], @t[2]
611 pxor @t[6], @y[2]
612 pxor @t[3], @t[6] # clobber t[6]
613 pxor @y[7], @y[4]
614 pxor @t[6], @y[3]
615
616 pxor @t[6], @y[6]
617 pxor @t[5], @y[5]
618 pxor @t[4], @y[6]
619 pshufd \$0x93, @t[4], @t[4]
620 pxor @t[6], @y[5]
621 pxor @t[7], @y[6]
622 pxor @t[3], @t[6] # restore t[6]
623
624 pshufd \$0x93, @t[5], @t[5]
625 pshufd \$0x93, @t[6], @t[6]
626 pshufd \$0x93, @t[7], @t[7]
627 pshufd \$0x93, @t[3], @t[3]
628
629 # multiplication by 0x09
630 pxor @y[1], @y[4]
631 pxor @y[1], @t[1] # t[1]=y[1]
632 pxor @t[5], @t[0] # clobber t[0]
633 pxor @t[5], @t[1]
634 pxor @t[0], @y[3]
635 pxor @y[0], @t[0] # t[0]=y[0]
636 pxor @t[6], @t[1]
637 pxor @t[7], @t[6] # clobber t[6]
638 pxor @t[1], @y[4]
639 pxor @t[4], @y[7]
640 pxor @y[4], @t[4] # t[4]=y[4]
641 pxor @t[3], @y[6]
642 pxor @y[3], @t[3] # t[3]=y[3]
643 pxor @t[2], @y[5]
644 pxor @y[2], @t[2] # t[2]=y[2]
645 pxor @t[7], @t[3]
646 pxor @y[5], @t[5] # t[5]=y[5]
647 pxor @t[6], @t[2]
648 pxor @t[6], @t[5]
649 pxor @y[6], @t[6] # t[6]=y[6]
650 pxor @y[7], @t[7] # t[7]=y[7]
651
652 movdqa @t[0],@XMM[0]
653 movdqa @t[1],@XMM[1]
654 movdqa @t[2],@XMM[2]
655 movdqa @t[3],@XMM[3]
656 movdqa @t[4],@XMM[4]
657 movdqa @t[5],@XMM[5]
658 movdqa @t[6],@XMM[6]
659 movdqa @t[7],@XMM[7]
660___
661}
662
# aesenc: append one full round built from the helpers above. It loads the
# shuffle mask at 0x30($const) into t[0], then emits ShiftRows, Sbox and
# MixColumns.
# Arguments: [0..7] state registers, [8..15] temporaries.
# MixColumns is given the registers in the Sbox output order
# [0,1,4,6,3,7,2,5]. Marked as not used; the main loop emitted further down
# inlines the same sequence.
663sub aesenc { # not used
664my @b=@_[0..7];
665my @t=@_[8..15];
666$code.=<<___;
667 movdqa 0x30($const),@t[0] # .LSR
668___
669 &ShiftRows (@b,@t[0]);
670 &Sbox (@b,@t);
671 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
672}
673
# aesenclast: append a final round. It loads the shuffle mask at
# 0x40($const) into t[0], emits ShiftRows and Sbox (no MixColumns), then
# XORs eight 16-byte values at 0x00..0x70($key) into the registers in the
# Sbox output order b[0],b[1],b[4],b[6],b[3],b[7],b[2],b[5].
# Arguments: [0..7] state registers, [8..15] temporaries. Marked as not used.
# The closing heredoc assignment has no trailing semicolon, which is legal
# because it is the last statement of the sub.
674sub aesenclast { # not used
675my @b=@_[0..7];
676my @t=@_[8..15];
677$code.=<<___;
678 movdqa 0x40($const),@t[0] # .LSRM0
679___
680 &ShiftRows (@b,@t[0]);
681 &Sbox (@b,@t);
682$code.=<<___
683 pxor 0x00($key),@b[0]
684 pxor 0x10($key),@b[1]
685 pxor 0x20($key),@b[4]
686 pxor 0x30($key),@b[6]
687 pxor 0x40($key),@b[3]
688 pxor 0x50($key),@b[7]
689 pxor 0x60($key),@b[2]
690 pxor 0x70($key),@b[5]
691___
692}
693
# swapmove: append a 7-instruction masked bit exchange between registers.
# Arguments: $a, $b registers to exchange bits between; $n shift count,
# emitted as an immediate; $mask register holding the bit mask; $t scratch
# register, which is overwritten.
# Effect of the emitted code, per 64-bit lane (psrlq/psllq shift quadwords):
# d = (($b >> $n) ^ $a) & $mask; $a ^= d; $b ^= d << $n
# That is, the bits of $a selected by $mask are swapped with the bits of $b
# selected by $mask << $n.
694sub swapmove {
695my ($a,$b,$n,$mask,$t)=@_;
696$code.=<<___;
697 movdqa $b,$t
698 psrlq \$$n,$b
699 pxor $a,$b
700 pand $mask,$b
701 pxor $b,$a
702 psllq \$$n,$b
703 pxor $t,$b
704___
705}
# swapmove2x: append two swapmove sequences, interleaved instruction by
# instruction, sharing one shift count and one mask.
# Arguments: ($a0,$b0) first register pair, ($a1,$b1) second register pair,
# $n shift count, $mask mask register, $t0/$t1 scratch registers, one per
# pair, both overwritten.
706sub swapmove2x {
707my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
708$code.=<<___;
709 movdqa $b0,$t0
710 psrlq \$$n,$b0
711 movdqa $b1,$t1
712 psrlq \$$n,$b1
713 pxor $a0,$b0
714 pxor $a1,$b1
715 pand $mask,$b0
716 pand $mask,$b1
717 pxor $b0,$a0
718 psllq \$$n,$b0
719 pxor $b1,$a1
720 psllq \$$n,$b1
721 pxor $t0,$b0
722 pxor $t1,$b1
723___
724}
725
# bitslice: append the transposition that converts eight registers between
# normal and bit-sliced representation.
# Arguments: [0..7] the eight registers (used in reversed order internally),
# [8..11] four scratch registers $t0..$t3.
# Three masks are loaded from the constants table at 0x00/0x10/0x20($const)
# (.LBS0/.LBS1/.LBS2 per the inline comments). The registers then go through
# three layers of swapmove2x with shift counts 1, 2 and 4. $t0 is reloaded
# with the third mask once the first layer is done, and $t2/$t3 serve as the
# swapmove scratch registers throughout.
726sub bitslice {
727my @x=reverse(@_[0..7]);
728my ($t0,$t1,$t2,$t3)=@_[8..11];
729$code.=<<___;
730 movdqa 0x00($const),$t0 # .LBS0
731 movdqa 0x10($const),$t1 # .LBS1
732___
733 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
734 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
735$code.=<<___;
736 movdqa 0x20($const),$t0 # .LBS2
737___
738 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
739 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
740
741 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
742 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
743}
744
# Emit the two internal 8-block primitives, _bsaes_encrypt8 and
# _bsaes_decrypt8.
# Register interface, from the lexicals of the enclosing block and the code
# below:
# @XMM[0..7] - the eight 16-byte blocks, processed in place
# $key (%rax) - pointer to the converted key schedule
# $rounds (%r10d) - number of rounds
# $const (%r11) - loaded here with the address of .LBS0
# @XMM[8..15] are used as scratch.
# Both routines start by XORing the 16-byte round 0 key into every block and
# applying a byte shuffle (.LM0SR / .LM0ISR per the inline comments). The
# blocks are then bit-sliced and run through the round loop.
745$code.=<<___;
746.text
747
748.extern asm_AES_encrypt
749.extern asm_AES_decrypt
750
751.type _bsaes_encrypt8,\@abi-omnipotent
752.align 64
753_bsaes_encrypt8:
754 lea .LBS0(%rip), $const # constants table
755
756 movdqa ($key), @XMM[9] # round 0 key
757 lea 0x10($key), $key
503f6d4b 758 movdqa 0x50($const), @XMM[8] # .LM0SR
672590bc
PA
759 pxor @XMM[9], @XMM[0] # xor with round0 key
760 pxor @XMM[9], @XMM[1]
761 pshufb @XMM[8], @XMM[0]
762 pxor @XMM[9], @XMM[2]
763 pshufb @XMM[8], @XMM[1]
764 pxor @XMM[9], @XMM[3]
765 pshufb @XMM[8], @XMM[2]
766 pxor @XMM[9], @XMM[4]
767 pshufb @XMM[8], @XMM[3]
768 pxor @XMM[9], @XMM[5]
769 pshufb @XMM[8], @XMM[4]
770 pxor @XMM[9], @XMM[6]
771 pshufb @XMM[8], @XMM[5]
772 pxor @XMM[9], @XMM[7]
773 pshufb @XMM[8], @XMM[6]
774 pshufb @XMM[8], @XMM[7]
775_bsaes_encrypt8_bitslice:
776___
# Convert the eight blocks to bit-sliced form.
777 &bitslice (@XMM[0..7, 8..11]);
# Round loop. The first pass jumps straight to .Lenc_sbox, skipping the
# ShiftRows helper, since round 0 key addition and shuffle were done above.
778$code.=<<___;
779 dec $rounds
780 jmp .Lenc_sbox
781.align 16
782.Lenc_loop:
783___
784 &ShiftRows (@XMM[0..7, 8]);
785$code.=".Lenc_sbox:\n";
786 &Sbox (@XMM[0..7, 8..15]);
787$code.=<<___;
788 dec $rounds
789 jl .Lenc_done
790___
# MixColumns receives the registers in the Sbox output order. The jnz that
# follows tests the flags left by the preceding "dec $rounds"; MixColumns
# emits only pshufd/pxor/movdqa in between. When the counter has reached
# zero, the mask at 0x40($const) (.LSRM0) is loaded instead for the final
# pass.
791 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
792$code.=<<___;
793 movdqa 0x30($const), @XMM[8] # .LSR
794 jnz .Lenc_loop
795 movdqa 0x40($const), @XMM[8] # .LSRM0
796 jmp .Lenc_loop
797.align 16
798.Lenc_done:
799___
# Convert back from bit-sliced form (registers in Sbox output order) and add
# the 16-byte last round key.
800 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
801 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
802$code.=<<___;
803 movdqa ($key), @XMM[8] # last round key
804 pxor @XMM[8], @XMM[4]
805 pxor @XMM[8], @XMM[6]
806 pxor @XMM[8], @XMM[3]
807 pxor @XMM[8], @XMM[7]
808 pxor @XMM[8], @XMM[2]
809 pxor @XMM[8], @XMM[5]
810 pxor @XMM[8], @XMM[0]
811 pxor @XMM[8], @XMM[1]
812 ret
813.size _bsaes_encrypt8,.-_bsaes_encrypt8
814
815.type _bsaes_decrypt8,\@abi-omnipotent
816.align 64
817_bsaes_decrypt8:
818 lea .LBS0(%rip), $const # constants table
819
820 movdqa ($key), @XMM[9] # round 0 key
821 lea 0x10($key), $key
822 movdqa -0x30($const), @XMM[8] # .LM0ISR
823 pxor @XMM[9], @XMM[0] # xor with round0 key
824 pxor @XMM[9], @XMM[1]
825 pshufb @XMM[8], @XMM[0]
826 pxor @XMM[9], @XMM[2]
827 pshufb @XMM[8], @XMM[1]
828 pxor @XMM[9], @XMM[3]
829 pshufb @XMM[8], @XMM[2]
830 pxor @XMM[9], @XMM[4]
831 pshufb @XMM[8], @XMM[3]
832 pxor @XMM[9], @XMM[5]
833 pshufb @XMM[8], @XMM[4]
834 pxor @XMM[9], @XMM[6]
835 pshufb @XMM[8], @XMM[5]
836 pxor @XMM[9], @XMM[7]
837 pshufb @XMM[8], @XMM[6]
838 pshufb @XMM[8], @XMM[7]
839___
# Decryption mirrors the encryption structure with InvSbox and
# InvMixColumns; its shuffle masks sit at negative offsets from $const.
840 &bitslice (@XMM[0..7, 8..11]);
841$code.=<<___;
842 dec $rounds
843 jmp .Ldec_sbox
844.align 16
845.Ldec_loop:
846___
847 &ShiftRows (@XMM[0..7, 8]);
848$code.=".Ldec_sbox:\n";
849 &InvSbox (@XMM[0..7, 8..15]);
850$code.=<<___;
851 dec $rounds
852 jl .Ldec_done
853___
# InvMixColumns receives the registers in the InvSbox output order and
# leaves its result in @XMM[0..7].
854 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
855$code.=<<___;
856 movdqa -0x10($const), @XMM[8] # .LISR
857 jnz .Ldec_loop
858 movdqa -0x20($const), @XMM[8] # .LISRM0
859 jmp .Ldec_loop
860.align 16
861.Ldec_done:
862___
863 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
864$code.=<<___;
865 movdqa ($key), @XMM[8] # last round key
866 pxor @XMM[8], @XMM[6]
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[2]
869 pxor @XMM[8], @XMM[7]
870 pxor @XMM[8], @XMM[3]
871 pxor @XMM[8], @XMM[5]
872 pxor @XMM[8], @XMM[0]
873 pxor @XMM[8], @XMM[1]
874 ret
875.size _bsaes_decrypt8,.-_bsaes_decrypt8
876___
877}
878{
879my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
880
# bitslice_key: variant of bitslice in which some swapmove steps are
# replaced by plain register copies (movdqa).
# Arguments: [0..7] the eight registers (used in reversed order internally),
# [8..10] registers holding the three masks ($bs0,$bs1,$bs2), [11..12]
# scratch registers ($t2,$t3).
# No call to this sub appears in the portion of the file shown here.
# NOTE(review): the "#&swapmove..." lines that sit INSIDE the heredocs are
# interpolated and emitted into the assembly output as comment lines; they
# are not Perl comments. The "#&swapmove2x(@x[4,5,6,7]..." line between the
# heredocs is an ordinary Perl comment.
881sub bitslice_key {
882my @x=reverse(@_[0..7]);
883my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
884
885 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
886$code.=<<___;
887 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
888 movdqa @x[0], @x[2]
889 movdqa @x[1], @x[3]
890___
891 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
892
893 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
894$code.=<<___;
895 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
896 movdqa @x[0], @x[4]
897 movdqa @x[2], @x[6]
898 movdqa @x[1], @x[5]
899 movdqa @x[3], @x[7]
900___
901 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
902 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
903}
904
# Emit _bsaes_key_convert, which turns a conventional key schedule into the
# bit-sliced form used by _bsaes_encrypt8/_bsaes_decrypt8.
# Register interface, from the lexicals of the enclosing block:
# $inp (%rcx) - source key schedule, read with unaligned loads
# $out (%rax) - destination, written with aligned stores
# $rounds (%r10d) - number of rounds
# $const (%r11) - loaded here with the address of .Lmasks
# The round 0 key is copied through unchanged (16 bytes). Each further round
# key is expanded by the loop into eight 16-byte words (0x80 bytes), one per
# bit position, using pand/pcmpeqb against the bit masks kept in
# %xmm0..%xmm3. The words at offsets 0x00, 0x10, 0x50 and 0x60 are
# complemented ("pnot").
# On return the last round key has been loaded into %xmm6 but NOT stored,
# %xmm7 holds the constant at 0x50($const) (.L63 per the inline comment),
# and $out points at the slot where the caller is expected to store the last
# round key.
905$code.=<<___;
906.type _bsaes_key_convert,\@abi-omnipotent
907.align 16
908_bsaes_key_convert:
503f6d4b 909 lea .Lmasks(%rip), $const
672590bc 910 movdqu ($inp), %xmm7 # load round 0 key
672590bc 911 lea 0x10($inp), $inp
503f6d4b
PA
912 movdqa 0x00($const), %xmm0 # 0x01...
913 movdqa 0x10($const), %xmm1 # 0x02...
914 movdqa 0x20($const), %xmm2 # 0x04...
915 movdqa 0x30($const), %xmm3 # 0x08...
916 movdqa 0x40($const), %xmm4 # .LM0
917 pcmpeqd %xmm5, %xmm5 # .LNOT
918
919 movdqu ($inp), %xmm6 # load round 1 key
672590bc
PA
920 movdqa %xmm7, ($out) # save round 0 key
921 lea 0x10($out), $out
922 dec $rounds
923 jmp .Lkey_loop
924.align 16
925.Lkey_loop:
503f6d4b
PA
926 pshufb %xmm4, %xmm6 # .LM0
927
928 movdqa %xmm0, %xmm8
929 movdqa %xmm1, %xmm9
930
931 pand %xmm6, %xmm8
932 pand %xmm6, %xmm9
933 movdqa %xmm2, %xmm10
934 pcmpeqb %xmm0, %xmm8
935 psllq \$4, %xmm0 # 0x10...
936 movdqa %xmm3, %xmm11
937 pcmpeqb %xmm1, %xmm9
938 psllq \$4, %xmm1 # 0x20...
939
940 pand %xmm6, %xmm10
941 pand %xmm6, %xmm11
942 movdqa %xmm0, %xmm12
943 pcmpeqb %xmm2, %xmm10
944 psllq \$4, %xmm2 # 0x40...
945 movdqa %xmm1, %xmm13
946 pcmpeqb %xmm3, %xmm11
947 psllq \$4, %xmm3 # 0x80...
948
949 movdqa %xmm2, %xmm14
950 movdqa %xmm3, %xmm15
951 pxor %xmm5, %xmm8 # "pnot"
952 pxor %xmm5, %xmm9
953
954 pand %xmm6, %xmm12
955 pand %xmm6, %xmm13
956 movdqa %xmm8, 0x00($out) # write bit-sliced round key
957 pcmpeqb %xmm0, %xmm12
958 psrlq \$4, %xmm0 # 0x01...
959 movdqa %xmm9, 0x10($out)
960 pcmpeqb %xmm1, %xmm13
961 psrlq \$4, %xmm1 # 0x02...
962 lea 0x10($inp), $inp
963
964 pand %xmm6, %xmm14
965 pand %xmm6, %xmm15
966 movdqa %xmm10, 0x20($out)
967 pcmpeqb %xmm2, %xmm14
968 psrlq \$4, %xmm2 # 0x04...
969 movdqa %xmm11, 0x30($out)
970 pcmpeqb %xmm3, %xmm15
971 psrlq \$4, %xmm3 # 0x08...
972 movdqu ($inp), %xmm6 # load next round key
973
974 pxor %xmm5, %xmm13 # "pnot"
975 pxor %xmm5, %xmm14
976 movdqa %xmm12, 0x40($out)
977 movdqa %xmm13, 0x50($out)
978 movdqa %xmm14, 0x60($out)
979 movdqa %xmm15, 0x70($out)
672590bc 980 lea 0x80($out),$out
672590bc
PA
981 dec $rounds
982 jnz .Lkey_loop
983
503f6d4b 984 movdqa 0x50($const), %xmm7 # .L63
672590bc
PA
985 #movdqa %xmm6, ($out) # don't save last round key
986 ret
987.size _bsaes_key_convert,.-_bsaes_key_convert
988___
989}
990
# Benchmark-only entry points. The condition starts with a literal 0, so
# this block is never emitted as written.
# It would define four global functions using the file-level register names
# ($inp=%rdi, $out=%rsi, $len=%rdx, $key=%rcx):
# bsaes_enc_key_convert / bsaes_dec_key_convert - read the round count at
# offset 240 of the key structure, call _bsaes_key_convert and store the
# fixed-up first/last round keys;
# bsaes_encrypt_128 / bsaes_decrypt_128 - process 0x80 bytes (eight
# blocks) per iteration with the round count hard-coded to 10. $len is
# decremented by 0x80 per pass and the loop continues while the result
# is above zero.
# The output stores follow the register order produced by _bsaes_encrypt8
# ([0,1,4,6,3,7,2,5]) and _bsaes_decrypt8 ([0,1,6,4,2,7,3,5]).
991if (0 && !$win64) { # following four functions are unsupported interface
992 # used for benchmarking...
993$code.=<<___;
994.globl bsaes_enc_key_convert
995.type bsaes_enc_key_convert,\@function,2
996.align 16
997bsaes_enc_key_convert:
998 mov 240($inp),%r10d # pass rounds
999 mov $inp,%rcx # pass key
1000 mov $out,%rax # pass key schedule
1001 call _bsaes_key_convert
1002 pxor %xmm6,%xmm7 # fix up last round key
1003 movdqa %xmm7,(%rax) # save last round key
1004 ret
1005.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1006
1007.globl bsaes_encrypt_128
1008.type bsaes_encrypt_128,\@function,4
1009.align 16
1010bsaes_encrypt_128:
1011.Lenc128_loop:
1012 movdqu 0x00($inp), @XMM[0] # load input
1013 movdqu 0x10($inp), @XMM[1]
1014 movdqu 0x20($inp), @XMM[2]
1015 movdqu 0x30($inp), @XMM[3]
1016 movdqu 0x40($inp), @XMM[4]
1017 movdqu 0x50($inp), @XMM[5]
1018 movdqu 0x60($inp), @XMM[6]
1019 movdqu 0x70($inp), @XMM[7]
1020 mov $key, %rax # pass the $key
1021 lea 0x80($inp), $inp
1022 mov \$10,%r10d
1023
1024 call _bsaes_encrypt8
1025
1026 movdqu @XMM[0], 0x00($out) # write output
1027 movdqu @XMM[1], 0x10($out)
1028 movdqu @XMM[4], 0x20($out)
1029 movdqu @XMM[6], 0x30($out)
1030 movdqu @XMM[3], 0x40($out)
1031 movdqu @XMM[7], 0x50($out)
1032 movdqu @XMM[2], 0x60($out)
1033 movdqu @XMM[5], 0x70($out)
1034 lea 0x80($out), $out
1035 sub \$0x80,$len
1036 ja .Lenc128_loop
1037 ret
1038.size bsaes_encrypt_128,.-bsaes_encrypt_128
1039
1040.globl bsaes_dec_key_convert
1041.type bsaes_dec_key_convert,\@function,2
1042.align 16
1043bsaes_dec_key_convert:
1044 mov 240($inp),%r10d # pass rounds
1045 mov $inp,%rcx # pass key
1046 mov $out,%rax # pass key schedule
1047 call _bsaes_key_convert
1048 pxor ($out),%xmm7 # fix up round 0 key
1049 movdqa %xmm6,(%rax) # save last round key
1050 movdqa %xmm7,($out)
1051 ret
1052.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1053
1054.globl bsaes_decrypt_128
1055.type bsaes_decrypt_128,\@function,4
1056.align 16
1057bsaes_decrypt_128:
1058.Ldec128_loop:
1059 movdqu 0x00($inp), @XMM[0] # load input
1060 movdqu 0x10($inp), @XMM[1]
1061 movdqu 0x20($inp), @XMM[2]
1062 movdqu 0x30($inp), @XMM[3]
1063 movdqu 0x40($inp), @XMM[4]
1064 movdqu 0x50($inp), @XMM[5]
1065 movdqu 0x60($inp), @XMM[6]
1066 movdqu 0x70($inp), @XMM[7]
1067 mov $key, %rax # pass the $key
1068 lea 0x80($inp), $inp
1069 mov \$10,%r10d
1070
1071 call _bsaes_decrypt8
1072
1073 movdqu @XMM[0], 0x00($out) # write output
1074 movdqu @XMM[1], 0x10($out)
1075 movdqu @XMM[6], 0x20($out)
1076 movdqu @XMM[4], 0x30($out)
1077 movdqu @XMM[2], 0x40($out)
1078 movdqu @XMM[7], 0x50($out)
1079 movdqu @XMM[3], 0x60($out)
1080 movdqu @XMM[5], 0x70($out)
1081 lea 0x80($out), $out
1082 sub \$0x80,$len
1083 ja .Ldec128_loop
1084 ret
1085.size bsaes_decrypt_128,.-bsaes_decrypt_128
1086___
1087}
1088{
1089######################################################################
1090#
1091# OpenSSL interface
1092#
# Incoming argument registers for the public entry points. The Win64 set is
# used when $win64 is true, otherwise the second list is used. The sixth
# argument is named as a 32-bit register in both lists.
1093my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1094 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
# Registers in which the entry point emitted below keeps its copies of the
# incoming arguments (input, output, length, key). These shadow the
# file-level names for the rest of this block.
1095my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1096
1097if ($ecb) {
1098$code.=<<___;
1099.globl bsaes_ecb_encrypt_blocks
1100.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1101.align 16
1102bsaes_ecb_encrypt_blocks:
1103 mov %rsp, %rax
1104.Lecb_enc_prologue:
1105 push %rbp
1106 push %rbx
1107 push %r12
1108 push %r13
1109 push %r14
1110 push %r15
1111 lea -0x48(%rsp),%rsp
1112___
1113$code.=<<___ if ($win64);
1114 lea -0xa0(%rsp), %rsp
1115 movaps %xmm6, 0x40(%rsp)
1116 movaps %xmm7, 0x50(%rsp)
1117 movaps %xmm8, 0x60(%rsp)
1118 movaps %xmm9, 0x70(%rsp)
1119 movaps %xmm10, 0x80(%rsp)
1120 movaps %xmm11, 0x90(%rsp)
1121 movaps %xmm12, 0xa0(%rsp)
1122 movaps %xmm13, 0xb0(%rsp)
1123 movaps %xmm14, 0xc0(%rsp)
1124 movaps %xmm15, 0xd0(%rsp)
1125.Lecb_enc_body:
1126___
1127$code.=<<___;
1128 mov %rsp,%rbp # backup %rsp
1129 mov 240($arg4),%eax # rounds
1130 mov $arg1,$inp # backup arguments
1131 mov $arg2,$out
1132 mov $arg3,$len
1133 mov $arg4,$key
1134 cmp \$8,$arg3
1135 jb .Lecb_enc_short
1136
1137 mov %eax,%ebx # backup rounds
1138 shl \$7,%rax # 128 bytes per inner round key
1139 sub \$`128-32`,%rax # size of bit-sliced key schedule
1140 sub %rax,%rsp
1141 mov %rsp,%rax # pass key schedule
1142 mov $key,%rcx # pass key
1143 mov %ebx,%r10d # pass rounds
1144 call _bsaes_key_convert
1145 pxor %xmm6,%xmm7 # fix up last round key
1146 movdqa %xmm7,(%rax) # save last round key
1147
1148 sub \$8,$len
1149.Lecb_enc_loop:
1150 movdqu 0x00($inp), @XMM[0] # load input
1151 movdqu 0x10($inp), @XMM[1]
1152 movdqu 0x20($inp), @XMM[2]
1153 movdqu 0x30($inp), @XMM[3]
1154 movdqu 0x40($inp), @XMM[4]
1155 movdqu 0x50($inp), @XMM[5]
1156 mov %rsp, %rax # pass key schedule
1157 movdqu 0x60($inp), @XMM[6]
1158 mov %ebx,%r10d # pass rounds
1159 movdqu 0x70($inp), @XMM[7]
1160 lea 0x80($inp), $inp
1161
1162 call _bsaes_encrypt8
1163
1164 movdqu @XMM[0], 0x00($out) # write output
1165 movdqu @XMM[1], 0x10($out)
1166 movdqu @XMM[4], 0x20($out)
1167 movdqu @XMM[6], 0x30($out)
1168 movdqu @XMM[3], 0x40($out)
1169 movdqu @XMM[7], 0x50($out)
1170 movdqu @XMM[2], 0x60($out)
1171 movdqu @XMM[5], 0x70($out)
1172 lea 0x80($out), $out
1173 sub \$8,$len
1174 jnc .Lecb_enc_loop
1175
1176 add \$8,$len
1177 jz .Lecb_enc_done
1178
1179 movdqu 0x00($inp), @XMM[0] # load input
1180 mov %rsp, %rax # pass key schedule
1181 mov %ebx,%r10d # pass rounds
1182 cmp \$2,$len
1183 jb .Lecb_enc_one
1184 movdqu 0x10($inp), @XMM[1]
1185 je .Lecb_enc_two
1186 movdqu 0x20($inp), @XMM[2]
1187 cmp \$4,$len
1188 jb .Lecb_enc_three
1189 movdqu 0x30($inp), @XMM[3]
1190 je .Lecb_enc_four
1191 movdqu 0x40($inp), @XMM[4]
1192 cmp \$6,$len
1193 jb .Lecb_enc_five
1194 movdqu 0x50($inp), @XMM[5]
1195 je .Lecb_enc_six
1196 movdqu 0x60($inp), @XMM[6]
1197 call _bsaes_encrypt8
1198 movdqu @XMM[0], 0x00($out) # write output
1199 movdqu @XMM[1], 0x10($out)
1200 movdqu @XMM[4], 0x20($out)
1201 movdqu @XMM[6], 0x30($out)
1202 movdqu @XMM[3], 0x40($out)
1203 movdqu @XMM[7], 0x50($out)
1204 movdqu @XMM[2], 0x60($out)
1205 jmp .Lecb_enc_done
1206.align 16
1207.Lecb_enc_six:
1208 call _bsaes_encrypt8
1209 movdqu @XMM[0], 0x00($out) # write output
1210 movdqu @XMM[1], 0x10($out)
1211 movdqu @XMM[4], 0x20($out)
1212 movdqu @XMM[6], 0x30($out)
1213 movdqu @XMM[3], 0x40($out)
1214 movdqu @XMM[7], 0x50($out)
1215 jmp .Lecb_enc_done
1216.align 16
1217.Lecb_enc_five:
1218 call _bsaes_encrypt8
1219 movdqu @XMM[0], 0x00($out) # write output
1220 movdqu @XMM[1], 0x10($out)
1221 movdqu @XMM[4], 0x20($out)
1222 movdqu @XMM[6], 0x30($out)
1223 movdqu @XMM[3], 0x40($out)
1224 jmp .Lecb_enc_done
1225.align 16
1226.Lecb_enc_four:
1227 call _bsaes_encrypt8
1228 movdqu @XMM[0], 0x00($out) # write output
1229 movdqu @XMM[1], 0x10($out)
1230 movdqu @XMM[4], 0x20($out)
1231 movdqu @XMM[6], 0x30($out)
1232 jmp .Lecb_enc_done
1233.align 16
1234.Lecb_enc_three:
1235 call _bsaes_encrypt8
1236 movdqu @XMM[0], 0x00($out) # write output
1237 movdqu @XMM[1], 0x10($out)
1238 movdqu @XMM[4], 0x20($out)
1239 jmp .Lecb_enc_done
1240.align 16
1241.Lecb_enc_two:
1242 call _bsaes_encrypt8
1243 movdqu @XMM[0], 0x00($out) # write output
1244 movdqu @XMM[1], 0x10($out)
1245 jmp .Lecb_enc_done
1246.align 16
1247.Lecb_enc_one:
1248 call _bsaes_encrypt8
1249 movdqu @XMM[0], 0x00($out) # write output
1250 jmp .Lecb_enc_done
1251.align 16
1252.Lecb_enc_short:
1253 lea ($inp), $arg1
1254 lea ($out), $arg2
1255 lea ($key), $arg3
1256 call asm_AES_encrypt
1257 lea 16($inp), $inp
1258 lea 16($out), $out
1259 dec $len
1260 jnz .Lecb_enc_short
1261
# Common exit: zero the stack from the current %rsp up to %rbp (the value
# restored into %rsp right after this loop), 32 bytes per pass, so that no
# bit-sliced key schedule material is left behind.
# The compare computes %rbp - %rax, so the loop has to continue on "above"
# (%rax still below %rbp).  Branching on "below" stopped after the first
# 32 bytes whenever a key schedule had been allocated, and kept storing
# past %rbp whenever %rax had already overtaken it.
1262.Lecb_enc_done:
1263 lea (%rsp),%rax
1264 pxor %xmm0, %xmm0
1265.Lecb_enc_bzero: # wipe key schedule [if any]
1266 movdqa %xmm0, 0x00(%rax)
1267 movdqa %xmm0, 0x10(%rax)
1268 lea 0x20(%rax), %rax
1269 cmp %rax, %rbp
1270 ja .Lecb_enc_bzero
1271
1272 lea (%rbp),%rsp # restore %rsp
1273___
1274$code.=<<___ if ($win64);
1275 movaps 0x40(%rbp), %xmm6
1276 movaps 0x50(%rbp), %xmm7
1277 movaps 0x60(%rbp), %xmm8
1278 movaps 0x70(%rbp), %xmm9
1279 movaps 0x80(%rbp), %xmm10
1280 movaps 0x90(%rbp), %xmm11
1281 movaps 0xa0(%rbp), %xmm12
1282 movaps 0xb0(%rbp), %xmm13
1283 movaps 0xc0(%rbp), %xmm14
1284 movaps 0xd0(%rbp), %xmm15
1285 lea 0xa0(%rbp), %rsp
1286___
1287$code.=<<___;
1288 mov 0x48(%rsp), %r15
1289 mov 0x50(%rsp), %r14
1290 mov 0x58(%rsp), %r13
1291 mov 0x60(%rsp), %r12
1292 mov 0x68(%rsp), %rbx
1293 mov 0x70(%rsp), %rax
1294 lea 0x78(%rsp), %rsp
1295 mov %rax, %rbp
1296.Lecb_enc_epilogue:
1297 ret
1298.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1299
1300.globl bsaes_ecb_decrypt_blocks
1301.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1302.align 16
1303bsaes_ecb_decrypt_blocks:
1304 mov %rsp, %rax
1305.Lecb_dec_prologue:
1306 push %rbp
1307 push %rbx
1308 push %r12
1309 push %r13
1310 push %r14
1311 push %r15
1312 lea -0x48(%rsp),%rsp
1313___
1314$code.=<<___ if ($win64);
1315 lea -0xa0(%rsp), %rsp
1316 movaps %xmm6, 0x40(%rsp)
1317 movaps %xmm7, 0x50(%rsp)
1318 movaps %xmm8, 0x60(%rsp)
1319 movaps %xmm9, 0x70(%rsp)
1320 movaps %xmm10, 0x80(%rsp)
1321 movaps %xmm11, 0x90(%rsp)
1322 movaps %xmm12, 0xa0(%rsp)
1323 movaps %xmm13, 0xb0(%rsp)
1324 movaps %xmm14, 0xc0(%rsp)
1325 movaps %xmm15, 0xd0(%rsp)
1326.Lecb_dec_body:
1327___
1328$code.=<<___;
1329 mov %rsp,%rbp # backup %rsp
1330 mov 240($arg4),%eax # rounds
1331 mov $arg1,$inp # backup arguments
1332 mov $arg2,$out
1333 mov $arg3,$len
1334 mov $arg4,$key
1335 cmp \$8,$arg3
1336 jb .Lecb_dec_short
1337
1338 mov %eax,%ebx # backup rounds
1339 shl \$7,%rax # 128 bytes per inner round key
1340 sub \$`128-32`,%rax # size of bit-sliced key schedule
1341 sub %rax,%rsp
1342 mov %rsp,%rax # pass key schedule
1343 mov $key,%rcx # pass key
1344 mov %ebx,%r10d # pass rounds
1345 call _bsaes_key_convert
1346 pxor (%rsp),%xmm7 # fix up 0 round key
1347 movdqa %xmm6,(%rax) # save last round key
1348 movdqa %xmm7,(%rsp)
1349
1350 sub \$8,$len
1351.Lecb_dec_loop:
1352 movdqu 0x00($inp), @XMM[0] # load input
1353 movdqu 0x10($inp), @XMM[1]
1354 movdqu 0x20($inp), @XMM[2]
1355 movdqu 0x30($inp), @XMM[3]
1356 movdqu 0x40($inp), @XMM[4]
1357 movdqu 0x50($inp), @XMM[5]
1358 mov %rsp, %rax # pass key schedule
1359 movdqu 0x60($inp), @XMM[6]
1360 mov %ebx,%r10d # pass rounds
1361 movdqu 0x70($inp), @XMM[7]
1362 lea 0x80($inp), $inp
1363
1364 call _bsaes_decrypt8
1365
1366 movdqu @XMM[0], 0x00($out) # write output
1367 movdqu @XMM[1], 0x10($out)
1368 movdqu @XMM[6], 0x20($out)
1369 movdqu @XMM[4], 0x30($out)
1370 movdqu @XMM[2], 0x40($out)
1371 movdqu @XMM[7], 0x50($out)
1372 movdqu @XMM[3], 0x60($out)
1373 movdqu @XMM[5], 0x70($out)
1374 lea 0x80($out), $out
1375 sub \$8,$len
1376 jnc .Lecb_dec_loop
1377
1378 add \$8,$len
1379 jz .Lecb_dec_done
1380
1381 movdqu 0x00($inp), @XMM[0] # load input
1382 mov %rsp, %rax # pass key schedule
1383 mov %ebx,%r10d # pass rounds
1384 cmp \$2,$len
1385 jb .Lecb_dec_one
1386 movdqu 0x10($inp), @XMM[1]
1387 je .Lecb_dec_two
1388 movdqu 0x20($inp), @XMM[2]
1389 cmp \$4,$len
1390 jb .Lecb_dec_three
1391 movdqu 0x30($inp), @XMM[3]
1392 je .Lecb_dec_four
1393 movdqu 0x40($inp), @XMM[4]
1394 cmp \$6,$len
1395 jb .Lecb_dec_five
1396 movdqu 0x50($inp), @XMM[5]
1397 je .Lecb_dec_six
1398 movdqu 0x60($inp), @XMM[6]
1399 call _bsaes_decrypt8
1400 movdqu @XMM[0], 0x00($out) # write output
1401 movdqu @XMM[1], 0x10($out)
1402 movdqu @XMM[6], 0x20($out)
1403 movdqu @XMM[4], 0x30($out)
1404 movdqu @XMM[2], 0x40($out)
1405 movdqu @XMM[7], 0x50($out)
1406 movdqu @XMM[3], 0x60($out)
1407 jmp .Lecb_dec_done
1408.align 16
1409.Lecb_dec_six:
1410 call _bsaes_decrypt8
1411 movdqu @XMM[0], 0x00($out) # write output
1412 movdqu @XMM[1], 0x10($out)
1413 movdqu @XMM[6], 0x20($out)
1414 movdqu @XMM[4], 0x30($out)
1415 movdqu @XMM[2], 0x40($out)
1416 movdqu @XMM[7], 0x50($out)
1417 jmp .Lecb_dec_done
1418.align 16
1419.Lecb_dec_five:
1420 call _bsaes_decrypt8
1421 movdqu @XMM[0], 0x00($out) # write output
1422 movdqu @XMM[1], 0x10($out)
1423 movdqu @XMM[6], 0x20($out)
1424 movdqu @XMM[4], 0x30($out)
1425 movdqu @XMM[2], 0x40($out)
1426 jmp .Lecb_dec_done
1427.align 16
1428.Lecb_dec_four:
1429 call _bsaes_decrypt8
1430 movdqu @XMM[0], 0x00($out) # write output
1431 movdqu @XMM[1], 0x10($out)
1432 movdqu @XMM[6], 0x20($out)
1433 movdqu @XMM[4], 0x30($out)
1434 jmp .Lecb_dec_done
1435.align 16
1436.Lecb_dec_three:
1437 call _bsaes_decrypt8
1438 movdqu @XMM[0], 0x00($out) # write output
1439 movdqu @XMM[1], 0x10($out)
1440 movdqu @XMM[6], 0x20($out)
1441 jmp .Lecb_dec_done
1442.align 16
1443.Lecb_dec_two:
1444 call _bsaes_decrypt8
1445 movdqu @XMM[0], 0x00($out) # write output
1446 movdqu @XMM[1], 0x10($out)
1447 jmp .Lecb_dec_done
1448.align 16
1449.Lecb_dec_one:
1450 call _bsaes_decrypt8
1451 movdqu @XMM[0], 0x00($out) # write output
1452 jmp .Lecb_dec_done
1453.align 16
1454.Lecb_dec_short:
1455 lea ($inp), $arg1
1456 lea ($out), $arg2
1457 lea ($key), $arg3
1458 call asm_AES_decrypt
1459 lea 16($inp), $inp
1460 lea 16($out), $out
1461 dec $len
1462 jnz .Lecb_dec_short
1463
# Common exit: zero the stack from the current %rsp up to %rbp (the frame
# base saved right after the prologue), 32 bytes per pass, so that no
# bit-sliced key schedule material is left behind.
# The compare computes %rbp - %rax, so the loop has to continue on "above"
# (%rax still below %rbp).  Branching on "below" was wrong in both cases:
#  - with a key schedule allocated it fell out after the first 32 bytes,
#    leaving the rest of the schedule on the stack;
#  - on the short path (%rsp equal to %rbp) %rax was already past %rbp
#    after one pass, so the loop never ended and zeroed the saved
#    registers and return address above the frame.
1464.Lecb_dec_done:
1465 lea (%rsp),%rax
1466 pxor %xmm0, %xmm0
1467.Lecb_dec_bzero: # wipe key schedule [if any]
1468 movdqa %xmm0, 0x00(%rax)
1469 movdqa %xmm0, 0x10(%rax)
1470 lea 0x20(%rax), %rax
1471 cmp %rax, %rbp
1472 ja .Lecb_dec_bzero
1473
1474 lea (%rbp),%rsp # restore %rsp
1475___
1476$code.=<<___ if ($win64);
1477 movaps 0x40(%rbp), %xmm6
1478 movaps 0x50(%rbp), %xmm7
1479 movaps 0x60(%rbp), %xmm8
1480 movaps 0x70(%rbp), %xmm9
1481 movaps 0x80(%rbp), %xmm10
1482 movaps 0x90(%rbp), %xmm11
1483 movaps 0xa0(%rbp), %xmm12
1484 movaps 0xb0(%rbp), %xmm13
1485 movaps 0xc0(%rbp), %xmm14
1486 movaps 0xd0(%rbp), %xmm15
1487 lea 0xa0(%rbp), %rsp
1488___
1489$code.=<<___;
1490 mov 0x48(%rsp), %r15
1491 mov 0x50(%rsp), %r14
1492 mov 0x58(%rsp), %r13
1493 mov 0x60(%rsp), %r12
1494 mov 0x68(%rsp), %rbx
1495 mov 0x70(%rsp), %rax
1496 lea 0x78(%rsp), %rsp
1497 mov %rax, %rbp
1498.Lecb_dec_epilogue:
1499 ret
1500.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1501___
1502}
1503$code.=<<___;
# bsaes_cbc_encrypt entry: only the case "direction flag is zero and
# length is at least 128 bytes" is handled by the bit-sliced code that
# follows; everything else is tail-jumped to asm_AES_cbc_encrypt with
# the argument registers and stack untouched.
1504.extern asm_AES_cbc_encrypt
1505.globl bsaes_cbc_encrypt
1506.type bsaes_cbc_encrypt,\@abi-omnipotent
1507.align 16
1508bsaes_cbc_encrypt:
1509___
1510$code.=<<___ if ($win64);
1511 mov 48(%rsp),$arg6 # pull direction flag
1512___
1513$code.=<<___;
1514 cmp \$0,$arg6 # direction flag
1515 jne asm_AES_cbc_encrypt # non-zero: not handled here, tail-jump
1516 cmp \$128,$arg3 # length in bytes (shifted down to blocks later)
1517 jb asm_AES_cbc_encrypt # under 128 bytes: tail-jump as well
1518
1519 mov %rsp, %rax
1520.Lcbc_dec_prologue:
1521 push %rbp
1522 push %rbx
1523 push %r12
1524 push %r13
1525 push %r14
1526 push %r15
1527 lea -0x48(%rsp), %rsp
1528___
1529$code.=<<___ if ($win64);
1530 mov 0xa0(%rsp),$arg5 # pull ivp
1531 lea -0xa0(%rsp), %rsp
1532 movaps %xmm6, 0x40(%rsp)
1533 movaps %xmm7, 0x50(%rsp)
1534 movaps %xmm8, 0x60(%rsp)
1535 movaps %xmm9, 0x70(%rsp)
1536 movaps %xmm10, 0x80(%rsp)
1537 movaps %xmm11, 0x90(%rsp)
1538 movaps %xmm12, 0xa0(%rsp)
1539 movaps %xmm13, 0xb0(%rsp)
1540 movaps %xmm14, 0xc0(%rsp)
1541 movaps %xmm15, 0xd0(%rsp)
1542.Lcbc_dec_body:
1543___
1544$code.=<<___;
1545 mov %rsp, %rbp # backup %rsp
1546 mov 240($arg4), %eax # rounds
1547 mov $arg1, $inp # backup arguments
1548 mov $arg2, $out
1549 mov $arg3, $len
1550 mov $arg4, $key
1551 mov $arg5, %rbx
1552 shr \$4, $len # bytes to blocks
1553
1554 mov %eax, %edx # rounds
1555 shl \$7, %rax # 128 bytes per inner round key
1556 sub \$`128-32`, %rax # size of bit-sliced key schedule
1557 sub %rax, %rsp
1558
1559 mov %rsp, %rax # pass key schedule
1560 mov $key, %rcx # pass key
1561 mov %edx, %r10d # pass rounds
1562 call _bsaes_key_convert
1563 pxor (%rsp),%xmm7 # fix up 0 round key
1564 movdqa %xmm6,(%rax) # save last round key
1565 movdqa %xmm7,(%rsp)
1566
1567 movdqu (%rbx), @XMM[15] # load IV
1568 sub \$8,$len
1569.Lcbc_dec_loop:
1570 movdqu 0x00($inp), @XMM[0] # load input
1571 movdqu 0x10($inp), @XMM[1]
1572 movdqu 0x20($inp), @XMM[2]
1573 movdqu 0x30($inp), @XMM[3]
1574 movdqu 0x40($inp), @XMM[4]
1575 movdqu 0x50($inp), @XMM[5]
1576 mov %rsp, %rax # pass key schedule
1577 movdqu 0x60($inp), @XMM[6]
1578 mov %edx,%r10d # pass rounds
1579 movdqu 0x70($inp), @XMM[7]
1580 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1581
1582 call _bsaes_decrypt8
1583
1584 pxor 0x20(%rbp), @XMM[0] # ^= IV
1585 movdqu 0x00($inp), @XMM[8] # re-load input
1586 movdqu 0x10($inp), @XMM[9]
1587 pxor @XMM[8], @XMM[1]
1588 movdqu 0x20($inp), @XMM[10]
1589 pxor @XMM[9], @XMM[6]
1590 movdqu 0x30($inp), @XMM[11]
1591 pxor @XMM[10], @XMM[4]
1592 movdqu 0x40($inp), @XMM[12]
1593 pxor @XMM[11], @XMM[2]
1594 movdqu 0x50($inp), @XMM[13]
1595 pxor @XMM[12], @XMM[7]
1596 movdqu 0x60($inp), @XMM[14]
1597 pxor @XMM[13], @XMM[3]
1598 movdqu 0x70($inp), @XMM[15] # IV
1599 pxor @XMM[14], @XMM[5]
1600 movdqu @XMM[0], 0x00($out) # write output
1601 lea 0x80($inp), $inp
1602 movdqu @XMM[1], 0x10($out)
1603 movdqu @XMM[6], 0x20($out)
1604 movdqu @XMM[4], 0x30($out)
1605 movdqu @XMM[2], 0x40($out)
1606 movdqu @XMM[7], 0x50($out)
1607 movdqu @XMM[3], 0x60($out)
1608 movdqu @XMM[5], 0x70($out)
1609 lea 0x80($out), $out
1610 sub \$8,$len
1611 jnc .Lcbc_dec_loop
1612
1613 add \$8,$len
1614 jz .Lcbc_dec_done
1615
1616 movdqu 0x00($inp), @XMM[0] # load input
1617 mov %rsp, %rax # pass key schedule
1618 mov %edx, %r10d # pass rounds
1619 cmp \$2,$len
1620 jb .Lcbc_dec_one
1621 movdqu 0x10($inp), @XMM[1]
1622 je .Lcbc_dec_two
1623 movdqu 0x20($inp), @XMM[2]
1624 cmp \$4,$len
1625 jb .Lcbc_dec_three
1626 movdqu 0x30($inp), @XMM[3]
1627 je .Lcbc_dec_four
1628 movdqu 0x40($inp), @XMM[4]
1629 cmp \$6,$len
1630 jb .Lcbc_dec_five
1631 movdqu 0x50($inp), @XMM[5]
1632 je .Lcbc_dec_six
1633 movdqu 0x60($inp), @XMM[6]
1634 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1635 call _bsaes_decrypt8
1636 pxor 0x20(%rbp), @XMM[0] # ^= IV
1637 movdqu 0x00($inp), @XMM[8] # re-load input
1638 movdqu 0x10($inp), @XMM[9]
1639 pxor @XMM[8], @XMM[1]
1640 movdqu 0x20($inp), @XMM[10]
1641 pxor @XMM[9], @XMM[6]
1642 movdqu 0x30($inp), @XMM[11]
1643 pxor @XMM[10], @XMM[4]
1644 movdqu 0x40($inp), @XMM[12]
1645 pxor @XMM[11], @XMM[2]
1646 movdqu 0x50($inp), @XMM[13]
1647 pxor @XMM[12], @XMM[7]
1648 movdqu 0x60($inp), @XMM[15] # IV
1649 pxor @XMM[13], @XMM[3]
1650 movdqu @XMM[0], 0x00($out) # write output
1651 movdqu @XMM[1], 0x10($out)
1652 movdqu @XMM[6], 0x20($out)
1653 movdqu @XMM[4], 0x30($out)
1654 movdqu @XMM[2], 0x40($out)
1655 movdqu @XMM[7], 0x50($out)
1656 movdqu @XMM[3], 0x60($out)
1657 jmp .Lcbc_dec_done
1658.align 16
1659.Lcbc_dec_six:
1660 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1661 call _bsaes_decrypt8
1662 pxor 0x20(%rbp), @XMM[0] # ^= IV
1663 movdqu 0x00($inp), @XMM[8] # re-load input
1664 movdqu 0x10($inp), @XMM[9]
1665 pxor @XMM[8], @XMM[1]
1666 movdqu 0x20($inp), @XMM[10]
1667 pxor @XMM[9], @XMM[6]
1668 movdqu 0x30($inp), @XMM[11]
1669 pxor @XMM[10], @XMM[4]
1670 movdqu 0x40($inp), @XMM[12]
1671 pxor @XMM[11], @XMM[2]
1672 movdqu 0x50($inp), @XMM[15] # IV
1673 pxor @XMM[12], @XMM[7]
1674 movdqu @XMM[0], 0x00($out) # write output
1675 movdqu @XMM[1], 0x10($out)
1676 movdqu @XMM[6], 0x20($out)
1677 movdqu @XMM[4], 0x30($out)
1678 movdqu @XMM[2], 0x40($out)
1679 movdqu @XMM[7], 0x50($out)
1680 jmp .Lcbc_dec_done
1681.align 16
1682.Lcbc_dec_five:
1683 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1684 call _bsaes_decrypt8
1685 pxor 0x20(%rbp), @XMM[0] # ^= IV
1686 movdqu 0x00($inp), @XMM[8] # re-load input
1687 movdqu 0x10($inp), @XMM[9]
1688 pxor @XMM[8], @XMM[1]
1689 movdqu 0x20($inp), @XMM[10]
1690 pxor @XMM[9], @XMM[6]
1691 movdqu 0x30($inp), @XMM[11]
1692 pxor @XMM[10], @XMM[4]
1693 movdqu 0x40($inp), @XMM[15] # IV
1694 pxor @XMM[11], @XMM[2]
1695 movdqu @XMM[0], 0x00($out) # write output
1696 movdqu @XMM[1], 0x10($out)
1697 movdqu @XMM[6], 0x20($out)
1698 movdqu @XMM[4], 0x30($out)
1699 movdqu @XMM[2], 0x40($out)
1700 jmp .Lcbc_dec_done
1701.align 16
1702.Lcbc_dec_four:
1703 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1704 call _bsaes_decrypt8
1705 pxor 0x20(%rbp), @XMM[0] # ^= IV
1706 movdqu 0x00($inp), @XMM[8] # re-load input
1707 movdqu 0x10($inp), @XMM[9]
1708 pxor @XMM[8], @XMM[1]
1709 movdqu 0x20($inp), @XMM[10]
1710 pxor @XMM[9], @XMM[6]
1711 movdqu 0x30($inp), @XMM[15] # IV
1712 pxor @XMM[10], @XMM[4]
1713 movdqu @XMM[0], 0x00($out) # write output
1714 movdqu @XMM[1], 0x10($out)
1715 movdqu @XMM[6], 0x20($out)
1716 movdqu @XMM[4], 0x30($out)
1717 jmp .Lcbc_dec_done
1718.align 16
1719.Lcbc_dec_three:
1720 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1721 call _bsaes_decrypt8
1722 pxor 0x20(%rbp), @XMM[0] # ^= IV
1723 movdqu 0x00($inp), @XMM[8] # re-load input
1724 movdqu 0x10($inp), @XMM[9]
1725 pxor @XMM[8], @XMM[1]
1726 movdqu 0x20($inp), @XMM[15] # IV
1727 pxor @XMM[9], @XMM[6]
1728 movdqu @XMM[0], 0x00($out) # write output
1729 movdqu @XMM[1], 0x10($out)
1730 movdqu @XMM[6], 0x20($out)
1731 jmp .Lcbc_dec_done
1732.align 16
1733.Lcbc_dec_two:
1734 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1735 call _bsaes_decrypt8
1736 pxor 0x20(%rbp), @XMM[0] # ^= IV
1737 movdqu 0x00($inp), @XMM[8] # re-load input
1738 movdqu 0x10($inp), @XMM[15] # IV
1739 pxor @XMM[8], @XMM[1]
1740 movdqu @XMM[0], 0x00($out) # write output
1741 movdqu @XMM[1], 0x10($out)
1742 jmp .Lcbc_dec_done
1743.align 16
# Exactly one block left: decrypt it with a single asm_AES_decrypt call
# into the scratch slot at 0x20(%rbp) instead of going through the 8-way
# routine.  On entry XMM[15] holds the current IV and XMM[0] the
# ciphertext block that was loaded before branching here.
# Falls through into .Lcbc_dec_done.
1744.Lcbc_dec_one:
1745 lea ($inp), $arg1 # in: ciphertext block
1746 lea 0x20(%rbp), $arg2 # buffer output
1747 lea ($key), $arg3
1748 call asm_AES_decrypt # doesn't touch %xmm
1749 pxor 0x20(%rbp), @XMM[15] # ^= IV
1750 movdqu @XMM[15], ($out) # write output
1751 movdqa @XMM[0], @XMM[15] # IV
1752
# Common exit: store the chaining value back through the IV pointer kept
# in %rbx, then zero the stack from %rsp up to %rbp (the frame base saved
# after the prologue), 32 bytes per pass.
1753.Lcbc_dec_done:
1754 movdqu @XMM[15], (%rbx) # return IV
1755 lea (%rsp), %rax # wipe cursor starts at stack top
1756 pxor %xmm0, %xmm0
1757.Lcbc_dec_bzero: # wipe key schedule [if any]
1758 movdqa %xmm0, 0x00(%rax)
1759 movdqa %xmm0, 0x10(%rax)
1760 lea 0x20(%rax), %rax
1761 cmp %rax, %rbp # flags from %rbp - %rax
1762 ja .Lcbc_dec_bzero # loop while cursor is below %rbp
1763
1764 lea (%rbp),%rsp # restore %rsp
1765___
1766$code.=<<___ if ($win64);
1767 movaps 0x40(%rbp), %xmm6
1768 movaps 0x50(%rbp), %xmm7
1769 movaps 0x60(%rbp), %xmm8
1770 movaps 0x70(%rbp), %xmm9
1771 movaps 0x80(%rbp), %xmm10
1772 movaps 0x90(%rbp), %xmm11
1773 movaps 0xa0(%rbp), %xmm12
1774 movaps 0xb0(%rbp), %xmm13
1775 movaps 0xc0(%rbp), %xmm14
1776 movaps 0xd0(%rbp), %xmm15
1777 lea 0xa0(%rbp), %rsp
1778___
1779$code.=<<___;
1780 mov 0x48(%rsp), %r15
1781 mov 0x50(%rsp), %r14
1782 mov 0x58(%rsp), %r13
1783 mov 0x60(%rsp), %r12
1784 mov 0x68(%rsp), %rbx
1785 mov 0x70(%rsp), %rax
1786 lea 0x78(%rsp), %rsp
1787 mov %rax, %rbp
1788.Lcbc_dec_epilogue:
1789 ret
1790.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1791
1792.globl bsaes_ctr32_encrypt_blocks
1793.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1794.align 16
1795bsaes_ctr32_encrypt_blocks:
1796 mov %rsp, %rax
1797.Lctr_enc_prologue:
1798 push %rbp
1799 push %rbx
1800 push %r12
1801 push %r13
1802 push %r14
1803 push %r15
1804 lea -0x48(%rsp), %rsp
1805___
1806$code.=<<___ if ($win64);
1807 mov 0xa0(%rsp),$arg5 # pull ivp
1808 lea -0xa0(%rsp), %rsp
1809 movaps %xmm6, 0x40(%rsp)
1810 movaps %xmm7, 0x50(%rsp)
1811 movaps %xmm8, 0x60(%rsp)
1812 movaps %xmm9, 0x70(%rsp)
1813 movaps %xmm10, 0x80(%rsp)
1814 movaps %xmm11, 0x90(%rsp)
1815 movaps %xmm12, 0xa0(%rsp)
1816 movaps %xmm13, 0xb0(%rsp)
1817 movaps %xmm14, 0xc0(%rsp)
1818 movaps %xmm15, 0xd0(%rsp)
1819.Lctr_enc_body:
1820___
1821$code.=<<___;
1822 mov %rsp, %rbp # backup %rsp
1823 movdqu ($arg5), %xmm0 # load counter
1824 mov 240($arg4), %eax # rounds
1825 mov $arg1, $inp # backup arguments
1826 mov $arg2, $out
1827 mov $arg3, $len
1828 mov $arg4, $key
1829 movdqa %xmm0, 0x20(%rbp) # copy counter
1830 cmp \$8, $arg3
1831 jb .Lctr_enc_short
1832
1833 mov %eax, %ebx # rounds
1834 shl \$7, %rax # 128 bytes per inner round key
1835 sub \$`128-32`, %rax # size of bit-sliced key schedule
1836 sub %rax, %rsp
1837
1838 mov %rsp, %rax # pass key schedule
1839 mov $key, %rcx # pass key
1840 mov %ebx, %r10d # pass rounds
1841 call _bsaes_key_convert
1842 pxor %xmm6,%xmm7 # fix up last round key
1843 movdqa %xmm7,(%rax) # save last round key
1844
1845 movdqa (%rsp), @XMM[9] # load round0 key
1846 lea .LADD1(%rip), %r11
1847 movdqa 0x20(%rbp), @XMM[0] # counter copy
1848 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1849 pshufb @XMM[8], @XMM[9] # byte swap upper part
1850 pshufb @XMM[8], @XMM[0]
1851 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1852 jmp .Lctr_enc_loop
1853.align 16
1854.Lctr_enc_loop:
1855 movdqa @XMM[0], 0x20(%rbp) # save counter
1856 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1857 movdqa @XMM[0], @XMM[2]
1858 paddd 0x00(%r11), @XMM[1] # .LADD1
1859 movdqa @XMM[0], @XMM[3]
1860 paddd 0x10(%r11), @XMM[2] # .LADD2
1861 movdqa @XMM[0], @XMM[4]
1862 paddd 0x20(%r11), @XMM[3] # .LADD3
1863 movdqa @XMM[0], @XMM[5]
1864 paddd 0x30(%r11), @XMM[4] # .LADD4
1865 movdqa @XMM[0], @XMM[6]
1866 paddd 0x40(%r11), @XMM[5] # .LADD5
1867 movdqa @XMM[0], @XMM[7]
1868 paddd 0x50(%r11), @XMM[6] # .LADD6
1869 paddd 0x60(%r11), @XMM[7] # .LADD7
1870
1871 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1872 # to flip byte order in 32-bit counter
1873 movdqa (%rsp), @XMM[9] # round 0 key
1874 lea 0x10(%rsp), %rax # pass key schedule
1875 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1876 pxor @XMM[9], @XMM[0] # xor with round0 key
1877 pxor @XMM[9], @XMM[1]
1878 pshufb @XMM[8], @XMM[0]
1879 pxor @XMM[9], @XMM[2]
1880 pshufb @XMM[8], @XMM[1]
1881 pxor @XMM[9], @XMM[3]
1882 pshufb @XMM[8], @XMM[2]
1883 pxor @XMM[9], @XMM[4]
1884 pshufb @XMM[8], @XMM[3]
1885 pxor @XMM[9], @XMM[5]
1886 pshufb @XMM[8], @XMM[4]
1887 pxor @XMM[9], @XMM[6]
1888 pshufb @XMM[8], @XMM[5]
1889 pxor @XMM[9], @XMM[7]
1890 pshufb @XMM[8], @XMM[6]
1891 lea .LBS0(%rip), %r11 # constants table
1892 pshufb @XMM[8], @XMM[7]
1893 mov %ebx,%r10d # pass rounds
1894
1895 call _bsaes_encrypt8_bitslice
1896
1897 sub \$8,$len
1898 jc .Lctr_enc_loop_done
1899
1900 movdqu 0x00($inp), @XMM[8] # load input
1901 movdqu 0x10($inp), @XMM[9]
1902 movdqu 0x20($inp), @XMM[10]
1903 movdqu 0x30($inp), @XMM[11]
1904 movdqu 0x40($inp), @XMM[12]
1905 movdqu 0x50($inp), @XMM[13]
1906 movdqu 0x60($inp), @XMM[14]
1907 movdqu 0x70($inp), @XMM[15]
1908 lea 0x80($inp),$inp
1909 pxor @XMM[0], @XMM[8]
1910 movdqa 0x20(%rbp), @XMM[0] # load counter
1911 pxor @XMM[9], @XMM[1]
1912 movdqu @XMM[8], 0x00($out) # write output
1913 pxor @XMM[10], @XMM[4]
1914 movdqu @XMM[1], 0x10($out)
1915 pxor @XMM[11], @XMM[6]
1916 movdqu @XMM[4], 0x20($out)
1917 pxor @XMM[12], @XMM[3]
1918 movdqu @XMM[6], 0x30($out)
1919 pxor @XMM[13], @XMM[7]
1920 movdqu @XMM[3], 0x40($out)
1921 pxor @XMM[14], @XMM[2]
1922 movdqu @XMM[7], 0x50($out)
1923 pxor @XMM[15], @XMM[5]
1924 movdqu @XMM[2], 0x60($out)
1925 lea .LADD1(%rip), %r11
1926 movdqu @XMM[5], 0x70($out)
1927 lea 0x80($out), $out
1928 paddd 0x70(%r11), @XMM[0] # .LADD8
1929 jnz .Lctr_enc_loop
1930
1931 jmp .Lctr_enc_done
1932.align 16
# Tail of the CTR loop: eight keystream blocks have been produced but
# fewer than eight input blocks remain.  The block count is restored and
# 1..7 blocks are xor-ed and written one at a time, leaving as soon as
# the count is exhausted.  Keystream registers are consumed in the order
# 0,1,4,6,3,7,2, the same order the full 8-block path above uses.
# Each cmp is tested twice (jb, then je after the next block); the SSE
# loads, xors and stores in between do not modify the flags.
1933.Lctr_enc_loop_done:
1934 add \$8, $len # undo the sub that borrowed
1935 movdqu 0x00($inp), @XMM[8] # load input
1936 pxor @XMM[8], @XMM[0]
1937 movdqu @XMM[0], 0x00($out) # write output
1938 cmp \$2,$len
1939 jb .Lctr_enc_done # one block
1940 movdqu 0x10($inp), @XMM[9]
1941 pxor @XMM[9], @XMM[1]
1942 movdqu @XMM[1], 0x10($out)
1943 je .Lctr_enc_done # two blocks
1944 movdqu 0x20($inp), @XMM[10]
1945 pxor @XMM[10], @XMM[4]
1946 movdqu @XMM[4], 0x20($out)
1947 cmp \$4,$len
1948 jb .Lctr_enc_done # three blocks
1949 movdqu 0x30($inp), @XMM[11]
1950 pxor @XMM[11], @XMM[6]
1951 movdqu @XMM[6], 0x30($out)
1952 je .Lctr_enc_done # four blocks
1953 movdqu 0x40($inp), @XMM[12]
1954 pxor @XMM[12], @XMM[3]
1955 movdqu @XMM[3], 0x40($out)
1956 cmp \$6,$len
1957 jb .Lctr_enc_done # five blocks
1958 movdqu 0x50($inp), @XMM[13]
1959 pxor @XMM[13], @XMM[7]
1960 movdqu @XMM[7], 0x50($out)
1961 je .Lctr_enc_done # six blocks
1962 movdqu 0x60($inp), @XMM[14]
1963 pxor @XMM[14], @XMM[2]
1964 movdqu @XMM[2], 0x60($out)
1965 jmp .Lctr_enc_done # seven blocks
1966
1967.align 16
# Short-input path (fewer than 8 blocks): one asm_AES_encrypt call per
# block.  It is entered before any key schedule is carved off the stack,
# so %rsp and %rbp still hold the same value here.
# 0x20(%rbp) is the counter block copied on entry, 0x30(%rbp) receives
# its encryption (the keystream block).  The low 32-bit counter word
# lives at 0x2c(%rbp); it is byte-swapped, incremented and swapped back.
# The counter is now written back through %rbp, the same base register
# every other access to this scratch area uses, instead of relying on
# %rsp aliasing it.
1968.Lctr_enc_short:
1969 lea 0x20(%rbp), $arg1 # in: counter block
1970 lea 0x30(%rbp), $arg2 # out: keystream block
1971 lea ($key), $arg3
1972 call asm_AES_encrypt
1973 movdqu ($inp), @XMM[1]
1974 lea 16($inp), $inp
1975 mov 0x2c(%rbp), %eax # load 32-bit counter
1976 bswap %eax
1977 pxor 0x30(%rbp), @XMM[1]
1978 inc %eax # increment
1979 movdqu @XMM[1], ($out)
1980 bswap %eax
1981 lea 16($out), $out
1982 mov %eax, 0x2c(%rbp) # save 32-bit counter
1983 dec $len
1984 jnz .Lctr_enc_short
1985
1986.Lctr_enc_done:
1987 lea (%rsp), %rax
1988 pxor %xmm0, %xmm0
1989.Lctr_enc_bzero: # wipe key schedule [if any]
1990 movdqa %xmm0, 0x00(%rax)
1991 movdqa %xmm0, 0x10(%rax)
1992 lea 0x20(%rax), %rax
1993 cmp %rax, %rbp
1994 ja .Lctr_enc_bzero
1995
1996 lea (%rbp),%rsp # restore %rsp
1997___
1998$code.=<<___ if ($win64);
1999 movaps 0x40(%rbp), %xmm6
2000 movaps 0x50(%rbp), %xmm7
2001 movaps 0x60(%rbp), %xmm8
2002 movaps 0x70(%rbp), %xmm9
2003 movaps 0x80(%rbp), %xmm10
2004 movaps 0x90(%rbp), %xmm11
2005 movaps 0xa0(%rbp), %xmm12
2006 movaps 0xb0(%rbp), %xmm13
2007 movaps 0xc0(%rbp), %xmm14
2008 movaps 0xd0(%rbp), %xmm15
2009 lea 0xa0(%rbp), %rsp
2010___
2011$code.=<<___;
2012 mov 0x48(%rsp), %r15
2013 mov 0x50(%rsp), %r14
2014 mov 0x58(%rsp), %r13
2015 mov 0x60(%rsp), %r12
2016 mov 0x68(%rsp), %rbx
2017 mov 0x70(%rsp), %rax
2018 lea 0x78(%rsp), %rsp
2019 mov %rax, %rbp
2020.Lctr_enc_epilogue:
2021 ret
2022.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2023___
2024######################################################################
2025# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2026# const AES_KEY *key1, const AES_KEY *key2,
2027# const unsigned char iv[16]);
2028#
2029my ($twmask,$twres,$twtmp)=@XMM[13..15];
2030$code.=<<___;
2031.globl bsaes_xts_encrypt
2032.type bsaes_xts_encrypt,\@abi-omnipotent
2033.align 16
2034bsaes_xts_encrypt:
2035 mov %rsp, %rax
2036.Lxts_enc_prologue:
2037 push %rbp
2038 push %rbx
2039 push %r12
2040 push %r13
2041 push %r14
2042 push %r15
2043 lea -0x48(%rsp), %rsp
2044___
2045$code.=<<___ if ($win64);
2046 mov 0xa0(%rsp),$arg5 # pull key2
2047 mov 0xa8(%rsp),$arg6 # pull ivp
2048 lea -0xa0(%rsp), %rsp
2049 movaps %xmm6, 0x40(%rsp)
2050 movaps %xmm7, 0x50(%rsp)
2051 movaps %xmm8, 0x60(%rsp)
2052 movaps %xmm9, 0x70(%rsp)
2053 movaps %xmm10, 0x80(%rsp)
2054 movaps %xmm11, 0x90(%rsp)
2055 movaps %xmm12, 0xa0(%rsp)
2056 movaps %xmm13, 0xb0(%rsp)
2057 movaps %xmm14, 0xc0(%rsp)
2058 movaps %xmm15, 0xd0(%rsp)
2059.Lxts_enc_body:
2060___
2061$code.=<<___;
2062 mov %rsp, %rbp # backup %rsp
2063 mov $arg1, $inp # backup arguments
2064 mov $arg2, $out
2065 mov $arg3, $len
2066 mov $arg4, $key
2067
2068 lea ($arg6), $arg1
2069 lea 0x20(%rbp), $arg2
2070 lea ($arg5), $arg3
2071 call asm_AES_encrypt # generate initial tweak
2072
2073 mov 240($key), %eax # rounds
2074 mov $len, %rbx # backup $len
2075
2076 mov %eax, %edx # rounds
2077 shl \$7, %rax # 128 bytes per inner round key
2078 sub \$`128-32`, %rax # size of bit-sliced key schedule
2079 sub %rax, %rsp
2080
2081 mov %rsp, %rax # pass key schedule
2082 mov $key, %rcx # pass key
2083 mov %edx, %r10d # pass rounds
2084 call _bsaes_key_convert
2085 pxor %xmm6, %xmm7 # fix up last round key
2086 movdqa %xmm7, (%rax) # save last round key
2087
2088 and \$-16, $len
2089 sub \$0x80, %rsp # place for tweak[8]
2090 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2091
2092 pxor $twtmp, $twtmp
2093 movdqa .Lxts_magic(%rip), $twmask
2094 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2095
2096 sub \$0x80, $len
2097 jc .Lxts_enc_short
2098 jmp .Lxts_enc_loop
2099
2100.align 16
2101.Lxts_enc_loop:
2102___
# Emit seven unrolled copies of the tweak step for the 8-block main loop.
# Copy $i does the following:
#   - copies the running tweak (@XMM[7]) into @XMM[$i] and saves it in
#     stack slot 0x10*$i as tweak[$i];
#   - advances the running tweak: doubles each 64-bit half with paddq,
#     then xors in $twres, i.e. the sign mask left in $twtmp by the
#     previous pcmpgtd, shuffled by pshufd and masked with $twmask
#     (loaded from .Lxts_magic, which is defined outside this section --
#     presumably the GF(2^128) reduction constant; confirm there);
#   - from $i>=1, loads input block $i-1 into @XMM[8+$i-1];
#   - from $i>=2, xors input block $i-2 into its tweak in @XMM[$i-2].
# The loads and xors lag behind on purpose so they interleave with the
# tweak arithmetic; the code after the loop finishes blocks 5..7.
2103 for ($i=0;$i<7;$i++) {
2104 $code.=<<___;
2105 pshufd \$0x13, $twtmp, $twres
2106 pxor $twtmp, $twtmp
2107 movdqa @XMM[7], @XMM[$i]
2108 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2109 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2110 pand $twmask, $twres # isolate carry and residue
2111 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2112 pxor $twres, @XMM[7]
2113___
2114 $code.=<<___ if ($i>=1);
2115 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2116___
2117 $code.=<<___ if ($i>=2);
2118 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2119___
2120 }
2121$code.=<<___;
2122 movdqu 0x60($inp), @XMM[8+6]
2123 pxor @XMM[8+5], @XMM[5]
2124 movdqu 0x70($inp), @XMM[8+7]
2125 lea 0x80($inp), $inp
2126 movdqa @XMM[7], 0x70(%rsp)
2127 pxor @XMM[8+6], @XMM[6]
2128 lea 0x80(%rsp), %rax # pass key schedule
2129 pxor @XMM[8+7], @XMM[7]
2130 mov %edx, %r10d # pass rounds
2131
2132 call _bsaes_encrypt8
2133
2134 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2135 pxor 0x10(%rsp), @XMM[1]
2136 movdqu @XMM[0], 0x00($out) # write output
2137 pxor 0x20(%rsp), @XMM[4]
2138 movdqu @XMM[1], 0x10($out)
2139 pxor 0x30(%rsp), @XMM[6]
2140 movdqu @XMM[4], 0x20($out)
2141 pxor 0x40(%rsp), @XMM[3]
2142 movdqu @XMM[6], 0x30($out)
2143 pxor 0x50(%rsp), @XMM[7]
2144 movdqu @XMM[3], 0x40($out)
2145 pxor 0x60(%rsp), @XMM[2]
2146 movdqu @XMM[7], 0x50($out)
2147 pxor 0x70(%rsp), @XMM[5]
2148 movdqu @XMM[2], 0x60($out)
2149 movdqu @XMM[5], 0x70($out)
2150 lea 0x80($out), $out
2151
2152 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2153 pxor $twtmp, $twtmp
2154 movdqa .Lxts_magic(%rip), $twmask
2155 pcmpgtd @XMM[7], $twtmp
2156 pshufd \$0x13, $twtmp, $twres
2157 pxor $twtmp, $twtmp
2158 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2159 pand $twmask, $twres # isolate carry and residue
2160 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2161 pxor $twres, @XMM[7]
2162
2163 sub \$0x80,$len
2164 jnc .Lxts_enc_loop
2165
2166.Lxts_enc_short:
2167 add \$0x80, $len
2168 jz .Lxts_enc_done
2169___
2170 for ($i=0;$i<7;$i++) {
2171 $code.=<<___;
2172 pshufd \$0x13, $twtmp, $twres
2173 pxor $twtmp, $twtmp
2174 movdqa @XMM[7], @XMM[$i]
2175 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2176 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2177 pand $twmask, $twres # isolate carry and residue
2178 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2179 pxor $twres, @XMM[7]
2180___
2181 $code.=<<___ if ($i>=1);
2182 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2183 cmp \$`0x10*$i`,$len
2184 je .Lxts_enc_$i
2185___
2186 $code.=<<___ if ($i>=2);
2187 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2188___
2189 }
2190$code.=<<___;
2191 movdqu 0x60($inp), @XMM[8+6]
2192 pxor @XMM[8+5], @XMM[5]
2193 movdqa @XMM[7], 0x70(%rsp)
2194 lea 0x70($inp), $inp
2195 pxor @XMM[8+6], @XMM[6]
2196 lea 0x80(%rsp), %rax # pass key schedule
2197 mov %edx, %r10d # pass rounds
2198
2199 call _bsaes_encrypt8
2200
2201 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2202 pxor 0x10(%rsp), @XMM[1]
2203 movdqu @XMM[0], 0x00($out) # write output
2204 pxor 0x20(%rsp), @XMM[4]
2205 movdqu @XMM[1], 0x10($out)
2206 pxor 0x30(%rsp), @XMM[6]
2207 movdqu @XMM[4], 0x20($out)
2208 pxor 0x40(%rsp), @XMM[3]
2209 movdqu @XMM[6], 0x30($out)
2210 pxor 0x50(%rsp), @XMM[7]
2211 movdqu @XMM[3], 0x40($out)
2212 pxor 0x60(%rsp), @XMM[2]
2213 movdqu @XMM[7], 0x50($out)
2214 movdqu @XMM[2], 0x60($out)
2215 lea 0x70($out), $out
2216
2217 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2218 jmp .Lxts_enc_done
2219.align 16
2220.Lxts_enc_6:
2221 pxor @XMM[8+4], @XMM[4]
2222 lea 0x60($inp), $inp
2223 pxor @XMM[8+5], @XMM[5]
2224 lea 0x80(%rsp), %rax # pass key schedule
2225 mov %edx, %r10d # pass rounds
2226
2227 call _bsaes_encrypt8
2228
2229 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2230 pxor 0x10(%rsp), @XMM[1]
2231 movdqu @XMM[0], 0x00($out) # write output
2232 pxor 0x20(%rsp), @XMM[4]
2233 movdqu @XMM[1], 0x10($out)
2234 pxor 0x30(%rsp), @XMM[6]
2235 movdqu @XMM[4], 0x20($out)
2236 pxor 0x40(%rsp), @XMM[3]
2237 movdqu @XMM[6], 0x30($out)
2238 pxor 0x50(%rsp), @XMM[7]
2239 movdqu @XMM[3], 0x40($out)
2240 movdqu @XMM[7], 0x50($out)
2241 lea 0x60($out), $out
2242
2243 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2244 jmp .Lxts_enc_done
2245.align 16
2246.Lxts_enc_5:
2247 pxor @XMM[8+3], @XMM[3]
2248 lea 0x50($inp), $inp
2249 pxor @XMM[8+4], @XMM[4]
2250 lea 0x80(%rsp), %rax # pass key schedule
2251 mov %edx, %r10d # pass rounds
2252
2253 call _bsaes_encrypt8
2254
2255 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2256 pxor 0x10(%rsp), @XMM[1]
2257 movdqu @XMM[0], 0x00($out) # write output
2258 pxor 0x20(%rsp), @XMM[4]
2259 movdqu @XMM[1], 0x10($out)
2260 pxor 0x30(%rsp), @XMM[6]
2261 movdqu @XMM[4], 0x20($out)
2262 pxor 0x40(%rsp), @XMM[3]
2263 movdqu @XMM[6], 0x30($out)
2264 movdqu @XMM[3], 0x40($out)
2265 lea 0x50($out), $out
2266
2267 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2268 jmp .Lxts_enc_done
2269.align 16
2270.Lxts_enc_4:
2271 pxor @XMM[8+2], @XMM[2]
2272 lea 0x40($inp), $inp
2273 pxor @XMM[8+3], @XMM[3]
2274 lea 0x80(%rsp), %rax # pass key schedule
2275 mov %edx, %r10d # pass rounds
2276
2277 call _bsaes_encrypt8
2278
2279 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2280 pxor 0x10(%rsp), @XMM[1]
2281 movdqu @XMM[0], 0x00($out) # write output
2282 pxor 0x20(%rsp), @XMM[4]
2283 movdqu @XMM[1], 0x10($out)
2284 pxor 0x30(%rsp), @XMM[6]
2285 movdqu @XMM[4], 0x20($out)
2286 movdqu @XMM[6], 0x30($out)
2287 lea 0x40($out), $out
2288
2289 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2290 jmp .Lxts_enc_done
2291.align 16
2292.Lxts_enc_3:
2293 pxor @XMM[8+1], @XMM[1]
2294 lea 0x30($inp), $inp
2295 pxor @XMM[8+2], @XMM[2]
2296 lea 0x80(%rsp), %rax # pass key schedule
2297 mov %edx, %r10d # pass rounds
2298
2299 call _bsaes_encrypt8
2300
2301 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2302 pxor 0x10(%rsp), @XMM[1]
2303 movdqu @XMM[0], 0x00($out) # write output
2304 pxor 0x20(%rsp), @XMM[4]
2305 movdqu @XMM[1], 0x10($out)
2306 movdqu @XMM[4], 0x20($out)
2307 lea 0x30($out), $out
2308
2309 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2310 jmp .Lxts_enc_done
2311.align 16
2312.Lxts_enc_2:
2313 pxor @XMM[8+0], @XMM[0]
2314 lea 0x20($inp), $inp
2315 pxor @XMM[8+1], @XMM[1]
2316 lea 0x80(%rsp), %rax # pass key schedule
2317 mov %edx, %r10d # pass rounds
2318
2319 call _bsaes_encrypt8
2320
2321 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2322 pxor 0x10(%rsp), @XMM[1]
2323 movdqu @XMM[0], 0x00($out) # write output
2324 movdqu @XMM[1], 0x10($out)
2325 lea 0x20($out), $out
2326
2327 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2328 jmp .Lxts_enc_done
2329.align 16
# Exactly one whole block left.  Instead of the 8-way routine (kept below
# as commented-out code) the block is encrypted with one asm_AES_encrypt
# call, in place in the scratch slot at 0x20(%rbp).
# On entry XMM[0] holds tweak[0] and XMM[8] the input block.
2330.Lxts_enc_1:
2331 pxor @XMM[0], @XMM[8] # input ^ tweak
2332 lea 0x10($inp), $inp
2333 movdqa @XMM[8], 0x20(%rbp) # stage block in scratch slot
2334 lea 0x20(%rbp), $arg1 # in
2335 lea 0x20(%rbp), $arg2 # out (same buffer)
2336 lea ($key), $arg3
2337 call asm_AES_encrypt # doesn't touch %xmm
2338 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2339 #pxor @XMM[8], @XMM[0]
2340 #lea 0x80(%rsp), %rax # pass key schedule
2341 #mov %edx, %r10d # pass rounds
2342 #call _bsaes_encrypt8
2343 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2344 movdqu @XMM[0], 0x00($out) # write output
2345 lea 0x10($out), $out
2346
2347 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2348
# All whole blocks are written.  %rbx still holds the original byte
# length; if it is not a multiple of 16 the trailing partial block is
# handled by ciphertext stealing, otherwise jump straight to the exit.
# XMM[7] holds the tweak for the next block at this point.
2349.Lxts_enc_done:
2350 and \$15, %ebx # number of trailing bytes
2351 jz .Lxts_enc_ret
2352 mov $out, %rdx # write cursor just past last full block
2353
# Per trailing byte: the input byte replaces a byte of the last full
# output block (at -16 from the cursor), and the byte it displaces
# becomes the corresponding byte of the short final output.
2354.Lxts_enc_steal:
2355 movzb ($inp), %eax # next trailing input byte
2356 movzb -16(%rdx), %ecx # byte of the last full output block
2357 lea 1($inp), $inp
2358 mov %al, -16(%rdx)
2359 mov %cl, 0(%rdx)
2360 lea 1(%rdx), %rdx
2361 sub \$1,%ebx
2362 jnz .Lxts_enc_steal
2363
# Encrypt the patched block once more, in the scratch slot at 0x20(%rbp),
# with the tweak in XMM[7] applied before and after, and write it back
# over the last full output block.
2364 movdqu -16($out), @XMM[0]
2365 lea 0x20(%rbp), $arg1
2366 pxor @XMM[7], @XMM[0] # ^= tweak
2367 lea 0x20(%rbp), $arg2
2368 movdqa @XMM[0], 0x20(%rbp)
2369 lea ($key), $arg3
2370 call asm_AES_encrypt # doesn't touch %xmm
2371 pxor 0x20(%rbp), @XMM[7] # tweak ^ encrypted block
2372 movdqu @XMM[7], -16($out)
2373
2374.Lxts_enc_ret:
2375 lea (%rsp), %rax
2376 pxor %xmm0, %xmm0
2377.Lxts_enc_bzero: # wipe key schedule [if any]
2378 movdqa %xmm0, 0x00(%rax)
2379 movdqa %xmm0, 0x10(%rax)
2380 lea 0x20(%rax), %rax
2381 cmp %rax, %rbp
2382 ja .Lxts_enc_bzero
2383
2384 lea (%rbp),%rsp # restore %rsp
2385___
2386$code.=<<___ if ($win64);
2387 movaps 0x40(%rbp), %xmm6
2388 movaps 0x50(%rbp), %xmm7
2389 movaps 0x60(%rbp), %xmm8
2390 movaps 0x70(%rbp), %xmm9
2391 movaps 0x80(%rbp), %xmm10
2392 movaps 0x90(%rbp), %xmm11
2393 movaps 0xa0(%rbp), %xmm12
2394 movaps 0xb0(%rbp), %xmm13
2395 movaps 0xc0(%rbp), %xmm14
2396 movaps 0xd0(%rbp), %xmm15
2397 lea 0xa0(%rbp), %rsp
2398___
2399$code.=<<___;
2400 mov 0x48(%rsp), %r15
2401 mov 0x50(%rsp), %r14
2402 mov 0x58(%rsp), %r13
2403 mov 0x60(%rsp), %r12
2404 mov 0x68(%rsp), %rbx
2405 mov 0x70(%rsp), %rax
2406 lea 0x78(%rsp), %rsp
2407 mov %rax, %rbp
2408.Lxts_enc_epilogue:
2409 ret
2410.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2411
2412.globl bsaes_xts_decrypt
2413.type bsaes_xts_decrypt,\@abi-omnipotent
2414.align 16
2415bsaes_xts_decrypt:
2416 mov %rsp, %rax
2417.Lxts_dec_prologue:
2418 push %rbp
2419 push %rbx
2420 push %r12
2421 push %r13
2422 push %r14
2423 push %r15
2424 lea -0x48(%rsp), %rsp
2425___
# bsaes_xts_decrypt: XTS-mode decryption built on _bsaes_decrypt8.
# The routine takes six arguments ($arg1..$arg6).  The body copies the
# first four into $inp, $out, $len and $key; $arg5 is the second key
# ("key2") and $arg6 the IV pointer ("ivp") used to derive the initial
# tweak.  The prologue above pushes %rbp, %rbx and %r12-%r15 and opens
# a 0x48-byte frame.
#
# Win64 only: key2 and ivp are pulled from the caller's stack, then
# %xmm6-%xmm15 are saved in a further 0xa0 bytes of frame.
2426$code.=<<___ if ($win64);
2427 mov 0xa0(%rsp),$arg5 # pull key2
2428 mov 0xa8(%rsp),$arg6 # pull ivp
2429 lea -0xa0(%rsp), %rsp
2430 movaps %xmm6, 0x40(%rsp)
2431 movaps %xmm7, 0x50(%rsp)
2432 movaps %xmm8, 0x60(%rsp)
2433 movaps %xmm9, 0x70(%rsp)
2434 movaps %xmm10, 0x80(%rsp)
2435 movaps %xmm11, 0x90(%rsp)
2436 movaps %xmm12, 0xa0(%rsp)
2437 movaps %xmm13, 0xb0(%rsp)
2438 movaps %xmm14, 0xc0(%rsp)
2439 movaps %xmm15, 0xd0(%rsp)
2440.Lxts_dec_body:
2441___
# Common body.  %rbp keeps the frame base for the rest of the routine.
# asm_AES_encrypt is called with ivp, 0x20(%rbp) and key2 to produce the
# initial tweak at 0x20(%rbp).  The key schedule at $key is converted to
# bit-sliced form on the stack by _bsaes_key_convert (its size is
# computed from the round count read at 240($key)), and another 0x80
# bytes are reserved below it for tweak[8].  When the length is not a
# multiple of 16, $len is reduced by one extra block so that block is
# left for the stealing code at .Lxts_dec_done; the original length is
# kept in %rbx.
2442$code.=<<___;
2443 mov %rsp, %rbp # backup %rsp
2444 mov $arg1, $inp # backup arguments
2445 mov $arg2, $out
2446 mov $arg3, $len
2447 mov $arg4, $key
2448
2449 lea ($arg6), $arg1
2450 lea 0x20(%rbp), $arg2
2451 lea ($arg5), $arg3
2452 call asm_AES_encrypt # generate initial tweak
2453
2454 mov 240($key), %eax # rounds
2455 mov $len, %rbx # backup $len
2456
2457 mov %eax, %edx # rounds
2458 shl \$7, %rax # 128 bytes per inner round key
2459 sub \$`128-32`, %rax # size of bit-sliced key schedule
2460 sub %rax, %rsp
2461
2462 mov %rsp, %rax # pass key schedule
2463 mov $key, %rcx # pass key
2464 mov %edx, %r10d # pass rounds
2465 call _bsaes_key_convert
2466 pxor (%rsp), %xmm7 # fix up round 0 key
2467 movdqa %xmm6, (%rax) # save last round key
2468 movdqa %xmm7, (%rsp)
2469
2470 xor %eax, %eax # if ($len%16) len-=16;
2471 and \$-16, $len
2472 test \$15, %ebx
2473 setnz %al
2474 shl \$4, %rax
2475 sub %rax, $len
2476
2477 sub \$0x80, %rsp # place for tweak[8]
2478 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2479
2480 pxor $twtmp, $twtmp
2481 movdqa .Lxts_magic(%rip), $twmask
2482 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2483
2484 sub \$0x80, $len
2485 jc .Lxts_dec_short
2486 jmp .Lxts_dec_loop
2487
2488.align 16
2489.Lxts_dec_loop:
2490___
# Main loop, unrolled by Perl.  For $i = 0..6 the running tweak in
# @XMM[7] is copied to @XMM[$i] and saved as tweak[$i] at 0x10*$i(%rsp);
# @XMM[7] is then advanced to the next tweak (paddq doubles each half,
# $twres feeds the .Lxts_magic-masked carry bits back in).  Input loads
# and the input ^ tweak xors are interleaved one and two iterations
# behind the tweak computation.
2491 for ($i=0;$i<7;$i++) {
2492 $code.=<<___;
2493 pshufd \$0x13, $twtmp, $twres
2494 pxor $twtmp, $twtmp
2495 movdqa @XMM[7], @XMM[$i]
2496 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2497 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2498 pand $twmask, $twres # isolate carry and residue
2499 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2500 pxor $twres, @XMM[7]
2501___
# From the second iteration on: load input block $i-1.
2502 $code.=<<___ if ($i>=1);
2503 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2504___
# From the third iteration on: xor input block $i-2 with its tweak.
2505 $code.=<<___ if ($i>=2);
2506 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2507___
2508 }
# Load the last two blocks, decrypt all eight with _bsaes_decrypt8, xor
# each result with its saved tweak and store 0x80 bytes of output.  The
# results are read from @XMM[0,1,6,4,2,7,3,5], in that order.  The tweak
# for the next iteration is then derived from tweak[7], and the loop
# repeats while at least 0x80 bytes remain.  .Lxts_dec_short handles
# whatever is left (nothing at all jumps straight to .Lxts_dec_done).
2509$code.=<<___;
2510 movdqu 0x60($inp), @XMM[8+6]
2511 pxor @XMM[8+5], @XMM[5]
2512 movdqu 0x70($inp), @XMM[8+7]
2513 lea 0x80($inp), $inp
2514 movdqa @XMM[7], 0x70(%rsp)
2515 pxor @XMM[8+6], @XMM[6]
2516 lea 0x80(%rsp), %rax # pass key schedule
2517 pxor @XMM[8+7], @XMM[7]
2518 mov %edx, %r10d # pass rounds
2519
2520 call _bsaes_decrypt8
2521
2522 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2523 pxor 0x10(%rsp), @XMM[1]
2524 movdqu @XMM[0], 0x00($out) # write output
2525 pxor 0x20(%rsp), @XMM[6]
2526 movdqu @XMM[1], 0x10($out)
2527 pxor 0x30(%rsp), @XMM[4]
2528 movdqu @XMM[6], 0x20($out)
2529 pxor 0x40(%rsp), @XMM[2]
2530 movdqu @XMM[4], 0x30($out)
2531 pxor 0x50(%rsp), @XMM[7]
2532 movdqu @XMM[2], 0x40($out)
2533 pxor 0x60(%rsp), @XMM[3]
2534 movdqu @XMM[7], 0x50($out)
2535 pxor 0x70(%rsp), @XMM[5]
2536 movdqu @XMM[3], 0x60($out)
2537 movdqu @XMM[5], 0x70($out)
2538 lea 0x80($out), $out
2539
2540 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2541 pxor $twtmp, $twtmp
2542 movdqa .Lxts_magic(%rip), $twmask
2543 pcmpgtd @XMM[7], $twtmp
2544 pshufd \$0x13, $twtmp, $twres
2545 pxor $twtmp, $twtmp
2546 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2547 pand $twmask, $twres # isolate carry and residue
2548 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2549 pxor $twres, @XMM[7]
2550
2551 sub \$0x80,$len
2552 jnc .Lxts_dec_loop
2553
2554.Lxts_dec_short:
2555 add \$0x80, $len
2556 jz .Lxts_dec_done
2557___
# Short path (fewer than eight whole blocks left).  The tweak and load
# schedule is the same as in the main loop, but after loading block
# $i-1 the code branches to .Lxts_dec_$i when exactly 0x10*$i bytes
# remain.
2558 for ($i=0;$i<7;$i++) {
2559 $code.=<<___;
2560 pshufd \$0x13, $twtmp, $twres
2561 pxor $twtmp, $twtmp
2562 movdqa @XMM[7], @XMM[$i]
2563 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2564 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2565 pand $twmask, $twres # isolate carry and residue
2566 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2567 pxor $twres, @XMM[7]
2568___
2569 $code.=<<___ if ($i>=1);
2570 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2571 cmp \$`0x10*$i`,$len
2572 je .Lxts_dec_$i
2573___
2574 $code.=<<___ if ($i>=2);
2575 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2576___
2577 }
# The fall-through handles seven blocks.  .Lxts_dec_6 .. .Lxts_dec_2
# handle the shorter counts with _bsaes_decrypt8, and .Lxts_dec_1
# decrypts a single block with asm_AES_decrypt.  Every path leaves the
# next tweak in @XMM[7] and reaches .Lxts_dec_done.
#
# At .Lxts_dec_done a non-zero (length & 15), taken from %ebx, starts
# the ciphertext-stealing sequence: the tweak in @XMM[7] is copied to
# @XMM[6] and advanced once more, the next full input block is decrypted
# with the advanced tweak and stored at ($out), .Lxts_dec_steal swaps
# bytes between that block and the trailing partial block, and the
# block at ($out) is decrypted again using the tweak kept in @XMM[6].
#
# .Lxts_dec_ret zeroes the stack from %rsp up to %rbp (this covers the
# tweak area and the bit-sliced key schedule) before %rsp is restored.
2578$code.=<<___;
2579 movdqu 0x60($inp), @XMM[8+6]
2580 pxor @XMM[8+5], @XMM[5]
2581 movdqa @XMM[7], 0x70(%rsp)
2582 lea 0x70($inp), $inp
2583 pxor @XMM[8+6], @XMM[6]
2584 lea 0x80(%rsp), %rax # pass key schedule
2585 mov %edx, %r10d # pass rounds
2586
2587 call _bsaes_decrypt8
2588
2589 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2590 pxor 0x10(%rsp), @XMM[1]
2591 movdqu @XMM[0], 0x00($out) # write output
2592 pxor 0x20(%rsp), @XMM[6]
2593 movdqu @XMM[1], 0x10($out)
2594 pxor 0x30(%rsp), @XMM[4]
2595 movdqu @XMM[6], 0x20($out)
2596 pxor 0x40(%rsp), @XMM[2]
2597 movdqu @XMM[4], 0x30($out)
2598 pxor 0x50(%rsp), @XMM[7]
2599 movdqu @XMM[2], 0x40($out)
2600 pxor 0x60(%rsp), @XMM[3]
2601 movdqu @XMM[7], 0x50($out)
2602 movdqu @XMM[3], 0x60($out)
2603 lea 0x70($out), $out
2604
2605 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2606 jmp .Lxts_dec_done
2607.align 16
2608.Lxts_dec_6:
2609 pxor @XMM[8+4], @XMM[4]
2610 lea 0x60($inp), $inp
2611 pxor @XMM[8+5], @XMM[5]
2612 lea 0x80(%rsp), %rax # pass key schedule
2613 mov %edx, %r10d # pass rounds
2614
2615 call _bsaes_decrypt8
2616
2617 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2618 pxor 0x10(%rsp), @XMM[1]
2619 movdqu @XMM[0], 0x00($out) # write output
2620 pxor 0x20(%rsp), @XMM[6]
2621 movdqu @XMM[1], 0x10($out)
2622 pxor 0x30(%rsp), @XMM[4]
2623 movdqu @XMM[6], 0x20($out)
2624 pxor 0x40(%rsp), @XMM[2]
2625 movdqu @XMM[4], 0x30($out)
2626 pxor 0x50(%rsp), @XMM[7]
2627 movdqu @XMM[2], 0x40($out)
2628 movdqu @XMM[7], 0x50($out)
2629 lea 0x60($out), $out
2630
2631 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2632 jmp .Lxts_dec_done
2633.align 16
2634.Lxts_dec_5:
2635 pxor @XMM[8+3], @XMM[3]
2636 lea 0x50($inp), $inp
2637 pxor @XMM[8+4], @XMM[4]
2638 lea 0x80(%rsp), %rax # pass key schedule
2639 mov %edx, %r10d # pass rounds
2640
2641 call _bsaes_decrypt8
2642
2643 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2644 pxor 0x10(%rsp), @XMM[1]
2645 movdqu @XMM[0], 0x00($out) # write output
2646 pxor 0x20(%rsp), @XMM[6]
2647 movdqu @XMM[1], 0x10($out)
2648 pxor 0x30(%rsp), @XMM[4]
2649 movdqu @XMM[6], 0x20($out)
2650 pxor 0x40(%rsp), @XMM[2]
2651 movdqu @XMM[4], 0x30($out)
2652 movdqu @XMM[2], 0x40($out)
2653 lea 0x50($out), $out
2654
2655 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2656 jmp .Lxts_dec_done
2657.align 16
2658.Lxts_dec_4:
2659 pxor @XMM[8+2], @XMM[2]
2660 lea 0x40($inp), $inp
2661 pxor @XMM[8+3], @XMM[3]
2662 lea 0x80(%rsp), %rax # pass key schedule
2663 mov %edx, %r10d # pass rounds
2664
2665 call _bsaes_decrypt8
2666
2667 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2668 pxor 0x10(%rsp), @XMM[1]
2669 movdqu @XMM[0], 0x00($out) # write output
2670 pxor 0x20(%rsp), @XMM[6]
2671 movdqu @XMM[1], 0x10($out)
2672 pxor 0x30(%rsp), @XMM[4]
2673 movdqu @XMM[6], 0x20($out)
2674 movdqu @XMM[4], 0x30($out)
2675 lea 0x40($out), $out
2676
2677 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2678 jmp .Lxts_dec_done
2679.align 16
2680.Lxts_dec_3:
2681 pxor @XMM[8+1], @XMM[1]
2682 lea 0x30($inp), $inp
2683 pxor @XMM[8+2], @XMM[2]
2684 lea 0x80(%rsp), %rax # pass key schedule
2685 mov %edx, %r10d # pass rounds
2686
2687 call _bsaes_decrypt8
2688
2689 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2690 pxor 0x10(%rsp), @XMM[1]
2691 movdqu @XMM[0], 0x00($out) # write output
2692 pxor 0x20(%rsp), @XMM[6]
2693 movdqu @XMM[1], 0x10($out)
2694 movdqu @XMM[6], 0x20($out)
2695 lea 0x30($out), $out
2696
2697 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2698 jmp .Lxts_dec_done
2699.align 16
2700.Lxts_dec_2:
2701 pxor @XMM[8+0], @XMM[0]
2702 lea 0x20($inp), $inp
2703 pxor @XMM[8+1], @XMM[1]
2704 lea 0x80(%rsp), %rax # pass key schedule
2705 mov %edx, %r10d # pass rounds
2706
2707 call _bsaes_decrypt8
2708
2709 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2710 pxor 0x10(%rsp), @XMM[1]
2711 movdqu @XMM[0], 0x00($out) # write output
2712 movdqu @XMM[1], 0x10($out)
2713 lea 0x20($out), $out
2714
2715 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2716 jmp .Lxts_dec_done
2717.align 16
2718.Lxts_dec_1:
2719 pxor @XMM[0], @XMM[8]
2720 lea 0x10($inp), $inp
2721 movdqa @XMM[8], 0x20(%rbp)
2722 lea 0x20(%rbp), $arg1
2723 lea 0x20(%rbp), $arg2
2724 lea ($key), $arg3
2725 call asm_AES_decrypt # doesn't touch %xmm
2726 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2727 #pxor @XMM[8], @XMM[0]
2728 #lea 0x80(%rsp), %rax # pass key schedule
2729 #mov %edx, %r10d # pass rounds
2730 #call _bsaes_decrypt8
2731 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2732 movdqu @XMM[0], 0x00($out) # write output
2733 lea 0x10($out), $out
2734
2735 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2736
2737.Lxts_dec_done:
2738 and \$15, %ebx
2739 jz .Lxts_dec_ret
2740
2741 pxor $twtmp, $twtmp
2742 movdqa .Lxts_magic(%rip), $twmask
2743 pcmpgtd @XMM[7], $twtmp
2744 pshufd \$0x13, $twtmp, $twres
2745 movdqa @XMM[7], @XMM[6]
2746 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2747 pand $twmask, $twres # isolate carry and residue
2748 movdqu ($inp), @XMM[0]
2749 pxor $twres, @XMM[7]
2750
2751 lea 0x20(%rbp), $arg1
2752 pxor @XMM[7], @XMM[0]
2753 lea 0x20(%rbp), $arg2
2754 movdqa @XMM[0], 0x20(%rbp)
2755 lea ($key), $arg3
2756 call asm_AES_decrypt # doesn't touch %xmm
2757 pxor 0x20(%rbp), @XMM[7]
2758 mov $out, %rdx
2759 movdqu @XMM[7], ($out)
2760
2761.Lxts_dec_steal:
2762 movzb 16($inp), %eax
2763 movzb (%rdx), %ecx
2764 lea 1($inp), $inp
2765 mov %al, (%rdx)
2766 mov %cl, 16(%rdx)
2767 lea 1(%rdx), %rdx
2768 sub \$1,%ebx
2769 jnz .Lxts_dec_steal
2770
2771 movdqu ($out), @XMM[0]
2772 lea 0x20(%rbp), $arg1
2773 pxor @XMM[6], @XMM[0]
2774 lea 0x20(%rbp), $arg2
2775 movdqa @XMM[0], 0x20(%rbp)
2776 lea ($key), $arg3
2777 call asm_AES_decrypt # doesn't touch %xmm
2778 pxor 0x20(%rbp), @XMM[6]
2779 movdqu @XMM[6], ($out)
2780
2781.Lxts_dec_ret:
2782 lea (%rsp), %rax
2783 pxor %xmm0, %xmm0
2784.Lxts_dec_bzero: # wipe key schedule [if any]
2785 movdqa %xmm0, 0x00(%rax)
2786 movdqa %xmm0, 0x10(%rax)
2787 lea 0x20(%rax), %rax
2788 cmp %rax, %rbp
2789 ja .Lxts_dec_bzero
2790
2791 lea (%rbp),%rsp # restore %rsp
2792___
# Win64 only: reload %xmm6-%xmm15 and drop their 0xa0-byte save area.
2793$code.=<<___ if ($win64);
2794 movaps 0x40(%rbp), %xmm6
2795 movaps 0x50(%rbp), %xmm7
2796 movaps 0x60(%rbp), %xmm8
2797 movaps 0x70(%rbp), %xmm9
2798 movaps 0x80(%rbp), %xmm10
2799 movaps 0x90(%rbp), %xmm11
2800 movaps 0xa0(%rbp), %xmm12
2801 movaps 0xb0(%rbp), %xmm13
2802 movaps 0xc0(%rbp), %xmm14
2803 movaps 0xd0(%rbp), %xmm15
2804 lea 0xa0(%rbp), %rsp
2805___
# Epilogue: reload the six general-purpose registers pushed in the
# prologue, release the frame and return.
2806$code.=<<___;
2807 mov 0x48(%rsp), %r15
2808 mov 0x50(%rsp), %r14
2809 mov 0x58(%rsp), %r13
2810 mov 0x60(%rsp), %r12
2811 mov 0x68(%rsp), %rbx
2812 mov 0x70(%rsp), %rax
2813 lea 0x78(%rsp), %rsp
2814 mov %rax, %rbp
2815.Lxts_dec_epilogue:
2816 ret
2817.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2818___
2819}
# _bsaes_const: read-only constant pool, aligned to 64 bytes.  It holds
# the (Inv)ShiftRows shuffle masks, the bit-slice masks, the byte-swap
# masks, the counter increments .LADD1-.LADD8, the XTS tweak constant
# .Lxts_magic, the .Lmasks/.LM0/.L63 constants and the identification
# string.  Only assembler directives and data belong inside this
# heredoc: anything else is emitted verbatim into the assembler input.
2820$code.=<<___;
2821.type _bsaes_const,\@object
2822.align 64
2823_bsaes_const:
2824.LM0ISR: # InvShiftRows constants
2825 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2826.LISRM0:
2827 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2828.LISR:
2829 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2830.LBS0: # bit-slice constants
2831 .quad 0x5555555555555555, 0x5555555555555555
2832.LBS1:
2833 .quad 0x3333333333333333, 0x3333333333333333
2834.LBS2:
2835 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2836.LSR: # shiftrows constants
2837 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2838.LSRM0:
2839 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2840.LM0SR:
2841 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2842.LSWPUP: # byte-swap upper dword
2843 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2844.LSWPUPM0SR:
2845 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2846.LADD1: # counter increment constants
2847 .quad 0x0000000000000000, 0x0000000100000000
2848.LADD2:
2849 .quad 0x0000000000000000, 0x0000000200000000
2850.LADD3:
2851 .quad 0x0000000000000000, 0x0000000300000000
2852.LADD4:
2853 .quad 0x0000000000000000, 0x0000000400000000
2854.LADD5:
2855 .quad 0x0000000000000000, 0x0000000500000000
2856.LADD6:
2857 .quad 0x0000000000000000, 0x0000000600000000
2858.LADD7:
2859 .quad 0x0000000000000000, 0x0000000700000000
2860.LADD8:
2861 .quad 0x0000000000000000, 0x0000000800000000
2862.Lxts_magic:
2863 .long 0x87,0,1,0
2864.Lmasks:
2865 .quad 0x0101010101010101, 0x0101010101010101
2866 .quad 0x0202020202020202, 0x0202020202020202
2867 .quad 0x0404040404040404, 0x0404040404040404
2868 .quad 0x0808080808080808, 0x0808080808080808
2869.LM0:
2870 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2871.L63:
2872 .quad 0x6363636363636363, 0x6363636363636363
2873.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2874.align 64
2875.size _bsaes_const,.-_bsaes_const
2876___
2877
# Win64 structured-exception support.  When $win64 is set this emits
# se_handler together with the .pdata and .xdata tables that attach it
# to every public entry point of this module.
2878# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2879# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2880if ($win64) {
# Registers carrying the four handler arguments named in the prototype
# above.
2881$rec="%rcx";
2882$frame="%rdx";
2883$context="%r8";
2884$disp="%r9";
2885
# se_handler reads HandlerData[0] and HandlerData[1], i.e. the body and
# epilogue labels listed per function in .xdata below.
#  - context->Rip before the body label: context->Rax is written back as
#    context->Rsp (each entry point starts with "mov %rsp, %rax").
#  - context->Rip at or past the epilogue label: context->Rsp is kept.
#  - otherwise: the frame is taken from context->Rbp, the ten saved
#    %xmm registers are copied to &context.Xmm6, and %rbx, %rbp and
#    %r12-%r15 are restored into the CONTEXT from the frame.
# The CONTEXT is then copied to disp->ContextRecord, RtlVirtualUnwind is
# called, and the handler returns ExceptionContinueSearch.
2886$code.=<<___;
2887.extern __imp_RtlVirtualUnwind
2888.type se_handler,\@abi-omnipotent
2889.align 16
2890se_handler:
2891 push %rsi
2892 push %rdi
2893 push %rbx
2894 push %rbp
2895 push %r12
2896 push %r13
2897 push %r14
2898 push %r15
2899 pushfq
2900 sub \$64,%rsp
2901
2902 mov 120($context),%rax # pull context->Rax
2903 mov 248($context),%rbx # pull context->Rip
2904
2905 mov 8($disp),%rsi # disp->ImageBase
2906 mov 56($disp),%r11 # disp->HandlerData
2907
2908 mov 0(%r11),%r10d # HandlerData[0]
2909 lea (%rsi,%r10),%r10 # prologue label
2910 cmp %r10,%rbx # context->Rip<prologue label
2911 jb .Lin_prologue
2912
2913 mov 152($context),%rax # pull context->Rsp
2914
2915 mov 4(%r11),%r10d # HandlerData[1]
2916 lea (%rsi,%r10),%r10 # epilogue label
2917 cmp %r10,%rbx # context->Rip>=epilogue label
2918 jae .Lin_prologue
2919
2920 mov 160($context),%rax # pull context->Rbp
2921
2922 lea 0x40(%rax),%rsi # %xmm save area
2923 lea 512($context),%rdi # &context.Xmm6
2924 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2925 .long 0xa548f3fc # cld; rep movsq
2926 lea 0xa0(%rax),%rax # adjust stack pointer
2927
2928 mov 0x70(%rax),%rbp
2929 mov 0x68(%rax),%rbx
2930 mov 0x60(%rax),%r12
2931 mov 0x58(%rax),%r13
2932 mov 0x50(%rax),%r14
2933 mov 0x48(%rax),%r15
2934 lea 0x78(%rax),%rax # adjust stack pointer
2935 mov %rbx,144($context) # restore context->Rbx
2936 mov %rbp,160($context) # restore context->Rbp
2937 mov %r12,216($context) # restore context->R12
2938 mov %r13,224($context) # restore context->R13
2939 mov %r14,232($context) # restore context->R14
2940 mov %r15,240($context) # restore context->R15
2941
2942.Lin_prologue:
2943 mov %rax,152($context) # restore context->Rsp
2944
2945 mov 40($disp),%rdi # disp->ContextRecord
2946 mov $context,%rsi # context
2947 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2948 .long 0xa548f3fc # cld; rep movsq
2949
2950 mov $disp,%rsi
2951 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2952 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2953 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2954 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2955 mov 40(%rsi),%r10 # disp->ContextRecord
2956 lea 56(%rsi),%r11 # &disp->HandlerData
2957 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2958 mov %r10,32(%rsp) # arg5
2959 mov %r11,40(%rsp) # arg6
2960 mov %r12,48(%rsp) # arg7
2961 mov %rcx,56(%rsp) # arg8, (NULL)
2962 call *__imp_RtlVirtualUnwind(%rip)
2963
2964 mov \$1,%eax # ExceptionContinueSearch
2965 add \$64,%rsp
2966 popfq
2967 pop %r15
2968 pop %r14
2969 pop %r13
2970 pop %r12
2971 pop %rbp
2972 pop %rbx
2973 pop %rdi
2974 pop %rsi
2975 ret
2976.size se_handler,.-se_handler
2977
2978.section .pdata
2979.align 4
2980___
# .pdata rows for the ECB entry points, emitted only when $ecb is set.
# Each row is three .rva values: prologue label, epilogue label and the
# matching .xdata record.
2981$code.=<<___ if ($ecb);
2982 .rva .Lecb_enc_prologue
2983 .rva .Lecb_enc_epilogue
2984 .rva .Lecb_enc_info
2985
2986 .rva .Lecb_dec_prologue
2987 .rva .Lecb_dec_epilogue
2988 .rva .Lecb_dec_info
2989___
# .pdata rows for the CBC, CTR and XTS entry points, followed by the
# start of the .xdata section.
2990$code.=<<___;
2991 .rva .Lcbc_dec_prologue
2992 .rva .Lcbc_dec_epilogue
2993 .rva .Lcbc_dec_info
2994
2995 .rva .Lctr_enc_prologue
2996 .rva .Lctr_enc_epilogue
2997 .rva .Lctr_enc_info
2998
2999 .rva .Lxts_enc_prologue
3000 .rva .Lxts_enc_epilogue
3001 .rva .Lxts_enc_info
3002
3003 .rva .Lxts_dec_prologue
3004 .rva .Lxts_dec_epilogue
3005 .rva .Lxts_dec_info
3006
3007.section .xdata
3008.align 8
3009___
# .xdata records for the ECB entry points (only when $ecb is set).  Each
# record names se_handler and supplies the body and epilogue labels
# that the handler reads as HandlerData[0] and HandlerData[1].
3010$code.=<<___ if ($ecb);
3011.Lecb_enc_info:
3012 .byte 9,0,0,0
3013 .rva se_handler
3014 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3015.Lecb_dec_info:
3016 .byte 9,0,0,0
3017 .rva se_handler
3018 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3019___
# .xdata records for the CBC, CTR and XTS entry points, same layout.
3020$code.=<<___;
3021.Lcbc_dec_info:
3022 .byte 9,0,0,0
3023 .rva se_handler
3024 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3025.Lctr_enc_info:
3026 .byte 9,0,0,0
3027 .rva se_handler
3028 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3029.Lxts_enc_info:
3030 .byte 9,0,0,0
3031 .rva se_handler
3032 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3033.Lxts_dec_info:
3034 .byte 9,0,0,0
3035 .rva se_handler
3036 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3037___
3038}
3039
# Final pass: every backtick-quoted span in the generated text is
# replaced by the result of evaluating it as a Perl expression (this is
# how constants such as 0x10*$i were folded above), and the result is
# written to STDOUT.
3040$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3041
3042print $code;
3043
# Buffered write errors (full disk, closed pipe) only surface when the
# handle is closed.  Fail loudly so a truncated assembly file is never
# mistaken for a successful run.
3044close STDOUT or die "error closing STDOUT: $!";
3044close STDOUT;