Import OpenSSL-1.0.1d.
[dragonfly.git] / crypto / openssl / crypto / camellia / asm / cmll-x86_64.pl
CommitLineData
5febbddd
PA
1#!/usr/bin/env perl
2
3# ====================================================================
4# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5#
6# This module may be used under the terms of either the GNU General
7# Public License version 2 or later, the GNU Lesser General Public
8# License version 2.1 or later, the Mozilla Public License version
9# 1.1 or the BSD License. The exact terms of either license are
10# distributed along with this module. For further details see
11# http://www.openssl.org/~appro/camellia/.
12# ====================================================================
13
14# Performance in cycles per processed byte (less is better) in
15# 'openssl speed ...' benchmark:
16#
17# AMD64 Core2 EM64T
18# -evp camellia-128-ecb 16.7 21.0 22.7
19# + over gcc 3.4.6 +25% +5% 0%
20#
21# camellia-128-cbc 15.7 20.4 21.1
22#
23# 128-bit key setup 128 216 205 cycles/key
24# + over gcc 3.4.6 +54% +39% +15%
25#
26# Numbers in "+" rows represent performance improvement over compiler
27# generated code. Key setup timings are impressive on AMD and Core2
28# thanks to 64-bit operations being covertly deployed. Improvement on
29# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30# apparently emulates some of 64-bit operations in [32-bit] microcode.
31
32$flavour = shift;
33$output = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
38$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41die "can't locate x86_64-xlate.pl";
42
9bb344e0
PA
# Route everything printed on STDOUT through the x86_64-xlate.pl script
# located above, handing it $flavour and $output.  The interpreter, script
# and output paths are quoted so that names containing spaces survive the
# shell, and a failure to start the pipe is fatal rather than ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
5febbddd
PA
45
# hi(REG): rewrite a %eax/%rax-style name (a, b, c or d register) to its
# high-byte alias, e.g. "%eax" -> "%ah".  Any other string is returned as is.
# Declared without a prototype because it always takes one argument.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }
# lo(REG): rewrite a register name to its low-byte alias:
#   %eax/%rax style (a-d)  -> %al ...
#   %esi/%rdi style        -> %sil/%dil
#   %r8 .. %r15 (opt. "d") -> %r8b ...
# Any other string is returned unchanged.  Declared without a prototype
# because it always takes one argument.
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    $r;
}
50
51$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
52@S=("%r8d","%r9d","%r10d","%r11d");
53$i0="%esi";
54$i1="%edi";
55$Tbl="%rbp"; # size optimization
56$inp="%r12";
57$out="%r13";
58$key="%r14";
59$keyend="%r15";
60$arg0d=$win64?"%ecx":"%edi";
61
62# const unsigned int Camellia_SBOX[4][256];
63# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
64# and [2][] - with [3][]. This is done to minimize code size.
65$SBOX1_1110=0; # Camellia_SBOX[0]
66$SBOX4_4404=4; # Camellia_SBOX[1]
67$SBOX2_0222=2048; # Camellia_SBOX[2]
68$SBOX3_3033=2052; # Camellia_SBOX[3]
69
# Camellia_Feistel(ROUND [,SEED])
#
# Appends the assembler template for one Feistel round to $code.
#
#   ROUND  round index; its parity picks which half of @S is the round
#          input ($s0,$s1) and which half is updated ($s2,$s3).
#   SEED   byte displacement used when prefetching the next key words from
#          $key (default 0).  A negative SEED makes the prefetch step
#          through the key in -8 byte strides, otherwise +8.
#
# The template expects the current key words to be in $t1/$t0 on entry and
# leaves the next ones there on exit.  The `...` expressions are left in the
# text; they are evaluated when the finished $code is post-processed.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# all four state registers are lexical, nothing leaks into package scope
my ($s0,$s1,$s2,$s3)=@S[$j%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
 xor $s0,$t0 # t0^=key[0]
 xor $s1,$t1 # t1^=key[1]
 movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
 movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
 mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
 mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
 movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
 shr \$16,$t0
 movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
 xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
 shr \$16,$t1
 xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
 movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
 movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
 xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
 xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
 movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
 movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
 xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
 xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
 mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
 mov `$seed+($i+1)*$scale+4`($key),$t0
 xor $t3,$t2 # t2^=t3
 ror \$8,$t3 # t3=RightRotate(t3,8)
 xor $t2,$s2
 xor $t2,$s3
 xor $t3,$s3
___
}
107
108# void Camellia_EncryptBlock_Rounds(
109# int grandRounds,
110# const Byte plaintext[],
111# const KEY_TABLE_TYPE keyTable,
112# Byte ciphertext[])
113$code=<<___;
114.text
115
116# V1.x API
117.globl Camellia_EncryptBlock
118.type Camellia_EncryptBlock,\@abi-omnipotent
119.align 16
120Camellia_EncryptBlock:
121 movl \$128,%eax
122 subl $arg0d,%eax
123 movl \$3,$arg0d
124 adcl \$0,$arg0d # keyBitLength==128?3:4
125 jmp .Lenc_rounds
126.size Camellia_EncryptBlock,.-Camellia_EncryptBlock
127# V2
128.globl Camellia_EncryptBlock_Rounds
129.type Camellia_EncryptBlock_Rounds,\@function,4
130.align 16
131.Lenc_rounds:
132Camellia_EncryptBlock_Rounds:
133 push %rbx
134 push %rbp
135 push %r13
136 push %r14
137 push %r15
138.Lenc_prologue:
139
140 #mov %rsi,$inp # put away arguments
141 mov %rcx,$out
142 mov %rdx,$key
143
144 shl \$6,%edi # process grandRounds
145 lea .LCamellia_SBOX(%rip),$Tbl
146 lea ($key,%rdi),$keyend
147
148 mov 0(%rsi),@S[0] # load plaintext
149 mov 4(%rsi),@S[1]
150 mov 8(%rsi),@S[2]
151 bswap @S[0]
152 mov 12(%rsi),@S[3]
153 bswap @S[1]
154 bswap @S[2]
155 bswap @S[3]
156
157 call _x86_64_Camellia_encrypt
158
159 bswap @S[0]
160 bswap @S[1]
161 bswap @S[2]
162 mov @S[0],0($out)
163 bswap @S[3]
164 mov @S[1],4($out)
165 mov @S[2],8($out)
166 mov @S[3],12($out)
167
168 mov 0(%rsp),%r15
169 mov 8(%rsp),%r14
170 mov 16(%rsp),%r13
171 mov 24(%rsp),%rbp
172 mov 32(%rsp),%rbx
173 lea 40(%rsp),%rsp
174.Lenc_epilogue:
175 ret
176.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
177
178.type _x86_64_Camellia_encrypt,\@abi-omnipotent
179.align 16
180_x86_64_Camellia_encrypt:
181 xor 0($key),@S[1]
182 xor 4($key),@S[0] # ^=key[0-3]
183 xor 8($key),@S[3]
184 xor 12($key),@S[2]
185.align 16
186.Leloop:
187 mov 16($key),$t1 # prefetch key[4-5]
188 mov 20($key),$t0
189
190___
191 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
192$code.=<<___;
193 lea 16*4($key),$key
194 cmp $keyend,$key
195 mov 8($key),$t3 # prefetch key[2-3]
196 mov 12($key),$t2
197 je .Ledone
198
199 and @S[0],$t0
200 or @S[3],$t3
201 rol \$1,$t0
202 xor $t3,@S[2] # s2^=s3|key[3];
203 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
204 and @S[2],$t2
205 or @S[1],$t1
206 rol \$1,$t2
207 xor $t1,@S[0] # s0^=s1|key[1];
208 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
209 jmp .Leloop
210
211.align 16
212.Ledone:
213 xor @S[2],$t0 # SwapHalf
214 xor @S[3],$t1
215 xor @S[0],$t2
216 xor @S[1],$t3
217
218 mov $t0,@S[0]
219 mov $t1,@S[1]
220 mov $t2,@S[2]
221 mov $t3,@S[3]
222
223 .byte 0xf3,0xc3 # rep ret
224.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
225
226# V1.x API
227.globl Camellia_DecryptBlock
228.type Camellia_DecryptBlock,\@abi-omnipotent
229.align 16
230Camellia_DecryptBlock:
231 movl \$128,%eax
232 subl $arg0d,%eax
233 movl \$3,$arg0d
234 adcl \$0,$arg0d # keyBitLength==128?3:4
235 jmp .Ldec_rounds
236.size Camellia_DecryptBlock,.-Camellia_DecryptBlock
237# V2
238.globl Camellia_DecryptBlock_Rounds
239.type Camellia_DecryptBlock_Rounds,\@function,4
240.align 16
241.Ldec_rounds:
242Camellia_DecryptBlock_Rounds:
243 push %rbx
244 push %rbp
245 push %r13
246 push %r14
247 push %r15
248.Ldec_prologue:
249
250 #mov %rsi,$inp # put away arguments
251 mov %rcx,$out
252 mov %rdx,$keyend
253
254 shl \$6,%edi # process grandRounds
255 lea .LCamellia_SBOX(%rip),$Tbl
256 lea ($keyend,%rdi),$key
257
258 mov 0(%rsi),@S[0] # load plaintext
259 mov 4(%rsi),@S[1]
260 mov 8(%rsi),@S[2]
261 bswap @S[0]
262 mov 12(%rsi),@S[3]
263 bswap @S[1]
264 bswap @S[2]
265 bswap @S[3]
266
267 call _x86_64_Camellia_decrypt
268
269 bswap @S[0]
270 bswap @S[1]
271 bswap @S[2]
272 mov @S[0],0($out)
273 bswap @S[3]
274 mov @S[1],4($out)
275 mov @S[2],8($out)
276 mov @S[3],12($out)
277
278 mov 0(%rsp),%r15
279 mov 8(%rsp),%r14
280 mov 16(%rsp),%r13
281 mov 24(%rsp),%rbp
282 mov 32(%rsp),%rbx
283 lea 40(%rsp),%rsp
284.Ldec_epilogue:
285 ret
286.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
287
288.type _x86_64_Camellia_decrypt,\@abi-omnipotent
289.align 16
290_x86_64_Camellia_decrypt:
291 xor 0($key),@S[1]
292 xor 4($key),@S[0] # ^=key[0-3]
293 xor 8($key),@S[3]
294 xor 12($key),@S[2]
295.align 16
296.Ldloop:
297 mov -8($key),$t1 # prefetch key[4-5]
298 mov -4($key),$t0
299
300___
301 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
302$code.=<<___;
303 lea -16*4($key),$key
304 cmp $keyend,$key
305 mov 0($key),$t3 # prefetch key[2-3]
306 mov 4($key),$t2
307 je .Lddone
308
309 and @S[0],$t0
310 or @S[3],$t3
311 rol \$1,$t0
312 xor $t3,@S[2] # s2^=s3|key[3];
313 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
314 and @S[2],$t2
315 or @S[1],$t1
316 rol \$1,$t2
317 xor $t1,@S[0] # s0^=s1|key[1];
318 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
319
320 jmp .Ldloop
321
322.align 16
323.Lddone:
324 xor @S[2],$t2
325 xor @S[3],$t3
326 xor @S[0],$t0
327 xor @S[1],$t1
328
329 mov $t2,@S[0] # SwapHalf
330 mov $t3,@S[1]
331 mov $t0,@S[2]
332 mov $t1,@S[3]
333
334 .byte 0xf3,0xc3 # rep ret
335.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
336___
337
# _saveround(RND, KEY, [BIAS,] REG...)
#
# Appends "mov REG,disp(KEY)" stores to $code, where disp is left as the
# expression `BIAS+RND*8+n` for the final eval pass.
#
#   RND   slot number; each slot is 8 bytes wide
#   KEY   base register of the destination
#   BIAS  optional integer byte displacement (default 0).  It is recognised
#         by being a plain integer, so an explicit bias of 0 works too and
#         is not mistaken for a register operand.
#   REG   either four registers, stored pairwise swapped at +0,+4,+8,+12
#         (REG[1],REG[0],REG[3],REG[2]), or one/two registers stored at
#         +0 and +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0]=~/^[-+]?\d+$/)?shift(@T):0;

    if (@T==4) {
	$code.=<<___;
 mov $T[1],`$bias+$rnd*8+0`($key)
 mov $T[0],`$bias+$rnd*8+4`($key)
 mov $T[3],`$bias+$rnd*8+8`($key)
 mov $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.=" mov $T[0],`$bias+$rnd*8+0`($key)\n";
	$code.=" mov $T[1],`$bias+$rnd*8+8`($key)\n"	if (@T>=2);
    }
}
354
# _loadround(RND, KEY, [BIAS,] REG [,REG])
#
# Appends "mov disp(KEY),REG" loads to $code: the first register is loaded
# from `BIAS+RND*8+0`, the optional second one from `BIAS+RND*8+8`.  The
# displacement expression is evaluated by the final eval pass.  BIAS is an
# optional integer (default 0); it is recognised by being a plain integer,
# so an explicit 0 is accepted and not mistaken for a register operand.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0]=~/^[-+]?\d+$/)?shift(@T):0;

$code.=" mov `$bias+$rnd*8+0`($key),$T[0]\n";
$code.=" mov `$bias+$rnd*8+8`($key),$T[1]\n"	if (@T>=2);
}
362
363# shld is very slow on Intel EM64T family. Even on AMD it limits
364# instruction decode rate [because it's VectorPath] and consequently
365# performance...
# __rotl128(HI, LO, ROT) - shld-based variant of the 128-bit left rotate.
#
# Appends code that rotates the register pair HI:LO left by ROT bits using
# two shld instructions, with %r11 as scratch.  A ROT of 0 emits nothing.
# ROT must be in 0..63; anything else would produce a meaningless shift
# count, so it is rejected at generation time.
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    die "__rotl128: rotate count $rot out of range 0..63"
	if ($rot<0 || $rot>63);
    return unless ($rot);

    $code.=<<___;
 mov $i0,%r11
 shld \$$rot,$i1,$i0
 shld \$$rot,%r11,$i1
___
}
377
378# ... Implementing 128-bit rotate without shld gives 80% better
379# performance on EM64T, +15% on AMD64 and only ~7% degradation on
380# Core2. This is therefore preferred.
# _rotl128(HI, LO, ROT) - 128-bit left rotate built from shl/shr/or.
#
# Appends code that rotates the register pair HI:LO left by ROT bits.
# %r9 and %r11 are used as scratch and are overwritten.  A ROT of 0 emits
# nothing.  ROT must be in 0..63: the template shifts right by 64-ROT, which
# is not a valid count for larger values, so they are rejected at
# generation time instead of silently producing bad code.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    die "_rotl128: rotate count $rot out of range 0..63"
	if ($rot<0 || $rot>63);
    return unless ($rot);

    $code.=<<___;
 mov $i0,%r11
 shl \$$rot,$i0
 mov $i1,%r9
 shr \$`64-$rot`,%r9
 shr \$`64-$rot`,%r11
 or %r9,$i0
 shl \$$rot,$i1
 or %r11,$i1
___
}
397
398{ my $step=0;
399
400$code.=<<___;
401.globl Camellia_Ekeygen
402.type Camellia_Ekeygen,\@function,3
403.align 16
404Camellia_Ekeygen:
405 push %rbx
406 push %rbp
407 push %r13
408 push %r14
409 push %r15
410.Lkey_prologue:
411
412 mov %rdi,$keyend # put away arguments, keyBitLength
413 mov %rdx,$out # keyTable
414
415 mov 0(%rsi),@S[0] # load 0-127 bits
416 mov 4(%rsi),@S[1]
417 mov 8(%rsi),@S[2]
418 mov 12(%rsi),@S[3]
419
420 bswap @S[0]
421 bswap @S[1]
422 bswap @S[2]
423 bswap @S[3]
424___
425 &_saveround (0,$out,@S); # KL<<<0
426$code.=<<___;
427 cmp \$128,$keyend # check keyBitLength
428 je .L1st128
429
430 mov 16(%rsi),@S[0] # load 128-191 bits
431 mov 20(%rsi),@S[1]
432 cmp \$192,$keyend
433 je .L1st192
434 mov 24(%rsi),@S[2] # load 192-255 bits
435 mov 28(%rsi),@S[3]
436 jmp .L1st256
437.L1st192:
438 mov @S[0],@S[2]
439 mov @S[1],@S[3]
440 not @S[2]
441 not @S[3]
442.L1st256:
443 bswap @S[0]
444 bswap @S[1]
445 bswap @S[2]
446 bswap @S[3]
447___
448 &_saveround (4,$out,@S); # temp storage for KR!
449$code.=<<___;
450 xor 0($out),@S[1] # KR^KL
451 xor 4($out),@S[0]
452 xor 8($out),@S[3]
453 xor 12($out),@S[2]
454
455.L1st128:
456 lea .LCamellia_SIGMA(%rip),$key
457 lea .LCamellia_SBOX(%rip),$Tbl
458
459 mov 0($key),$t1
460 mov 4($key),$t0
461___
462 &Camellia_Feistel($step++);
463 &Camellia_Feistel($step++);
464$code.=<<___;
465 xor 0($out),@S[1] # ^KL
466 xor 4($out),@S[0]
467 xor 8($out),@S[3]
468 xor 12($out),@S[2]
469___
470 &Camellia_Feistel($step++);
471 &Camellia_Feistel($step++);
472$code.=<<___;
473 cmp \$128,$keyend
474 jne .L2nd256
475
476 lea 128($out),$out # size optimization
477 shl \$32,%r8 # @S[0]||
478 shl \$32,%r10 # @S[2]||
479 or %r9,%r8 # ||@S[1]
480 or %r11,%r10 # ||@S[3]
481___
482 &_loadround (0,$out,-128,"%rax","%rbx"); # KL
483 &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0
484 &_rotl128 ("%rax","%rbx",15);
485 &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15
486 &_rotl128 ("%r8","%r10",15);
487 &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15
488 &_rotl128 ("%r8","%r10",15); # 15+15=30
489 &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30
490 &_rotl128 ("%rax","%rbx",30); # 15+30=45
491 &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45
492 &_rotl128 ("%r8","%r10",15); # 30+15=45
493 &_saveround (12,$out,-128,"%r8"); # KA<<<45
494 &_rotl128 ("%rax","%rbx",15); # 45+15=60
495 &_saveround (13,$out,-128,"%rbx"); # KL<<<60
496 &_rotl128 ("%r8","%r10",15); # 45+15=60
497 &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60
498 &_rotl128 ("%rax","%rbx",17); # 60+17=77
499 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77
500 &_rotl128 ("%rax","%rbx",17); # 77+17=94
501 &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94
502 &_rotl128 ("%r8","%r10",34); # 60+34=94
503 &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94
504 &_rotl128 ("%rax","%rbx",17); # 94+17=111
505 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111
506 &_rotl128 ("%r8","%r10",17); # 94+17=111
507 &_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111
508$code.=<<___;
509 mov \$3,%eax
510 jmp .Ldone
511.align 16
512.L2nd256:
513___
514 &_saveround (6,$out,@S); # temp storage for KA!
515$code.=<<___;
516 xor `4*8+0`($out),@S[1] # KA^KR
517 xor `4*8+4`($out),@S[0]
518 xor `5*8+0`($out),@S[3]
519 xor `5*8+4`($out),@S[2]
520___
521 &Camellia_Feistel($step++);
522 &Camellia_Feistel($step++);
523
524 &_loadround (0,$out,"%rax","%rbx"); # KL
525 &_loadround (4,$out,"%rcx","%rdx"); # KR
526 &_loadround (6,$out,"%r14","%r15"); # KA
527$code.=<<___;
528 lea 128($out),$out # size optimization
529 shl \$32,%r8 # @S[0]||
530 shl \$32,%r10 # @S[2]||
531 or %r9,%r8 # ||@S[1]
532 or %r11,%r10 # ||@S[3]
533___
# Subkey stores for the path taken when keyBitLength is not 128 (.L2nd256).
# Register pairs at this point, per the loads and comments just above:
#   %rax:%rbx = KL, %rcx:%rdx = KR, %r14:%r15 = KA, %r8:%r10 = KB.
# $out was advanced by 128 above, hence the -128 bias on every store.
# Each _rotl128 rotates a pair in place, so the trailing comments track the
# cumulative rotation (previous + this step) of the value being saved.
534 &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0
535 &_rotl128 ("%rcx","%rdx",15);
536 &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15
537 &_rotl128 ("%r14","%r15",15);
538 &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15
539 &_rotl128 ("%rcx","%rdx",15); # 15+15=30
540 &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30
541 &_rotl128 ("%r8","%r10",30);
542 &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30
543 &_rotl128 ("%rax","%rbx",45);
544 &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45
545 &_rotl128 ("%r14","%r15",30); # 15+30=45
546 &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45
547 &_rotl128 ("%rax","%rbx",15); # 45+15=60
548 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60
549 &_rotl128 ("%rcx","%rdx",30); # 30+30=60
550 &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60
551 &_rotl128 ("%r8","%r10",30); # 30+30=60
552 &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60
553 &_rotl128 ("%rax","%rbx",17); # 60+17=77
554 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77
555 &_rotl128 ("%r14","%r15",32); # 45+32=77
556 &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77
557 &_rotl128 ("%rcx","%rdx",34); # 60+34=94
558 &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94
559 &_rotl128 ("%r14","%r15",17); # 77+17=94
560 &_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94
561 &_rotl128 ("%rax","%rbx",34); # 77+34=111
562 &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111
563 &_rotl128 ("%r8","%r10",51); # 60+51=111
564 &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111
565$code.=<<___;
566 mov \$4,%eax
567.Ldone:
568 mov 0(%rsp),%r15
569 mov 8(%rsp),%r14
570 mov 16(%rsp),%r13
571 mov 24(%rsp),%rbp
572 mov 32(%rsp),%rbx
573 lea 40(%rsp),%rsp
574.Lkey_epilogue:
575 ret
576.size Camellia_Ekeygen,.-Camellia_Ekeygen
577___
578}
579
580@SBOX=(
581112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
582 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
583134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
584166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
585139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
586223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
587 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
588254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
589170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
590 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
591135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
592 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
593233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
594120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
595114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
596 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
597
# S1110(i): table word with SBOX[i] in the three upper bytes and a zero low
# byte, formatted as a "0x%08x" literal.
sub S1110 { my $i=shift; my $v=$SBOX[$i]; sprintf("0x%08x",$v<<24|$v<<16|$v<<8); }
# S4404(i): looks up SBOX at the index rotated left by one bit (8-bit
# rotate) and places the byte in positions 3, 2 and 0 (byte 1 is zero),
# formatted as a "0x%08x" literal.
sub S4404 { my $i=shift; my $v=$SBOX[($i<<1|$i>>7)&0xff]; sprintf("0x%08x",$v<<24|$v<<16|$v); }
# S0222(i): SBOX[i] rotated left by one bit (8-bit rotate), placed in the
# three lower bytes (top byte is zero), formatted as a "0x%08x" literal.
sub S0222 { my $i=shift; my $v=$SBOX[$i]; $v=($v<<1|$v>>7)&0xff; sprintf("0x%08x",$v<<16|$v<<8|$v); }
# S3033(i): SBOX[i] rotated right by one bit (8-bit rotate), placed in
# bytes 3, 1 and 0 (byte 2 is zero), formatted as a "0x%08x" literal.
sub S3033 { my $i=shift; my $v=$SBOX[$i]; $v=($v>>1|$v<<7)&0xff; sprintf("0x%08x",$v<<24|$v<<8|$v); }
602
603$code.=<<___;
604.align 64
605.LCamellia_SIGMA:
606.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
607.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
608.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
609.long 0, 0, 0, 0
610.LCamellia_SBOX:
611___
612# tables are interleaved, remember?
# data_word(WORD...): append one ".long" directive to $code carrying all
# the given words, separated by commas.
sub data_word {
    my @words=@_;
    my $line=".long\t";
    $line.=join(',',@words);
    $code.=$line."\n";
}
614for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
615for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
616
617# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
618# size_t length, const CAMELLIA_KEY *key,
619# unsigned char *ivp,const int enc);
620{
621$_key="0(%rsp)";
622$_end="8(%rsp)"; # inp+len&~15
623$_res="16(%rsp)"; # len&15
624$ivec="24(%rsp)";
625$_ivp="40(%rsp)";
626$_rsp="48(%rsp)";
627
628$code.=<<___;
629.globl Camellia_cbc_encrypt
630.type Camellia_cbc_encrypt,\@function,6
631.align 16
632Camellia_cbc_encrypt:
633 cmp \$0,%rdx
634 je .Lcbc_abort
635 push %rbx
636 push %rbp
637 push %r12
638 push %r13
639 push %r14
640 push %r15
641.Lcbc_prologue:
642
643 mov %rsp,%rbp
644 sub \$64,%rsp
645 and \$-64,%rsp
646
647 # place stack frame just "above mod 1024" the key schedule,
648 # this ensures that cache associativity suffices
649 lea -64-63(%rcx),%r10
650 sub %rsp,%r10
651 neg %r10
652 and \$0x3C0,%r10
653 sub %r10,%rsp
654 #add \$8,%rsp # 8 is reserved for callee's ra
655
656 mov %rdi,$inp # inp argument
657 mov %rsi,$out # out argument
658 mov %r8,%rbx # ivp argument
659 mov %rcx,$key # key argument
660 mov 272(%rcx),${keyend}d # grandRounds
661
662 mov %r8,$_ivp
663 mov %rbp,$_rsp
664
665.Lcbc_body:
666 lea .LCamellia_SBOX(%rip),$Tbl
667
668 mov \$32,%ecx
669.align 4
670.Lcbc_prefetch_sbox:
671 mov 0($Tbl),%rax
672 mov 32($Tbl),%rsi
673 mov 64($Tbl),%rdi
674 mov 96($Tbl),%r11
675 lea 128($Tbl),$Tbl
676 loop .Lcbc_prefetch_sbox
677 sub \$4096,$Tbl
678 shl \$6,$keyend
679 mov %rdx,%rcx # len argument
680 lea ($key,$keyend),$keyend
681
682 cmp \$0,%r9d # enc argument
683 je .LCBC_DECRYPT
684
685 and \$-16,%rdx
686 and \$15,%rcx # length residue
687 lea ($inp,%rdx),%rdx
688 mov $key,$_key
689 mov %rdx,$_end
690 mov %rcx,$_res
691
692 cmp $inp,%rdx
693 mov 0(%rbx),@S[0] # load IV
694 mov 4(%rbx),@S[1]
695 mov 8(%rbx),@S[2]
696 mov 12(%rbx),@S[3]
697 je .Lcbc_enc_tail
698 jmp .Lcbc_eloop
699
700.align 16
701.Lcbc_eloop:
702 xor 0($inp),@S[0]
703 xor 4($inp),@S[1]
704 xor 8($inp),@S[2]
705 bswap @S[0]
706 xor 12($inp),@S[3]
707 bswap @S[1]
708 bswap @S[2]
709 bswap @S[3]
710
711 call _x86_64_Camellia_encrypt
712
713 mov $_key,$key # "rewind" the key
714 bswap @S[0]
715 mov $_end,%rdx
716 bswap @S[1]
717 mov $_res,%rcx
718 bswap @S[2]
719 mov @S[0],0($out)
720 bswap @S[3]
721 mov @S[1],4($out)
722 mov @S[2],8($out)
723 lea 16($inp),$inp
724 mov @S[3],12($out)
725 cmp %rdx,$inp
726 lea 16($out),$out
727 jne .Lcbc_eloop
728
729 cmp \$0,%rcx
730 jne .Lcbc_enc_tail
731
732 mov $_ivp,$out
733 mov @S[0],0($out) # write out IV residue
734 mov @S[1],4($out)
735 mov @S[2],8($out)
736 mov @S[3],12($out)
737 jmp .Lcbc_done
738
739.align 16
740.Lcbc_enc_tail:
741 xor %rax,%rax
742 mov %rax,0+$ivec
743 mov %rax,8+$ivec
744 mov %rax,$_res
745
746.Lcbc_enc_pushf:
747 pushfq
748 cld
749 mov $inp,%rsi
750 lea 8+$ivec,%rdi
751 .long 0x9066A4F3 # rep movsb
752 popfq
753.Lcbc_enc_popf:
754
755 lea $ivec,$inp
756 lea 16+$ivec,%rax
757 mov %rax,$_end
758 jmp .Lcbc_eloop # one more time
759
760.align 16
761.LCBC_DECRYPT:
762 xchg $key,$keyend
763 add \$15,%rdx
764 and \$15,%rcx # length residue
765 and \$-16,%rdx
766 mov $key,$_key
767 lea ($inp,%rdx),%rdx
768 mov %rdx,$_end
769 mov %rcx,$_res
770
771 mov (%rbx),%rax # load IV
772 mov 8(%rbx),%rbx
773 jmp .Lcbc_dloop
774.align 16
775.Lcbc_dloop:
776 mov 0($inp),@S[0]
777 mov 4($inp),@S[1]
778 mov 8($inp),@S[2]
779 bswap @S[0]
780 mov 12($inp),@S[3]
781 bswap @S[1]
782 mov %rax,0+$ivec # save IV to temporary storage
783 bswap @S[2]
784 mov %rbx,8+$ivec
785 bswap @S[3]
786
787 call _x86_64_Camellia_decrypt
788
789 mov $_key,$key # "rewind" the key
790 mov $_end,%rdx
791 mov $_res,%rcx
792
793 bswap @S[0]
794 mov ($inp),%rax # load IV for next iteration
795 bswap @S[1]
796 mov 8($inp),%rbx
797 bswap @S[2]
798 xor 0+$ivec,@S[0]
799 bswap @S[3]
800 xor 4+$ivec,@S[1]
801 xor 8+$ivec,@S[2]
802 lea 16($inp),$inp
803 xor 12+$ivec,@S[3]
804 cmp %rdx,$inp
805 je .Lcbc_ddone
806
807 mov @S[0],0($out)
808 mov @S[1],4($out)
809 mov @S[2],8($out)
810 mov @S[3],12($out)
811
812 lea 16($out),$out
813 jmp .Lcbc_dloop
814
815.align 16
816.Lcbc_ddone:
817 mov $_ivp,%rdx
818 cmp \$0,%rcx
819 jne .Lcbc_dec_tail
820
821 mov @S[0],0($out)
822 mov @S[1],4($out)
823 mov @S[2],8($out)
824 mov @S[3],12($out)
825
826 mov %rax,(%rdx) # write out IV residue
827 mov %rbx,8(%rdx)
828 jmp .Lcbc_done
829.align 16
830.Lcbc_dec_tail:
831 mov @S[0],0+$ivec
832 mov @S[1],4+$ivec
833 mov @S[2],8+$ivec
834 mov @S[3],12+$ivec
835
836.Lcbc_dec_pushf:
837 pushfq
838 cld
839 lea 8+$ivec,%rsi
840 lea ($out),%rdi
841 .long 0x9066A4F3 # rep movsb
842 popfq
843.Lcbc_dec_popf:
844
845 mov %rax,(%rdx) # write out IV residue
846 mov %rbx,8(%rdx)
847 jmp .Lcbc_done
848
849.align 16
850.Lcbc_done:
851 mov $_rsp,%rcx
852 mov 0(%rcx),%r15
853 mov 8(%rcx),%r14
854 mov 16(%rcx),%r13
855 mov 24(%rcx),%r12
856 mov 32(%rcx),%rbp
857 mov 40(%rcx),%rbx
858 lea 48(%rcx),%rsp
859.Lcbc_abort:
860 ret
861.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
862
863.asciz "Camellia for x86_64 by <appro\@openssl.org>"
864___
865}
866
867# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
868# CONTEXT *context,DISPATCHER_CONTEXT *disp)
869if ($win64) {
870$rec="%rcx";
871$frame="%rdx";
872$context="%r8";
873$disp="%r9";
874
875$code.=<<___;
876.extern __imp_RtlVirtualUnwind
877.type common_se_handler,\@abi-omnipotent
878.align 16
879common_se_handler:
880 push %rsi
881 push %rdi
882 push %rbx
883 push %rbp
884 push %r12
885 push %r13
886 push %r14
887 push %r15
888 pushfq
889 lea -64(%rsp),%rsp
890
891 mov 120($context),%rax # pull context->Rax
892 mov 248($context),%rbx # pull context->Rip
893
894 mov 8($disp),%rsi # disp->ImageBase
895 mov 56($disp),%r11 # disp->HandlerData
896
897 mov 0(%r11),%r10d # HandlerData[0]
898 lea (%rsi,%r10),%r10 # prologue label
899 cmp %r10,%rbx # context->Rip<prologue label
900 jb .Lin_prologue
901
902 mov 152($context),%rax # pull context->Rsp
903
904 mov 4(%r11),%r10d # HandlerData[1]
905 lea (%rsi,%r10),%r10 # epilogue label
906 cmp %r10,%rbx # context->Rip>=epilogue label
907 jae .Lin_prologue
908
909 lea 40(%rax),%rax
910 mov -8(%rax),%rbx
911 mov -16(%rax),%rbp
912 mov -24(%rax),%r13
913 mov -32(%rax),%r14
914 mov -40(%rax),%r15
915 mov %rbx,144($context) # restore context->Rbx
916 mov %rbp,160($context) # restore context->Rbp
917 mov %r13,224($context) # restore context->R13
918 mov %r14,232($context) # restore context->R14
919 mov %r15,240($context) # restore context->R15
920
921.Lin_prologue:
922 mov 8(%rax),%rdi
923 mov 16(%rax),%rsi
924 mov %rax,152($context) # restore context->Rsp
925 mov %rsi,168($context) # restore context->Rsi
926 mov %rdi,176($context) # restore context->Rdi
927
928 jmp .Lcommon_seh_exit
929.size common_se_handler,.-common_se_handler
930
931.type cbc_se_handler,\@abi-omnipotent
932.align 16
933cbc_se_handler:
934 push %rsi
935 push %rdi
936 push %rbx
937 push %rbp
938 push %r12
939 push %r13
940 push %r14
941 push %r15
942 pushfq
943 lea -64(%rsp),%rsp
944
945 mov 120($context),%rax # pull context->Rax
946 mov 248($context),%rbx # pull context->Rip
947
948 lea .Lcbc_prologue(%rip),%r10
949 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
950 jb .Lin_cbc_prologue
951
952 lea .Lcbc_body(%rip),%r10
953 cmp %r10,%rbx # context->Rip<.Lcbc_body
954 jb .Lin_cbc_frame_setup
955
956 mov 152($context),%rax # pull context->Rsp
957
958 lea .Lcbc_abort(%rip),%r10
959 cmp %r10,%rbx # context->Rip>=.Lcbc_abort
960 jae .Lin_cbc_prologue
961
962 # handle pushf/popf in Camellia_cbc_encrypt
963 lea .Lcbc_enc_pushf(%rip),%r10
964 cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
965 jbe .Lin_cbc_no_flag
966 lea 8(%rax),%rax
967 lea .Lcbc_enc_popf(%rip),%r10
968 cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
969 jb .Lin_cbc_no_flag
970 lea -8(%rax),%rax
971 lea .Lcbc_dec_pushf(%rip),%r10
972 cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
973 jbe .Lin_cbc_no_flag
974 lea 8(%rax),%rax
975 lea .Lcbc_dec_popf(%rip),%r10
976 cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
977 jb .Lin_cbc_no_flag
978 lea -8(%rax),%rax
979
980.Lin_cbc_no_flag:
981 mov 48(%rax),%rax # $_rsp
982 lea 48(%rax),%rax
983
984.Lin_cbc_frame_setup:
985 mov -8(%rax),%rbx
986 mov -16(%rax),%rbp
987 mov -24(%rax),%r12
988 mov -32(%rax),%r13
989 mov -40(%rax),%r14
990 mov -48(%rax),%r15
991 mov %rbx,144($context) # restore context->Rbx
992 mov %rbp,160($context) # restore context->Rbp
993 mov %r12,216($context) # restore context->R12
994 mov %r13,224($context) # restore context->R13
995 mov %r14,232($context) # restore context->R14
996 mov %r15,240($context) # restore context->R15
997
998.Lin_cbc_prologue:
999 mov 8(%rax),%rdi
1000 mov 16(%rax),%rsi
1001 mov %rax,152($context) # restore context->Rsp
1002 mov %rsi,168($context) # restore context->Rsi
1003 mov %rdi,176($context) # restore context->Rdi
1004
1005.align 4
1006.Lcommon_seh_exit:
1007
1008 mov 40($disp),%rdi # disp->ContextRecord
1009 mov $context,%rsi # context
1010 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1011 .long 0xa548f3fc # cld; rep movsq
1012
1013 mov $disp,%rsi
1014 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1015 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1016 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1017 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1018 mov 40(%rsi),%r10 # disp->ContextRecord
1019 lea 56(%rsi),%r11 # &disp->HandlerData
1020 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1021 mov %r10,32(%rsp) # arg5
1022 mov %r11,40(%rsp) # arg6
1023 mov %r12,48(%rsp) # arg7
1024 mov %rcx,56(%rsp) # arg8, (NULL)
1025 call *__imp_RtlVirtualUnwind(%rip)
1026
1027 mov \$1,%eax # ExceptionContinueSearch
1028 lea 64(%rsp),%rsp
1029 popfq
1030 pop %r15
1031 pop %r14
1032 pop %r13
1033 pop %r12
1034 pop %rbp
1035 pop %rbx
1036 pop %rdi
1037 pop %rsi
1038 ret
1039.size cbc_se_handler,.-cbc_se_handler
1040
1041.section .pdata
1042.align 4
1043 .rva .LSEH_begin_Camellia_EncryptBlock_Rounds
1044 .rva .LSEH_end_Camellia_EncryptBlock_Rounds
1045 .rva .LSEH_info_Camellia_EncryptBlock_Rounds
1046
1047 .rva .LSEH_begin_Camellia_DecryptBlock_Rounds
1048 .rva .LSEH_end_Camellia_DecryptBlock_Rounds
1049 .rva .LSEH_info_Camellia_DecryptBlock_Rounds
1050
1051 .rva .LSEH_begin_Camellia_Ekeygen
1052 .rva .LSEH_end_Camellia_Ekeygen
1053 .rva .LSEH_info_Camellia_Ekeygen
1054
1055 .rva .LSEH_begin_Camellia_cbc_encrypt
1056 .rva .LSEH_end_Camellia_cbc_encrypt
1057 .rva .LSEH_info_Camellia_cbc_encrypt
1058
1059.section .xdata
1060.align 8
1061.LSEH_info_Camellia_EncryptBlock_Rounds:
1062 .byte 9,0,0,0
1063 .rva common_se_handler
1064 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
1065.LSEH_info_Camellia_DecryptBlock_Rounds:
1066 .byte 9,0,0,0
1067 .rva common_se_handler
1068 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
1069.LSEH_info_Camellia_Ekeygen:
1070 .byte 9,0,0,0
1071 .rva common_se_handler
1072 .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
1073.LSEH_info_Camellia_cbc_encrypt:
1074 .byte 9,0,0,0
1075 .rva cbc_se_handler
1076___
1077}
1078
# Replace every `...` span in the template with the result of evaluating it
# as Perl (displacement arithmetic and the &hi()/&lo() register helpers),
# then write the finished text out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT was re-pointed at the pipe to the translator script, so close() is
# where a failed write or a non-zero exit of that child shows up.  Treat it
# as fatal instead of silently producing a truncated or missing output file.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";