Import OpenSSL-1.0.1d.
[dragonfly.git] / crypto / openssl / crypto / camellia / asm / cmll-x86_64.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5 #
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
13
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
16 #
17 #                       AMD64   Core2   EM64T
18 # -evp camellia-128-ecb 16.7    21.0    22.7
19 # + over gcc 3.4.6      +25%    +5%     0%
20 #
21 # camellia-128-cbc      15.7    20.4    21.1
22 #
23 # 128-bit key setup     128     216     205     cycles/key
24 # + over gcc 3.4.6      +54%    +39%    +15%
25 #
26 # Numbers in "+" rows represent performance improvement over compiler
27 # generated code. Key setup timings are impressive on AMD and Core2
28 # thanks to 64-bit operations being covertly deployed. Improvement on
29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
31
# Command line: [flavour] [output]. When the first argument contains a dot
# it is taken to be the output file name and no flavour is in effect.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets are recognized either by flavour or by an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the perlasm translator next to this script first, then in
# ../../perlasm relative to it. $dir is explicitly empty when $0 carries
# no directory part, instead of relying on a possibly stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Paths are quoted so that directories with spaces survive the shell, and a
# failure to start the translator is reported instead of silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
45
# hi(REG): rewrite the first "%eax"/"%rax"-style name (a, b, c or d register)
# found in REG to its high-byte form, e.g. "%eax" -> "%ah". Names that do not
# match are returned unchanged. The replacement uses $1 rather than the
# deprecated \1, and the misleading empty prototype is dropped (existing
# &hi(...) calls ignore prototypes, so they are unaffected).
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%$1h/;    $r; }
# lo(REG): rewrite a register name to its low-byte form:
#   %eax/%rax ... %edx/%rdx -> %al ... %dl
#   %esi/%rsi, %edi/%rdi    -> %sil, %dil
#   %rN / %rNd              -> %rNb
# The three substitutions are tried in that order on the same string.
# The replacements use $1 rather than the deprecated \1, and the misleading
# empty prototype is dropped (existing &lo(...) calls ignore prototypes).
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%$1l/;
                      $r =~ s/%[er]([sd]i)/%$1l/;
                      $r =~ s/%(r[0-9]+)[d]?/%$1b/;   $r; }
50
# Register allocation shared by all code generators in this file.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");   # 32-bit temporaries
@S=map { "%r${_}d" } (8..11);                      # four 32-bit state words
($i0,$i1)=("%esi","%edi");                         # table index registers
$Tbl="%rbp";                                       # size optimization
($inp,$out,$key,$keyend)=map { "%r$_" } (12..15);
# 32-bit register holding the first argument of the V1.x entry points;
# which one it is depends on $win64.
$arg0d=$win64 ? "%ecx" : "%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from the start of the combined table.
$SBOX1_1110=0;                  # Camellia_SBOX[0]
$SBOX4_4404=$SBOX1_1110+4;      # Camellia_SBOX[1]
$SBOX2_0222=2048;               # Camellia_SBOX[2]
$SBOX3_3033=$SBOX2_0222+4;      # Camellia_SBOX[3]
69
# Camellia_Feistel(I [,SEED])
#
# Append the assembly for one Feistel step to $code.
#
#   I     step index. Even steps read @S[0],@S[1] and update @S[2],@S[3];
#         odd steps use the two pairs the other way round.
#   SEED  byte offset (default 0) added to the address of the key pair
#         that is prefetched for the next step. A negative SEED selects a
#         stride of -8 instead of 8, i.e. the key table is walked backwards.
#
# On entry the generated code expects the current key pair in $t1/$t0; on
# exit $t1/$t0 hold the pair loaded from SEED+(I+1)*stride relative to $key.
# The back-quoted expressions are left in the text as is; they are
# presumably evaluated when $code is post-processed (not visible here).
#
# All four state aliases are declared lexical; a single "my" in front of a
# comma-separated chain of assignments would cover only the first of them
# and leave the others as package globals.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=@S[($j)%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
        xor     $s0,$t0                         # t0^=key[0]
        xor     $s1,$t1                         # t1^=key[1]
        movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
        mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
        mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
        movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
        shr     \$16,$t0
        movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
        xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
        shr     \$16,$t1
        xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
        movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
        xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
        xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
        movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
        movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
        xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
        xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
        mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
        mov     `$seed+($i+1)*$scale+4`($key),$t0
        xor     $t3,$t2                         # t2^=t3
        ror     \$8,$t3                         # t3=RightRotate(t3,8)
        xor     $t2,$s2
        xor     $t2,$s3
        xor     $t3,$s3
___
}
107
# Start of the generated assembly text. Note that $code is assigned here
# (not appended to), so everything emitted below begins with ".text".
#
# The heredoc emits:
#   - Camellia_EncryptBlock (V1.x API): turns keyBitLength in $arg0d into a
#     grand-round count (3 for 128, 4 otherwise) and jumps to .Lenc_rounds;
#   - Camellia_EncryptBlock_Rounds (V2): saves %rbx,%rbp,%r13-%r15, reads
#     grandRounds from %edi, the input block from %rsi, the key table from
#     %rdx and the output pointer from %rcx, byte-swaps the four words,
#     calls _x86_64_Camellia_encrypt and stores the byte-swapped result;
#   - the entry of _x86_64_Camellia_encrypt up to the top of .Leloop.
108 # void Camellia_EncryptBlock_Rounds(
109 #               int grandRounds,
110 #               const Byte plaintext[],
111 #               const KEY_TABLE_TYPE keyTable,
112 #               Byte ciphertext[])
113 $code=<<___;
114 .text
115
116 # V1.x API
117 .globl  Camellia_EncryptBlock
118 .type   Camellia_EncryptBlock,\@abi-omnipotent
119 .align  16
120 Camellia_EncryptBlock:
121         movl    \$128,%eax
122         subl    $arg0d,%eax
123         movl    \$3,$arg0d
124         adcl    \$0,$arg0d      # keyBitLength==128?3:4
125         jmp     .Lenc_rounds
126 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
127 # V2
128 .globl  Camellia_EncryptBlock_Rounds
129 .type   Camellia_EncryptBlock_Rounds,\@function,4
130 .align  16
131 .Lenc_rounds:
132 Camellia_EncryptBlock_Rounds:
133         push    %rbx
134         push    %rbp
135         push    %r13
136         push    %r14
137         push    %r15
138 .Lenc_prologue:
139
140         #mov    %rsi,$inp               # put away arguments
141         mov     %rcx,$out
142         mov     %rdx,$key
143
144         shl     \$6,%edi                # process grandRounds
145         lea     .LCamellia_SBOX(%rip),$Tbl
146         lea     ($key,%rdi),$keyend
147
148         mov     0(%rsi),@S[0]           # load plaintext
149         mov     4(%rsi),@S[1]
150         mov     8(%rsi),@S[2]
151         bswap   @S[0]
152         mov     12(%rsi),@S[3]
153         bswap   @S[1]
154         bswap   @S[2]
155         bswap   @S[3]
156
157         call    _x86_64_Camellia_encrypt
158
159         bswap   @S[0]
160         bswap   @S[1]
161         bswap   @S[2]
162         mov     @S[0],0($out)
163         bswap   @S[3]
164         mov     @S[1],4($out)
165         mov     @S[2],8($out)
166         mov     @S[3],12($out)
167
168         mov     0(%rsp),%r15
169         mov     8(%rsp),%r14
170         mov     16(%rsp),%r13
171         mov     24(%rsp),%rbp
172         mov     32(%rsp),%rbx
173         lea     40(%rsp),%rsp
174 .Lenc_epilogue:
175         ret
176 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
177
178 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
179 .align  16
180 _x86_64_Camellia_encrypt:
181         xor     0($key),@S[1]
182         xor     4($key),@S[0]           # ^=key[0-3]
183         xor     8($key),@S[3]
184         xor     12($key),@S[2]
185 .align  16
186 .Leloop:
187         mov     16($key),$t1            # prefetch key[4-5]
188         mov     20($key),$t0
189
190 ___
# Body of .Leloop: six unrolled Feistel steps. With seed 16, step $i
# prefetches its next key pair from 16+($i+1)*8 relative to $key.
191         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Remainder of _x86_64_Camellia_encrypt followed by the decryption entry
# points. After the six Feistel steps the emitted code advances $key by 64
# bytes and leaves through .Ledone once $key equals $keyend; otherwise it
# applies the and/or/rotate-by-1 mixing layer described by the inline
# comments and loops back to .Leloop.
#
# Camellia_DecryptBlock / Camellia_DecryptBlock_Rounds mirror the
# encryption wrappers, except that $keyend is set to the start of the key
# table and $key to keyTable+grandRounds*64, so _x86_64_Camellia_decrypt
# walks the table backwards.
192 $code.=<<___;
193         lea     16*4($key),$key
194         cmp     $keyend,$key
195         mov     8($key),$t3             # prefetch key[2-3]
196         mov     12($key),$t2
197         je      .Ledone
198
199         and     @S[0],$t0
200         or      @S[3],$t3
201         rol     \$1,$t0
202         xor     $t3,@S[2]               # s2^=s3|key[3];
203         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
204         and     @S[2],$t2
205         or      @S[1],$t1
206         rol     \$1,$t2
207         xor     $t1,@S[0]               # s0^=s1|key[1];
208         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
209         jmp     .Leloop
210
211 .align  16
212 .Ledone:
213         xor     @S[2],$t0               # SwapHalf
214         xor     @S[3],$t1
215         xor     @S[0],$t2
216         xor     @S[1],$t3
217
218         mov     $t0,@S[0]
219         mov     $t1,@S[1]
220         mov     $t2,@S[2]
221         mov     $t3,@S[3]
222
223         .byte   0xf3,0xc3               # rep ret
224 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
225
226 # V1.x API
227 .globl  Camellia_DecryptBlock
228 .type   Camellia_DecryptBlock,\@abi-omnipotent
229 .align  16
230 Camellia_DecryptBlock:
231         movl    \$128,%eax
232         subl    $arg0d,%eax
233         movl    \$3,$arg0d
234         adcl    \$0,$arg0d      # keyBitLength==128?3:4
235         jmp     .Ldec_rounds
236 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
237 # V2
238 .globl  Camellia_DecryptBlock_Rounds
239 .type   Camellia_DecryptBlock_Rounds,\@function,4
240 .align  16
241 .Ldec_rounds:
242 Camellia_DecryptBlock_Rounds:
243         push    %rbx
244         push    %rbp
245         push    %r13
246         push    %r14
247         push    %r15
248 .Ldec_prologue:
249
250         #mov    %rsi,$inp               # put away arguments
251         mov     %rcx,$out
252         mov     %rdx,$keyend
253
254         shl     \$6,%edi                # process grandRounds
255         lea     .LCamellia_SBOX(%rip),$Tbl
256         lea     ($keyend,%rdi),$key
257
258         mov     0(%rsi),@S[0]           # load plaintext
259         mov     4(%rsi),@S[1]
260         mov     8(%rsi),@S[2]
261         bswap   @S[0]
262         mov     12(%rsi),@S[3]
263         bswap   @S[1]
264         bswap   @S[2]
265         bswap   @S[3]
266
267         call    _x86_64_Camellia_decrypt
268
269         bswap   @S[0]
270         bswap   @S[1]
271         bswap   @S[2]
272         mov     @S[0],0($out)
273         bswap   @S[3]
274         mov     @S[1],4($out)
275         mov     @S[2],8($out)
276         mov     @S[3],12($out)
277
278         mov     0(%rsp),%r15
279         mov     8(%rsp),%r14
280         mov     16(%rsp),%r13
281         mov     24(%rsp),%rbp
282         mov     32(%rsp),%rbx
283         lea     40(%rsp),%rsp
284 .Ldec_epilogue:
285         ret
286 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
287
288 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
289 .align  16
290 _x86_64_Camellia_decrypt:
291         xor     0($key),@S[1]
292         xor     4($key),@S[0]           # ^=key[0-3]
293         xor     8($key),@S[3]
294         xor     12($key),@S[2]
295 .align  16
296 .Ldloop:
297         mov     -8($key),$t1            # prefetch key[4-5]
298         mov     -4($key),$t0
299
300 ___
# Body of .Ldloop: six unrolled Feistel steps. The negative seed makes
# Camellia_Feistel use a stride of -8, so step $i prefetches its next key
# pair from -8-($i+1)*8 relative to $key.
301         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Remainder of _x86_64_Camellia_decrypt: step $key back by 64 bytes, leave
# through .Lddone once it reaches $keyend, otherwise apply the same
# and/or/rotate-by-1 mixing layer as the encryption loop and repeat .Ldloop.
# .Lddone xors the state with the last prefetched key words and swaps the
# two halves before returning.
302 $code.=<<___;
303         lea     -16*4($key),$key
304         cmp     $keyend,$key
305         mov     0($key),$t3             # prefetch key[2-3]
306         mov     4($key),$t2
307         je      .Lddone
308
309         and     @S[0],$t0
310         or      @S[3],$t3
311         rol     \$1,$t0
312         xor     $t3,@S[2]               # s2^=s3|key[3];
313         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
314         and     @S[2],$t2
315         or      @S[1],$t1
316         rol     \$1,$t2
317         xor     $t1,@S[0]               # s0^=s1|key[1];
318         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
319
320         jmp     .Ldloop
321
322 .align  16
323 .Lddone:
324         xor     @S[2],$t2
325         xor     @S[3],$t3
326         xor     @S[0],$t0
327         xor     @S[1],$t1
328
329         mov     $t2,@S[0]               # SwapHalf
330         mov     $t3,@S[1]
331         mov     $t0,@S[2]
332         mov     $t1,@S[3]
333
334         .byte   0xf3,0xc3               # rep ret
335 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
336 ___
337
# _saveround(RND, BASE, [BIAS,] REG...)
#
# Append stores of REG... into the key table slot RND*8 (plus optional
# BIAS) relative to register BASE. A leading element that int() sees as
# non-zero is taken as BIAS. With exactly four registers the words are
# written pairwise swapped at offsets +0,+4,+8,+12; otherwise the first
# register goes to +0 and, if present, the second to +8.
sub _saveround {
my $rnd=shift;
my $base=shift;
my @regs=@_;
my $bias=0;

    $bias=shift(@regs) if (int($regs[0]));

    if (@regs==4) {
        foreach my $n (0..3) {
            my $reg=$regs[$n^1];        # 1,0,3,2: swap within each pair
            my $off=4*$n;
            $code.="        mov     $reg,`$bias+$rnd*8+$off`($base)\n";
        }
    } else {
        $code.="        mov     $regs[0],`$bias+$rnd*8+0`($base)\n";
        $code.="        mov     $regs[1],`$bias+$rnd*8+8`($base)\n" if (@regs>=2);
    }
}
354
# _loadround(RND, BASE, [BIAS,] REG [,REG])
#
# Append loads from the key table slot RND*8 (plus optional BIAS) relative
# to register BASE: offset +0 into the first register and, if a second
# register is given, offset +8 into it. A leading element that int() sees
# as non-zero is taken as BIAS.
sub _loadround {
my ($rnd,$base,@regs)=@_;
my $bias=0;

    $bias=shift(@regs) if (int($regs[0]));

    my @lines=("        mov     `$bias+$rnd*8+0`($base),$regs[0]\n");
    push(@lines,"        mov     `$bias+$rnd*8+8`($base),$regs[1]\n") if (@regs>=2);
    $code.=join("",@lines);
}
362
# shld-based 128-bit left rotate of the register pair ($hi64,$lo64) by
# $count bits, using %r11 as scratch. Nothing is emitted for a zero count.
# shld is very slow on the Intel EM64T family, and even on AMD it limits
# the instruction decode rate [because it's VectorPath] and consequently
# performance...
sub __rotl128 {
my ($hi64,$lo64,$count)=@_;

    $code.=<<___ if ($count);
        mov     $hi64,%r11
        shld    \$$count,$lo64,$hi64
        shld    \$$count,%r11,$lo64
___
}
377
# ... Implementing the 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore the preferred variant.
#
# Rotates the register pair ($hi64,$lo64) left by $count bits using
# shl/shr/or, with %r9 and %r11 as scratch registers. Nothing is emitted
# for a zero count.
sub _rotl128 {
my ($hi64,$lo64,$count)=@_;

    $code.=<<___ if ($count);
        mov     $hi64,%r11
        shl     \$$count,$hi64
        mov     $lo64,%r9
        shr     \$`64-$count`,%r9
        shr     \$`64-$count`,%r11
        or      %r9,$hi64
        shl     \$$count,$lo64
        or      %r11,$lo64
___
}
397
# Generator for Camellia_Ekeygen, the key-schedule routine.
#
# The emitted function keeps keyBitLength (%rdi) in $keyend and the
# keyTable pointer (%rdx) in $out, and reads the raw key through %rsi. It
# returns 3 in %eax on the 128-bit path and 4 on the 192/256-bit path.
#
# $step counts the Camellia_Feistel calls made so far, so that each step
# prefetches the next pair of words from .LCamellia_SIGMA (addressed via
# $key). Sub-keys are produced by rotating the 128-bit quantities named in
# the trailing comments (KL, KR, KA, KB) with _rotl128 and storing them with
# _saveround; the "a+b=c" comments track the accumulated rotation count.
# Note that _rotl128 uses %r9 and %r11 as scratch registers.
398 { my $step=0;
399
400 $code.=<<___;
401 .globl  Camellia_Ekeygen
402 .type   Camellia_Ekeygen,\@function,3
403 .align  16
404 Camellia_Ekeygen:
405         push    %rbx
406         push    %rbp
407         push    %r13
408         push    %r14
409         push    %r15
410 .Lkey_prologue:
411
412         mov     %rdi,$keyend            # put away arguments, keyBitLength
413         mov     %rdx,$out               # keyTable
414
415         mov     0(%rsi),@S[0]           # load 0-127 bits
416         mov     4(%rsi),@S[1]
417         mov     8(%rsi),@S[2]
418         mov     12(%rsi),@S[3]
419
420         bswap   @S[0]
421         bswap   @S[1]
422         bswap   @S[2]
423         bswap   @S[3]
424 ___
425         &_saveround     (0,$out,@S);    # KL<<<0
426 $code.=<<___;
427         cmp     \$128,$keyend           # check keyBitLength
428         je      .L1st128
429
430         mov     16(%rsi),@S[0]          # load 128-191 bits
431         mov     20(%rsi),@S[1]
432         cmp     \$192,$keyend
433         je      .L1st192
434         mov     24(%rsi),@S[2]          # load 192-255 bits
435         mov     28(%rsi),@S[3]
436         jmp     .L1st256
437 .L1st192:
438         mov     @S[0],@S[2]
439         mov     @S[1],@S[3]
440         not     @S[2]
441         not     @S[3]
442 .L1st256:
443         bswap   @S[0]
444         bswap   @S[1]
445         bswap   @S[2]
446         bswap   @S[3]
447 ___
448         &_saveround     (4,$out,@S);    # temp storage for KR!
449 $code.=<<___;
450         xor     0($out),@S[1]           # KR^KL
451         xor     4($out),@S[0]
452         xor     8($out),@S[3]
453         xor     12($out),@S[2]
454
455 .L1st128:
456         lea     .LCamellia_SIGMA(%rip),$key
457         lea     .LCamellia_SBOX(%rip),$Tbl
458
459         mov     0($key),$t1
460         mov     4($key),$t0
461 ___
462         &Camellia_Feistel($step++);
463         &Camellia_Feistel($step++);
464 $code.=<<___;
465         xor     0($out),@S[1]           # ^KL
466         xor     4($out),@S[0]
467         xor     8($out),@S[3]
468         xor     12($out),@S[2]
469 ___
470         &Camellia_Feistel($step++);
471         &Camellia_Feistel($step++);
472 $code.=<<___;
473         cmp     \$128,$keyend
474         jne     .L2nd256
475
476         lea     128($out),$out          # size optimization
477         shl     \$32,%r8                # @S[0]||
478         shl     \$32,%r10               # @S[2]||
479         or      %r9,%r8                 # ||@S[1]
480         or      %r11,%r10               # ||@S[3]
481 ___
482         &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
483         &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
484         &_rotl128       ("%rax","%rbx",15);
485         &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
486         &_rotl128       ("%r8","%r10",15);
487         &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
488         &_rotl128       ("%r8","%r10",15);              # 15+15=30
489         &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
490         &_rotl128       ("%rax","%rbx",30);             # 15+30=45
491         &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
492         &_rotl128       ("%r8","%r10",15);              # 30+15=45
493         &_saveround     (12,$out,-128,"%r8");           # KA<<<45
494         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
495         &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
496         &_rotl128       ("%r8","%r10",15);              # 45+15=60
497         &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
498         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
499         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
500         &_rotl128       ("%rax","%rbx",17);             # 77+17=94
501         &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
502         &_rotl128       ("%r8","%r10",34);              # 60+34=94
503         &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
504         &_rotl128       ("%rax","%rbx",17);             # 94+17=111
505         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
506         &_rotl128       ("%r8","%r10",17);              # 94+17=111
507         &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
508 $code.=<<___;
509         mov     \$3,%eax
510         jmp     .Ldone
511 .align  16
512 .L2nd256:
513 ___
514         &_saveround     (6,$out,@S);    # temp storage for KA!
515 $code.=<<___;
516         xor     `4*8+0`($out),@S[1]     # KA^KR
517         xor     `4*8+4`($out),@S[0]
518         xor     `5*8+0`($out),@S[3]
519         xor     `5*8+4`($out),@S[2]
520 ___
521         &Camellia_Feistel($step++);
522         &Camellia_Feistel($step++);
523
524         &_loadround     (0,$out,"%rax","%rbx"); # KL
525         &_loadround     (4,$out,"%rcx","%rdx"); # KR
526         &_loadround     (6,$out,"%r14","%r15"); # KA
527 $code.=<<___;
528         lea     128($out),$out          # size optimization
529         shl     \$32,%r8                # @S[0]||
530         shl     \$32,%r10               # @S[2]||
531         or      %r9,%r8                 # ||@S[1]
532         or      %r11,%r10               # ||@S[3]
533 ___
534         &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
535         &_rotl128       ("%rcx","%rdx",15);
536         &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
537         &_rotl128       ("%r14","%r15",15);
538         &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
539         &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
540         &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
541         &_rotl128       ("%r8","%r10",30);
542         &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
543         &_rotl128       ("%rax","%rbx",45);
544         &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
545         &_rotl128       ("%r14","%r15",30);             # 15+30=45
546         &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
547         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
548         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
549         &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
550         &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
551         &_rotl128       ("%r8","%r10",30);              # 30+30=60
552         &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
553         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
554         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
555         &_rotl128       ("%r14","%r15",32);             # 45+32=77
556         &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
557         &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
558         &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
559         &_rotl128       ("%r14","%r15",17);             # 77+17=94
560         &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
561         &_rotl128       ("%rax","%rbx",34);             # 77+34=111
562         &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
563         &_rotl128       ("%r8","%r10",51);              # 60+51=111
564         &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
565 $code.=<<___;
566         mov     \$4,%eax
567 .Ldone:
568         mov     0(%rsp),%r15
569         mov     8(%rsp),%r14
570         mov     16(%rsp),%r13
571         mov     24(%rsp),%rbp
572         mov     32(%rsp),%rbx
573         lea     40(%rsp),%rsp
574 .Lkey_epilogue:
575         ret
576 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
577 ___
578 }
579
# Base 256-entry byte substitution table. The S1110/S4404/S0222/S3033
# helpers below index it (rotating either the index or the looked-up byte
# by one bit where needed) to build the 32-bit words emitted under
# .LCamellia_SBOX.
580 @SBOX=(
581 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
582  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
583 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
584 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
585 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
586 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
587  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
588 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
589 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
590  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
591 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
592  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
593 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
594 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
595 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
596  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
597
# S1110(IDX): table word with the S-box byte replicated into the three
# upper byte lanes and zero in the lowest one, formatted as "0x%08x".
sub S1110 {
    my $v=$SBOX[$_[0]];
    return sprintf("0x%08x",($v<<24)|($v<<16)|($v<<8));
}
# S4404(IDX): look up the S-box at IDX rotated left by one bit (8-bit
# rotate) and place the byte in lanes 3, 2 and 0, formatted as "0x%08x".
sub S4404 {
    my $idx=(($_[0]<<1)|($_[0]>>7))&0xff;
    my $v=$SBOX[$idx];
    return sprintf("0x%08x",($v<<24)|($v<<16)|$v);
}
# S0222(IDX): S-box byte rotated left by one bit (8-bit rotate), placed in
# the three lower byte lanes with zero in the top one, as "0x%08x".
sub S0222 {
    my $v=$SBOX[$_[0]];
    $v=(($v<<1)|($v>>7))&0xff;
    return sprintf("0x%08x",($v<<16)|($v<<8)|$v);
}
# S3033(IDX): S-box byte rotated right by one bit (8-bit rotate), placed in
# lanes 3, 1 and 0 with zero in lane 2, formatted as "0x%08x".
sub S3033 {
    my $v=$SBOX[$_[0]];
    $v=(($v>>1)|($v<<7))&0xff;
    return sprintf("0x%08x",($v<<24)|($v<<8)|$v);
}
602
# Constant table used by Camellia_Ekeygen: the Camellia_Feistel steps there
# read their key words from it through $key, two 32-bit words per step.
# The trailing row of zeros is what the sixth step's "prefetch key[i+1]"
# load reads (offsets 48 and 52) - presumably padding so that this prefetch
# stays inside the table. The .LCamellia_SBOX label is placed directly
# behind it; the table contents are appended by the loops that follow.
603 $code.=<<___;
604 .align  64
605 .LCamellia_SIGMA:
606 .long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
607 .long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
608 .long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
609 .long   0,          0,          0,          0
610 .LCamellia_SBOX:
611 ___
# Tables are interleaved, remember? data_word(WORD...) appends a single
# ".long" directive listing its arguments, comma-separated, to $code.
sub data_word {
    my $row=join(',',@_);
    $code.=".long\t$row\n";
}
# First half of .LCamellia_SBOX: 256 rows, each holding the S1110 and
# S4404 words for the same index side by side.
$i=0;
while ($i<256) { data_word(S1110($i),S4404($i)); $i++; }
# Second half: 256 rows pairing the S0222 and S3033 words.
$i=0;
while ($i<256) { data_word(S0222($i),S3033($i)); $i++; }
616
# Generator for Camellia_cbc_encrypt.
#
# The emitted function returns immediately when the length (%rdx) is zero.
# Otherwise it saves the callee-saved registers, carves out a 64-byte
# aligned stack frame positioned relative to the key schedule, touches the
# whole 4096-byte S-box table, and then branches on the enc argument (%r9d)
# to either the encryption loop (.Lcbc_eloop) or .LCBC_DECRYPT. A trailing
# partial block is moved with "rep movsb" through the $ivec scratch area in
# both directions.
617 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
618 #                       size_t length, const CAMELLIA_KEY *key,
619 #                       unsigned char *ivp,const int enc);
620 {
# Stack frame slots, addressed relative to the realigned %rsp.
621 $_key="0(%rsp)";        # key pointer to "rewind" to after every block
622 $_end="8(%rsp)";        # inp+len&~15
623 $_res="16(%rsp)";       # len&15
624 $ivec="24(%rsp)";       # 16-byte scratch for the IV / partial block
625 $_ivp="40(%rsp)";       # caller's ivp pointer (%r8 on entry)
626 $_rsp="48(%rsp)";       # stack pointer as it was right after the pushes
627
628 $code.=<<___;
629 .globl  Camellia_cbc_encrypt
630 .type   Camellia_cbc_encrypt,\@function,6
631 .align  16
632 Camellia_cbc_encrypt:
633         cmp     \$0,%rdx
634         je      .Lcbc_abort
635         push    %rbx
636         push    %rbp
637         push    %r12
638         push    %r13
639         push    %r14
640         push    %r15
641 .Lcbc_prologue:
642
643         mov     %rsp,%rbp
644         sub     \$64,%rsp
645         and     \$-64,%rsp
646
647         # place stack frame just "above mod 1024" the key schedule,
648         # this ensures that cache associativity suffices
649         lea     -64-63(%rcx),%r10
650         sub     %rsp,%r10
651         neg     %r10
652         and     \$0x3C0,%r10
653         sub     %r10,%rsp
654         #add    \$8,%rsp                # 8 is reserved for callee's ra
655
656         mov     %rdi,$inp               # inp argument
657         mov     %rsi,$out               # out argument
658         mov     %r8,%rbx                # ivp argument
659         mov     %rcx,$key               # key argument
660         mov     272(%rcx),${keyend}d    # grandRounds
661
662         mov     %r8,$_ivp
663         mov     %rbp,$_rsp
664
665 .Lcbc_body:
666         lea     .LCamellia_SBOX(%rip),$Tbl
667
668         mov     \$32,%ecx
669 .align  4
670 .Lcbc_prefetch_sbox:
671         mov     0($Tbl),%rax
672         mov     32($Tbl),%rsi
673         mov     64($Tbl),%rdi
674         mov     96($Tbl),%r11
675         lea     128($Tbl),$Tbl
676         loop    .Lcbc_prefetch_sbox
677         sub     \$4096,$Tbl
678         shl     \$6,$keyend
679         mov     %rdx,%rcx               # len argument
680         lea     ($key,$keyend),$keyend
681
682         cmp     \$0,%r9d                # enc argument
683         je      .LCBC_DECRYPT
684
685         and     \$-16,%rdx
686         and     \$15,%rcx               # length residue
687         lea     ($inp,%rdx),%rdx
688         mov     $key,$_key
689         mov     %rdx,$_end
690         mov     %rcx,$_res
691
692         cmp     $inp,%rdx
693         mov     0(%rbx),@S[0]           # load IV
694         mov     4(%rbx),@S[1]
695         mov     8(%rbx),@S[2]
696         mov     12(%rbx),@S[3]
697         je      .Lcbc_enc_tail
698         jmp     .Lcbc_eloop
699
700 .align  16
701 .Lcbc_eloop:
702         xor     0($inp),@S[0]
703         xor     4($inp),@S[1]
704         xor     8($inp),@S[2]
705         bswap   @S[0]
706         xor     12($inp),@S[3]
707         bswap   @S[1]
708         bswap   @S[2]
709         bswap   @S[3]
710
711         call    _x86_64_Camellia_encrypt
712
713         mov     $_key,$key              # "rewind" the key
714         bswap   @S[0]
715         mov     $_end,%rdx
716         bswap   @S[1]
717         mov     $_res,%rcx
718         bswap   @S[2]
719         mov     @S[0],0($out)
720         bswap   @S[3]
721         mov     @S[1],4($out)
722         mov     @S[2],8($out)
723         lea     16($inp),$inp
724         mov     @S[3],12($out)
725         cmp     %rdx,$inp
726         lea     16($out),$out
727         jne     .Lcbc_eloop
728
729         cmp     \$0,%rcx
730         jne     .Lcbc_enc_tail
731
732         mov     $_ivp,$out
733         mov     @S[0],0($out)           # write out IV residue
734         mov     @S[1],4($out)
735         mov     @S[2],8($out)
736         mov     @S[3],12($out)
737         jmp     .Lcbc_done
738
739 .align  16
740 .Lcbc_enc_tail:
741         xor     %rax,%rax
742         mov     %rax,0+$ivec
743         mov     %rax,8+$ivec
744         mov     %rax,$_res
745
746 .Lcbc_enc_pushf:
747         pushfq
748         cld
749         mov     $inp,%rsi
750         lea     8+$ivec,%rdi
751         .long   0x9066A4F3              # rep movsb
752         popfq
753 .Lcbc_enc_popf:
754
755         lea     $ivec,$inp
756         lea     16+$ivec,%rax
757         mov     %rax,$_end
758         jmp     .Lcbc_eloop             # one more time
759
760 .align  16
761 .LCBC_DECRYPT:
762         xchg    $key,$keyend
763         add     \$15,%rdx
764         and     \$15,%rcx               # length residue
765         and     \$-16,%rdx
766         mov     $key,$_key
767         lea     ($inp,%rdx),%rdx
768         mov     %rdx,$_end
769         mov     %rcx,$_res
770
771         mov     (%rbx),%rax             # load IV
772         mov     8(%rbx),%rbx
773         jmp     .Lcbc_dloop
774 .align  16
775 .Lcbc_dloop:
776         mov     0($inp),@S[0]
777         mov     4($inp),@S[1]
778         mov     8($inp),@S[2]
779         bswap   @S[0]
780         mov     12($inp),@S[3]
781         bswap   @S[1]
782         mov     %rax,0+$ivec            # save IV to temporary storage
783         bswap   @S[2]
784         mov     %rbx,8+$ivec
785         bswap   @S[3]
786
787         call    _x86_64_Camellia_decrypt
788
789         mov     $_key,$key              # "rewind" the key
790         mov     $_end,%rdx
791         mov     $_res,%rcx
792
793         bswap   @S[0]
794         mov     ($inp),%rax             # load IV for next iteration
795         bswap   @S[1]
796         mov     8($inp),%rbx
797         bswap   @S[2]
798         xor     0+$ivec,@S[0]
799         bswap   @S[3]
800         xor     4+$ivec,@S[1]
801         xor     8+$ivec,@S[2]
802         lea     16($inp),$inp
803         xor     12+$ivec,@S[3]
804         cmp     %rdx,$inp
805         je      .Lcbc_ddone
806
807         mov     @S[0],0($out)
808         mov     @S[1],4($out)
809         mov     @S[2],8($out)
810         mov     @S[3],12($out)
811
812         lea     16($out),$out
813         jmp     .Lcbc_dloop
814
815 .align  16
816 .Lcbc_ddone:
817         mov     $_ivp,%rdx
818         cmp     \$0,%rcx
819         jne     .Lcbc_dec_tail
820
821         mov     @S[0],0($out)
822         mov     @S[1],4($out)
823         mov     @S[2],8($out)
824         mov     @S[3],12($out)
825
826         mov     %rax,(%rdx)             # write out IV residue
827         mov     %rbx,8(%rdx)
828         jmp     .Lcbc_done
829 .align  16
830 .Lcbc_dec_tail:
831         mov     @S[0],0+$ivec
832         mov     @S[1],4+$ivec
833         mov     @S[2],8+$ivec
834         mov     @S[3],12+$ivec
835
836 .Lcbc_dec_pushf:
837         pushfq
838         cld
839         lea     8+$ivec,%rsi
840         lea     ($out),%rdi
841         .long   0x9066A4F3              # rep movsb
842         popfq
843 .Lcbc_dec_popf:
844
845         mov     %rax,(%rdx)             # write out IV residue
846         mov     %rbx,8(%rdx)
847         jmp     .Lcbc_done
848
849 .align  16
850 .Lcbc_done:
851         mov     $_rsp,%rcx
852         mov     0(%rcx),%r15
853         mov     8(%rcx),%r14
854         mov     16(%rcx),%r13
855         mov     24(%rcx),%r12
856         mov     32(%rcx),%rbp
857         mov     40(%rcx),%rbx
858         lea     48(%rcx),%rsp
859 .Lcbc_abort:
860         ret
861 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
862
863 .asciz  "Camellia for x86_64 by <appro\@openssl.org>"
864 ___
865 }
866
867 # Win64 only: emit the SEH handlers and the .pdata/.xdata unwind tables that reference them. Handler prototype:
868 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,CONTEXT *context,DISPATCHER_CONTEXT *disp)
869 if ($win64) {   # $win64 is set at the top of the file for nasm/masm/mingw64 flavours or *.asm output
870 $rec="%rcx";    # handler argument 1 (EXCEPTION_RECORD *rec); not referenced by the assembly below
871 $frame="%rdx";  # handler argument 2 (ULONG64 frame); not referenced by the assembly below
872 $context="%r8"; # handler argument 3 (CONTEXT *context); saved registers are written back into it
873 $disp="%r9";    # handler argument 4 (DISPATCHER_CONTEXT *disp); source of ImageBase/HandlerData/ContextRecord
874
875 $code.=<<___;   # append common_se_handler, cbc_se_handler and the .pdata/.xdata records to the generated assembly
876 .extern __imp_RtlVirtualUnwind
877 .type   common_se_handler,\@abi-omnipotent
878 .align  16
879 common_se_handler:
880         push    %rsi
881         push    %rdi
882         push    %rbx
883         push    %rbp
884         push    %r12
885         push    %r13
886         push    %r14
887         push    %r15
888         pushfq
889         lea     -64(%rsp),%rsp
890
891         mov     120($context),%rax      # pull context->Rax
892         mov     248($context),%rbx      # pull context->Rip
893
894         mov     8($disp),%rsi           # disp->ImageBase
895         mov     56($disp),%r11          # disp->HandlerData
896
897         mov     0(%r11),%r10d           # HandlerData[0]
898         lea     (%rsi,%r10),%r10        # prologue label
899         cmp     %r10,%rbx               # context->Rip<prologue label
900         jb      .Lin_prologue
901
902         mov     152($context),%rax      # pull context->Rsp
903
904         mov     4(%r11),%r10d           # HandlerData[1]
905         lea     (%rsi,%r10),%r10        # epilogue label
906         cmp     %r10,%rbx               # context->Rip>=epilogue label
907         jae     .Lin_prologue
908
909         lea     40(%rax),%rax
910         mov     -8(%rax),%rbx
911         mov     -16(%rax),%rbp
912         mov     -24(%rax),%r13
913         mov     -32(%rax),%r14
914         mov     -40(%rax),%r15
915         mov     %rbx,144($context)      # restore context->Rbx
916         mov     %rbp,160($context)      # restore context->Rbp
917         mov     %r13,224($context)      # restore context->R13
918         mov     %r14,232($context)      # restore context->R14
919         mov     %r15,240($context)      # restore context->R15
920
921 .Lin_prologue:
922         mov     8(%rax),%rdi
923         mov     16(%rax),%rsi
924         mov     %rax,152($context)      # restore context->Rsp
925         mov     %rsi,168($context)      # restore context->Rsi
926         mov     %rdi,176($context)      # restore context->Rdi
927
928         jmp     .Lcommon_seh_exit
929 .size   common_se_handler,.-common_se_handler
930
931 .type   cbc_se_handler,\@abi-omnipotent
932 .align  16
933 cbc_se_handler:
934         push    %rsi
935         push    %rdi
936         push    %rbx
937         push    %rbp
938         push    %r12
939         push    %r13
940         push    %r14
941         push    %r15
942         pushfq
943         lea     -64(%rsp),%rsp
944
945         mov     120($context),%rax      # pull context->Rax
946         mov     248($context),%rbx      # pull context->Rip
947
948         lea     .Lcbc_prologue(%rip),%r10
949         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
950         jb      .Lin_cbc_prologue
951
952         lea     .Lcbc_body(%rip),%r10
953         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
954         jb      .Lin_cbc_frame_setup
955
956         mov     152($context),%rax      # pull context->Rsp
957
958         lea     .Lcbc_abort(%rip),%r10
959         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
960         jae     .Lin_cbc_prologue
961
962         # handle pushf/popf in Camellia_cbc_encrypt
963         lea     .Lcbc_enc_pushf(%rip),%r10
964         cmp     %r10,%rbx               # context->Rip<=.Lcbc_enc_pushf
965         jbe     .Lin_cbc_no_flag
966         lea     8(%rax),%rax
967         lea     .Lcbc_enc_popf(%rip),%r10
968         cmp     %r10,%rbx               # context->Rip<.Lcbc_enc_popf
969         jb      .Lin_cbc_no_flag
970         lea     -8(%rax),%rax
971         lea     .Lcbc_dec_pushf(%rip),%r10
972         cmp     %r10,%rbx               # context->Rip<=.Lcbc_dec_pushf
973         jbe     .Lin_cbc_no_flag
974         lea     8(%rax),%rax
975         lea     .Lcbc_dec_popf(%rip),%r10
976         cmp     %r10,%rbx               # context->Rip<.Lcbc_dec_popf
977         jb      .Lin_cbc_no_flag
978         lea     -8(%rax),%rax
979
980 .Lin_cbc_no_flag:
981         mov     48(%rax),%rax           # $_rsp
982         lea     48(%rax),%rax
983
984 .Lin_cbc_frame_setup:
985         mov     -8(%rax),%rbx
986         mov     -16(%rax),%rbp
987         mov     -24(%rax),%r12
988         mov     -32(%rax),%r13
989         mov     -40(%rax),%r14
990         mov     -48(%rax),%r15
991         mov     %rbx,144($context)      # restore context->Rbx
992         mov     %rbp,160($context)      # restore context->Rbp
993         mov     %r12,216($context)      # restore context->R12
994         mov     %r13,224($context)      # restore context->R13
995         mov     %r14,232($context)      # restore context->R14
996         mov     %r15,240($context)      # restore context->R15
997
998 .Lin_cbc_prologue:
999         mov     8(%rax),%rdi
1000         mov     16(%rax),%rsi
1001         mov     %rax,152($context)      # restore context->Rsp
1002         mov     %rsi,168($context)      # restore context->Rsi
1003         mov     %rdi,176($context)      # restore context->Rdi
1004
1005 .align  4
1006 .Lcommon_seh_exit:
1007
1008         mov     40($disp),%rdi          # disp->ContextRecord
1009         mov     $context,%rsi           # context
1010         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
1011         .long   0xa548f3fc              # cld; rep movsq
1012
1013         mov     $disp,%rsi
1014         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1015         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1016         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1017         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1018         mov     40(%rsi),%r10           # disp->ContextRecord
1019         lea     56(%rsi),%r11           # &disp->HandlerData
1020         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1021         mov     %r10,32(%rsp)           # arg5
1022         mov     %r11,40(%rsp)           # arg6
1023         mov     %r12,48(%rsp)           # arg7
1024         mov     %rcx,56(%rsp)           # arg8, (NULL)
1025         call    *__imp_RtlVirtualUnwind(%rip)
1026
1027         mov     \$1,%eax                # ExceptionContinueSearch
1028         lea     64(%rsp),%rsp
1029         popfq
1030         pop     %r15
1031         pop     %r14
1032         pop     %r13
1033         pop     %r12
1034         pop     %rbp
1035         pop     %rbx
1036         pop     %rdi
1037         pop     %rsi
1038         ret
1039 .size   cbc_se_handler,.-cbc_se_handler
1040
1041 .section        .pdata
1042 .align  4
1043         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1044         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1045         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1046
1047         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1048         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1049         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1050
1051         .rva    .LSEH_begin_Camellia_Ekeygen
1052         .rva    .LSEH_end_Camellia_Ekeygen
1053         .rva    .LSEH_info_Camellia_Ekeygen
1054
1055         .rva    .LSEH_begin_Camellia_cbc_encrypt
1056         .rva    .LSEH_end_Camellia_cbc_encrypt
1057         .rva    .LSEH_info_Camellia_cbc_encrypt
1058
1059 .section        .xdata
1060 .align  8
1061 .LSEH_info_Camellia_EncryptBlock_Rounds:
1062         .byte   9,0,0,0
1063         .rva    common_se_handler
1064         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1065 .LSEH_info_Camellia_DecryptBlock_Rounds:
1066         .byte   9,0,0,0
1067         .rva    common_se_handler
1068         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1069 .LSEH_info_Camellia_Ekeygen:
1070         .byte   9,0,0,0
1071         .rva    common_se_handler
1072         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1073 .LSEH_info_Camellia_cbc_encrypt:
1074         .byte   9,0,0,0
1075         .rva    cbc_se_handler
1076 ___
1077 }       # end of the Win64-only SEH section
1078
1079 $code =~ s/\`([^\`]*)\`/eval $1/gem;    # replace each `...` expression in the assembly text (e.g. `1232/8`) with its evaluated value
1080 print $code;                            # STDOUT was aliased to the x86_64-xlate.pl pipe at the top of the file
1081 close STDOUT or die "error closing STDOUT: $! (child status $?)";       # closing a pipe fails on write errors or a non-zero translator exit; do not let a broken build pass silently