1. Advanced Vector eXtensions (AVX)

2. 256 bit AVX Vector Registers

• Intel's extension of SSE to 256 bits / 32 bytes, with a 3-operand instruction format
• Therefore, can operate on 4 (8) double (single) precision values at a time
• Each vector reg contains 4 (8) 64 (32) bit values
• AVX uses 3 (4) operand assembly:
  → Intel : dest, src1, src2 [,imm8]
  → gcc   : [imm8,] src2, src1, dest
• Register file, not stack
• All ymm registers are caller-saved
• ST(0) is still the return value for funcs if x8632:
  → must write result to memory, then flds/fldl
• Can read/write only X0 directly to scalar
  → must shuffle vectors around, and reduce to scalar in X0
• Lower 128 bits of the ymm regs are the xmm (SSE) regs
• Not aliased with ST(x)
  → so VPU use can overlap with the x87 FPU
• SIMD history:
  1. MMX (64-bit int ops aliased to x87 regs)
  2. 3DNow! (AMD floats, non-IEEE)
  3. SSE/SSE1 : SIMD for floats
  4. SSE2 : SIMD for doubles (and integers)
  5. SSE3 : SIMD for complex & cleanup
  6. SSE4[.1,.2,a] : competing standards; not covered
  7. SSE5 : AMD effort not yet implemented
  8. AVX : planned to have FMAC, 1st version does not

[Register diagram: YMM vector register ymm[0-15], bits 255..0.
 As double prec fp words: four 64-bit values X3 | X2 | X1 | X0 (bit boundaries 255-192, 191-128, 127-64, 63-0).
 As single prec fp words: eight 32-bit values X7 | X6 | X5 | X4 | X3 | X2 | X1 | X0 (one every 32 bits).]
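To make the gcc/gas operand order concrete, here is a minimal sketch (not part of the lecture code) of a hypothetical routine vadd4 that adds two 4-double vectors; the name, the C prototype and the choice of unaligned loads are assumptions for illustration only.

/* Hedged sketch, not lecture code.  Assumed prototype:
 *    void vadd4(const double *x, const double *y, double *z);
 * x/y/z arrive in %rdi/%rsi/%rdx under the x86-64 SysV ABI.
 */
        .text
        .globl  vadd4
vadd4:
        vmovupd (%rdi), %ymm0            /* ymm0 = x[0:3] (unaligned load) */
        vaddpd  (%rsi), %ymm0, %ymm1     /* gas order: src2 (memory), src1, dest */
        vmovupd %ymm1, (%rdx)            /* z[0:3] = x[0:3] + y[0:3] */
        ret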

3. Common AVX 16 byte (128 bit) Data Movement Instructions

• Only way to communicate between upper and lower 128-bit words!

Mnemonic        Operands                  Action
vbroadcastf128  m16, rd                   rd[0] = rd[1] = ms[0]
vextractf128    0/1, rs, m16r16d          0: m16r16d[0] = rs[0]
                                          1: m16r16d[0] = rs[1]
vinsertf128     0/1, m16r16s2, rs1, rd    rd[:] = rs1[:];
                                          0: rd[0] = m16r16s2[0]
                                          1: rd[1] = m16r16s2[0]
vperm2f128      imm8, m32rs2, rs1, rd     imm8 = 0dcc:0baa; lane values 0,1 select from s1, 2,3 from s2;
                                          rd[0] = s[aa]; rd[1] = s[cc];
                                          if (b) rd[0] = 0; if (d) rd[1] = 0

4. Common AVX Double Precision Data Movement Instructions

Mnemonic      Operands               Action
vmovapd       m32rs, rd              rd[0:3] = m32rs[0:3]
vmovapd       rs, m32rd              m32rd[0:3] = rs[0:3]
vmovupd       mrs, rd                rd[0:3] = mrs[0:3]
vmovupd       rs, mrd                mrd[0:3] = rs[0:3]
vmovsd        m8s, r16d              rd[0] = ms[0]; rd[1:3] = 0.0
vmovsd        r16s, m8rd             m8rd[0] = rs[0]
vmovsd        r16s2, r16s1, r16d     rd[0] = s2[0]; rd[1] = s1[1]; rd[2:3] = 0
vbroadcastsd  m8, rd                 rd[0:3] = ms[0]
vmovddup      m32rs, rd              rd[0:1] = s[0]; rd[2:3] = s[2]
vmaskmovpd    m32rs2, rs1, rd        rd[i] = (rs1[64*i-1 b]) ? m32rs2[i] : 0.0;
                                     store 0.0 or mrs2 wrd based on most sig bit of rs1 wrd
vmaskmovpd    rs2, rs1, m32          m32[i] = (rs1[64*i-1 b]) ? rs2[i] : (no store)

• mX  : X-byte aligned memory address
• r16 : 16-byte register, i.e. xmm instead of ymm
• [#b]: select bit # from register

5. Common AVX Double Precision Permute Instructions

Mnemonic    Operands                Action
vblendpd    imm4, m32rs2, rs1, rd   imm4 = d:c:b:a (binary); rd = (bit) ? m32rs2 : rs1 ∀ word and bit
vblendvpd   rs3, m32rs2, rs1, rd    rd[i] = mask[i] ? s2[i] : s1[i];
                                    most sig bit in wrd i of rs3 makes the mask
vpermilpd   imm4, m32rs, rd         rd[0] = (a) ? s[1] : s[0]; rd[1] = (b) ? s[1] : s[0];
                                    rd[2] = (c) ? s[3] : s[2]; rd[3] = (d) ? s[3] : s[2]
vshufpd     dcba, m32rs2, rs1, rd   rd[0] = (a) ? s1[1] : s1[0]; rd[1] = (b) ? s2[1] : s2[0];
                                    rd[2] = (c) ? s1[3] : s1[2]; rd[3] = (d) ? s2[3] : s2[2]
vunpcklpd   m32rs2, rs1, rd         rd[0] = s1[0]; rd[1] = s2[0]; rd[2] = s1[2]; rd[3] = s2[2]
vunpckhpd   m32rs2, rs1, rd         rd[0] = s1[1]; rd[1] = s2[1]; rd[2] = s1[3]; rd[3] = s2[3]

6. AVX Double Precision Computational Operations

Mnemonic    Operands                Action
vandpd      m32rs2, rs1, rd         rd[0:3] = rs1[0:3] & m32rs2[0:3]
vandnpd     m32rs2, rs1, rd         rd[0:3] = (~rs1[0:3]) & m32rs2[0:3]
vorpd       m32rs2, rs1, rd         rd[0:3] = rs1[0:3] | m32rs2[0:3]
vxorpd      m32rs2, rs1, rd         rd[0:3] = rs1[0:3] ^ m32rs2[0:3]  (zero!)
vhaddpd     m32rs2, rs1, rd         rd[0] = s1[1]+s1[0]; rd[1] = s2[1]+s2[0];
                                    rd[2] = s1[3]+s1[2]; rd[3] = s2[3]+s2[2]
vaddsubpd   m32rs2, rs1, rd         rd[0] = s1[0]-s2[0]; rd[1] = s1[1]+s2[1];
                                    rd[2] = s1[2]-s2[2]; rd[3] = s1[3]+s2[3]
vaddpd      m32rs2, rs1, rd         rd[0:3] = rs1[0:3] + m32rs2[0:3]
vaddsd      mr16s2, r16s1, r16d     rd[0] = rs1[0] + mrs2[0]
vsubpd      m32rs2, rs1, rd         rd[0:3] = rs1[0:3] - m32rs2[0:3]
vsubsd      mr16s2, r16s1, r16d     rd[0] = rs1[0] - mrs2[0]
vmulpd      m32rs2, rs1, rd         rd[0:3] = rs1[0:3] * m32rs2[0:3]
vmulsd      mr16s2, r16s1, r16d     rd[0] = rs1[0] * mrs2[0]
vdivpd      m32rs2, rs1, rd         rd[0:3] = rs1[0:3] / m32rs2[0:3]
vdivsd      mr16s2, r16s1, r16d     rd[0] = rs1[0] / mrs2[0]
vmaxpd      m32rs2, rs1, rd         rd[0:3] = MAX(rs1[0:3], m32rs2[0:3])
vmaxsd      mr16s2, r16s1, r16d     rd[0] = MAX(rs1[0], mrs2[0])
vminpd      m32rs2, rs1, rd         rd[0:3] = MIN(rs1[0:3], m32rs2[0:3])
vminsd      mr16s2, r16s1, r16d     rd[0] = MIN(rs1[0], mrs2[0])
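As a worked use of these tables, the sketch below (not part of the lecture code) sums the 4 doubles of a 32-byte aligned array into the scalar return register, showing that vextractf128 is what moves data across the 128-bit lane boundary before vhaddpd finishes the reduction. The name hsum4 and the prototype double hsum4(const double *x) are assumptions.

/* Hedged sketch, not lecture code; assumed x is 32-byte aligned */
        .text
        .globl  hsum4
hsum4:
        vmovapd (%rdi), %ymm0             /* ymm0 = {x3, x2, x1, x0} */
        vextractf128 $1, %ymm0, %xmm1     /* xmm1 = {x3, x2} (upper 128 bits) */
        vaddpd  %xmm1, %xmm0, %xmm0       /* xmm0 = {x1+x3, x0+x2} */
        vhaddpd %xmm0, %xmm0, %xmm0       /* xmm0[0] = (x0+x2) + (x1+x3) */
        ret                               /* double return value in xmm0[0] */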

7. Double Precision Vector Comparisons

Mnemonic    Operands                 Action
vcmppd      imm8, m32rs2, rs1, rd    rd ← all 1s if comparison is true, else all zeros
vcmpXXpd    m32rs2, rs1, rd          rd ← all 1s if comparison is true, else all zeros
vmovmskpd   ymm, ireg                copy sign bits of ymm's doubles to low 4 bits of ireg, zero rest of ireg
vcomisd     m8r16s, r16d             set ICC as shown in the table below
                                     NOTE: compares low scalar only!

imm8   COMP            XX       NaN
0      s1 == s2        EQ       0
1      s1 <  s2        LT       0
2      s1 <= s2        LE       0
3      isNaN(s1||s2)   UNORD    1
4      s1 != s2        NE       1
5      !(s1 <  s2)     NLT      1
6      !(s1 <= s2)     NLE      1
7      !isNaN(s1||s2)  ORDERED  0

vcomisd result:
rd[0] > mrs[0]   ZF=0, PF=0, CF=0
rd[0] < mrs[0]   ZF=0, PF=0, CF=1
rd[0] = mrs[0]   ZF=1, PF=0, CF=0
NaN              ZF=1, PF=1, CF=1

⇒ Use vmovmskpd to get the vcmppd mask into an ireg
⇒ Then use bt and test to branch

8. ZAXPY using AVX

/*                rdi/4        rsi/8
   void ATL_UAXPY(const int N, const TYPE *alpha,
                  rdx/12         rcx/16
                  const TYPE *X, const int incX,
                  r8/20    r9/24
                  TYPE *Y, const int incY)
*/
#define ralp  %ymm0
#define ralp_ %xmm0
#define ialp  %ymm1
#define ialp_ %xmm1
#define y0    %ymm2
#define y0_   %xmm2
#define x0    %ymm3
#define x0_   %xmm3
#define x1    %ymm4
#define x1_   %xmm4
#ifdef ATL_GAS_x8632
   #define N  %ebx
   #define X  %edx
   #define Y  %ecx
   #define II %eax
#else
   #define N  %rdi
   #define X  %rdx
   #define Y  %rcx
   #define II %rax
#endif
#define Y_b  %cl
#define X_b  %dl
#define II_b %al
#define PFW prefetchnta
#define PFR prefetchnta
#define PFDIST 768
.globl ATL_UAXPY
ATL_UAXPY:
#ifdef ATL_GAS_x8632
   #define FSIZE 16
   sub $FSIZE, %esp
   movl %ebx, (%esp)
   movl %esi, 4(%esp)
   movl FSIZE+4(%esp), N
   movl FSIZE+8(%esp), %esi
   movl FSIZE+12(%esp), %edx
   movl FSIZE+20(%esp), Y
   #define TMPOFF 8(%esp)
   #define rsi esi
#else
   mov %r8, Y
   #define TMPOFF -8(%rsp)
#endif
   fld1                          /* ST = {1.0} */
   fldz                          /* ST = {0.0, 1.0} */
   PFR (X)
   fsub                          /* ST = {-1.0} */
   fmull 8(%rsi)                 /* ST = {-ai} */
   fstpl TMPOFF                  /* ST = {}, store -ai to tmp */
   PFW (Y)

9. AVX ZAXPY, pt II

   vbroadcastsd 8(%rsi), ialp    /*  ai  ai  ai  ai */
   vbroadcastsd TMPOFF, ralp     /* -ai -ai -ai -ai */
   vblendpd $0x5, ralp, ialp, ialp /* ai -ai ai -ai */
   vbroadcastsd (%rsi), ralp     /*  ar  ar  ar  ar */
/*
 * If Y is not 16-byte aligned,
 * then can't make 32-byte aligned
 */
   test $0x0F, Y_b
   jnz UNALIGNED
   test $0x1F, Y_b
   jz YALIGNED                   /* jmp to Y known 32-byte */
/*
 * If we reach here, Y is 16-byte aligned,
 * so peel 1 iteration to make 32-byte
 */
   movupd (X), x0_               /* x0 = {xi, xr} */
   pshufd $0x4E, x0_, x1_        /* x1 = {xr, xi} */
   movapd (Y), y0_               /* y0 = {yi, yr} */
   mulpd ralp_, x0_              /* x0 = {ar*xi, ar*xr} */
   addpd x0_, y0_
   mulpd ialp_, x1_              /* x1 = {ai*xr, -ai*xi} */
   addpd x1_, y0_
   movapd y0_, (Y)
   add $16, X
   add $16, Y
   sub $1, N
YALIGNED:                        /* Y is known to be 32-byte aligned */
   mov N, II
   andb $0xFE, II_b              /* make II a multiple of veclen */
   sub II, N                     /* N now has how much must be cleaned up */
   shl $4, II                    /* II = N*sizeof(DCPLX) */
   lea (X, II), X                /* X += N */
   lea (Y, II), Y                /* Y += N */
   neg II                        /* II = -II */
   test $0x1F, X_b               /* if X not 32-byte aligned */
   jnz YAXULOOP                  /* jump to unaligned X loop */
YAXALOOP:
   vmovapd (X,II), x0            /* x1i x1r x0i x0r */
   vshufpd $0x5, x0, x0, x1      /* x1r x1i x0r x0i */
   vmulpd ralp, x0, x0           /* ar*x1i, ar*x1r, ar*x0i, ar*x0r */
   vaddpd (Y,II), x0, y0
   PFR PFDIST(X,II)
   vmulpd ialp, x1, x1           /* ai*x1r, -ai*x1i, ai*x0r, -ai*x0i */
   vaddpd x1, y0, y0
   PFW PFDIST(Y,II)
   vmovapd y0, (Y, II)
   add $32, II
   jnz YAXALOOP
   cmp $0, N
   jnz CLEANUP
   jmp DONE

10. AVX ZAXPY, pt III

YAXULOOP:
   vmovupd (X,II), x0            /* x1i x1r x0i x0r */
   vshufpd $0x5, x0, x0, x1      /* x1r x1i x0r x0i */
   vmulpd ralp, x0, x0           /* ar*x1i, ar*x1r, ar*x0i, ar*x0r */
   vaddpd (Y,II), x0, y0
   PFR PFDIST(X,II)
   vmulpd ialp, x1, x1           /* ai*x1r, -ai*x1i, ai*x0r, -ai*x0i */
   vaddpd x1, y0, y0
   PFW PFDIST(Y,II)
   vmovapd y0, (Y, II)
   add $32, II
   jnz YAXULOOP
   cmp $0, N
   jz DONE
   jmp CLEANUP

UNALIGNED:
   mov N, II
   andb $0xFE, II_b              /* make II a mul of veclen */
   sub II, N                     /* N has cleanup remainder */
   shl $4, II                    /* II = N*sizeof(DCPLX) */
   lea (X, II), X                /* X += N */
   lea (Y, II), Y                /* Y += N */
   neg II                        /* II = -II */
YUXULOOP:
   vmovupd (X,II), x0            /* x1i x1r x0i x0r */
   vshufpd $0x5, x0, x0, x1      /* x1r x1i x0r x0i */
   vmovupd (Y,II), y0
   vmulpd ralp, x0, x0           /* ar*x1i, ar*x1r, ar*x0i, ar*x0r */
   vaddpd x0, y0, y0
   PFR PFDIST(X,II)
   vmulpd ialp, x1, x1           /* ai*x1r, -ai*x1i, ai*x0r, -ai*x0i */
   vaddpd x1, y0, y0
   PFW PFDIST(Y,II)
   vmovupd y0, (Y, II)
   add $32, II
   jnz YUXULOOP
   cmp $0, N
   jz DONE
CLEANUP:
CULOOP:
   movupd (X), x0_               /* x0 = {xi, xr} */
   pshufd $0x4E, x0_, x1_        /* x1 = {xr, xi} */
   movupd (Y), y0_               /* y0 = {yi, yr} */
   mulpd ralp_, x0_              /* x0 = {ar*xi, ar*xr} */
   addpd x0_, y0_
   mulpd ialp_, x1_              /* x1 = {ai*xr, -ai*xi} */
   addpd x1_, y0_
   movupd y0_, (Y)
   add $16, X
   add $16, Y
   sub $1, N
   jnz CULOOP
   jmp DONE
DONE:
#ifdef ATL_GAS_x8632
   movl (%esp), %ebx
   movl 4(%esp), %esi
   add $FSIZE, %esp
#endif
   ret
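The compare-to-mask-to-branch pattern suggested under slide 7 (vcmppd → vmovmskpd → bt/test) looks like the minimal sketch below (not part of the lecture code), which tests whether any of four doubles is negative; the name any_neg4 and the prototype int any_neg4(const double *x) are assumptions for illustration.

/* Hedged sketch, not lecture code */
        .text
        .globl  any_neg4
any_neg4:
        vxorpd  %ymm1, %ymm1, %ymm1     /* ymm1 = {0.0, 0.0, 0.0, 0.0} (xor zeros, per slide 6) */
        vmovupd (%rdi), %ymm0           /* ymm0 = x[0:3] */
        vcmpltpd %ymm1, %ymm0, %ymm2    /* ymm2[i] = all 1s where x[i] < 0.0 (imm8 = 1, LT) */
        vmovmskpd %ymm2, %eax           /* sign bits -> low 4 bits of eax, rest zeroed */
        ret                             /* return value is nonzero iff some x[i] < 0.0 */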

11. Common AVX Single Data Movement Instructions

Mnemonic      Operands               Action
vmovaps       m32rs, rd              rd[0:7] = m32rs[0:7]
vmovaps       rs, m32rd              m32rd[0:7] = rs[0:7]
vmovups       mrs, rd                rd[0:7] = mrs[0:7]
vmovups       rs, mrd                mrd[0:7] = rs[0:7]
vmovss        ms, r16d               rd[0] = ms[0]; rd[1:7] = 0.0
vmovss        r16s, mrd              mrd[0] = rs[0]
vmovss        r16s2, r16s1, r16d     rd[0] = rs2[0]; rd[1:3] = s1[1:3]; rd[4:7] = 0
vbroadcastss  m4, rd                 rd[0:7] = ms[0]
vmovsldup     m32rs, rd              rd[0:1] = s[0]; rd[2:3] = s[2]; rd[4:5] = s[4]; rd[6:7] = s[6]
vmovshdup     m32rs, rd              rd[0:1] = s[1]; rd[2:3] = s[3]; rd[4:5] = s[5]; rd[6:7] = s[7]
vmaskmovps    m32rs2, rs1, rd        rd[i] = (rs1[32*i-1 b]) ? m32rs2[i] : 0.0;
                                     store 0.0 or mrs2 wrd based on most sig bit of rs1 wrd
vmaskmovps    rs2, rs1, m32          m32[i] = (rs1[32*i-1 b]) ? rs2[i] : (no store)

12. Common AVX Single Permute Instructions

Mnemonic    Operands                 Action
vblendps    imm8, m32rs2, rs1, rd    imm8 = h:g:f:e:d:c:b:a (binary); rd = (bit) ? m32rs2 : rs1 ∀ word and bit
vblendvps   rs3, m32rs2, rs1, rd     rd[i] = mask[i] ? s2[i] : s1[i];
                                     most sig bit in wrd i of rs3 makes the mask
vpermilps   imm8, m32rs, rd          imm8 = dd:cc:bb:aa;
                                     rd[0] = s[aa]; rd[1] = s[bb]; rd[2] = s[cc]; rd[3] = s[dd];
                                     rd[4] = s[4+aa]; rd[5] = s[4+bb]; rd[6] = s[4+cc]; rd[7] = s[4+dd]
vshufps     imm8, m32rs2, rs1, rd    imm8 = dd:cc:bb:aa;
                                     rd[0] = s1[aa]; rd[1] = s1[bb]; rd[2] = s2[cc]; rd[3] = s2[dd];
                                     rd[4] = s1[4+aa]; rd[5] = s1[4+bb]; rd[6] = s2[4+cc]; rd[7] = s2[4+dd]
vunpcklps   m32rs2, rs1, rd          rd[0] = s1[0]; rd[1] = s2[0]; rd[2] = s1[1]; rd[3] = s2[1];
                                     rd[4] = s1[4]; rd[5] = s2[4]; rd[6] = s1[5]; rd[7] = s2[5]
vunpckhps   m32rs2, rs1, rd          rd[0] = s1[2]; rd[1] = s2[2]; rd[2] = s1[3]; rd[3] = s2[3];
                                     rd[4] = s1[6]; rd[5] = s2[6]; rd[6] = s1[7]; rd[7] = s2[7]

13. AVX Single Precision Computational Instructions

Mnemonic    Operands                Action
vandps      m32rs2, rs1, rd         rd[0:7] = rs1[0:7] & m32rs2[0:7]
vandnps     m32rs2, rs1, rd         rd[0:7] = (~rs1[0:7]) & m32rs2[0:7]
vorps       m32rs2, rs1, rd         rd[0:7] = rs1[0:7] | m32rs2[0:7]
vxorps      m32rs2, rs1, rd         rd[0:7] = rs1[0:7] ^ m32rs2[0:7]  (zero!)
vhaddps     m32rs2, rs1, rd         rd[0] = s1[1]+s1[0]; rd[1] = s1[3]+s1[2];
                                    rd[2] = s2[1]+s2[0]; rd[3] = s2[3]+s2[2];
                                    rd[4] = s1[5]+s1[4]; rd[5] = s1[7]+s1[6];
                                    rd[6] = s2[5]+s2[4]; rd[7] = s2[7]+s2[6]
vaddsubps   m32rs2, rs1, rd         rd[0:6:2] = s1[0:6:2] - s2[0:6:2];
                                    rd[1:7:2] = s1[1:7:2] + s2[1:7:2]
vaddps      m32rs2, rs1, rd         rd[0:7] = rs1[0:7] + m32rs2[0:7]
vaddss      mr16s2, r16s1, r16d     rd[0] = rs1[0] + mrs2[0]
vsubps      m32rs2, rs1, rd         rd[0:7] = rs1[0:7] - m32rs2[0:7]
vsubss      mr16s2, r16s1, r16d     rd[0] = rs1[0] - mrs2[0]
vmulps      m32rs2, rs1, rd         rd[0:7] = rs1[0:7] * m32rs2[0:7]
vmulss      mr16s2, r16s1, r16d     rd[0] = rs1[0] * mrs2[0]
vdivps      m32rs2, rs1, rd         rd[0:7] = rs1[0:7] / m32rs2[0:7]
vdivss      mr16s2, r16s1, r16d     rd[0] = rs1[0] / mrs2[0]
vmaxps      m32rs2, rs1, rd         rd[0:7] = MAX(rs1[0:7], m32rs2[0:7])
vmaxss      mr16s2, r16s1, r16d     rd[0] = MAX(rs1[0], mrs2[0])
vminps      m32rs2, rs1, rd         rd[0:7] = MIN(rs1[0:7], m32rs2[0:7])
vminss      mr16s2, r16s1, r16d     rd[0] = MIN(rs1[0], mrs2[0])

14. Single Precision Vector Comparisons

Mnemonic    Operands                 Action
vcmpps      imm8, m32rs2, rs1, rd    rd ← all 1s if comparison is true, else all zeros
vcmpXXps    m32rs2, rs1, rd          rd ← all 1s if comparison is true, else all zeros
vmovmskps   ymm, ireg                copy sign bits of ymm's floats to low 8 bits of ireg, zero rest of ireg
vcomiss     m4r16s, r16d             set ICC as shown in the table below
                                     NOTE: compares low scalar only!

imm8   COMP            XX       NaN
0      s1 == s2        EQ       0
1      s1 <  s2        LT       0
2      s1 <= s2        LE       0
3      isNaN(s1||s2)   UNORD    1
4      s1 != s2        NE       1
5      !(s1 <  s2)     NLT      1
6      !(s1 <= s2)     NLE      1
7      !isNaN(s1||s2)  ORDERED  0

vcomiss result:
rd[0] > mrs[0]   ZF=0, PF=0, CF=0
rd[0] < mrs[0]   ZF=0, PF=0, CF=1
rd[0] = mrs[0]   ZF=1, PF=0, CF=0
NaN              ZF=1, PF=1, CF=1

⇒ Use vmovmskps to get the vcmpps mask into an ireg
⇒ Then use bt and test to branch
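Before the CIAMAX code that follows, here is a smaller sketch (not part of the lecture code) that reduces 8 floats to their maximum with vextractf128, vmaxps, vshufps and vmaxss from the tables above; the name hmax8 and the prototype float hmax8(const float *x) (x assumed 32-byte aligned) are assumptions for illustration.

/* Hedged sketch, not lecture code */
        .text
        .globl  hmax8
hmax8:
        vmovaps (%rdi), %ymm0               /* ymm0 = {x7 ... x0} */
        vextractf128 $1, %ymm0, %xmm1       /* xmm1 = {x7, x6, x5, x4} */
        vmaxps  %xmm1, %xmm0, %xmm0         /* 4 candidates left */
        vshufps $0x0E, %xmm0, %xmm0, %xmm1  /* xmm1[0:1] = xmm0[2:3] */
        vmaxps  %xmm1, %xmm0, %xmm0         /* 2 candidates left */
        vshufps $0x01, %xmm0, %xmm0, %xmm1  /* xmm1[0] = xmm0[1] */
        vmaxss  %xmm1, %xmm0, %xmm0         /* xmm0[0] = max of all 8 */
        ret                                 /* float return value in xmm0[0] */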

15. CIAMAX, pt I

/*                rdi/4        rsi/8
   int ATL_UIAMAX(const int N, const TYPE *X,
                  const int incX)
*/
#define NN %rdi
#define N  %rsi
#define N_w %si
#define XX %r8
#define X  %rdx
#define X_b %dl
#define Imax %rax
#define bitreg %rcx
#define bitreg_b %cl
#define maxval  %ymm0
#define maxval_ %xmm0
#define absval  %ymm1
#define absval_ %xmm1
#define x0  %ymm2
#define x0_ %xmm2
#define x1  %ymm3
#define x1_ %xmm3
#define up  %ymm4
#define up_ %xmm4
.text
.globl ATL_UIAMAX
ATL_UIAMAX:
/*
 * Bitmask with all 1s except
 * 0 sign bit for abs value
 */
   mov $0x7FFFFFFF, %eax
   movl %eax, -8(%rsp)
   vbroadcastss -8(%rsp), absval
   mov %rsi, X
   mov %rsi, XX
/*
 * 1st elt initial max, move ptr and dec N
 */
   mov X, Imax
   movss (X), maxval_
   andps absval_, maxval_
   movss 4(X), x1_
   andps absval_, x1_
   addps x1_, maxval_
   shufps $0x00, maxval_, maxval_
   /* XX XX XX XX max, max, max, max */
   vinsertf128 $1, maxval_, maxval, maxval
   /* maxval in all 8 entries */
   add $8, X
   sub $1, NN

16. CIAMAX, pt II

   mov NN, N
   shr $3, N                     /* N /= 8 */
   jz CLEANUP
   shl $3, N
   sub N, NN
   lea (X,N,8), X                /* X += N */
   shl $3, N                     /* N *= sizeof */
   neg N
UALOOP:
   vmovups (X,N), x0
   vandps absval, x0, x0
   vmovups 32(X,N), x1
   vandps absval, x1, x1
   prefetchnta 1024(X,N)
   vhaddps x1, x0, x0
   /* abs(x7i)+abs(x7r) ... abs(x0i)+abs(x0r) */
   vcmpLEps maxval, x0, x1
   /* all 1s if maxval already has max */
   vmovmskps x1, bitreg
   cmp $0xFF, bitreg_b
   jnz VNEWMAX
UGOTMAX:
   add $64, N
   jnz UALOOP
   cmp $0, NN
   jnz CLEANUP
DONE:
   sub XX, Imax                  /* # of bytes away from start */
   shr $3, Imax                  /* # of elts (index) where max was found */
   ret
CLEANUP:
   lea (X,NN,8), X
   neg NN
CULOOP:
   movss (X,NN,8), x0_
   andps absval_, x0_
   movss 4(X,NN,8), x1_
   andps absval_, x1_
   addss x1_, x0_
   comiss x0_, maxval_           /* need new max if ZF=PF=0, CF=1 */
   jc SNEWMAX
   add $1, NN
   jnz CULOOP
   jmp DONE
SNEWMAX:
   movss x0_, maxval_
   lea (X,NN,8), Imax
   add $1, NN
   jnz CULOOP
   jmp DONE

17. CIAMAX, pt III

/*
 * When we jump to this label, new max somewhere in sums in x0:
 *    {x7, x6, x3, x2, x5, x4, x1, x0}
 * So, ignore old max, and just find the max of these 8 elts
 */
VNEWMAX:                         /* x0 = x7, x6, x3, x2, x5, x4, x1, x0 */
   vextractf128 $1, x0, up_      /* up = XX XX XX XX x7 x6 x3 x2 */
   movss x0_, maxval_
   lea (X,N), Imax
   vshufps $0x01, x0, x0, x1
   vcomiss x1_, maxval_          /* newmax if ZF=PF=0, CF=1 */
   jnc DONE1
   movss x1_, maxval_
   lea 8(X,N), Imax
DONE1:
   vcomiss up_, maxval_
   jnc DONE2
   movss up_, maxval_
   lea 16(X,N), Imax
DONE2:
   vshufps $0x01, up, up, x1
   vcomiss x1_, maxval_          /* newmax if ZF=PF=0, CF=1 */
   jnc DONE3
   movss x1_, maxval_
   lea 24(X,N), Imax
DONE3:                           /* x7, x6, x3, x2, x5, x4, x1, x0 */
   vshufps $0x02, x0, x0, x1
   vcomiss x1_, maxval_          /* newmax if ZF=PF=0, CF=1 */
   jnc DONE4
   movss x1_, maxval_
   lea 32(X,N), Imax
DONE4:                           /* x7, x6, x3, x2, x5, x4, x1, x0 */
   vshufps $0x03, x0, x0, x1
   vcomiss x1_, maxval_          /* newmax if ZF=PF=0, CF=1 */
   jnc DONE5
   movss x1_, maxval_
   lea 40(X,N), Imax
DONE5:                           /* XX XX XX XX x7 x6 x3 x2 */
   vshufps $0x02, up, up, x1
   vcomiss x1_, maxval_          /* newmax if ZF=PF=0, CF=1 */
   jnc DONE6
   movss x1_, maxval_
   lea 48(X,N), Imax
DONE6:
   vshufps $0x03, up, up, x1
   vcomiss x1_, maxval_          /* newmax if ZF=PF=0, CF=1 */
   jnc DONE7
   movss x1_, maxval_
   lea 56(X,N), Imax
DONE7:
   vshufps $0x00, maxval, maxval, maxval
   vinsertf128 $1, maxval_, maxval, maxval
   jmp UGOTMAX
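The CLEANUP and VNEWMAX code above leans on one idiom: vcomiss sets CF=1 exactly when the current max is smaller than the candidate, so jc/jnc selects the branch. A minimal sketch of that idiom (not part of the lecture code), with the hypothetical prototype float fmax2(float a, float b):

/* Hedged sketch, not lecture code; a in xmm0, b in xmm1 under the SysV ABI */
        .text
        .globl  fmax2
fmax2:
        vcomiss %xmm1, %xmm0        /* compare a (dest) with b (src): CF=1 if a < b */
        jnc 1f                      /* CF=0: keep a */
        vmovaps %xmm1, %xmm0        /* a < b (or NaN): return b */
1:
        ret                         /* float return value in xmm0[0] */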