(AVX) Intel's Extensions of SSE to 256 Bits / 32 Bytes & 3-Operand Instructions


1. Advanced Vector eXtensions (AVX)

• Intel's extensions of SSE to 256 bits / 32 bytes & 3-operand instructions
• Therefore, can operate on 4 (8) double (single) precision values at a time
• Each vector reg contains 4 (8) 64-bit (32-bit) values
• AVX uses 3 (4) operand assembly:
     Intel : dest, src1, src2 [, imm8]
     gcc   : [imm8,] src2, src1, dest
• All ymm registers are caller-saved
• ST(0) is still the return value for functions on x8632:
  → must write the result to memory, then load it with flds/fldl
• x86 SIMD history:
  1. MMX (64-bit int ops aliased to x87 regs)
  2. 3DNow! (AMD floats, non-IEEE)
  3. SSE/SSE1 : SIMD for floats
  4. SSE2 : SIMD for doubles (and integers)
  5. SSE3 : SIMD for complex & cleanup
  6. SSE4[.1,.2,a] : competing standards; not covered
  7. SSE5 : AMD effort, not yet implemented
  8. AVX : planned to have FMAC; 1st version does not

2. 256-bit AVX Vector Registers

   255..192   191..128   127..64    63..0
      X3         X2         X1        X0      4 double precision fp words

   255..224  223..192  191..160  159..128  127..96  95..64  63..32  31..0
      X7        X6        X5        X4       X3       X2      X1      X0      8 single precision fp words

   ymm[0-15] : the YMM vector registers

• Register file, not a stack
• Can read/write only X0 directly to scalar
  → must shuffle vectors around, and reduce to scalar in X0
• Lower 128 bits of the ymm regs are the xmm (SSE) regs
• Not aliased with ST(x)
  → so far, the VPU is overlapped with the FPU

3. Common AVX 16-byte (128-bit) Data Movement Instructions

• These are the only way to communicate between the upper and lower 128-bit words!

Mnemonic        Operands                Action
vbroadcastf128  m16, rd                 rd[0] = rd[1] = m16[0]
vextractf128    0/1, rs, m16r16d        0: m16r16d[0] = rs[0];  1: m16r16d[0] = rs[1]
vinsertf128     0/1, m16r16s2, rs1, rd  rd[:] = rs1[:];
                                        0: rd[0] = m16r16s2[0];  1: rd[1] = m16r16s2[0]
vperm2f128      imm8, m32rs2, rs1, rd   imm8 = 0dcc0baa (binary); source indices 0,1
                                        select from s1, indices 2,3 from s2:
                                        rd[0] = s[aa]; rd[1] = s[cc];
                                        if (b) rd[0] = 0; if (d) rd[1] = 0

(Here [0] and [1] index the low and high 128-bit halves.)

4. Common AVX Double Precision Data Movement Instructions

Mnemonic      Operands              Action
vmovapd       m32rs, rd             rd[0:3] = m32rs[0:3]
vmovapd       rs, m32rd             m32rd[0:3] = rs[0:3]
vmovupd       mrs, rd               rd[0:3] = mrs[0:3]
vmovupd       rs, mrd               mrd[0:3] = rs[0:3]
vmovsd        m8s, r16d             rd[0] = m8s[0]; rd[1:3] = 0.0
vmovsd        r16s, m8rd            m8rd[0] = rs[0]
vmovsd        r16s2, r16s1, r16d    rd[0] = s2[0]; rd[1] = s1[1]; rd[2:3] = 0
vbroadcastsd  m8, rd                rd[0:3] = m8[0]
vmovddup      m32rs, rd             rd[0:1] = s[0]; rd[2:3] = s[2]
vmaskmovpd    m32rs2, rs1, rd       rd[i] = (rs1[(64*i-1)b]) ? m32rs2[i] : 0.0
                                    (store 0.0 or mem word i based on most sig bit
                                    of rs1 word i)
vmaskmovpd    rs2, rs1, m32         m32[i] = (rs1[(64*i-1)b]) ? rs2[i] : (no store)

• mX   : X-byte aligned memory address
• r16  : 16-byte register, i.e. xmm instead of ymm
• [#b] : select bit # from register

5. Common AVX Double Precision Permute Instructions

Mnemonic   Operands                Action
vblendpd   imm4, m32rs2, rs1, rd   imm4 = d:c:b:a (binary);
                                   rd[i] = (bit i) ? m32rs2[i] : rs1[i], for every word
vblendvpd  rs3, m32rs2, rs1, rd    rd[i] = mask[i] ? s2[i] : s1[i];
                                   most sig bit of word i of rs3 makes the mask
vpermilpd  imm4, m32rs, rd         rd[0] = (a) ? s[1] : s[0];  rd[1] = (b) ? s[1] : s[0];
                                   rd[2] = (c) ? s[3] : s[2];  rd[3] = (d) ? s[3] : s[2]
vshufpd    dcba, m32rs2, rs1, rd   rd[0] = (a) ? s1[1] : s1[0];  rd[1] = (b) ? s2[1] : s2[0];
                                   rd[2] = (c) ? s1[3] : s1[2];  rd[3] = (d) ? s2[3] : s2[2]
vunpcklpd  m32rs2, rs1, rd         rd[0] = s1[0]; rd[1] = s2[0]; rd[2] = s1[2]; rd[3] = s2[2]
vunpckhpd  m32rs2, rs1, rd         rd[0] = s1[1]; rd[1] = s2[1]; rd[2] = s1[3]; rd[3] = s2[3]

6. AVX Double Precision Computational Operations

Mnemonic   Operands               Action
vandpd     m32rs2, rs1, rd        rd[0:3] = rs1[0:3] & m32rs2[0:3]
vandnpd    m32rs2, rs1, rd        rd[0:3] = (~rs1[0:3]) & m32rs2[0:3]
vorpd      m32rs2, rs1, rd        rd[0:3] = rs1[0:3] | m32rs2[0:3]
vxorpd     m32rs2, rs1, rd        rd[0:3] = rs1[0:3] ^ m32rs2[0:3]  (zero!)
vhaddpd    m32rs2, rs1, rd        rd[0] = s1[1]+s1[0]; rd[1] = s2[1]+s2[0];
                                  rd[2] = s1[3]+s1[2]; rd[3] = s2[3]+s2[2]
vaddsubpd  m32rs2, rs1, rd        rd[0] = s1[0]-s2[0]; rd[1] = s1[1]+s2[1];
                                  rd[2] = s1[2]-s2[2]; rd[3] = s1[3]+s2[3]
vaddpd     m32rs2, rs1, rd        rd[0:3] = rs1[0:3] + m32rs2[0:3]
vaddsd     mr16s2, r16s1, r16d    rd[0] = rs1[0] + mrs2[0]
vsubpd     m32rs2, rs1, rd        rd[0:3] = rs1[0:3] - m32rs2[0:3]
vsubsd     mr16s2, r16s1, r16d    rd[0] = rs1[0] - mrs2[0]
vmulpd     m32rs2, rs1, rd        rd[0:3] = rs1[0:3] * m32rs2[0:3]
vmulsd     mr16s2, r16s1, r16d    rd[0] = rs1[0] * mrs2[0]
vdivpd     m32rs2, rs1, rd        rd[0:3] = rs1[0:3] / m32rs2[0:3]
vdivsd     mr16s2, r16s1, r16d    rd[0] = rs1[0] / mrs2[0]
vmaxpd     m32rs2, rs1, rd        rd[0:3] = MAX(rs1[0:3], m32rs2[0:3])
vmaxsd     mr16s2, r16s1, r16d    rd[0] = MAX(rs1[0], mrs2[0])
vminpd     m32rs2, rs1, rd        rd[0:3] = MIN(rs1[0:3], m32rs2[0:3])
vminsd     mr16s2, r16s1, r16d    rd[0] = MIN(rs1[0], mrs2[0])

7. Double Precision Vector Comparisons

Mnem      Operands                Action
vcmppd    imm8, m32rs2, rs1, rd   rd[i] ← all 1s if the comparison is true, else all zeros
vcmpXXpd  m32rs2, rs1, rd         rd[i] ← all 1s if the comparison is true, else all zeros
                                  (XX is the predicate suffix, from the table below)
movmskpd  ymm, ireg               copy sign bits of ymm's doubles to low 4 bits of ireg,
                                  zero rest of ireg
vcomisd   m8r16s, r16d            set ICC as shown in the table below
                                  NOTE: compares the low scalar only!

• To branch on a vcmppd result, use movmskpd to get the mask into an integer
  register, then use bt or test to branch.

imm8   COMP               XX       result if NaN
0      s1 == s2           EQ       0
1      s1 < s2            LT       0
2      s1 <= s2           LE       0
3      isNaN(s1 || s2)    UNORD    1
4      s1 != s2           NE       1
5      !(s1 < s2)         NLT      1
6      !(s1 <= s2)        NLE      1
7      notNaN(s1 || s2)   ORDERED  0

vcomisd result      flags
rd[0] > mrs[0]      ZF=0, PF=0, CF=0
rd[0] < mrs[0]      ZF=0, PF=0, CF=1
rd[0] = mrs[0]      ZF=1, PF=0, CF=0
NaN (unordered)     ZF=1, PF=1, CF=1
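As a concrete sketch of the compare-then-branch pattern just described (vcmppd producing all-1s/all-0s words, movmskpd gathering their sign bits), here is a minimal C intrinsics version; the helper name mask_lt is mine, not part of the handout:

   #include <immintrin.h>

   /* Bit i of the result is set when x[i] < y[i]: _mm256_cmp_pd with the
    * LT predicate (imm8 = 1 in the table above) builds the per-word
    * all-1s/all-0s mask; _mm256_movemask_pd copies the 4 sign bits into
    * the low bits of an integer register. */
   static int mask_lt(__m256d x, __m256d y)
   {
       __m256d m = _mm256_cmp_pd(x, y, _CMP_LT_OS); /* vcmppd, imm8 = 1 (LT) */
       return _mm256_movemask_pd(m);                /* vmovmskpd: 0x0..0xF  */
   }

A caller can then branch with test or bt; e.g. mask_lt(v, thresh) == 0xF means all four words of v are below the corresponding words of thresh.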
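Several of the notes above (only X0 reads out to scalar, vextractf128 as the bridge between 128-bit halves, vhaddpd) exist to support reductions, so here is the standard shuffle-and-reduce dance for summing a ymm of 4 doubles, again as my own C intrinsics sketch rather than the handout's code:

   #include <immintrin.h>

   static double hsum256(__m256d v)               /* v = {v3, v2, v1, v0} */
   {
       __m128d lo = _mm256_castpd256_pd128(v);    /* xmm alias of low half   */
       __m128d hi = _mm256_extractf128_pd(v, 1);  /* vextractf128 $1: high   */
       __m128d s  = _mm_add_pd(lo, hi);           /* {v1+v3, v0+v2}          */
       s = _mm_hadd_pd(s, s);                     /* both words = total sum  */
       return _mm_cvtsd_f64(s);                   /* scalar out of word X0   */
   }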
8. ZAXPY using AVX, pt I

/*
 * void ATL_UAXPY(const int N, const TYPE *alpha, const TYPE *X,
 *                const int incX, TYPE *Y, const int incY)
 * x8664 reg / x8632 stack offset for each arg:
 *    N=rdi/4, alpha=rsi/8, X=rdx/12, incX=rcx/16, Y=r8/20, incY=r9/24
 */
#define ralp  %ymm0
#define ralp_ %xmm0
#define ialp  %ymm1
#define ialp_ %xmm1
#define y0    %ymm2
#define y0_   %xmm2
#define x0    %ymm3
#define x0_   %xmm3
#define x1    %ymm4
#define x1_   %xmm4
.globl ATL_UAXPY
ATL_UAXPY:
#ifdef ATL_GAS_x8632
   #define FSIZE 16
   #define N  %ebx
   #define X  %edx
   #define Y  %ecx
   #define II %eax
   #define TMPOFF 8(%esp)
   #define rsi esi
   sub  $FSIZE, %esp
   movl %ebx, (%esp)
   movl %esi, 4(%esp)
   movl FSIZE+4(%esp), N
   movl FSIZE+8(%esp), %esi
   movl FSIZE+12(%esp), %edx
   movl FSIZE+20(%esp), Y
#else
   #define N  %rdi
   #define X  %rdx
   #define Y  %rcx
   #define II %rax
   #define TMPOFF -8(%rsp)
   mov  %r8, Y
#endif
#define Y_b  %cl
#define X_b  %dl
#define II_b %al
#define PFW prefetchnta
#define PFR prefetchnta
#define PFDIST 768
   fld1                            /* ST = {1.0} */
   fldz                            /* ST = {0.0, 1.0} */
   fsub                            /* ST = {-1.0} */
   fmull 8(%rsi)                   /* ST = {-ai} */
   fstpl TMPOFF                    /* ST = {}, store -ai to tmp */
   PFR (X)
   PFW (Y)
   vbroadcastsd 8(%rsi), ialp      /*  ai  ai  ai  ai */
   vbroadcastsd TMPOFF, ralp       /* -ai -ai -ai -ai */
   vblendpd $0x5, ralp, ialp, ialp /*  ai -ai  ai -ai */
   vbroadcastsd (%rsi), ralp       /*  ar  ar  ar  ar */
/*
 * If Y is not 16-byte aligned, then we can't make it 32-byte aligned
 */
   test $0x0F, Y_b
   jnz  UNALIGNED
   test $0x1F, Y_b
   jz   YALIGNED            /* jump if Y already known 32-byte aligned */
/*
 * If we reach here, Y is 16-byte aligned, so peel 1 iteration
 * to make it 32-byte aligned
 */
   movupd (X), x0_          /* x0 = {xi, xr} */
   pshufd $0x4E, x0_, x1_   /* x1 = {xr, xi} */
   movapd (Y), y0_          /* y0 = {yi, yr} */
   mulpd  ralp_, x0_        /* x0 = {ar*xi, ar*xr} */
   addpd  x0_, y0_
   mulpd  ialp_, x1_        /* x1 = {ai*xr, -ai*xi} */
   addpd  x1_, y0_
   movapd y0_, (Y)
   add $16, X
   add $16, Y
   sub $1, N
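The whole point of the {ar ar ar ar} and {ai -ai ai -ai} constants built above is the complex multiply-add yr += ar*xr - ai*xi, yi += ar*xi + ai*xr. Here is a C intrinsics sketch of one 32-byte iteration of the aligned loop shown in part II below (the function is my own restatement; the handout's kernel is pure assembly):

   #include <immintrin.h>

   /* ralp = {ar,ar,ar,ar}; ialp = {-ai,+ai,-ai,+ai} from lowest word up
    * (printed high-to-low as "ai -ai ai -ai" in the comments above). */
   static __m256d zaxpy_step(__m256d ralp, __m256d ialp,
                             const double *X, const double *Y)
   {
       __m256d x0 = _mm256_load_pd(X);              /* vmovapd: x1i x1r x0i x0r   */
       __m256d x1 = _mm256_shuffle_pd(x0, x0, 0x5); /* vshufpd $0x5: x1r x1i x0r x0i */
       __m256d y0 = _mm256_add_pd(_mm256_load_pd(Y),
                                  _mm256_mul_pd(ralp, x0)); /* y += ar*x */
       return _mm256_add_pd(y0, _mm256_mul_pd(ialp, x1));
                          /* y += {ai*x1r, -ai*x1i, ai*x0r, -ai*x0i} */
   }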
9. AVX ZAXPY, pt II

YALIGNED:                   /* Y is known to be 32-byte aligned */
   mov  N, II
   andb $0xFE, II_b         /* make II a multiple of veclen */
   sub  II, N               /* N now has how much must be cleaned up */
   shl  $4, II              /* II = N*sizeof(DCPLX) */
   lea  (X, II), X          /* X += N */
   lea  (Y, II), Y          /* Y += N */
   neg  II                  /* II = -II */
   test $0x1F, X_b          /* if X not 32-byte aligned */
   jnz  YAXULOOP            /* jump to unaligned-X loop */
YAXALOOP:
   vmovapd (X,II), x0       /* x1i x1r x0i x0r */
   vshufpd $0x5, x0, x0, x1 /* x1r x1i x0r x0i */
   vmulpd  ralp, x0, x0     /* ar*x1i, ar*x1r, ar*x0i, ar*x0r */
   vaddpd  (Y,II), x0, y0
   PFR PFDIST(X,II)
   vmulpd  ialp, x1, x1     /* ai*x1r, -ai*x1i, ai*x0r, -ai*x0i */
   vaddpd  x1, y0, y0
   PFW PFDIST(Y,II)
   vmovapd y0, (Y, II)
   add $32, II
   jnz YAXALOOP
   cmp $0, N
   jz  DONE
   jmp CLEANUP
YAXULOOP:
   vmovupd (X,II), x0       /* x1i x1r x0i x0r */
   vshufpd $0x5, x0, x0, x1 /* x1r x1i x0r x0i */
   vmulpd  ralp, x0, x0     /* ar*x1i, ar*x1r, ar*x0i, ar*x0r */
   vaddpd  (Y,II), x0, y0
   PFR PFDIST(X,II)
   vmulpd  ialp, x1, x1     /* ai*x1r, -ai*x1i, ai*x0r, -ai*x0i */
   vaddpd  x1, y0, y0
   PFW PFDIST(Y,II)
   vmovapd y0, (Y, II)
   add $32, II
   jnz YAXULOOP
   cmp $0, N
   jz  DONE
   jmp CLEANUP

10. AVX ZAXPY, pt III

UNALIGNED:                  /* Y is not even 16-byte aligned */
   mov  N, II
   andb $0xFE, II_b         /* make II a multiple of veclen */
   sub  II, N               /* N has cleanup remainder */
   shl  $4, II              /* II = N*sizeof(DCPLX) */
   lea  (X, II), X          /* X += N */
   lea  (Y, II), Y          /* Y += N */
   neg  II                  /* II = -II */
YUXULOOP:
   vmovupd (X,II), x0       /* x1i x1r x0i x0r */
   vshufpd $0x5, x0, x0, x1 /* x1r x1i x0r x0i */
   vmovupd (Y,II), y0
   vmulpd  ralp, x0, x0     /* ar*x1i, ar*x1r, ar*x0i, ar*x0r */
   vaddpd  x0, y0, y0
   PFR PFDIST(X,II)
   vmulpd  ialp, x1, x1     /* ai*x1r, -ai*x1i, ai*x0r, -ai*x0i */
   vaddpd  x1, y0, y0
   PFW PFDIST(Y,II)
   vmovupd y0, (Y, II)
   add $32, II
   jnz YUXULOOP
   cmp $0, N
   jz  DONE
CLEANUP:
CULOOP:
   movupd (X), x0_          /* x0 = {xi, xr} */
   pshufd $0x4E, x0_, x1_   /* x1 = {xr, xi} */
   movupd (Y), y0_          /* y0 = {yi, yr} */
   mulpd  ralp_, x0_        /* x0 = {ar*xi, ar*xr} */
   addpd  x0_, y0_
   mulpd  ialp_, x1_        /* x1 = {ai*xr, -ai*xi} */
   addpd  x1_, y0_
   movupd y0_, (Y)
   add $16, X
   add $16, Y
   sub $1, N
   jnz CULOOP
DONE:
#ifdef ATL_GAS_x8632
   movl (%esp), %ebx
   movl 4(%esp), %esi
   add  $FSIZE, %esp
#endif
   ret
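For reference, everything in parts I-III computes Y += alpha*X over N double complex values. A plain C equivalent (my own sketch; it assumes unit stride, which matches the kernel never reading incX or incY):

   /* Y += alpha*X for N double complex values in interleaved {re,im} storage. */
   void zaxpy_ref(int N, const double *alpha, const double *X, double *Y)
   {
       const double ar = alpha[0], ai = alpha[1];
       for (int i = 0; i < N; i++) {
           const double xr = X[2*i], xi = X[2*i+1];
           Y[2*i]   += ar*xr - ai*xi;   /* real part */
           Y[2*i+1] += ar*xi + ai*xr;   /* imaginary part */
       }
   }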