Comments (2)
Interesting, thanks for this insight!
One low-hanging fruit is to reduce the number of calls to get_throughput_sum()
in lines 62-64: call it only once, perform the check in line 62 on that result, and reuse the same result in line 64.
Furthermore, we could enhance the get_throughput_sum()
function with an optional hint so that it recalculates (and therefore runs the comprehension for) only a specific column/port when we know that only this one has changed.
The assembly kernel I tested on was the 'triad' benchmark repeated about a hundred times over.
I would expect the relative runtime to shift more towards the graph computation for a more complex kernel with 100 instructions and dependency chains inside of it, since the STREAM triad is only a handful of instructions long. For example:
..B399.92: # Preds ..B399.98 ..B399.91
movq %r13, %r14
shlq $5, %r14
movq %r8, 40(%rsp)
movq 336(%rsp), %r8
vmovupd %ymm13, 64(%rsp)
vmovups (%r8,%r14), %ymm7
movq 328(%rsp), %r8
vmovups (%r8,%r14), %ymm8
vaddpd (%rdi,%r14), %ymm7, %ymm7
vmulpd %ymm7, %ymm0, %ymm13
movq 352(%rsp), %r8
vaddpd (%r8,%r14), %ymm8, %ymm11
movq 320(%rsp), %r8
vmulpd %ymm11, %ymm14, %ymm15
vmovups (%r8,%r14), %ymm12
vaddpd %ymm13, %ymm15, %ymm4
vfmadd132pd %ymm0, %ymm15, %ymm7
movq 344(%rsp), %r8
vaddpd (%r8,%r14), %ymm12, %ymm9
movq 312(%rsp), %r8
vmulpd %ymm9, %ymm6, %ymm10
vmovups (%r8,%r14), %ymm5
vaddpd %ymm7, %ymm10, %ymm2
vaddpd (%r12,%r14), %ymm5, %ymm1
movq 384(%rsp), %r8
vaddpd %ymm2, %ymm1, %ymm15
vaddpd %ymm10, %ymm1, %ymm3
vmovups (%r8,%r14), %ymm8
vaddpd %ymm4, %ymm3, %ymm4
movq 416(%rsp), %r8
vaddpd (%r8,%r14), %ymm8, %ymm11
movq 376(%rsp), %r8
vmulpd %ymm11, %ymm0, %ymm5
vmovups (%r8,%r14), %ymm13
vaddpd %ymm15, %ymm5, %ymm9
movq 408(%rsp), %r8
vaddpd (%r8,%r14), %ymm13, %ymm12
movq 368(%rsp), %r8
vmulpd %ymm12, %ymm14, %ymm7
vmovups (%r8,%r14), %ymm2
vaddpd %ymm9, %ymm7, %ymm10
vaddpd %ymm5, %ymm7, %ymm12
vmovdqu 32(%rsi,%r14), %ymm5
vaddpd -32(%rsi,%r14), %ymm5, %ymm9
vmulpd %ymm9, %ymm0, %ymm9
movq 400(%rsp), %r8
vaddpd (%r8,%r14), %ymm2, %ymm1
movq 360(%rsp), %r8
vmulpd %ymm1, %ymm6, %ymm15
vmovups (%r8,%r14), %ymm3
vaddpd %ymm10, %ymm15, %ymm8
movq 392(%rsp), %r8
vaddpd (%r8,%r14), %ymm3, %ymm11
vmovdqu (%rsi,%r14), %ymm3
vaddpd %ymm8, %ymm11, %ymm7
vaddpd %ymm15, %ymm11, %ymm13
vperm2i128 $33, %ymm5, %ymm3, %ymm1
vperm2i128 $3, -32(%rsi,%r14), %ymm3, %ymm2
vpalignr $8, -32(%rsi,%r14), %ymm2, %ymm8
vpalignr $8, %ymm1, %ymm5, %ymm11
vaddpd %ymm12, %ymm13, %ymm10
vaddpd %ymm11, %ymm8, %ymm15
vaddpd %ymm7, %ymm9, %ymm13
vmulpd %ymm15, %ymm14, %ymm12
vaddpd %ymm9, %ymm12, %ymm5
vaddpd %ymm1, %ymm2, %ymm9
vaddpd %ymm13, %ymm12, %ymm8
vmulpd %ymm9, %ymm6, %ymm7
vpalignr $8, %ymm3, %ymm1, %ymm1
vpalignr $8, %ymm2, %ymm3, %ymm2
vaddpd %ymm8, %ymm7, %ymm11
vaddpd %ymm1, %ymm2, %ymm8
vmovupd .L_2il0floatpacket.319(%rip), %ymm2
vmovupd .L_2il0floatpacket.318(%rip), %ymm1
vaddpd %ymm11, %ymm8, %ymm9
vfmadd231pd %ymm2, %ymm3, %ymm8
vaddpd %ymm7, %ymm8, %ymm7
movq 200(%rsp), %r8
vaddpd %ymm5, %ymm7, %ymm5
vmovups (%r8,%r14), %ymm7
vaddpd %ymm10, %ymm5, %ymm10
vfmsub213pd %ymm7, %ymm1, %ymm3
vaddpd %ymm4, %ymm10, %ymm4
vaddpd %ymm9, %ymm3, %ymm10
movq 32(%rsp), %r8
vmovups (%r8,%r14), %ymm9
movq 456(%rsp), %r8
vmulpd %ymm4, %ymm9, %ymm5
vmovups (%r8,%r14), %ymm3
vaddpd %ymm10, %ymm5, %ymm5
movq 488(%rsp), %r8
vmovupd %ymm5, 96(%rsp)
vaddpd (%r8,%r14), %ymm3, %ymm13
movq 448(%rsp), %r8
vmulpd %ymm13, %ymm0, %ymm15
vmovups (%r8,%r14), %ymm4
movq 480(%rsp), %r8
vaddpd (%r8,%r14), %ymm4, %ymm8
movq 440(%rsp), %r8
vmulpd %ymm8, %ymm14, %ymm11
vmovups (%r8,%r14), %ymm12
vaddpd %ymm15, %ymm11, %ymm15
vfmadd132pd %ymm0, %ymm11, %ymm13
movq 472(%rsp), %r8
vaddpd (%r8,%r14), %ymm12, %ymm10
vmulpd %ymm10, %ymm6, %ymm4
vaddpd %ymm13, %ymm4, %ymm10
movq 432(%rsp), %r8
vmovups (%r8,%r14), %ymm13
movq 464(%rsp), %r8
vaddpd (%r8,%r14), %ymm13, %ymm3
movq 520(%rsp), %r8
vaddpd %ymm4, %ymm3, %ymm8
vaddpd %ymm10, %ymm3, %ymm11
vmovups (%r8,%r14), %ymm13
vaddpd %ymm15, %ymm8, %ymm15
movq 552(%rsp), %r8
vaddpd (%r8,%r14), %ymm13, %ymm12
vmulpd %ymm12, %ymm0, %ymm8
vaddpd %ymm11, %ymm8, %ymm3
movq 512(%rsp), %r8
vmovups (%r8,%r14), %ymm11
movq 544(%rsp), %r8
vaddpd (%r8,%r14), %ymm11, %ymm10
movq 504(%rsp), %r8
vmulpd %ymm10, %ymm14, %ymm4
vmovups (%r8,%r14), %ymm10
vaddpd %ymm3, %ymm4, %ymm11
vaddpd %ymm8, %ymm4, %ymm4
movq 536(%rsp), %r8
vaddpd (%r8,%r14), %ymm10, %ymm3
movq 496(%rsp), %r8
vmulpd %ymm3, %ymm6, %ymm3
vmovups (%r8,%r14), %ymm13
vaddpd %ymm11, %ymm3, %ymm12
movq 528(%rsp), %r8
vaddpd (%r8,%r14), %ymm13, %ymm10
vaddpd %ymm3, %ymm10, %ymm11
vaddpd %ymm12, %ymm10, %ymm8
vmovdqu 32(%rbx,%r14), %ymm3
vmovdqu (%rbx,%r14), %ymm12
vaddpd %ymm4, %ymm11, %ymm13
vaddpd -32(%rbx,%r14), %ymm3, %ymm4
vmulpd %ymm4, %ymm0, %ymm4
vaddpd %ymm8, %ymm4, %ymm10
vperm2i128 $33, %ymm3, %ymm12, %ymm8
vpalignr $8, %ymm8, %ymm3, %ymm11
vperm2i128 $3, -32(%rbx,%r14), %ymm12, %ymm3
vpalignr $8, -32(%rbx,%r14), %ymm3, %ymm0
vaddpd %ymm11, %ymm0, %ymm0
vmulpd %ymm0, %ymm14, %ymm11
vaddpd %ymm10, %ymm11, %ymm0
vaddpd %ymm4, %ymm11, %ymm10
vaddpd %ymm8, %ymm3, %ymm4
vmulpd %ymm4, %ymm6, %ymm11
vpalignr $8, %ymm12, %ymm8, %ymm4
vpalignr $8, %ymm3, %ymm12, %ymm8
vaddpd %ymm0, %ymm11, %ymm0
vaddpd %ymm4, %ymm8, %ymm3
vaddpd %ymm0, %ymm3, %ymm0
vfmadd231pd %ymm2, %ymm12, %ymm3
vfmsub213pd %ymm7, %ymm1, %ymm12
vaddpd %ymm11, %ymm3, %ymm8
vaddpd %ymm0, %ymm12, %ymm0
vaddpd %ymm10, %ymm8, %ymm10
vaddpd %ymm13, %ymm10, %ymm13
vaddpd %ymm15, %ymm13, %ymm15
vmulpd %ymm15, %ymm9, %ymm12
vaddpd %ymm0, %ymm12, %ymm10
vmulpd .L_2il0floatpacket.321(%rip), %ymm10, %ymm0
vmovupd %ymm10, 128(%rsp)
vfmadd231pd .L_2il0floatpacket.320(%rip), %ymm5, %ymm0
movq 592(%rsp), %r8
vmovups (%r8,%r14), %ymm5
movq 624(%rsp), %r8
vaddpd (%r8,%r14), %ymm5, %ymm12
vmovupd .L_2il0floatpacket.314(%rip), %ymm5
vmulpd %ymm12, %ymm5, %ymm11
movq 584(%rsp), %r8
vmovups (%r8,%r14), %ymm3
movq 616(%rsp), %r8
vaddpd (%r8,%r14), %ymm3, %ymm4
movq 576(%rsp), %r8
vmulpd %ymm4, %ymm14, %ymm8
vmovups (%r8,%r14), %ymm15
vaddpd %ymm11, %ymm8, %ymm11
vfmadd132pd %ymm5, %ymm8, %ymm12
movq 608(%rsp), %r8
vaddpd (%r8,%r14), %ymm15, %ymm13
movq 568(%rsp), %r8
vmulpd %ymm13, %ymm6, %ymm8
vmovups (%r8,%r14), %ymm10
vaddpd %ymm12, %ymm8, %ymm3
movq 600(%rsp), %r8
vaddpd (%r8,%r14), %ymm10, %ymm4
movq 640(%rsp), %r8
vaddpd %ymm8, %ymm4, %ymm15
vaddpd %ymm3, %ymm4, %ymm13
vmovups (%r8,%r14), %ymm12
vaddpd %ymm11, %ymm15, %ymm11
vmovups (%rdx,%r14), %ymm15
movq 672(%rsp), %r8
vaddpd (%r8,%r14), %ymm12, %ymm10
vmulpd %ymm10, %ymm5, %ymm8
vaddpd %ymm13, %ymm8, %ymm3
movq 632(%rsp), %r8
vmovups (%r8,%r14), %ymm13
movq 664(%rsp), %r8
vaddpd (%r8,%r14), %ymm13, %ymm10
movq 656(%rsp), %r8
vmulpd %ymm10, %ymm14, %ymm4
vaddpd (%r8,%r14), %ymm15, %ymm13
vaddpd %ymm3, %ymm4, %ymm12
vaddpd %ymm8, %ymm4, %ymm10
vmulpd %ymm13, %ymm6, %ymm15
vmovups (%rcx,%r14), %ymm3
vaddpd %ymm12, %ymm15, %ymm4
movq 648(%rsp), %r8
vaddpd (%r8,%r14), %ymm3, %ymm8
vaddpd %ymm4, %ymm8, %ymm3
vaddpd %ymm15, %ymm8, %ymm13
vmovdqu 32(%r15,%r14), %ymm4
vaddpd -32(%r15,%r14), %ymm4, %ymm12
vaddpd %ymm10, %ymm13, %ymm10
vmulpd %ymm12, %ymm5, %ymm5
vaddpd %ymm3, %ymm5, %ymm15
vmovdqu (%r15,%r14), %ymm3
vperm2i128 $33, %ymm4, %ymm3, %ymm12
vpalignr $8, %ymm12, %ymm4, %ymm13
vperm2i128 $3, -32(%r15,%r14), %ymm3, %ymm4
vpalignr $8, -32(%r15,%r14), %ymm4, %ymm8
vaddpd %ymm13, %ymm8, %ymm8
vmulpd %ymm8, %ymm14, %ymm13
vaddpd %ymm15, %ymm13, %ymm8
vaddpd %ymm5, %ymm13, %ymm15
vaddpd %ymm12, %ymm4, %ymm5
vmulpd %ymm5, %ymm6, %ymm13
vpalignr $8, %ymm3, %ymm12, %ymm12
vpalignr $8, %ymm4, %ymm3, %ymm4
vaddpd %ymm8, %ymm13, %ymm5
vaddpd %ymm12, %ymm4, %ymm4
vaddpd %ymm5, %ymm4, %ymm8
vfmadd231pd %ymm2, %ymm3, %ymm4
vfmsub213pd %ymm7, %ymm1, %ymm3
vaddpd %ymm13, %ymm4, %ymm2
vaddpd %ymm15, %ymm2, %ymm5
vaddpd %ymm10, %ymm5, %ymm2
vaddpd %ymm8, %ymm3, %ymm5
vaddpd %ymm11, %ymm2, %ymm10
vmulpd %ymm10, %ymm9, %ymm1
vaddpd %ymm5, %ymm1, %ymm5
movq 288(%rsp), %r8
vmulpd .L_2il0floatpacket.322(%rip), %ymm5, %ymm2
vmovups (%r8,%r14), %ymm1
vaddpd %ymm0, %ymm2, %ymm0
movq 256(%rsp), %r8
vmovupd %ymm0, (%rsp)
vmovupd .L_2il0floatpacket.314(%rip), %ymm0
vaddpd (%r8,%r14), %ymm1, %ymm4
movq 296(%rsp), %r8
vmulpd %ymm4, %ymm0, %ymm2
vmovups (%r8,%r14), %ymm10
movq 264(%rsp), %r8
vaddpd (%r8,%r14), %ymm10, %ymm3
movq 688(%rsp), %r8
vmulpd %ymm3, %ymm14, %ymm1
vmovups (%r8,%r14), %ymm11
vfmadd132pd %ymm0, %ymm1, %ymm4
movq 272(%rsp), %r8
vaddpd (%r8,%r14), %ymm11, %ymm10
vmovups (%r10,%r14), %ymm11
vmulpd %ymm10, %ymm6, %ymm10
vaddpd %ymm4, %ymm10, %ymm8
movq 680(%rsp), %r8
vmovups (%r8,%r14), %ymm3
movq 280(%rsp), %r8
vaddpd (%r8,%r14), %ymm3, %ymm3
movq 232(%rsp), %r8
vaddpd %ymm8, %ymm3, %ymm13
vaddpd (%r8,%r14), %ymm11, %ymm12
vmovupd 96(%rsp), %ymm11
vmulpd %ymm12, %ymm0, %ymm4
vmovupd 128(%rsp), %ymm12
vaddpd %ymm13, %ymm4, %ymm8
vmovupd 64(%rsp), %ymm13
movq 248(%rsp), %r8
vmovups (%r8,%r14), %ymm15
movq 40(%rsp), %r8
..B399.98: # Preds ..B399.92
movq %r10, 48(%rsp)
incq %r13
movq 704(%rsp), %r10
vmovupd %ymm5, 160(%rsp)
vaddpd (%r10,%r14), %ymm15, %ymm15
vaddpd %ymm10, %ymm3, %ymm3
vmulpd %ymm15, %ymm14, %ymm15
movq 240(%rsp), %r10
vaddpd %ymm4, %ymm15, %ymm10
vaddpd %ymm8, %ymm15, %ymm8
vmovups (%r10,%r14), %ymm4
vaddpd (%rax,%r14), %ymm4, %ymm5
vmulpd %ymm5, %ymm6, %ymm5
vaddpd %ymm8, %ymm5, %ymm15
vmovups (%r11,%r14), %ymm8
vaddpd (%r9,%r14), %ymm8, %ymm8
vaddpd %ymm5, %ymm8, %ymm5
vaddpd %ymm15, %ymm8, %ymm4
vaddpd %ymm10, %ymm5, %ymm8
vmovdqu 32(%r8,%r14), %ymm10
vaddpd -32(%r8,%r14), %ymm10, %ymm5
vmulpd %ymm5, %ymm0, %ymm5
vmovdqu (%r8,%r14), %ymm0
vaddpd %ymm4, %ymm5, %ymm4
vfmadd231pd .L_2il0floatpacket.319(%rip), %ymm0, %ymm2
vfmsub231pd .L_2il0floatpacket.318(%rip), %ymm0, %ymm7
vaddpd %ymm2, %ymm1, %ymm2
vaddpd %ymm2, %ymm3, %ymm1
vperm2i128 $33, %ymm10, %ymm0, %ymm2
vpalignr $8, %ymm2, %ymm10, %ymm15
vperm2i128 $3, -32(%r8,%r14), %ymm0, %ymm10
vpalignr $8, -32(%r8,%r14), %ymm10, %ymm3
vaddpd %ymm15, %ymm3, %ymm3
vmulpd %ymm3, %ymm14, %ymm3
vaddpd %ymm4, %ymm3, %ymm15
vaddpd %ymm2, %ymm10, %ymm4
vaddpd %ymm5, %ymm3, %ymm5
vmulpd %ymm4, %ymm6, %ymm3
vpalignr $8, %ymm0, %ymm2, %ymm2
vpalignr $8, %ymm10, %ymm0, %ymm0
vmulpd .L_2il0floatpacket.325(%rip), %ymm12, %ymm10
vaddpd %ymm2, %ymm0, %ymm0
vaddpd %ymm15, %ymm3, %ymm15
vfmadd231pd .L_2il0floatpacket.324(%rip), %ymm11, %ymm10
vaddpd %ymm3, %ymm0, %ymm3
vaddpd %ymm15, %ymm0, %ymm15
vaddpd %ymm5, %ymm3, %ymm4
vaddpd %ymm15, %ymm7, %ymm7
vaddpd %ymm8, %ymm4, %ymm5
vmulpd .L_2il0floatpacket.329(%rip), %ymm12, %ymm4
vaddpd %ymm1, %ymm5, %ymm1
vmulpd %ymm9, %ymm1, %ymm9
vfmadd231pd .L_2il0floatpacket.328(%rip), %ymm11, %ymm4
vaddpd %ymm7, %ymm9, %ymm1
vmovupd 160(%rsp), %ymm9
vmulpd .L_2il0floatpacket.323(%rip), %ymm1, %ymm7
vmulpd .L_2il0floatpacket.326(%rip), %ymm9, %ymm8
vaddpd (%rsp), %ymm7, %ymm0
vaddpd %ymm10, %ymm8, %ymm2
movq 208(%rsp), %r10
vmovups (%r10,%r14), %ymm7
vfmadd213pd %ymm7, %ymm13, %ymm0
movq 304(%rsp), %r10
vmovupd %ymm0, (%r10,%r14)
vmulpd .L_2il0floatpacket.327(%rip), %ymm1, %ymm0
vaddpd %ymm2, %ymm0, %ymm3
vmulpd .L_2il0floatpacket.333(%rip), %ymm12, %ymm0
vmulpd .L_2il0floatpacket.334(%rip), %ymm9, %ymm12
vfmadd213pd %ymm7, %ymm13, %ymm3
vfmadd231pd .L_2il0floatpacket.332(%rip), %ymm11, %ymm0
vmulpd .L_2il0floatpacket.330(%rip), %ymm9, %ymm11
vmulpd .L_2il0floatpacket.331(%rip), %ymm1, %ymm9
vmulpd .L_2il0floatpacket.335(%rip), %ymm1, %ymm1
vaddpd %ymm4, %ymm11, %ymm2
vaddpd %ymm0, %ymm12, %ymm4
vmovupd .L_2il0floatpacket.314(%rip), %ymm0
vaddpd %ymm4, %ymm1, %ymm5
movq 424(%rsp), %r10
vmovupd %ymm3, (%r10,%r14)
vaddpd %ymm2, %ymm9, %ymm3
vfmadd213pd %ymm7, %ymm13, %ymm3
vfmadd231pd %ymm13, %ymm5, %ymm7
movq 560(%rsp), %r10
vmovupd %ymm3, (%r10,%r14)
movq 216(%rsp), %r10
vmovupd %ymm7, (%r10,%r14)
movq 48(%rsp), %r10
cmpq 696(%rsp), %r13
jl ..B399.92 # Prob 82%
from osaca.
A quick look with cProfile surprisingly seems to show that the majority of the time is actually spent in the list comprehensions in get_throughput_sum.
This includes the first comprehension, which creates a list of port pressures for the instructions whose throughput is nonzero:
port_pressures = [instr.port_pressure for instr in kernel if instr.throughput != 0.0]
and the final list sum over the columns
tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
At first I thought using numpy could avoid the 'zip' transposition, but it does not seem to bring any improvement:
tp_sum = np.round(np.sum(port_pressures_np, axis=0), 2).tolist()
I think these comprehensions are already optimized enough, and since they already take 80% of the runtime, it's probably not worth looking into the other functions. The assembly kernel I tested on was the 'triad' benchmark repeated about a hundred times over.
from osaca.
Related Issues (20)
- Hex immediate are not recognized. HOT 7
- Bug with working with A64FX assembly code. HOT 5
- Import asmbench result HOT 1
- Instruction is missing HOT 1
- x86 OR instruction breaks LCD analysis HOT 1
- x86 NOT instruction doesn't add a cycle to the critical path HOT 1
- [ARM] Multiply-accumulate output register dependency not recognized HOT 1
- iacaMarks.h compatibility HOT 1
- [ARM] Incorrect parsing of NEON instructions targeting individual lanes HOT 4
- [ARM] Incorrect dependency graph for store dependent on post-indexed load HOT 1
- Parallel LCD computation fails on macOS HOT 1
- OSACA fails on godbolt.org with KeyError: "Port 'D' not in port list." HOT 1
- Add new u-arch: AMD Zen4 (Genoa)
- Add new u-arch: Intel Sapphire Rapids (SPR)
- Add new u-arch: Neoverse N1 (Ampere Altra Max) HOT 1
- No throughput/latency information for vbroadcast HOT 2
- [REQUEST] Add support for handling comments generated by fcc compiler at the beginning of instruction code.
- [BUG] Register dependency not detected with x86 conditional move instructions
- [REQUEST] Support Intel assembly syntax HOT 2
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web. A new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization: using data as art.
-
Game
Something interesting about games, making everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from osaca.