|
|
|
|
# main (first listing): fills the buffer labelled main::a3 with 32-byte AVX
# stores, then returns 0.
# Register roles: rax = loop counter, rdx = byte offset into main::a3,
#                 ymm0 = vector stored this iteration, ymm1 = per-iteration step.
# NOTE(review): the contents of .LC0 / .LC1 are not visible in this listing;
# presumably the initial lane values and a constant increment. Confirm against
# the constant pool.
main:
vmovdqa ymm0, YMMWORD PTR .LC0[rip]
# rax = 0: loop counter (the return value is set separately after the loop).
xor eax, eax
vmovdqa ymm1, YMMWORD PTR .LC1[rip]
.L2:
# rdx = index of the current 32-byte chunk; rax already advances to the next one.
mov rdx, rax
add rax, 1
# Scale the chunk index to a byte offset (x32 = one YMM register).
sal rdx, 5
# Aligned store: vmovdqa requires the destination address to be 32-byte aligned.
vmovdqa YMMWORD PTR main::a3[rdx], ymm0
# Advance every packed 32-bit lane of ymm0 by the matching lane of ymm1.
vpaddd ymm0, ymm0, ymm1
# Unsigned compare: loop while rax <= 124999, i.e. 125000 iterations in total
# (125000 * 32 bytes = 4,000,000 bytes written).
cmp rax, 124999
jbe .L2
# Clear the upper halves of the YMM registers before returning.
vzeroupper
# Return 0.
xor eax, eax
ret
... |
|
|
# main (second listing): fills the buffer a3 with 32-byte AVX stores, then
# copies its first 40000 bytes into a2 and its first 400 bytes into a1 via
# memcpy, and returns 0.
# Register roles in the loop: rax = loop counter, rdx = byte offset into a3,
#                             ymm0 = vector stored this iteration, ymm1 = step.
# NOTE(review): the contents of .LC0 / .LC1 are not visible in this listing;
# presumably the initial lane values and a constant increment. Confirm against
# the constant pool.
main:
# --- Prologue: realign the stack to 32 bytes, remembering the original rsp ---
# r10 = entry rsp + 8 (the address just above the return address).
lea r10, [rsp+8]
# Round rsp down to a multiple of 32.
and rsp, -32
# rax = 0: loop counter for .L2 (scheduled here, in the middle of the prologue).
xor eax, eax
# Re-push the return address ([r10-8] is the entry [rsp]) onto the realigned stack.
push QWORD PTR [r10-8]
push rbp
mov rbp, rsp
# Save r10 so the epilogue can restore the original stack pointer.
push r10
# Three 8-byte pushes plus these 8 bytes total 32, so rsp is 32-byte aligned
# again (and therefore 16-byte aligned at the calls below).
sub rsp, 8
vmovdqa ymm0, YMMWORD PTR .LC0[rip]
vmovdqa ymm1, YMMWORD PTR .LC1[rip]
.L2:
# rdx = index of the current 32-byte chunk; rax already advances to the next one.
mov rdx, rax
add rax, 1
# Scale the chunk index to a byte offset (x32 = one YMM register).
sal rdx, 5
# Aligned store: vmovdqa requires the destination address to be 32-byte aligned.
vmovdqa YMMWORD PTR a3[rdx], ymm0
# Advance every packed 32-bit lane of ymm0 by the matching lane of ymm1.
vpaddd ymm0, ymm0, ymm1
# Unsigned compare: loop while rax <= 124999, i.e. 125000 iterations in total
# (125000 * 32 bytes = 4,000,000 bytes written).
cmp rax, 124999
jbe .L2
# memcpy(a2, a3, 40000): rdi = destination, rsi = source, rdx = byte count.
mov edx, 40000
mov esi, OFFSET FLAT:a3
mov edi, OFFSET FLAT:a2
# Clear the upper halves of the YMM registers before calling out.
vzeroupper
call memcpy
# memcpy(a1, a3, 400).
mov edx, 400
mov esi, OFFSET FLAT:a3
mov edi, OFFSET FLAT:a1
call memcpy
# --- Epilogue: undo the prologue in reverse order ---
add rsp, 8
# Return 0.
xor eax, eax
pop r10
pop rbp
# r10 - 8 is the entry rsp, which points at the original return address.
lea rsp, [r10-8]
ret
... |
With `const std::size_t N1 = 100, N2 = 10'000, N3 = 1'000'000;`, it really does not matter how the code is written: the time taken for filling (and shuffling) the large array will thoroughly dominate everything else.