Hi, this question is actually good for practicing, not only math but coding as well.
I was using
boost::timer::auto_cpu_timer
to measure execution time, and also cut generated assembly code to count number of instructions.
Initially I wrote the code dealing with individual bits in a part of the code,
(bitwise addition / multiplication), in hope it will be "fast", but that didn't work since compiler optimization did much better job than than my "bits" lol.
Here is statistics (as well as the code) for the code you posted:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
|
#include <iostream>
using std::cin;
using std::cout;
using std::endl;
#include <boost/timer/timer.hpp>
using boost::timer::auto_cpu_timer;
/*
1*2 + 1*3 + 1*4 + 1*5 ..... 1*n +
2*3 + 2*4 + 2*5 + 2*6 + ....2*n +
3*4 + 3*5 + 3*6 + 3*7 +.....3*n +
*/
int main()
{
typedef unsigned long long var;
const var n = 100'000;
var s = 0;
{ // begin measuring cpu time
auto_cpu_timer timer;
for (var i = 1; i < n; i++)
{
for (var j = i + 1; j <= n; j++)
{
s = s + i * j;
}
}
}
cout << "result is: " << s << endl;
cin.get();
return 0;
}
|
and here is program output of the above code together with execution time:
20.072684s wall, 20.000000s user + 0.000000s system = 20.000000s CPU (99.6%)
result is: 12500083332083325000 |
Here is my version of the code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
|
#include <iostream>
using std::cin;
using std::cout;
using std::endl;
#include <boost/timer/timer.hpp>
using boost::timer::auto_cpu_timer;
/*
1*2 + 1*3 + 1*4 + 1*5 ..... 1*n +
2*3 + 2*4 + 2*5 + 2*6 + ....2*n +
3*4 + 3*5 + 3*6 + 3*7 +.....3*n +
*/
int main()
{
typedef unsigned long long var;
const var n = 100'000;
var check = n;
var factor = 1;
var result = 0;
{ // begin measuring cpu time
auto_cpu_timer timer;
for (var s; (s = 0, check = n) > factor; result += s * factor++)
while (check > factor) s += check--;
}
cout << "result is: " << result << endl;
cin.get();
return 0;
}
|
and here is program output of the above code together with execution time:
17.963140s wall, 17.796875s user + 0.015625s system = 17.812500s CPU (99.2%)
result is: 12500083332083325000 |
As you can see the second version of the code runs 3 seconds faster (huh, not bad),
however time length is platform dependent of course.
and finally here is generated assembly code, in both cases it begins on timer construction and ends at timer destruction.
original version:
call ??0auto_cpu_timer@timer@boost@@QEAA@F@Z ; boost::timer::auto_cpu_timer::auto_cpu_timer
; Line 26
jmp SHORT $LN4@main
$LN2@main:
mov rax, QWORD PTR s$5[rsp]
imul rax, QWORD PTR factor$[rsp]
mov rcx, QWORD PTR result$[rsp]
add rcx, rax
mov rax, rcx
mov QWORD PTR result$[rsp], rax
mov rax, QWORD PTR factor$[rsp]
inc rax
mov QWORD PTR factor$[rsp], rax
$LN4@main:
mov QWORD PTR s$5[rsp], 0
mov QWORD PTR check$[rsp], 100000 ; 000186a0H
mov rax, QWORD PTR factor$[rsp]
cmp QWORD PTR check$[rsp], rax
jbe SHORT $LN3@main
$LN5@main:
; Line 27
mov rax, QWORD PTR factor$[rsp]
cmp QWORD PTR check$[rsp], rax
jbe SHORT $LN6@main
mov rax, QWORD PTR check$[rsp]
mov rcx, QWORD PTR s$5[rsp]
add rcx, rax
mov rax, rcx
mov QWORD PTR s$5[rsp], rax
mov rax, QWORD PTR check$[rsp]
dec rax
mov QWORD PTR check$[rsp], rax
jmp SHORT $LN5@main
$LN6@main:
jmp $LN2@main
$LN3@main:
; Line 28
lea rcx, QWORD PTR timer$4[rsp]
call ??1auto_cpu_timer@timer@boost@@QEAA@XZ ; boost::timer::auto_cpu_timer::~auto_cpu_timer |
modified version:
call ??0auto_cpu_timer@timer@boost@@QEAA@F@Z ; boost::timer::auto_cpu_timer::auto_cpu_timer
; Line 25
mov QWORD PTR i$5[rsp], 1
jmp SHORT $LN4@main
$LN2@main:
mov rax, QWORD PTR i$5[rsp]
inc rax
mov QWORD PTR i$5[rsp], rax
$LN4@main:
cmp QWORD PTR i$5[rsp], 100000 ; 000186a0H
jae SHORT $LN3@main
; Line 26
mov rax, QWORD PTR i$5[rsp]
inc rax
mov QWORD PTR j$6[rsp], rax
jmp SHORT $LN7@main
$LN5@main:
mov rax, QWORD PTR j$6[rsp]
inc rax
mov QWORD PTR j$6[rsp], rax
$LN7@main:
cmp QWORD PTR j$6[rsp], 100000 ; 000186a0H
ja SHORT $LN6@main
; Line 27
mov rax, QWORD PTR i$5[rsp]
imul rax, QWORD PTR j$6[rsp]
mov rcx, QWORD PTR s$[rsp]
add rcx, rax
mov rax, rcx
mov QWORD PTR s$[rsp], rax
jmp SHORT $LN5@main
$LN6@main:
jmp SHORT $LN2@main
$LN3@main:
; Line 28
lea rcx, QWORD PTR timer$4[rsp]
call ??1auto_cpu_timer@timer@boost@@QEAA@XZ ; boost::timer::auto_cpu_timer::~auto_cpu_timer |
as you can see small optimization has been achieved in both execution time and generated code.
I'm not saying it's perfect of course, there could a lot of room to generate better code but at this moment I'm not sure how.
I hope there are math gurus in this forums to jump here into discussion and give some genius solutions.