Java...

Pages: 1... 789

7. JRE programmers will grab onto any excuses they can to justify why their code runs slower than native implementations.


8. My algorithm is poor or the library I was using is poor. I just checked that ParallelPrimeSwing Java implementation done by Peter Luschny does 500000! in 0.6 sec, 1000000! in 1.1 sec, and 2000000! in 2.4 sec on my computer. But he uses apfloat big num library. I will have to look closer at it.
Last edited on
Well I had several attempts here. I first partitioned the number into equal segments of 500000/4. Then I realised that this might not scale very well as the higher partitions would be processing larger numbers. So I then wrote a version that separates the calculation into every Nth item. I would imagine that scales better.

I only have the one core so I can't test scalability here. All my attempts take roughly the same time (40 sec). I compare them against the GMP built-in function, which takes a staggering 1.11 seconds!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#include <ctime>
#include <fstream>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

#include <gmp.h>

/**
 * Stream an arbitrary-precision integer as its base-10 digits.
 *
 * Uses a std::vector for the digit buffer so it is released even if
 * the stream insertion throws (the original raw new[]/delete[] pair
 * leaked the buffer on an exception).
 */
std::ostream& operator<<(std::ostream& os, mpz_t i)
{
	// +2: room for a possible '-' sign and the NUL terminator,
	// per the mpz_get_str buffer-size requirement.
	std::vector<char> buf(mpz_sizeinbase(i, 10) + 2);
	mpz_get_str(buf.data(), 10, i);
	return os << buf.data();
}

/**
 * Product of every integer in the closed interval [lo, hi].
 *
 * res is seeded with hi itself, then lo, lo+1, ..., hi-1 are folded
 * in. Note: lo is advanced in place and equals hi on return.
 */
inline
void factorial_i(mpz_t res, mpz_t lo, mpz_t hi)
{
	mpz_set(res, hi);
	while(mpz_cmp(lo, hi) < 0)
	{
		mpz_mul(res, res, lo);
		mpz_add_ui(lo, lo, 1);
	}
}

/**
 * Multiply res by the arithmetic progression lo, lo+inc, lo+2*inc, ...
 * for every term strictly below hi (e.g. 1 x 3 x 5 | 2 x 4 x 6 for
 * inc == 2). lo is advanced in place; res must be pre-seeded by the
 * caller.
 */
inline
void factorial_s(mpz_t res, mpz_t lo, mpz_t hi, size_t inc)
{
	while(mpz_cmp(lo, hi) < 0)
	{
		mpz_mul(res, res, lo);
		mpz_add_ui(lo, lo, inc);
	}
}

// Number of sub-processing units (partitions/worker threads) that each
// factorial implementation below splits the multiplication across.
const static size_t SZ = 4;

/**
 * Single threaded factorial (using intervals)
 *
 * Splits [1, n] into SZ consecutive segments, computes each segment's
 * product with factorial_i and multiplies the partial products into
 * ret. The last segment is stretched to n to absorb the remainder of
 * n / SZ.
 */
void factorial_st(mpz_t ret, size_t n)
{
	// Degenerate case: with n < SZ the partition size p is 0, so the
	// lower segments become [1, 0] and factorial_i seeds a 0 factor,
	// zeroing the whole result (the original returned 0 for any
	// n < SZ, including 0! which must be 1). Compute directly.
	if(n < SZ)
	{
		mpz_set_ui(ret, 1);
		for(size_t i = 2; i <= n; ++i)
			mpz_mul_ui(ret, ret, i);
		return;
	}

	size_t p = n / SZ;

	mpz_t lo[SZ];
	mpz_t hi[SZ];

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_init(lo[i]);
		mpz_init(hi[i]);
	}

	// Segment i covers [i*p + 1, i*p + p] ...
	for(size_t i = 0; i < SZ; ++i)
		mpz_set_ui(lo[i], (i * p) + 1);

	for(size_t i = 0; i < SZ - 1; ++i)
		mpz_set_ui(hi[i], (i * p) + p);

	// ... except the last, which runs to n itself.
	mpz_set_ui(hi[SZ - 1], n);

	mpz_t tmp;
	mpz_init(tmp);
	mpz_set_ui(ret, 1);

	for(size_t i = 0; i < SZ; ++i)
	{
		factorial_i(tmp, lo[i], hi[i]);
		mpz_mul(ret, ret, tmp);
	}

	mpz_clear(tmp);
	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_clear(lo[i]);
		mpz_clear(hi[i]);
	}
}

/**
 * Single threaded Factorial (using increments)
 *
 * Stream i (i = 0..SZ-1) multiplies the arithmetic progression
 * i+1, i+1+SZ, i+1+2*SZ, ... for all terms strictly below n; ret is
 * seeded with n itself, so the combined product is exactly n!.
 */
void factorial_st_inc(mpz_t ret, size_t n)
{
	// 0! is 1; without this guard ret would be seeded with hi == 0
	// and the result would be 0.
	if(n == 0)
	{
		mpz_set_ui(ret, 1);
		return;
	}

	mpz_t lo;
	mpz_t hi;
	mpz_t tmp;

	mpz_init(lo);
	mpz_init(hi);
	mpz_init(tmp);

	mpz_set_ui(hi, n);
	mpz_set(ret, hi);

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_set_ui(tmp, 1);
		mpz_set_ui(lo, i + 1);
		factorial_s(tmp, lo, hi, SZ);
		mpz_mul(ret, ret, tmp);
	}

	mpz_clear(lo);
	mpz_clear(hi);
	mpz_clear(tmp);
}

/**
 * Multi threaded factorial (using intervals)
 *
 * Same partitioning as factorial_st, but each segment product is
 * computed on its own thread via std::async. Every thread works
 * exclusively on its own lo[i]/hi[i]/tmp[i], so no synchronisation
 * is needed beyond future::get().
 */
void factorial_mt(mpz_t ret, size_t n)
{
	// Degenerate case: with n < SZ the partition size p is 0 and the
	// lower segments [1, 0] would contribute a 0 factor, zeroing the
	// result (including 0!, which must be 1). Compute directly.
	if(n < SZ)
	{
		mpz_set_ui(ret, 1);
		for(size_t i = 2; i <= n; ++i)
			mpz_mul_ui(ret, ret, i);
		return;
	}

	size_t p = n / SZ;

	mpz_t lo[SZ];
	mpz_t hi[SZ];
	mpz_t tmp[SZ];

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_init(lo[i]);
		mpz_init(hi[i]);
		mpz_init(tmp[i]);
	}

	// Segment i covers [i*p + 1, i*p + p]; the last is stretched to n.
	for(size_t i = 0; i < SZ; ++i)
		mpz_set_ui(lo[i], (i * p) + 1);

	for(size_t i = 0; i < SZ - 1; ++i)
		mpz_set_ui(hi[i], (i * p) + p);

	mpz_set_ui(hi[SZ - 1], n);

	std::future<void> res[SZ];

	// mpz_t decays to a pointer, so each worker writes through its
	// own tmp[i] directly.
	for(size_t i = 0; i < SZ; ++i)
		res[i] = std::async(std::launch::async, factorial_i, tmp[i], lo[i], hi[i]);

	mpz_set_ui(ret, 1);

	for(size_t i = 0; i < SZ; ++i)
	{
		res[i].get();
		mpz_mul(ret, ret, tmp[i]);
	}

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_clear(lo[i]);
		mpz_clear(hi[i]);
		mpz_clear(tmp[i]);
	}
}

/**
 * Multi threaded factorial (using increments)
 *
 * Thread i multiplies the arithmetic progression i+1, i+1+SZ, ...
 * strictly below n into its own accumulator tmp[i]; tmp[0] is seeded
 * with n itself so the combined product is exactly n!. Each thread
 * owns its tmp[i]/lo[i] exclusively and only reads hi.
 */
void factorial_mt_inc(mpz_t ret, size_t n)
{
	// 0! is 1; without this guard tmp[0] would be seeded with 0 and
	// zero out the whole result.
	if(n == 0)
	{
		mpz_set_ui(ret, 1);
		return;
	}

	mpz_t lo[SZ];
	mpz_t hi;
	mpz_t tmp[SZ];

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_init(lo[i]);
		mpz_init(tmp[i]);
	}
	mpz_init(hi);

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_set_ui(lo[i], i + 1);
		mpz_set_ui(tmp[i], 1);
	}

	mpz_set_ui(hi, n);
	mpz_set_ui(ret, 1);
	mpz_set(tmp[0], hi);

	std::future<void> res[SZ];

	for(size_t i = 0; i < SZ; ++i)
		res[i] = std::async(std::launch::async, factorial_s, tmp[i], lo[i], hi, SZ);

	for(size_t i = 0; i < SZ; ++i)
	{
		res[i].get();
		mpz_mul(ret, ret, tmp[i]);
	}

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_clear(lo[i]);
		mpz_clear(tmp[i]);
	}
	mpz_clear(hi);
}

int main()
{
	mpz_t ret;
	mpz_init(ret);

	std::ofstream ofs;

	std::clock_t b, e;

	const size_t n = 500000;

	// Single threaded

	mpz_set_ui(ret, 0);
	b = std::clock();
	factorial_st(ret, n);
	e = std::clock();

	std::cout << "ST  time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-st.txt");
	ofs << ret;
	ofs.close();

	// Single threaded inc

	mpz_set_ui(ret, 0);
	b = std::clock();
	factorial_st_inc(ret, n);
	e = std::clock();

	std::cout << "STI time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-sti.txt");
	ofs << ret;
	ofs.close();

	// Multi threaded

	mpz_set_ui(ret, 0);
	b = std::clock();
	factorial_mt(ret, n);
	e = std::clock();

	std::cout << "MT  time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-mt.txt");
	ofs << ret;
	ofs.close();

	// Multi threaded inc

	mpz_set_ui(ret, 0);
	b = std::clock();
	factorial_mt_inc(ret, n);
	e = std::clock();

	std::cout << "MTI time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-mti.txt");
	ofs << ret;
	ofs.close();

	// Reference library implementation

	mpz_set_ui(ret, 0);
	b = std::clock();
	mpz_fac_ui(ret, n);
	e = std::clock();

	std::cout << "Ref time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-ref.txt");
	ofs << ret;
	ofs.close();

	mpz_clear(ret);

	return 0;
}
ST  time: 40.83

STI time: 39.34

MT  time: 41.84

MTI time: 41.05

GMP time: 1.11 // HUH??!! I feel humbled...

// File MD5 sums to determine accuracy:

81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-mti.txt
81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-mt.txt
81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-ref.txt
81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-sti.txt
81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-st.txt


EDIT: By means of comparison I ran Helios' final version on my hardware:

Done in 35.51 s.
Not including output time: 29.86 s.

Which makes him about 25% faster on a single core :)
Last edited on
Well I discovered that GMP has a C++ interface which cleans up the code a great deal. It is a great example of how C++ is so much more usable than C. So here is the code using GMP C++ as well as some other general tweaking:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <fstream>
#include <iostream>
#include <ctime>
#include <thread>
#include <future>
#include <gmp.h>
#include <gmpxx.h>

// Convenience alias for GMP's C++ big-integer class.
typedef mpz_class big;

// Number of sub-processing units (worker threads) the factorial is
// split across.
const static size_t SZ = 4;

/**
 * Multiply res by the arithmetic progression lo, lo+inc, lo+2*inc, ...
 * for every term strictly below hi (e.g. 1 x 3 x 5 | 2 x 4 x 6 for
 * inc == 2) and hand the accumulated product back by reference.
 */
inline
big& fact_s(big& res, size_t lo, size_t hi, size_t inc)
{
	while(lo < hi)
	{
		res *= lo;
		lo += inc;
	}
	return res;
}

/**
 * Multi threaded factorial (using increments)
 *
 * Worker i multiplies the arithmetic progression i+1, i+1+SZ, ...
 * strictly below n into its own accumulator b[i]; b[0] is seeded
 * with n itself so the combined product is exactly n!. Each async
 * task owns b[i] exclusively, so future::get() is the only
 * synchronisation needed.
 */
void factorial_mt_inc(big& ret, size_t n)
{
	// 0! is 1; without this guard b[0] would be seeded with 0 and
	// zero out the whole result.
	if(n == 0)
	{
		ret = 1;
		return;
	}

	big b[SZ];

	b[0] = n;
	for(size_t i = 1; i < SZ; ++i)
		b[i] = 1;

	std::future<big&> res[SZ];
	for(size_t i = 0; i < SZ; ++i)
		res[i] = std::async(std::launch::async, [=,&b]()->big&{ return fact_s(b[i], i + 1, n, SZ); });

	ret = 1;
	for(size_t i = 0; i < SZ; ++i)
		ret *= res[i].get();
}

/**
 * Benchmark the threaded factorial against GMP's built-in
 * mpz_fac_ui, printing the elapsed CPU time for each and dumping
 * both results to files for comparison.
 */
int main()
{
	const size_t n = 500000;

	big ret = 0;

	std::clock_t begin = std::clock();
	factorial_mt_inc(ret, n);
	std::clock_t end = std::clock();

	std::cout << "Time: " << (double(end - begin) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	{
		std::ofstream ofs("fact-500000-galik.txt");
		ofs << ret;
	}

	// Reference library implementation

	ret = 0;
	begin = std::clock();
	mpz_fac_ui(ret.get_mpz_t(), n);
	end = std::clock();

	std::cout << "GMP time: " << (double(end - begin) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	{
		std::ofstream ofs("fact-500000-ref.txt");
		ofs << ret;
	}

	return 0;
}
Time: 40.39

GMP time: 1.12

You're still taking forever. Even with a single core, it shouldn't take you longer than 10 seconds to get 500000!. Try partitioning the input more. I had my threads partition their workload into ten pieces each.
Galik, I think you should benchmark print-out-to-string times as well. If you print the gmp number to decimal, its performance will drop quite a bit (this was my main motivation for changing my personal crappy LargeInt implementation to work over base 1000 000 000 rather than over 2^31; I figured more time is spent in printing out results than actually computing them. This is no longer true for my own crappy LargeInt library, but is definitely true for gmp).
Last edited on
helios wrote:
You're still taking forever. Even with a single core, it shouldn't take you longer than 10 seconds to get 500000!. Try partitioning the input more. I had my threads partition their workload into ten pieces each.

Yes, indeed you're right. I increased the number of processing units to 16 and got under 10 seconds for 500000! :)
1
2
3
Time: 9.94

GMP time: 1.13
Last edited on
Interesting thing about this thread is that the differences between performance of various programs tried here by different programmers are much higher than the usual differences between performance of various compiled programming language implementations. All of us have lost compared to professional factorial implementations (you lost compared to GMP, I lost significantly compared to PrimeSwing). Looking at the timings of GMP (1.13) and Java PrimeSwing (0.63), even though on different hardware, we can conclude they are probably very close to each other.
Last edited on
Primeswing was implemented in C++ as well. Look at Peter Luschny's own benchmarks, http://www.luschny.de/math/factorial/Benchmark.html . His times with the MPIR (C/C++) library are much (around 7 times for the entry on the top left corner of the table) faster than his Java implementation.

Let me copy Peter Luschny's benchmarks for everyone (material from http://www.luschny.de/math/factorial/Benchmark.html follows) :

MPIR Factorial Benchmark (2011)

CPU: AMD Athlon II X4 640, 3.0 GHz, RAM: 4.0 GB, OS: Win 7 (64 bit)

MPIR-2.4.0 Factorial Benchmark (Jun 14 2011)

(N x 10^6) !            1	 2	4	8	16	32	64	100
ParallelPrimeSwing	0.25	0.58	1.70	3.81	7.85	16.69	35.86	56.92
ParallelPrimeSchoenhage	  0.25	0.62	1.83	4.10	8.50	18.21	38.86	62.03
PrimeSwing	        0.28	0.66	1.92	4.21	8.81	18.86	41.62	65.01
PrimeSchoenhage		 0.27	0.66	1.92	4.24	8.89	18.95	40.67	65.33
MPIR-Library	        0.27	0.66	1.90	4.26	8.89	19.06	41.03	66.16




Java Factorial Benchmark (2011)

Java 1.6.0_25 Sun VM server (64 bit)
Arithmetic: Apfloat Library 1.6.2 by Mikko Tommila
CPU: AMD Athlon II X4 640, 3.0 GHz, RAM: 4.0 GB, OS: Win 7 (64 bit)

You can download the source code here.

Timings (seconds)

(N x 10^6)!	 	  1	  2	  4	  8	  16	  32
ParallelPrimeSwing	 2,3	 3,4	 6,8	 14,4	 30,4	 61,7
ParallelPrimeSplit	 1,8	 3,4	 7,0	 14,5	 29,8	 63,4
PrimeSchoenhage		  1,9	 4,2	 8,7	 18,1	 37,6	 79,6
PrimeSwingList		  1,9	 4,4	 8,7	 19,1	 38,3	 80,7
PrimeSwing	 	 2,1	 4,6	 8,8	 19,3	 40,2	 84,5
PrimeVardi	 	 2,0	 4,5	 9,0	 19,9	 40,1	 82,8
ParallelSplit	 	 3,8	 8,1	 17,2	 38,0	 72,8	149,1
ParallelSwing	 	 4,0	 8,9	 18,3	 39,5	 77,8	158,8
Split	 	 	  6,0	 13,4	 27,5	 90,5	121,9	234,3
Swing	 	 	 6,6	 14,4	 31,3	 69,4	134,4	255,9




****************
So, the world champion on factorials computes 1 million factorial 7 times faster in C/C++ than in Java.

That said, I looked at his code, and it was barely readable - he clearly is a professional mathematician, not a programmer, so he might have failed to do the optimal Java implementation. At any rate, the benchmarks are not in favor of Java.
Last edited on

he clearly is a professional mathematician, not a programmer, so he might have failed to do the optimal Java implementation


I'd rather expect the main culprit is the big num library using suboptimal algorithms. For such big numbers, implementation of bignums and its big O complexity really does matter the most. And Java was never the main interest of professional mathematicians doing big number crunching, so there are 10x as many C++ bignum libraries as there are for Java.

BTW: Luschny's Java factorial implementation, although slow, is still faster than the GMP's factorial implementation in C++. The most important thing is the algorithm, not the language.

Last edited on
I think every language is dope... Better learn 'em
Topic archived. No new replies allowed.
Pages: 1... 789