Java...

Pages: 1... 789

7. JRE programmers will grab onto any excuses they can to justify why their code runs slower than native implementations.


8. My algorithm is poor or the library I was using is poor. I just checked that ParallelPrimeSwing Java implementation done by Peter Luschny does 500000! in 0.6 sec, 1000000! in 1.1 sec, and 2000000! in 2.4 sec on my computer. But he uses apfloat big num library. I will have to look closer at it.
Last edited on
Well I had several attempts here. I first partitioned the number into equal segments of 500000/4. Then I realised that this might not scale very well as the higher partitions would be processing larger numbers. So I then wrote a version that separates the calculation into every Nth item. I would imagine that scales better.

I only have the one core so I can't test scalability here. All my attempts take roughly the same time (40 sec). I compare them against the GMP built-in function, which takes a staggering 1.11 seconds!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#include <ctime>
#include <fstream>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

#include <gmp.h>

/**
 * Stream an arbitrary-precision integer as its base-10 digits.
 *
 * Uses a std::vector for the digit buffer so it is released even if
 * the stream insertion throws (the original raw new[]/delete[] pair
 * leaked the buffer on an exception).
 */
std::ostream& operator<<(std::ostream& os, mpz_t i)
{
	// +2: room for a possible '-' sign and the NUL terminator,
	// per the mpz_get_str buffer-size requirement.
	std::vector<char> buf(mpz_sizeinbase(i, 10) + 2);
	mpz_get_str(buf.data(), 10, i);
	return os << buf.data();
}

/**
 * Product of every integer in the closed interval [lo, hi].
 *
 * res is seeded with hi itself, then lo, lo+1, ..., hi-1 are folded
 * in. Note: lo is advanced in place and equals hi on return.
 */
inline
void factorial_i(mpz_t res, mpz_t lo, mpz_t hi)
{
	mpz_set(res, hi);
	while(mpz_cmp(lo, hi) < 0)
	{
		mpz_mul(res, res, lo);
		mpz_add_ui(lo, lo, 1);
	}
}

/**
 * Multiply res by the arithmetic progression lo, lo+inc, lo+2*inc, ...
 * for every term strictly below hi (e.g. 1 x 3 x 5 | 2 x 4 x 6 for
 * inc == 2). lo is advanced in place; res must be pre-seeded by the
 * caller.
 */
inline
void factorial_s(mpz_t res, mpz_t lo, mpz_t hi, size_t inc)
{
	while(mpz_cmp(lo, hi) < 0)
	{
		mpz_mul(res, res, lo);
		mpz_add_ui(lo, lo, inc);
	}
}

// Number of sub-processing units (partitions/worker threads) that each
// factorial implementation below splits the multiplication across.
const static size_t SZ = 4;

/**
 * Single threaded factorial (using intervals)
 *
 * Splits [1, n] into SZ consecutive segments, computes each segment's
 * product with factorial_i and multiplies the partial products into
 * ret. The last segment is stretched to n to absorb the remainder of
 * n / SZ.
 */
void factorial_st(mpz_t ret, size_t n)
{
	// Degenerate case: with n < SZ the partition size p is 0, so the
	// lower segments become [1, 0] and factorial_i seeds a 0 factor,
	// zeroing the whole result (the original returned 0 for any
	// n < SZ, including 0! which must be 1). Compute directly.
	if(n < SZ)
	{
		mpz_set_ui(ret, 1);
		for(size_t i = 2; i <= n; ++i)
			mpz_mul_ui(ret, ret, i);
		return;
	}

	size_t p = n / SZ;

	mpz_t lo[SZ];
	mpz_t hi[SZ];

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_init(lo[i]);
		mpz_init(hi[i]);
	}

	// Segment i covers [i*p + 1, i*p + p] ...
	for(size_t i = 0; i < SZ; ++i)
		mpz_set_ui(lo[i], (i * p) + 1);

	for(size_t i = 0; i < SZ - 1; ++i)
		mpz_set_ui(hi[i], (i * p) + p);

	// ... except the last, which runs to n itself.
	mpz_set_ui(hi[SZ - 1], n);

	mpz_t tmp;
	mpz_init(tmp);
	mpz_set_ui(ret, 1);

	for(size_t i = 0; i < SZ; ++i)
	{
		factorial_i(tmp, lo[i], hi[i]);
		mpz_mul(ret, ret, tmp);
	}

	mpz_clear(tmp);
	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_clear(lo[i]);
		mpz_clear(hi[i]);
	}
}

/**
 * Single threaded Factorial (using increments)
 *
 * Stream i (i = 0..SZ-1) multiplies the arithmetic progression
 * i+1, i+1+SZ, i+1+2*SZ, ... for all terms strictly below n; ret is
 * seeded with n itself, so the combined product is exactly n!.
 */
void factorial_st_inc(mpz_t ret, size_t n)
{
	// 0! is 1; without this guard ret would be seeded with hi == 0
	// and the result would be 0.
	if(n == 0)
	{
		mpz_set_ui(ret, 1);
		return;
	}

	mpz_t lo;
	mpz_t hi;
	mpz_t tmp;

	mpz_init(lo);
	mpz_init(hi);
	mpz_init(tmp);

	mpz_set_ui(hi, n);
	mpz_set(ret, hi);

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_set_ui(tmp, 1);
		mpz_set_ui(lo, i + 1);
		factorial_s(tmp, lo, hi, SZ);
		mpz_mul(ret, ret, tmp);
	}

	mpz_clear(lo);
	mpz_clear(hi);
	mpz_clear(tmp);
}

/**
 * Multi threaded factorial (using intervals)
 *
 * Same partitioning as factorial_st, but each segment product is
 * computed on its own thread via std::async. Every thread works
 * exclusively on its own lo[i]/hi[i]/tmp[i], so no synchronisation
 * is needed beyond future::get().
 */
void factorial_mt(mpz_t ret, size_t n)
{
	// Degenerate case: with n < SZ the partition size p is 0 and the
	// lower segments [1, 0] would contribute a 0 factor, zeroing the
	// result (including 0!, which must be 1). Compute directly.
	if(n < SZ)
	{
		mpz_set_ui(ret, 1);
		for(size_t i = 2; i <= n; ++i)
			mpz_mul_ui(ret, ret, i);
		return;
	}

	size_t p = n / SZ;

	mpz_t lo[SZ];
	mpz_t hi[SZ];
	mpz_t tmp[SZ];

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_init(lo[i]);
		mpz_init(hi[i]);
		mpz_init(tmp[i]);
	}

	// Segment i covers [i*p + 1, i*p + p]; the last is stretched to n.
	for(size_t i = 0; i < SZ; ++i)
		mpz_set_ui(lo[i], (i * p) + 1);

	for(size_t i = 0; i < SZ - 1; ++i)
		mpz_set_ui(hi[i], (i * p) + p);

	mpz_set_ui(hi[SZ - 1], n);

	std::future<void> res[SZ];

	// mpz_t decays to a pointer, so each worker writes through its
	// own tmp[i] directly.
	for(size_t i = 0; i < SZ; ++i)
		res[i] = std::async(std::launch::async, factorial_i, tmp[i], lo[i], hi[i]);

	mpz_set_ui(ret, 1);

	for(size_t i = 0; i < SZ; ++i)
	{
		res[i].get();
		mpz_mul(ret, ret, tmp[i]);
	}

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_clear(lo[i]);
		mpz_clear(hi[i]);
		mpz_clear(tmp[i]);
	}
}

/**
 * Multi threaded factorial (using increments)
 *
 * Thread i multiplies the arithmetic progression i+1, i+1+SZ, ...
 * strictly below n into its own accumulator tmp[i]; tmp[0] is seeded
 * with n itself so the combined product is exactly n!. Each thread
 * owns its tmp[i]/lo[i] exclusively and only reads hi.
 */
void factorial_mt_inc(mpz_t ret, size_t n)
{
	// 0! is 1; without this guard tmp[0] would be seeded with 0 and
	// zero out the whole result.
	if(n == 0)
	{
		mpz_set_ui(ret, 1);
		return;
	}

	mpz_t lo[SZ];
	mpz_t hi;
	mpz_t tmp[SZ];

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_init(lo[i]);
		mpz_init(tmp[i]);
	}
	mpz_init(hi);

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_set_ui(lo[i], i + 1);
		mpz_set_ui(tmp[i], 1);
	}

	mpz_set_ui(hi, n);
	mpz_set_ui(ret, 1);
	mpz_set(tmp[0], hi);

	std::future<void> res[SZ];

	for(size_t i = 0; i < SZ; ++i)
		res[i] = std::async(std::launch::async, factorial_s, tmp[i], lo[i], hi, SZ);

	for(size_t i = 0; i < SZ; ++i)
	{
		res[i].get();
		mpz_mul(ret, ret, tmp[i]);
	}

	for(size_t i = 0; i < SZ; ++i)
	{
		mpz_clear(lo[i]);
		mpz_clear(tmp[i]);
	}
	mpz_clear(hi);
}

int main()
{
	mpz_t ret;
	mpz_init(ret);

	std::ofstream ofs;

	std::clock_t b, e;

	const size_t n = 500000;

	// Single threaded

	mpz_set_ui(ret, 0);
	b = std::clock();
	factorial_st(ret, n);
	e = std::clock();

	std::cout << "ST  time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-st.txt");
	ofs << ret;
	ofs.close();

	// Single threaded inc

	mpz_set_ui(ret, 0);
	b = std::clock();
	factorial_st_inc(ret, n);
	e = std::clock();

	std::cout << "STI time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-sti.txt");
	ofs << ret;
	ofs.close();

	// Multi threaded

	mpz_set_ui(ret, 0);
	b = std::clock();
	factorial_mt(ret, n);
	e = std::clock();

	std::cout << "MT  time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-mt.txt");
	ofs << ret;
	ofs.close();

	// Multi threaded inc

	mpz_set_ui(ret, 0);
	b = std::clock();
	factorial_mt_inc(ret, n);
	e = std::clock();

	std::cout << "MTI time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-mti.txt");
	ofs << ret;
	ofs.close();

	// Reference library implementation

	mpz_set_ui(ret, 0);
	b = std::clock();
	mpz_fac_ui(ret, n);
	e = std::clock();

	std::cout << "Ref time: " << (double(e - b) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	ofs.open("fact-500000-ref.txt");
	ofs << ret;
	ofs.close();

	mpz_clear(ret);

	return 0;
}
ST  time: 40.83

STI time: 39.34

MT  time: 41.84

MTI time: 41.05

GMP time: 1.11 // HUH??!! I feel humbled...

// File MD5 sums to determine accuracy:

81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-mti.txt
81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-mt.txt
81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-ref.txt
81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-sti.txt
81e98fd0b6fa6cafb396b024c330bdc0  fact-500000-st.txt


EDIT: By means of comparison I ran Helios' final version on my hardware:

Done in 35.51 s.
Not including output time: 29.86 s.

Which makes him about 25% faster on a single core :)
Last edited on
Well I discovered that GMP has a C++ interface which cleans up the code a great deal. It is a great example of how C++ is so much more usable than C. So here is the code using GMP C++ as well as some other general tweaking:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <fstream>
#include <iostream>
#include <ctime>
#include <thread>
#include <future>
#include <gmp.h>
#include <gmpxx.h>

// Convenience alias for GMP's C++ big-integer class.
typedef mpz_class big;

// Number of sub-processing units (worker threads) the factorial is
// split across.
const static size_t SZ = 4;

/**
 * Multiply res by the arithmetic progression lo, lo+inc, lo+2*inc, ...
 * for every term strictly below hi (e.g. 1 x 3 x 5 | 2 x 4 x 6 for
 * inc == 2) and hand the accumulated product back by reference.
 */
inline
big& fact_s(big& res, size_t lo, size_t hi, size_t inc)
{
	while(lo < hi)
	{
		res *= lo;
		lo += inc;
	}
	return res;
}

/**
 * Multi threaded factorial (using increments)
 *
 * Worker i multiplies the arithmetic progression i+1, i+1+SZ, ...
 * strictly below n into its own accumulator b[i]; b[0] is seeded
 * with n itself so the combined product is exactly n!. Each async
 * task owns b[i] exclusively, so future::get() is the only
 * synchronisation needed.
 */
void factorial_mt_inc(big& ret, size_t n)
{
	// 0! is 1; without this guard b[0] would be seeded with 0 and
	// zero out the whole result.
	if(n == 0)
	{
		ret = 1;
		return;
	}

	big b[SZ];

	b[0] = n;
	for(size_t i = 1; i < SZ; ++i)
		b[i] = 1;

	std::future<big&> res[SZ];
	for(size_t i = 0; i < SZ; ++i)
		res[i] = std::async(std::launch::async, [=,&b]()->big&{ return fact_s(b[i], i + 1, n, SZ); });

	ret = 1;
	for(size_t i = 0; i < SZ; ++i)
		ret *= res[i].get();
}

/**
 * Benchmark the threaded factorial against GMP's built-in
 * mpz_fac_ui, printing the elapsed CPU time for each and dumping
 * both results to files for comparison.
 */
int main()
{
	const size_t n = 500000;

	big ret = 0;

	std::clock_t begin = std::clock();
	factorial_mt_inc(ret, n);
	std::clock_t end = std::clock();

	std::cout << "Time: " << (double(end - begin) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	{
		std::ofstream ofs("fact-500000-galik.txt");
		ofs << ret;
	}

	// Reference library implementation

	ret = 0;
	begin = std::clock();
	mpz_fac_ui(ret.get_mpz_t(), n);
	end = std::clock();

	std::cout << "GMP time: " << (double(end - begin) / CLOCKS_PER_SEC) << '\n';
	std::cout << '\n';

	{
		std::ofstream ofs("fact-500000-ref.txt");
		ofs << ret;
	}

	return 0;
}
Time: 40.39

GMP time: 1.12

You're still taking forever. Even with a single core, it shouldn't take you longer than 10 seconds to get 500000!. Try partitioning the input more. I had my threads partition their workload into ten pieces each.
Galik, I think you should benchmark print-out-to-string times as well. If you print the gmp number to decimal, its performance will drop quite a bit (this was my main motivation for changing my personal crappy LargeInt implementation to work over base 1000 000 000 rather than over 2^31; I figured more time is spent in printing out results than actually computing them. This is no longer true for my own crappy LargeInt library, but is definitely true for gmp).
Last edited on
helios wrote:
You're still taking forever. Even with a single core, it shouldn't take you longer than 10 seconds to get 500000!. Try partitioning the input more. I had my threads partition their workload into ten pieces each.

Yes, indeed you're right. I increased the number of processing units to 16 and got under 10 seconds for 500000! :)
1
2
3
Time: 9.94

GMP time: 1.13
Last edited on
Interesting thing about this thread is that the differences between performance of various programs tried here by different programmers are much higher than the usual differences between performance of various compiled programming language implementations. All of us have lost compared to professional factorial implementations (you lost compared to GMP, I lost significantly compared to PrimeSwing). Looking at the timings of GMP (1.13) and Java PrimeSwing (0.63), even though on different hardware, we can conclude they are probably very close to each other.
Last edited on
Primeswing was implemented in C++ as well. Look at Peter Luschny's own benchmarks, http://www.luschny.de/math/factorial/Benchmark.html . His times with the MPIR (C/C++) library are much (around 7 times for the entry on the top left corner of the table) faster than his Java implementation.

Let me copy Peter Luschny's benchmarks for everyone (material from http://www.luschny.de/math/factorial/Benchmark.html follows) :

MPIR Factorial Benchmark (2011)

CPU: AMD Athlon II X4 640, 3.0 GHz, RAM: 4.0 GB, OS: Win 7 (64 bit)

MPIR-2.4.0 Factorial Benchmark (Jun 14 2011)

(N x 10^6) !            1	 2	4	8	16	32	64	100
ParallelPrimeSwing	0.25	0.58	1.70	3.81	7.85	16.69	35.86	56.92
ParallelPrimeSchoenhage	  0.25	0.62	1.83	4.10	8.50	18.21	38.86	62.03
PrimeSwing	        0.28	0.66	1.92	4.21	8.81	18.86	41.62	65.01
PrimeSchoenhage		 0.27	0.66	1.92	4.24	8.89	18.95	40.67	65.33
MPIR-Library	        0.27	0.66	1.90	4.26	8.89	19.06	41.03	66.16




Java Factorial Benchmark (2011)

Java 1.6.0_25 Sun VM server (64 bit)
Arithmetic: Apfloat Library 1.6.2 by Mikko Tommila
CPU: AMD Athlon II X4 640, 3.0 GHz, RAM: 4.0 GB, OS: Win 7 (64 bit)

You can download the source code here.

Timings (seconds)

(N x 10^6)!	 	  1	  2	  4	  8	  16	  32
ParallelPrimeSwing	 2,3	 3,4	 6,8	 14,4	 30,4	 61,7
ParallelPrimeSplit	 1,8	 3,4	 7,0	 14,5	 29,8	 63,4
PrimeSchoenhage		  1,9	 4,2	 8,7	 18,1	 37,6	 79,6
PrimeSwingList		  1,9	 4,4	 8,7	 19,1	 38,3	 80,7
PrimeSwing	 	 2,1	 4,6	 8,8	 19,3	 40,2	 84,5
PrimeVardi	 	 2,0	 4,5	 9,0	 19,9	 40,1	 82,8
ParallelSplit	 	 3,8	 8,1	 17,2	 38,0	 72,8	149,1
ParallelSwing	 	 4,0	 8,9	 18,3	 39,5	 77,8	158,8
Split	 	 	  6,0	 13,4	 27,5	 90,5	121,9	234,3
Swing	 	 	 6,6	 14,4	 31,3	 69,4	134,4	255,9




****************
So, the world champion on factorials computes 1 million factorial 7 times faster in C/C++ than in Java.

That said, I looked at his code, and it was barely readable - he clearly is a professional mathematician, not a programmer, so he might have failed to do the optimal Java implementation. At any rate, the benchmarks are not in favor of Java.
Last edited on

he clearly is a professional mathematician, not a programmer, so he might have failed to do the optimal Java implementation


I'd rather expect the main culprit is the big num library using suboptimal algorithms. For such big numbers, implementation of bignums and its big O complexity really does matter the most. And Java was never the main interest of professional mathematicians doing big number crunching, so there are 10x as many C++ bignum libraries as there are for Java.

BTW: Luschny's Java factorial implementation, although slow, is still faster than the GMP's factorial implementation in C++. The most important thing is the algorithm, not the language.

Last edited on
I think every language is dope... Better learn 'em
Topic archived. No new replies allowed.
Pages: 1... 789