Prime Numbers by Eratosthenes Quicker Sequential Than Concurrently

Prime numbers by Eratosthenes quicker sequential than concurrently?

I am no JAVA coder so I stick with C++. Also this is not an direct answer to your question (sorry for that but I can not debug JAVA) take this as some pointers which way to go or check...

  1. Sieves of Eratosthenes

    Parallelization is possible but not with big enough speed gain. Instead I use more sieve-tabs where each one have its own sub-divisions and each table size is an common multiply of all its sub-divisors. This way you need initiate tables just once and then just check them in O(1)

  2. Parallelization

    After checking all of the sieves then I would use threads to do the obvious division testing for all of the unused divisors

  3. Memoize

    If you have active table of all found primes then divide just by primes and add all new primes found

I am using non parallel prime search which is fast enough for me ...

  • You can adapt this to your parallel code ...

[Edit1] updated code

//---------------------------------------------------------------------------
int bits(DWORD p)
{
DWORD m=0x80000000; int b=32;
for (;m;m>>=1,b--)
if (p>=m) break;
return b;
}
//---------------------------------------------------------------------------
DWORD sqrt(const DWORD &x)
{
DWORD m,a;
m=(bits(x)>>1);
if (m) m=1<<m; else m=1;
for (a=0;m;m>>=1) { a|=m; if (a*a>x) a^=m; }
return a;
}
//---------------------------------------------------------------------------
List<int> primes_i32; // list of precomputed primes
const int primes_map_sz=4106301; // max size of map for speedup search for primes max(LCM(used primes per bit)) (not >>1 because SOE is periodic at double LCM size and only odd values are stored 2/2=1)
const int primes_map_N[8]={ 4106301,3905765,3585337,4026077,3386981,3460469,3340219,3974653 };
const int primes_map_i0=33; // first index of prime not used in mask
const int primes_map_p0=137; // biggest prime used in mask
BYTE primes_map[primes_map_sz]; // factors map for first i0-1 primes
bool primes_i32_alloc=false;
int isprime(int p)
{
int i,j,a,b,an,im[8]; BYTE u;
an=0;
if (!primes_i32.num) // init primes vars
{
primes_i32.allocate(1024*1024);
primes_i32.add( 2); for (i=1;i<primes_map_sz;i++) primes_map[i]=255; primes_map[0]=254;
primes_i32.add( 3); for (u=255- 1,j= 3,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 5); for (u=255- 2,j= 5,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 7); for (u=255- 4,j= 7,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 11); for (u=255- 8,j= 11,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 13); for (u=255- 16,j= 13,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 17); for (u=255- 32,j= 17,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 19); for (u=255- 64,j= 19,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 23); for (u=255-128,j= 23,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 29); for (u=255- 1,j=137,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 31); for (u=255- 2,j=131,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 37); for (u=255- 4,j=127,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 41); for (u=255- 8,j=113,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 43); for (u=255- 16,j= 83,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 47); for (u=255- 32,j= 61,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 53); for (u=255- 64,j=107,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 59); for (u=255-128,j=101,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 61); for (u=255- 1,j=103,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 67); for (u=255- 2,j= 67,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 71); for (u=255- 4,j= 37,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 73); for (u=255- 8,j= 41,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 79); for (u=255- 16,j= 43,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 83); for (u=255- 32,j= 47,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 89); for (u=255- 64,j= 53,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add( 97); for (u=255-128,j= 59,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add(101); for (u=255- 1,j= 97,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add(103); for (u=255- 2,j= 89,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add(107); for (u=255- 4,j=109,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add(109); for (u=255- 8,j= 79,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add(113); for (u=255- 16,j= 73,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add(127); for (u=255- 32,j= 71,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add(131); for (u=255- 64,j= 31,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
primes_i32.add(137); for (u=255-128,j= 29,i=j>>1;i<primes_map_sz;i+=j) primes_map[i]&=u;
}

if (!primes_i32_alloc)
{
if (p <=1) return 0; // ignore too small walues
if (p&1==0) return 0; // not prime if even
if (p>primes_map_p0)
{
j=p>>1;
i=j; if (i>=primes_map_N[0]) i%=primes_map_N[0]; if (!(primes_map[i]& 1)) return 0;
i=j; if (i>=primes_map_N[1]) i%=primes_map_N[1]; if (!(primes_map[i]& 2)) return 0;
i=j; if (i>=primes_map_N[2]) i%=primes_map_N[2]; if (!(primes_map[i]& 4)) return 0;
i=j; if (i>=primes_map_N[3]) i%=primes_map_N[3]; if (!(primes_map[i]& 8)) return 0;
i=j; if (i>=primes_map_N[4]) i%=primes_map_N[4]; if (!(primes_map[i]& 16)) return 0;
i=j; if (i>=primes_map_N[5]) i%=primes_map_N[5]; if (!(primes_map[i]& 32)) return 0;
i=j; if (i>=primes_map_N[6]) i%=primes_map_N[6]; if (!(primes_map[i]& 64)) return 0;
i=j; if (i>=primes_map_N[7]) i%=primes_map_N[7]; if (!(primes_map[i]&128)) return 0;
}
}

an=primes_i32[primes_i32.num-1];
if (an>=p)
{
// linear table search
if (p<127) // 31st prime
{
if (an>=p) for (i=0;i<primes_i32.num;i++)
{
a=primes_i32[i];
if (a==p) return 1;
if (a> p) return 0;
}
}
// approximation table search
else{
for (j=1,i=primes_i32.num-1;j<i;j<<=1); j>>=1; if (!j) j=1;
for (i=0;j;j>>=1)
{
i|=j;
if (i>=primes_i32.num) { i-=j; continue; }
a=primes_i32[i];
if (a==p) return 1;
if (a> p) i-=j;
}
return 0;
}
}
a=an; a+=2;
for (j=a>>1,i=0;i<8;i++) im[i]=j%primes_map_N[i];
an=(1<<((bits(p)>>1)+1))-1; if (an<=0) an=1;
an=an+an;
for (;a<=p;a+=2)
{
for (j=1,i=0;i<8;i++,j<<=1) // check if map is set
if (!(primes_map[im[i]]&j)) { j=-1; break; } // if not dont bother with division
for (i=0;i<8;i++) { im[i]++; if (im[i]>=primes_map_N[i]) im[i]-=primes_map_N[i]; }
if (j<0) continue;
for (i=primes_map_i0;i<primes_i32.num;i++)
{
b=primes_i32[i];
if (b>an) break;
if ((a%b)==0) { i=-1; break; }
}
if (i<0) continue;
primes_i32.add(a);
if (a==p) return 1;
if (a> p) return 0;
}
return 0;
}
//---------------------------------------------------------------------------
void getprimes(int p) // compute and allocate primes up to p
{
if (!primes_i32.num) isprime(3);
int p0=primes_i32[primes_i32.num-1]; // biggest prime computed yet
if (p>p0+10000) // if too big difference use sieves to fast precompute
{
// T((0.3516+0.5756*log10(n))*n) -> O(n.log(n))
// sieves N/16 bytes p=100 000 000 t=1903.031 ms
// ------------------------------
// 0 1 2 3 4 5 6 7 bit
// ------------------------------
// 1 3 5 7 9 11 13 15 +-> +2
// 17 19 21 23 25 27 29 31 |
// 33 35 37 39 41 43 45 47 V +16
// ------------------------------
int N=(p|15),M=(N>>4); // store only odd values 1,3,5,7,... each bit ...
char *m=new char[M+1]; // m[i] -> is 1+i+i prime? (factors map)
int i,j,k,n;
// init sieves
m[0]=254; for (i=1;i<=M;i++) m[i]=255;
for(n=sqrt(p),i=1;i<=n;)
{
int a=m[i>>4];
if (int(a& 1)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2;
if (int(a& 2)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2;
if (int(a& 4)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2;
if (int(a& 8)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2;
if (int(a& 16)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2;
if (int(a& 32)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2;
if (int(a& 64)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2;
if (int(a&128)!=0) for(k=i+i,j=i+k;j<=N;j+=k) m[j>>4]&=255-(1<<((j>>1)&7)); i+=2;
}
// compute primes
i=p0&0xFFFFFFF1; k=m[i>>4]; // start after last found prime in list
if ((int(k& 1)!=0)&&(i>p0)) primes_i32.add(i); i+=2;
if ((int(k& 2)!=0)&&(i>p0)) primes_i32.add(i); i+=2;
if ((int(k& 4)!=0)&&(i>p0)) primes_i32.add(i); i+=2;
if ((int(k& 8)!=0)&&(i>p0)) primes_i32.add(i); i+=2;
if ((int(k& 16)!=0)&&(i>p0)) primes_i32.add(i); i+=2;
if ((int(k& 32)!=0)&&(i>p0)) primes_i32.add(i); i+=2;
if ((int(k& 64)!=0)&&(i>p0)) primes_i32.add(i); i+=2;
if ((int(k&128)!=0)&&(i>p0)) primes_i32.add(i); i+=2;
for(j=i>>4;j<M;i+=16,j++) // continue with 16-blocks
{
k=m[j];
if (!k) continue;
if (int(k& 1)!=0) primes_i32.add(i );
if (int(k& 2)!=0) primes_i32.add(i+ 2);
if (int(k& 4)!=0) primes_i32.add(i+ 4);
if (int(k& 8)!=0) primes_i32.add(i+ 6);
if (int(k& 16)!=0) primes_i32.add(i+ 8);
if (int(k& 32)!=0) primes_i32.add(i+10);
if (int(k& 64)!=0) primes_i32.add(i+12);
if (int(k&128)!=0) primes_i32.add(i+14);
}
k=m[j]; // do the last primes
if ((int(k& 1)!=0)&&(i<=p)) primes_i32.add(i); i+=2;
if ((int(k& 2)!=0)&&(i<=p)) primes_i32.add(i); i+=2;
if ((int(k& 4)!=0)&&(i<=p)) primes_i32.add(i); i+=2;
if ((int(k& 8)!=0)&&(i<=p)) primes_i32.add(i); i+=2;
if ((int(k& 16)!=0)&&(i<=p)) primes_i32.add(i); i+=2;
if ((int(k& 32)!=0)&&(i<=p)) primes_i32.add(i); i+=2;
if ((int(k& 64)!=0)&&(i<=p)) primes_i32.add(i); i+=2;
if ((int(k&128)!=0)&&(i<=p)) primes_i32.add(i); i+=2;
delete[] m;
}
else{
bool b0=primes_i32_alloc;
primes_i32_alloc=true;
isprime(p);
primes_i32_alloc=false;
primes_i32_alloc=b0;
}
}
//---------------------------------------------------------------------------
  • solved some overflow bugs in mine code (periodicity of sieves ...)

  • also some further optimizations

  • added getprimes(p) function which add all primes<=p to the list fast as it can if they are not there yet

  • tested correctness on first 1 000 000 primes (up to 15 485 863)

  • getprimes(15 485 863) solves it on 175.563 ms on mine setup

  • isprime is way slower for this of coarse

  • primes_i32 is a dynamic list of ints

  • primes_i32.num is the number of ints in the list

  • primes_i32[i] is the i-th int i = <0,primes_i32.num-1>

  • primes_i32.add(x) add x to the end of list

  • primes_i32.allocate(N) preallocates space for N items in list to avoid reallocation slowdowns

[notes]

I have used this non parallel version for Euler problem 10 (sum of all primes below 2000000)

    ----------------------------------------------------------------------------------
Time ID Reference | My solution | Note
----------------------------------------------------------------------------------
[ 35.639 ms] Problem010. 142913828922 | 142913828922 | 64_bit
  • The sieve tabs are each one as a bit slice in the primes_map[] array and only the odd values are used (no need to remember even sieves).
  • if you want maximize speed for all primes found then just call isprime(max value) and read the contents of primes_i32[]
  • I use half of the bit-size instead of sqrt for speed

Hope I did not forget to copy something here

How to Implement a Faster Algorithm for Searching for Primes in C

OP's code limits factor testing with for (i = 2; i <= n / 2; ++i) { - a modest first step.

Yet past the square root of n , there are no factors. Since the root is smaller (usually) , than n/2, this is much quicker.

We could use test_factor * test_factor <= n, but test_factor * test_factor may overflow. Use test_factor <= n/test_factor. Smart compilers will see the nearby n/test_factor and n%test_factor and emit code that executes about as fast as just one of those.

Code could calculate int limit = sqrt(n), but that 1) has edge case problems as sqrt() may no return exactly the sought after root even when it is a whole number 2) may lose needed precession when the integer type has more precision that double. 3) Incurs a floating point cost vs. the nearly no additional cost of test_factor <= n/test_factor.

OP's code sets a flag when a factor is found and then continues to look for more. Yet there is no reason to continue looking. This speeds things up a lot to quit the loop once a factor is found.

We could also check every other value as numbers divided by 2 are not prime - with one exception: 2. This is a modest improvement.

bool is_prime(int n) {
if (n % 2 == 0) {
return n == 2;
}
for (int test_factor = 3; test_factor <= n / test_factor; test_factor += 2) {
if (n % test_factor == 0) {
return false;
}
}
return n > 1; // No factors and n is more than 1.
}

Use

for (n = 2; n < b; n++) {
if (is_prime(n)) {
printf("%d is a prime number.\n", n);
}
}

Note: less than 2 seconds to print all the primes less than 1000000.


For a sieve approach. See Sieve of Eratosthenes pseudo code.

What is the most efficient algorithm to give out prime numbers, up to very high values (all a 32bit machine can handle)

If you need to test an handful, possibly only one, of primes, the AKS primality test is polynomial in the length of n.

If you want to find a very big prime, of cryptographic size, then select a random range of odd numbers and sieve out all the numbers whose factors are small primes (e.g. less equal than 64K-240K) then test the remaining numbers for primality.

If you want to find the primes in a range then use a sieve, the sieve of Erathostenes is very easy to implement but run slower and require more memory.

The sieve of Atkin is faster, the wheels sieve requires far less memory.

The size of the problem is exponential if approached naively so before micro-optimising is mandatory to first macro-optimise.

More or less all prime numbers algorithms require confidence with Number theory, so take particular attention to the group/ring/field the algorithm is working on because mathematicians write operations like the inverse or the multiplication with the same symbol for all the algebraic structures.

Once you have a fast algorithm, you can start micro-optimising.

At this level it's really impossible to answer how to proceed with such optimisations.

Summation of primes' takes too long

Apply some straightforward optimizations:

  • list numbers should not be used because each number can be calculated based on an index
  • simplified initialization of Isprime.

For 1'000'000 got:

the sum of all prime numbers below 1000000 is 37548466742 tap to continue
long sum = 0;
int howmanyChecked = 1;
int target = 1000000;
int index = -1;
var Isprime = Enumerable.Repeat(true, target).ToArray();

while (1 > 0)
{
index = Array.IndexOf(Isprime, true, index + 1);

int Selected = index + 2;
howmanyChecked++;

sum += Selected;
//Console.WriteLine($"selected prime number is {Selected}");
//int startfrom =numbers.IndexOf(Selected * Selected);
if (Selected >= target / 2)
{
Console.WriteLine("ss");
for (int i = index + 1; i < target - 1; i++)

{
if (Isprime[i] == true)
{
Console.WriteLine(i + 2);
sum += i + 2;
}
}
Console.WriteLine($"the sum of all prime nubers below {target} is {sum} tap to continue");
Console.ReadLine();
break;
}
else
{
for (int i = Selected; i * Selected <= target; i++)
{
int k = i * Selected - 2;
if (k < 0)
break;
if (Isprime[k] == true)
{
Isprime[k] = false;
howmanyChecked++;
//Console.WriteLine($"Checked number is {Selected * i} and we have counted {howmanyChecked} numbers");
}
}
}
if (howmanyChecked == target || index == target)
break;

}

Console.ReadLine();

Segmented Prime sieve

Here is a more concise formulation of the same algorithm, which should make the principle much clearer (part of full, runnable .cpp for segment size timings @ pastebin). This initialises a packed (odds-only) sieve instead of counting primes but the principles involved are the same. Download and run the .cpp to see the influence of segment sizes. Basically, the optimum should be around the L1 cache size for your CPU. Too small, and the overhead due to increased number of rounds starts dominating; too big, and you'll get penalised by the slower timings of the L2 and L3 caches. See also How does segmentation improve the running time of Sieve of Eratosthenes?.

void initialise_packed_sieve_4G (void *data, unsigned segment_bytes = 1 << 15, unsigned end_bit = 1u << 31)
{
typedef std::vector<prime_and_offset_t>::iterator prime_iter_t;
std::vector<prime_and_offset_t> small_factors;

initialise_odd_primes_and_offsets_64K(small_factors);

unsigned segment_bits = segment_bytes * CHAR_BIT;
unsigned partial_bits = end_bit % segment_bits;
unsigned segments = end_bit / segment_bits + (partial_bits != 0);

unsigned char *segment = static_cast<unsigned char *>(data);
unsigned bytes = segment_bytes;

for ( ; segments--; segment += segment_bytes)
{
if (segments == 0 && partial_bits)
{
segment_bits = partial_bits;
bytes = (partial_bits + CHAR_BIT - 1) / CHAR_BIT;
}

std::memset(segment, 0, bytes);

for (prime_iter_t p = small_factors.begin(); p != small_factors.end(); ++p)
{
unsigned n = p->prime;
unsigned i = p->next_offset;

for ( ; i < segment_bits; i += n)
{
set_bit(segment, i);
}

p->next_offset = i - segment_bits;
}
}
}

If the offsets were not remembered from segment to segment then they would have to be recomputed every time using at least one division and one multiplication per re-computed index, plus conditionals or serious bit trickery. When sieving the full 2^32 number range (8192 segments of 32 KBytes each) that's at least 53,583,872 slow divisions and the same number of somewhat faster multiplications; that's roughly one second added to the time needed for initialising a full 2^32 sieve (2^31 bits for the odds-only Eratosthenes).

Here's some actual code from one of my older sieves that uses this 'reconstitutive' math:

for (index_t k = 1; k <= max_factor_bit; ++k)
{
if (bitmap_t::traits::bt(bm.bm, k)) continue;

index_t n = (k << 1) + 1; // == index_for_value(value_for_index(k) * 2) == n
index_t i = square(n) >> 1; // == index_for_value(square(n))

if (i < offset)
{
i += ((offset - i) / n) * n;
}

for ( ; i <= new_max_bit; i += n)
{
bitmap_t::traits::bts(bm.bm, i);
}
}

That takes about 5.5 seconds for the full sieve (VC++); the code shown first takes only 4.5 seconds with the same compiler, or 3.5 seconds using gcc 4.8.1 (MinGW64).

Here are the gcc timings:

sieve bits = 2147483648 (equiv. number = 4294967295)

segment size 4096 (2^12) bytes ... 4.091 s 1001.2 M/s
segment size 8192 (2^13) bytes ... 3.723 s 1100.2 M/s
segment size 16384 (2^14) bytes ... 3.534 s 1159.0 M/s
segment size 32768 (2^15) bytes ... 3.418 s 1198.4 M/s
segment size 65536 (2^16) bytes ... 3.894 s 1051.9 M/s
segment size 131072 (2^17) bytes ... 4.265 s 960.4 M/s
segment size 262144 (2^18) bytes ... 4.453 s 919.8 M/s
segment size 524288 (2^19) bytes ... 5.002 s 818.9 M/s
segment size 1048576 (2^20) bytes ... 5.176 s 791.3 M/s
segment size 2097152 (2^21) bytes ... 5.135 s 797.7 M/s
segment size 4194304 (2^22) bytes ... 5.251 s 780.0 M/s
segment size 8388608 (2^23) bytes ... 7.412 s 552.6 M/s

digest { 203280221, 0C903F86, 5B253F12, 774A3204 }

Note: an additional second can be shaved from that time by a trick called 'presieving', i.e. blasting a pre-computed pattern into the bitmap instead of zeroing it at the beginning. That brings the gcc timing down to 2.1 s for the full sieve, with this hacked copy of the earlier .cpp. This trick works extremely well together with segmented sieving in cache-sized chunks.

Python Prime number finder

You have to check all the numbers before you can say it's prime. Currently, your loop exits at the first check (which is d == 2) and return False if p % 2 == 0, else True.

You should put the else statement at the end of the loop, like this:

while True:
p = int(input('Enter a number '))
for d in range(2, p):
if p % d == 0:
print(p, "is not a prime number!", d,"*", p//d,"=",p)
break
else:
print(p, "is a prime number!")

The else is executed only if the loop did not end with break. This means that if not any divider was found, your number is prime.

Moreover, note that you do not have to check numbers until p, you can stop at sqrt(p) and iterate two by two: for d in range(3, int(p**0.5) + 1, 2).



Related Topics



Leave a reply



Submit