Hamming Weight/Population Count in T-Sql

Hamming weight/population count in T-SQL

You could use a helper table with precalculated Hamming weights for small numbers, like bytes, then split the value accordingly, join to the helper table and get the sum of partial Hamming weights as the value's Hamming weight:

-- define Hamming weight helper table
DECLARE @hwtally TABLE (byte tinyint, hw int);
INSERT INTO @hwtally (byte, hw) VALUES (0, 0);
INSERT INTO @hwtally (byte, hw) SELECT 1 - byte, 1 - hw FROM @hwtally;
INSERT INTO @hwtally (byte, hw) SELECT 3 - byte, 2 - hw FROM @hwtally;
INSERT INTO @hwtally (byte, hw) SELECT 7 - byte, 3 - hw FROM @hwtally;
INSERT INTO @hwtally (byte, hw) SELECT 15 - byte, 4 - hw FROM @hwtally;
INSERT INTO @hwtally (byte, hw) SELECT 31 - byte, 5 - hw FROM @hwtally;
INSERT INTO @hwtally (byte, hw) SELECT 63 - byte, 6 - hw FROM @hwtally;
INSERT INTO @hwtally (byte, hw) SELECT 127 - byte, 7 - hw FROM @hwtally;
INSERT INTO @hwtally (byte, hw) SELECT 255 - byte, 8 - hw FROM @hwtally;

-- calculate
WITH split AS (
SELECT SUBSTRING(@value, number, 1) AS byte
FROM master.dbo.spt_values
WHERE type = 'P' AND number BETWEEN 1 AND LEN(@value)
)
SELECT
Value = @value,
HammingWeight = SUM(t.hw)
FROM split s
INNER JOIN @hwtally t ON s.byte = t.byte

Fastest 64-bit population count (Hamming weight)

OK came to the conclusion that it was no idea of trying to be 'smart', I benched:

the built in intrinsic popcount: _mm_popcnt_u64

bmi2:__tzcnt_u64(~_pext_u64(data[i],data[i]));
against three assembler functions

popcnt, bmi2 and avx2.

They all run at the speed you can move memory in and out of my:

cat /proc/cpuinfo

-Intel(R) Xeon(R) CPU E3-1275 v3 @ 3.50GHz

FYI:

main.c:

// Hamming weight bench

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <smmintrin.h>
#include <immintrin.h>
#include <x86intrin.h>
#include <math.h>

#define DISPLAY_HEIGHT 4
#define DISPLAY_WIDTH 32
#define NUM_DATA_OBJECTS 40000000
#define ITTERATIONS 20

// The source data (+32 to avoid the quantization out of memory problem)
__attribute__ ((aligned(32))) static long long unsigned data[NUM_DATA_OBJECTS+32]={};
__attribute__ ((aligned(32))) static long long unsigned data_out[NUM_DATA_OBJECTS+32]={};
__attribute__ ((aligned(32))) static unsigned char k1[32*3]={
0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};

extern "C" {
void popcnt_popcnt(long long unsigned[],unsigned int,long long unsigned[]);
void popcnt_bmi2(long long unsigned[],unsigned int,long long unsigned[]);
void popcnt_avx2(long long unsigned[],unsigned int,long long unsigned[],unsigned char[]);
}

void populate_data()
{
for(unsigned int i = 0; i < NUM_DATA_OBJECTS; i++)
{
data[i] = rand();
}
}

void display_source_data()
{
printf ("\r\nData in(start):\r\n");
for (unsigned int j = 0; j < DISPLAY_HEIGHT; j++)
{
for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
{
printf ("0x%02llux,",data[i+(j*DISPLAY_WIDTH)]);
}
printf ("\r\n");
}
}

void bench_popcnt()
{
for(unsigned int i = 0; i < NUM_DATA_OBJECTS; i++)
{
data_out[i] = _mm_popcnt_u64(data[i]);
}
}

void bench_move_data_memcpy()
{
memcpy(data_out,data,NUM_DATA_OBJECTS*8);
}

// __tzcnt64 ??
void bench_bmi2()
{
for(unsigned int i = 0; i < NUM_DATA_OBJECTS; i++)
{
data_out[i]=__tzcnt_u64(~_pext_u64(data[i],data[i]));
}
}

void display_dest_data()
{
printf ("\r\nData out:\r\n");
for (unsigned int j = 0; j < DISPLAY_HEIGHT; j++)
{
for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
{
printf ("0x%02llux,",data_out[i+(j*DISPLAY_WIDTH)]);
}
printf ("\r\n");
}
}

int main() {
struct timeval t0;
struct timeval t1;
long elapsed[ITTERATIONS]={0};
long avrg=0;

for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_move_data_memcpy();
gettimeofday(&t1, 0);
elapsed[i]= (((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000);
printf ("Time_to_move_data_without_processing: %ld\n",elapsed[i]);
}

avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average time_to_move_data: %ld\n",avrg/(ITTERATIONS-1));

//display_dest_data();

for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_popcnt();
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("popcnt: %ld\n",elapsed[i]);
}

avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average popcnt: %ld\n",avrg/(ITTERATIONS-1));

//display_dest_data();

for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_bmi2();
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("bmi2: %ld\n",elapsed[i]);
}

avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average bmi2: %ld\n",avrg/(ITTERATIONS-1));

//display_dest_data();

printf ("Now test the assembler functions\n");

for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_popcnt(data,NUM_DATA_OBJECTS,data_out);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("popcnt_asm: %ld\n",elapsed[i]);
}

avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average popcnt_asm: %ld\n",avrg/(ITTERATIONS-1));

//display_dest_data();

for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_bmi2(data,NUM_DATA_OBJECTS,data_out);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("bmi2_asm: %ld\n",elapsed[i]);
}

avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average bmi2_asm: %ld\n",avrg/(ITTERATIONS-1));

//display_dest_data();

for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_avx2(data,(unsigned int)ceil((NUM_DATA_OBJECTS*8)/32.0),data_out,k1);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("avx2_asm: %ld\n",elapsed[i]);
}

avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average avx2_asm: %ld\n",avrg/(ITTERATIONS-1));

//display_dest_data();

return 0;
}

The engine.s

//
// avx2_bmi2_popcnt bench
//

.global popcnt_bmi2 , popcnt_avx2, popcnt_popcnt
.align 2

//64-bit popcnt using the built-in popcnt instruction
popcnt_popcnt:
popcntq (%rdi), %r11
mov %r11,(%rdx)
add $8,%rdi
add $8,%rdx
dec %rsi
jnz popcnt_popcnt
ret

//64-bit popcnt using BMI2
popcnt_bmi2:
mov (%rdi),%r11
pextq %r11,%r11,%r11
not %r11
tzcnt %r11,%r11
mov %r11,(%rdx)
add $8,%rdi
add $8,%rdx
dec %rsi
jnz popcnt_bmi2
ret

//64-bit popcnt using AVX2
popcnt_avx2:
vmovdqa (%rcx),%ymm2
add $0x20,%rcx
vmovdqa (%rcx),%ymm3
add $0x20,%rcx
vmovdqa (%rcx),%ymm4
popcnt_avx2_loop:
vmovdqa (%rdi),%ymm0
vpand %ymm0, %ymm2, %ymm1
vpandn %ymm0, %ymm2, %ymm0
vpsrld $4,%ymm0, %ymm0
vpshufb %ymm1, %ymm3, %ymm1
vpshufb %ymm0, %ymm3, %ymm0
vpaddb %ymm1,%ymm0,%ymm0
vpsadbw %ymm0,%ymm4,%ymm0
vmovdqa %ymm0,(%rdx)
add $0x20,%rdi
add $0x20,%rdx
dec %rsi
jnz popcnt_avx2_loop
ret

Compile the sources:

g++ -march=native -mavx -mpopcnt -O3 main.c engine.s

set the CPU to performance:

cpufreq-set -g performance

Run the bench:

sudo chrt -r 10 ./a.out

Result:

Average time_to_move_data: 61

Average popcnt: 61

Average bmi2: 61

Now test the assembler functions

Average popcnt_asm: 61

Average bmi2_asm: 61

Average avx2_asm: 61

Bit-Count or Hamming-weight of a BitString in Elixir?

Here is a better performing solution, which (for me) also shows the intention more clearly:

for(<<bit::1 <- :binary.encode_unsigned(n)>>, do: bit) |> Enum.sum

Benchmark using benchfella with 100.000 binary digits:

Benchfella.start

defmodule HammingBench do
use Benchfella

@n Stream.repeatedly(fn -> Enum.random [0, 1] end)
|> Enum.take(100_000)
|> Enum.join
|> String.to_integer(2)

bench "CharlesO" do
Enum.count(Integer.to_char_list(@n,2),&(&1===49))
end

bench "Patrick Oscity" do
for(<<bit::1 <- :binary.encode_unsigned(@n)>>, do: bit) |> Enum.sum
end
end

Benchmark results:

$ mix bench
Compiled lib/hamming_bench.ex
Generated hamming_bench app
Settings:
duration: 1.0 s

## HammingBench
[20:12:03] 1/2: Patrick Oscity
[20:12:06] 2/2: CharlesO

Finished in 8.4 seconds

## HammingBench
Patrick Oscity 500 4325.79 µs/op
CharlesO 1 5754094.00 µs/op

Number of set elements in column of type SET (population count)

BIT_COUNT(bars) returns the number of one bits in the SET.

I did not know/expect that you can simply use the SET datatype like an integer.

  • SQLFiddle demo

Mysql convert varchar binary representation field to binary to do hamming distance calculation with bit_count

Working code:

SELECT BIT_COUNT( CONV( hash, 2, 10 ) ^ 
0b0000000101100111111100011110000011100000111100011011111110011011 )


Related Topics



Leave a reply



Submit