Python Decompression Relative Performance

Python decompression relative performance?

You can use Python-blosc

It is very fast, and for small arrays (<2 GB) it is also quite easy to use. On easily compressible data like your example, it is often faster to compress the data for I/O operations (SATA SSD: about 500 MB/s, PCIe SSD: up to 3500 MB/s). In the decompression step, the array allocation is the most costly part. If your images are of similar shape, you can avoid repeated memory allocation.

Example

A contiguous array is assumed for the following example.

import blosc
import numpy as np
import pickle

def compress(arr, Path):
    #c = blosc.compress_ptr(arr.__array_interface__['data'][0], arr.size, arr.dtype.itemsize, clevel=3, cname='lz4', shuffle=blosc.SHUFFLE)
    c = blosc.compress_ptr(arr.__array_interface__['data'][0], arr.size, arr.dtype.itemsize, clevel=3, cname='zstd', shuffle=blosc.SHUFFLE)
    f = open(Path, "wb")
    pickle.dump((arr.shape, arr.dtype), f)
    f.write(c)
    f.close()
    return c, arr.shape, arr.dtype

def decompress(Path):
    f = open(Path, "rb")
    shape, dtype = pickle.load(f)
    c = f.read()
    #array allocation takes most of the time
    arr = np.empty(shape, dtype)
    blosc.decompress_ptr(c, arr.__array_interface__['data'][0])
    return arr

#Pass a preallocated array if you have many similar images
def decompress_pre(Path, arr):
    f = open(Path, "rb")
    shape, dtype = pickle.load(f)
    c = f.read()
    #no allocation needed; decompress directly into the preallocated array
    blosc.decompress_ptr(c, arr.__array_interface__['data'][0])
    return arr

Benchmarks

#blosc.SHUFFLE, cname='zstd' -> 4728 KB
%timeit compress(arr,"Test.dat")
1.03 s ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#611 MB/s
%timeit decompress("Test.dat")
146 ms ± 481 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#4310 MB/s
%timeit decompress_pre("Test.dat",arr)
50.9 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#12362 MB/s

#blosc.SHUFFLE, cname='lz4' -> 9118 KB
%timeit compress(arr,"Test.dat")
32.1 ms ± 437 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#19602 MB/s
%timeit decompress("Test.dat")
146 ms ± 332 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#4310 MB/s
%timeit decompress_pre("Test.dat",arr)
53.6 ms ± 82.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#11740 MB/s

Edit

This version is more for general use. It handles C-contiguous, F-contiguous and non-contiguous arrays, as well as arrays larger than 2 GB. Also have a look at bloscpack.

import blosc
import numpy as np
import pickle

def compress(file, arr, clevel=3, cname='lz4', shuffle=1):
    """
    file     path to file
    arr      numpy nd-array
    clevel   0..9
    cname    blosclz, lz4, lz4hc, snappy, zlib, zstd
    shuffle  0 -> no shuffle, 1 -> shuffle, 2 -> bitshuffle
    """
    max_blk_size = 100_000_000  #100 MB

    shape = arr.shape
    #object dtype is not implemented
    if arr.dtype == object:
        raise TypeError("dtype object is not implemented")

    #Handling of Fortran-ordered arrays (avoid copy)
    is_f_contiguous = False
    if arr.flags['F_CONTIGUOUS'] == True:
        is_f_contiguous = True
        arr = arr.T.reshape(-1)
    else:
        arr = np.ascontiguousarray(arr.reshape(-1))

    #Writing
    max_num = max_blk_size // arr.dtype.itemsize
    num_chunks = arr.size // max_num

    if arr.size % max_num != 0:
        num_chunks += 1

    f = open(file, "wb")
    pickle.dump((shape, arr.size, arr.dtype, is_f_contiguous, num_chunks, max_num), f)
    size = np.empty(1, np.uint32)
    num_write = max_num
    for i in range(num_chunks):
        if max_num * (i + 1) > arr.size:
            num_write = arr.size - max_num * i
        c = blosc.compress_ptr(arr[max_num * i:].__array_interface__['data'][0], num_write,
                               arr.dtype.itemsize, clevel=clevel, cname=cname, shuffle=shuffle)
        size[0] = len(c)
        size.tofile(f)
        f.write(c)
    f.close()

def decompress(file, prealloc_arr=None):
    f = open(file, "rb")
    shape, arr_size, dtype, is_f_contiguous, num_chunks, max_num = pickle.load(f)

    if prealloc_arr is None:
        arr = np.empty(arr_size, dtype)
    else:
        if prealloc_arr.flags['F_CONTIGUOUS'] == True:
            prealloc_arr = prealloc_arr.T
        if prealloc_arr.flags['C_CONTIGUOUS'] != True:
            raise TypeError("Contiguous array is needed")
        arr = np.frombuffer(prealloc_arr.data, dtype=dtype, count=arr_size)

    for i in range(num_chunks):
        size = np.fromfile(f, np.uint32, count=1)
        c = f.read(size[0])
        blosc.decompress_ptr(c, arr[max_num * i:].__array_interface__['data'][0])
    f.close()

    #reshape
    if is_f_contiguous:
        arr = arr.reshape(shape[::-1]).T
    else:
        arr = arr.reshape(shape)
    return arr
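
For reference, a minimal usage sketch of the two functions above; the file name and the random array are arbitrary examples, not part of the original answer.

import numpy as np

#Hypothetical usage of the compress/decompress functions above
arr = np.random.rand(1000, 1000).astype(np.float32)
compress("Test.blp", arr, clevel=3, cname='lz4', shuffle=1)
restored = decompress("Test.blp")
assert np.array_equal(arr, restored)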

What's the most space-efficient way to compress serialized Python data?

I've done some tests using a pickled object; lzma gave the best compression.

But your results can vary based on your data, I'd recommend testing them with some sample data of your own.

Mode                 LastWriteTime         Length Name
----                 -------------         ------ ----
-a----         9/17/2019  10:05 PM       23869925 no_compression.pickle
-a----         9/17/2019  10:06 PM        6050027 gzip_test.gz
-a----         9/17/2019  10:06 PM        3083128 bz2_test.pbz2
-a----         9/17/2019  10:07 PM        1295013 brotli_test.bt
-a----         9/17/2019  10:06 PM        1077136 lzma_test.xz

Test file used (you'll need to pip install brotli or remove that algorithm):

import bz2
import gzip
import lzma
import pickle

import brotli


class SomeObject():

    a = 'some data'
    b = 123
    c = 'more data'

    def __init__(self, i):
        self.i = i


data = [SomeObject(i) for i in range(1, 1000000)]

with open('no_compression.pickle', 'wb') as f:
    pickle.dump(data, f)

with gzip.open("gzip_test.gz", "wb") as f:
    pickle.dump(data, f)

with bz2.BZ2File('bz2_test.pbz2', 'wb') as f:
    pickle.dump(data, f)

with lzma.open("lzma_test.xz", "wb") as f:
    pickle.dump(data, f)

with open('no_compression.pickle', 'rb') as f:
    pdata = f.read()
    with open('brotli_test.bt', 'wb') as b:
        b.write(brotli.compress(pdata))

Python decompressing gzip chunk-by-chunk

gzip and zlib use slightly different headers.

See How can I decompress a gzip stream with zlib?

Try d = zlib.decompressobj(16+zlib.MAX_WBITS).

And you might try changing your chunk size to a power of 2 (say CHUNKSIZE=1024) for possible performance reasons.
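Putting those two hints together, a minimal sketch of chunk-by-chunk gzip decompression with zlib could look like the following; the function and variable names are illustrative, not from the original question.

import zlib

CHUNKSIZE = 1024  # power-of-two chunk size

def gunzip_stream(infile, outfile):
    # 16 + zlib.MAX_WBITS tells zlib to expect a gzip header
    d = zlib.decompressobj(16 + zlib.MAX_WBITS)
    while True:
        chunk = infile.read(CHUNKSIZE)
        if not chunk:
            break
        outfile.write(d.decompress(chunk))
    outfile.write(d.flush())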

Memory usage increases when building a large NumPy array

Using the h5py package, I can create an hdf5 file that contains a dataset representing the array a from the question; the dset variable plays the same role as a. This allows the array to reside on disk, not in memory. The generated hdf5 file is 8 GB on disk, which matches the size of an array of np.float32 values with this shape. The elapsed time for this approach is similar to the examples discussed in the question, so writing to the hdf5 file seems to have a negligible performance impact.

import numpy as np
import h5py
import time

def main():

    rng = np.random.default_rng()

    tic = time.perf_counter()

    z = 500   # depth
    x = 2000  # rows
    y = 2000  # columns

    f = h5py.File('file.hdf5', 'w')
    dset = f.create_dataset('data', shape=(z, x, y), dtype=np.float32)

    for i in range(z):
        r = rng.standard_normal((x, y), dtype=np.float32)
        dset[i, :, :] = r

    toc = time.perf_counter()
    print('elapsed time =', round(toc - tic, 2), 'sec')

    s = np.float32().nbytes * (z * x * y) / 1e9  # where 1 GB = 1000 MB
    print('calculated storage =', s, 'GB')

if __name__ == '__main__':
    main()

Output from running this example on a MacBook Pro with 2.6 GHz 6-Core Intel Core i7 and 32 GB of RAM:

elapsed time = 22.97 sec
calculated storage = 8.0 GB

Running the memory profiler for this example gives the plot shown below. The peak memory usage is about 100 MiB, which is drastically lower than in the examples demonstrated in the question.

[memory profiler plot for the hdf5 example]
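
If you want to reproduce such a measurement without plotting, one option is the sketch below; it assumes the memory_profiler package is installed and reuses the main() function from the example above.

from memory_profiler import memory_usage

# Sample the memory of main() every 0.1 s and report the peak in MiB
usage = memory_usage((main, (), {}), interval=0.1)
print('peak memory =', round(max(usage), 1), 'MiB')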

Using Python to compress data into e.g. 12-bit chunks?

Could you pack two 12-bit integers into three bytes (24 bits) and then use bit shifting to recover the upper and lower 12-bit values?

I imagine such an encoding would also compress well on top of the space savings from the packing itself, given the redundancy, or if your data are particularly sparse or the integers are distributed in a certain way.

I don't know a ton about numpy, but from a cursory look it can store arrays of bytes, and Python has bit-shifting operators. If performance is a requirement, one could look into Cython for C-based bit operations on an unsigned char * within Python.

You'd need to keep track of offsets so that you always start reading on the correct byte; otherwise you'd get something like a "frameshift mutation", to use a biological metaphor. That could be bad news for TB-sized data containers.
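
A rough numpy sketch of that packing idea; the function names are made up for illustration, and it assumes an even number of values, each below 4096.

import numpy as np

def pack12(values):
    """values: unsigned integer array of even length, each element < 4096."""
    a = values[0::2].astype(np.uint32)
    b = values[1::2].astype(np.uint32)
    packed = np.empty(3 * len(a), dtype=np.uint8)
    packed[0::3] = a >> 4                       # upper 8 bits of a
    packed[1::3] = ((a & 0xF) << 4) | (b >> 8)  # low 4 bits of a, high 4 bits of b
    packed[2::3] = b & 0xFF                     # low 8 bits of b
    return packed

def unpack12(packed):
    p0 = packed[0::3].astype(np.uint16)
    p1 = packed[1::3].astype(np.uint16)
    p2 = packed[2::3].astype(np.uint16)
    a = (p0 << 4) | (p1 >> 4)
    b = ((p1 & 0xF) << 8) | p2
    out = np.empty(2 * len(a), dtype=np.uint16)
    out[0::2] = a
    out[1::2] = b
    return out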

What is the most compact way of storing numpy data?

An array with 46800 x 4 x 18 8-byte floats takes up 26956800 bytes. That's 25.7 MiB or 27.0 MB. A compressed size of 22 MB is an 18% (or 14% if you really meant MiB) reduction, which is pretty good by most standards, especially for random binary data. You are unlikely to improve on that much. Using a smaller datatype like float32, or perhaps representing your data as rationals, may be useful.
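
For instance, switching the dtype halves the raw size; a quick illustration with the shape from the question:

import numpy as np

arr = np.zeros((46800, 4, 18), dtype=np.float64)
print(arr.nbytes)                     # 26956800 bytes (~25.7 MiB / ~27.0 MB)
print(arr.astype(np.float32).nbytes)  # 13478400 bytes, half the size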

Since you mention that you want to store metadata, you can record a byte for the number of dimensions (numpy allows at most 32 dimensions), and N integers for the size in each dimension (either 32 or 64 bit). Let's say you use 64-bit integers. That makes for 193 bytes of metadata in your particular case, or about 7×10⁻⁴ % of the total array size.
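
A sketch of that metadata layout (one byte for the number of dimensions, then one 64-bit integer per dimension); the helper names are made up for illustration.

import numpy as np

def write_header(f, arr):
    # 1 byte for ndim, then ndim unsigned 64-bit integers for the shape
    f.write(np.uint8(arr.ndim).tobytes())
    f.write(np.asarray(arr.shape, dtype=np.uint64).tobytes())

def read_header(f):
    ndim = int(np.frombuffer(f.read(1), dtype=np.uint8)[0])
    shape = np.frombuffer(f.read(8 * ndim), dtype=np.uint64)
    return tuple(int(s) for s in shape)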

Which is the best way to compress json to store in a memory based store like redis or memcache?

We just use gzip as a compressor.

import gzip
import cStringIO

def decompressStringToFile(value, outputFile):
    """
    decompress the given string value (which must be valid compressed gzip
    data) and write the result in the given open file.
    """
    stream = cStringIO.StringIO(value)
    decompressor = gzip.GzipFile(fileobj=stream, mode='r')
    while True:  # until EOF
        chunk = decompressor.read(8192)
        if not chunk:
            decompressor.close()
            outputFile.close()
            return
        outputFile.write(chunk)

def compressFileToString(inputFile):
    """
    read the given open file, compress the data and return it as string.
    """
    stream = cStringIO.StringIO()
    compressor = gzip.GzipFile(fileobj=stream, mode='w')
    while True:  # until EOF
        chunk = inputFile.read(8192)
        if not chunk:  # EOF?
            compressor.close()
            return stream.getvalue()
        compressor.write(chunk)

In our use case we store the result as files, as you can imagine. To work only with in-memory strings, you can use a cStringIO.StringIO() object as a replacement for the file as well.
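
On Python 3, where cStringIO is gone, a minimal in-memory sketch could use gzip.compress and gzip.decompress directly; the function names below are illustrative.

import gzip
import json

def compress_json(obj):
    # serialize to UTF-8 bytes, then gzip them before storing in redis/memcache
    return gzip.compress(json.dumps(obj).encode('utf-8'))

def decompress_json(blob):
    return json.loads(gzip.decompress(blob).decode('utf-8'))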
