Convert CSV to Parquet File Using Python

Converting a CSV file to Parquet using Python

To solve the memory problem, you can first import the data with the chunk feature of pandas (the chunksize argument of read_csv) and save each chunk as a parquet file. So for your case, create a folder "train_data", and in this folder save the different parquet files that correspond to the chunks.

import os
import sys

import pandas as pd
import pyarrow.parquet as pq

path = "C:/.../amex-default-prediction/"
parquet = "parquet/"
# sub-folders for the parquet chunks, one per csv file
path_sample_submission = "sample_submission/"
path_test_data = "test_data/"
path_train_data = "train_data/"
path_train_label = "train_labels/"


def get_path_parquet(file):
    """Return the parquet sub-folder that matches the csv file name."""
    name = file.split('.')[0]
    if name == "sample_submission":
        return path_sample_submission
    elif name == "test_data":
        return path_test_data
    elif name == "train_data":
        return path_train_data
    elif name == "train_labels":
        return path_train_label


def csv_to_parquet(df, title, out_dir, i):
    """
    Convert one csv chunk to parquet.
    df      : chunk of csv data
    title   : name of the source csv file
    out_dir : folder in which to save the parquet file
    i       : index of the chunk, appended to the file name
    """
    try:
        title_prefix = title.split(".")[0] + str(i)
        out_title = out_dir + f'{title_prefix}.parquet'
        df.to_parquet(out_title, engine='fastparquet')
    except Exception as exc:
        sys.exit(f"failed to write {title} chunk {i}: {exc}")


def loading_csv_with_chunk(path, file):
    """Read the csv lazily, 5000 rows at a time, to keep memory low."""
    try:
        return pd.read_csv(path + file, low_memory=False, chunksize=5000)
    except Exception as exc:
        sys.exit(f"failed to read {file}: {exc}")


def read_partition_parquet():
    """Read all train_data chunks back into a single DataFrame."""
    dataset = pq.ParquetDataset(path + parquet + path_train_data)
    return dataset.read().to_pandas()


for file in os.listdir(path):
    if file[-4:] == ".csv":
        print("begin process for: " + str(file) + "....")
        # load the data with the chunk method
        chunk_csv = loading_csv_with_chunk(path, file)
        # save each chunk in parquet format
        for i, df_chunk in enumerate(chunk_csv):
            print(df_chunk.shape)
            csv_to_parquet(df_chunk, file, path + parquet + get_path_parquet(file), i)
        print("end process for: " + str(file) + "....")
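
Note that this produces one parquet file per chunk. If you would rather end up with a single parquet file per csv, one alternative is to stream the chunks through pyarrow's ParquetWriter. Here is a minimal sketch, assuming a hypothetical 'train_data.csv' in the current folder:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

writer = None
for chunk in pd.read_csv('train_data.csv', chunksize=5000):
    table = pa.Table.from_pandas(chunk)
    if writer is None:
        # the first chunk fixes the schema for the whole file
        writer = pq.ParquetWriter('train_data.parquet', table.schema)
    # note: this assumes every chunk infers the same dtypes;
    # otherwise cast the table to writer.schema first
    writer.write_table(table)
if writer is not None:
    writer.close()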

Comparing and Generating Parquet Files in Python

You need to inspect the schema and the metadata of the parquet files.

The schema is particularly useful for information about the data types.

EDIT: using the pyarrow module:

import pyarrow.parquet as pq
import json
import pandas as pd

# load legacy parquet file
old_tbl = pq.read_table('old_file.parquet')

# get the metadata key
print(old_tbl.schema.metadata.keys())

# let's say the result was b'pandas'...
# create a dictionary with metadata information
old_info = json.loads(old_tbl.schema.metadata[b'pandas'].decode('utf-8'))

# get the metadata field names
print(old_info.keys())

# finally, inspect each metadata field
# e.g. the column types
print(pd.DataFrame(old_info['columns']))

# e.g. the pandas version used to write the file
print(old_info['pandas_version'])

# e.g. the pyarrow version used to write it (assuming pyarrow wrote it)
print(old_info['creator'])
# and so on

With all this information, you can create new parquet files that carry the data types the downstream process expects.
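
For instance, here is a minimal sketch of that last step, assuming the new data arrives as a pandas DataFrame whose column names and order match the legacy file (new_df and the file names are placeholders):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# placeholder for the new data; its columns must match the legacy file
new_df = pd.DataFrame({'strs': ['foo', 'bar'], 'floats': [1, 2]})

# cast the new table to the legacy schema, then write it out;
# cast() only converts the types, it does not rename or reorder columns
old_schema = pq.read_table('old_file.parquet').schema
new_tbl = pa.Table.from_pandas(new_df).cast(old_schema)
pq.write_table(new_tbl, 'new_file.parquet')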

How to provide parquet schema while writing parquet file using PyArrow

Could you give an example of records? If I try to use a list of lists as suggested, it fails:

>>> pa.Table.from_pylist([["1", "2"], ["first", "second"]])
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "pyarrow/table.pxi", line 3682, in pyarrow.lib.Table.from_pylist
    return _from_pylist(cls=Table,
  File "pyarrow/table.pxi", line 5199, in pyarrow.lib._from_pylist
    names = list(mapping[0].keys())
AttributeError: 'list' object has no attribute 'keys'

From the documentation, I would expect records to be a list of dicts:

import pyarrow as pa

data = [{'strs': '', 'floats': 4.5},
        {'strs': 'foo', 'floats': 5},
        {'strs': 'bar', 'floats': None}]
table = pa.Table.from_pylist(data)

You can pass a schema when building the table with from_pylist; in this case:

schema = pa.schema([('a', pa.int64()),
                    ('c', pa.int32()),
                    ('d', pa.int16())])
# keys absent from the schema ('b') are dropped; schema fields absent
# from the data ('c', 'd') are filled with nulls
table = pa.Table.from_pylist(
    [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}, {'a': 3, 'b': 5}],
    schema=schema
)
data = [{'a': 1, 'c': None, 'd': None},
        {'a': 2, 'c': None, 'd': None},
        {'a': 3, 'c': None, 'd': None}]
assert table.schema == schema
assert table.to_pylist() == data
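
To actually write that table out with the declared schema, pq.write_table can be used (a quick sketch; 'records.parquet' is a placeholder file name):

import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([('a', pa.int64()),
                    ('c', pa.int32()),
                    ('d', pa.int16())])
table = pa.Table.from_pylist(
    [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}, {'a': 3, 'b': 5}],
    schema=schema
)

# the parquet file keeps the declared types (int64/int32/int16)
pq.write_table(table, 'records.parquet')
print(pq.read_table('records.parquet').schema)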

