Converting csv file to parquet using python
To solve the memory problem, you can first import the data with the chunk method of pandas and save each chunk as a parquet file. So, for your case, create a folder "train_data", and in this folder save the different parquet files that correspond to the chunks.
import pandas as pd
import numpy as np
import os
import csv2parquet
from subprocess import run
import fastparquet
import sys
import pyarrow.parquet as pq
# Base folder holding the amex-default-prediction csv files.
path ="C:/.../amex-default-prediction/"
# Sub-folder (under `path`) where the converted parquet output is written.
parquet="parquet/"
#create new folder train_data
# Sub-folder (under path + parquet) for the train_data chunks.
# NOTE(review): the other folders returned by get_path_parquet
# (path_sample_submission, path_test_data, path_train_label) are not
# defined in this file -- confirm they exist before running.
path_train_data="train_data/"
def get_path_parquet(file):
    """Return the output sub-folder for a given csv file name.

    file : csv file name (e.g. "train_data.csv"); its stem selects the folder.

    Returns the matching folder string, or None for an unrecognised stem
    (the original fell through and returned None implicitly).

    NOTE(review): path_sample_submission, path_test_data and path_train_label
    are not defined in this file -- confirm they are defined elsewhere.
    """
    # Compute the stem once instead of re-splitting in every branch.
    stem = file.split('.')[0]
    if stem == "sample_submission":
        return path_sample_submission
    elif stem == "test_data":
        return path_test_data
    elif stem == "train_data":
        return path_train_data
    elif stem == "train_labels":
        return path_train_label
    return None  # explicit fallback for unknown file names
def csv_to_parquet(df, title, path, i):
    """Save one DataFrame chunk as a parquet file.

    df    : pandas DataFrame (one chunk of the csv data)
    title : source csv file name; its stem plus the chunk index names the output
    path  : folder into which the parquet file is saved
    i     : chunk index, appended to the file stem to keep names unique

    Exits the process with status -1 if the write fails (original behavior),
    but now reports the underlying error instead of silently swallowing it.
    """
    try:
        title_prefix = title.split(".")[0] + str(i)
        # os.path.join is portable; the original hard-coded a Windows '\\'.
        out_title = os.path.join(path, f'{title_prefix}.parquet')
        df.to_parquet(out_title, engine='fastparquet')
    except Exception as exc:
        # The original bare `except:` hid the failure cause entirely.
        print(f"csv_to_parquet failed for {title!r} (chunk {i}): {exc}", file=sys.stderr)
        sys.exit(-1)
def loding_csv_with_chunk(path, file):
    """Open a csv file as an iterator of DataFrame chunks.

    path : folder containing the csv file
    file : csv file name

    Returns a pandas reader that yields DataFrames of 5000 rows each, so
    the whole file never has to fit in memory at once.

    Exits the process with status -1 if the file cannot be opened
    (original behavior), but now reports the underlying error first.
    """
    try:
        # os.path.join is portable; the original hard-coded a Windows '\\'.
        return pd.read_csv(os.path.join(path, file), low_memory=False, chunksize=5000)
    except Exception as exc:
        # The original bare `except:` hid the failure cause entirely.
        print(f"loding_csv_with_chunk failed for {file!r}: {exc}", file=sys.stderr)
        sys.exit(-1)
def read_partition_parquet():
    """Load every parquet chunk in the train_data folder as one DataFrame.

    Reads back all the chunk files written by the conversion loop below and
    concatenates them into a single pandas DataFrame.
    """
    # BUG FIX: the original referenced the undefined name `path_train_`
    # (NameError at call time). The conversion loop writes its chunks to
    # path + parquet + path_train_data, so read them back from there.
    dataset = pq.ParquetDataset(path + parquet + path_train_data,
                                use_legacy_dataset=False)
    return dataset.read().to_pandas()
# Convert every csv file found in `path` into chunked parquet files.
for file in os.listdir(path):
    # Guard clause: skip anything that is not a csv file (the original used
    # a redundant `else: continue` at the bottom of the loop).
    if file[-4:] != ".csv":
        continue
    print("begin process for : "+str(file)+ "....")
    ##load data with chunck method -- 5000 rows at a time to bound memory use
    chunk_csv = loding_csv_with_chunk(path, file)
    ##for each chunck save the data on parquet format, numbered by chunk index
    for i, df_chunk in enumerate(chunk_csv):
        print(df_chunk.shape)
        title_prefix = file.split(".")[0] + str(i)
        out_title = path + parquet + get_path_parquet(file) + f'{title_prefix}.parquet'
        df_chunk.to_parquet(out_title, engine='fastparquet')
    print("end process for : "+str(file)+ "....")
Comparing and Generating Parquet Files in Python
You need to inspect the schema and the metadata of the parquet files.
The schema will be particularly useful in terms of information about data types.
EDIT: Considering the pyarrow
module:
import pyarrow.parquet as pq
import json
import pandas as pd
# Load the legacy parquet file whose schema/metadata we want to inspect.
old_tbl = pq.read_table('old_file.parquet')
# List the metadata keys; a pandas-written file carries a b'pandas' key.
print(old_tbl.schema.metadata.keys())
# let's say the result was b'pandas'...
# Decode the b'pandas' metadata blob into a plain dictionary.
old_info = json.loads(old_tbl.schema.metadata[b'pandas'].decode('utf-8'))
# Show the metadata field names.
print(old_info.keys())
# Finally, inspect each metadata field:
# e.g. column types
print(pd.DataFrame(old_info['columns']))
# e.g. the pandas version the writer used
# BUG FIX: the original printed `old_dict`, which is never defined;
# the dictionary built above is `old_info`.
print(old_info['pandas_version'])
# e.g. the pyarrow version the writer used (assuming pyarrow wrote it)
print(old_info['creator'])
# and so on
With all this information, you can create new parquet files that carry the data types your process expects.
How to provide parquet schema while writing parquet file using PyArrow
Could you give an example of records? If I try to use a list of lists as suggested, it fails:
>>> pa.Table.from_pylist([["1", "2"], ["first", "second"]])
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "pyarrow/table.pxi", line 3682, in pyarrow.lib.Table.from_pylist
return _from_pylist(cls=Table,
File "pyarrow/table.pxi", line 5199, in pyarrow.lib._from_pylist
names = list(mapping[0].keys())
AttributeError: 'list' object has no attribute 'keys'
I would expect records to be a list of dicts from the documentation.
# A list of dicts: each dict is one row, keyed by column name.
data = [
    {'strs': '', 'floats': 4.5},
    {'strs': 'foo', 'floats': 5},
    {'strs': 'bar', 'floats': None},
]
# Column names and types are inferred from the dict keys/values.
table = pa.Table.from_pylist(data)
You can use the schema when building the table with from_pylist; in this case:
# Declare the target schema explicitly: columns a, c, d with fixed types.
schema = pa.schema([
    ('a', pa.int64()),
    ('c', pa.int32()),
    ('d', pa.int16()),
])
# The explicit schema overrides inference: column 'b' (absent from the
# schema) is dropped, while 'c' and 'd' (absent from the rows) are null.
table = pa.Table.from_pylist(
    [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}, {'a': 3, 'b': 5}],
    schema=schema,
)
# Expected contents of the resulting table, row by row.
data = [{'a': 1, 'c': None, 'd': None},
        {'a': 2, 'c': None, 'd': None},
        {'a': 3, 'c': None, 'd': None}]
assert table.schema == schema
assert table.to_pylist() == data
Related Topics
Discord Bot Messaging a User With a Specific User Id
Finding the Most Frequent Character in a String
Regex to Remove Commas Before a Number in Python
Adding Different Sized/Shaped Displaced Numpy Matrices
Python - Regex Match Multiple Patterns in Multiple Lines
Python: How to Turn CSV Data in to Array
Reduce Multi-Index/Multi-Level Dataframe to Single Index, Single Level
Check Json Data Is None in Python
How to Get Maximum Length of Each Column in the Data Frame Using Pandas Python
How to Map True/False to 1/0 in a Pandas Dataframe
How to Determine Whether a Pandas Column Contains a Particular Value
Python, Delete Json Element Having Specific Key from a Loop
How to Clear/Delete the Contents of a Tkinter Text Widget
Check If Value from One Dataframe Exists in Another Dataframe
How to Limit a Number to Be Within a Specified Range (Python)