Merge multiple large csv files quickly
On second thought, you can use the HDF5 format,
which handles big data really well:
import pandas as pd
# Destination HDF5 file that accumulates the rows of every CSV file.
hdf_path = '_combined.h5'

# mode='w' creates a fresh store; the table is compressed with blosc, level 5.
with pd.HDFStore(hdf_path, mode='w', complevel=5, complib='blosc') as store:
    # `sorted_fnames` (the ordered list of CSV paths) must be defined beforehand.
    for csv_file_name in sorted_fnames:
        # Every file is appended under the same key, so only one CSV has to
        # be held in memory at a time.
        store.append('data', pd.read_csv(csv_file_name), index=False)
You can eventually save it back to CSV if you want, but working with HDF5 would be more efficient.
How to combine two csv files together
I ended up finding the answer to my own question.
I did some digging and what worked for me was using:
# DataFrame.append() was removed in pandas 2.0; pd.concat() stacks the rows
# of both frames in the same way.
merged = pd.concat([df1, df2])
merged = merged.sort_values('Dept')
So my final code output:
import pandas as pd
import os, csv, sys
# Input files to combine and the file the merged result is written to.
csvPath1 = 'data1.csv'
csvPath2 = 'data2.csv'
csvDest = 'newdata.csv'

df1 = pd.read_csv(csvPath1)
df2 = pd.read_csv(csvPath2)

# Drop the 'Company' column by keyword: the positional axis argument
# (`drop('Company', 1)`) is no longer accepted since pandas 2.0.
df1 = df1.drop(columns='Company')
df2 = df2.drop(columns='Company')

# pd.concat() replaces DataFrame.append(), which was removed in pandas 2.0.
merged = pd.concat([df1, df2])
merged = merged.sort_values('Dept')

# index=False keeps the row labels out of the output file.
merged.to_csv(csvDest, index=False)
Opening at least two csv files and merging them using tkinter button
You should use a list to keep all selected files. This way you may have any number of files and you can use a `for`
loop to work with all the filenames:
all_files = []  # paths of the files selected so far
and append to it each time a file is selected:
def file_dialog():
    """Ask the user for one file and remember its path in `all_files`.

    Nothing is stored when the dialog is cancelled, because the dialog then
    returns an empty value.
    """
    global all_files

    try:
        file_name = fd.askopenfilename(
            initialdir='/home/furas/test',
            title='Select a file',
            filetypes=(('csv files', '*.csv'), ('All files', '*.*')),
        )
    except FileNotFoundError:
        # Imported here so the error dialog works no matter how the
        # surrounding script imported tkinter (e.g. `import tkinter as tk`
        # does not bind the name `tkinter`).
        from tkinter import messagebox
        messagebox.showerror('Information', "File not found")
        return

    # check if not pressed `Cancel`
    if file_name:
        all_files.append(file_name)
And later you can merge them:
# Imported explicitly so the error dialogs work however tkinter was imported.
from tkinter import messagebox

# Collect the successfully parsed files and concatenate them once at the end:
# DataFrame.append() was removed in pandas 2.0, and growing a frame inside
# the loop copies all previous rows on every iteration.
frames = []

for filename in all_files:
    print('filename:', filename)

    try:
        # Parsing stays inside `try/except`, so a bad file is reported and
        # skipped instead of aborting the whole merge.
        frames.append(pd.read_csv(filename, delimiter=','))
    except ValueError:
        messagebox.showerror('Information', 'File is invalid')
    except FileNotFoundError:
        messagebox.showerror('Information', "File not found")

# With nothing loaded, fall back to an empty dataframe
# (pd.concat() rejects an empty list).
df = pd.concat(frames) if frames else pd.DataFrame()

print(df.to_string())
Minimal working example:
import tkinter as tk
from tkinter import filedialog as fd
import pandas as pd
# --- functions ---
def file_dialog():
    """Ask the user for one file and remember its path in `all_files`.

    Nothing is stored when the dialog is cancelled, because the dialog then
    returns an empty value.
    """
    global all_files

    print('[file_dialog]')

    try:
        file_name = fd.askopenfilename(
            initialdir='/home/furas/test',
            title='Select a file',
            filetypes=(('csv files', '*.csv'), ('All files', '*.*')),
        )
    except FileNotFoundError:
        # The script imports `tkinter as tk`, so the bare name `tkinter` is
        # not bound; import the messagebox module explicitly instead.
        from tkinter import messagebox
        messagebox.showerror('Information', "File not found")
        return

    # check if not pressed `Cancel`
    if file_name:
        all_files.append(file_name)
def merge_files():
    """Load every file listed in `all_files` and combine them into `df`.

    Each file is read with `pd.read_csv`. A file that cannot be parsed or
    found is reported in an error dialog and skipped. The combined frame is
    stored in the global `df` and printed, and `all_files` is then reset to
    an empty list so the next merge starts from a fresh selection.
    """
    global df
    global all_files

    def show_error(message):
        # tkinter is imported lazily: the script only binds `tk`, so the
        # bare name `tkinter` used previously raised NameError.
        from tkinter import messagebox
        messagebox.showerror('Information', message)

    print('[merge_files]')

    # Collect the frames and concatenate once: DataFrame.append() was removed
    # in pandas 2.0 and copied all previous rows on every call.
    frames = []

    for filename in all_files:
        print('filename:', filename)

        try:
            frames.append(pd.read_csv(filename, delimiter=','))
        except ValueError:
            show_error('File is invalid')
        except FileNotFoundError:
            show_error("File not found")

    # pd.concat() rejects an empty list, so fall back to an empty dataframe.
    df = pd.concat(frames) if frames else pd.DataFrame()

    print(df.to_string())

    # remove all filenames
    all_files = []
# --- main ---
# Default values, so both callbacks can rely on these globals existing.
all_files = []       # nothing selected yet
df = pd.DataFrame()  # nothing merged yet

# Build the window: one button to pick a file, one to merge the selection.
root = tk.Tk()

browse_button = tk.Button(master=root, text='Browse File', command=file_dialog)
browse_button.pack(fill='x')

load_button = tk.Button(master=root, text='Load Data', command=merge_files)
load_button.pack(fill='x')

# Hand control to tkinter until the window is closed.
root.mainloop()
EDIT:
There is also `askopenfilenames` (with the letter `s` at the end), which lets you select many filenames at once.
It returns a list/tuple with all the selected filenames, which you can assign to the global variable (replacing all previous filenames) or add to the existing list with `extend()` or `+=`:
file_names = fd.askopenfilenames(...)

# check if not pressed `Cancel`
if file_names:
    #all_files.extend( file_names )  # add to existing list
    #all_files += file_names         # add to existing list

    # Convert to a list: the dialog may hand back a tuple, and the rest of
    # the code calls `all_files.append()`, which a tuple does not support.
    all_files = list(file_names)     # replace previous list
Merge Two CSV Files With Cases Using Python
You need to do a left join of actual onto forecast (this is based on your requirement that missing actual data should be filled; if you expect missing forecast data, you would want to do an outer join).
import pandas as pd
# File names must be string literals; unquoted, `actual_data.csv` is parsed
# as attribute access on an undefined name and raises NameError.
act_df = pd.read_csv('actual_data.csv')
fore_df = pd.read_csv('forecast_data.csv')

# Left join: keep every forecast row and attach the matching actual row, if
# any. Non-key columns present in both frames get the '_fore' / '_act' suffix.
res = fore_df.merge(
    act_df,
    on=['type', 'region_1', 'region_2'],
    how='left',
    suffixes=('_fore', '_act'),
)
At this point res will contain the merge columns ['type', 'region_1', 'region_2'], 2 date columns (date_fore, date_act), and 2 data columns (data_fore, data_act). From there you can fill the null values of the actual date with the forecast date and then collapse it to a single column if you like:
# Prefer the actual date; fall back to the forecast date where it is missing.
res['date'] = res['date_act'].fillna(value=res['date_fore'])

# The two suffixed columns are now redundant, so remove them in place.
res.drop(columns=['date_fore', 'date_act'], inplace=True)
For the data it's a similar operation:
# Prefer the actual value; fall back to the forecast value where it is missing.
res['data'] = res['data_act'].fillna(value=res['data_fore'])

# The two suffixed columns are now redundant, so remove them in place.
res.drop(columns=['data_fore', 'data_act'], inplace=True)
Python script to merge more than 200 very large CSV files into just one
You probably just need to keep a merged.csv file open whilst reading in each of the certificates.csv files.
glob.glob() can be used to recursively find all suitable files:
import glob
import csv
import os
# Root folder; the certificates.csv files live in its subfolders.
path = r'C:\path\to\folder\where\all\files\are-allowated-in-subfolders'
os.chdir(path)  # merged.csv is therefore created inside the root folder

with open('merged.csv', 'w', newline='') as f_merged:
    csv_merged = csv.writer(f_merged)

    # `recursive=True` only has an effect together with the `**` wildcard,
    # which matches zero or more nested directories.
    pattern = os.path.join(path, '**', 'certificates.csv')

    for filename in glob.glob(pattern, recursive=True):
        print(filename)

        try:
            # newline='' lets the csv module handle line endings itself.
            with open(filename, newline='') as f_csv:
                csv_merged.writerows(csv.reader(f_csv))
        except (OSError, UnicodeError, csv.Error) as exc:
            # Best effort: report the unreadable file and carry on with the
            # remaining ones.
            print('problem with file: ', filename, exc)
An r prefix can be added to your path to avoid needing to escape each backslash.
Also, newline='' should be passed to open() when using a csv.writer(),
to stop extra blank lines being written to your output file.
Merge Two CSV files in Python
The problem is that you can iterate over a csv reader only once, so csv_file2 yields nothing after the first iteration of the outer loop. To solve that, you should save the rows of csv_file2 in a list and iterate over the saved list.
It could look like that:
import time, csv
def links():
    """Print every row of closed.csv joined with its matches from links.csv.

    Both files are '|'-delimited. Rows are matched on their first column with
    spaces removed. One line is printed per match, or a single
    "blank no match" line when links.csv has no matching row. The function
    pauses one second after each row of closed.csv.
    """
    # A csv reader can be iterated only once, so keep the rows of links.csv
    # in memory for the repeated inner loop. The name `link_rows` also avoids
    # shadowing the builtin `list`.
    with open('links.csv', newline='') as second:
        link_rows = list(csv.reader(second, delimiter="|"))

    with open('closed.csv', newline='') as first:
        for row in csv.reader(first, delimiter="|"):
            key = row[0].replace(" ", "")
            match = False

            for secrow in link_rows:
                if key == secrow[0].replace(" ", ""):
                    print(row[0] + "," + row[1] + "," + secrow[1])
                    match = True

            if not match:
                print(row[0] + "," + row[1] + ", blank no match")

            time.sleep(1)
Output:
Num , status, code
1213 , closed, 1891
4223 , open, 0011
2311 , open, blank no match
Python merging two CSV by common column
Others have proposed ways using pandas. You should consider it if your files are big, or if you need to do this operation quite often. But the csv module is enough here.
You cannot use plain dicts here because the keys are not unique: subproduct.csv
has 2 different rows with the same id 1. So I would use dicts of lists instead.
I will assume here that all keys are present in product.csv, but some products may have no associated subproducts (meaning a left outer join in database terms).
So I will use:
- a dict for product.csv because I assume that product_id are unique per product
- a defaultdict of lists for subproduct.csv because a single product may have many subproducts
- the list of ids from product.csv to build the final file
- a default empty list for subproduct.csv if a product had no subproducts
- and process headers separately
Code could be:
import collections
import csv

# product.csv: one row per product, keyed by its id (assumed to be unique).
with open('product.csv', newline='') as f:
    r = csv.reader(f)
    header1 = next(r)
    dict1 = {row[0]: row[1:] for row in r}

# subproduct.csv: several rows may share a product id, so group them in lists.
dict2 = collections.defaultdict(list)
with open('subproduct.csv', 'r', newline='') as f:
    r = csv.reader(f)
    header2 = next(r)
    for row in r:
        dict2[row[0]].append(row[1:])

with open('merged.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(header1 + header2[1:])

    # One placeholder row of empty cells, as wide as the subproduct columns.
    # (`[] * n` is still `[]`, which left rows without subproducts too short.)
    empty2 = [[''] * (len(header2) - 1)]

    # Ids are read as strings, so this ordering is lexicographic.
    for k in sorted(dict1):
        # .get() does not insert missing keys into the defaultdict.
        for row2 in dict2.get(k, empty2):  # accept no subproducts
            w.writerow([k] + dict1[k] + row2)
Assuming that your csv files are truly Comma Separated Values files, this gives:
product_id,name,subproduct_name,volume
1,Handwash,Dettol,20
1,Handwash,Lifebuoy,50
2,Soap,Lux,100
Related Topics
Replacing Text in a File with Python
How to Get Last Items of a List in Python
Rename Specific Column(S) in Pandas
How to Pass an Argument to a Function Pointer Parameter
Get Raw Post Body in Python Flask Regardless of Content-Type Header
Random State (Pseudo-Random Number) in Scikit Learn
Why Don't Methods Have Reference Equality
List() Uses Slightly More Memory Than List Comprehension
How to Execute a Python Script in Notepad++
Importerror: No Module Named Crypto.Cipher
File Not Found Error When Launching a Subprocess Containing Piped Commands
What Is the Advantage of a List Comprehension Over a for Loop
Chain-Calling Parent Initialisers in Python