Splitting One CSV into Multiple Files

Splitting one csv into multiple files

I suggest not reinventing the wheel; there is an existing solution. Source here

import os


def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='output_%s.csv', output_path='.',
          keep_headers=True):
    """Split a CSV stream into numbered files of at most ``row_limit`` rows.

    The first piece is always created, even when the input has no data rows.

    Args:
        filehandler: Open text file (or any iterable of lines) holding the
            source CSV. The caller owns it; it is not closed here.
        delimiter: Field separator used for both reading and writing.
        row_limit: Maximum number of data rows per output file (the header
            row is not counted). Must be at least 1.
        output_name_template: ``%``-style template that receives the 1-based
            piece number and yields the output file name.
        output_path: Directory the pieces are written to.
        keep_headers: When true, the first input row is treated as a header
            and repeated at the top of every piece.

    Raises:
        ValueError: If ``row_limit`` is smaller than 1.
    """
    import csv

    if row_limit < 1:
        raise ValueError('row_limit must be at least 1')

    reader = csv.reader(filehandler, delimiter=delimiter)
    # next(reader, None) works on Python 3 and tolerates an empty input,
    # unlike the Python 2-only reader.next().
    headers = next(reader, None) if keep_headers else None

    def open_piece(piece):
        """Open output file number *piece* and write the header, if any."""
        path = os.path.join(output_path, output_name_template % piece)
        # newline='' stops the text layer from doubling the line endings
        # that csv.writer already emits.
        handle = open(path, 'w', newline='')
        try:
            writer = csv.writer(handle, delimiter=delimiter)
            if headers is not None:
                writer.writerow(headers)
        except Exception:
            handle.close()
            raise
        return handle, writer

    current_piece = 1
    out_file, out_writer = open_piece(current_piece)
    try:
        for i, row in enumerate(reader):
            # Roll over to the next piece once the current one is full.
            if i + 1 > row_limit * current_piece:
                out_file.close()
                current_piece += 1
                out_file, out_writer = open_piece(current_piece)
            out_writer.writerow(row)
    finally:
        # Make sure the last piece is flushed and closed.
        out_file.close()

Use it like:

# The ``with`` block closes the input file once splitting is done;
# newline='' is how the csv module expects its input files to be opened.
with open('/your/pat/input.csv', 'r', newline='') as source:
    split(source)

How to split one csv into multiple files in python

If you can't use pandas you can use the built-in csv module and itertools.groupby() function. You can use this to group by country.

import csv
from itertools import groupby
from operator import itemgetter

# Key function returning column 4 of a row, the column the rows are grouped by.
country_of = itemgetter(4)

# newline='' is how the csv module expects its files to be opened.
with open('world.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    next(reader, None)  # skip header; the default avoids StopIteration on an empty file
    # csv.reader yields [] for blank lines; those have no column 4, so drop them.
    rows = [row for row in reader if row]

# groupby only merges adjacent rows, so sort by the same key first.
rows.sort(key=country_of)

# Write one file per country, named after the country.
for country, members in groupby(rows, key=country_of):
    filename = country + '.csv'
    with open(filename, 'w', newline='') as fout:
        csv_output = csv.writer(fout)
        csv_output.writerow(["city", "city_alt", "lat", "lng", "country"])  # header
        csv_output.writerows(members)

Split csv file into two separate files python 3.7

Read the whole CSV file into a list of lines. Collect consecutive rows that share the same first column in a temporary list. When the first column changes, write roughly 70% of the collected rows to the first file and the remaining rows to the second file, then start collecting the next group.

def split_by_ratio(source='file.csv', first='file1.csv', second='file2.csv',
                   ratio=0.7):
    """Split each run of rows sharing a first column between two files.

    The first line of *source* is the header and is written to the top of
    both output files. The remaining lines are grouped into runs of
    consecutive lines whose first comma-separated field is equal. For a run
    of ``n`` lines, the first ``int(ratio * n + 1)`` lines go to *first* and
    the rest go to *second*.

    Both output files are overwritten, so re-running does not duplicate the
    header or the rows.

    Args:
        source: Path of the CSV file to read.
        first: Path of the file receiving the leading share of each run.
        second: Path of the file receiving the remainder of each run.
        ratio: Fraction used to compute the leading share.
    """
    from itertools import groupby

    with open(source) as file:
        csv_data = file.read().split("\n")
    header = csv_data[0]
    rows = csv_data[1:]

    # A trailing newline leaves empty strings at the end; drop them so they
    # do not form a bogus final group. The last real group is still written
    # whether or not the file ends with a newline.
    while rows and rows[-1] == '':
        rows.pop()

    with open(first, 'w') as file1, open(second, 'w') as file2:
        # Write the header to both files up front so the second file gets
        # one no matter how large the first group is.
        if header or rows:
            file1.write(header + '\n')
            file2.write(header + '\n')
        for _, run in groupby(rows, key=lambda line: line.split(',')[0]):
            group = list(run)
            line_count = int((ratio * len(group)) + 1)
            file1.writelines(line + '\n' for line in group[:line_count])
            file2.writelines(line + '\n' for line in group[line_count:])


if __name__ == '__main__':
    split_by_ratio()

file1.csv

first file

file2.csv

second file

Note: If a group has only 3 lines, this code writes all 3 lines to file1 and nothing to file2. You need to edit the code if you want to change this behaviour.

Splitting a CSV file into multiple files with overlapping rows using Python

I don't have the data to test this thoroughly, but it should work:

# Maximum number of data lines written to each output file (the slice length).
CHUNK = 4_000
# Number of lines each file shares with the next one: consecutive chunks
# start CHUNK - OVERLAP lines apart.
OVERLAP = 1_000

def write_csv(lines, filename, header):
    """Create (or overwrite) *filename* with *header* followed by *lines*.

    Nothing is appended to the strings, so *header* and every item of
    *lines* must already carry their own line endings.
    """
    with open(filename, 'w') as out_file:
        out_file.write(header)
        out_file.writelines(lines)

def get_csv_gen():
    """Yield 'data_part_1.csv', 'data_part_2.csv', ... without end."""
    part = 0
    while True:
        part += 1
        yield f'data_part_{part}.csv'

get_csv_name = get_csv_gen()

# Distance between the starts of two consecutive chunks.
step = CHUNK - OVERLAP
if step <= 0:
    raise ValueError('CHUNK must be larger than OVERLAP')

# Read the header separately so it can be repeated in every output file.
with open('8-0new2.csv') as source:
    header = source.readline()
    lines = source.readlines()

for offset in range(0, len(lines), step):
    # The previous chunk already reached line offset + OVERLAP. If nothing
    # lies beyond that, this chunk would only repeat rows, so stop here.
    if offset and offset + OVERLAP >= len(lines):
        break
    write_csv(lines[offset:offset + CHUNK], next(get_csv_name), header)


Related Topics



Leave a reply



Submit