Ruby how to merge two CSV files with slightly different headers
Challenge accepted :)
#!/usr/bin/env ruby
require "csv"
# Merges CSV files whose header rows only partially overlap into a single CSV
# string that contains the union of all columns.
module MergeCsv
  class << self
    # Reads every file in +csv_paths+ (first row treated as headers) and
    # returns the merged CSV as a String.
    def run(csv_paths)
      tables = csv_paths.map { |path| CSV.read(path, headers: true) }
      merge(tables)
    end

    private

    # Emits one header row holding the sorted union of all headers, followed
    # by every data row of every table in input order. A column a row does not
    # have is written as an empty field.
    def merge(csv_files)
      columns = csv_files.flat_map(&:headers).uniq.sort
      records = csv_files.flat_map { |table| csv_to_hash_array(table) }

      CSV.generate do |out|
        out << columns
        records.each { |record| out << record.values_at(*columns) }
      end
    end

    # Converts a parsed table into an array of { header => value } hashes,
    # one per data row. Not the most performant way, but easy.
    def csv_to_hash_array(csv)
      names = csv.headers
      # The first element of to_a is the header row itself; skip it.
      csv.to_a.drop(1).map { |fields| names.zip(fields).to_h }
    end
  end
end
# Command-line entry point: with no arguments, print a usage hint and exit
# with status 1; otherwise print the merged CSV to standard output.
if ARGV.empty?
  puts "Use: ruby merge_csv.rb <file_path_csv_1> <file_path_csv_2>"
  exit 1
end

puts MergeCsv.run(ARGV)
Generate a new CSV combining two csv files
First you need to parse both files - you could save each row in a hash, or you could create a new class and save instances of that class.
Second you need to pair the entries with the same email (if you create instances of your own class, you can assign the notes to the right instance when you parse the second csv)
Finally you want to write a csv file again.
Have a look at this gem - it might be helpful
https://github.com/ruby/csv
How does that sound?
EDIT: here is the code if you use a class to solve the problem
# One row of the persons CSV (name,email,phone,job,created_at) plus an
# optional note that can be attached after construction.
class Person
  # State: everything is read-only except the note.
  attr_reader :name, :email, :phone, :job, :created_at
  attr_accessor :note

  # All six values are stored as given; pass nil for +note+ when there is
  # none yet.
  def initialize(name, email, phone, job, created_at, note)
    @name, @email, @phone = name, email, phone
    @job, @created_at, @note = job, created_at, note
  end

  # Behaviour: none yet.
end
# Little test: build one Person by hand and inspect it.
person_1 = Person.new("john", "john@john.us", "112", "police", "21.02.", nil)
p person_1

require 'csv'

# Step 1: parse persons.csv into Person objects; notes are attached in step 2.
csv_options = { headers: :first_row }
filepath = 'persons.csv'
persons = []
# The options are splatted as keywords: CSV.foreach and CSV.open take their
# options as keyword arguments, and on Ruby 3+ a positional Hash is no longer
# converted to keywords.
CSV.foreach(filepath, **csv_options) do |row|
  persons << Person.new(row["name"], row["email"], row["phone"], row["job"], row["created_at"], nil)
end

# Step 2: attach each note to every person with the same email.
# Index the persons once so each note row is a hash lookup instead of a scan
# over all persons. A later note for the same email overwrites an earlier one.
persons_by_email = persons.group_by(&:email)
filepath_2 = "notes.csv"
CSV.foreach(filepath_2, **csv_options) do |row|
  persons_by_email.fetch(row["email"], []).each do |person|
    person.note = row["note"]
  end
end
p persons

# Step 3: write the combined data, with every field quoted.
csv_options = { col_sep: ',', force_quotes: true, quote_char: '"' }
filepath = 'combined.csv'
CSV.open(filepath, 'wb', **csv_options) do |csv|
  csv << ['name', 'email', 'phone', 'job', 'created_at', "note"]
  persons.each do |person|
    csv << [person.name, person.email, person.phone, person.job, person.created_at, person.note]
  end
end
Join two CSV files in Ruby without using tables
Givens
We are given the following.
The paths for the two input files:
fname1 = 't1.csv'
fname2 = 't2.csv'
The path for the output file:
fname3 = 't3.csv'
The names of the headers to match in each of the two input files:
target1 = 'B'
target2 = 'E'
I do assume that (as is the case with the example) the two files necessarily contain the same number of lines.
Create test files
Let's first create the two files:
# Build each sample file as comma-joined rows separated by newlines (no
# trailing newline) and write it to disk.
str = [%w[A B C], %w[1 1 1], %w[2 2 2], %w[3 4 5], %w[6 9 9]]
      .map { |fields| fields.join(",") }
      .join("\n")
#=> "A,B,C\n1,1,1\n2,2,2\n3,4,5\n6,9,9"
File.write(fname1, str)
#=> 29

str = [%w[D E F], %w[21 1 41], %w[22 5 42], %w[23 8 45], %w[26 9 239]]
      .map { |fields| fields.join(",") }
      .join("\n")
#=> "D,E,F\n21,1,41\n22,5,42\n23,8,45\n26,9,239"
File.write(fname2, str)
#=> 38
Read the input files into CSV::Table objects
When reading fname1 I will use the :header_converters option to convert the header "B" to "B/E". Note that this does not require knowledge of the location of the column with header "B" (or whatever it may be).
require 'csv'

# Header that the matched column carries in the output.
new_target1 = "#{target1}/#{target2}"
#=> "B/E"

# Rename only the target header while reading the first file; every other
# header passes through unchanged.
rename_target = ->(header) { header == target1 ? new_target1 : header }
csv1 = CSV.read(fname1, headers: true, header_converters: rename_target)
csv2 = CSV.read(fname2, headers: true)
Construct arrays of headers to be written from each input file
# All columns of the first file are written.
headers1 = csv1.headers
#=> ["A", "B/E", "C"]
# From the second file, everything except the matched column is written.
headers2 = csv2.headers.reject { |header| header == target2 }
#=> ["D", "F"]
Create the output file
We will first write the new headers headers1 + headers2 to the output file.
Next, for each row index i (i = 0 corresponding to the first row after the header row in each file) for which a condition is satisfied, we write as a single row the elements of csv1[i] and csv2[i] that are in the columns having headers in headers1 and headers2. The condition to be satisfied to write the rows at index i is that i satisfies:
csv1[i][new_target1] == csv2[i][target2] #=> true
Now open fname3 for writing, write the headers and then the body.
# Write the combined header row, then one joined row for every index at which
# the two target columns hold equal values. Only indices present in both
# tables are visited.
CSV.open(fname3, 'w') do |csv|
  csv << headers1 + headers2

  row_count = [csv1.size, csv2.size].min
  row_count.times do |i|
    row1 = csv1[i]
    row2 = csv2[i]
    next unless row1[new_target1] == row2[target2]

    left  = headers1.map { |h| row1[h] }
    right = headers2.map { |h| row2[h] }
    csv << (left + right)
  end
end
#=> 4
Let's confirm that what was written is correct.
puts File.read(fname3)
A,B/E,C,D,F
1,1,1,21,41
6,9,9,26,239
How to merge multiple csv files into one with unique columns in Rails
Code
require 'csv'
# Merges several CSV files that share the same header row into
# +csv_output_file+.
#
# The header row is copied from the first input file. Rows are keyed by their
# 'Date' and 'ID' columns: when the same key occurs more than once, the row
# read last wins, but it keeps the output position of the first occurrence.
def doit(*csv_input_files, csv_output_file)
  CSV.open(csv_output_file, "wb", headers: true) do |csv_out|
    header_row = CSV.open(csv_input_files.first) { |first_file| first_file.readline }
    csv_out << header_row

    latest = {}
    csv_input_files.each do |path|
      CSV.read(path, headers: true).each do |row|
        latest[[row['Date'], row['ID']]] = row
      end
    end

    latest.values.each { |row| csv_out << row }
  end
end
Example
# Example driver for doit: writes three sample input files that share one
# header row, merges them, and prints the merged file.
F1 = 'f1.csv'
F2 = 'f2.csv'
F3 = 'f3.csv'
# First input: rows dated 2019-08-25 and 2019-08-26.
File.write(F1, <<~END)
Date,ID,Name,Count,Price
2019-08-25,110146,Amazon In-App,1,23
2019-08-25,121615,Google US Rally,0,0
2019-08-25,208442,Google Rewarded US,47,12
2019-08-26,110146,Amazon In-App,10,40
2019-08-26,121615,Google US Rally,0,0
2019-08-26,208442,Google Rewarded US,0,0
END
#=> 260
# Second input: repeats the 2019-08-26 Date/ID pairs of the first file with
# different Count/Price values and adds 2019-08-27.
File.write(F2, <<~END)
Date,ID,Name,Count,Price
2019-08-26,110146,Amazon In-App,30,90
2019-08-26,121615,Google US Rally,5,25
2019-08-26,208442,Google Rewarded US,15,45
2019-08-27,110146,Amazon In-App,5,15
2019-08-27,121615,Google US Rally,10,40
2019-08-27,208442,Google Rewarded US,0,0
END
#=> 263
# Third input: repeats the 2019-08-27 Date/ID pairs of the second file and
# adds 2019-08-28.
File.write(F3, <<~END)
Date,ID,Name,Count,Price
2019-08-27,110146,Amazon In-App,30,70
2019-08-27,121615,Google US Rally,12,50
2019-08-27,208442,Google Rewarded US,15,45
2019-08-28,110146,Amazon In-App,15,55
2019-08-28,121615,Google US Rally,20,60
2019-08-28,208442,Google Rewarded US,0,0
END
#=> 265
# Merge the three inputs (the last argument is the output path) and show the result.
Fout = 'fout.csv'
doit(F1, F2, F3, Fout)
puts File.read(Fout)
Date,ID,Name,Count,Price
2019-08-25,110146,Amazon In-App,1,23
2019-08-25,121615,Google US Rally,0,0
2019-08-25,208442,Google Rewarded US,47,12
2019-08-26,110146,Amazon In-App,30,90
2019-08-26,121615,Google US Rally,5,25
2019-08-26,208442,Google Rewarded US,15,45
2019-08-27,110146,Amazon In-App,30,70
2019-08-27,121615,Google US Rally,12,50
2019-08-27,208442,Google Rewarded US,15,45
2019-08-28,110146,Amazon In-App,15,55
2019-08-28,121615,Google US Rally,20,60
2019-08-28,208442,Google Rewarded US,0,0
See CSV::open, CSV::read, File::new, IO#gets and Hash#values. This article on working with CSV files may be of interest.
How do I use Ruby to combine several CSV files into one big CSV file?
# Columns of the combined file, in output order.
hs = %w[transcription_factor cell_type chromosome start_site end_site project_name]

CSV.open('result.csv', 'w') do |csv|
  csv << hs

  # The first two inputs are copied column-by-column using the header names.
  %w[test1.csv test2.csv].each do |path|
    CSV.foreach(path, headers: true) { |row| csv << row.values_at(*hs) }
  end

  # Rows from test3.csv get fixed values in the first, second and last
  # columns; the chromosome value is reduced to its first run of digits.
  CSV.foreach('test3.csv', headers: true) do |row|
    chromosome_number = row['chromosome'].match(/\d+/).to_s
    start_site, end_site = row.values_at('start_site', 'end_site')
    csv << ['Cmyc', 'PWM', chromosome_number, start_site, end_site, 'JASPAR']
  end
end
How to combine the data from two CSV files in BASH?
And a nice, clean awk solution:
# "$1 in lines" tests membership; a bare lines[$1] would create an empty entry for every unmatched key and END would print it as a blank line.
awk -F" *@ *" 'NR==FNR{lines[$2]=$0} NR!=FNR{if($1 in lines)lines[$1]=lines[$1] " @ " $2} END{for(line in lines)print lines[line]}' file2.csv file1.csv
A nice one-liner. Not a short one, but not the longest I've seen. Note that file2 and file1 are switched. Again, as a script with explanation:
#!/usr/bin/awk -f
# Joins two "@"-separated files on a shared key.
# The first file named on the command line is indexed by its second field.
# Each line of the second file named is matched on its first field and, when
# matched, has its second field appended to the stored line.

# Split fields on @ and the whitespace on either side.
BEGIN { FS = " *@ *" }

# First file
NR == FNR {
    # Store the whole line, keyed by its second field.
    lines[$2] = $0
}

# Second file
NR != FNR {
    # If the appropriate animal was in the first file, append its eating habits.
    # If not, it's discarded; if you want something else, let me know.
    # Membership is tested with "in": merely referencing lines[$1] would
    # create an empty element for an unknown key, and END would then print
    # it as a blank line.
    if ($1 in lines) lines[$1] = lines[$1] " @ " $2
}

# After both files have been processed
END {
    # Loop over all lines in the first file and print them, possibly updated with eating habits.
    # No guarantees on order.
    for (line in lines) print lines[line]
}
Call as awk -f join.awk file2.csv file1.csv, or make executable and ./join.awk file2.csv file1.csv.
Merging csv files with slightly different headers
A batch file can easily parse most CSV lines using FOR /F as long as none of the column values contain a comma. But the FOR /F solution can be tripped up by missing values. Your CSV might have consecutive commas, which indicates a missing value. But FOR /F treats consecutive delimiters as a single delimiter. That problem could be solved by batch, but I don't think it is worth the effort.
PowerShell likely has a good solution for parsing CSV. I know .NET has a class for parsing CSV, and PowerShell can access .NET. But I don't really know PowerShell.
There are free text processing tools like sed that are available for Windows. But that requires a download.
I've written an easy to use hybrid batch/JScript utility called REPL.BAT that can perform regex search and replace on text files.
Assuming that your 1st column in the 2nd file never has a quoted comma in the value, then a solution can be as simple as:
@echo off
rem Builds new.csv from file1.csv and file2.csv with a five-column header.
rem Each findstr call prints the lines that do not contain the given header text.
rem The first repl call appends a trailing comma to every file1 row.
rem The second repl call inserts a column holding one space after the first column of every file2 row.
>new.csv (
echo header1,header2,header3,header4,header5
findstr /v /c:"header1,header2,header3,header4" file1.csv | repl "^(.*)$" "$1,"
findstr /v /c:"header1,header3,header4,header5" file2.csv | repl "^([^,]*)," "$1, ,"
)
Here is the REPL.BAT utility that enables the above solution. Full documentation is built into the script.
@if (@X)==(@Y) @end /* Harmless hybrid line that begins a JScript comment
::************ Documentation ***********
:::
:::REPL Search Replace [Options [SourceVar]]
:::REPL /?
:::
::: Performs a global search and replace operation on each line of input from
::: stdin and prints the result to stdout.
:::
::: Each parameter may be optionally enclosed by double quotes. The double
::: quotes are not considered part of the argument. The quotes are required
::: if the parameter contains a batch token delimiter like space, tab, comma,
::: semicolon. The quotes should also be used if the argument contains a
::: batch special character like &, |, etc. so that the special character
::: does not need to be escaped with ^.
:::
::: If called with a single argument of /? then prints help documentation
::: to stdout.
:::
::: Search - By default this is a case sensitive JScript (ECMA) regular
::: expression expressed as a string.
:::
::: JScript syntax documentation is available at
::: http://msdn.microsoft.com/en-us/library/ae5bf541(v=vs.80).aspx
:::
::: Replace - By default this is the string to be used as a replacement for
::: each found search expression. Full support is provided for
::: substituion patterns available to the JScript replace method.
::: A $ literal can be escaped as $$. An empty replacement string
::: must be represented as "".
:::
::: Replace substitution pattern syntax is documented at
::: http://msdn.microsoft.com/en-US/library/efy6s3e6(v=vs.80).aspx
:::
::: Options - An optional string of characters used to alter the behavior
::: of REPL. The option characters are case insensitive, and may
::: appear in any order.
:::
::: I - Makes the search case-insensitive.
:::
::: L - The Search is treated as a string literal instead of a
::: regular expression. Also, all $ found in Replace are
::: treated as $ literals.
:::
::: E - Search and Replace represent the name of environment
::: variables that contain the respective values. An undefined
::: variable is treated as an empty string.
:::
::: M - Multi-line mode. The entire contents of stdin is read and
::: processed in one pass instead of line by line. ^ anchors
::: the beginning of a line and $ anchors the end of a line.
:::
::: X - Enables extended substitution pattern syntax with support
::: for the following escape sequences:
:::
::: \\ - Backslash
::: \b - Backspace
::: \f - Formfeed
::: \n - Newline
::: \r - Carriage Return
::: \t - Horizontal Tab
::: \v - Vertical Tab
::: \xnn - Ascii (Latin 1) character expressed as 2 hex digits
::: \unnnn - Unicode character expressed as 4 hex digits
:::
::: Escape sequences are supported even when the L option is used.
:::
::: S - The source is read from an environment variable instead of
::: from stdin. The name of the source environment variable is
::: specified in the next argument after the option string.
:::
::************ Batch portion ***********
@echo off
:: With no second argument, the only valid call is the help request REPL /?
:: Help prints the documentation lines of this file, passing them through the
:: JScript portion to strip the leading triple colon. Anything else is an error.
if .%2 equ . (
if "%~1" equ "/?" (
findstr "^:::" "%~f0" | cscript //E:JScript //nologo "%~f0" "^:::" ""
exit /b 0
) else (
call :err "Insufficient arguments"
exit /b 1
)
)
:: Reject an options argument containing any character other than
:: S, M, I, L, E or X, compared case insensitively.
echo(%~3|findstr /i "[^SMILEX]" >nul && (
call :err "Invalid option(s)"
exit /b 1
)
:: Run this same file as JScript, forwarding all arguments unchanged.
cscript //E:JScript //nologo "%~f0" %*
exit /b 0
:: Subroutine err writes its first argument as an error message to stderr.
:err
>&2 echo ERROR: %~1. Use REPL /? to get help.
exit /b
************* JScript portion **********/
// JScript half of REPL.BAT: performs the regex search and replace.
// Arguments: 0 = search, 1 = replace, 2 = option letters (optional),
// 3 = name of the source environment variable (read only with option S).
var env=WScript.CreateObject("WScript.Shell").Environment("Process");
var args=WScript.Arguments;
var search=args.Item(0);
var replace=args.Item(1);
// Every replacement is global; option letters are appended in lower case.
var options="g";
if (args.length>2) {
options+=args.Item(2).toLowerCase();
}
// "m" is left in options, so it is also passed to RegExp below.
var multi=(options.indexOf("m")>=0);
// The s, e, l and x letters are each stripped from options once handled,
// so they are not passed to RegExp below.
var srcVar=(options.indexOf("s")>=0);
if (srcVar) {
options=options.replace(/s/g,"");
}
// Option E: search and replace name environment variables holding the real values.
if (options.indexOf("e")>=0) {
options=options.replace(/e/g,"");
search=env(search);
replace=env(replace);
}
// Option L: treat search as a literal by backslash-escaping regex
// metacharacters, and double every $ in replace so it is inserted literally.
if (options.indexOf("l")>=0) {
options=options.replace(/l/g,"");
search=search.replace(/([.^$*+?()[{\\|])/g,"\\$1");
replace=replace.replace(/\$/g,"$$$$");
}
// Option X: expand escape sequences in replace.
if (options.indexOf("x")>=0) {
options=options.replace(/x/g,"");
// Park each escaped backslash as the placeholder \B first, so that its
// second backslash cannot be read as the start of one of the escapes below.
replace=replace.replace(/\\\\/g,"\\B");
replace=replace.replace(/\\b/g,"\b");
replace=replace.replace(/\\f/g,"\f");
replace=replace.replace(/\\n/g,"\n");
replace=replace.replace(/\\r/g,"\r");
replace=replace.replace(/\\t/g,"\t");
replace=replace.replace(/\\v/g,"\v");
// \xnn and \unnnn: parse the hex digits after the two-character prefix
// and substitute the character with that code.
replace=replace.replace(/\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}/g,
function($0,$1,$2){
return String.fromCharCode(parseInt("0x"+$0.substring(2)));
}
);
// Finally turn each placeholder into a single backslash.
replace=replace.replace(/\\B/g,"\\");
}
// From here on, search holds the compiled regular expression.
var search=new RegExp(search,options);
if (srcVar) {
// Option S: the input is the value of the environment variable named by
// argument 3; the result is written without a trailing newline.
WScript.Stdout.Write(env(args.Item(3)).replace(search,replace));
} else {
while (!WScript.StdIn.AtEndOfStream) {
if (multi) {
// Option M: read all remaining input and process it in one pass.
WScript.Stdout.Write(WScript.StdIn.ReadAll().replace(search,replace));
} else {
// Default: process the input one line at a time.
WScript.Stdout.WriteLine(WScript.StdIn.ReadLine().replace(search,replace));
}
}
}
Merging CSV tables with Ruby
I've been playing with Ruby and CSV for most of the day, so I might be able to help (even though I am a beginner myself), but I don't understand what you want as output (a little example would help).
This example would load only columns "Date", "High" and "Volume" into "my_array".
# Collect the fields at positions 0, 2 and 5 of every row, header row included.
my_array = []
CSV.foreach("data.csv") { |row| my_array << row.values_at(0, 2, 5) }
If you want every column try:
# Collect every row as an array of its fields, header row included.
my_array = []
CSV.foreach("data.csv") { |row| my_array << row }
If you want to access element of array inside array:
# The first index selects the row, the second the field within that row.
p my_array[0][0] #=> "Date"
p my_array[1][0] #=> "2014-07-14"
When you finally get what you want as output, if you are on Windows you can do this from command prompt to save it:
my_file.rb > output_in_text_form.txt
Related Topics
Is Ruby Any Good for Gui Development
Ruby: "If !Object.Nil" or "If Object"
Ruby Outputting to the Same Line as the Previous Output
How to Set a Proxy in Rubys Net/Http
Capybara Trouble Filling in Js Modal
Require Ruby File Without .Rb Extension
Why Does 'Puts(Nil or 4)' Fail in Ruby
How to Upload a JSON File with Secret Keys to Heroku
Ruby - Naming Convention - Letter Case for Acronyms in Class/Module Names
No Such File to Load -- Bundler/Setup (Ruby on Rails)
Do I Need to Indent My Code in Ruby