Ruby How to Merge Two CSV Files with Slightly Different Headers

Ruby how to merge two CSV files with slightly different headers

Challenge accepted :)

#!/usr/bin/env ruby
require "csv"

# Merges any number of CSV files whose header rows only partially overlap.
#
# The merged output has one column per distinct header found across all
# inputs (sorted alphabetically); a row leaves a column empty when its source
# file did not have that header.
module MergeCsv
  class << self
    # Reads every file in +csv_paths+ (first line treated as the header row)
    # and returns the merged CSV as a String.
    def run(csv_paths)
      tables = csv_paths.map { |path| CSV.read(path, headers: true) }
      merge(tables)
    end

    private

    # Builds the merged CSV text from already-parsed tables.
    def merge(csv_files)
      columns = csv_files.flat_map(&:headers).uniq.sort
      records = csv_files.flat_map { |table| csv_to_hash_array(table) }

      CSV.generate do |out|
        out << columns
        # values_at yields nil (an empty field) for columns a record lacks.
        records.each { |record| out << record.values_at(*columns) }
      end
    end

    # Turns a parsed table into an array of { header => value } hashes,
    # one per data row. Simple rather than fast.
    def csv_to_hash_array(csv)
      names = csv.headers
      # to_a puts the header row first; drop it and keep the data rows.
      csv.to_a.drop(1).map { |fields| names.zip(fields).to_h }
    end
  end
end

# Command-line entry point: every argument is a path to a CSV file.
# With no arguments, print the usage line and exit with status 1.
if ARGV.empty?
  puts "Use: ruby merge_csv.rb <file_path_csv_1> <file_path_csv_2>"
  exit 1
end

# Print the merged CSV text to standard output.
merged_csv = MergeCsv.run(ARGV)
puts merged_csv

Generate a new CSV combining two csv files

First, you need to parse both files: you could store each row in a hash, or create a new class and store instances of that class.
Second, you need to pair the entries that share the same email (if you create instances of your own class, you can assign each note to the right instance while parsing the second CSV).
Finally, write the combined data out to a new CSV file.

Have a look at this gem - it might be helpful
https://github.com/ruby/csv

How does that sound?

EDIT: here is the code if you use a class to solve the problem

# Holds the fields of one person record plus an optional note.
#
# Every attribute is read-only except +note+, which may be reassigned after
# construction.
class Person
  # Read-only state: name, email, phone, job, created_at
  attr_reader :name, :email, :phone, :job, :created_at
  # The note can be read and replaced later.
  attr_accessor :note

  # Stores the six values as given; no validation or conversion is done.
  def initialize(name, email, phone, job, created_at, note)
    @name, @email, @phone = name, email, phone
    @job, @created_at, @note = job, created_at, note
  end
end

# Little test: build one Person by hand and inspect it.
person_1 = Person.new("john", "john@john.us", "112", "police", "21.02.", nil)
p person_1

require 'csv'

# Step 1: read persons.csv (first row is the header) into Person objects.
# The note is not known yet, so it starts out as nil.
csv_options = { headers: :first_row }
filepath = 'persons.csv'
persons = []

# CSV options are keyword arguments. Since Ruby 3.0 a positional Hash is no
# longer converted to keywords, so the options Hash must be splatted with **.
CSV.foreach(filepath, **csv_options) do |row|
  persons << Person.new(row["name"], row["email"], row["phone"], row["job"], row["created_at"], nil)
end

# Step 2: read notes.csv and attach each note to every person with the same
# email. When an email occurs more than once, the last note row wins.
filepath_2 = "notes.csv"
CSV.foreach(filepath_2, **csv_options) do |row|
  persons.each do |person|
    person.note = row["note"] if person.email == row["email"]
  end
end

p persons

# Step 3: write the combined records, quoting every field.
csv_options = { col_sep: ',', force_quotes: true, quote_char: '"' }
filepath = 'combined.csv'

CSV.open(filepath, 'wb', **csv_options) do |csv|
  csv << ['name', 'email', 'phone', 'job', 'created_at', "note"]
  persons.each do |person|
    csv << [person.name, person.email, person.phone, person.job, person.created_at, person.note]
  end
end

Join two CSV files in Ruby without using tables

Givens

We are given the following.

The paths for the two input files:

fname1 = 't1.csv'
fname2 = 't2.csv'

The path for the output file:

fname3 = 't3.csv'

The names of the headers to match in each of the two input files:

target1 = 'B'
target2 = 'E'

I do not assume that (as is the case with the example) the two files necessarily contain the same number of lines; only the rows up to the length of the shorter file are compared.

Create test files

Let's first create the two files:

# First sample file: header row A,B,C followed by four data rows.
rows = [%w|A B C|, %w|1 1 1|, %w|2 2 2|, %w|3 4 5|, %w|6 9 9|]
str = rows.map { |fields| fields.join(",") }.join("\n")
#=> "A,B,C\n1,1,1\n2,2,2\n3,4,5\n6,9,9"
File.write(fname1, str)
#=> 29

# Second sample file: header row D,E,F followed by four data rows.
rows = [%w|D E F|, %w|21 1 41|, %w|22 5 42|, %w|23 8 45|, %w|26 9 239|]
str = rows.map { |fields| fields.join(",") }.join("\n")
#=> "D,E,F\n21,1,41\n22,5,42\n23,8,45\n26,9,239"
File.write(fname2, str)
#=> 38

Read the input files into CSV::Table objects

When reading fname1 I will use the :header_converters option to convert the header "B" to "B/E". Note that this does not require knowledge of the location of the column with header "B" (or whatever it may be).

require 'csv'

new_target1 = target1 + "/" + target2
#=> "B/E"

# Rename the join column of the first file while its header row is parsed;
# every other header is passed through unchanged.
rename_join_header = ->(header) { header == target1 ? new_target1 : header }

csv1 = CSV.read(fname1, headers: true, header_converters: rename_join_header)
csv2 = CSV.read(fname2, headers: true)

Construct arrays of headers to be written from each input file

headers1 = csv1.headers
#=> ["A", "B/E", "C"]
headers2 = csv2.headers - [target2]
#=> ["D", "F"]

Create the output file

We will first write the new headers headers1 + headers2 to the output file.

Next, for each row index i (i = 0 corresponding to the first row after the header row in each file), for which a condition is satisfied, we write as a single row the elements of csv1[i] and csv2[i] that are in the columns having headers in headers1 and headers2. The condition to be satisfied to write the rows at index i is that i satisfies:

csv1[i][new_target1] == csv2[i][target2] #=> true

Now open fname3 for writing, write the headers and then the body.

# Write the joined file: the combined header line first, then one combined
# row for each index at which the join columns of both tables are equal.
CSV.open(fname3, 'w') do |csv|
  csv << headers1 + headers2
  # Only indices present in both tables can be compared.
  shared_row_count = [csv1.size, csv2.size].min
  shared_row_count.times do |i|
    left = csv1[i]
    right = csv2[i]
    next unless left[new_target1] == right[target2]

    csv << headers1.map { |h| left[h] } + headers2.map { |h| right[h] }
  end
end
#=> 4

Let's confirm that what was written is correct.

puts File.read(fname3)
A,B/E,C,D,F
1,1,1,21,41
6,9,9,26,239

How to merge multiple csv files into one with unique columns in Rails

Code

require 'csv'

# Merges several CSV files into +csv_output_file+, keeping one row per
# unique ('Date', 'ID') pair.
#
# The header line of the first input file becomes the output header. When
# the same ('Date', 'ID') pair occurs more than once, the row read last
# replaces the earlier one but stays at the position where the pair was
# first seen.
#
# Each row is written by header name rather than by position, so an input
# file that lists the same columns in a different order still lines up with
# the output header. A column missing from a row is written as an empty field.
#
# @param csv_input_files [Array<String>] paths of the files to merge, in order
# @param csv_output_file [String] path of the file to (over)write
def doit(*csv_input_files, csv_output_file)
  CSV.open(csv_output_file, "wb", headers: true) do |csv_out|
    header = CSV.open(csv_input_files.first, &:readline)
    csv_out << header

    rows_by_key = csv_input_files.each_with_object({}) do |path, rows|
      CSV.foreach(path, headers: true) do |row|
        # Later files overwrite earlier rows that share the same key.
        rows[[row['Date'], row['ID']]] = row
      end
    end

    rows_by_key.values.each do |row|
      csv_out << header.map { |name| row[name] }
    end
  end
end

Example

F1   = 'f1.csv'
F2 = 'f2.csv'
F3 = 'f3.csv'

File.write(F1, <<~END)
Date,ID,Name,Count,Price
2019-08-25,110146,Amazon In-App,1,23
2019-08-25,121615,Google US Rally,0,0
2019-08-25,208442,Google Rewarded US,47,12
2019-08-26,110146,Amazon In-App,10,40
2019-08-26,121615,Google US Rally,0,0
2019-08-26,208442,Google Rewarded US,0,0
END
#=> 260

File.write(F2, <<~END)
Date,ID,Name,Count,Price
2019-08-26,110146,Amazon In-App,30,90
2019-08-26,121615,Google US Rally,5,25
2019-08-26,208442,Google Rewarded US,15,45
2019-08-27,110146,Amazon In-App,5,15
2019-08-27,121615,Google US Rally,10,40
2019-08-27,208442,Google Rewarded US,0,0
END
#=> 263

File.write(F3, <<~END)
Date,ID,Name,Count,Price
2019-08-27,110146,Amazon In-App,30,70
2019-08-27,121615,Google US Rally,12,50
2019-08-27,208442,Google Rewarded US,15,45
2019-08-28,110146,Amazon In-App,15,55
2019-08-28,121615,Google US Rally,20,60
2019-08-28,208442,Google Rewarded US,0,0
END
#=> 265

Fout = 'fout.csv'

doit(F1, F2, F3, Fout)

puts File.read(Fout)
Date,ID,Name,Count,Price
2019-08-25,110146,Amazon In-App,1,23
2019-08-25,121615,Google US Rally,0,0
2019-08-25,208442,Google Rewarded US,47,12
2019-08-26,110146,Amazon In-App,30,90
2019-08-26,121615,Google US Rally,5,25
2019-08-26,208442,Google Rewarded US,15,45
2019-08-27,110146,Amazon In-App,30,70
2019-08-27,121615,Google US Rally,12,50
2019-08-27,208442,Google Rewarded US,15,45
2019-08-28,110146,Amazon In-App,15,55
2019-08-28,121615,Google US Rally,20,60
2019-08-28,208442,Google Rewarded US,0,0

See CSV::open, CSV::read, File::new, IO#gets and Hash#values. This article on working with CSV files may be of interest.

How do I use Ruby to combine several CSV files into one big CSV file?

# Column names of the combined file, in output order.
hs = %w[transcription_factor cell_type chromosome start_site end_site project_name]

CSV.open('result.csv', 'w') do |csv|
  csv << hs

  # test1.csv and test2.csv are copied column-by-column, selected by header name.
  %w[test1.csv test2.csv].each do |path|
    CSV.foreach(path, headers: true) { |row| csv << row.values_at(*hs) }
  end

  # test3.csv rows get fixed values for three columns and only the first run
  # of digits from their 'chromosome' field.
  CSV.foreach('test3.csv', headers: true) do |row|
    chromosome_digits = row['chromosome'].match(/\d+/).to_s
    csv << ['Cmyc', 'PWM', chromosome_digits, *row.values_at('start_site', 'end_site'), 'JASPAR']
  end
end

How to combine the data from two CSV files in BASH?

And a nice, clean awk solution:

# Uses `$1 in lines` for the membership test: merely referencing lines[$1] would create an empty entry that END then prints as a blank line.
awk -F" *@ *" 'NR==FNR{lines[$2]=$0} NR!=FNR{if($1 in lines)lines[$1]=lines[$1] " @ " $2} END{for(line in lines)print lines[line]}' file2.csv file1.csv

A nice one-liner. Not a short one, but not the longest I've seen. Note that file2 and file1 are switched. Again, as a script with explanation:

#!/usr/bin/awk -f

# Split fields on @ and the whitespace on either side.
BEGIN { FS = " *@ *" }

# First file: remember each whole line, keyed by its second field.
NR == FNR {
    lines[$2] = $0
}

# Second file
NR != FNR {
    # If the key (first field) was seen in the first file, append this
    # line's second field to the stored line. Otherwise the line is discarded.
    #
    # The test must use the `in` operator: merely referencing lines[$1]
    # creates an empty element for an unknown key, and the END loop would
    # then print it as a blank line.
    if ($1 in lines) lines[$1] = lines[$1] " @ " $2
}

# After both files have been processed
END {
    # Print every stored line from the first file, possibly extended with
    # data from the second. No guarantees on order.
    for (key in lines) print lines[key]
}

Call as awk -f join.awk file2.csv file1.csv, or make executable and ./join.awk file2.csv file1.csv.

Merging csv files with slightly different headers

A batch file can easily parse most CSV lines using FOR /F as long as none of the column values contain a comma. But the FOR /F solution can be tripped up by missing values. Your CSV might have consecutive commas, which indicates a missing value. But FOR /F treats consecutive delimiters as a single delimiter. That problem could be solved by batch, but I don't think it is worth the effort.

PowerShell likely has a good solution for parsing CSV. I know .NET has a class for parsing CSV, and PowerShell can access .NET. But I don't really know PowerShell.

There are free text processing tools like sed that are available for Windows. But that requires a download.

I've written an easy to use hybrid batch/JScript utility called REPL.BAT that can perform regex search and replace on text files.

Assuming that your 1st column in the 2nd file never has a quoted comma in the value, then a solution can be as simple as:

@echo off
rem Build new.csv from file1.csv and file2.csv under one five-column header.
rem All output produced inside the parenthesized block is redirected to new.csv:
rem   1. the merged header line is written first
rem   2. lines of file1.csv not containing its header text get a trailing comma appended
rem   3. lines of file2.csv not containing its header text get a field holding one space inserted after the first field
>new.csv (
echo header1,header2,header3,header4,header5
findstr /v /c:"header1,header2,header3,header4" file1.csv | repl "^(.*)$" "$1,"
findstr /v /c:"header1,header3,header4,header5" file2.csv | repl "^([^,]*)," "$1, ,"
)

Here is the REPL.BAT utility that enables the above solution. Full documentation is built into the script.

@if (@X)==(@Y) @end /* Harmless hybrid line that begins a JScript comment

::************ Documentation ***********
:::
:::REPL Search Replace [Options [SourceVar]]
:::REPL /?
:::
::: Performs a global search and replace operation on each line of input from
::: stdin and prints the result to stdout.
:::
::: Each parameter may be optionally enclosed by double quotes. The double
::: quotes are not considered part of the argument. The quotes are required
::: if the parameter contains a batch token delimiter like space, tab, comma,
::: semicolon. The quotes should also be used if the argument contains a
::: batch special character like &, |, etc. so that the special character
::: does not need to be escaped with ^.
:::
::: If called with a single argument of /? then prints help documentation
::: to stdout.
:::
::: Search - By default this is a case sensitive JScript (ECMA) regular
::: expression expressed as a string.
:::
::: JScript syntax documentation is available at
::: http://msdn.microsoft.com/en-us/library/ae5bf541(v=vs.80).aspx
:::
::: Replace - By default this is the string to be used as a replacement for
::: each found search expression. Full support is provided for
::: substituion patterns available to the JScript replace method.
::: A $ literal can be escaped as $$. An empty replacement string
::: must be represented as "".
:::
::: Replace substitution pattern syntax is documented at
::: http://msdn.microsoft.com/en-US/library/efy6s3e6(v=vs.80).aspx
:::
::: Options - An optional string of characters used to alter the behavior
::: of REPL. The option characters are case insensitive, and may
::: appear in any order.
:::
::: I - Makes the search case-insensitive.
:::
::: L - The Search is treated as a string literal instead of a
::: regular expression. Also, all $ found in Replace are
::: treated as $ literals.
:::
::: E - Search and Replace represent the name of environment
::: variables that contain the respective values. An undefined
::: variable is treated as an empty string.
:::
::: M - Multi-line mode. The entire contents of stdin is read and
::: processed in one pass instead of line by line. ^ anchors
::: the beginning of a line and $ anchors the end of a line.
:::
::: X - Enables extended substitution pattern syntax with support
::: for the following escape sequences:
:::
::: \\ - Backslash
::: \b - Backspace
::: \f - Formfeed
::: \n - Newline
::: \r - Carriage Return
::: \t - Horizontal Tab
::: \v - Vertical Tab
::: \xnn - Ascii (Latin 1) character expressed as 2 hex digits
::: \unnnn - Unicode character expressed as 4 hex digits
:::
::: Escape sequences are supported even when the L option is used.
:::
::: S - The source is read from an environment variable instead of
::: from stdin. The name of the source environment variable is
::: specified in the next argument after the option string.
:::

::************ Batch portion ***********
@echo off
rem With no second argument there is nothing to replace with: either print the
rem built-in help (the lines of this file that start with three colons, with
rem that prefix stripped by the JScript portion) or report an error.
if .%2 equ . (
if "%~1" equ "/?" (
findstr "^:::" "%~f0" | cscript //E:JScript //nologo "%~f0" "^:::" ""
exit /b 0
) else (
call :err "Insufficient arguments"
exit /b 1
)
)
rem Reject an option string containing any character other than S, M, I, L, E or X.
echo(%~3|findstr /i "[^SMILEX]" >nul && (
call :err "Invalid option(s)"
exit /b 1
)
rem Run this same file through the JScript engine, forwarding every argument.
cscript //E:JScript //nologo "%~f0" %*
exit /b 0

rem Subroutine: write the message given as first argument to stderr.
:err
>&2 echo ERROR: %~1. Use REPL /? to get help.
exit /b

************* JScript portion **********/
// JScript half of REPL: reads the search/replace arguments, applies the
// option letters, then runs the replacement over stdin or over an
// environment variable and writes the result to stdout.

// Process-level environment variables, used by the E and S options.
var env=WScript.CreateObject("WScript.Shell").Environment("Process");
var args=WScript.Arguments;
var search=args.Item(0);
var replace=args.Item(1);
// The replacement is always global; option letters from the third argument
// are appended, and the ones that are not regex flags are stripped below.
var options="g";
if (args.length>2) {
options+=args.Item(2).toLowerCase();
}
// "m" is left in options, so it also reaches the RegExp constructor as a flag.
var multi=(options.indexOf("m")>=0);
var srcVar=(options.indexOf("s")>=0);
if (srcVar) {
options=options.replace(/s/g,"");
}
// E: search and replace name environment variables holding the real values.
if (options.indexOf("e")>=0) {
options=options.replace(/e/g,"");
search=env(search);
replace=env(replace);
}
// L: literal mode. Escape regex metacharacters in the search string and
// double every $ in the replacement so it is taken literally.
if (options.indexOf("l")>=0) {
options=options.replace(/l/g,"");
search=search.replace(/([.^$*+?()[{\\|])/g,"\\$1");
replace=replace.replace(/\$/g,"$$$$");
}
// X: expand escape sequences in the replacement string.
if (options.indexOf("x")>=0) {
options=options.replace(/x/g,"");
// Park every escaped backslash as the marker \B first, so the rules below
// cannot misread its second backslash as the start of another escape.
replace=replace.replace(/\\\\/g,"\\B");
replace=replace.replace(/\\b/g,"\b");
replace=replace.replace(/\\f/g,"\f");
replace=replace.replace(/\\n/g,"\n");
replace=replace.replace(/\\r/g,"\r");
replace=replace.replace(/\\t/g,"\t");
replace=replace.replace(/\\v/g,"\v");
// \xnn and \unnnn: drop the two-character prefix and parse the rest as hex.
replace=replace.replace(/\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}/g,
function($0,$1,$2){
return String.fromCharCode(parseInt("0x"+$0.substring(2)));
}
);
// Turn the parked \B markers into single backslashes.
replace=replace.replace(/\\B/g,"\\");
}
// From here on, search is a compiled RegExp rather than a string.
var search=new RegExp(search,options);

if (srcVar) {
// S: the source text is the environment variable named by the fourth argument.
WScript.Stdout.Write(env(args.Item(3)).replace(search,replace));
} else {
while (!WScript.StdIn.AtEndOfStream) {
if (multi) {
// M: read all remaining input at once and replace in a single pass.
WScript.Stdout.Write(WScript.StdIn.ReadAll().replace(search,replace));
} else {
// Default: process one input line at a time.
WScript.Stdout.WriteLine(WScript.StdIn.ReadLine().replace(search,replace));
}
}
}

Merging CSV tables with Ruby

I've been playing with Ruby and CSV for most of today, so I might be able to help (even though I am a beginner myself), but I don't understand what you want as output (a small example would help).

This example would load only columns "Date", "High" and "Volume" into "my_array".

# Collect the fields at positions 0, 2 and 5 from every line of data.csv.
# Without a block, CSV.foreach returns an enumerator over the parsed lines.
my_array = CSV.foreach("data.csv").map { |row| [row[0], row[2], row[5]] }

If you want every column try:

# Collect every parsed line of data.csv, all fields included.
my_array = CSV.foreach("data.csv").to_a

If you want to access an element of an array inside the array:

puts my_array[0][0].inspect #=> "Date"
puts my_array[1][0].inspect #=> "2014-07-14"

When you finally get what you want as output, if you are on Windows you can do this from command prompt to save it:

my_file.rb > output_in_text_form.txt


Related Topics



Leave a reply



Submit