Manipulating multiple files in R
Alright - I think I hit on all your questions here, but let me know if I missed something. The general process that we will go through here is:
- Identify all of the files that we want to read in and process in our working directory
- Use lapply to iterate over each of those file names to create a single list object that contains all of the data
- Select your columns of interest
- Merge them together by the common column
For the purposes of the example, consider I have four files named file1.txt through file4.txt that all look like this:
x y y2
1 1 2.44281173 -2.32777987
2 2 -0.32999022 -0.60991623
3 3 0.74954561 0.03761497
4 4 -0.44374491 -1.65062852
5 5 0.79140012 0.40717932
6 6 -0.38517329 -0.64859906
7 7 0.92959219 -1.27056731
8 8 0.47004041 2.52418636
9 9 -0.73437337 0.47071120
10 10 0.48385902 1.37193941
## 1. Identify the files to read in: every name in the working directory
##    matching "file<anything>.txt" (dir() and list.files() are identical).
filesToProcess <- list.files(pattern = "file.*\\.txt$")
> filesToProcess
[1] "file1.txt" "file2.txt" "file3.txt" "file4.txt"
## 2. Read every file into a single list with lapply
listOfFiles <- lapply(filesToProcess,
                      function(fname) read.table(fname, header = TRUE))
## 3. Keep only the "x" and "y2" columns of each data frame in the list
listOfFiles <- lapply(listOfFiles, function(d) d[c("x", "y2")])
## NOTE: steps 2 and 3 can be combined by passing colClasses to read.table —
## "NULL" drops the middle column while reading. That code would be:
listOfFiles <- lapply(filesToProcess, function(fname) {
  read.table(fname, header = TRUE,
             colClasses = c("integer", "NULL", "numeric"))
})
## 4. Merge every data frame in the list pairwise on the shared key column "x".
out <- Reduce(function(left, right) merge(left, right, by = "x"), listOfFiles)
## Give the merged columns readable names: "x" plus one name per source file.
colnames(out) <- c("x", sub("\\.txt", "", filesToProcess))
Results in the following:
> out
x file1 file2 file3 file4
1 1 -2.32777987 -0.671934857 -2.32777987 -0.671934857
2 2 -0.60991623 -0.822505224 -0.60991623 -0.822505224
3 3 0.03761497 0.049694686 0.03761497 0.049694686
4 4 -1.65062852 -1.173863215 -1.65062852 -1.173863215
5 5 0.40717932 1.189763270 0.40717932 1.189763270
6 6 -0.64859906 0.610462808 -0.64859906 0.610462808
7 7 -1.27056731 0.928107752 -1.27056731 0.928107752
8 8 2.52418636 -0.856625895 2.52418636 -0.856625895
9 9 0.47071120 -1.290480033 0.47071120 -1.290480033
10 10 1.37193941 -0.235659079 1.37193941 -0.235659079
Read and process multiple files in R
You are passing only the directory name when writing the data. Change the function to:
# All .txt files in the directory, as full paths.
# "\\.txt$" is a proper regex anchored to the extension; the original
# "*.txt" mixes glob and regex syntax and only matches by accident.
files <- list.files(path = "Path/to/my/Directory/",
                    pattern = "\\.txt$",
                    full.names = TRUE)
# Read one input file, derive count/proportion columns for cases and
# controls, and write the augmented table back out with a "result_" prefix.
# `files` is a single file path — lapply() supplies one element per call.
# NOTE(review): assumes columns CASE_ALLELE_CT, A1_CASE_CT, CTRL_ALLELE_CT
# and A1_CTRL_CT exist in every input file — confirm against the data.
FUN <- function(files) {
  CSA_input_data <- data.table::fread(files)
  # := updates the data.table by reference; the original base-R column
  # assignment (DT[, 'col'] = ...) copies the whole table on every step.
  # x21: count derived from the case allele totals, then its proportion.
  CSA_input_data[, x21_CT := (CASE_ALLELE_CT / 2) - A1_CASE_CT]
  CSA_input_data[, x21 := x21_CT / CASE_ALLELE_CT]
  # x22: the same derivation for the control allele totals.
  CSA_input_data[, x22_CT := (CTRL_ALLELE_CT / 2) - A1_CTRL_CT]
  CSA_input_data[, x22 := x22_CT / CTRL_ALLELE_CT]
  # Write next to the input directory, prefixing the original file name.
  write.table(CSA_input_data, paste0("Path/to/my/Directory/result_", basename(files)), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
}
and then use lapply or a for loop.
lapply(files, FUN)
Iteratively read, manipulate multiple excel files and append them into one dataframe using R
You could try:
library(fs)      # NOTE(review): fs is attached but not used below — confirm
library(readxl)
# Bare file names of every workbook under ./test/.
# "\\.xlsx$" is a proper regex; the original "*.xlsx" is a glob that
# list.files() only matches by accident.
file_paths <- list.files("./test/", pattern = "\\.xlsx$")
# Read every workbook, tag each row with the original second column's header
# (which carries the series name), and combine everything once at the end.
# Binding once via do.call avoids the O(n^2) rbind-inside-a-loop pattern of
# growing `df` one file at a time.
df_list <- lapply(file_paths, function(fname) {
  df_temp <- read_xlsx(path = paste0("./test/", fname))
  # `1` is a throwaway column name that cannot collide with real headers.
  df_temp$`1` <- names(df_temp)[2]
  names(df_temp) <- c("date", "value", "name")
  df_temp
})
df <- do.call(rbind, df_list)
rm(df_list)
Output:
> df
# A tibble: 21 x 3
date value name
<dttm> <dbl> <chr>
1 2021-01-07 00:00:00 76.5 J01-J05
2 2021-01-08 00:00:00 93.5 J01-J05
3 2021-01-15 00:00:00 305 J01-J05
4 2021-01-22 00:00:00 -289 J01-J05
5 2021-01-29 00:00:00 -242. J01-J05
6 2021-02-05 00:00:00 -266 J01-J05
7 2021-02-10 00:00:00 -240. J01-J05
8 2021-01-07 00:00:00 323 J01-J09
9 2021-01-08 00:00:00 318. J01-J09
10 2021-01-15 00:00:00 528. J01-J09
# ... with 11 more rows
Update, with function:
# Read one workbook from ./test/, move the second column's header into a
# "name" column, and return a standardized three-column table.
# NOTE(review): this definition masks readxl::read_excel for the rest of the
# session — pick another name if readxl's own read_excel is needed later.
read_excel <- function(name) {
  sheet <- read_xlsx(path = paste0("./test/", name))
  sheet$`1` <- names(sheet)[2]  # header of column 2 becomes row data
  names(sheet) <- c("date", "value", "name")
  sheet
}
df = do.call(rbind, lapply(file_paths, read_excel))
R foreach: Read and manipulate multiple files in parallel
As suggested in the comments, the code below creates one directory per process / tar file, untars the archive, merges the CSVs into a .rds file and then deletes them. Note that vroom seems to need the altrep = FALSE argument to avoid a permission-denied error when the files are deleted.
# Generate sample tars for test: two archives, each bundling two CSVs,
# mimicking the real input layout.
write.csv(mtcars,'file1.csv')
write.csv(mtcars,'file2.csv')
write.csv(iris,'file3.csv')
write.csv(iris,'file4.csv')
# tar = "tar" selects the system "tar" command rather than R's internal tar.
tar('tar1.tar',files=c('file1.csv','file2.csv'),tar="tar")
tar('tar2.tar',files=c('file3.csv','file4.csv'),tar="tar")
# library() (unlike require()) fails loudly if a package is missing;
# require() merely returns FALSE and lets the script break later.
library(dplyr)
library(tidyr)
library(foreach)
library(doParallel)
library(magrittr)
#List all tar files in directory
list_of_files<-list.files(pattern = "\\.tar")
# Packages each parallel worker must attach (passed via foreach's .packages)
packsINeed<-c("vroom","magrittr","dplyr","tidyr","doParallel")
# Start the cluster, process each tar file in parallel, then shut it down.
myCluster <- makeCluster(2, type = "PSOCK")
registerDoParallel(myCluster)
# seq_along() is safe when there are zero tar files; the original
# 1:NROW(list_of_files) would iterate over c(1, 0) in that case.
foreach(i = seq_along(list_of_files), .packages = packsINeed) %dopar% {
  print(paste(list_of_files[i], "which is", i, "of", NROW(list_of_files) ))
  print("2. Untar .csv files inside")
  # One scratch directory per archive so parallel workers never collide.
  fileout <- tools::file_path_sans_ext(list_of_files[i], compression = TRUE)
  exdir <- paste0("temp", fileout)
  untar(tarfile = list_of_files[i], exdir = exdir)
  print("#3. Read in files and add up two columns")
  # altrep = FALSE stops vroom from lazily memory-mapping the CSVs, which
  # would hold file handles open and break the deletion step below.
  df <- vroom::vroom(file.path(exdir, dir(exdir, "*.csv")), altrep = FALSE)
  # df$A<-df$B+df$C # These columns don't exist in mtcars used as example
  print("#4. save RDS")
  saveRDS(object = df, file = file.path(exdir, paste0(fileout, ".rds")))
  print("#5. Clean up files")
  .files <- list.files(exdir, pattern = "\\.csv")
  file.remove(file.path(exdir, .files))
}
# Release the worker processes; the original never stopped the cluster, so
# the two PSOCK workers would linger until the R session ended.
stopCluster(myCluster)
Not sure where the .rds should go, so left for the time being in the temporary folder.
Load multiple .csv files in R, edit them and save as new .csv files named by a list of character strings
You are actually quite close, and using lapply() is really a good idea. As you state, the issue is that it only takes one list as an argument, but you want to work with two. mapply() is a function in base R that you can feed multiple lists into and cycle through synchronously. lapply() and mapply() are both designed to create and manipulate objects in R, but here you want to write files and are not interested in the output within R. The purrr package has the walk*() functions, which are useful when you want to cycle through lists and are only interested in creating side effects (in your case, saving files). purrr::walk2() takes two lists, so you can provide the data and the file names at the same time.
library(purrr)
First I create some example data (I’m basically already using the same concept here as I will below):
# Five example data frames, each with three columns of three sampled values
# (the same construction I will use on the real data below).
test_data <- map(1:5, function(i) data.frame(
  a = sample(1:5, 3),
  b = sample(1:5, 3),
  c = sample(1:5, 3)
))
# Write each example data frame to its own CSV under species_data/.
walk2(test_data,
      paste0("species_data/", 1:5, "test.csv"),
      function(dat, path) write.csv(dat, path))
Instead of getting the file paths and then stripping away the path to get the file names, I just call list.files() twice: once with full.names = TRUE and once with full.names = FALSE.
# Full paths (to read/write the files) and bare names (to derive species
# labels) from the same directory listing — the two calls differ only in
# full.names, so the vectors stay aligned element-for-element.
# "\\.csv$" is the correct regex; the original "*.csv$" mixed glob and
# regex syntax and only matched by accident.
NDOP_filepaths <-
  list.files(
    path = "species_data",
    pattern = "\\.csv$",
    full.names = TRUE,
    recursive = FALSE
  )
NDOP_filenames <-
  list.files(
    path = "species_data",
    pattern = "\\.csv$",
    full.names = FALSE,
    recursive = FALSE
  )
Now I feed the two lists into purrr::walk2(). Using the ~ before the curly brackets I can define the anonymous function a bit more elegantly, and then use .x and .y to refer to the entries of the first and the second list.
# For each file: read it, keep the first two columns, add a species column
# derived from the file name, and write the result back in place.
walk2(NDOP_filepaths,
      NDOP_filenames,
      ~ {
        species <- read.csv(.x)
        species <- species[, 1:2]
        # Anchored, escaped regex: the original gsub(".csv", ...) treats the
        # dot as a wildcard and would also mangle names containing e.g. "xcsv".
        species$species <- gsub("\\.csv$", "", .y)
        write.csv(species, .x)
      })
Learn more about purrr
at purrr.tidyverse.org.
Alternatively, you could just extract the file name in the loop and stick to lapply(), or use purrr::map()/purrr::walk(), like this:
# Same transformation without walk2: derive the file name from the path.
# NOTE(review): the "species///" prefix looks odd — confirm it matches the
# actual directory layout (perhaps "species_data/" was intended).
lapply(NDOP_filepaths,
       function(path) {
         species <- read.csv(path)
         species <- species[, 1:2]
         # Escape the dot and anchor the extension so only the directory
         # prefix and the trailing ".csv" are removed, not arbitrary matches.
         species$species <- gsub("species///|\\.csv$", "", path)
         write.csv(species, gsub("species///", "", path))
       })
Applying a custom function to multiple files and creating unique csv output in R
You can read, manipulate data and write the csv in the same function :
library(dplyr)
# Read one daily csv, impute missing ET values with the column mean,
# aggregate ET to monthly means, and write "<input-name>_processed.csv"
# into output_folder/. Invisibly returns the monthly summary data frame so
# lapply() collects usable results (the original returned write.csv's NULL).
# `file`: path to a single input csv (lapply supplies one per call).
summarize_by_month <- function(file) {
  df <- readr::read_csv(file)
  # Assign mean values to the missing data so monthly means stay defined.
  df <- df %>% mutate(ET = replace(ET, is.na(ET), mean(ET, na.rm = TRUE)))
  # Parse the date column; the format string implies slash-separated dates
  # like "2001/06/30" — confirm against the actual files.
  df$date <- as.Date(df$date, format = "%Y/%m/%d")
  # Summarize by year and month; .groups = "drop" returns an ungrouped
  # data frame instead of one still grouped by year.
  new_df <- df %>%
    mutate(year = format(date, "%Y"), month = format(date, "%m")) %>%
    group_by(year, month) %>%
    summarise(mean_monthly = mean(ET), .groups = "drop")
  write.csv(new_df, sprintf('output_folder/%s_processed.csv',
                            tools::file_path_sans_ext(basename(file))), row.names = FALSE)
  invisible(new_df)
}
monthly_AET = lapply(file_list, summarize_by_month)
Related Topics
Change Level of Multiple Factor Variables
R: How to Get the Last Element from Each Group
R Formatting a Date from a Character Mmm Dd, Yyyy to Class Date
Lapply Function /Loops on List of Lists R
Trying to Find Row Associated with Max Value in Dataframe R
Aggregate by Specific Year in R
How to Learn How to Write C Code to Speed Up Slow R Functions
Scatterplot with Alpha Transparent Histograms in R
How to Manually Set Colors in a Bar Chart
R: Arranging Multiple Plots Together Using Gridextra
Remove All Variables Except Functions
Circular Heatmap That Looks Like a Donut
Ggplot2: Issues with Dual Y-Axes and Loess Smoothing
Referring to Variables by Name in a Dplyr Function Returns Object Not Found Error
What Are Some Good Books, Web Resources, and Projects for Learning R
Writing Functions (Procedures) for Data.Table Objects