Manipulating multiple files in R
Alright - I think I hit on all your questions here, but let me know if I missed something. The general process that we will go through here is:
- Identify all of the files that we want to read in and process in our working directory
- Use lapply to iterate over each of those file names to create a single list object that contains all of the data
- Select your columns of interest
- Merge them together by the common column
For the purposes of the example, consider I have four files named file1.txt through file4.txt that all look like this:
x y y2
1 1 2.44281173 -2.32777987
2 2 -0.32999022 -0.60991623
3 3 0.74954561 0.03761497
4 4 -0.44374491 -1.65062852
5 5 0.79140012 0.40717932
6 6 -0.38517329 -0.64859906
7 7 0.92959219 -1.27056731
8 8 0.47004041 2.52418636
9 9 -0.73437337 0.47071120
10 10 0.48385902 1.37193941
## 1. Identify the files to read in: every name in the working directory
##    matching "file<anything>.txt" (dir() and list.files() are identical).
filesToProcess <- list.files(pattern = "file.*\\.txt$")
> filesToProcess
[1] "file1.txt" "file2.txt" "file3.txt" "file4.txt"
## 2. Read every file into a single list with lapply
listOfFiles <- lapply(filesToProcess,
                      function(fname) read.table(fname, header = TRUE))
## 3. Keep only the "x" and "y2" columns of each data frame in the list
listOfFiles <- lapply(listOfFiles, function(d) d[c("x", "y2")])
## NOTE: steps 2 and 3 can be combined by passing colClasses to read.table —
## "NULL" drops the middle column while reading. That code would be:
listOfFiles <- lapply(filesToProcess, function(fname) {
  read.table(fname, header = TRUE,
             colClasses = c("integer", "NULL", "numeric"))
})
## 4. Merge every data frame in the list pairwise on the shared key column "x".
out <- Reduce(function(left, right) merge(left, right, by = "x"), listOfFiles)
## Give the merged columns readable names: "x" plus one name per source file.
colnames(out) <- c("x", sub("\\.txt", "", filesToProcess))
Results in the following:
> out
x file1 file2 file3 file4
1 1 -2.32777987 -0.671934857 -2.32777987 -0.671934857
2 2 -0.60991623 -0.822505224 -0.60991623 -0.822505224
3 3 0.03761497 0.049694686 0.03761497 0.049694686
4 4 -1.65062852 -1.173863215 -1.65062852 -1.173863215
5 5 0.40717932 1.189763270 0.40717932 1.189763270
6 6 -0.64859906 0.610462808 -0.64859906 0.610462808
7 7 -1.27056731 0.928107752 -1.27056731 0.928107752
8 8 2.52418636 -0.856625895 2.52418636 -0.856625895
9 9 0.47071120 -1.290480033 0.47071120 -1.290480033
10 10 1.37193941 -0.235659079 1.37193941 -0.235659079
Read and process multiple files in R
You are passing only the directory name when writing the data. Change the function to:
# All .txt files in the directory, as full paths.
# "\\.txt$" is a proper regex anchored to the extension; the original
# "*.txt" mixes glob and regex syntax and only matches by accident.
files <- list.files(path = "Path/to/my/Directory/",
                    pattern = "\\.txt$",
                    full.names = TRUE)
# Read one input file, derive count/proportion columns for cases and
# controls, and write the augmented table back out with a "result_" prefix.
# `files` is a single file path — lapply() supplies one element per call.
# NOTE(review): assumes columns CASE_ALLELE_CT, A1_CASE_CT, CTRL_ALLELE_CT
# and A1_CTRL_CT exist in every input file — confirm against the data.
FUN <- function(files) {
  CSA_input_data <- data.table::fread(files)
  # := updates the data.table by reference; the original base-R column
  # assignment (DT[, 'col'] = ...) copies the whole table on every step.
  # x21: count derived from the case allele totals, then its proportion.
  CSA_input_data[, x21_CT := (CASE_ALLELE_CT / 2) - A1_CASE_CT]
  CSA_input_data[, x21 := x21_CT / CASE_ALLELE_CT]
  # x22: the same derivation for the control allele totals.
  CSA_input_data[, x22_CT := (CTRL_ALLELE_CT / 2) - A1_CTRL_CT]
  CSA_input_data[, x22 := x22_CT / CTRL_ALLELE_CT]
  # Write next to the input directory, prefixing the original file name.
  write.table(CSA_input_data, paste0("Path/to/my/Directory/result_", basename(files)), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
}
and then use lapply or a for loop.
lapply(files, FUN)
Iteratively read, manipulate multiple excel files and append them into one dataframe using R
You could try:
library(fs)      # NOTE(review): fs is attached but not used below — confirm
library(readxl)
# Bare file names of every workbook under ./test/.
# "\\.xlsx$" is a proper regex; the original "*.xlsx" is a glob that
# list.files() only matches by accident.
file_paths <- list.files("./test/", pattern = "\\.xlsx$")
# Read every workbook, tag each row with the original second column's header
# (which carries the series name), and combine everything once at the end.
# Binding once via do.call avoids the O(n^2) rbind-inside-a-loop pattern of
# growing `df` one file at a time.
df_list <- lapply(file_paths, function(fname) {
  df_temp <- read_xlsx(path = paste0("./test/", fname))
  # `1` is a throwaway column name that cannot collide with real headers.
  df_temp$`1` <- names(df_temp)[2]
  names(df_temp) <- c("date", "value", "name")
  df_temp
})
df <- do.call(rbind, df_list)
rm(df_list)
Output:
> df
# A tibble: 21 x 3
date value name
<dttm> <dbl> <chr>
1 2021-01-07 00:00:00 76.5 J01-J05
2 2021-01-08 00:00:00 93.5 J01-J05
3 2021-01-15 00:00:00 305 J01-J05
4 2021-01-22 00:00:00 -289 J01-J05
5 2021-01-29 00:00:00 -242. J01-J05
6 2021-02-05 00:00:00 -266 J01-J05
7 2021-02-10 00:00:00 -240. J01-J05
8 2021-01-07 00:00:00 323 J01-J09
9 2021-01-08 00:00:00 318. J01-J09
10 2021-01-15 00:00:00 528. J01-J09
# ... with 11 more rows
Update, with function:
# Read one workbook from ./test/, move the second column's header into a
# "name" column, and return a standardized three-column table.
# NOTE(review): this definition masks readxl::read_excel for the rest of the
# session — pick another name if readxl's own read_excel is needed later.
read_excel <- function(name) {
  sheet <- read_xlsx(path = paste0("./test/", name))
  sheet$`1` <- names(sheet)[2]  # header of column 2 becomes row data
  names(sheet) <- c("date", "value", "name")
  sheet
}
df = do.call(rbind, lapply(file_paths, read_excel))
R foreach: Read and manipulate multiple files in parallel
As suggested in the comments, the code below creates one directory per process / tar file, untars the archive, merges the CSVs into a .rds file and then deletes them. Note that vroom seems to need the altrep = FALSE argument to avoid a permission-denied error when the files are deleted.
# Generate sample tars for test: two archives, each bundling two CSVs,
# mimicking the real input layout.
write.csv(mtcars,'file1.csv')
write.csv(mtcars,'file2.csv')
write.csv(iris,'file3.csv')
write.csv(iris,'file4.csv')
# tar = "tar" selects the system "tar" command rather than R's internal tar.
tar('tar1.tar',files=c('file1.csv','file2.csv'),tar="tar")
tar('tar2.tar',files=c('file3.csv','file4.csv'),tar="tar")
# library() (unlike require()) fails loudly if a package is missing;
# require() merely returns FALSE and lets the script break later.
library(dplyr)
library(tidyr)
library(foreach)
library(doParallel)
library(magrittr)
#List all tar files in directory
list_of_files<-list.files(pattern = "\\.tar")
# Packages each parallel worker must attach (passed via foreach's .packages)
packsINeed<-c("vroom","magrittr","dplyr","tidyr","doParallel")
# Start the cluster, process each tar file in parallel, then shut it down.
myCluster <- makeCluster(2, type = "PSOCK")
registerDoParallel(myCluster)
# seq_along() is safe when there are zero tar files; the original
# 1:NROW(list_of_files) would iterate over c(1, 0) in that case.
foreach(i = seq_along(list_of_files), .packages = packsINeed) %dopar% {
  print(paste(list_of_files[i], "which is", i, "of", NROW(list_of_files) ))
  print("2. Untar .csv files inside")
  # One scratch directory per archive so parallel workers never collide.
  fileout <- tools::file_path_sans_ext(list_of_files[i], compression = TRUE)
  exdir <- paste0("temp", fileout)
  untar(tarfile = list_of_files[i], exdir = exdir)
  print("#3. Read in files and add up two columns")
  # altrep = FALSE stops vroom from lazily memory-mapping the CSVs, which
  # would hold file handles open and break the deletion step below.
  df <- vroom::vroom(file.path(exdir, dir(exdir, "*.csv")), altrep = FALSE)
  # df$A<-df$B+df$C # These columns don't exist in mtcars used as example
  print("#4. save RDS")
  saveRDS(object = df, file = file.path(exdir, paste0(fileout, ".rds")))
  print("#5. Clean up files")
  .files <- list.files(exdir, pattern = "\\.csv")
  file.remove(file.path(exdir, .files))
}
# Release the worker processes; the original never stopped the cluster, so
# the two PSOCK workers would linger until the R session ended.
stopCluster(myCluster)
Not sure where the .rds should go, so left for the time being in the temporary folder.
Load multiple .csv files in R, edit them and save as new .csv files named by a list of character strings
You are actually quite close, and using lapply() is really a good idea. As you state, the issue is that it only takes one list as an argument, but you want to work with two. mapply() is a function in base R that you can feed multiple lists into and cycle through synchronously. lapply() and mapply() are both designed to create and manipulate objects in R, but here you want to write files and are not interested in the output within R. The purrr package has the walk*() functions, which are useful when you want to cycle through lists and are only interested in creating side effects (in your case, saving files). purrr::walk2() takes two lists, so you can provide the data and the file names at the same time.
library(purrr)
First I create some example data (I’m basically already using the same concept here as I will below):
# Five example data frames, each with three columns of three sampled values
# (the same construction I will use on the real data below).
test_data <- map(1:5, function(i) data.frame(
  a = sample(1:5, 3),
  b = sample(1:5, 3),
  c = sample(1:5, 3)
))
# Write each example data frame to its own CSV under species_data/.
walk2(test_data,
      paste0("species_data/", 1:5, "test.csv"),
      function(dat, path) write.csv(dat, path))
Instead of getting the file paths and then stripping away the path to get the file names, I just call list.files() twice: once with full.names = TRUE and once with full.names = FALSE.
# Full paths (to read/write the files) and bare names (to derive species
# labels) from the same directory listing — the two calls differ only in
# full.names, so the vectors stay aligned element-for-element.
# "\\.csv$" is the correct regex; the original "*.csv$" mixed glob and
# regex syntax and only matched by accident.
NDOP_filepaths <-
  list.files(
    path = "species_data",
    pattern = "\\.csv$",
    full.names = TRUE,
    recursive = FALSE
  )
NDOP_filenames <-
  list.files(
    path = "species_data",
    pattern = "\\.csv$",
    full.names = FALSE,
    recursive = FALSE
  )
Now I feed the two lists into purrr::walk2(). Using the ~ before the curly brackets I can define the anonymous function a bit more elegantly, and then use .x and .y to refer to the entries of the first and the second list.
# For each file: read it, keep the first two columns, add a species column
# derived from the file name, and write the result back in place.
walk2(NDOP_filepaths,
      NDOP_filenames,
      ~ {
        species <- read.csv(.x)
        species <- species[, 1:2]
        # Anchored, escaped regex: the original gsub(".csv", ...) treats the
        # dot as a wildcard and would also mangle names containing e.g. "xcsv".
        species$species <- gsub("\\.csv$", "", .y)
        write.csv(species, .x)
      })
Learn more about purrr
at purrr.tidyverse.org.
Alternatively, you could just extract the file name in the loop and stick to lapply(), or use purrr::map()/purrr::walk(), like this:
# Same transformation without walk2: derive the file name from the path.
# NOTE(review): the "species///" prefix looks odd — confirm it matches the
# actual directory layout (perhaps "species_data/" was intended).
lapply(NDOP_filepaths,
       function(path) {
         species <- read.csv(path)
         species <- species[, 1:2]
         # Escape the dot and anchor the extension so only the directory
         # prefix and the trailing ".csv" are removed, not arbitrary matches.
         species$species <- gsub("species///|\\.csv$", "", path)
         write.csv(species, gsub("species///", "", path))
       })
Applying a custom function to multiple files and creating unique csv output in R
You can read, manipulate data and write the csv in the same function :
library(dplyr)
# Read one daily csv, impute missing ET values with the column mean,
# aggregate ET to monthly means, and write "<input-name>_processed.csv"
# into output_folder/. Invisibly returns the monthly summary data frame so
# lapply() collects usable results (the original returned write.csv's NULL).
# `file`: path to a single input csv (lapply supplies one per call).
summarize_by_month <- function(file) {
  df <- readr::read_csv(file)
  # Assign mean values to the missing data so monthly means stay defined.
  df <- df %>% mutate(ET = replace(ET, is.na(ET), mean(ET, na.rm = TRUE)))
  # Parse the date column; the format string implies slash-separated dates
  # like "2001/06/30" — confirm against the actual files.
  df$date <- as.Date(df$date, format = "%Y/%m/%d")
  # Summarize by year and month; .groups = "drop" returns an ungrouped
  # data frame instead of one still grouped by year.
  new_df <- df %>%
    mutate(year = format(date, "%Y"), month = format(date, "%m")) %>%
    group_by(year, month) %>%
    summarise(mean_monthly = mean(ET), .groups = "drop")
  write.csv(new_df, sprintf('output_folder/%s_processed.csv',
                            tools::file_path_sans_ext(basename(file))), row.names = FALSE)
  invisible(new_df)
}
monthly_AET = lapply(file_list, summarize_by_month)
Related Topics
Change Level of Multiple Factor Variables
R: How to Get the Last Element from Each Group
R Formatting a Date from a Character Mmm Dd, Yyyy to Class Date
Lapply Function /Loops on List of Lists R
Trying to Find Row Associated with Max Value in Dataframe R
Aggregate by Specific Year in R
How to Learn How to Write C Code to Speed Up Slow R Functions
Scatterplot with Alpha Transparent Histograms in R
How to Manually Set Colors in a Bar Chart
R: Arranging Multiple Plots Together Using Gridextra
Remove All Variables Except Functions
Circular Heatmap That Looks Like a Donut
Ggplot2: Issues with Dual Y-Axes and Loess Smoothing
Referring to Variables by Name in a Dplyr Function Returns Object Not Found Error
What Are Some Good Books, Web Resources, and Projects for Learning R
Writing Functions (Procedures) for Data.Table Objects