How can I write dplyr groups to separate files?
You can wrap the csv write process in a custom function as follows. Note that the function has to return
a data.frame
else it returns an error Error: Results are not data frames at positions
This will return 3 csv files named "mtcars_cyl_4.csv","mtcars_cyl_6.csv" and "mtcars_cyl_8.csv"
customFun = function(DF) {
write.csv(DF,paste0("mtcars_cyl_",unique(DF$cyl),".csv"))
return(DF)
}
mtcars %>%
group_by(cyl) %>%
do(customFun(.))
Writing out group_by with length 1 to individual text files in R
dplyrlibrary(dplyr)
newDF <- DF %>%
group_by(Locus) %>%
filter(n() > 1) %>%
nest_by()
newDF
# # A tibble: 2 x 2
# # Rowwise: Locus
# Locus data
# <chr> <list<tbl_df[,4]>>
# 1 2 [2 x 4]
# 2 3 [2 x 4]
mapply(function(x, nm) write.csv(x, nm),
newDF$data, paste0("loc", newDF$Locus, ".csv"))
# [[1]]
# NULL
# [[2]]
# NULL
library(dplyr)
newDF <- DF %>%
group_by(Locus) %>%
filter(n() > 1) %>%
nest_by()
newDF
# # A tibble: 2 x 2
# # Rowwise: Locus
# Locus data
# <chr> <list<tbl_df[,4]>>
# 1 2 [2 x 4]
# 2 3 [2 x 4]
mapply(function(x, nm) write.csv(x, nm),
newDF$data, paste0("loc", newDF$Locus, ".csv"))
# [[1]]
# NULL
# [[2]]
# NULL
The files are created in the current directory. You can safely ignore the NULL
output from mapply
.
data.tablelibrary(data.table)
DT <- as.data.table(DF)
newDT <- DT[, .SD[.N > 1, .(data = list(.SD))], by = Locus]
newDT
# Locus data
# <char> <list>
# 1: 2 <data.table[2x4]>
# 2: 3 <data.table[2x4]>
mapply(function(x, nm) write.csv(x, nm),
newDF$data, paste0("loc", newDF$Locus, ".csv"))
output of group_split needs to be saved as separate dataframe
library(data.table)
DT <- as.data.table(DF)
newDT <- DT[, .SD[.N > 1, .(data = list(.SD))], by = Locus]
newDT
# Locus data
# <char> <list>
# 1: 2 <data.table[2x4]>
# 2: 3 <data.table[2x4]>
mapply(function(x, nm) write.csv(x, nm),
newDF$data, paste0("loc", newDF$Locus, ".csv"))
group_split
will not return you name of the year as list name. Use base::split
instead.
y <- split(year_x, year_x$Year)
for(i in seq_along(y)) {
write.csv2(y[[i]], paste0("D:/newfolder/", names(y)[i], ".csv"),row.names = FALSE)
}
You could also do this with purrr::imap
purrr::imap(y, ~write.csv2(.x, paste0("D:/newfolder/", .y, ".csv"),row.names = FALSE))
R - Group by, then iterate through group and extract original column values
Generating your "data":
df <- data.frame("ID1" = c("A","B","C","E","G","H"), "ID2" = c("B","B","D","Fe","Fe","I"), "V1" = c("var1","var1","var2","var3","var3","var4"),"V2" = c("foo","foo","bar","foo","foo","zed"), "V3" = c(1,1,2,3,3,2))
Clustering the data and getting the unique clusters:
library(dplyr)
df_clust <- df %>% group_by(V1,V2,V3)
df_tally <- df_clust %>% tally()
Looping through, assuming only two ID columns and 3 feature columns, and printing each result to a new file:
for (i in c(1:nrow(df_tally))){
pull1 <- df %>% filter(V1 == unlist(df_tally[i,1]), V2 == unlist(df_tally[i,2]), V3 == unlist(df_tally[i,3])) %>% pull(ID1)
pull2 <- df %>% filter(V1 == unlist(df_tally[i,1]), V2 == unlist(df_tally[i,2]), V3 == unlist(df_tally[i,3])) %>% pull(ID2)
mergeID <- c(type.convert(pull1,as.is = TRUE),type.convert(pull2,as.is = TRUE))
mergeID <- unique(mergeID)
filename <- paste("TEST_",i, ".txt", sep="")
fileConn<-file(filename)
writeLines(mergeID, fileConn)
close(fileConn)
}
R Generating multiple Excel files based on aggregated data
If you use purrr
from the tidyverse
you can avoid the for loop.
If you take your code above and wrap it into a basic function you can just iterate over the function for each site name using purrr::map
.
Your setup:
#Load libraries
library(dplyr)
library(xlsx)
library(purrr)
#Reproducible Data Frame
df=data.frame(Site=c("Tokyo Harbor","Tokyo Harbor","Tokyo Harbor","Arlington","Arlington","Cairo Skyline","Cairo Skyline"),
Seating=c("comfy never a problem to find","difficult","ease and quick","nobody to help","nice n comfy","old seats","nt bad"),
Decor=c("very beautiful","i loved it!!!","nice","great","nice thanks","no response","yea nice"),
Reception=c("always neat","I wasn't happy with the decor on this site","great!","immaculate","happy very helpful","","I wont bother again"),
Toilets=c("well maintained","nicely managed","long queues could do better","","cleaner toilets needed!","no toilet roll in the mens loo","flush for god's sake!!!"),
Comfort=c("very comfortable and heated","I felt like I was home","","couldn't be better","very nice and kush","not comment","fresh eyes needed"),
Speed=c("rapid service","no delays ever got everything I needed on time","","","I have grown accustomed to the speed of service","machines","super duper quick"),
Efficiency=c("very efficient, the servers were great","spot on","","I was quite disappointed in the efficiency","clockwork","parfait",""),
Courtesy=c("Staff were very polite","smiling faces everywhere, loved it","very welcoming and kind","the hostess was a bit rude","trés impoli","noo",""),
Responsiveness=c("On the ball all the time","super quick whenever help was needed","","","","want more service like this",""))
#Transform all columns with empty cells to NAs
df[df==""] <- NA
Your Steps in a function:
- Take your dataframe and filter by argument site name
- Perform all the steps you did above
- Write site df to spreadsheet
The Function:
export_site_data <- function(site.name){
###########################
#STEP 0: filter by block site
df <- df %>% filter(Site %in% site.name)
###########################
#STEP 1: Define the blocks
#Block 1: Overall = Seating + Decor + Reception + Toilets
BlockOverall=c(names(df)[2],names(df)[3],names(df)[4],names(df)[5])
#Block 2: Comfort & Speed = Comfort + Speed
BlockComfortSpeed=c(names(df)[6],names(df)[7])
#Block 3: Operations = Efficiency + Courtesy + Responsiveness
BlockOps=c(names(df)[8],names(df)[9],names(df)[10])
###############################################
#STEP 2: Group comments based on defined blocks
#Group Overall
Data_Overall= df %>%
select(BlockOverall)
Data_Overall = Data_Overall %>%
do(.,data.frame(Comments_Overall=unlist(Data_Overall,use.names = F))) %>%
filter(complete.cases(.))
#Group Comfort & Speed
Data_ComfortSpeed= df %>%
select(BlockComfortSpeed)
Data_ComfortSpeed = Data_ComfortSpeed %>%
do(.,data.frame(Comments_ComfortSpeed=unlist(Data_ComfortSpeed,use.names = F))) %>%
filter(complete.cases(.))
#Group Operations
Data_Operations= df %>%
select(BlockOps)
Data_Operations = Data_Operations %>%
do(.,data.frame(Comments_Operations=unlist(Data_Operations,use.names = F))) %>% filter(complete.cases(.))
library(xlsx)
write.xlsx(Data_Overall, paste0("Comments_",site.name,"_2017.xlsx"), sheetName =
'Overall',row.names = F) #Tab 1
write.xlsx(Data_ComfortSpeed, paste0("Comments_",site.name,"_2017.xlsx"), sheetName =
'Comfort_&_Speed',row.names = F,append = T) #Tab 2
write.xlsx(Data_Operations, paste0("Comments_",site.name,"_2017.xlsx"), sheetName =
'Operations',row.names = F,append = T) #Tab 3
}
Use Map to iterate through site names
site.name <- unique(df$Site)
site.name %>% map(export_site_data )
The results:
list.files(pattern = "Comments_")
[1] "Comments_Arlington_2017.xlsx" "Comments_Cairo Skyline_2017.xlsx"
[3] "Comments_Tokyo Harbor_2017.xlsx"
custom grouped dplyr function (sample_n)
We could check the number of rows in the group and pass the value to sample_n
accordingly.
library(dplyr)
n <- 8
temp <- mtcars %>% group_by(cyl) %>% sample_n(if(n() < n) n() else n)
temp
# mpg cyl disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
# 2 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
# 3 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
# 4 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
# 5 26 4 120. 91 4.43 2.14 16.7 0 1 5 2
# 6 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
# 7 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
# 8 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
# 9 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#10 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
# … with 13 more rows
We can check number of rows in each group after that.
table(temp$cyl)
#4 6 8
#8 7 8
table(mtcars$cyl)
# 4 6 8
#11 7 14
write.csv() in dplyr chain
This appeared to work for me in version 0.2:
mtcars %>% filter(cyl == 4) %>% write.csv(.,file = "~/Desktop/piping.csv")
dplyr - Group by and select TOP x %
Or another option with dplyr:
mtcars %>% select(gear, wt) %>%
group_by(gear) %>%
arrange(gear, desc(wt)) %>%
filter(wt > quantile(wt, .8))
Source: local data frame [7 x 2]
Groups: gear [3]
gear wt
(dbl) (dbl)
1 3 5.424
2 3 5.345
3 3 5.250
4 4 3.440
5 4 3.440
6 4 3.190
7 5 3.570
R dplyr group summarize and filter when ID exists in multiple groups
You can also use a combination of length
and unique
to filter
as well:
library(dplyr)
df1 %>%
group_by(ID, Category) %>%
summarize(CNT = n(), amount = sum(Amount)) %>%
filter(length(unique(Category)) > 1)
Output
ID Category CNT amount
<chr> <chr> <int> <dbl>
1 V3 a 1 1
2 V3 b 2 2
3 V5 b 1 1
4 V5 c 2 2
Or here is a base R option using aggregate
to do the summary, then using ave
to do the filtering. Here, Amount
is the variable that we want to apply 2 functions to (i.e., length
and sum
), but we want to do that for each group (ID
and Category
). aggregate
will return a matrix with the results in 2 columns. So, to integrate those with the rest of the dataframe, we can use do.call
to bind each of those columns to the dataframe. Then, we can rename the columns with the desired column names with setNames
.
df1_output <-
setNames(do.call(
data.frame,
aggregate(
Amount ~ ID + Category,
data = df1,
FUN = function(x)
c(CNT = length(x), amount = sum(x))
)
), c(names(df1[1:2]), "CNT", "amount"))
df1_output[with(df1_output, ave(Category, ID, FUN = function(x) length(unique(x))) > 1),]
Related Topics
Is R Superstitious Regarding Posixct Data Type
How to Remove Rows with All Zeros Without Using Rowsums in R
Aggregate by Specific Year in R
Multiple Graphs of Each Time Series
R: Multiple Linear Regression Model and Prediction Model
Create Top-To-Bottom Fade/Gradient Geom_Density in Ggplot2
Convert Data Frame into Vector
Shiny Doesn't Show Me the Entire Selectinput When I Have Choices > 1000
Bookmarking and Saving the Bookmarks in R Shiny
R: Replace All Values in a Dataframe Lower Than a Threshold with Na
Aggregation Using Ffdfdply Function in R
Loop Over Rows of Dataframe Applying Function with If-Statement
Get Selected Row from Datatable in Shiny App
Convert Ggplot Object to Plotly in Shiny Application
Stl Decomposition of Time Series with Missing Values for Anomaly Detection