Existing Function to Combine Standard Deviations in R

Is there an existing function to combine standard deviations in R?

Are the populations non-overlapping?

library(fishmethods)
combinevar

For instance, the example in Wikipedia would work like this:

xbar <- c(70,65)
s<-c(3,2)
n <- c(1,1)
combinevar(xbar,s,n)

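and the standard deviation would be sqrt(combinevar(xbar,s,n)[2]) (note that the second argument of combinevar, s_squared, must hold variances, not standard deviations).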

If you don't want to download the library, the function goes like this:

# Pool per-group summaries into one grand mean and one combined variance.
#
# Arguments:
#   xbar       group means
#   s_squared  group variances (not standard deviations)
#   n          group sizes
#
# Returns c(grand mean, combined variance); the combined variance is
# divided by sum(n) - 1. Stops when the three vectors differ in length.
combinevar <-
function (xbar = NULL, s_squared = NULL, n = NULL)
{
  # All three inputs must describe the same number of groups.
  sizes <- c(length(xbar), length(s_squared), length(n))
  if (length(unique(sizes)) != 1L) {
    stop("Vector lengths are different.")
  }

  # Raw (uncentred) sum of squares rebuilt from each group's variance and mean.
  raw_ss <- sum((n - 1) * s_squared + n * xbar^2)

  pooled_mean <- sum(n * xbar)/sum(n)

  # Centre the raw sum of squares on the pooled mean, then scale.
  pooled_var <- (raw_ss - sum(n) * pooled_mean^2)/(sum(n) - 1)

  c(pooled_mean, pooled_var)
}

Standard deviation of combined data

Rudmin (2010) states that the exact variance of a pooled data set is the mean of the variances plus the variance of the means. flodel has already provided an answer and a function that gives values similar to Rudmin's statement. Using Rudmin's data set and flodel's function based on Wikipedia:

df <- data.frame(mean = c(30.66667, 31.14286, 40.33333), variance = c(8.555555, 13.26531, 1.555555), n = c(6,7,3))

# Combined standard deviation of several groups from their summaries.
#   S  group standard deviations
#   M  group means
#   N  group sizes, used as weights
# Computes sqrt(weighted mean of (S^2 + M^2) - (weighted mean of M)^2).
grand.sd <- function(S, M, N) {
  mean_of_squares <- weighted.mean(S^2 + M^2, N)
  pooled_mean <- weighted.mean(M, N)
  sqrt(mean_of_squares - pooled_mean^2)
}

grand.sd(sqrt(df$variance), df$mean, df$n)^2

#[1] 22.83983 = Dp variance in Rudmin (2010).

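However, this solution gives slightly different values compared to formula 5.38 from Headrick (2010) (unless there is a mistake somewhere). The difference appears to come from the divisor: grand.sd weights by n, whereas Headrick's formula combines the variances with n - 1 divisors: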

dat <- data.frame(variable = c(rep("x", 2), rep("y", 3)), replicate = c(1,2,1,2,3),
mean = c(3.4, 2.5, 6.5, 5.7, 5.1), sd = c(1.2, 0.7, 2.4, 4.0, 3.5),
n = c(3,3,5,4,6))

x <- subset(dat, variable == "x")

# Formula 5.38 from Headrick (2010), written out term by term for the two
# replicates of "x". Writing n1, n2 for x$n, s1, s2 for x$sd and m1, m2 for
# x$mean, the numerator below is the expansion of
#   (n1 + n2) * ((n1 - 1) * s1^2 + (n2 - 1) * s2^2) + n1 * n2 * (m1 - m2)^2
# and the denominator is (n1 + n2 - 1) * (n1 + n2).
((x$n[1]^2)*(x$sd[1]^2)+
(x$n[2]^2)*(x$sd[2]^2)-
(x$n[2])*(x$sd[1]^2) -
(x$n[2])*(x$sd[2]^2) -
(x$n[1])*(x$sd[1]^2) -
(x$n[1])*(x$sd[2]^2) +
(x$n[1])*(x$n[2])*(x$sd[1]^2) +
(x$n[1])*(x$n[2])*(x$sd[2]^2) +
(x$n[1])*(x$n[2])*(x$mean[1] - x$mean[2])^2)/
((x$n[1] + x$n[2] - 1)*(x$n[1] + x$n[2]))

#[1] 1.015

grand.sd(x$sd, x$mean, x$n)^2

#[1] 1.1675

To answer my own question, the desired data.frame can be obtained as follows:

library(plyr)
ddply(dat, c("variable"), function(dat) c(mean=with(dat,weighted.mean(mean, n)), sd = with(dat, grand.sd(sd, mean, n))))

variable mean sd
1 x 2.950000 1.080509
2 y 5.726667 3.382793

R: Calculate Standard Deviations by Group with a Moving Window

You can use the rolling functions from the zoo package:

library(dplyr)

# Standard deviation of `return` over a rolling window of 3 observations,
# computed separately for each company. fill = NA pads the positions where
# a full window of 3 is not yet available.
stock %>%
  group_by(company) %>%
  mutate(
    std_3obs = zoo::rollapplyr(return, 3, sd, fill = NA)
  )

# company return std_3obs
# <chr> <dbl> <dbl>
# 1 1 0.01 NA
# 2 1 0.015 NA
# 3 1 -0.01 0.0132
# 4 1 0.02 0.0161
# 5 1 0.023 0.0182
# 6 2 -0.04 NA
# 7 2 -0.02 NA
# 8 2 -0.01 0.0153
# 9 2 0.05 0.0379
#10 2 0.06 0.0379
#11 2 0.03 0.0153
#12 2 -0.09 0.0794
#13 3 0.2 NA
#14 3 0.3 NA
#15 3 -0.04 0.175
#16 3 -0.02 0.191
#17 4 -0.01 NA
#18 4 0.023 NA
#19 4 -0.04 0.0315

Create a table from dataframe column values mean and standard deviation in R

Another option uses the older function tidyr::gather to reshape df1 into long format, so that the sample IDs end up in a single column:

library(tidyverse)

# Stack the columns of df1 into ID/val pairs, then attach the Disease label
# of each ID by joining against df.
df2_spread <- df1 %>%
  tidyr::gather(ID, val) %>%
  left_join(df, by = 'ID')

# Count, mean, sd, min and max of val for every Disease/gene combination;
# .groups = "drop" returns an ungrouped result.
result_1 <- df2_spread %>%
  group_by(Disease, gene = ID) %>%
  summarise(
    n = n(),
    mean = mean(val),
    sd = sd(val),
    min = min(val),
    max = max(val),
    .groups = "drop"
  )
# A tibble: 9 × 7
Disease gene n mean sd min max
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 AML GSM239328 16 4.91 2.15 2.13 9.30
2 AML GSM239329 16 4.95 2.13 2.27 9.53
3 AML GSM239331 16 4.88 1.96 2.26 8.29
4 Control GSM239170 16 4.56 1.91 2.20 7.98
5 Control GSM239323 16 5.04 1.98 2.33 8.82
6 Control GSM239324 16 4.93 2.45 2.13 11.4
7 Control GSM239326 16 4.97 2.34 2.18 10.7
8 Control GSM239332 16 4.97 2.16 2.25 9.77
9 Control GSM239333 16 5.01 2.14 2.29 9.77

In any case I'm not able to find a way to calculate Fold_change for each gene since there seems to be only one disease by gene here.

Here is the data:


df <- tibble::tribble(
~ID, ~Disease,
"GSM239170", "Control",
"GSM239323", "Control",
"GSM239324", "Control",
"GSM239326", "Control",
"GSM239328", "AML",
"GSM239329", "AML",
"GSM239331", "AML",
"GSM239332", "Control",
"GSM239333", "Control"
)

df1 <- tibble::tribble(
~GSM239170, ~GSM239323, ~GSM239324, ~GSM239326, ~GSM239328, ~GSM239329, ~GSM239331, ~GSM239332, ~GSM239333,
3.016704177, 3.285669072, 2.929482692, 2.922820483, 3.15950317, 3.163327169, 2.985901308, 3.122708843, 3.070948463,
7.977735461, 6.532514237, 6.388007183, 6.466679556, 6.432795021, 6.407321524, 6.426470803, 6.376394357, 6.469070308,
4.207280707, 4.994965767, 4.40159671, 4.747114589, 4.830045513, 4.213762092, 4.884418365, 4.4318876, 4.849665444,
7.25609471, 7.420807337, 6.999340125, 7.094488581, 7.024332721, 7.17928981, 7.159898654, 7.009977785, 6.830979234,
2.204955099, 2.331625217, 2.133305231, 2.18332885, 2.12778313, 2.269697813, 2.264705552, 2.253940441, 2.287924323,
7.28437278, 6.983593721, 6.86337111, 6.865970678, 7.219840938, 7.181113053, 7.392230178, 7.484052914, 7.52498281,
4.265792764, 4.970684112, 4.595545125, 4.575545289, 4.547957809, 4.68215122, 4.674495889, 4.675841709, 4.643311767,
2.6943516, 2.916324936, 2.578130269, 2.659717988, 2.567436676, 2.8095128, 2.790110381, 2.795882913, 2.884588792,
3.646303109, 8.817891552, 11.4248793, 10.74738082, 9.296043108, 9.53150669, 8.285160496, 9.769919327, 9.774610531,
3.040292001, 3.38486713, 2.958851115, 3.047880699, 2.878562717, 3.209319974, 3.20260379, 3.195993624, 3.3004227,
2.357625231, 2.444753172, 2.340767158, 2.32143889, 2.282608342, 2.401218719, 2.385568421, 2.375334953, 2.432634747,
5.378494673, 6.065038394, 5.134842087, 5.367342376, 5.682051149, 5.712072512, 5.57179966, 5.72082395, 5.656674512,
2.833814735, 3.038434511, 2.837711812, 2.859800224, 2.866040813, 2.969167906, 2.929449968, 2.963530689, 2.931065261,
6.192932281, 6.478439634, 6.180169144, 6.151689376, 6.238949956, 6.708196123, 6.441437631, 6.448280595, 6.413562269,
4.543042482, 4.786227217, 4.445131477, 4.51471011, 4.491645167, 4.460114204, 4.602482637, 4.587221948, 4.623125028,
6.069437462, 6.232738284, 6.74644117, 7.04995802, 6.938928532, 6.348253102, 6.080950712, 6.324619355, 6.472893789
)

How to calculate standard deviation every 3 columns in a dataframe?

Here is an idea via base R. We split the data frame every 3 columns and create a list. We then loop over that list and calculate the rowwise standard deviation, i.e.

# Row-wise standard deviation for each consecutive group of three columns.
# The grouping vector 1, 1, 1, 2, 2, 2, ... makes split.default() cut the
# data frame into three-column pieces; sd is then applied to every row of
# each piece, ignoring NA values.
sapply(
  split.default(df, rep(seq((ncol(df) / 3)), each = 3)),
  function(cols) apply(cols, 1, sd, na.rm = TRUE)
)

Standard deviation showing up as NA in dplyr chain

The call to sd() is referencing the in-place mutation (summarization) of pd_sent_amount. Give the summarized column a new name.

# Mean, count and standard deviation of pd_sent_amount for each
# politics / partner_politics combination. The summaries are stored under
# new names so that sd() still reads the original pd_sent_amount column
# instead of a value that summarize() has just overwritten.
data_control %>%
  group_by(politics, partner_politics) %>%
  summarize(
    pd_sent_amount_mean = mean(as.numeric(pd_sent_amount)),
    n = n(),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    pd_sent_amount_sd = sd(as.numeric(pd_sent_amount), na.rm = TRUE)
  )

The fourth example given here on the dplyr website mentions that "newly created summaries immediately overwrite existing variables", and the example is actually the same case as yours with sequential calls to mean() and sd().



Related Topics



Leave a reply



Submit