R - Scaling Numeric Values Only in a Dataframe with Mixed Types

Selective scaling function in R using a different data frame to scale

One way with base R. Comments in the code. Thanks, Nelson, for the data +1

## Data to be scaled. The header line names five columns while every data row
## has six fields, so read.table() takes the first field as the row names.
df <- read.table(text = "color weight height length estimate
    1    red     10     66     40        5
    2    red     12     60     41        7
    3 yellow     12     67     48        9
    4   blue     15     55     36       10
    5 yellow     21     54     48        7
    6    red     12     54     43        5
    7    red     11     38     36        6", header = TRUE)

## Reference data: the mean and standard deviation of its columns are what the
## scaling below is based on. As above, the first field of each row becomes
## the row name because the header has one field fewer than the data rows.
scale_df <- read.table(text = " color weight height length estimate
    1    red     11     55     41        7
    2    red     13     67     39        9
    3 yellow     12     67     46       11
    4   blue     16      8     37        5
    5 yellow     23     10     47        9
    6    red     17     11     41       10
    7    red     16     13     37       13", header = TRUE)

## Scale one column of `ref` using the statistics of the same column in
## `scale_by`.
##   ref      - data frame holding the values to be scaled
##   scale_by - data frame supplying the mean and standard deviation
##   variable - name of the column to scale (must exist in both data frames)
## Returns ref[[variable]] centred on the mean of scale_by[[variable]] and
## divided by twice its standard deviation. Missing values in the `scale_by`
## column are ignored when computing the mean and the standard deviation.
scale2sd <- function(ref, scale_by, variable) {
  baseline <- scale_by[[variable]]
  centre <- mean(baseline, na.rm = TRUE)
  spread <- 2 * sd(baseline, na.rm = TRUE)
  (ref[[variable]] - centre) / spread
}
## Candidate columns; only the numeric ones among them are scaled
predictors <- c("color", "weight", "height", "length")
## keep just the predictor columns that hold numeric data
df_to_scale <- df[predictors][vapply(df[predictors], is.numeric, logical(1))]
## Character vector of the numeric column names, named by itself, so that
## lapply() returns a list whose names match the columns being replaced
num_vars <- names(df_to_scale)
names(num_vars) <- num_vars

## Scale every selected column of df against the same column of scale_df
## and write the results back over the original columns
df[num_vars] <- lapply(
  num_vars,
  function(col) scale2sd(ref = df, scale_by = scale_df, variable = col)
)

df
#>    color      weight     height      length estimate
#> 1    red -0.67259271 0.58130793 -0.14222363        5
#> 2    red -0.42479540 0.47561558 -0.01777795        7
#> 3 yellow -0.42479540 0.59892332  0.85334176        9
#> 4   blue -0.05309942 0.38753862 -0.64000632       10
#> 5 yellow  0.69029252 0.36992323  0.85334176        7
#> 6    red -0.42479540 0.36992323  0.23111339        5
#> 7    red -0.54869405 0.08807696 -0.64000632        6

Scaling only numeric values in a data frame that contains strings

Convert the non-numeric values to missing values, scale the result with an alternative (pure pandas) solution, and finally replace the missing values with the original ones:

print (df)
   subject_id  hour_measure urinecolor blood pressure
0           3          1.00        red             40
1           3          1.15        red           high
2           4          2.00     yellow            low
3           5          5.00     yellow            100

# Move subject_id into the index so it is left out of the scaling.
df = df.set_index('subject_id')

# Coerce every column to numbers; entries that cannot be parsed become NaN.
df1 = df.apply(pd.to_numeric, errors='coerce')
# Min-max scale each column using its own minimum and maximum.
col_min = df1.min()
df2 = (df1 - col_min) / (df1.max() - col_min)

# Fill the NaN cells of the scaled frame with the values from the original.
df = df2.combine_first(df)
print(df)
            hour_measure urinecolor blood pressure
subject_id                                        
3                 0.0000        red              0
3                 0.0375        red           high
4                 0.2500     yellow            low
5                 1.0000     yellow              1

First solution:

I suggest replacing the text values with numbers using a dictionary, like:

# Numeric stand-ins for the text labels in the 'blood pressure' column.
dbp = {'high': 150, 'low': 60}

# Swap every value that appears as a key in dbp for its number.
df['blood pressure'] = df['blood pressure'].replace(dbp)

All together:

# If the subject_id values are numeric, move them into the index
# (presumably so they are not picked up as a numeric column later - confirm).
df = df.set_index('subject_id')

# Numeric stand-ins for the text labels in the 'blood pressure' column.
dbp = {'high': 150, 'low': 60}
# Replace the labels with numbers, then convert the whole column to integers.
df['blood pressure'] = df['blood pressure'].replace(dbp).astype(int)

print (df)
            hour_measure urinecolor  blood pressure
subject_id                                         
3                   1.00        red              40
3                   1.15        red             150
4                   2.00     yellow              60

print (df.dtypes)
hour_measure      float64
urinecolor         object
blood pressure      int32
dtype: object

# numpy is needed for np.number, used below to pick the numeric columns.
import numpy as np
from sklearn import preprocessing

# Min-max scaler that maps each fitted column onto the range [0, 1].
scaler = preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1))
# Select only the numeric columns; the scaled result printed below is a plain
# array, so the column names and index of df are not carried over.
X = scaler.fit_transform(df.select_dtypes(np.number))
print(X)
[[0.         0.        ]
 [0.15       1.        ]
 [1.         0.18181818]]

Detail:

print (df.select_dtypes(np.number))
            hour_measure  blood pressure
subject_id                              
3                   1.00              40
3                   1.15             150
4                   2.00              60

R Standardizing numeric variables in dataframe while retaining factor variables

You need to use rapply instead of sapply

set.seed(1)
> df=data.frame(A=rnorm(10),b=1:10,C=as.factor(rep(1:2,5)))
> str(df)
'data.frame':   10 obs. of  3 variables:
 $ A: num  -0.626 0.184 -0.836 1.595 0.33 ...
 $ b: int  1 2 3 4 5 6 7 8 9 10
 $ C: Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 2

The code you need to use:

> D <- rapply(df, f = scale, classes = c("numeric", "integer"), how = "replace")
> D
             A          b C
1  -0.97190653 -1.4863011 1
2   0.06589991 -1.1560120 2
3  -1.23987805 -0.8257228 1
4   1.87433300 -0.4954337 2
5   0.25276523 -0.1651446 1
6  -1.22045645  0.1651446 2
7   0.45507643  0.4954337 1
8   0.77649606  0.8257228 2
9   0.56826358  1.1560120 1
10 -0.56059319  1.4863011 2
> str(D)
'data.frame':   10 obs. of  3 variables:
 $ A: num [1:10, 1] -0.9719 0.0659 -1.2399 1.8743 0.2528 ...
  ..- attr(*, "scaled:center")= num 0.132
  ..- attr(*, "scaled:scale")= num 0.781
 $ b: num [1:10, 1] -1.486 -1.156 -0.826 -0.495 -0.165 ...
  ..- attr(*, "scaled:center")= num 5.5
  ..- attr(*, "scaled:scale")= num 3.03
 $ C: Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 2
>

How to scale segments of a column in an R data frame?

Apply the same function (scale) by group.

In base R

## Standardise x separately within each level of y
df$z <- ave(df$x, df$y, FUN = scale)
df

#    x y        z
#1   1 A -1.26491
#2   2 A -0.63246
#3   3 A  0.00000
#4   4 A  0.63246
#5   5 A  1.26491
#6  20 B -1.33242
#7  22 B -0.59219
#8  24 B  0.14805
#9  25 B  0.51816
#10 27 B  1.25840
#11 12 C -0.83028
#12 13 C -0.36901
#13 12 C -0.83028
#14 15 C  0.55352
#15 17 C  1.47605

Using dplyr

library(dplyr)
# scale() returns a one-column matrix; as.numeric() turns it into a plain
# numeric vector so z is an ordinary column rather than a matrix column.
# ungroup() drops the grouping afterwards so later verbs act on the whole table.
df %>%
  group_by(y) %>%
  mutate(z = as.numeric(scale(x))) %>%
  ungroup()

Or data.table

library(data.table)
# Convert df to a data.table by reference, then add z as x scaled within
# each group of y.
setDT(df)[, z := scale(x), by = y]

Standardize data columns in R

I have to assume you meant to say that you wanted a mean of 0 and a standard deviation of 1. If your data is in a dataframe and all the columns are numeric you can simply call the scale function on the data to do what you want.

# Two numeric columns with different locations and spreads
dat <- data.frame(
  x = rnorm(10, mean = 30, sd = 0.2),
  y = runif(10, min = 3, max = 5)
)
# scale() centres and scales every column of the data frame
scaled.dat <- scale(dat)

# Verify the result: column means should be 0 and column sds should be 1
colMeans(scaled.dat)  # faster than apply(scaled.dat, 2, mean)
apply(scaled.dat, MARGIN = 2, FUN = sd)

Using built in functions is classy. Like this cat:

Sample Image

Scaling a numeric matrix in R with values 0 to 1

Try the following, which seems simple enough:

## Small random 3 x 3 matrix as a minimal reproducible example
m <- matrix(rnorm(9), ncol = 3)

## Map each column onto [0, 1]: subtract its minimum, divide by its range
apply(m, MARGIN = 2, FUN = function(column) {
  limits <- range(column)
  (column - limits[1]) / (limits[2] - limits[1])
})
#           [,1]      [,2]      [,3]
# [1,] 0.0000000 0.0000000 0.5220198
# [2,] 0.6239273 1.0000000 0.0000000
# [3,] 1.0000000 0.9253893 1.0000000