How to Read Data in Utf-8 Format in R

how to read data in utf-8 format in R?

OS: Windows-7 (64-bit)

R Version:

package_version(R.version)

[1] ‘3.0.2’

Change your locale from "chinese" to "English_United States.1252"

  Sys.setlocale(category="LC_ALL", locale = "English_United States.1252")

Sys.getlocale(category="LC_ALL")

[1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"

Read in data with chinese encoding

 df_ch <- read.table("test.utf8",
sep=",",
header=FALSE,
encoding="chinese",
stringsAsFactors=FALSE
)

Read in data with UTF-8 encoding

 df_utf8 <- read.table("test.utf8",
sep=",",
header=FALSE,
encoding="UTF-8",
stringsAsFactors=FALSE
)

In RStudio Version 0.98.501

 df_ch$V1[1]

[1] "乘客姓å"

df_utf8$V1[1]

[2] "乘客姓名"

df_utf8$V1

[1] "乘客姓名" "HuangTianhui" "姜翠云" "李红晶" "LuiChing" "宋飞飞"
[7] "唐旭东" "YangJiabao" "买买提江·阿布拉" "安文兰" "鲍媛华" "边亮京"
[13] "边茂勤" "曹蕊" "车俊章" "陈长军" "陈建设" "陈昀"
[19] "戴淑玲" "丁立军" "丁莹" "丁颖" "董国伟" "杜文忠"
[25] "冯栋" "冯纪新" "付宝峰" "甘福祥" "甘涛" "高歌"
[31] "管文杰" "韩静" "侯爱琴" "侯波" "胡偲婠(婴儿)" "胡效宁"

Display unicode data for a row from a data frame

  df_utf8[1,]
V1 V2 V3
1 <U+FEFF><U+4E58><U+5BA2><U+59D3><U+540D> <U+6027><U+522B> <U+51FA><U+751F><U+65E5><U+671F>

Display chinese data for a row from a data frame

as.character(df_utf8[1,])

[1] "乘客姓名" "性别" "出生日期"

as.character(df_utf8[2,])

[1] "HuangTianhui" "男" "1948/05/28"

Displaying multiple columns of data with international characters can be performed by converting the data frame into list and by forcing the data into character format.

  df_utf8_ch <- lapply(df_utf8, as.character)

df_utf8_ch

$V1
1 "乘客姓名" "HuangTianhui" "姜翠云" "李红晶" "LuiChing" "宋飞飞"

7 "唐旭东" "YangJiabao" "买买提江·阿布拉" "安文兰" "鲍媛华" "边亮京"

[13] "边茂勤" "曹蕊" "车俊章" "陈长军" "陈建设" "陈昀"

[19] "戴淑玲" "丁立军" "丁莹" "丁颖" "董国伟" "杜文忠"

[25] "冯栋" "冯纪新" "付宝峰" "甘福祥" "甘涛" "高歌"

[31] "管文杰" "韩静" "侯爱琴" "侯波" "胡偲婠(婴儿)" "胡效宁"

[37] "黄毅" "姜学仁" "姜颖" "焦微微" "焦文学" "鞠坤"

[43] "康旭" "黎明中" "李国辉" "李洁" "李乐" "李文博"

[49] "李燕" "李宇辰" "李志锦" "李志欣" "李智" "栗延林"

[55] "梁路阳" "梁旭阳" "林安南" "林明峰" "刘凤英" "刘金鹏"

[61] "刘强" "刘如生" "刘顺超" "柳忠福" "楼宝棠" "卢先初"

[67] "鹿建华" "罗伟" "马骏" "马文芝" "毛土贵" "么立飞"

[73] "蒙高生" "孟兵" "孟凡余" "欧阳欣" "石贤文" "宋春玲"

[79] "宋坤" "苏强国" "汤雪竹" "田军伟" "田清君" "汪厚彬"

[85] "王春勇" "王纯华" "王丹" "王海涛" "王利军" "王林诗"

[91] "王墨恒(婴儿)" "王守宪" "王淑敏" "王献军" "王永刚"

$V2
1 "性别" "男" "女" "女" "女" "男" "男" "女" "男" "女" "女" "男" "女" "女" "女" "男"

[17] "男" "女" "女" "男" "女" "女" "男" "男" "男" "男" "男" "男" "男" "女" "男" "女"

[33] "女" "男" "女" "男" "女" "男" "女" "女" "男" "男" "男" "男" "男" "女" "男" "女"

[49] "女" "男" "男" "男" "男" "男" "男" "男" "男" "男" "女" "男" "男" "男" "男" "男"

[65] "男" "男" "男" "男" "男" "女" "男" "男" "男" "男" "男" "女" "男"

$V3
1 "出生日期" "1948/05/28" "1952/03/27" "1994/12/09" "1969/08/02" "1982/03/01" "1983/08/03" "1988/08/25"
[9] "1979/07/10" "1949/10/20" "1951/10/21" "1987/06/06" "1947/07/19" "1982/02/19" "1946/03/20" "1979/06/06"
[17] "1956/03/07" "1957/08/11" "1956/12/07" "1971/04/06" "1952/04/25" "1986/10/24" "1966/10/26" "1964/06/07"
[25] "1993/03/09" "1944/01/06" "1986/12/06" "1965/11/21" "1970/01/29" "1987/11/16" "1979/10/03" "1961/05/28"
[33] "1969/06/24" "1979/05/15" "2011/02/25" "1980/01/01" "1984/06/18" "有待确认" "1987/04/13" "1983/05/09"
[41] "1956/12/17" "1982/11/07" "1980/08/09" "1945/12/19" "1958/05/18" "1987/02/06" "1982/12/03" "1985/07/16"
[49] "1983/07/19" "1987/11/06" "1984/04/14" "1979/05/22" "1973/05/05" "1985/10/26" "1954/03/26" "1984/11/12"
[57] "1987/03/27" "1980/05/25" "1949/05/10" "1981/12/26" "1974/08/13" "1938/01/22" "1968/02/29" "1942/05/22"
[65] "1935/04/21" "1981/10/14" "1957/03/28" "1985/08/20" "1981/12/25" "1957/08/01" "1942/08/02" "1983/06/15"
[73] "1950/01/01" "1974/04/26" "1944/08/23" "1976/10/12" "1988/01/18" "1954/04/06"

 View(df_ch)

chinese encoding

 View(df_utf8)

Sample Image

In RGui (64-bit)

Sample Image

View(df_ch)

Sample Image

View(df_utf8)

Sample Image

The good thing is you have all data in utf8 format to be used for further data analysis.

Once your analysis is done, you may change the locale back to "chinese"

  Sys.setlocale(category="LC_ALL", locale = "chinese")

Sys.getlocale(category="LC_ALL")

[1] "LC_COLLATE=Chinese (Simplified)_People's Republic of China.936;LC_CTYPE=Chinese (Simplified)_People's Republic of China.936;LC_MONETARY=Chinese (Simplified)_People's Republic of China.936;LC_NUMERIC=C;LC_TIME=Chinese (Simplified)_People's Republic of China.936"

Some functions you may need to explore for converting between character string encodings.

Encoding()

iconv()

HTH

Import csv file into R data frame with UTF-8 encoding

You can use read.csv() function with the same attributes you used with read.table. Except fileEncoding - in read.csv() you should write just encoding = "UTF-8".

Also Duck's answer is suitable, too.

How to read utf-8 encoded text in R

I don't have access to your data so I cannot check:

data <- xlsx::read.xlsx("file.xlsx", sheetIndex = 1, encoding="UTF-8")

Importing a .csv into R with UTF-8 encoding error?

You need to specify the encoding

servutf <- read.csv("servutf.csv", sep=";", encoding = "UTF-8")

unable to read csv file saved with encoding UTF-8-SIG

As the input is already encoded as UTF-8, you should use the encoding argument to read the file as-is. Using fileEncoding will attempt to re-encode the file.

mydata <- read.csv(<path_to_file>, encoding="UTF-8", header=FALSE)


Related Topics



Leave a reply



Submit