Get Continent Name from Country Name in R

Get continent name from country name in R

You can use the countrycode package for this task.

library(countrycode)
# Sample data: five real country names plus one that cannot be matched.
df <- data.frame(
  country = c("Afghanistan", "Algeria", "USA", "France", "New Zealand",
              "Fantasyland")
)

# Translate each country name to its continent and store it as a new column.
df[["continent"]] <- countrycode(
  sourcevar = df[["country"]],
  origin = "country.name",
  destination = "continent"
)
# countrycode() emits a warning for this input:
#   Some values were not matched unambiguously: Fantasyland

Result

# Print the result. "Fantasyland" was reported as unmatched, so its continent
# is NA.
df
#      country continent
#1 Afghanistan      Asia
#2     Algeria    Africa
#3         USA  Americas
#4      France    Europe
#5 New Zealand   Oceania
#6 Fantasyland      <NA>

Is there a package/function in R that will identify country/continent?

Ok, both are possible:

library(dplyr)
library(countrycode)

# Raw strings in which the country is the last comma-separated field.
df <- c("random,thing,thing, United States",
        "site, level, state, information, Sweden")

# Keep only the text after the last ", ": the greedy ".*" consumes everything
# up to and including the final comma-plus-space.
# tibble() replaces the deprecated data_frame() and is re-exported by dplyr.
df2 <- tibble(Country = sub(".*\\, ", "", df))

# Map each extracted country name to its continent.
df2$Continent <- countrycode(sourcevar = df2$Country,
                             origin = "country.name",
                             destination = "continent")
df2

countrycode in R - NA's returned when detecting country name in df

Kindly let me know if this is what you were anticipating.

# Add a column derived from the `country` column via countrycode().
# NOTE(review): the new column is named `continent` but the requested
# destination is "region" -- confirm which of the two is intended.
country %>%
  mutate(
    continent = countrycode(
      sourcevar = country,
      origin = "country.name",
      destination = "region"
    )
  )

Fast way to parse vector of continent / country / city in R

Consider using read.table from base R

# Parse each element of `x` as one "/"-separated row. Short rows are padded
# (fill), surrounding blanks are stripped, and empty fields are read as NA.
read.table(
  text = x,
  sep = "/",
  header = FALSE,
  fill = TRUE,
  strip.white = TRUE,
  na.strings = ""
)
      V1    V2      V3
1 Africa Kenya Nairobi
2 Africa Kenya Nairobi
3 Africa Kenya    <NA>

Or using fread from data.table

library(data.table)
# Collapse the vector into one newline-separated string so that fread() parses
# each element as a row. header = FALSE stops fread() from promoting the first
# element to column names, which would silently drop one data row.
fread(text = paste(x, collapse = "\n"), sep = "/", header = FALSE,
      fill = TRUE, na.strings = "")
#        V1    V2      V3
# 1: Africa Kenya Nairobi
# 2: Africa Kenya Nairobi
# 3: Africa Kenya    <NA>

Benchmarks

x <- rep("Africa / Kenya / Nairobi", 1000000)
> 
> system.time(fread(text = paste(x, collapse="\n"), sep="/", fill = TRUE, na.strings = ""))
   user  system elapsed 
  0.473   0.024   0.496 

> system.time(read.table(text = x, sep = "/", header = FALSE,
+       fill = TRUE, strip.white = TRUE, na.strings = ""))
   user  system elapsed 
  0.519   0.026   0.543 

> system.time({  #Using data.table
+   y <- do.call(cbind, data.table::tstrsplit(x, "/", TRUE))
+   y <- trimws(y, whitespace = " ")
+ })
   user  system elapsed 
  2.035   0.051   2.067

data

# Sample input: "/"-separated continent / country / city strings, with and
# without blanks around the separator, and one entry lacking a city.
x <- c(
  "Africa / Kenya / Nairobi",
  "Africa/Kenya/Nairobi",
  "Africa / Kenya"
)

Is there a function in R that lets me create a new column with (1) the full country names and (2) the respective continent?

Are you looking for something like this?

library(countrycode)

# Build a lookup table with one row per ISO3 code in `mycodes`: the code
# itself, the country name and the continent, both obtained from countrycode().
df <- data.frame(
  country_code = mycodes,
  country_name = countrycode(
    mycodes,
    origin = "iso3c",
    destination = "country.name"
  ),
  continent = countrycode(
    mycodes,
    origin = "iso3c",
    destination = "continent"
  )
)

# Show the first rows of the result.
head(df)

Note that countrycode() cannot unambiguously match every "iso3c" country code in this vector. The warnings below list the codes that were not matched:

#> Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: TMP
#> Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: ATA, ATF, TMP, UMI

Btw, I see no need to use {dplyr} here.

Data

# ISO 3166-1 alpha-3 country codes used as input for the lookup.
# Egypt's code is written in upper case ("EGY", previously mistyped as "EGy")
# so that every entry is a well-formed three-letter code.
# TMP, ATA, ATF and UMI are the entries reported as not matched unambiguously
# in the warnings shown with the example.
mycodes <- c("PRT", "FRA", "JPN", "IRL", "ESP", "BEL", "AUT", "DEU", "ITA", "CHN", "RUS", "POL", "USA", "CRI", "CHE", "ROU", "GBR",
             "BRA", "FIN", "NLD", "CAN", "ZAF", "AUS", "AGO", "BGR", "SWE", "CYP", "ARG", "ARM", "CHL", "MOZ", "KOR", "TUN", "DNK", "GRC",
             "NOR", "ISR", "MYS", "EGY", "JOR", "LUX", "TUR", "IRN", "LBY", "PAN", "COL", "VEN", "DZA", "GNB", "MAR", "CZE", "SVN", "IND",
             "HUN", "NZL", "PER", "LTU", "TWN", "SRB", "EST", "KAZ", "KWT", "IDN", "UKR", "MEX", "SVK", "SAU", "ARE", "BGD", "THA", "TZA",
             "LVA", "PHL", "BIH", "BHR", "NAM", "BOL", "HRV", "SGP", "CMR", "MLT", "URY", "PAK", "JAM", "ECU", "SYC", "QAT", "PRY", "BRB",
             "OMN", "TMP", "ABW", "LBN", "SLV", "DMA", "CUB", "VNM", "GEO", "IRQ", "PYF", "UGA", "LIE", "SEN", "BLR", "ISL", "DOM",
             "GUY", "LCA", "CPV", "ATA", "GAB", "NGA", "RWA", "CIV", "ALB", "MKD", "MNE", "GTM", "GHA", "MDV", "MCO", "MUS", "TGO", "LKA",
             "AZE", "SUR", "KEN", "MRT", "HKG", "SYR", "CAF", "NCL", "UZB", "KIR", "SDN", "PRI", "ATF", "KNA", "TJK", "SLE", "LAO", "COM",
             "ETH", "FRO", "AND", "BEN", "ZWE", "ASM", "MLI", "BWA", "AIA", "COD", "SPM", "JEY", "MDG", "NIC", "SWZ", "CYM", "SOM", "ATG",
             "KGZ", "FLK", "GIB", "SMR", "TKM", "HTI", "UMI", "MMR", "WSM", "VIR", "ERI", "WLF", "GUF", "MWI", "PCN", "TCD")

How can I change some of the names of variables within columns?

With case_when() you can easily extend this with more conditions:

library(dplyr)
# Recode the Continent column for specific countries; rows whose country is
# not listed keep their existing Continent value (the TRUE fallback).
# "Oceania" is the correct spelling (previously mistyped as "Oceana").
df %>%
  mutate(Continent = case_when(Country == "Afghanistan" ~ "Asia",
                               Country == "Australia" ~ "Oceania",
                               TRUE ~ Continent))

#       Country Year Continent Life_Expectancy
# 1 Afghanistan 2010      Asia        61.17996
# 2 Afghanistan 2011      Asia        61.72234
# 3 Afghanistan 2012      Asia        62.20652
# 4   Australia 2012   Oceania        43.22200

data:

# Example data for the recoding: four country-year rows whose Continent column
# holds the labels that are to be replaced.
df <- structure(
  list(
    Country = c("Afghanistan", "Afghanistan", "Afghanistan", "Australia"),
    Year = c(2010L, 2011L, 2012L, 2012L),
    Continent = c("Eastern Mediterranean", "Eastern Mediterranean",
                  "Eastern Mediterranean", "Western pacific"),
    Life_Expectancy = c(61.17996, 61.72234, 62.20652, 43.222)
  ),
  class = "data.frame",
  row.names = c("1", "2", "3", "4")
)