Get a List of the Data Sets in a Particular Package

How do I get a list of built-in data sets in R?

There are several ways to find the included datasets in R:

1: Using data() will give you a list of the datasets of all loaded packages (and not only the ones from the datasets package); the datasets are ordered by package

2: Using data(package = .packages(all.available = TRUE)) will give you a list of all datasets in the packages installed on your computer (i.e. including packages that are not loaded)

3: Using data(package = "packagename") will give you the datasets of that specific package; for example, data(package = "plyr") lists the datasets in the plyr package
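
The listing returned by these calls can also be captured for programmatic use rather than just displayed; a minimal sketch using only base R:

# dataset names shipped with one installed package
data(package = "MASS")$results[, "Item"]

# the same listing for every attached package, as a character vector
data()$results[, "Item"]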


If you want to know in which package a dataset is located (e.g. the acme dataset), you can do:

dat <- as.data.frame(data(package = .packages(all.available = TRUE))$results)
dat[dat$Item=="acme", c(1,3,4)]

which gives:

    Package Item                  Title
107    boot acme Monthly Excess Returns
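
The same lookup generalizes to any dataset name. A small helper sketch (find_data_package is just an illustrative name, not part of base R; it rescans every installed package, so it can take a moment):

# hypothetical helper: report which installed package(s) ship a given dataset
find_data_package <- function(dataset) {
  res <- as.data.frame(data(package = .packages(all.available = TRUE))$results)
  res[res$Item == dataset, c("Package", "Item", "Title")]
}

find_data_package("acme")   # same boot/acme row as above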

How to find out how many datasets each package contains in R?

Short answer:

nrow(data(package = "MASS")$results)
#[1] 87

Long answer: data(package = "MASS") returns an object of class packageIQR, which has a nice print method that lets you see all the available datasets. However, that object is actually a list, and its results component is a matrix with one row per dataset giving its location, name and title. Calling nrow on that matrix gives the number of datasets.

a <- data(package = "MASS")
str(a)
#List of 4
# $ title : chr "Data sets"
# $ header : NULL
# $ results: chr [1:87, 1:4] "MASS" "MASS" "MASS" "MASS" ...
# ..- attr(*, "dimnames")=List of 2
# .. ..$ : NULL
# .. ..$ : chr [1:4] "Package" "LibPath" "Item" "Title"
# $ footer : NULL
# - attr(*, "class")= chr "packageIQR"
head(a$results[,3:4])
#     Item       Title
#[1,] "Aids2"    "Australian AIDS Survival Data"
#[2,] "Animals"  "Brain and Body Weights for 28 Species"
#[3,] "Boston"   "Housing Values in Suburbs of Boston"
#[4,] "Cars93"   "Data from 93 Cars on Sale in the USA in 1993"
#[5,] "Cushings" "Diagnostic Tests on Patients with Cushing's Syndrome"
#[6,] "DDT"      "DDT in Kale"

nrow(a$results)
#[1] 87
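
The results matrix scales up to every installed package as well, so the same idea gives a per-package count of datasets. A sketch (the exact numbers depend on which packages you have installed):

# count the datasets shipped by each installed package
res <- data(package = .packages(all.available = TRUE))$results
head(sort(table(res[, "Package"]), decreasing = TRUE))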

Get a list of all the names of the objects in the datasets R package?

There is a note on the ?data help page that states

Where the datasets have a different name from the argument that should be used to retrieve them the index will have an entry like beaver1 (beavers) which tells us that dataset beaver1 can be retrieved by the call data(beavers).

So the actual object name is the part before the parentheses. Since the index entry is returned as a single string, the parenthesized part has to be stripped off yourself, which you can do with gsub():

datanames <- data(package = "datasets")$results[, "Item"]
objnames <- gsub("\\s+\\(.*\\)", "", datanames)

for(ds in objnames) {
  print(get(ds))
  cat("\n\n")
}
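
As a follow-up, the same cleaned-up names can be used to collect every dataset into a named list instead of printing each one. A sketch assuming the default search path, where the datasets package is attached:

# fetch each object from the attached datasets package into a named list
all_ds <- mget(objnames, envir = as.environment("package:datasets"),
               ifnotfound = list(NULL))

# quick overview: how many datasets of each (first) class
table(vapply(all_ds, function(x) class(x)[1], character(1)))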

Find which package a data set is included in

This is what the find() function is for.

> find("iris")
[1] "package:datasets"

> find("UScereal")
[1] "package:MASS"

If an object is in the search path, find() will tell you where it came from. See ?find for more information.
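
Note that find() only sees packages that are currently attached. A quick sketch of the difference, in a fresh session where MASS is installed but not yet loaded:

# before MASS is attached, find() cannot see its datasets
find("UScereal")
# character(0)

library(MASS)
find("UScereal")
# [1] "package:MASS"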

To get more information about a specific dataset, you can also use ?UScereal, which will work if UScereal is in the search path, or ??UScereal if it isn't but its parent package is installed.

To locate a dataset that isn't within an installed package, you can search for it on RDocumentation.org.

How can I find a dataset that has some specific attributes?

I've packaged a solution in a one-function GitHub package.

The full code is copied at the bottom, but the simplest way to use it is:

remotes::install_github("moodymudskipper/datasearch")
library(datasearch)

All data sets from package "dplyr"

dplyr_all <- datasearch("dplyr")

View(dplyr_all)


Datasets from package "datasets" restricted by condition

datasets_ncol5 <-
  datasearch("datasets", filter = ~is.data.frame(.) && ncol(.) == 5)

View(datasets_ncol5)


All datasets from all installed packages, no restriction


# may take a while, depending on what you have installed
all_datasets <- datasearch()

View(all_datasets)

# subsetting the output
my_subset <- subset(
  all_datasets,
  class1 == "data.frame" &
    grepl("treatment", names_collapsed) &
    nrow < 100
)

View(my_subset)




datasearch <- function(pkgs = NULL, filter = NULL){
  # make function silent
  w <- options()$warn
  options(warn = -1)
  search_ <- search()
  file_ <- tempfile()
  file_ <- file(file_, "w")
  on.exit({
    options(warn = w)
    to_detach <- setdiff(search(), search_)
    for(pkg in to_detach) eval(bquote(detach(.(pkg))))
    # note : we still have loaded namespaces, we could unload those that we didn't
    # have in the beginning but I'm worried about surprising effects, I think
    # the S3 method tables should be cleaned too, and maybe other things

    # note2 : tracing library and require didn't work
  })

  # convert formula to function
  if(inherits(filter, "formula")) {
    filter <- as.function(c(alist(.=), filter[[length(filter)]]))
  }

  ## by default fetch all available packages in .libPaths()
  if(is.null(pkgs)) pkgs <- .packages(all.available = TRUE)
  ## fetch all data sets description
  df <- as.data.frame(data(package = pkgs, verbose = FALSE)$results)
  names(df) <- tolower(names(df))
  item <- NULL # for cmd check note
  df <- transform(
    df,
    data_name = sub('.*\\((.*)\\)', '\\1', item),
    dataset = sub(' \\(.*', '', item),
    libpath = NULL,
    item = NULL
  )
  df <- df[order(df$package, df$data_name),]
  pkg_data_names <- aggregate(dataset ~ package + data_name, df, c)
  pkg_data_names <- pkg_data_names[order(pkg_data_names$package, pkg_data_names$data_name),]

  env <- new.env()
  n <- nrow(pkg_data_names)
  pb <- progress::progress_bar$new(
    format = "[:bar] :percent :pkg",
    total = n)
  row_dfs <- vector("list", n)
  for(i in seq(nrow(pkg_data_names))) {
    pkg <- pkg_data_names$package[i]
    data_name <- pkg_data_names$data_name[i]
    datasets <- pkg_data_names$dataset[[i]]
    pb$tick(tokens = list(pkg = format(pkg, width = 12)))

    sink(file_, type = "message")
    data(list = data_name, package = pkg, envir = env)
    row_dfs_i <- lapply(datasets, function(dataset) {
      dat <- get(dataset, envir = env)
      if(!is.null(filter) && !filter(dat)) return(NULL)
      cl <- class(dat)
      nms <- names(dat)
      nc <- ncol(dat)
      if (is.null(nc)) nc <- NA
      nr <- nrow(dat)
      if (is.null(nr)) nr <- NA

      out <- data.frame(
        package = pkg,
        data_name = data_name,
        dataset = dataset,
        class = I(list(cl)),
        class1 = cl[1],
        type = typeof(dat),
        names = I(list(nms)),
        names_collapsed = paste(nms, collapse = "/"),
        nrow = nr,
        ncol = nc,
        length = length(dat))

      if("data.frame" %in% cl) {
        classes <- lapply(dat, class)
        cl_flat <- unlist(classes)
        out <- transform(
          out,
          classes = I(list(classes)),
          types = I(list(vapply(dat, typeof, character(1)))),
          logical = sum(cl_flat == 'logical'),
          integer = sum(cl_flat == 'integer'),
          numeric = sum(cl_flat == 'numeric'),
          complex = sum(cl_flat == 'complex'),
          character = sum(cl_flat == 'character'),
          raw = sum(cl_flat == 'raw'),
          list = sum(cl_flat == 'list'),
          data.frame = sum(cl_flat == 'data.frame'),
          factor = sum(cl_flat == 'factor'),
          ordered = sum(cl_flat == 'ordered'),
          Date = sum(cl_flat == 'Date'),
          POSIXt = sum(cl_flat == 'POSIXt'),
          POSIXct = sum(cl_flat == 'POSIXct'),
          POSIXlt = sum(cl_flat == 'POSIXlt'))
      } else {
        out <- transform(
          out,
          nrow = NA,
          ncol = NA,
          classes = NA,
          types = NA,
          logical = NA,
          integer = NA,
          numeric = NA,
          complex = NA,
          character = NA,
          raw = NA,
          list = NA,
          data.frame = NA,
          factor = NA,
          ordered = NA,
          Date = NA,
          POSIXt = NA,
          POSIXct = NA,
          POSIXlt = NA)
      }
      if(is.matrix(dat)) {
        out$names <- list(colnames(dat))
        out$names_collapsed = paste(out$names, collapse = "/")
      }
      out
    })
    row_dfs_i <- do.call(rbind, row_dfs_i)
    if(!is.null(row_dfs_i)) row_dfs[[i]] <- row_dfs_i
    sink(type = "message")
  }
  df2 <- do.call(rbind, row_dfs)
  df <- merge(df, df2)
  df
}

