How to Make a Sunburst Plot in R or Python

How to make a sunburst plot in R or Python?

Python version of sunburst diagram using matplotlib bars in polar projection:

import numpy as np
import matplotlib.pyplot as plt

def sunburst(nodes, total=np.pi * 2, offset=0, level=0, ax=None):
ax = ax or plt.subplot(111, projection='polar')

if level == 0 and len(nodes) == 1:
label, value, subnodes = nodes[0]
ax.bar([0], [0.5], [np.pi * 2])
ax.text(0, 0, label, ha='center', va='center')
sunburst(subnodes, total=value, level=level + 1, ax=ax)
elif nodes:
d = np.pi * 2 / total
labels = []
widths = []
local_offset = offset
for label, value, subnodes in nodes:
labels.append(label)
widths.append(value * d)
sunburst(subnodes, total=total, offset=local_offset,
level=level + 1, ax=ax)
local_offset += value
values = np.cumsum([offset * d] + widths[:-1])
heights = [1] * len(nodes)
bottoms = np.zeros(len(nodes)) + level - 0.5
rects = ax.bar(values, heights, widths, bottoms, linewidth=1,
edgecolor='white', align='edge')
for rect, label in zip(rects, labels):
x = rect.get_x() + rect.get_width() / 2
y = rect.get_y() + rect.get_height() / 2
rotation = (90 + (360 - np.degrees(x) % 180)) % 360
ax.text(x, y, label, rotation=rotation, ha='center', va='center')

if level == 0:
ax.set_theta_direction(-1)
ax.set_theta_zero_location('N')
ax.set_axis_off()

Example, how this function can be used:

data = [
('/', 100, [
('home', 70, [
('Images', 40, []),
('Videos', 20, []),
('Documents', 5, []),
]),
('usr', 15, [
('src', 6, [
('linux-headers', 4, []),
('virtualbox', 1, []),

]),
('lib', 4, []),
('share', 2, []),
('bin', 1, []),
('local', 1, []),
('include', 1, []),
]),
]),
]

sunburst(data)

python matplotlib sunburst diagram

Follow up of : How to make a sunburst plot in R?

You can try adding a row ID column to your data frame & use it explicitly as a grouping variable. This prevents ggplot() from grouping the bars by the fill aesthetic:

library(dplyr)

ggplot(df %>% mutate(id = seq(1, n())),
aes(y = value, group = id)) +
geom_col(aes(fill = level1, x = 0), width = .5) +
geom_col(aes(fill = level2, x = .25), width = .25) +
coord_polar(theta = 'y')

plot

How to format data for plotly sunburst diagram

You are absolutely right, compared to the rest of the intuitiv usage of plotly's R API preparing data for a sunburst (or treemap) chart is rather annoying.

I had the same problem and wrote a function based on library(data.table) to prepare the data, accepting two different data.frame input formats.

The format required to generate a sunburst plot using data similarly structured as yours can be seen here under the section Sunburst with Repeated Labels.

For your example it should look like this:

         labels values         parents                           ids
1: total 1658 <NA> total
2: private 353 total total - private
3: public 1120 total total - public
4: mixed 185 total total - mixed
5: residential 108 total - private total - private - residential
6: recreation 143 total - private total - private - recreation
7: commercial 102 total - private total - private - commercial
8: residential 300 total - public total - public - residential
9: recreation 320 total - public total - public - recreation
10: commercial 500 total - public total - public - commercial
11: residential 37 total - mixed total - mixed - residential
12: recreation 58 total - mixed total - mixed - recreation
13: commercial 90 total - mixed total - mixed - commercial

Here is the code to get there:

library(data.table)
library(plotly)

DF <- data.table(ownership=c(rep("private", 3), rep("public",3),rep("mixed", 3)),
landuse=c(rep(c("residential", "recreation", "commercial"),3)),
acres=c(108, 143, 102, 300, 320, 500, 37, 58, 90))

as.sunburstDF <- function(DF, value_column = NULL, add_root = FALSE){
require(data.table)

colNamesDF <- names(DF)

if(is.data.table(DF)){
DT <- copy(DF)
} else {
DT <- data.table(DF, stringsAsFactors = FALSE)
}

if(add_root){
DT[, root := "Total"]
}

colNamesDT <- names(DT)
hierarchy_columns <- setdiff(colNamesDT, value_column)
DT[, (hierarchy_columns) := lapply(.SD, as.factor), .SDcols = hierarchy_columns]

if(is.null(value_column) && add_root){
setcolorder(DT, c("root", colNamesDF))
} else if(!is.null(value_column) && !add_root) {
setnames(DT, value_column, "values", skip_absent=TRUE)
setcolorder(DT, c(setdiff(colNamesDF, value_column), "values"))
} else if(!is.null(value_column) && add_root) {
setnames(DT, value_column, "values", skip_absent=TRUE)
setcolorder(DT, c("root", setdiff(colNamesDF, value_column), "values"))
}

hierarchyList <- list()

for(i in seq_along(hierarchy_columns)){
current_columns <- colNamesDT[1:i]
if(is.null(value_column)){
currentDT <- unique(DT[, ..current_columns][, values := .N, by = current_columns], by = current_columns)
} else {
currentDT <- DT[, lapply(.SD, sum, na.rm = TRUE), by=current_columns, .SDcols = "values"]
}
setnames(currentDT, length(current_columns), "labels")
hierarchyList[[i]] <- currentDT
}

hierarchyDT <- rbindlist(hierarchyList, use.names = TRUE, fill = TRUE)

parent_columns <- setdiff(names(hierarchyDT), c("labels", "values", value_column))
hierarchyDT[, parents := apply(.SD, 1, function(x){fifelse(all(is.na(x)), yes = NA_character_, no = paste(x[!is.na(x)], sep = ":", collapse = " - "))}), .SDcols = parent_columns]
hierarchyDT[, ids := apply(.SD, 1, function(x){paste(x[!is.na(x)], collapse = " - ")}), .SDcols = c("parents", "labels")]
hierarchyDT[, c(parent_columns) := NULL]
return(hierarchyDT)
}

sunburstDF <- as.sunburstDF(DF, value_column = "acres", add_root = TRUE)

plot_ly(data = sunburstDF, ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')

result

Here is an example for the second data.frame format accepted by the function (value_column = NULL, because it is calculated from the data):

DF2 <- data.frame(sample(LETTERS[1:3], 100, replace = TRUE),
sample(LETTERS[4:6], 100, replace = TRUE),
sample(LETTERS[7:9], 100, replace = TRUE),
sample(LETTERS[10:12], 100, replace = TRUE),
sample(LETTERS[13:15], 100, replace = TRUE),
stringsAsFactors = FALSE)

plot_ly(data = as.sunburstDF(DF2, add_root = TRUE), ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')

Please also see library(sunburstR) as an alternative.


Edit: Added a benchmark regarding the dplyr based count_to_sunburst() function from library(plotme) (see below), which on my system is around 5 times slower than the data.table version.

Unit: milliseconds
expr min lq mean median uq max neval
plotme 50.4618 53.09425 60.92404 55.37815 63.62315 122.3842 100
ismirsehregal 8.6553 10.28870 12.63881 11.53760 12.26620 108.2025 100

Code to reproduce the benchmark:

# devtools::install_github("yogevherz/plotme")

library(microbenchmark)
library(plotme)
library(dplyr)
library(data.table)
library(plotly)

DF <- data.frame(ownership=c(rep("private", 3), rep("public",3),rep("mixed", 3)),
landuse=c(rep(c("residential", "recreation", "commercial"),3)),
acres=c(108, 143, 102, 300, 320, 500, 37, 58, 90))

as.sunburstDF <- function(DF, value_column = NULL, add_root = FALSE){
require(data.table)

colNamesDF <- names(DF)

if(is.data.table(DF)){
DT <- copy(DF)
} else {
DT <- data.table(DF, stringsAsFactors = FALSE)
}

if(add_root){
DT[, root := "Total"]
}

colNamesDT <- names(DT)
hierarchy_columns <- setdiff(colNamesDT, value_column)
DT[, (hierarchy_columns) := lapply(.SD, as.factor), .SDcols = hierarchy_columns]

if(is.null(value_column) && add_root){
setcolorder(DT, c("root", colNamesDF))
} else if(!is.null(value_column) && !add_root) {
setnames(DT, value_column, "values", skip_absent=TRUE)
setcolorder(DT, c(setdiff(colNamesDF, value_column), "values"))
} else if(!is.null(value_column) && add_root) {
setnames(DT, value_column, "values", skip_absent=TRUE)
setcolorder(DT, c("root", setdiff(colNamesDF, value_column), "values"))
}

hierarchyList <- list()

for(i in seq_along(hierarchy_columns)){
current_columns <- colNamesDT[1:i]
if(is.null(value_column)){
currentDT <- unique(DT[, ..current_columns][, values := .N, by = current_columns], by = current_columns)
} else {
currentDT <- DT[, lapply(.SD, sum, na.rm = TRUE), by=current_columns, .SDcols = "values"]
}
setnames(currentDT, length(current_columns), "labels")
hierarchyList[[i]] <- currentDT
}

hierarchyDT <- rbindlist(hierarchyList, use.names = TRUE, fill = TRUE)

parent_columns <- setdiff(names(hierarchyDT), c("labels", "values", value_column))
hierarchyDT[, parents := apply(.SD, 1, function(x){fifelse(all(is.na(x)), yes = NA_character_, no = paste(x[!is.na(x)], sep = ":", collapse = " - "))}), .SDcols = parent_columns]
hierarchyDT[, ids := apply(.SD, 1, function(x){paste(x[!is.na(x)], collapse = " - ")}), .SDcols = c("parents", "labels")]
hierarchyDT[, c(parent_columns) := NULL]
return(hierarchyDT)
}

microbenchmark(plotme = {
DF %>%
rename(n = acres) %>%
count_to_sunburst()
}, ismirsehregal = {
plot_ly(data = as.sunburstDF(DF, value_column = "acres", add_root = TRUE), ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')
})


Related Topics



Leave a reply



Submit