r - do.call(rbind, list) for uneven number of column

ID : 274341

viewed : 38

Tags : rlistdata-bindingr





Top 5 Answer for r - do.call(rbind, list) for uneven number of column

vote vote

96

rbind.fill is an awesome function that does really well on list of data.frames. But IMHO, for this case, it could be done much faster when the list contains only (named) vectors.

The rbind.fill way

require(plyr) rbind.fill(lapply(x,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)})) 

A more straightforward way (and efficient for this scenario at least):

rbind.named.fill <- function(x) {     nam <- sapply(x, names)     unam <- unique(unlist(nam))     len <- sapply(x, length)     out <- vector("list", length(len))     for (i in seq_along(len)) {         out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]     }     setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam) } 

Basically, we get total unique names to form the columns of the final data.frame. Then, we create a list with length = input and just fill the rest of the values with NA. This is probably the "trickiest" part as we've to match the names while filling NA. And then, we set names once finally to the columns (which can be set by reference using setnames from data.table package as well if need be).


Now to some benchmarking:

Data:

# generate some huge random data: set.seed(45) sample.fun <- function() {     nam <- sample(LETTERS, sample(5:15))     val <- sample(letters, length(nam))     setNames(val, nam)   } ll <- replicate(1e4, sample.fun()) 

Functions:

# plyr's rbind.fill version: rbind.fill.plyr <- function(x) {     rbind.fill(lapply(x,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)})) }  rbind.named.fill <- function(x) {     nam <- sapply(x, names)     unam <- unique(unlist(nam))     len <- sapply(x, length)     out <- vector("list", length(len))     for (i in seq_along(len)) {         out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]     }     setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam) } 

Update (added GSee's function as well):

foo <- function (...)  {   dargs <- list(...)   all.names <- unique(names(unlist(dargs)))   out <- do.call(rbind, lapply(dargs, `[`, all.names))   colnames(out) <- all.names   as.data.frame(out, stringsAsFactors=FALSE) } 

Benchmarking:

require(microbenchmark) microbenchmark(t1 <- rbind.named.fill(ll),                 t2 <- rbind.fill.plyr(ll),                 t3 <- do.call(foo, ll), times=10) identical(t1, t2) # TRUE identical(t1, t3) # TRUE  Unit: milliseconds                        expr        min         lq     median         uq        max neval  t1 <- rbind.named.fill(ll)   243.0754   258.4653   307.2575   359.4332   385.6287    10   t2 <- rbind.fill.plyr(ll) 16808.3334 17139.3068 17648.1882 17890.9384 18220.2534    10      t3 <- do.call(foo, ll)   188.5139   204.2514   229.0074   339.6309   359.4995    10 
vote vote

81

If you want the result to be a matrix...

I recently wrote this function for a co-worker that wanted to rbind vectors into a matrix.

foo <- function (...)  {   dargs <- list(...)   if (!all(vapply(dargs, is.vector, TRUE)))        stop("all inputs must be vectors")   if (!all(vapply(dargs, function(x) !is.null(names(x)), TRUE)))        stop("all input vectors must be named.")   all.names <- unique(names(unlist(dargs)))   out <- do.call(rbind, lapply(dargs, `[`, all.names))   colnames(out) <- all.names   out }  R > do.call(foo, x)      A   B   C   D   E   F   G   H   I   J   L   O   R   P   T   [1,] "b" "d" "f" "h" "j" "l" "n" "p" "r" "t" NA  NA  NA  NA  NA  [2,] NA  NA  "c" NA  NA  "f" NA  NA  "i" NA  "l" "o" "r" NA  NA  [3,] NA  NA  NA  "d" NA  NA  NA  "h" NA  NA  "l" NA  NA  "p" "t" 
vote vote

73

Here's a version using the package data.table, a bit faster for very big data. It uses the function rbindlist and its argument fill=TRUE passed to the function do.call.

library(data.table) x <- list() x[[1]] <- letters[seq(2,20,by=2)] names(x[[1]]) <- LETTERS[c(1:length(x[[1]]))] x[[2]] <- letters[seq(3,20, by=3)] names(x[[2]]) <- LETTERS[seq(3,20, by=3)] x[[3]] <- letters[seq(4,20, by=4)] names(x[[3]]) <- LETTERS[seq(4,20, by=4)]   x2 <- lapply(x, as.list) rbindlist(x2, fill=TRUE) #>       A    B    C    D    E    F    G    H    I    J    L    O    R    P    T #> 1:    b    d    f    h    j    l    n    p    r    t <NA> <NA> <NA> <NA> <NA> #> 2: <NA> <NA>    c <NA> <NA>    f <NA> <NA>    i <NA>    l    o    r <NA> <NA> #> 3: <NA> <NA> <NA>    d <NA> <NA> <NA>    h <NA> <NA>    l <NA> <NA>    p    t 

It adds a small overhead because it needs the character vectors to be converted with as.list. This passage can also add time to the process, depending on how the data is generated.
On the other hand, it seems to perform faster on large datasets.
It returns a data.table.

I rewrote @Arun and @GSee's examples to generate a bigger sample.

Data

# generate some huge random data: set.seed(45) sample.fun <- function() {   nam <- sample(LETTERS, sample(5:15))   val <- sample(letters, length(nam))   setNames(val, nam)   } l1 <- replicate(1e6, sample.fun()) # Arun's data, just bigger l2 <- lapply(l1, as.list) # same data converted with as.list 

Functions

library(microbenchmark) library(data.table) # Arun's function rbind.named.fill <- function(x) {   nam <- sapply(x, names)   unam <- unique(unlist(nam))   len <- sapply(x, length)   out <- vector("list", length(len))   for (i in seq_along(len)) {     out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]   }   setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam) }  # GSee's function foo <- function (...)  {   dargs <- list(...)   all.names <- unique(names(unlist(dargs)))   out <- do.call(rbind, lapply(dargs, `[`, all.names))   colnames(out) <- all.names   as.data.frame(out, stringsAsFactors=FALSE) } 

Benchmarking

microbenchmark(t1 <- rbind.named.fill(l1),                 t2 <- rbindlist(l2, fill=TRUE),                t3 <- do.call(foo, l1),                times=10) #> Unit: seconds #>                                 expr      min        lq        mean    median        uq      max neval #> t1 <- rbind.named.fill(l1)      6.536782  7.545538   9.118771  9.304844 10.505814 11.28260    10 #> t2 <- rbindlist(l2, fill=TRUE)  5.250387  5.787712   6.910340  6.226065  7.579503 10.40524    10 #> t3 <- do.call(foo, l1)          9.590615 11.043557  13.504694 12.550535 15.364464 19.95877    10   identical(t1, data.frame(t2)) #> [1] TRUE identical(t3, data.frame(t2)) #> [1] TRUE 

Created on 2019-08-01 by the reprex package (v0.3.0)

vote vote

62

After you convert your names vector to individual dataframes, you can use dplyr::bind_rows

dplyr::bind_rows(lapply(x,function(y) as.data.frame(t(y),stringsAsFactors=FALSE)))  #     A    B    C    D    E    F    G    H    I    J    L    O    R    P    T #1    b    d    f    h    j    l    n    p    r    t <NA> <NA> <NA> <NA> <NA> #2 <NA> <NA>    c <NA> <NA>    f <NA> <NA>    i <NA>    l    o    r <NA> <NA> #3 <NA> <NA> <NA>    d <NA> <NA> <NA>    h <NA> <NA>    l <NA> <NA>    p    t 

We can also use purrr::map_df/purrr::map_dfr in this case

purrr::map_df(x, ~as.data.frame(t(.x),stringsAsFactors = FALSE)) 

This will give the same output as above.

vote vote

57

Top 3 video Explaining r - do.call(rbind, list) for uneven number of column







Related QUESTION?