Answer to: Counting instances of genotype strings where order is irrelevant within locus
Score: 1
A quick one-liner using Tmisc::strSort (same basic approach of sorting the strings before using table):
library(Tmisc)
table(strSort(unlist(offspring, 0, 0)))
#>
#> aabb aabB aaBB aAbb aAbB aABB AAbb AAbB AABB
#> 1 2 1 2 4 2 1 2 1
A faster option if performance is a concern. This uses the non-sorting unordered match approach found in this answer.
strtable <- function(x) {
x <- unlist(x, 0, 0)
y <- mapply(utf8ToInt, x, USE.NAMES = FALSE)
u <- unique(`dim<-`(y, NULL))
y[] <- match(y, u)
y <- colSums(y*sin(y*pi/(length(u) + 1)))
i <- which(!duplicated(y))
`names<-`(tabulate(match(y, y[i])), x[i])
}
strtable(offspring)
#> AABB AAbB aABB aAbB AAbb aAbb aaBB aabB aabb
#> 1 2 2 4 1 2 1 2 1
Benchmarking against the functions from @RuiBarradas and @ThomasIsCoding (the function provided by @jay.sf does not return the correct counts for the general problem of counting occurrences of unordered strings--it assumes the OP's specific problem of "diploid, biallelic loci" as noted in jay.sf's answer).
microbenchmark::microbenchmark(
strtable(str),
table(strSort(unlist(str, 0, 0))),
Rui_Barradas(str),
tic(str)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> strtable(str) 8.3294 10.06245 12.71034 12.6583 14.5268 22.7197 100
#> table(strSort(unlist(str, 0, 0))) 169.1872 215.74645 236.81358 242.0942 260.7001 325.7841 100
#> Rui_Barradas(str) 172.1685 215.09295 252.07995 252.9611 283.8306 333.5353 100
#> tic(str) 149.1786 189.83450 210.64926 213.6807 228.9248 318.0333 100
Benchmark data:
str <- stringi::stri_rand_strings(1e4, 4, "[A-Ea-e]")
# partial check for correctness
res <- vector("list", 4)
res[[1]] <- sort(as.integer(table(strSort(unlist(str, 0, 0)))))
res[[2]] <- sort(as.integer(strtable(str)))
res[[3]] <- sort(as.integer(Rui_Barradas(str)))
res[[4]] <- sort(as.integer(tic(str)))
identical(res[-1], res[-4])
#> [1] TRUE
Functions:
Rui_Barradas <- function(x) {
lapply(x, strsplit, "") |>
unlist(recursive = FALSE) |>
lapply(sort) |>
sapply(paste, collapse = "") |>
table(useNA = "ifany")
}
tic <- function(x) { # ThomisIsCoding
table(sapply(unlist(x), \(x) intToUtf8(utf8ToInt(x)[order(utf8ToInt(x))])))
}
View Question ↗
Question
Parent Entity
Score: 4 • Views: 101
Site: stackoverflow
SaaS Metrics