vignettes/comparison_with_base_R.Rmd
comparison_with_base_R.Rmd
In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.
transcripts
Tidy transcriptomics
rowData(tt)$gene_name = rownames(tt)
tt.aggr = tt %>% aggregate_duplicates(.transcript = gene_name)
Base R
counts
Tidy transcriptomics
tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance()
Base R
library(edgeR)
<- DGEList(count_m=x,group=group)
dgList <- filterByExpr(dgList)
keep <- dgList[keep,,keep.lib.sizes=FALSE]
dgList
[...]<- calcNormFactors(dgList, method="TMM")
dgList <- cpm(dgList) norm_counts.table
variable transcripts
We may want to identify and filter variable transcripts.
Tidy transcriptomics
tt.norm.variable = tt.norm %>% keep_variable()
Base R
dimensions
Tidy transcriptomics
tt.norm.MDS =
tt.norm %>%
reduce_dimensions(method="MDS", .dims = 2)
Base R
Tidy transcriptomics
tt.norm.PCA =
tt.norm %>%
reduce_dimensions(method="PCA", .dims = 2)
Base R
Tidy transcriptomics
tt.norm.tSNE =
breast_tcga_mini_SE %>%
tidybulk( sample, ens, count_scaled) %>%
identify_abundant() %>%
reduce_dimensions(
method = "tSNE",
perplexity=10,
pca_scale =TRUE
)
Base R
dimensions
Tidy transcriptomics
tt.norm.MDS.rotated =
tt.norm.MDS %>%
rotate_dimensions(`Dim1`, `Dim2`, rotation_degrees = 45, action="get")
Base R
differential abundance
Tidy transcriptomics
tt.de =
tt %>%
test_differential_abundance( ~ condition, action="get")
tt.de
Base R
library(edgeR)
dgList <- DGEList(counts=counts_m,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
design <- model.matrix(~group)
dgList <- estimateDisp(dgList,design)
fit <- glmQLFit(dgList,design)
qlf <- glmQLFTest(fit,coef=2)
topTags(qlf, n=Inf)
counts
Tidy transcriptomics
tt.norm.adj =
tt.norm %>% adjust_abundance( ~ condition + time)
Base R
library(sva)
count_m_log = log(count_m + 1)
design =
model.matrix(
object = ~ condition + time,
data = annotation
)
count_m_log.sva =
ComBat(
batch = design[,2],
mod = design,
...
)
count_m_log.sva = ceiling(exp(count_m_log.sva) -1)
count_m_log.sva$cell_type = counts[
match(counts$sample, rownames(count_m_log.sva)),
"Cell type"
]
Cell type composition
Tidy transcriptomics
tt.cibersort =
tt %>%
deconvolve_cellularity(action="get", cores=1)
Base R
source(‘CIBERSORT.R’)
%>% write.table("mixture_file.txt")
count_m <- CIBERSORT(
results "sig_matrix_file.txt",
"mixture_file.txt",
perm=100, QN=TRUE
)$cell_type = tibble_counts[
resultsmatch(tibble_counts$sample, rownames(results)),
"Cell type"
]
samples
Tidy transcriptomics
tt.norm.cluster = tt.norm.MDS %>%
cluster_elements(method="kmeans", centers = 2, action="get" )
Base R
Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable.
Tidy transcriptomics
tt.norm.SNN =
tt.norm.tSNE %>%
cluster_elements(method = "SNN")
Base R
library(Seurat)
snn = CreateSeuratObject(count_m)
snn = ScaleData(
snn, display.progress = TRUE,
num.cores=4, do.par = TRUE
)
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = RunPCA(snn, npcs = 30)
snn = FindNeighbors(snn)
snn = FindClusters(snn, method = "igraph", ...)
snn = snn[["seurat_clusters"]]
snn$cell_type = tibble_counts[
match(tibble_counts$sample, rownames(snn)),
c("Cell type", "Dim1", "Dim2")
]
redundant
transcripts
Tidy transcriptomics
tt.norm.non_redundant =
tt.norm.MDS %>%
remove_redundancy( method = "correlation" )
Base R
library(widyr)
.data.correlated =
pairwise_cor(
counts,
sample,
transcript,
rc,
sort = TRUE,
diag = FALSE,
upper = FALSE
) %>%
filter(correlation > correlation_threshold) %>%
distinct(item1) %>%
rename(!!.element := item1)
# Return non redundant data frame
counts %>% anti_join(.data.correlated) %>%
spread(sample, rc, - transcript) %>%
left_join(annotation)
heatmap
tidytranscriptomics
tt.norm.MDS %>%
# filter lowly abundant
keep_abundant() %>%
# extract 500 most variable genes
keep_variable( .abundance = count_scaled, top = 500) %>%
# create heatmap
heatmap(sample, transcript, count_scaled, transform = log1p) %>%
add_tile(`Cell type`)
Base R
# Example taken from airway dataset from BioC2020 workshop.
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$`Cell type`)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")
density plot
tidytranscriptomics
# Example taken from airway dataset from BioC2020 workshop.
airway %>%
tidybulk() %>%
identify_abundant() %>%
scale_abundance() %>%
pivot_longer(cols = starts_with("counts"), names_to = "source", values_to = "abundance") %>%
filter(!lowly_abundant) %>%
ggplot(aes(x=abundance + 1, color=sample)) +
geom_density() +
facet_wrap(~source) +
scale_x_log10()
Base R
# Example taken from airway dataset from BioC2020 workshop.
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$dex)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] tidySummarizedExperiment_1.8.0 SummarizedExperiment_1.28.0
## [3] Biobase_2.58.0 GenomicRanges_1.50.2
## [5] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [7] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [9] MatrixGenerics_1.10.0 matrixStats_0.63.0
## [11] tidybulk_1.11.3 ggrepel_0.9.3
## [13] ggplot2_3.4.1 magrittr_2.0.3
## [15] tibble_3.1.8 tidyr_1.3.0
## [17] dplyr_1.1.0 knitr_1.42
## [19] BiocStyle_2.26.0
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.16 colorspace_2.1-0 ellipsis_0.3.2
## [4] rprojroot_2.0.3 XVector_0.38.0 fs_1.6.1
## [7] tidytext_0.4.1 SnowballC_0.7.0 bit64_4.0.5
## [10] AnnotationDbi_1.60.0 fansi_1.0.4 codetools_0.2-19
## [13] splines_4.2.2 cachem_1.0.7 jsonlite_1.8.4
## [16] broom_1.0.3 annotate_1.76.0 png_0.1-8
## [19] BiocManager_1.30.20 readr_2.1.4 compiler_4.2.2
## [22] httr_1.4.5 backports_1.4.1 Matrix_1.5-3
## [25] fastmap_1.1.1 lazyeval_0.2.2 limma_3.54.2
## [28] cli_3.6.0 htmltools_0.5.4 tools_4.2.2
## [31] gtable_0.3.1 glue_1.6.2 GenomeInfoDbData_1.2.9
## [34] reshape2_1.4.4 Rcpp_1.0.10 jquerylib_0.1.4
## [37] pkgdown_2.0.7 vctrs_0.5.2 Biostrings_2.66.0
## [40] preprocessCore_1.60.2 nlme_3.1-162 xfun_0.37
## [43] stringr_1.5.0 lifecycle_1.0.3 XML_3.99-0.13
## [46] edgeR_3.40.2 zlibbioc_1.44.0 scales_1.2.1
## [49] ragg_1.2.5 hms_1.1.2 parallel_4.2.2
## [52] yaml_2.3.7 memoise_2.0.1 sass_0.4.5
## [55] stringi_1.7.12 RSQLite_2.3.0 genefilter_1.80.3
## [58] tokenizers_0.3.0 desc_1.4.2 BiocParallel_1.32.5
## [61] rlang_1.0.6 pkgconfig_2.0.3 systemfonts_1.0.4
## [64] bitops_1.0-7 evaluate_0.20 lattice_0.20-45
## [67] purrr_1.0.1 htmlwidgets_1.6.1 bit_4.0.5
## [70] tidyselect_1.2.0 plyr_1.8.8 bookdown_0.32
## [73] R6_2.5.1 generics_0.1.3 DelayedArray_0.24.0
## [76] DBI_1.1.3 pillar_1.8.1 withr_2.5.0
## [79] mgcv_1.8-42 survival_3.5-3 KEGGREST_1.38.0
## [82] RCurl_1.98-1.10 janeaustenr_1.0.0 widyr_0.1.5
## [85] crayon_1.5.2 utf8_1.2.3 plotly_4.10.1
## [88] tzdb_0.3.0 rmarkdown_2.20 locfit_1.5-9.7
## [91] grid_4.2.2 sva_3.46.0 data.table_1.14.8
## [94] blob_1.2.3 digest_0.6.31 xtable_1.8-4
## [97] textshaping_0.3.6 munsell_0.5.0 viridisLite_0.4.1
## [100] bslib_0.4.2