clustifyr

clustifyr classifies cells and clusters in single-cell RNA sequencing experiments using reference bulk RNA-seq data sets, sorted microarray expression data, single-cell gene signatures, or lists of marker genes.

Installation

Install the Bioconductor version with:

# Install BiocManager first when it is not already available, then use it to
# install clustifyr.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install("clustifyr")

Install the development version with:

BiocManager::install("rnabioco/clustifyr")

Example usage

In this example we use the following built-in input data:

an expression matrix of single cell RNA-seq data (pbmc_matrix_small)
a metadata data.frame (pbmc_meta), with cluster information stored in the "classified" column
a vector of variable genes (pbmc_vargenes)
a matrix of mean normalized scRNA-seq UMI counts by cell type (cbmc_ref)

We then calculate correlation coefficients and plot them on a pre-calculated projection (stored in pbmc_meta).

library(clustifyr)

# calculate correlation coefficients against the reference
res <- clustify(
  input = pbmc_matrix_small,       # expression matrix of single cell RNA-seq data
  metadata = pbmc_meta$classified, # cluster information from pbmc_meta
  ref_mat = cbmc_ref,              # mean normalized UMI counts by cell type
  query_genes = pbmc_vargenes      # vector of variable genes
)

# print assignments: one row per cluster with the called type and its r value
cor_to_call(res)
#> # A tibble: 9 × 3
#> # Groups:   cluster [9]
#>   cluster      type           r
#>   <chr>        <chr>      <dbl>
#> 1 B            B          0.909
#> 2 CD14+ Mono   CD14+ Mono 0.915
#> 3 FCGR3A+ Mono CD16+ Mono 0.929
#> 4 Memory CD4 T CD4 T      0.861
#> 5 Naive CD4 T  CD4 T      0.889
#> 6 DC           DC         0.849
#> 7 Platelet     Mk         0.732
#> 8 CD8 T        NK         0.826
#> 9 NK           NK         0.894

# plot assignments on the pre-calculated projection stored in pbmc_meta
plot_best_call(
  cor_mat = res,              # correlation result returned by clustify()
  metadata = pbmc_meta,       # metadata data.frame
  cluster_col = "classified"  # column holding the cluster information
)

clustify() can take a clustered SingleCellExperiment or Seurat object (both v2 and v3) and assign identities.

# for SingleCellExperiment
clustify(
  input = sce_small,          # an SCE object
  ref_mat = cbmc_ref,         # matrix of RNA-seq expression data for each cell type
  cluster_col = "cell_type1", # name of column in colData containing cell clusters
  obj_out = TRUE              # output SCE object with "type" and "r" columns added to colData
) 
#> class: SingleCellExperiment 
#> dim: 200 200 
#> metadata(0):
#> assays(2): counts logcounts
#> rownames(200): SGIP1 AZIN2 ... TAF12 SNHG3
#> rowData names(10): feature_symbol is_feature_control ... total_counts
#>   log10_total_counts
#> colnames(200): AZ_A1 AZ_A10 ... HP1502401_E18 HP1502401_E19
#> colData names(35): cell_quality cell_type1 ... type r
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):

library(Seurat)
# for Seurat3/4
clustify(
  input = s_small3,              # a Seurat object
  cluster_col = "RNA_snn_res.1", # name of column containing cell clusters
  ref_mat = cbmc_ref,            # matrix of expression data for each cell type
  seurat_out = TRUE              # return a Seurat object (see output below)
)
#> An object of class Seurat 
#> 230 features across 80 samples within 1 assay 
#> Active assay: RNA (230 features, 20 variable features)
#>  2 dimensional reductions calculated: pca, tsne

# New output option: return the calls directly as a vector (in the order of
# the metadata), which can then be inserted into metadata data frames and
# other workflows
clustify(
  input = s_small3,
  cluster_col = "RNA_snn_res.1",
  ref_mat = cbmc_ref,
  vec_out = TRUE
)
#>  [1] "Mk"         "Mk"         "Mk"         "Mk"         "Mk"        
#>  [6] "Mk"         "Mk"         "Mk"         "Mk"         "Mk"        
#> [11] "B"          "B"          "B"          "B"          "B"         
#> [16] "B"          "B"          "B"          "B"          "B"         
#> [21] "CD16+ Mono" "CD16+ Mono" "CD16+ Mono" "CD16+ Mono" "CD16+ Mono"
#> [26] "CD16+ Mono" "CD16+ Mono" "CD16+ Mono" "CD16+ Mono" "CD16+ Mono"
#> [31] "Mk"         "B"          "Mk"         "Mk"         "Mk"        
#> [36] "Mk"         "Mk"         "Mk"         "Mk"         "Mk"        
#> [41] "Mk"         "B"          "Mk"         "Mk"         "B"         
#> [46] "B"          "Mk"         "Mk"         "Mk"         "Mk"        
#> [51] "CD16+ Mono" "CD16+ Mono" "B"          "CD16+ Mono" "CD16+ Mono"
#> [56] "CD16+ Mono" "CD16+ Mono" "CD16+ Mono" "CD16+ Mono" "Mk"        
#> [61] "B"          "CD16+ Mono" "B"          "CD16+ Mono" "B"         
#> [66] "CD16+ Mono" "CD16+ Mono" "CD16+ Mono" "CD16+ Mono" "B"         
#> [71] "Mk"         "Mk"         "Mk"         "Mk"         "Mk"        
#> [76] "Mk"         "Mk"         "Mk"         "Mk"         "CD16+ Mono"

New reference matrices can be made directly from SingleCellExperiment and Seurat objects. Other scRNA-seq experiment object types are also supported.

# make a reference matrix (sce_ref) from a SingleCellExperiment object
sce_ref <- object_ref(
  input = sce_small,               # SCE object
  cluster_col = "cell_type1"       # name of column in colData containing cell identities
)
#> The following clusters have less than 10 cells for this analysis: co-expression, ductal, endothelial, epsilon, MHC class II, PSC. Classification is likely inaccurate.

# make a reference matrix (s_ref) from a Seurat object
s_ref <- seurat_ref(
  seurat_object = s_small3,
  cluster_col = "RNA_snn_res.1"
)

# preview the first rows of the new reference matrix
head(s_ref)
#>                 0        1        2
#> MS4A1    0.000000 1.126047 5.151065
#> CD79B    2.469341 2.920407 5.031316
#> CD79A    0.000000 2.535151 5.375681
#> HLA-DRA  3.640368 6.008446 7.055386
#> TCL1A    0.000000 1.495867 4.963367
#> HLA-DQB1 1.603068 3.836290 5.137422

clustify_lists() handles identity assignment of matrix, SingleCellExperiment, and Seurat objects based on marker gene lists.

# assign identities to a matrix input based on marker gene lists
# NOTE(review): marker_inmatrix = FALSE presumably declares that pbmc_markers
# is not supplied in matrix form -- confirm against the clustify_lists() docs
clustify_lists(
  input = pbmc_matrix_small,
  metadata = pbmc_meta,
  cluster_col = "classified",
  marker = pbmc_markers,
  marker_inmatrix = FALSE
)
#>                      0        1        2         3         4        5        6
#> Naive CD4 T  1.5639055 20.19469 31.77095  8.664074 23.844992 19.06931 19.06931
#> Memory CD4 T 1.5639055 20.19469 31.77095 10.568007 23.844992 17.97875 19.06931
#> CD14+ Mono   0.9575077 14.70716 76.21353 17.899569 11.687739 49.86699 16.83210
#> B            0.6564777 12.70976 31.77095 26.422929 13.536295 20.19469 13.53630
#> CD8 T        1.0785353 17.97875 31.82210 12.584823 31.822099 22.71234 40.45383
#> FCGR3A+ Mono 0.6564777 13.63321 72.43684 17.899569  9.726346 56.48245 14.61025
#> NK           0.6564777 14.61025 31.82210  7.757206 31.822099 22.71234 45.05072
#> DC           0.6564777 15.80598 63.34978 19.069308 13.758144 40.56298 17.97875
#> Platelet     0.5428889 13.34769 59.94938 14.215244 15.158755 46.92861 19.49246
#>                      7          8
#> Naive CD4 T   6.165348  0.6055118
#> Memory CD4 T  6.165348  0.9575077
#> CD14+ Mono   25.181595  1.0785353
#> B            17.899569  0.1401901
#> CD8 T         7.882145  0.3309153
#> FCGR3A+ Mono 21.409177  0.3309153
#> NK            5.358651  0.3309153
#> DC           45.101877  0.1401901
#> Platelet     19.492465 59.9493793

# same marker-list based assignment on a Seurat object; seurat_out = TRUE
# returns a Seurat object (see output below)
clustify_lists(
  input = s_small3,
  marker = pbmc_markers,
  marker_inmatrix = FALSE,
  cluster_col = "RNA_snn_res.1",
  seurat_out = TRUE
)
#> An object of class Seurat 
#> 230 features across 80 samples within 1 assay 
#> Active assay: RNA (230 features, 20 variable features)
#>  2 dimensional reductions calculated: pca, tsne

Additional resources

Script for benchmarking, compatible with scRNAseq_Benchmark
Additional reference data (including Tabula Muris, ImmGen, etc.) are available in a supplemental package, clustifyrdatahub. Also see the list for individual downloads.
See the FAQ for more details.

Simple example analysis with plotting code

# Load the packages used by this example.
library(RClusterCT)
library(ggplot2)

# Load the example data sets by name.
data("pbmc4k_avg")
data("pbmc_bulk_matrix")

# Apply the same gene constraints to both matrices.
gene_constraints <- list(pbmc4k_vargenes, rownames(pbmc_bulk_matrix))
pbmc4k_avg <- select_gene_subset(pbmc4k_avg, gene_constraints)
pbmc_bulk_matrix <- select_gene_subset(pbmc_bulk_matrix, gene_constraints)

# For each column of pbmc4k_avg, call compute_similarity() against every
# column of pbmc_bulk_matrix (method = "spearman") and bind the results
# side by side.
out <- lapply(
  colnames(pbmc4k_avg),
  function(cluster_id) {
    similarities <- lapply(
      colnames(pbmc_bulk_matrix),
      function(ref_id) {
        compute_similarity(
          pbmc4k_avg[, cluster_id],
          pbmc_bulk_matrix[, ref_id],
          corr_coef,
          method = "spearman"
        )
      }
    )
    do.call(cbind, similarities)
  }
)

# Stack the per-column results and label both dimensions.
res <- do.call(rbind, out)
rownames(res) <- colnames(pbmc4k_avg)
colnames(res) <- colnames(pbmc_bulk_matrix)
res[seq_len(10), seq_len(10)]
#>   primary human B cells rep 1 primary human myeloid DC rep 1
#> 0                  0.50820425                      0.4579628
#> 1                  0.32281079                      0.7202308
#> 2                  0.70432854                      0.4584629
#> 3                  0.49785471                      0.4361370
#> 4                  0.44130928                      0.4059912
#> 5                  0.40268039                      0.4198868
#> 6                  0.39736997                      0.6842751
#> 7                  0.36540944                      0.7451606
#> 8                  0.44289258                      0.4494797
#> 9                  0.06624097                      0.3350129
#>   primary human monocytes rep 1 primary human neutrophils rep 1
#> 0                     0.4022359                       0.3689856
#> 1                     0.7752308                       0.6415800
#> 2                     0.3883629                       0.3570139
#> 3                     0.3782507                       0.3569020
#> 4                     0.3422209                       0.3380242
#> 5                     0.3568106                       0.3518037
#> 6                     0.6643071                       0.5200495
#> 7                     0.6849474                       0.5670303
#> 8                     0.4079214                       0.3208810
#> 9                     0.3462206                       0.2792161
#>   primary human NK cells rep 1 primary human PBMC rep 1
#> 0                    0.5551566                0.5746751
#> 1                    0.3957439                0.6468407
#> 2                    0.4228903                0.4963271
#> 3                    0.5955744                0.5769416
#> 4                    0.6570067                0.5806158
#> 5                    0.7198484                0.5841992
#> 6                    0.4178842                0.6037755
#> 7                    0.4332017                0.6213894
#> 8                    0.3562691                0.4491803
#> 9                    0.1525381                0.3215894
#>   primary human T cells rep 1 primary human B cells rep 2
#> 0                  0.67825132                  0.50702268
#> 1                  0.29332822                  0.34436167
#> 2                  0.44460967                  0.71343253
#> 3                  0.70072958                  0.49931182
#> 4                  0.63764459                  0.44837165
#> 5                  0.57486120                  0.41192369
#> 6                  0.32515553                  0.41995243
#> 7                  0.34017640                  0.39020492
#> 8                  0.33956974                  0.45445820
#> 9                  0.05847776                  0.08712384
#>   primary human myeloid DC rep 2 primary human monocytes rep 2
#> 0                      0.4560228                     0.3908564
#> 1                      0.7079754                     0.7876344
#> 2                      0.4580736                     0.3794703
#> 3                      0.4332183                     0.3689152
#> 4                      0.4044759                     0.3372726
#> 5                      0.4171209                     0.3511799
#> 6                      0.6806905                     0.6662284
#> 7                      0.7437065                     0.6927999
#> 8                      0.4451127                     0.3973351
#> 9                      0.3345931                     0.3615359


# call plot_tsne() on pbmc4k_meta with "classified" as the feature
plot_tsne(pbmc4k_meta, feature = "classified")

# call plot_cor() for the first seven columns of res; the printed result
# below is a list with seven elements
plot_cor(res,
         pbmc4k_meta,
         colnames(res)[1:7])
#> Warning: package 'bindrcpp' was built under R version 3.4.4
#> [[1]]

#> 
#> [[2]]

#> 
#> [[3]]

#> 
#> [[4]]

#> 
#> [[5]]

#> 
#> [[6]]

#> 
#> [[7]]

Created on 2018-06-22 by the reprex package (v0.2.0).

rnabioco / clustifyr Goto Github PK

clustifyr's Introduction

clustifyr

Installation

Example usage

Additional resources

clustifyr's People

Contributors

Stargazers

Watchers

Forkers

clustifyr's Issues

install recount

Load recount R package

Recommend Projects

Recommend Topics

Recommend Org