We present here some usage case of Cell-ID, a clustering-free multivariate statistical method for the robust extraction of per-cell gene signatures from single-cell RNA-seq. Cell-ID allows unbiased cell identity recognition across different donors, tissues-of-origin, model organisms and single-cell omics protocols.
In order to install CellID, the user will have to set the repositories option to enable downloading Bioconductor dependencies.
To illustrate CellID usage we will use two different pancreatic data sets from Baron and Segerstolpe.
#download data
download.file(url = "https://storage.googleapis.com/cellid-cbl/BaronMatrix.rds",destfile = "BaronMatrix.rds")
download.file(url = "https://storage.googleapis.com/cellid-cbl/BaronMetaData.rds",destfile = "BaronMetaData.rds")
download.file(url = "https://storage.googleapis.com/cellid-cbl/SegerstolpeMatrix.rds",destfile = "SegerstolpeMatrix.rds")
download.file(url = "https://storage.googleapis.com/cellid-cbl/SegerstolpeMetaData2.rds",destfile = "SegerstolpeMetaData.rds")
#read data
BaronMatrix <- readRDS("BaronMatrix.rds")
BaronMetaData <- readRDS("BaronMetaData.rds")
SegerMatrix <- readRDS("SegerstolpeMatrix.rds")
SegerMetaData <- readRDS("SegerstolpeMetaData.rds")
CellID can be either used on Seurat or SingleCellExperiment object, here before creating the Seurat object we filter the matrix genes with protein coding genes.
# Remove non protein coding genes
BaronMatrixProt <- BaronMatrix[rownames(BaronMatrix) %in% HgProteinCodingGenes,]
SegerMatrixProt <- SegerMatrix[rownames(SegerMatrix) %in% HgProteinCodingGenes,]
# Create Seurat object and remove remove low detection genes
BaronSeurat <- CreateSeuratObject(counts = BaronMatrixProt, project = "Baron", min.cells = 5, meta.data = BaronMetaData)
SegerSeurat <- CreateSeuratObject(counts = SegerMatrixProt, project = "Segerstolpe", min.cells = 5, meta.data = SegerMetaData)
Before running CellID we perform standard seurat workflow as desribed here.
BaronSeurat <- NormalizeData(BaronSeurat)
BaronSeurat <- ScaleData(BaronSeurat, features = rownames(BaronSeurat))
#> Centering and scaling data matrix
BaronSeurat <- RunPCA(BaronSeurat, features = rownames(BaronSeurat))
#> PC_ 1
#> Positive: IFITM3, ZFP36L1, CDC42EP1, TMSB10, PMEPA1, YBX3, SOX4, IL32, TMSB4X, RHOC
#> S100A11, TACSTD2, TM4SF1, KRT7, IFITM2, CD44, FLNA, MSN, S100A16, MYH9
#> SDC4, LGALS3, SERPING1, PRSS8, SERPINA3, ITGA2, COL18A1, GSTP1, SAT1, DDIT4
#> Negative: PCSK1N, CPE, SCG5, PTPRN, CHGA, CHGB, GNAS, SCGN, SCG2, CD99
#> PAM, PEMT, VGF, PPP1R1A, BEX1, PCSK2, KIAA1324, TTR, PAX6, IDS
#> NLRP1, SLC30A8, EEF1A2, UCHL1, GAD2, APLP1, SYT7, NEUROD1, ERO1B, TMOD1
#> PC_ 2
#> Positive: CELA3A, PRSS1, CELA3B, CTRC, CTRB1, PRSS2, CLPS, CPA1, PNLIPRP1, PLA2G1B
#> CPA2, CELA2A, CTRB2, CPB1, KLK1, SYCN, REG1A, PNLIP, REG1B, GP2
#> CUZD1, SPINK1, PRSS3, CTRL, CELA2B, FOXD2, REG3G, MGST1, PDIA2, AMY2A
#> Negative: PSAP, IGFBP7, COL18A1, APP, CALR, ITGB1, PKM, LGALS3BP, CD81, ITM2B
#> GNAI2, GRN, APLP2, CTSD, CDK2AP1, LAMP1, PXDN, SPARC, ATP1A1, MMP14
#> ITGA5, PLXNB2, TIMP1, FSTL1, COL4A1, CALD1, CTSB, NBL1, HSPG2, FLNA
#> PC_ 3
#> Positive: SPARC, COL4A1, ENG, IGFBP4, COL6A2, PDGFRB, BGN, PXDN, COL15A1, COL3A1
#> COL1A2, CYGB, AEBP1, ADAMTS4, COL6A3, HTRA3, EMILIN1, LRRC32, CRISPLD2, COL1A1
#> COL5A3, C11orf96, LAMA4, MMP2, COL6A1, COL5A1, ITGA1, TIMP3, HIC1, COL5A2
#> Negative: KRT19, TACSTD2, KRT7, SPINT2, ELF3, KRT8, ANXA4, PRSS8, SERINC2, SPINT1
#> CDH1, LCN2, CLDN7, CLDN4, CFB, CD24, SERPINA5, CFTR, KRT18, CLDN1
#> MMP7, LGALS4, ST14, C3, SDC4, LAD1, S100A14, SLC4A4, PTPRF, ABCC3
#> PC_ 4
#> Positive: INS, HADH, IAPP, PCSK1, ADCYAP1, TIMP2, IGF2, DLK1, GSN, NPTX2
#> TSPAN13, PDX1, VAT1L, RPS25, WLS, RPL7, RPL24, MAFA, SLC6A6, SYT13
#> ELMO1, DNAJB9, UCHL1, RPL3, RGS16, RPS6, RBP4, PKIB, TGFBR3, RPL17
#> Negative: GCG, GC, TM4SF4, CLU, TMEM176B, IRX2, TTR, TMEM176A, FEV, F10
#> FXYD5, CRYBA2, SMIM24, SERPINA1, FXYD3, PCSK2, ARX, ALDH1A1, LOXL4, B2M
#> PAPPA2, CD82, GPX3, FXYD6, GLS, MUC13, VSTM2L, EGFL7, RGS4, COTL1
#> PC_ 5
#> Positive: COL6A3, COL1A2, PDGFRB, COL1A1, BGN, COL3A1, CRLF1, COL5A1, SFRP2, CYGB
#> THBS2, EMILIN1, COL6A1, FMOD, VCAN, COL6A2, PRRX1, COL5A2, COL5A3, ADAMTS12
#> CRISPLD2, LAMC3, AEBP1, NOTCH3, MXRA8, LUM, VSTM4, TPM2, INHBA, FN1
#> Negative: PLVAP, PECAM1, CD93, FLT1, KDR, SOX18, PODXL, ACVRL1, RGCC, VWF
#> ROBO4, TIE1, ESM1, ADGRL4, ESAM, ECSCR, S1PR1, SEMA3F, CXCR4, CLDN5
#> PASK, NOTCH4, ERG, DYSF, CDH5, CYP1A1, ANGPT2, BCL6B, IFI27, PTPRB
BaronSeurat <- RunUMAP(BaronSeurat, dims = 1:30)
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 10:56:40 UMAP embedding parameters a = 0.9922 b = 1.112
#> 10:56:40 Read 8569 rows and found 30 numeric columns
#> 10:56:40 Using Annoy for neighbor search, n_neighbors = 30
#> 10:56:40 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 10:56:41 Writing NN index file to temp file /tmp/Rtmp9xQMzV/file27ad36ef2b93
#> 10:56:41 Searching Annoy index using 1 thread, search_k = 3000
#> 10:56:44 Annoy recall = 100%
#> 10:56:45 Commencing smooth kNN distance calibration using 1 thread
#> 10:56:47 Initializing from normalized Laplacian + noise
#> 10:56:47 Commencing optimization for 500 epochs, with 352400 positive edges
#> 10:57:09 Optimization finished
BaronSeurat <- RunTSNE(BaronSeurat, dims = 1:30)
PCA <- DimPlot(BaronSeurat, reduction = "pca", group.by = "cell.type") + ggtitle("PCA") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
tSNE <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "cell.type")+ ggtitle("tSNE") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
UMAP <- DimPlot(BaronSeurat, reduction = "umap", group.by = "cell.type") + ggtitle("UMAP") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(tSNE, UMAP, common.legend = T, legend = "top")
CellID is based on Multiple Correspondence Analysis (MCA), a dimensionality reduction methods that allows the representation of both cells and genes in a single euclidean space. Running MCA enables to compute afterwards the distance betwwen genes and cells to obtain a ranking of gene specificity for each cells. MCA can be Run like other dimensionality reduction methods in Seurat by using the command RunMCA
. Using the DimPlotMC
it is also possible to plot both cells and genes in the same space.
BaronSeurat <- RunMCA(BaronSeurat)
#> Computing Fuzzy Matrix
#> 11.692 sec elapsed
#> Computing SVD
#> 81.23 sec elapsed
#> Computing Coordinates
#> 3.082 sec elapsed
MCA <- DimPlot(BaronSeurat, reduction = "mca", group.by = "cell.type") + ggtitle("MCA") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(PCA, MCA, common.legend = T, legend = "top")
After running MCA, the user can perform enrichment analysis on each cells in the data by performing Hypergeometric test. This enables using pre-established cell types gene sets in the litterature to predict cell types in the data. Here we will use the panglao database curated signature database to predict the cell types in the Baron data. We will use two different collection signatures: one with only known pancreatic cell types and one with all cell types.
#download signatures from panglaoDB
panglao <- read_tsv("https://panglaodb.se/markers/PanglaoDB_markers_27_Mar_2020.tsv.gz")
#> Parsed with column specification:
#> cols(
#> species = col_character(),
#> `official gene symbol` = col_character(),
#> `cell type` = col_character(),
#> nicknames = col_character(),
#> `ubiquitousness index` = col_double(),
#> `product description` = col_character(),
#> `gene type` = col_character(),
#> `canonical marker` = col_double(),
#> `germ layer` = col_character(),
#> organ = col_character(),
#> sensitivity_human = col_double(),
#> sensitivity_mouse = col_double(),
#> specificity_human = col_double(),
#> specificity_mouse = col_double()
#> )
#filter to get pancreas specific genes
panglao_pancreas <- panglao %>% filter(organ == "Pancreas")
#filter to get human specific genes
panglao_pancreas <- panglao_pancreas %>% filter(str_detect(species,"Hs"))
# convert dataframes to a list of vectors which is the format for CellID input
panglao_pancreas <- panglao_pancreas %>% group_by(`cell type`) %>% summarise(geneset = list(`official gene symbol`))
pancreas_gs <- setNames(panglao_pancreas$geneset, panglao_pancreas$`cell type`)
#filter to get human specific genes
panglao_all <- panglao %>% filter(str_detect(species,"Hs"))
# convert dataframes to a list of named vectors which is the format for CellID input
panglao_all <- panglao_all %>% group_by(`cell type`) %>% summarise(geneset = list(`official gene symbol`))
all_gs <- setNames(panglao_all$geneset, panglao_all$`cell type`)
#remove very short signatures
all_gs <- all_gs[sapply(all_gs, length) >= 10]
After downloading and formatting the signatures, we perform the Hypergeometric test to test each cells against each signatures. By assigning the most highly enriched signature to each cells, it is possible to determine the cell type composition of the dataset. The results quality is highly dependant on the quality of the signatures.
#perform hypergeometric test
HGT_pancreas_gs <- RunCellHGT(BaronSeurat, pathways = pancreas_gs, dims = 1:50)
#>
#> calculating distance
#> ranking genes
#> calculating number of success
#> performing hypergeometric test
#for each cells detect the maximum enrichment
pancreas_gs_prediction <- rownames(HGT_pancreas_gs)[apply(HGT_pancreas_gs, 2, which.max)]
#test if maximum enrichment is significant
pancreas_gs_prediction_signif <- ifelse(apply(HGT_pancreas_gs, 2, max)>2, yes = pancreas_gs_prediction, "unassigned")
#put predictions in Seurat metadata
BaronSeurat$pancreas_gs_prediction <- pancreas_gs_prediction_signif
#compare original labels with predictions
color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned"))
OriginalPlot <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "cell.type") +
scale_color_manual(values = ggcolor) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
Predplot1 <- DimPlot(BaronSeurat, reduction = "tsne", group.by = "pancreas_gs_prediction") +
scale_color_manual(values = ggcolor) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(OriginalPlot, Predplot1, legend = "top",common.legend = T)
HGT_all_gs <- RunCellHGT(BaronSeurat, pathways = all_gs, dims = 1:50)
#>
#> calculating distance
#> ranking genes
#> calculating number of success
#> performing hypergeometric test
all_gs_prediction <- rownames(HGT_all_gs)[apply(HGT_all_gs, 2, which.max)]
all_gs_prediction_signif <- ifelse(apply(HGT_all_gs, 2, max)>2, yes = all_gs_prediction, "unassigned")
BaronSeurat$all_gs_prediction <- ifelse(all_gs_prediction_signif %in% c(names(pancreas_gs), "Schwann cells", "Endothelial cells", "Macrophages", "Mast cells", "T cells","Fibroblasts", "unassigned"), all_gs_prediction_signif,"other")
color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "#D575FE", "#F962DD", "grey", "black")
ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "Fibroblasts", "Schwann cells", "unassigned", "other"))
BaronSeurat$pancreas_gs_prediction <- factor(BaronSeurat$pancreas_gs_prediction,c(sort(unique(BaronSeurat$cell.type)), "Fibroblasts", "Schwann cells", "unassigned", "other"))
PredPlot2 <- DimPlot(BaronSeurat, group.by = "all_gs_prediction", reduction = "tsne") +
scale_color_manual(values = ggcolor, drop =FALSE) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(OriginalPlot, PredPlot2, legend = "top",common.legend = T)
CellID allows signatures extraction from data set both at group level and cell level. These signatures can later be used as reference gene sets. Here we use Baron as a Reference dataset to extract both cell signatures and cell type signatures, and we use the Segerstolpe dataset as a query dataset to prdict the cell type composition.
# extract gene signatures of baron cells
Baron_cell_gs <- GetCellGeneSet(BaronSeurat, dims = 1:50, n.features = 200)
#>
#> calculating distance
#>
#> creating ranking
#>
#> creating geneset
# extract gene signatures of baron cell types
Baron_group_gs <- GetGroupGeneSet(BaronSeurat, dims = 1:50, n.features = 200, group.by = "cell.type")
#>
#> creating ranking
Again, here we run the basic seurat protocole and MCA.
SegerSeurat <- NormalizeData(SegerSeurat)
SegerSeurat <- FindVariableFeatures(SegerSeurat)
SegerSeurat <- ScaleData(SegerSeurat)
SegerSeurat <- RunPCA(SegerSeurat)
SegerSeurat <- RunMCA(SegerSeurat, nmcs = 50)
#> 1.812 sec elapsed
#> 12.386 sec elapsed
#> 0.526 sec elapsed
SegerSeurat <- RunUMAP(SegerSeurat, dims = 1:30)
SegerSeurat <- RunTSNE(SegerSeurat, dims = 1:30)
tSNE <- DimPlot(SegerSeurat, reduction = "tsne", group.by = "cell.type", pt.size = 0.1) + ggtitle("tSNE") + theme(aspect.ratio = 1)
UMAP <- DimPlot(SegerSeurat, reduction = "umap", group.by = "cell.type", pt.size = 0.1) + ggtitle("UMAP") + theme(aspect.ratio = 1)
ggarrange(tSNE, UMAP, common.legend = T, legend = "top")
Here we perform Hypergeometric test on Segerstolpe dataset using Baron cell type signatures. As we did for the pre-established signatures, we choose the signature with the highest enrichment for each cells to assign cell types to the query dataset.
HGT_baron_group_gs <- RunCellHGT(SegerSeurat, pathways = Baron_group_gs, dims = 1:50)
#>
#> calculating distance
#> ranking genes
#> calculating number of success
#> performing hypergeometric test
baron_group_gs_prediction <- rownames(HGT_baron_group_gs)[apply(HGT_baron_group_gs, 2, which.max)]
baron_group_gs_prediction_signif <- ifelse(apply(HGT_baron_group_gs, 2, max)>2, yes = baron_group_gs_prediction, "unassigned")
SegerSeurat$baron_group_gs_prediction <- baron_group_gs_prediction_signif
color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned"))
ggPredictions <- DimPlot(SegerSeurat, group.by = "baron_group_gs_prediction", pt.size = 0.2, reduction = "tsne") + ggtitle("Predicitons") + scale_color_manual(values = ggcolor, drop =FALSE) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggOriginal <- DimPlot(SegerSeurat, group.by = "cell.type", pt.size = 0.2, reduction = "tsne") + ggtitle("Original") + scale_color_manual(values = ggcolor) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(ggOriginal, ggPredictions, legend = "top", common.legend = T)
Alternatively we can also use Baron individual cell signatures to match one Baron data cells to a Segerstolpe data cells and the transfer the label to predict cell types in Segerstolpe.
HGT_baron_cell_gs <- RunCellHGT(SegerSeurat, pathways = Baron_cell_gs, dims = 1:50)
#>
#> calculating distance
#> ranking genes
#> calculating number of success
#> performing hypergeometric test
baron_cell_gs_match <- rownames(HGT_baron_cell_gs)[apply(HGT_baron_cell_gs, 2, which.max)]
baron_cell_gs_prediction <- BaronSeurat$cell.type[baron_cell_gs_match]
baron_cell_gs_prediction_signif <- ifelse(apply(HGT_baron_cell_gs, 2, max)>2, yes = baron_cell_gs_prediction, "unassigned")
SegerSeurat$baron_cell_gs_prediction <- baron_cell_gs_prediction_signif
color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(BaronSeurat$cell.type)), "unassigned"))
ggPredictionsCellMatch <- DimPlot(SegerSeurat, group.by = "baron_cell_gs_prediction", pt.size = 0.2, reduction = "tsne") + ggtitle("Predicitons") + scale_color_manual(values = ggcolor, drop =FALSE) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggOriginal <- DimPlot(SegerSeurat, group.by = "cell.type", pt.size = 0.2, reduction = "tsne") + ggtitle("Original") + scale_color_manual(values = ggcolor) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(ggOriginal, ggPredictionsCellMatch, legend = "top", common.legend = T)
Once MCA is performed, any type of signatures can be tested and that includes functionnal pathways. Here we use the Hallmark and KEGG pathways in HyperGeometric test and integrate the results into the Seurat object to visualise the -log10 pvalue of the enrichment into an UMAP.
KEGG <- fgsea::gmtPathways("https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName=KEGG_2019_Human")
HGT_Hallmark <- RunCellHGT(SegerSeurat, pathways = Hallmark, dims = 1:50)
#>
#> calculating distance
#> ranking genes
#> calculating number of success
#> performing hypergeometric test
HGT_KEGG <- RunCellHGT(SegerSeurat, pathways = KEGG, dims = 1:50)
#>
#> calculating distance
#> ranking genes
#> calculating number of success
#> performing hypergeometric test
SegerSeurat@assays[["Hallmark"]] <- CreateAssayObject(HGT_Hallmark)
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
SegerSeurat@assays[["KEGG"]] <- CreateAssayObject(HGT_KEGG)
ggG2Mcell <- FeaturePlot(SegerSeurat, "G2M-CHECKPOINT", order = T, reduction = "tsne", min.cutoff = 2) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
#> Warning: Could not find G2M-CHECKPOINT in the default search locations,
#> found in Hallmark assay instead
ggPancSecr <- FeaturePlot(SegerSeurat, "Pancreatic secretion", order = T, reduction = "tsne", min.cutoff = 2) +
theme(legend.text = element_text(size =10), aspect.ratio = 1)
#> Warning: Could not find Pancreatic secretion in the default search
#> locations, found in KEGG assay instead
ggarrange(ggG2Mcell, ggPancSecr)