scRNA-Seq | pySCENIC

组织内细胞异质性的基础是细胞转录状态的差异,转录状态的特异性又是由转录因子主导的基因调控网络(GRNs)决定并维持稳定的。因此分析单细胞的GRNs有助于深入挖掘细胞异质性背后的生物学意义,并为疾病的诊断、治疗以及发育分化的研究提供有价值的线索。

然而单细胞转录组数据具有背景噪音高、基因检出率低和表达矩阵稀疏的特点,给传统统计学和生物信息学方法推断高质量的GRNs带来了挑战。Single-cell regulatory network inference and clustering (SCENIC)是一种专为单细胞数据开发的GRNs推断算法,它的创新之处在于引入转录因子motif序列信息,对统计学方法推断出的基因共表达网络进行验证,从而识别高可靠性的由转录因子主导的GRNs。SCENIC相关的文章2017年首先发表于Nature Methods,2020年又将流程整理后发表于Nature Protocols。

SCENIC分析流程
官方介绍的主要分析有四步:

  • GENIE3/GRNBoost:基于共表达情况鉴定每个TF的潜在靶点;
  • RcisTarget:基于DNA-motif 分析选择潜在的直接结合靶点;
  • AUCell:分析每个细胞的regulons活性;
  • 细胞聚类:基于regulons的活性鉴定稳定的细胞状态。
1. 数据准备:以pbmc3k为例,先完成降维聚类,再输出csv格式的表达矩阵文件。

    # Attach Seurat and the SeuratData dataset catalogue up front.
    library(Seurat)
    library(SeuratData)
    # InstallData("pbmc3k")  # one-off download of the pbmc3k dataset
    data("pbmc3k")
    # Use the `pbmc3k.final` object made available by the dataset package.
    sce <- pbmc3k.final
    table(Idents(sce))
    p1 <- DimPlot(sce, label = TRUE)
    # Transpose the RNA counts before export; the loom conversion script that
    # follows reads the CSV rows as cells and the columns as genes.
    write.csv(
      t(as.matrix(sce@assays$RNA@counts)),
      file = "pbmc_3k.all.csv"
    )
    

    这一步会生成一个70M的pbmc_3k.all.csv文件

    接下来需要在Linux操作了。写一个 Python脚本 ( csv2loom.py )把 csv格式的表达量矩阵 转为 .loom 文件。
    这一步是在linux下面操作

    # csv2loom.py: convert the exported CSV expression matrix into a .loom file.
    import os, sys
    import loompy as lp
    import numpy as np
    import scanpy as sc

    # Leftover interactive checks of the working directory; results are discarded.
    os.getcwd()
    os.listdir(os.getcwd())

    # Read the CSV exported from R (the code this was adapted from read
    # "pbmc_3k.csv" instead).
    adata = sc.read_csv("pbmc_3k.all.csv")
    gene_attrs = {"Gene": np.array(adata.var_names)}
    cell_attrs = {"CellID": np.array(adata.obs_names)}
    # Transpose so the matrix rows line up with the "Gene" row attributes.
    lp.create("pbmc_3k.loom", adata.X.transpose(), gene_attrs, cell_attrs)
    

    上面的脚本写了后,就可以 运行 Python脚本 ( csv2loom.py )把 csv格式的表达量矩阵 转为 .loom 文件:

    # Run the CSV -> loom converter. It imports loompy and scanpy, so the active
    # Python environment must provide them (presumably the conda env named
    # "pyscenic" below -- confirm); uncomment the next line if it is not active.
    #conda activate pyscenic
    python csv2loom.py
    

    这一步会生成一个6.7M的pbmc_3k.loom文件。

    2. pyscenic运行

    2.1 三大文件下载

    但是在这之前需要提前下载好3个重要文件。

    文件1:hs_hgnc_tfs.txt,https://github.com/aertslab/pySCENIC/blob/master/resources/hs_hgnc_tfs.txt

    文件2: hg19-tss-centered-10kb-7species.mc9nr.feather,https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-tss-centered-10kb-7species.mc9nr.feather

    文件3: motifs-v9-nr.hgnc-m0.001-o0.0.tbl,https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl

    第1个文件12k,第2个文件1.02G,第三个文件99M,大小一定要正确。

    2.2 run_pyscenic.sh脚本编写
    # run_pyscenic.sh: GRN inference -> motif pruning (ctx) -> AUCell scoring.
    # The databases are species-specific; the three files below are the human ones.
    # `dir` is the folder holding the downloaded resources. It was never set in
    # the original script, so fall back to the current directory unless the
    # caller exports it.
    dir="${dir:-.}"
    tfs="$dir/TF/TFs_list/hs_hgnc_tfs.txt"
    feather="$dir/hg19-tss-centered-10kb-7species.mc9nr.feather"
    tbl="$dir/TF/TFs_annotation_motif/human_TFs/motifs-v9-nr.hgnc-m0.001-o0.0.tbl"
    # Worker count shared by all three steps; lower it (e.g. num_workers=4) if
    # dask workers get killed during the grn step.
    num_workers="${num_workers:-20}"
    input_loom=pbmc_3k.loom
    # Make sure the database files above are present and complete before starting.
    ls "$tfs" "$feather" "$tbl"

    # Step 1/3 -- grn: co-expression modules between TFs and candidate targets.
    # The TF list is the second positional argument; the original omitted it
    # because the trailing backslash continued into the following comment line.
    pyscenic grn \
      --num_workers "$num_workers" \
      --output adj.sample.tsv \
      --method grnboost2 \
      "$input_loom" \
      "$tfs"

    # Step 2/3 -- ctx (cisTarget): prune the modules using motif enrichment.
    pyscenic ctx \
      adj.sample.tsv "$feather" \
      --annotations_fname "$tbl" \
      --expression_mtx_fname "$input_loom" \
      --mode "dask_multiprocessing" \
      --output reg.csv \
      --num_workers "$num_workers" \
      --mask_dropouts

    # Step 3/3 -- AUCell: score regulon activity in every cell.
    pyscenic aucell \
      "$input_loom" \
      reg.csv \
      --output out_SCENIC.loom \
      --num_workers "$num_workers"
    

    这一步会得到11M的out_SCENIC.loom文件。
    最重要的四个文件如下

    11M 3月  15 09:21 out_SCENIC.loom
    6.7M 3月  13 20:59 pbmc_3k.loom
    14M 3月  15 09:18 reg.csv
    70M 3月  13 18:18 pbmc_3k.all.csv
    

    二、初级可视化

    1. 依赖包安装

    ## Install the packages SCENIC depends on, skipping any already installed.
    # BiocManager and devtools are called below, so make sure they exist first.
    for (bootstrap_pkg in c("BiocManager", "devtools")) {
      if (!requireNamespace(bootstrap_pkg, quietly = TRUE)) {
        install.packages(bootstrap_pkg)
      }
    }
    # Install only the packages from `pkgs` that cannot be loaded yet;
    # invisibly returns the names that were missing.
    install_if_missing <- function(pkgs) {
      is_installed <- vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
      missing_pkgs <- pkgs[!is_installed]
      if (length(missing_pkgs) > 0) {
        BiocManager::install(missing_pkgs)
      }
      invisible(missing_pkgs)
    }
    install_if_missing(c("AUCell", "RcisTarget"))
    install_if_missing(c("GENIE3"))
    install_if_missing(c("zoo", "mixtools", "rbokeh"))
    install_if_missing(c("DT", "NMF", "pheatmap", "R2HTML", "Rtsne"))
    install_if_missing(c("doMC", "doRNG"))
    # The two aertslab packages are installed from GitHub.
    if (!requireNamespace("SCopeLoomR", quietly = TRUE)) {
      devtools::install_github("aertslab/SCopeLoomR", build_vignettes = TRUE)
    }
    if (!requireNamespace("SCENIC", quietly = TRUE)) {
      devtools::install_github("aertslab/SCENIC")
    }
    library(SCENIC)
    packageVersion("SCENIC")
    # [1] ‘1.2.4’
    

    2. 提取 out_SCENIC.loom 信息

    ## Visualisation: attach packages and read the pySCENIC results.
    # The original started with rm(list = ls()), which silently wipes the
    # user's whole workspace; it is dropped here. Restart R for a clean session.
    library(Seurat)
    library(SCopeLoomR)
    library(AUCell)
    library(SCENIC)
    library(dplyr)
    library(KernSmooth)
    library(RColorBrewer)
    library(plotly)
    library(BiocParallel)
    library(grid)
    library(ComplexHeatmap)
    library(data.table)
    library(scRNAseq)
    ## Pull the SCENIC results out of out_SCENIC.loom
    # inputDir <- './outputs/'
    # scenicLoomPath <- file.path(inputDir, 'out_SCENIC.loom')
    loom <- open_loom("out_SCENIC.loom")
    # Regulon membership stored under the "Regulons" attribute, then converted
    # into one gene list per regulon.
    regulons_incidMat <- get_regulons(loom, column.attr.name = "Regulons")
    regulons_incidMat[1:4, 1:4]
    regulons <- regulonsToGeneLists(regulons_incidMat)
    names(regulons)
    # Per-cell regulon activity (AUC) stored under the "RegulonsAUC" attribute.
    regulonAUC <- get_regulons_AUC(loom, column.attr.name = "RegulonsAUC")
    rownames(regulonAUC)
    regulonAucThresholds <- get_regulon_thresholds(loom)
    # Show the entries with the largest names; this assumes the names of the
    # returned vector are the numeric thresholds -- TODO confirm.
    tail(regulonAucThresholds[order(as.numeric(names(regulonAucThresholds)))])
    embeddings <- get_embeddings(loom)
    # Everything needed is in memory now, so release the file handle.
    close_loom(loom)
    

    3. 加载SeuratData

    然后载入前面的seurat对象,我们这里仅仅是最基础的示例数据,所以直接使用 SeuratData 包即可

    # Packages used in this section.
    library(SeuratData)
    library(ggplot2)
    library(stringr)

    # Reload the annotated example object and keep its identities as `celltype`.
    data("pbmc3k")
    sce <- pbmc3k.final
    table(sce$seurat_clusters)
    table(Idents(sce))
    sce$celltype <- Idents(sce)

    # Marker panel for a quick sanity check of the annotation.
    genes_to_check <- c(
      'PTPRC', 'CD3D', 'CD3E', 'CD4', 'CD8A',
      'CD19', 'CD79A', 'MS4A1',
      'IGHG1', 'MZB1', 'SDC1',
      'CD68', 'CD163', 'CD14',
      'TPSAB1', 'TPSB2',            # mast cells
      'RCVRN', 'FPR1', 'ITGAM',
      'C1QA', 'C1QB',               # mac
      'S100A9', 'S100A8', 'MMP19',  # monocyte
      'FCGR3A',
      'LAMP3', 'IDO1', 'IDO2',      # DC3
      'CD1E', 'CD1C',               # DC2
      'KLRB1', 'NCR1',              # NK
      'FGF7', 'MME', 'ACTA2',       # fibo
      'DCN', 'LUM', 'GSN',          # mouse PDAC fibo
      'MKI67', 'TOP2A',
      'PECAM1', 'VWF',              # endo
      'EPCAM', 'KRT19', 'PROM1', 'ALDH1A1'
    )
    genes_to_check <- str_to_upper(genes_to_check)
    genes_to_check

    # Dot plot of the panel; ggsave() is called without `plot`, so it writes
    # the most recently created ggplot.
    marker_features <- unique(genes_to_check)
    p <- DotPlot(sce, features = marker_features, assay = 'RNA') +
      coord_flip() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    ggsave('check_last_markers.pdf', height = 11, width = 11)

    # UMAP by current identities, then by the copied `sub_celltype` column.
    DimPlot(sce, reduction = "umap", label = TRUE)
    sce$sub_celltype <- sce$celltype
    DimPlot(sce, reduction = "umap", label = TRUE, group.by = "sub_celltype")
    ggsave('umap-by-sub_celltype.pdf')

    # Re-cluster at resolution 0.8 and plot the resulting clusters.
    Idents(sce) <- sce$sub_celltype
    sce <- FindNeighbors(sce, dims = 1:15)
    sce <- FindClusters(sce, resolution = 0.8)
    table(sce@meta.data$RNA_snn_res.0.8)
    DimPlot(sce, reduction = "umap", label = TRUE)
    ggsave('umap-by-sub_RNA_snn_res.0.8.pdf')
    

    这里的代码仍然是简单的检验一下自己的降维聚类分群是否合理,方便后续合并分析。

    4. 四种可视化

    现在我们就可以把pyscenic的转录因子分析结果去跟我们的降维聚类分群结合起来进行4种可视化展示。

    合并的代码如下所示:

    # Reorder the AUC columns so they follow the cell order of the Seurat object.
    seurat_cells <- colnames(sce)
    auc_col_index <- match(seurat_cells, colnames(regulonAUC))
    sub_regulonAUC <- regulonAUC[, auc_col_index]
    dim(sub_regulonAUC)
    # Check that both objects now list the same cells in the same order.
    identical(colnames(sub_regulonAUC), colnames(sce))
    # [1] TRUE
    # Per-cell annotation tables, keyed by cell name.
    cellClusters <- data.frame(
      row.names = colnames(sce),
      seurat_clusters = as.character(sce$seurat_clusters)
    )
    cellTypes <- data.frame(
      row.names = colnames(sce),
      celltype = sce$sub_celltype
    )
    head(cellTypes)
    head(cellClusters)
    sub_regulonAUC[1:4, 1:4]
    # Persist everything the later RSS and plotting steps need.
    save(
      sub_regulonAUC, cellTypes, cellClusters, sce,
      file = 'for_rss_and_visual.Rdata'
    )
    

    这个时候,我们的pbmc3k数据集里面的两千多个细胞都有表达量矩阵,也有转录因子活性打分信息。

    B细胞有两个非常出名的转录因子,TCF4(+) 以及NR2C1(+),接下来就可以对这两个进行简单的可视化。

    首先,我们需要把这两个转录因子的活性信息添加到降维聚类分群后的seurat对象里面。

    # TCF4(+) is not among the regulons in this dataset, so PAX5(+) and REL(+)
    # are plotted instead.
    regulonsToPlot <- c('PAX5(+)', 'REL(+)')
    regulonsToPlot
    # Append the per-cell AUC of the chosen regulons as extra metadata columns.
    auc_of_selected <- t(assay(sub_regulonAUC[regulonsToPlot, ]))
    sce@meta.data <- cbind(sce@meta.data, auc_of_selected)
    Idents(sce) <- sce$sub_celltype
    table(Idents(sce))

    # Four views of the same regulon activity values.
    DotPlot(sce, features = unique(regulonsToPlot)) + RotatedAxis()
    RidgePlot(sce, features = regulonsToPlot, ncol = 1)
    VlnPlot(sce, features = regulonsToPlot, pt.size = 0)
    FeaturePlot(sce, features = regulonsToPlot)
    

    可以看到,B细胞相关的转录因子(原教程用的是TCF4(+)和NR2C1(+),本例替换为PAX5(+)和REL(+))的活性确实在B细胞中特异性地偏高。

    看看不同单细胞亚群的转录因子活性平均值

    # Group the cell names by the chosen annotation column.
    selectedResolution <- "celltype"
    group_labels <- cellTypes[, selectedResolution]
    cellsPerGroup <- split(rownames(cellTypes), group_labels)
    # Filter the regulon names with onlyNonDuplicatedExtended() (going by its
    # name, this drops "extended" regulons that duplicate a regular one).
    regulons_to_keep <- onlyNonDuplicatedExtended(rownames(sub_regulonAUC))
    sub_regulonAUC <- sub_regulonAUC[regulons_to_keep, ]
    dim(sub_regulonAUC)
    # [1]  220 2638  # apparently no difference here
    # Mean AUC of every regulon within each group (regulons x groups).
    auc_values <- getAUC(sub_regulonAUC)
    mean_auc_in <- function(cells) rowMeans(auc_values[, cells])
    regulonActivity_byGroup <- sapply(cellsPerGroup, mean_auc_in)
    # scale() standardises columns, so transpose to put regulons in columns,
    # scale each regulon across the groups, then transpose back.
    # Reference: https://www.jianshu.com/p/115d07af3029
    scaled_by_regulon <- scale(
      t(regulonActivity_byGroup),
      center = TRUE, scale = TRUE
    )
    regulonActivity_byGroup_Scaled <- t(scaled_by_regulon)
    dim(regulonActivity_byGroup_Scaled)
    # [1] 220   9
    # Drop regulons whose scaled values contain NA.
    regulonActivity_byGroup_Scaled <- na.omit(regulonActivity_byGroup_Scaled)
    

    2. 热图查看TF分布

    # Heatmap of the scaled mean regulon activity (rows: regulons, columns: groups).
    # NOTE(review): library(pheatmap) is never called in the visible code, so
    # pheatmap() presumably resolves to the one exported by the attached
    # ComplexHeatmap package -- confirm this is the intended implementation.
    pheatmap(regulonActivity_byGroup_Scaled)
    

    可以看到,确实每个单细胞亚群都是有 自己的特异性的激活的转录因子。

    3. rss 查看特异TF

    不过,SCENIC包自己提供了一个 calcRSS函数,帮助我们来挑选各个单细胞亚群特异性的转录因子,全称是:Calculates the regulon specificity score

    参考文章:The RSS was first used by Suo et al. in: Revealing the Critical Regulators of Cell Identity in the Mouse Cell Atlas. Cell Reports (2018). doi: 10.1016/j.celrep.2018.10.045
    运行超级简单。

    # Regulon specificity score (RSS): one row per regulon, one column per group.
    rss <- calcRSS(
      AUC = getAUC(sub_regulonAUC),
      cellAnnotation = cellTypes[colnames(sub_regulonAUC), selectedResolution]
    )
    rss <- na.omit(rss)
    rssPlot <- plotRSS(rss)
    plotly::ggplotly(rssPlot$plot)

    # Long table with, for every regulon/group pair, the score in that group
    # (sd.1) and the median score over all other groups (sd.2).
    # NOTE(review): the original used `df` without ever defining it, which
    # resolves to stats::df and fails. This reconstruction ranks regulons by
    # "own group minus median of the rest" -- confirm it is the intended ranking.
    df <- do.call(rbind, lapply(seq_len(ncol(rss)), function(i) {
      data.frame(
        path = rownames(rss),
        cluster = colnames(rss)[i],
        sd.1 = rss[, i],
        sd.2 = apply(rss[, -i, drop = FALSE], 1, median),
        stringsAsFactors = FALSE
      )
    }))
    df$fc <- df$sd.1 - df$sd.2
    # Keep the five most group-specific regulons per group (ties may add rows).
    top5 <- df %>%
      group_by(cluster) %>%
      top_n(5, fc) %>%
      ungroup()
    n <- rss[top5$path, , drop = FALSE]
    # A regulon can be in the top five of several groups; make the row names
    # unique so the annotation table can be matched to the heatmap rows.
    rownames(n) <- make.unique(rownames(n))
    rowcn <- data.frame(path = top5$cluster, row.names = rownames(n))
    pheatmap(n,
             annotation_row = rowcn,
             show_rownames = TRUE)
    

    至此, pySCENIC的转录因子分析及数据可视化教程复现结束,

    2023-04-04 00:28:00,057 - distributed.nanny - WARNING - Worker process still alive after 3.9999994277954105 seconds, killing
    2023-04-04 00:28:00,076 - distributed.nanny - WARNING - Worker process still alive after 3.9999994277954105 seconds, killing
    2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.9999988555908206 seconds, killing
    2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.9999990463256836 seconds, killing
    2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
    2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
    2023-04-04 00:28:00,081 - distributed.nanny - WARNING - Worker process still alive after 3.9999988555908206 seconds, killing
    2023-04-04 00:28:00,092 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
    Traceback (most recent call last):
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/bin/pyscenic", line 8, in <module>
        sys.exit(main())
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 713, in main
        args.func(args)
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 106, in find_adjacencies_command
        network = method(
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/arboreto/algo.py", line 39, in grnboost2
        return diy(expression_data=expression_data, regressor_type='GBM', regressor_kwargs=SGBM_KWARGS,
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/arboreto/algo.py", line 134, in diy
        return client \
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 3215, in compute
        result = self.gather(futures)
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 2174, in gather
        return self.sync(
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 338, in sync
        return sync(
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 405, in sync
        raise exc.with_traceback(tb)
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 378, in f
        result = yield future
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/tornado/gen.py", line 762, in run
        value = future.result()
      File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 2037, in _gather
        raise exception.with_traceback(traceback)
    distributed.scheduler.KilledWorker: ('ndarray-99aebd9077176e83098d0511c36efe41', <WorkerState 'tcp://127.0.0.1:34970', name: 30, status: closed, memory: 0, processing: 23711>)
    

    上面是运行 pyscenic grn 时遇到的 KilledWorker 报错,解决办法:把 --num_workers(并行进程数)设置小一点。
    https://www.jianshu.com/p/0a29ecfaf21e
    https://www.jianshu.com/p/05d1b2d0d772
    https://www.jianshu.com/p/7ab2d6c8f764
    https://mp.weixin.qq.com/s/SIfyGzx4fwXPtQsVvvwwMQ
    https://mp.weixin.qq.com/s/py4AWdtaNNMPqLzU4loODQ
    https://mp.weixin.qq.com/s/yaYSqqvBnK8OlL0ZQkR94Q

    可视化举例

    做热图主要分为两种,一种是把细胞分组后求每组regulons活性的平均值/中位数,另一种是不做平均,直接展示所有细胞各自的regulons活性。

    1、细胞分组求regulons的活性平均值/中位数

    https://mp.weixin.qq.com/s/zaXpaTQ0IwYGgMO3XIGVaQ

    https://www.jianshu.com/p/7180828033a7
    https://www.jianshu.com/p/1c9937e7996c
    https://blog.csdn.net/qq_42090739/article/details/127745764