scRNA-Seq | pySCENIC

组织内细胞异质性的基础是细胞转录状态的差异，转录状态的特异性又是由转录因子主导的基因调控网络（GRNs）决定并维持稳定的。因此分析单细胞的GRNs有助于深入挖掘细胞异质性背后的生物学意义，并为疾病的诊断、治疗以及发育分化的研究提供有价值的线索。
然而单细胞转录组数据具有背景噪音高、基因检出率低和表达矩阵稀疏的特点，给传统统计学和生物信息学方法推断高质量的GRNs带来了挑战。Single-cell regulatory network inference and clustering (SCENIC)是一种专为单细胞数据开发的GRNs推断算法，它的创新之处在于引入了转录因子motif序列来验证由统计学方法推断出的基因共表达网络，从而识别高可靠性的由转录因子主导的GRNs。SCENIC相关的文章2017年首先发表于Nature Methods，2020年又将流程整理后发表于Nature Protocols。
SCENIC分析流程
官方介绍的主要分析有四步：
GENIE3/GRNBoost：基于共表达情况鉴定每个TF的潜在靶点；
RcisTarget：基于DNA-motif 分析选择潜在的直接结合靶点；
AUCell：分析每个细胞的regulons活性；
细胞聚类：基于regulons的活性鉴定稳定的细胞状态。
以pbmc3k为例，降维聚类，输出csv矩阵文件。
# Load the SeuratData package, which provides the example dataset.
library(SeuratData)
# InstallData("pbmc3k") # run once to install the pbmc3k dataset
data("pbmc3k")
sce <- pbmc3k.final
library(Seurat)
# Number of cells per identity class.
table(Idents(sce))
p1 <- DimPlot(sce, label = TRUE)
# Write the raw counts to CSV, transposed relative to how the assay stores them.
write.csv(
  t(as.matrix(sce@assays$RNA@counts)),
  file = "pbmc_3k.all.csv"
)
这一步会生成一个70M的pbmc_3k.all.csv文件
接下来需要在Linux操作了。写一个 Python脚本 （ csv2loom.py ）把 csv格式的表达量矩阵 转为 .loom 文件。

这一步是在linux下面操作
import os, sys

import loompy as lp
import numpy as np
import scanpy as sc

# Return values are discarded when this runs as a script; these two calls
# only display something in an interactive session.
os.getcwd()
os.listdir(os.getcwd())

# Read the CSV exported from R (the upstream tutorial read "pbmc_3k.csv" here).
adata = sc.read_csv("pbmc_3k.all.csv")
gene_attrs = {"Gene": np.array(adata.var_names)}
cell_attrs = {"CellID": np.array(adata.obs_names)}
# The matrix is transposed so its rows pair with "Gene" and its columns with "CellID".
lp.create("pbmc_3k.loom", adata.X.transpose(), gene_attrs, cell_attrs)
上面的脚本写了后，就可以 运行 Python脚本 （ csv2loom.py ）把 csv格式的表达量矩阵 转为 .loom 文件：
# Optional: activate the conda environment first (left commented out in the original).
#conda activate pyscenic
# Run the conversion script; it reads pbmc_3k.all.csv and writes pbmc_3k.loom.
python csv2loom.py
这一步会生成一个6.7M的pbmc_3k.loom文件。
2. pyscenic运行
2.1 三大文件下载
但是在这之前需要提前下载好3个重要文件。
文件1：hs_hgnc_tfs.txt，https://github.com/aertslab/pySCENIC/blob/master/resources/hs_hgnc_tfs.txt
文件2: hg19-tss-centered-10kb-7species.mc9nr.feather，https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-tss-centered-10kb-7species.mc9nr.feather
文件3: motifs-v9-nr.hgnc-m0.001-o0.0.tbl，https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl
第1个文件12k，第2个文件1.02G，第三个文件99M，大小一定要正确。
2.2 run_pyscenic.sh脚本编写
# Reference databases are species-specific; the files below are for human.
# NOTE: $dir is not set anywhere in this script. Define it (the directory that
# holds the downloaded reference files) before running.
tfs=$dir/TF/TFs_list/hs_hgnc_tfs.txt
feather=$dir/hg19-tss-centered-10kb-7species.mc9nr.feather
tbl=$dir/TF/TFs_annotation_motif/human_TFs/motifs-v9-nr.hgnc-m0.001-o0.0.tbl
# Make sure the three database files above are complete and intact.
input_loom=pbmc_3k.loom
ls "$tfs" "$feather" "$tbl"
# pyscenic step 1 of 3: grn
# The TF list is a required positional argument after the expression matrix;
# the original command omitted it.
pyscenic grn \
--num_workers 20 \
--output adj.sample.tsv \
--method grnboost2 \
"$input_loom" \
"$tfs"
# pyscenic step 2 of 3: cistarget (ctx)
pyscenic ctx \
adj.sample.tsv "$feather" \
--annotations_fname "$tbl" \
--expression_mtx_fname "$input_loom" \
--mode "dask_multiprocessing" \
--output reg.csv \
--num_workers 20 \
--mask_dropouts
# pyscenic step 3 of 3: AUCell
pyscenic aucell \
"$input_loom" \
reg.csv \
--output out_SCENIC.loom \
--num_workers 20
这一步会得到11M的out_SCENIC.loom文件。

最重要的几个文件如下
11M 3月  15 09:21 out_SCENIC.loom
6.7M 3月  13 20:59 pbmc_3k.loom
14M 3月  15 09:18 reg.csv
70M 3月  13 18:18 pbmc_3k.all.csv
二、初级可视化
1. 依赖包安装
## SCENIC needs several dependency packages; install them first.
BiocManager::install(c("AUCell", "RcisTarget"))
BiocManager::install(c("GENIE3"))
BiocManager::install(c("zoo", "mixtools", "rbokeh"))
BiocManager::install(c("DT", "NMF", "pheatmap", "R2HTML", "Rtsne"))
BiocManager::install(c("doMC", "doRNG"))
# SCopeLoomR and SCENIC are installed from the aertslab GitHub repositories.
devtools::install_github("aertslab/SCopeLoomR", build_vignettes = TRUE)
devtools::install_github("aertslab/SCENIC")
library(SCENIC)
# Print the installed SCENIC version; the value below was recorded in the original run.
packageVersion("SCENIC")
#[1] ‘1.2.4’
2. 提取 out_SCENIC.loom 信息
## Visualization
# Note: the original began with rm(list = ls()), which silently wipes the
# user's whole workspace; it has been removed. Every object this section
# needs is (re)created below.
library(Seurat)
library(SCopeLoomR)
library(AUCell)
library(SCENIC)
library(dplyr)
library(KernSmooth)
library(RColorBrewer)
library(plotly)
library(BiocParallel)
library(grid)
library(ComplexHeatmap)
library(data.table)
library(scRNAseq)
## Extract the information stored in out_SCENIC.loom
# inputDir <- "./outputs/"
# scenicLoomPath <- file.path(inputDir, "out_SCENIC.loom")
library(SCENIC)
loom <- open_loom("out_SCENIC.loom")
# Regulon incidence matrix, read from the "Regulons" attribute of the loom file.
regulons_incidMat <- get_regulons(loom, column.attr.name = "Regulons")
regulons_incidMat[1:4, 1:4]
# Convert the incidence matrix into one gene list per regulon.
regulons <- regulonsToGeneLists(regulons_incidMat)
names(regulons)
# Regulon AUC values, read from the "RegulonsAUC" attribute.
regulonAUC <- get_regulons_AUC(loom, column.attr.name = "RegulonsAUC")
rownames(regulonAUC)
regulonAucThresholds <- get_regulon_thresholds(loom)
# Show the last entries after ordering by the numeric value of the names.
threshold_order <- order(as.numeric(names(regulonAucThresholds)))
tail(regulonAucThresholds[threshold_order])
embeddings <- get_embeddings(loom)
close_loom(loom)
3. 加载SeuratData
然后载入前面的seurat对象，我们这里仅仅是最基础的示例数据，所以直接使用 SeuratData 包即可
# Load the SeuratData package and take the processed pbmc3k object.
library(SeuratData)
data("pbmc3k")
sce <- pbmc3k.final
table(sce$seurat_clusters)
table(Idents(sce))
# Keep the current identities as a metadata column.
sce$celltype <- Idents(sce)
library(ggplot2)
# Marker panel; trailing comments name the cell type each group was chosen for.
genes_to_check <- c(
  "PTPRC", "CD3D", "CD3E", "CD4", "CD8A",
  "CD19", "CD79A", "MS4A1",
  "IGHG1", "MZB1", "SDC1",
  "CD68", "CD163", "CD14",
  "TPSAB1", "TPSB2", # mast cells
  "RCVRN", "FPR1", "ITGAM",
  "C1QA", "C1QB", # macrophages
  "S100A9", "S100A8", "MMP19", # monocytes
  "FCGR3A",
  "LAMP3", "IDO1", "IDO2", # DC3
  "CD1E", "CD1C", # DC2
  "KLRB1", "NCR1", # NK
  "FGF7", "MME", "ACTA2", # fibroblasts
  "DCN", "LUM", "GSN", # mouse PDAC fibroblasts
  "MKI67", "TOP2A",
  "PECAM1", "VWF", # endothelial
  "EPCAM", "KRT19", "PROM1", "ALDH1A1"
)
library(stringr)
genes_to_check <- str_to_upper(genes_to_check)
genes_to_check
# Dot plot of the marker panel, flipped, with angled x-axis labels.
p <- DotPlot(sce, features = unique(genes_to_check), assay = "RNA") +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("check_last_markers.pdf", height = 11, width = 11)
DimPlot(sce, reduction = "umap", label = TRUE)
sce$sub_celltype <- sce$celltype
DimPlot(sce, reduction = "umap", label = TRUE, group.by = "sub_celltype")
ggsave("umap-by-sub_celltype.pdf")
Idents(sce) <- sce$sub_celltype
# Re-cluster at resolution 0.8 and plot the result.
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table(sce@meta.data$RNA_snn_res.0.8)
DimPlot(sce, reduction = "umap", label = TRUE)
ggsave("umap-by-sub_RNA_snn_res.0.8.pdf")
这里的代码仍然是简单的检验一下自己的降维聚类分群是否合理，方便后续合并分析。
4. 四种可视化
现在我们就可以把pyscenic的转录因子分析结果去跟我们的降维聚类分群结合起来进行4种可视化展示。
合并的代码如下所示：
# Reorder the AUC columns to follow the cell order of the Seurat object.
cell_idx <- match(colnames(sce), colnames(regulonAUC))
sub_regulonAUC <- regulonAUC[, cell_idx]
dim(sub_regulonAUC)
# Check that both objects list the same cells in the same order.
identical(colnames(sub_regulonAUC), colnames(sce))
# [1] TRUE
# One-column annotation tables, both keyed by cell name.
cellClusters <- data.frame(
  row.names = colnames(sce),
  seurat_clusters = as.character(sce$seurat_clusters)
)
cellTypes <- data.frame(
  row.names = colnames(sce),
  celltype = sce$sub_celltype
)
head(cellTypes)
head(cellClusters)
sub_regulonAUC[1:4, 1:4]
# Save everything the later RSS and visualization steps need.
save(
  sub_regulonAUC, cellTypes, cellClusters, sce,
  file = "for_rss_and_visual.Rdata"
)
这个时候，我们的pbmc3k数据集里面的两千多个细胞都有表达量矩阵，也有转录因子活性打分信息。
B细胞有两个非常出名的转录因子，TCF4(+) 以及NR2C1(+)，接下来就可以对这两个进行简单的可视化。
首先，我们需要把这两个转录因子活性信息添加到降维聚类分群后的seurat对象里面。
# TCF4(+) is absent from this dataset's results, so PAX5(+) and REL(+) are used instead.
regulonsToPlot <- c("PAX5(+)", "REL(+)")
regulonsToPlot
# Append the AUC of the selected regulons (one column each) to the cell metadata.
selected_auc <- t(assay(sub_regulonAUC[regulonsToPlot, ]))
sce@meta.data <- cbind(sce@meta.data, selected_auc)
Idents(sce) <- sce$sub_celltype
table(Idents(sce))
# Four views of the same two regulons.
DotPlot(sce, features = unique(regulonsToPlot)) + RotatedAxis()
RidgePlot(sce, features = regulonsToPlot, ncol = 1)
VlnPlot(sce, features = regulonsToPlot, pt.size = 0)
FeaturePlot(sce, features = regulonsToPlot)
可以看到，这里替换使用的两个转录因子 PAX5(+) 以及 REL(+)，确实是在B细胞中活性比较独特地高。
看看不同单细胞亚群的转录因子活性平均值
# Split the cells by cluster:
selectedResolution <- "celltype" # annotation column used for grouping
cellsPerGroup <- split(
  rownames(cellTypes),
  cellTypes[, selectedResolution]
)
# Filter the regulon set with onlyNonDuplicatedExtended()
# (original note: remove extended regulons).
sub_regulonAUC <- sub_regulonAUC[onlyNonDuplicatedExtended(rownames(sub_regulonAUC)), ]
dim(sub_regulonAUC)
# [1]  220 2638 # original note: seemingly no difference
# Calculate the average activity of every regulon within each group.
auc_mat <- getAUC(sub_regulonAUC)
regulonActivity_byGroup <- sapply(
  cellsPerGroup,
  function(cells) {
    # drop = FALSE keeps a matrix even when a group holds a single cell;
    # without it the subset collapses to a vector and rowMeans() errors.
    rowMeans(auc_mat[, cells, drop = FALSE])
  }
)
# Scale the activity.
# scale() standardizes columns, so the matrix is transposed before scaling
# and transposed back afterwards.
# Reference: https://www.jianshu.com/p/115d07af3029
regulonActivity_byGroup_Scaled <- t(scale(t(regulonActivity_byGroup),
  center = TRUE, scale = TRUE
))
# Each regulon is now scaled across the groups.
dim(regulonActivity_byGroup_Scaled)
# [1] 220   9
# Drop rows containing NA values.
regulonActivity_byGroup_Scaled <- na.omit(regulonActivity_byGroup_Scaled)
2. 热图查看TF分布
# Heatmap of the scaled per-group regulon activity.
# NOTE(review): no library(pheatmap) call is visible in this file; confirm
# which attached package provides pheatmap() here.
pheatmap(regulonActivity_byGroup_Scaled)
可以看到，确实每个单细胞亚群都是有 自己的特异性的激活的转录因子。
3. rss 查看特异TF
不过，SCENIC包自己提供了一个 calcRSS函数，帮助我们来挑选各个单细胞亚群特异性的转录因子，全称是：Calculates the regulon specificity score
参考文章：The RSS was first used by Suo et al. in: Revealing the Critical Regulators of Cell Identity in the Mouse Cell Atlas. Cell Reports (2018). doi: 10.1016/j.celrep.2018.10.045

运行超级简单。
# Look up each AUC column's annotation in cellTypes, then compute the
# regulon specificity score.
cell_annotation <- cellTypes[colnames(sub_regulonAUC), selectedResolution]
rss <- calcRSS(
  AUC = getAUC(sub_regulonAUC),
  cellAnnotation = cell_annotation
)
# Drop rows containing NA values.
rss <- na.omit(rss)
rssPlot <- plotRSS(rss)
# Interactive version of the RSS plot.
plotly::ggplotly(rssPlot$plot)
# `df` was never created in the original, so `df$sd.1` resolved to the base
# function stats::df and failed. Build it here from `rss`: one row per
# (regulon, group) pair, where
#   sd.1 = the regulon's score in that group,
#   sd.2 = the median of its scores in all other groups.
df <- do.call(
  rbind,
  lapply(seq_len(ncol(rss)), function(i) {
    data.frame(
      path = rownames(rss),
      cluster = colnames(rss)[i],
      sd.1 = rss[, i],
      sd.2 = apply(rss[, -i, drop = FALSE], 1, median)
    )
  })
)
# Difference between a group's score and the median of the other groups.
df$fc <- df$sd.1 - df$sd.2
# Keep the 5 regulons with the largest difference in each group.
top5 <- df %>%
  group_by(cluster) %>%
  top_n(5, fc) %>%
  ungroup()
# Row annotation: the group in which each selected regulon ranked in the top 5.
rowcn <- data.frame(path = top5$cluster)
n <- rss[top5$path, , drop = FALSE]
# rownames(rowcn) = rownames(n)
pheatmap(n,
  annotation_row = rowcn,
  show_rownames = TRUE
)
至此，pySCENIC的转录因子分析及数据可视化教程复现结束。
2023-04-04 00:28:00,057 - distributed.nanny - WARNING - Worker process still alive after 3.9999994277954105 seconds, killing
2023-04-04 00:28:00,076 - distributed.nanny - WARNING - Worker process still alive after 3.9999994277954105 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.9999988555908206 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.9999990463256836 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
2023-04-04 00:28:00,081 - distributed.nanny - WARNING - Worker process still alive after 3.9999988555908206 seconds, killing
2023-04-04 00:28:00,092 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
Traceback (most recent call last):
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/bin/pyscenic", line 8, in <module>
    sys.exit(main())
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 713, in main
    args.func(args)
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 106, in find_adjacencies_command
    network = method(
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/arboreto/algo.py", line 39, in grnboost2
    return diy(expression_data=expression_data, regressor_type='GBM', regressor_kwargs=SGBM_KWARGS,
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/arboreto/algo.py", line 134, in diy
    return client \
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 3215, in compute
    result = self.gather(futures)
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 2174, in gather
    return self.sync(
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 338, in sync
    return sync(
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 405, in sync
    raise exc.with_traceback(tb)
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 378, in f
    result = yield future
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/tornado/gen.py", line 762, in run
    value = future.result()
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 2037, in _gather
    raise exception.with_traceback(traceback)
distributed.scheduler.KilledWorker: ('ndarray-99aebd9077176e83098d0511c36efe41', <WorkerState 'tcp://127.0.0.1:34970', name: 30, status: closed, memory: 0, processing: 23711>)
解决办法：出现上面的 KilledWorker 报错时，把 pyscenic 的 --num_workers 参数设置小一点，然后重新运行。

https://www.jianshu.com/p/0a29ecfaf21e

https://www.jianshu.com/p/05d1b2d0d772

https://www.jianshu.com/p/7ab2d6c8f764

https://mp.weixin.qq.com/s/SIfyGzx4fwXPtQsVvvwwMQ

https://mp.weixin.qq.com/s/py4AWdtaNNMPqLzU4loODQ

https://mp.weixin.qq.com/s/yaYSqqvBnK8OlL0ZQkR94Q
可视化举例
做热图主要分为两种，一种是把细胞分组后求regulons的活性平均值/中位数，另一种是不分组，直接展示所有细胞各自的regulons活性值。
1、细胞分组求regulons的活性平均值/中位数
https://mp.weixin.qq.com/s/zaXpaTQ0IwYGgMO3XIGVaQ
https://www.jianshu.com/p/7180828033a7

https://www.jianshu.com/p/1c9937e7996c

https://blog.csdn.net/qq_42090739/article/details/127745764