一个完美的单细胞亚群随机森林分离器是如何炼成的 - 开发者社区

一个完美的单细胞亚群随机森林分离器是如何炼成的

# Load the pbmc3k example data, merge related clusters into broader cell
# types, and find positive marker genes for every merged identity class.

# devtools::install_github('satijalab/seurat-data')
library(SeuratData) # provides the example Seurat datasets
getOption('timeout')
# Raise the download timeout so that fetching the dataset is not cut short.
options(timeout = 10000)
# InstallData("pbmc3k")
data("pbmc3k")
library(Seurat)
sce <- pbmc3k.final

# Some of the annotated clusters need to be merged into broader populations.
levels(Idents(sce))
## Assigning cell type identity to clusters
# Labels are matched to levels(sce) purely by position, so their order must
# follow the level order printed above.
new.cluster.ids <- c(
  "CD4 T", "CD4 T", "Mono", "B", "CD8 T",
  "Mono", "NK", "DC", "Platelet"
)
# Fail early if the object does not have exactly one level per label.
stopifnot(length(new.cluster.ids) == length(levels(sce)))
names(new.cluster.ids) <- levels(sce)
sce <- RenameIdents(sce, new.cluster.ids)
DimPlot(sce, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

# "multiprocess" is deprecated (and later defunct) in the future package;
# "multisession" is the portable replacement.
future::plan("multisession", workers = 4)
# `thresh.use` was the Seurat v2 argument name. Releases that report
# `avg_log2FC` (used below) expect `logfc.threshold` instead.
sce.markers <- FindAllMarkers(
  object = sce,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)
# Cache the marker table on disk, then reload it so later steps can be
# re-run from the cached file.
save(sce.markers, file = 'sce.markers.FindAllMarkers.pbmc.Rdata')
load(file = 'sce.markers.FindAllMarkers.pbmc.Rdata')
library(dplyr)
# Keep the 10 markers with the largest avg_log2FC per identity class and
# drop the grouping so later verbs do not silently operate per cluster.
top10 <- sce.markers %>%
  group_by(cluster) %>%
  top_n(10, avg_log2FC) %>%
  ungroup()

# Normalise the data, select variable genes, scale the marker genes, and
# build the matrix of scaled marker expression used as classifier input.
sce <- NormalizeData(
  sce,
  normalization.method = "LogNormalize",
  scale.factor = 1e4
)
GetAssay(sce, assay = "RNA")
sce <- FindVariableFeatures(sce, selection.method = "vst", nfeatures = 2000)
# Scale only the top marker genes. An earlier ScaleData(sce) call without
# `features` was removed: its result was overwritten by this call, so it only
# cost run time.
sce <- ScaleData(sce, features = unique(top10$gene))
# Use the accessor rather than reaching into object slots, then transpose so
# that rows are cells and columns are genes.
t_expr <- t(as.matrix(GetAssayData(sce, assay = "RNA", slot = "scale.data")))
dim(t_expr)
t_expr[1:4, 1:4]
# 1. Split the cells into a training set and a held-out test set
library(randomForest)
library(caret)
library(pROC)

# Labels and expression rows must describe the same cells in the same order,
# otherwise the index-based split below would pair cells with wrong labels.
cell_labels <- Idents(sce)
stopifnot(identical(rownames(t_expr), names(cell_labels)))

# Fix the RNG so the split written to disk can be regenerated exactly.
set.seed(20240101)
# createDataPartition() draws 25% of the cells. These indices are the TEST
# set and the remaining 75% are used for training. (The index was previously
# named `inTrain`, which said the opposite of what it held.)
test_idx <- as.vector(
  createDataPartition(y = cell_labels, p = 0.25, list = FALSE)
)
test_expr <- t_expr[test_idx, , drop = FALSE]
train_expr <- t_expr[-test_idx, , drop = FALSE]
test_y <- cell_labels[test_idx]
train_y <- cell_labels[-test_idx]
# Persist the split so training and evaluation can run in separate sessions.
save(test_y, train_y,
     test_expr, train_expr, file = 'input.Rdata')

# 2. Fit a random-forest classifier on the training split
library(randomForest)
library(caret)
library(pROC)
load(file = 'input.Rdata')
train_expr[1:4, 1:4]
table(train_y)
predictor_data <- train_expr
target <- train_y
# Tree construction is random; seed it so the saved model is reproducible.
set.seed(20240101)
# Call randomForest() directly with the matrix/factor interface.
# NOTE(review): the odd tree count presumably avoids tied votes - confirm.
rf_output <- randomForest(
  x = predictor_data,
  y = target,
  importance = TRUE,
  ntree = 10001,
  proximity = TRUE
)
rf_output
save(rf_output, file = 'rf_output.Rdata')


# With the model built, the first check would be to predict back on the 75%
# training split; that step is omitted here.
load(file = 'rf_output.Rdata')
load(file = 'input.Rdata')
# Then score the remaining 25% of cells that were held out as the test set.
test_outputs <- predict(rf_output, newdata = test_expr, type = "prob")
test_expr[1:4, 1:4]
head(test_outputs)
# For every cell pick the class with the highest predicted probability
# (which.max keeps the first column on ties) and encode the result with the
# same factor levels as the true labels.
pred_y <- factor(
  colnames(test_outputs)[apply(test_outputs, MARGIN = 1, FUN = which.max)],
  levels = levels(test_y)
)

一个完美的单细胞亚群随机森林分离器是如何炼成的

一个完美的单细胞亚群随机森林分离器是如何炼成的

首先为各个单细胞亚群寻找特异性高表达的基因：

然后把单细胞表达量矩阵划分为训练集和测试集

接着使用 randomForest 函数在训练集构建模型

最后在测试集上评估模型效果