「R」逻辑回归、决策树、随机森林 - 开发者社区

「R」逻辑回归、决策树、随机森林

# Packages this tutorial depends on. rpart, rpart.plot, party and randomForest
# are loaded with library() further down; e1071 is listed here but is not used
# in the part of the script visible in this section.
pkgs <- c("rpart", "rpart.plot", "party",
          "randomForest", "e1071")
# Install only the packages that are not already available, so that re-running
# the script does not download and reinstall everything each time.
# Note: requireNamespace() loads the namespace of every package it finds.
missing.pkgs <- pkgs[!vapply(pkgs, requireNamespace, logical(1),
                             quietly = TRUE)]
if (length(missing.pkgs) > 0) {
  install.packages(missing.pkgs, dependencies = TRUE)
}

## Data preparation
# Read the Wisconsin breast cancer data straight from the UCI repository.
# The file has no header row; "?" entries are read in as NA.
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds  <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste0(loc, ds)
breast <- read.table(url, header = FALSE, sep = ",", na.strings = "?")

# Column names (the spelling "maginalAdhesion" is kept as-is on purpose, since
# it is the name the rest of the script and its printed output use).
names(breast) <- c(
  "ID",
  "clumpThickness", "sizeUniformity", "shapeUniformity",
  "maginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
  "blandChromatin", "normalNucleoli", "mitosis",
  "class"
)

# Drop the first column (ID) and turn the numeric class codes 2 / 4 into a
# labelled factor.
df <- breast[-1]
df$class <- factor(df$class,
                   levels = c(2, 4),
                   labels = c("benign", "malignant"))

# Reproducible split: about 70% of the rows for training, the rest held out
# for validation.
set.seed(1234)
train <- sample(nrow(df), 0.7 * nrow(df))
df.train    <- df[train, ]
df.validate <- df[-train, ]

# Class balance in each partition.
table(df.train$class)
table(df.validate$class)

## Logistic regression
# Fit a binomial GLM of class on every other column of the training set.
fit.logit <- glm(class ~ ., family = binomial(), data = df.train)
# Inspect the fitted model (coefficient table and significance codes).
summary(fit.logit)
Call:
glm(formula = class ~ ., family = binomial(), data = df.train)
Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.75813  -0.10602  -0.05679   0.01237   2.64317  
Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -10.42758    1.47602  -7.065 1.61e-12 ***
clumpThickness             0.52434    0.15950   3.287  0.00101 ** 
sizeUniformity            -0.04805    0.25706  -0.187  0.85171    
shapeUniformity            0.42309    0.26775   1.580  0.11407    
maginalAdhesion            0.29245    0.14690   1.991  0.04650 *  
singleEpithelialCellSize   0.11053    0.17980   0.615  0.53871    
bareNuclei                 0.33570    0.10715   3.133  0.00173 ** 
blandChromatin             0.42353    0.20673   2.049  0.04049 *  
normalNucleoli             0.28888    0.13995   2.064  0.03900 *  
mitosis                    0.69057    0.39829   1.734  0.08295 .  
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Predicted probabilities for the held-out validation rows. For a factor
# response, the binomial family treats every level other than the first as
# "success", so these are probabilities of "malignant".
# (The original lines carried full-width U+3000 spaces outside of comments;
# they are replaced with plain ASCII here because R only accepts them as
# whitespace in some locales.)
prob <- predict(fit.logit, df.validate, type = "response")
# Classify with a 0.5 cut-off: FALSE -> "benign", TRUE -> "malignant".
logit.pred <- factor(prob > .5, levels = c(FALSE, TRUE),
                     labels = c("benign", "malignant"))
# Assess accuracy by cross-tabulating actual against predicted class.
# table() leaves NA predictions out of the counts by default.
logit.perf <- table(df.validate$class, logit.pred,
                    dnn = c("Actual", "Predicted"))
logit.perf
           Predicted
Actual      benign malignant
  benign       118         2
  malignant      4        76

library(rpart)

# Grow a classification tree on the training set, splitting on information
# gain. The seed is fixed first, presumably so the cross-validated columns of
# the complexity table are reproducible.
set.seed(1234)
dtree <- rpart(
  class ~ .,
  data = df.train,
  method = "class",
  parms = list(split = "information")
)
# Complexity-parameter table used to choose the pruning level.
dtree$cptable
        CP nsplit rel error xerror       xstd
1 0.800000      0   1.00000 1.0000 0.06484605
2 0.046875      1   0.20000 0.2750 0.03954867
3 0.012500      3   0.10625 0.1500 0.02985779
4 0.010000      4   0.09375 0.1625 0.03101007
# Plot the cross-validation results against the complexity parameter.
plotcp(dtree)

# Prune the tree back at cp = 0.0125.
dtree.pruned <- prune(dtree, cp = 0.0125)

# Draw the pruned tree.
library(rpart.plot)
prp(dtree.pruned,
    type = 2, extra = 104,
    fallen.leaves = TRUE,
    main = "Decision Tree")

# Classify the held-out validation rows and cross-tabulate actual against
# predicted class.
dtree.pred <- predict(dtree.pruned, newdata = df.validate, type = "class")
dtree.perf <- table(df.validate$class, dtree.pred,
                    dnn = c("Actual", "Predicted"))
dtree.perf
           Predicted
Actual      benign malignant
  benign       122         7
  malignant      2        79

library(party)

# Conditional inference tree fitted on the training set.
fit.ctree <- ctree(class ~ ., data = df.train)
plot(fit.ctree, main = "Conditional Inference Tree")

# Predicted class for each validation row, then actual vs. predicted counts.
ctree.pred <- predict(fit.ctree, df.validate, type = "response")
ctree.perf <- table(df.validate$class, ctree.pred,
                    dnn = c("Actual", "Predicted"))
ctree.perf

library(randomForest)

# Grow the forest on the training set. Missing values are handled by
# na.roughfix (see ?na.roughfix), and importance = TRUE asks for variable
# importance measures to be computed.
set.seed(1234)
fit.forest <- randomForest(
  class ~ .,
  data = df.train,
  na.action = na.roughfix,
  importance = TRUE
)
fit.forest

# Variable importance; type = 2 reports the MeanDecreaseGini measure.
importance(fit.forest, type = 2)

# Classify the held-out validation rows and cross-tabulate actual against
# predicted class.
forest.pred <- predict(fit.forest, newdata = df.validate)
forest.perf <- table(df.validate$class, forest.pred,
                     dnn = c("Actual", "Predicted"))
forest.perf

> fit.forest
Call:
 randomForest(formula = class ~ ., data = df.train, importance = TRUE,      na.action = na.roughfix) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3
        OOB estimate of  error rate: 3.68%
Confusion matrix:
          benign malignant class.error
benign       319        10  0.03039514
malignant      8       152  0.05000000
> importance(fit.forest, type=2)
                         MeanDecreaseGini
clumpThickness                  12.504484
sizeUniformity                  54.770143
shapeUniformity                 48.662325
maginalAdhesion                  5.969580
singleEpithelialCellSize        14.297239
bareNuclei                      34.017599
blandChromatin                  16.243253
normalNucleoli                  26.337646
mitosis                          1.814502

「R」逻辑回归、决策树、随机森林

「R」逻辑回归、决策树、随机森林

数据准备

逻辑回归

决策树

经典决策树

条件推断树

随机森林