# General syntax (requires dplyr to be attached for %>%, mutate() and
# case_when()):
# df %>%
#   mutate(new_var = case_when(var1 < 15 ~ 'low',
#                              var2 < 25 ~ 'med',
#                              TRUE ~ 'high'))

# Example data used by the snippets below. The last row is deliberately
# incomplete (NA) so the examples can show how case_when() treats missing
# values.
df <- data.frame(player = c('AJ', 'Bob', 'Chad', 'Dan', 'Eric', 'Frank'),
                 position = c('G', 'F', 'F', 'G', 'C', NA),
                 points = c(12, 15, 19, 22, 32, NA),
                 assists = c(5, 7, 7, 12, 11, NA))
#   player position points assists
# 1     AJ        G     12       5
# 2    Bob        F     15       7
# 3   Chad        F     19       7
# 4    Dan        G     22      12
# 5   Eric        C     32      11
# 6  Frank     <NA>     NA      NA

注:TRUE相当于else条件语句。首先我们准备示例数据,为后面示例使用。

下面代码展示如何创建新的变量 quality,其值由 points 变量产生:

# Derive `quality` from `points` alone. Conditions are tried top to bottom and
# the first one that holds decides the label.
df %>%
  mutate(
    quality = case_when(
      points > 20 ~ "high",
      points > 15 ~ "med",
      TRUE ~ "low"  # catch-all; also receives the row whose points is NA
    )
  )
#   player position points assists quality
# 1     AJ        G     12       5     low
# 2    Bob        F     15       7     low
# 3   Chad        F     19       7     med
# 4    Dan        G     22      12    high
# 5   Eric        C     32      11    high
# 6  Frank     <NA>     NA      NA     low

上面代码比较好理解:points > 20 则 quality 为 high;否则 points > 15 则 quality 为 med;其他情况(包括 NA)为 low。

下面示例展示基于 points 和 assists 两个变量产生 quality 变量:

# Derive `quality` from two columns at once: each condition combines a test on
# `points` with a test on `assists`.
df %>%
  mutate(
    quality = case_when(
      points > 15 & assists > 10 ~ "great",
      points > 15 & assists > 5 ~ "good",
      TRUE ~ "average"  # catch-all; the all-NA row ends up here as well
    )
  )
#   player position points assists quality
# 1     AJ        G     12       5 average
# 2    Bob        F     15       7 average
# 3   Chad        F     19       7    good
# 4    Dan        G     22      12   great
# 5   Eric        C     32      11   great
# 6  Frank     <NA>     NA      NA average
# Same classification, but rows with missing `points` are labelled explicitly.
# The is.na() test comes first so that those rows never reach the catch-all.
df %>%
  mutate(
    quality = case_when(
      is.na(points) ~ "missing",
      points > 15 & assists > 10 ~ "great",
      points > 15 & assists > 5 ~ "good",
      TRUE ~ "average"
    )
  )
#   player position points assists quality
# 1     AJ        G     12       5 average
# 2    Bob        F     15       7 average
# 3   Chad        F     19       7    good
# 4    Dan        G     22      12   great
# 5   Eric        C     32      11   great
# 6  Frank     <NA>     NA      NA missing

后一段代码先用 is.na() 对 NA 记录单独进行判断。另外需要说明的是,case_when 按顺序逐条判断条件,每行取第一个为真的条件对应的值,因此条件语句的顺序很重要,TRUE 语句一定要放在最后。

每个学生有三个学科成绩,但分值范围不同。我们首先要对每个学科成绩标准化,然后再对每个学生成绩求平均值,最后根据设定范围生成等级。

library(dplyr)
options(digits=2)
# Exam roster: ten students with raw scores in three subjects whose value
# ranges differ from one another.
Student <- c(
  "John Davis", "Angela Williams", "Bullwinkle Moose", "David Jones",
  "Janice Markhammer", "Cheryl Cushing", "Reuven Ytzrhak", "Greg Knox",
  "Joel England", "Mary Rayburn"
)
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(
  Student,
  Math,
  Science,
  English,
  stringsAsFactors = FALSE  # keep Student as character rather than factor
)
# Standardize every numeric column with scale() so the three subjects become
# comparable; Student is character and is therefore left untouched.
z <- roster %>% mutate_if(is.numeric, scale)
# Older alternative (note that it drops the Student column):
# roster %>% select(-Student) %>% mutate_each(funs(scale))

# Per-student mean of the standardized subject scores.
z <- z %>% mutate(score = rowMeans(select(., -Student)))

# Grade cutoffs at the 80th/60th/40th/20th percentiles of the mean score.
# `score` exists only as a column of z, so it has to be referenced as z$score;
# a bare `score` here fails with "object 'score' not found".
y <- quantile(z$score, c(0.8, 0.6, 0.4, 0.2))
#              Student   Math Science English score
# 1         John Davis  0.013   1.078   0.587  0.56
# 2    Angela Williams  1.143   1.591   0.037  0.92
# 3   Bullwinkle Moose -1.026  -0.847  -0.697 -0.86
# 4        David Jones -1.649  -0.590  -1.247 -1.16
# 5  Janice Markhammer -0.068  -1.489  -0.330 -0.63
# 6     Cheryl Cushing  0.128  -0.205   1.137  0.35
# 7     Reuven Ytzrhak -1.049  -0.847  -1.247 -1.05
# 8          Greg Knox  1.432   1.078   1.504  1.34
# 9       Joel England  0.832   0.308   0.954  0.70
# 10      Mary Rayburn  0.243  -0.077  -0.697 -0.18
# Turn the mean standardized score into a letter grade using the cutoffs in
# `y` (y[1] is the highest cutoff, y[4] the lowest). The case_when() and
# mutate() calls are now closed; without the closing parentheses the
# statement does not parse.
z <- z %>% mutate(
  level = case_when(
    score >= y[1] ~ 'A',
    score < y[1] & score >= y[2] ~ 'B',
    score < y[2] & score >= y[3] ~ 'C',
    score < y[3] & score >= y[4] ~ 'D',
    score < y[4] ~ 'F'
  )
)
#              Student   Math Science English score level
# 1         John Davis  0.013   1.078   0.587  0.56     B
# 2    Angela Williams  1.143   1.591   0.037  0.92     A
# 3   Bullwinkle Moose -1.026  -0.847  -0.697 -0.86     D
# 4        David Jones -1.649  -0.590  -1.247 -1.16     F
# 5  Janice Markhammer -0.068  -1.489  -0.330 -0.63     D
# 6     Cheryl Cushing  0.128  -0.205   1.137  0.35     C
# 7     Reuven Ytzrhak -1.049  -0.847  -1.247 -1.05     F
# 8          Greg Knox  1.432   1.078   1.504  1.34     A
# 9       Joel England  0.832   0.308   0.954  0.70     B
# 10      Mary Rayburn  0.243  -0.077  -0.697 -0.18     C

代码连接起来,一气呵成:

# The same steps as one pipeline: standardize, average per student, grade.
# The grade cutoffs are read from the existing vector `y`, which is not
# computed inside this pipeline. The case_when() and mutate() calls are now
# closed so the expression parses.
roster %>% mutate_if(is.numeric, scale) %>%
  mutate(score = rowMeans(select(., -Student))) %>%
  mutate(
    level = case_when(
      score >= y[1] ~ 'A',
      score < y[1] & score >= y[2] ~ 'B',
      score < y[2] & score >= y[3] ~ 'C',
      score < y[3] & score >= y[4] ~ 'D',
      score < y[4] ~ 'F'
    )
  )
dplyr提供的case_when()函数简化多个if_else语句,如果不匹配返回NA。结合mutate函数可基于已有变量创建新的变量。本文通过几个示例进行对比学习。语法:library(dplyr)# df %>%#   mutate(new_var = case_when(var1 < 15 ~ 'low',#                              var2 < 25 ~ 'med',#                              .
1、case_when函数,有一些SQL基础(CASE WHEN)的都猜得到这个函数的功能
可实现多条件判断并可以添加标签的函数,这在我们对数据进行分类整理中十分的实用,这个函数中的参数可以这样分:一部分是判断条件,另一部分是所要做的标签
iris%>%select(Sepal.Lengt...
				
R包dplyr可用于处理R内部或者外部的结构化数据,相较于plyr包,dplyr专注接受dataframe对象, 大幅提高了速度, 并且提供了更稳健的数据库接口。同时,dplyr包可用于操作Spark的dataframe。 1. 数据集类型转换     tbl_df()可用于将过长过大的数据集转换为显示更友好的 tbl_df 类型。使用dplyr包处理数据前,建议先将数据集转换为tbl对象。
1. CASE WHEN 表达式有两种形式 --简单Case函数 CASE sex WHEN '1' THEN '男' WHEN '2' THEN '女' ELSE '其他' END --Case搜索函数 CASE WHEN sex = '1' THEN '男' WHEN sex = '2' THEN '女' ELSE '其他' END 2. CASE WHEN...