R语言基础入门(4) mutate函数创建新列

今天继续介绍 dplyr 包中的重要函数 mutate ,其基本功能为创建新列; mutate 中的选项几乎是无穷无尽的,可以通过各种函数之间的组合来对数据集做任意的处理,下面通过具体的案例来进行演示

这次我们使用 ggplot2 包自带的数据集 msleep(加载 tidyverse 后即可直接使用,它并不是 R 的内置数据集),其中记录了哺乳动物的睡眠时间等信息。让我们首先加载包并查看数据:

library(tidyverse)
msleep
   name    genus  vore  order conservation sleep_total sleep_rem sleep_cycle
   <chr>   <chr>  <chr> <chr> <chr>              <dbl>     <dbl>       <dbl>
 1 Cheetah Acino~ carni Carn~ lc                  12.1      NA        NA    
 2 Owl mo~ Aotus  omni  Prim~ NA                  17         1.8      NA    
 3 Mounta~ Aplod~ herbi Rode~ nt                  14.4       2.4      NA    
 4 Greate~ Blari~ omni  Sori~ lc                  14.9       2.3       0.133

mutate 基础操作

最简单的操作就是根据其他列中的值进行计算。在示例代码中,我们新建一列,将睡眠时间从以小时为单位转换为以分钟为单位

msleep %>%
  select(name,sleep_total) %>%
  mutate(sleep_total_min = sleep_total * 60)
   name                       sleep_total sleep_total_min
   <chr>                            <dbl>           <dbl>
 1 Cheetah                           12.1             726
 2 Owl monkey                        17              1020
 3 Mountain beaver                   14.4             864

下列代码创建了两个新列:一列显示了睡眠时间与平均睡眠时间的差异,另一列显示了与睡眠时间最少的动物之间的差异;round( ) 对数据进行四舍五入,此处第二个参数 1 表示将平均值保留一位小数

msleep %>%
  select(name, sleep_total) %>%
  mutate(AVG = sleep_total - round(mean(sleep_total), 1),
         MIN = sleep_total - min(sleep_total))
# A tibble: 83 x 4
   name                       sleep_total    AVG   MIN
   <chr>                            <dbl>  <dbl> <dbl>
 1 Cheetah                           12.1  1.7    10.2
 2 Owl monkey                        17    6.6    15.1
 3 Mountain beaver                   14.4  4      12.5

选择特定列按行求均值,rowwise( ) 说明按行进行操作;注意:只要参与计算的列中有一个为 NA,该行的结果就是 NA,如需忽略缺失值,可在 mean( ) 中加上 na.rm = TRUE

msleep %>%
  select(name, contains("sleep")) %>%
  rowwise() %>%
  mutate(avg = mean(c(sleep_rem,sleep_cycle)))
  name                sleep_total sleep_rem sleep_cycle    avg
   <chr>                     <dbl>     <dbl>       <dbl>  <dbl>
 1 Cheetah                    12.1      NA        NA     NA    
 2 Owl monkey                 17         1.8      NA     NA    
 3 Mountain beaver            14.4       2.4      NA     NA    
 4 Greater short-tail~        14.9       2.3       0.133  1.22 

通过ifelse判断语句对数据进行操作,如果brainwt > 4返回NA,不满足此条件返回原值

msleep %>%
  select(name, brainwt) %>%
  mutate(brainwt2 = ifelse(brainwt > 4, NA, brainwt)) %>%
  arrange(desc(brainwt))
   name             brainwt brainwt2
   <chr>              <dbl>    <dbl>
 1 African elephant   5.71    NA    
 2 Asian elephant     4.60    NA    
 3 Human              1.32     1.32 
 4 Horse              0.655    0.655

也可以结合使用stringr的功能或正则表达式来对字符串列进行操作;
示例代码将返回动物名称的最后一个单词,并使其小写

msleep %>%
  select(name) %>%
  mutate(name_last_word = tolower(str_extract(name, pattern = "\\w+$")))
   name                       name_last_word
   <chr>                      <chr>         
 1 Cheetah                    cheetah       
 2 Owl monkey                 monkey        
 3 Mountain beaver            beaver        

对多列同时进行操作(注:在 dplyr 1.0.0 及以上版本中,下面的 mutate_all / mutate_if / mutate_at 已被 across() 取代,但仍然可以使用)

  • mutate_all() 将对所有列进行操作
  • mutate_if() 首先需要一个返回布尔值的判断函数(例如 is.numeric),对于判断结果为 TRUE 的列,将执行指定的 mutate 操作
  • mutate_at()要求在vars() 参数内指定要进行改变的列
  • 将所有数据转换为小写(注意:tolower 会把数值列也转换为字符型,见下方输出中的 <chr>):

    msleep %>% mutate_all(tolower)
    
       name    genus vore  order conservation sleep_total sleep_rem
       <chr>   <chr> <chr> <chr> <chr>        <chr>       <chr>    
     1 cheetah acin~ carni carn~ lc           12.1        NA       
     2 owl mo~ aotus omni  prim~ NA           17          1.8      
     3 mounta~ aplo~ herbi rode~ nt           14.4        2.4      
    

    所有列添加" /n "

    msleep %>% mutate_all(~paste(., "  /n  "))
    

    " /n "全部替换为空

    msleep_ohno <- msleep %>% mutate_all(~paste(., "  /n  ")) 
    msleep_ohno %>%
      mutate_all(~str_replace_all(., "/n", "")) %>%
      mutate_all(str_trim)
    

    mutate_if()对数据进行判断

    如果数据类型是数值,对其进行四舍五入操作

    msleep %>%
      select(name, sleep_total:bodywt) %>%
      mutate_if(is.numeric, round)
    
       name                       sleep_total sleep_rem sleep_cycle awake brainwt bodywt
       <chr>                            <dbl>     <dbl>       <dbl> <dbl>   <dbl>  <dbl>
     1 Cheetah                             12        NA          NA    12      NA     50
     2 Owl monkey                          17         2          NA     7       0      0
     3 Mountain beaver                     14         2          NA    10      NA      1
    

    mutate_at( )对特定列进行操作

    对列名中含有 sleep 的列进行操作(此处乘以 60,将小时转换为分钟)

    msleep %>%
      select(name, sleep_total:awake) %>%
      mutate_at(vars(contains("sleep")), ~(.*60))
    
       name                       sleep_total sleep_rem sleep_cycle awake
       <chr>                            <dbl>     <dbl>       <dbl> <dbl>
     1 Cheetah                            726        NA       NA     11.9
     2 Owl monkey                        1020       108       NA      7  
     3 Mountain beaver                    864       144       NA      9.6
    
    msleep %>%
      select(name, sleep_total:awake) %>%
      mutate_at(vars(contains("sleep")), ~(.*60)) %>%
      rename_at(vars(contains("sleep")), ~paste0(.,"_min"))
    
       name                       sleep_total_min sleep_rem_min sleep_cycle_min awake
       <chr>                                <dbl>         <dbl>           <dbl> <dbl>
     1 Cheetah                                726            NA           NA     11.9
     2 Owl monkey                            1020           108           NA      7  
     3 Mountain beaver                        864           144           NA      9.6
    

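    保留原始数据:为转换函数命名后,结果会以"原列名_函数名"的形式作为新列添加,原始列保持不变(注意:funs() 自 dplyr 0.8.0 起已被弃用,新版本中建议改写为 list(min = ~ . * 60))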

    msleep %>%
      select(name, sleep_total:awake) %>%
      mutate_at(vars(contains("sleep")), funs(min = .*60))
    
       name           sleep_total sleep_rem sleep_cycle awake sleep_total_min sleep_rem_min sleep_cycle_min
       <chr>                <dbl>     <dbl>       <dbl> <dbl>           <dbl>         <dbl>           <dbl>
     1 Cheetah               12.1      NA        NA      11.9             726            NA           NA   
     2 Owl monkey            17         1.8      NA       7              1020           108           NA   
    

    ifelse创建2个级别的离散列

    msleep %>%
      select(name, sleep_total) %>%
      mutate(sleep_time = ifelse(sleep_total > 10, "long", "short"))
    
       name                       sleep_total sleep_time
       <chr>                            <dbl> <chr>     
     1 Cheetah                           12.1 long      
     2 Owl monkey                        17   long      
     3 Mountain beaver                   14.4 long      
    

    case_when创建多级离散列

    此函数在后续数据清洗中大有用处,需要多多练习

    msleep %>%
      select(name, sleep_total) %>%
      mutate(sleep_total_discr = case_when(
        sleep_total > 13 ~ "very long",
        sleep_total > 10 ~ "long",
        sleep_total > 7 ~ "limited",
        TRUE ~ "short"))
    
       name                       sleep_total sleep_total_discr
       <chr>                            <dbl> <chr>            
     1 Cheetah                           12.1 long             
     2 Owl monkey                        17   very long        
     3 Mountain beaver                   14.4 very long        
     4 Greater short-tailed shrew        14.9 very long        
    

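    将特定的值转换为 NA(此处将 "omni" 替换为 NA;注意:自 dplyr 1.1.0 起 na_if() 不再支持直接作用于整个数据框,新版本中需改写为 mutate(across(everything(), ~ na_if(., "omni"))))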

    msleep %>%
      select(name:order) %>%
      na_if("omni")
    
       name                       genus       vore  order       
       <chr>                      <chr>       <chr> <chr>