2 R数据清洗

2.1 重复数据删除

library(tidyverse)
data <- tibble(index1 = rep(c(1, 2), 2),
               index2 = rep(c(1, 2), each = 2),
               label = letters[1:4]);data

## # A tibble: 4 × 3
##   index1 index2 label
##    <dbl>  <dbl> <chr>
## 1      1      1 a    
## 2      2      1 b    
## 3      1      2 c    
## 4      2      2 d

方法1：便捷易懂，但是筛选后其他列的信息就不在了,通过 dplyr 包的 summrize() 函数实现

data %>%
    group_by(index1) %>%
    summarize(index2 = min(index2))

## # A tibble: 2 × 2
##   index1 index2
##    <dbl>  <dbl>
## 1      1      1
## 2      2      1

方法2：通过 filter() 函数来实现

data %>%
  group_by(index1) %>%
  filter(index2 == min(index2))

## # A tibble: 2 × 3
## # Groups:   index1 [2]
##   index1 index2 label
##    <dbl>  <dbl> <chr>
## 1      1      1 a    
## 2      2      1 b

方法3：简单，但是没接触的第一次可能不大懂，且筛选后其他列的信息就不在了,aggregate() 函数

# aggregate(cbind(value1, value2, ...) ~ group1 + group2 + ..., data, FUN = 函数)
aggregate(index2~index1, data, function(x) x[which.min(abs(x))])

##   index1 index2
## 1      1      1
## 2      2      1

方法4：稍微显得冗余，但是能保留之前所有的列

data1 <- data[order(data$index1, data$index2, decreasing = FALSE), ]
data1[!duplicated(data1$index1), ]

## # A tibble: 2 × 3
##   index1 index2 label
##    <dbl>  <dbl> <chr>
## 1      1      1 a    
## 2      2      1 b

方法5：data.table，在 R 中出了名的高效处理包，但是不易理解，有人封装成了tidyfst包,且能保留原本所有列

library(data.table)
as.data.table(data)[, .SD[which.min(abs(index2))], by = index1]

##    index1 index2 label
## 1:      1      1     a
## 2:      2      1     b

方法6：利用 do.call 函数和 split 函数结合 lapply 以及 rbind

# lapply 故名思议对 list 进行 apply 批量操作
do.call(rbind, lapply(split(data,data$index1),function(x) x[which.min(abs(data$index2)),]))

## # A tibble: 2 × 3
##   index1 index2 label
## *  <dbl>  <dbl> <chr>
## 1      1      1 a    
## 2      2      1 b

2.2 数据整合与重构

整合数据：aggregate用于折叠数据

# 多列值  + 多列变量
aggregate(cbind(price, carat) ~ cut + color, diamonds, FUN = mean) %>% head(5)

##         cut color    price     carat
## 1      Fair     D 4291.061 0.9201227
## 2      Good     D 3405.382 0.7445166
## 3 Very Good     D 3470.467 0.6964243
## 4   Premium     D 3631.293 0.7215471
## 5     Ideal     D 2629.095 0.5657657

整合与重构数据：melt用法

library(reshape)
#融合
melt(as.data.frame(diamonds), id = c("cut","color"),measure=c("x","y")) %>% head(5)

##       cut color variable value
## 1   Ideal     E        x  3.95
## 2 Premium     E        x  3.89
## 3    Good     E        x  4.05
## 4 Premium     I        x  4.20
## 5    Good     J        x  4.34

整合与重构数据：cast用法

#重铸
cast(diamonds, cut~color+clarity,value = 'carat',fun.aggregate = mean) %>% head(5) %>% select(1:5)

##         cut   D_I1     D_SI2     D_SI1     D_VS2
## 1      Fair 1.8775 1.0169643 0.9137931 0.8436000
## 2      Good 1.0400 0.8582511 0.7008017 0.7025000
## 3 Very Good 0.9500 0.9317197 0.7078340 0.6336570
## 4   Premium 1.1550 0.9189074 0.6916547 0.5845723
## 5     Ideal 0.9600 0.7503090 0.5947967 0.4992935

#重铸而不整合
cast(diamonds, cut~color+clarity,value = 'carat') %>% head(5) %>% select(1:5)

##         cut D_I1 D_SI2 D_SI1 D_VS2
## 1      Fair    4    56    58    25
## 2      Good    8   223   237   104
## 3 Very Good    5   314   494   309
## 4   Premium   12   421   556   339
## 5     Ideal   13   356   738   920

重构数据：dcast用法

# 多列值  + 多列变量
library(reshape2)
dcast(diamonds, cut+color~clarity,fun.aggregate = mean,value.var = 'z') %>% head(5) %>% select(1:5)

##    cut color       I1      SI2      SI1
## 1 Fair     D 4.905000 4.009107 3.864828
## 2 Fair     E 4.008889 3.956795 3.739077
## 3 Fair     F 4.000857 4.042921 3.726506
## 4 Fair     G 4.233962 4.291625 3.852319
## 5 Fair     H 4.549231 4.422967 4.110533

dcast(diamonds, cut+color~clarity) %>% head(5) %>% select(1:5)

##    cut color I1 SI2 SI1
## 1 Fair     D  4  56  58
## 2 Fair     E  9  78  65
## 3 Fair     F 35  89  83
## 4 Fair     G 53  80  69
## 5 Fair     H 52  91  75