R语言数据结构：数据框

数据框概括

数据框是由n个相同长度的向量按列合并的一张表，每列的数据类型可以不同。

数据框是一个特殊的list,list的每个元素都是长度相同的向量。

# stringsAsFactors = FALSE,字符串不会被转换成factor类型
> df <- data.frame(x=1:3,y=c('a','b','c'),stringsAsFactors = FALSE)
> df
  x y
1 1 a
2 2 b
3 3 c

# 3行
> nrow(df)
[1] 3

# 2列
> ncol(df)
[1] 2

# 列名称为'x'的列
> df$x
[1] 1 2 3

# 第1列
> df[,1]
[1] 1 2 3

# 第1行
> df[1,]
  x y
1 1 a

# 合并行，需要保证数据类型和长度符合
# 注意：c(4,'d')是字符向量，会使x列被强制转换为字符型；若要保持各列类型，可改用rbind(df,data.frame(x=4,y='d'))
> rbind(df,c(4,'d'))
  x y
1 1 a
2 2 b
3 3 c
4 4 d

# 合并列，需要保证长度符合
> cbind(df,'z'=4:6)
  x y z
1 1 a 4
2 2 b 5
3 3 c 6

数据框函数

数据集及包

> library(nycflights13)
> library(plyr)
> library(dplyr)
> library(tidyr)
> library(reshape2)
> head(flights)
  year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight
1 2013     1   1      517            515         2      830            819        11      UA   1545
2 2013     1   1      533            529         4      850            830        20      UA   1714
3 2013     1   1      542            540         2      923            850        33      AA   1141
4 2013     1   1      544            545        -1     1004           1022       -18      B6    725
5 2013     1   1      554            600        -6      812            837       -25      DL    461
6 2013     1   1      554            558        -4      740            728        12      UA   1696
  tailnum origin dest air_time distance hour minute           time_hour
1  N14228    EWR  IAH      227     1400    5     15 2013-01-01 05:00:00
2  N24211    LGA  IAH      227     1416    5     29 2013-01-01 05:00:00
3  N619AA    JFK  MIA      160     1089    5     40 2013-01-01 05:00:00
4  N804JB    JFK  BQN      183     1576    5     45 2013-01-01 05:00:00
5  N668DN    LGA  ATL      116      762    6      0 2013-01-01 06:00:00
6  N39463    EWR  ORD      150      719    5     58 2013-01-01 05:00:00

管道函数 %>%

x %>% f(y) 等价于 f(x,y)
y %>% f(x,.,z) 等价于 f(x,y,z)

筛选行

subset(flights,flights$month==1&flights$day==1)
filter(flights, month ==1, day ==1)

# 去重
distinct(select(flights, year,2,3))
distinct(flights[,1:3])
# distinct比unique快
unique(flights[,1:3])

选择列

# 按列的名称进行筛选,可以添加任意多个列名称
# 可以搭配使用starts_with(), ends_with(), contains(), matches()
select(flights, year, month, day)
select(flights,year,month,day,ends_with('time'))

# 按列的位置进行筛选
select(flights,1:3,6,9:10) # 选择第1,2,3,6,9,10共6列
flights[c(1:3,6,9:10)]

# 列名称与位置混合使用
select(flights,year,6,9:10)

增加列

# mutate()：在已有列的基础上计算并添加新列

mutate(flights,gain =arr_delay -dep_delay,speed =distance /air_time *60)
#可以在mutate()函数中使用刚刚添加的新列，不需要写两行代码
mutate(flights,gain =arr_delay -dep_delay,gain_per_hour =gain /(air_time /60))

数据框整形

排序

# arrange函数，默认升序
arrange(flights, year, desc(month), desc(day))

# base包的order函数
# 对各列按列名称降序排序
x <- x[,order(colnames(x),decreasing=T)]
# 对各行按行名称降序排序
x <- x[order(rownames(x),decreasing=T),]

重命名行名、列名

# 行名称
rownames(flights) <- c(...)
rownames(flights)[1] <- '1st'

# 列名称
colnames(flights) <- c(...)
colnames(flights)[1] <- 'YEAR'

dplyr::rename(flights,'年(新列名)' = year )

setNames(flights[1:3],c('年','月','日'))

合并、拆分

reshape2:melt,dcast

> head(airquality)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6

# 保留"Ozone", "Month", "Day"三列，将2到4列合并为一列
> melt_result <- melt(airquality, id.vars=c("Ozone", "Month", "Day"),measure.vars=c(2:4), variable.name="V.type", value.name="value")
> head(melt_result)
  Ozone Month Day  V.type value
1    41     5   1 Solar.R   190
2    36     5   2 Solar.R   118
3    12     5   3 Solar.R   149
4    18     5   4 Solar.R   313
5    NA     5   5 Solar.R    NA
6    28     5   6 Solar.R    NA

# melt_result可以用dcast展开
# 左边的每一个变量为一列，再按V.type的水平数分列（V.type有3个水平/3类：Solar.R、Wind、Temp，就再增加3列）
> dcast_result <- dcast(melt_result, Ozone+Month+Day~V.type)
> head(dcast_result)
  Ozone Month Day Solar.R Wind Temp
1     1     5  21       8  9.7   59
2     4     5  23      25  9.7   61
3     6     5  18      78 18.4   57
4     7     5  11      NA  6.9   74
5     7     7  15      48 14.3   80
6     7     9  24      49 10.3   69

# dcast在拆分数据的同时还可以按照某一个因子进行聚合（其实这才是主要目的）
> dcast(melt_result, Month~V.type, fun.aggregate=mean, na.rm=TRUE)
  Month  Solar.R      Wind     Temp
1     5 181.2963 11.622581 65.54839
2     6 190.1667 10.266667 79.10000
3     7 216.4839  8.941935 83.90323
4     8 171.8571  8.793548 83.96774
5     9 167.4333 10.180000 76.90000

tidyr:gather,spread

# 将Solar.R,Wind,Temp三列合并为一列
> melt_result <- gather(airquality,key=V.type,value=value,Solar.R,Wind,Temp)
> head(melt_result)
  Ozone Month Day  V.type value
1    41     5   1 Solar.R   190
2    36     5   2 Solar.R   118
3    12     5   3 Solar.R   149
4    18     5   4 Solar.R   313
5    NA     5   5 Solar.R    NA
6    28     5   6 Solar.R    NA

# 将V.type的值作为数据展开后的列名，将value值填充到数据框内
> spread(melt_result,V.type,value)
    Ozone Month Day Solar.R Wind Temp
1       1     5  21       8  9.7   59
2       4     5  23      25  9.7   61
3       6     5  18      78 18.4   57
4       7     5  11      NA  6.9   74
5       7     7  15      48 14.3   80

tidyr:unite,separate

> a <- data.frame(col_name=c('a-b','d-e-f','1-2-3-4'))
> tidyr::separate(a,
                 # col:被分割列的列名称
                 col=col_name,
                 # into:分割的新列的名称
                 into=c('new_col_1','new_col_2','new_col_3'),
                 # sep:分割符
                 sep="-",
                 # remove=TRUE：返回结果中移除被分割列
                 # remove=FALSE：返回结果中保留被分割列
                 remove = FALSE,
                 # extra：merge，drop，'1-2-3-4'分割结果有4列，但into里面只有三列，merge表示3、4列合并，drop表示直接遗弃第4列的值
                 extra="merge",
                 # fill:right,left,'a-b'分割结果只有2列，所以第三列填充NA，right表示在右边填充
                 fill="right"
                 )
  col_name new_col_1 new_col_2 new_col_3
1      a-b         a         b      <NA>
2    d-e-f         d         e         f
3  1-2-3-4         1         2       3-4

> b <- data.frame('col_1'=c('a','b','c'),'col_2'=1:3)
> tidyr::unite(b,
              # col：合并后新列的名称
              col=new_col_name,
              # 被合并的列名
              col_1,col_2,
              # remove=FALSE：返回结果保留被合并的列
              remove=FALSE
              )
  new_col_name col_1 col_2
1          a_1     a     1
2          b_2     b     2
3          c_3     c     3

数据框合并

merge(users,allorders,by="uid",all.x=T)

# type:left (default), right, inner or full
# match:first,all
plyr::join(users,users_year_template,by="uid",type="left",match="first")

分组、聚合

group_by,summarise

# 根据飞机编号进行分组
by_tailnum <- group_by(flights, tailnum)
# 求每个飞机编号的飞行次数，每次飞行的平均距离以及平均延误时间
delay <- summarize(by_tailnum,
                  count =n(),
                  dist =mean(distance,na.rm =TRUE),
                  delay =mean(arr_delay,na.rm =TRUE))
# n()：计数
# n_distinct(x)：去重计数
# first(x)：第一个
# last(x)：最后一个
# nth(x, n)：第n个

# 当分组变量有多个的时候，每调用一次summarise就会去掉最后一层分组变量，逐层向上汇总
daily <- group_by(flights, year, month, day)
# 第1个summarize计算day
per_day <- summarize(daily,flights =n())
# 第2个summarize计算month
per_month <- summarize(per_day,flights =sum(flights))
# 第3个summarize计算year
per_year <- summarize(per_month,flights =sum(flights))

# summarize_all函数：对所有的聚合变量使用同样的聚合函数
by_tailnum%>%summarize_all(n_distinct)

# summarise_each,对多个聚合变量使用多个聚合函数
# 注意：summarise_each()和funs()在新版dplyr中已被弃用，推荐改用summarise(across(...))
flights %>%
  group_by(year,month) %>%
  summarise_each(funs(min(.,na.rm =TRUE),
                      mean(.,na.rm =TRUE),
                      max(.,na.rm =TRUE)),
                 distance,dep_time
                 )

tapply

# tapply(X, INDEX, FUN)：按INDEX分组，对X的每个分组应用FUN

# 多个分组变量(itemName,DATE)，一个聚合变量(order_id)，一个聚合函数(length)
# 对订单数据做聚合，按照下单日期，商品名称分组，计算订单量
tapply(Orders$order_id,list(Orders$itemName,Orders$DATE),length)

aggregate

# aggregate(x, by, FUN)：按by中的分组变量分组，对x的各列应用FUN

# 多个分组变量(itemName,DATE)，多个聚合变量(16:22),一个聚合函数(sum)
# 对订单数据做聚合，按照商品名称、下单日期分组，对第16至22列的数据做sum运算
aggregate(Orders[,16:22],by=list(Orders$itemName,Orders$DATE),sum)

That’s all.
Happy writing!