在线时间:8:00-16:00
132-9538-2358
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
R语言进行数据预处理wranging
li_volleyball
2016年3月22日
data wrangling with Rpackages:tidyr dplyr
Ground rules library(tidyr) library(dplyr) ## ## Attaching package: 'dplyr' ## The following objects are masked from 'package:stats': ## ## filter, lag ## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union View(iris) View(mtcars) select(mtcars,am:1) ## am vs qsec wt drat hp disp cyl mpg ## Mazda RX4 1 0 16.46 2.620 3.90 110 160.0 6 21.0 ## Mazda RX4 Wag 1 0 17.02 2.875 3.90 110 160.0 6 21.0 ## Datsun 710 1 1 18.61 2.320 3.85 93 108.0 4 22.8 ## Hornet 4 Drive 0 1 19.44 3.215 3.08 110 258.0 6 21.4 ## Hornet Sportabout 0 0 17.02 3.440 3.15 175 360.0 8 18.7 ## Valiant 0 1 20.22 3.460 2.76 105 225.0 6 18.1 ## Duster 360 0 0 15.84 3.570 3.21 245 360.0 8 14.3 ## Merc 240D 0 1 20.00 3.190 3.69 62 146.7 4 24.4 ## Merc 230 0 1 22.90 3.150 3.92 95 140.8 4 22.8 ## Merc 280 0 1 18.30 3.440 3.92 123 167.6 6 19.2 ## Merc 280C 0 1 18.90 3.440 3.92 123 167.6 6 17.8 ## Merc 450SE 0 0 17.40 4.070 3.07 180 275.8 8 16.4 ## Merc 450SL 0 0 17.60 3.730 3.07 180 275.8 8 17.3 ## Merc 450SLC 0 0 18.00 3.780 3.07 180 275.8 8 15.2 ## Cadillac Fleetwood 0 0 17.98 5.250 2.93 205 472.0 8 10.4 ## Lincoln Continental 0 0 17.82 5.424 3.00 215 460.0 8 10.4 ## Chrysler Imperial 0 0 17.42 5.345 3.23 230 440.0 8 14.7 ## Fiat 128 1 1 19.47 2.200 4.08 66 78.7 4 32.4 ## Honda Civic 1 1 18.52 1.615 4.93 52 75.7 4 30.4 ## Toyota Corolla 1 1 19.90 1.835 4.22 65 71.1 4 33.9 ## Toyota Corona 0 1 20.01 2.465 3.70 97 120.1 4 21.5 ## Dodge Challenger 0 0 16.87 3.520 2.76 150 318.0 8 15.5 ## AMC Javelin 0 0 17.30 3.435 3.15 150 304.0 8 15.2 ## Camaro Z28 0 0 15.41 3.840 3.73 245 350.0 8 13.3 ## Pontiac Firebird 0 0 17.05 3.845 3.08 175 400.0 8 19.2 ## Fiat X1-9 1 1 18.90 1.935 4.08 66 79.0 4 27.3 ## Porsche 914-2 1 0 16.70 2.140 4.43 91 120.3 4 26.0 ## Lotus Europa 1 1 16.90 1.513 3.77 113 95.1 4 30.4 ## Ford Pantera L 1 0 14.50 3.170 4.22 264 351.0 8 15.8 ## Ferrari Dino 1 0 15.50 2.770 3.62 175 145.0 6 19.7 ## Maserati Bora 1 0 14.60 3.570 3.54 335 301.0 8 15.0 ## Volvo 142E 1 1 18.60 2.780 4.11 109 121.0 4 21.4 mtcars %>% select(am:1) ## am vs qsec wt drat hp disp cyl mpg ## Mazda RX4 1 0 16.46 2.620 3.90 110 160.0 6 21.0 ## Mazda RX4 Wag 1 0 17.02 2.875 3.90 110 160.0 6 21.0 ## Datsun 710 1 1 18.61 2.320 3.85 93 108.0 4 22.8 ## Hornet 4 Drive 0 1 19.44 3.215 3.08 110 258.0 6 21.4 ## Hornet Sportabout 0 0 17.02 3.440 3.15 175 360.0 8 18.7 ## Valiant 0 1 20.22 3.460 2.76 105 225.0 6 18.1 ## Duster 360 0 0 15.84 3.570 3.21 245 360.0 8 14.3 ## Merc 240D 0 1 20.00 3.190 3.69 62 146.7 4 24.4 ## Merc 230 0 1 22.90 3.150 3.92 95 140.8 4 22.8 ## Merc 280 0 1 18.30 3.440 3.92 123 167.6 6 19.2 ## Merc 280C 0 1 18.90 3.440 3.92 123 167.6 6 17.8 ## Merc 450SE 0 0 17.40 4.070 3.07 180 275.8 8 16.4 ## Merc 450SL 0 0 17.60 3.730 3.07 180 275.8 8 17.3 ## Merc 450SLC 0 0 18.00 3.780 3.07 180 275.8 8 15.2 ## Cadillac Fleetwood 0 0 17.98 5.250 2.93 205 472.0 8 10.4 ## Lincoln Continental 0 0 17.82 5.424 3.00 215 460.0 8 10.4 ## Chrysler Imperial 0 0 17.42 5.345 3.23 230 440.0 8 14.7 ## Fiat 128 1 1 19.47 2.200 4.08 66 78.7 4 32.4 ## Honda Civic 1 1 18.52 1.615 4.93 52 75.7 4 30.4 ## Toyota Corolla 1 1 19.90 1.835 4.22 65 71.1 4 33.9 ## Toyota Corona 0 1 20.01 2.465 3.70 97 120.1 4 21.5 ## Dodge Challenger 0 0 16.87 3.520 2.76 150 318.0 8 15.5 ## AMC Javelin 0 0 17.30 3.435 3.15 150 304.0 8 15.2 ## Camaro Z28 0 0 15.41 3.840 3.73 245 350.0 8 13.3 ## Pontiac Firebird 0 0 17.05 3.845 3.08 175 400.0 8 19.2 ## Fiat X1-9 1 1 18.90 1.935 4.08 66 79.0 4 27.3 ## Porsche 914-2 1 0 16.70 2.140 4.43 91 120.3 4 26.0 ## Lotus Europa 1 1 16.90 1.513 3.77 113 95.1 4 30.4 ## Ford Pantera L 1 0 14.50 3.170 4.22 264 351.0 8 15.8 ## Ferrari Dino 1 0 15.50 2.770 3.62 175 145.0 6 19.7 ## Maserati Bora 1 0 14.60 3.570 3.54 335 301.0 8 15.0 ## Volvo 142E 1 1 18.60 2.780 4.11 109 121.0 4 21.4 example1<-data.frame(A=c(paste("x",1:6,sep = "")), B=seq(1,11,2), c=1:6, date=c("2000-08-15","1998-07-15","1995-06-04","1997-07-01","1999-06-01","1996-06-25")) example1 ## A B c date ## 1 x1 1 1 2000-08-15 ## 2 x2 3 2 1998-07-15 ## 3 x3 5 3 1995-06-04 ## 4 x4 7 4 1997-07-01 ## 5 x5 9 5 1999-06-01 ## 6 x6 11 6 1996-06-25 # 一个变量一列 # 一个观测值一行 #每一种观测在一个表里 #separate() separate(example1,date,c("Y","m","d"),sep="-") ## A B c Y m d ## 1 x1 1 1 2000 08 15 ## 2 x2 3 2 1998 07 15 ## 3 x3 5 3 1995 06 04 ## 4 x4 7 4 1997 07 01 ## 5 x5 9 5 1999 06 01 ## 6 x6 11 6 1996 06 25 example12<-example1 %>% separate(date,c("Y","m","d"),sep="-") #unite() unite(example12,"YM",Y,m,sep="-") ## A B c YM d ## 1 x1 1 1 2000-08 15 ## 2 x2 3 2 1998-07 15 ## 3 x3 5 3 1995-06 04 ## 4 x4 7 4 1997-07 01 ## 5 x5 9 5 1999-06 01 ## 6 x6 11 6 1996-06 25 #select() select(example1,A,B) ## A B ## 1 x1 1 ## 2 x2 3 ## 3 x3 5 ## 4 x4 7 ## 5 x5 9 ## 6 x6 11 select(example1,-A) ## B c date ## 1 1 1 2000-08-15 ## 2 3 2 1998-07-15 ## 3 5 3 1995-06-04 ## 4 7 4 1997-07-01 ## 5 9 5 1999-06-01 ## 6 11 6 1996-06-25 select(example1,B:date) ## B c date ## 1 1 1 2000-08-15 ## 2 3 2 1998-07-15 ## 3 5 3 1995-06-04 ## 4 7 4 1997-07-01 ## 5 9 5 1999-06-01 ## 6 11 6 1996-06-25 select(example1,starts_with("d")) ## date ## 1 2000-08-15 ## 2 1998-07-15 ## 3 1995-06-04 ## 4 1997-07-01 ## 5 1999-06-01 ## 6 1996-06-25 select(example1,ends_with("e")) ## date ## 1 2000-08-15 ## 2 1998-07-15 ## 3 1995-06-04 ## 4 1997-07-01 ## 5 1999-06-01 ## 6 1996-06-25 select(example1,contains("a")) ## A date ## 1 x1 2000-08-15 ## 2 x2 1998-07-15 ## 3 x3 1995-06-04 ## 4 x4 1997-07-01 ## 5 x5 1999-06-01 ## 6 x6 1996-06-25 #filter() filter(example1,B>=6) ## A B c date ## 1 x4 7 4 1997-07-01 ## 2 x5 9 5 1999-06-01 ## 3 x6 11 6 1996-06-25 filter(example1,B>=6,A%in%c("x1","x4","x5")) ## A B c date ## 1 x4 7 4 1997-07-01 ## 2 x5 9 5 1999-06-01 #mutate() mutate(example1,ratio=B/c) ## A B c date ratio ## 1 x1 1 1 2000-08-15 1.000000 ## 2 x2 3 2 1998-07-15 1.500000 ## 3 x3 5 3 1995-06-04 1.666667 ## 4 x4 7 4 1997-07-01 1.750000 ## 5 x5 9 5 1999-06-01 1.800000 ## 6 x6 11 6 1996-06-25 1.833333 mutate(example1,ratio=B/c,inverse=ratio-1) ## A B c date ratio inverse ## 1 x1 1 1 2000-08-15 1.000000 0.0000000 ## 2 x2 3 2 1998-07-15 1.500000 0.5000000 ## 3 x3 5 3 1995-06-04 1.666667 0.6666667 ## 4 x4 7 4 1997-07-01 1.750000 0.7500000 ## 5 x5 9 5 1999-06-01 1.800000 0.8000000 ## 6 x6 11 6 1996-06-25 1.833333 0.8333333 mutate(example1,cumsum(B)) ## A B c date cumsum(B) ## 1 x1 1 1 2000-08-15 1 ## 2 x2 3 2 1998-07-15 4 ## 3 x3 5 3 1995-06-04 9 ## 4 x4 7 4 1997-07-01 16 ## 5 x5 9 5 1999-06-01 25 ## 6 x6 11 6 1996-06-25 36 mutate(example1,cumsum(B),cummean(B),cumany(B>6),cumall(B>6)) ## A B c date cumsum(B) cummean(B) cumany(B > 6) cumall(B > 6) ## 1 x1 1 1 2000-08-15 1 1 FALSE FALSE ## 2 x2 3 2 1998-07-15 4 2 FALSE FALSE ## 3 x3 5 3 1995-06-04 9 3 FALSE FALSE ## 4 x4 7 4 1997-07-01 16 4 TRUE FALSE ## 5 x5 9 5 1999-06-01 25 5 TRUE FALSE ## 6 x6 11 6 1996-06-25 36 6 TRUE FALSE mutate(example1,cummin(B),cummax(B)) ## A B c date cummin(B) cummax(B) ## 1 x1 1 1 2000-08-15 1 1 ## 2 x2 3 2 1998-07-15 1 3 ## 3 x3 5 3 1995-06-04 1 5 ## 4 x4 7 4 1997-07-01 1 7 ## 5 x5 9 5 1999-06-01 1 9 ## 6 x6 11 6 1996-06-25 1 11 mutate(example1,between(B,4,8)) ## A B c date between(B, 4, 8) ## 1 x1 1 1 2000-08-15 FALSE ## 2 x2 3 2 1998-07-15 FALSE ## 3 x3 5 3 1995-06-04 TRUE ## 4 x4 7 4 1997-07-01 TRUE ## 5 x5 9 5 1999-06-01 FALSE ## 6 x6 11 6 1996-06-25 FALSE mutate(example1,cume_dist(B)) ## A B c date cume_dist(B) ## 1 x1 1 1 2000-08-15 0.1666667 ## 2 x2 3 2 1998-07-15 0.3333333