提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
缺失值
导入玩家的玩牌游戏数据
#player <- read.csv("./data/chapter05/玩家玩牌数据.csv",F,na.strings = "NA")
player <- read.csv("./data/chapter05/玩家玩牌数据.csv",F)
head(player)
str(player)
player_col_names <- c("用户id","性别","等级","站内好友数","经验值", "积分","登录总次数","玩牌局数","赢牌局数","身上货币量")
查看变量名 colnames(player) <- player_col_names colnames(player) 查看前六行 head(player)
利用is.na函数判断“玩牌局数”变量各值是否为缺失值 is.na(player$玩牌局数)
统计缺失值与非缺失值的个数 table(is.na(player$玩牌局数))
sum()和mean()函数来统计缺失值的个数和占比 #计算缺失值个数 sum(is.na(player$玩牌局数))
#计算缺失值占比 mean(is.na(player$玩牌局数))
利用complete.cases函数查看完整实例 sum(complete.cases(player))
用md.pattern函数查看player的缺失值模式
if(!require(mice)) install.packages("mice")
md.pattern(player)
–删除缺失样本 sum(!complete.cases(player)) player_full <- na.omit(player) 计算有缺失值的样本个数 sum(!complete.cases(player_full))
–替换缺失值
iris1 <- iris[,c(1,5)]
head(iris1)
table(iris1$Species)
#将40、80、120号样本的Sepal.Length设置为缺失值
iris1[c(40,80,120),1] <- NA
Sepal.Length.mean <- mean(iris1$Sepal.Length,na.rm = T);Sepal.Length.mean
iris1[c(40,80,120),1] <- round(Sepal.Length.mean,1)
iris1[c(40,80,120),1]
查看以前的值和现在的值
iris[c(40,80,120),1];iris1[c(40,80,120),1]
利用同类均值进行赋值的方式来填补缺失值
#将40、80、120号样本的Sepal.Length设置为缺失值
iris2 <- iris[,c(1,5)]
iris2[c(40,80,120),1] <- NA
iris2[40,1] <- round(mean(iris2[iris1$Species=='setosa','Sepal.Length'], na.rm = T),1)
iris2[80,1] <- round(mean(iris2[iris1$Species=='versicolor','Sepal.Length'], na.rm = T),1)
iris2[120,1] <- round(mean(iris2[iris1$Species=='virginica','Sepal.Length'], na.rm = T),1)
#查看以前的值和现在的值
iris[c(40,80,120),1];iris1[c(40,80,120),1];iris2[c(40,80,120),1]
数据转换:
导入数据
rawdata <- read.csv("D://小学期/数据转换数据.csv")
#查看数据的前六行
head(rawdata)
str(rawdata)
#将注册日期变量转换成日期格式
rawdata$registration <- as.Date(paste(substr(rawdata$registration,1,4), substr(rawdata$registration,5,6), substr(rawdata$registration,7,8), sep="-"), "%Y-%m-%d")
head(rawdata)
str(rawdata)
将首次付费日期转换成日期格式
rawdata$firstpaydate <- as.Date(paste(substr(rawdata$firstpaydate,1,4), substr(rawdata$firstpaydate,5,6), substr(rawdata$firstpaydate,7,8), sep="-"), "%Y-%m-%d")
查看数据的前六行 head(rawdata) str(rawdata)
#增加ispay变量:0表示非付费用户,1表示付费用户
rawdata$ispay <- ifelse(!is.na(rawdata$firstpaydate),1,0)
head(rawdata)
增加isnewpay变量:0表示非新增首日付费用户,1表示新增首日付费用户
rawdata$isnewpay <- ifelse(rawdata$registration==rawdata$firstpaydate, 1,0)
head(rawdata)
rawdata[is.na(rawdata$isnewpay),'isnewpay'] <- 0
#查看数据前10行
head(rawdata)
#采用(x-mu)/std的标准化方法,与scale()函数效果一样
#install.packages("caret")
library(caret)
?preProcess
standard <- preProcess(iris)
head(predict(standard,iris))
head(scale(iris[,1:4]))
#采用(x-min(x))/(max(x)-min(x))的标准化方法
standard <- preProcess(iris, method = 'range')
head(predict(standard,iris))
fun <- function(x) (x-min(x))/(max(x)-min(x))
head(sapply(iris[,1:4],fun))
数据分箱:
利用cut函数对数据进行分箱
对days(活跃天数)进行分箱操作
head(rawdata)
rawdata$days_interval <- cut(rawdata$days, breaks=c(0,30,60,90,Inf), labels=c('一个月内','31~60天','61~90天','三个月以上'))
head(rawdata)
对lifetime(生命周期)进行分箱操作
rawdata$lifetime_interval <- cut(rawdata$lifetime, breaks=c(0,7,21,30,90,Inf), labels=c('小于一周','小于三周','小于一个月', '小于三个月','三个月以上'))
查看前六行
head(rawdata)
数据标准化变换
#采用(x-mu)/std的标准化方法,与scale()函数效果一样
#install.packages("caret")
library(caret)
?preProcess
standard <- preProcess(iris)
head(predict(standard,iris))
head(scale(iris[,1:4]))
#采用(x-min(x))/(max(x)-min(x))的标准化方法
standard <- preProcess(iris, method = 'range')
head(predict(standard,iris))
fun <- function(x) (x-min(x))/(max(x)-min(x))
head(sapply(iris[,1:4],fun))
离散数据编码
构建customers数据集
customers<-data.frame(id=c(10,20,30,40,50), gender=c("male","female","female","male","female"), mood=c("happy","sad","happy","sad","happy"), outcome=c(1,1,0,0,0))
customers
#创建新数据框customers.new
customers.new <- customers[,c('id','outcome')]
customers.new
对gender变量进行哑变量处理
customers.new$gender.male <- ifelse(customers$gender=='male',1,0)
customers.new$gender.female <- ifelse(customers$gender=='female',1,0)
customers.new$gender <- customers$gender
customers.new
对mood变量进行哑变量处理
customers.new$mood.happy <- ifelse(customers$mood=='happy',1,0)
customers.new$mood.sad <- ifelse(customers$mood=='sad',1,0)
customers.new
加载caret包到内存
library(caret)
查看customers的数据结构
str(customers)
利用dummyVars函数对customers数据进行哑变量处理
dmy<-dummyVars(~.,data=customers)
对自身变量进行预测,并转换成data.frame格式
trsf<-data.frame(predict(dmy,newdata=customers))
查看转换结果
trsf
将outcome变量转换成因子型变量
customers$outcome <- as.factor(customers$outcome)
利用dummyVars函数对customers数据进行哑变量处理
dmy<-dummyVars(~.,data=customers)
对自身变量进行预测,并转换成data.frame格式
trsf<-data.frame(predict(dmy,newdata=customers))
查看转换结果
trsf
只对gender变量进行哑变量转换
dmy.gender <- dummyVars(~gender,data=customers) trsf.gender <- data.frame(predict(dmy.gender,newdata=customers)) trsf.gender
将levelsOnly和fullRank设置为TRUE
customers<-data.frame(id=c(10,20,30,40,50), gender=c("male","female","female","male","female"), mood=c("happy","sad","happy","sad","happy"), outcome=c(1,1,0,0,0))
dmy<-dummyVars(~.,data=customers,levelsOnly=TRUE,fullRank=TRUE)
trsf<-data.frame(predict(dmy,newdata=customers))
trsf
customers<-data.frame(id=c(10,20,30,40,50), gender=c("male","female","female","male","female"), mood=c("happy","sad","happy","sad","happy"), outcome=c(1,1,0,0,0), test=c("1","2","1","3","4"))
dmy<-dummyVars(~.,data=customers,levelsOnly=TRUE,fullRank=TRUE)
trsf<-data.frame(predict(dmy,newdata=customers))
trsf
|
请发表评论