本节讲“因子”和“数据帧”
> #因子
> # Create a vector as input.
> data <- c("East","West","East","North","North","East","West","West","West","East","North")
> print(data)
[1] "East" "West" "East" "North" "North" "East" "West" "West" "West" "East" "North"
> print(is.factor(data))
[1] FALSE
> print(is.vector(data))
[1] TRUE
> # Apply the factor function.
> factor_data <- factor(data)
> print(factor_data)
[1] East West East North North East West West West East North
Levels: East North West
> print(is.factor(factor_data))
[1] TRUE
> #数据帧的因子
> #在创建具有文本数据列的数据框时,R 4.0.0 之前的版本默认(stringsAsFactors = TRUE)将文本列视为分类数据并在其上创建因子;自 R 4.0.0 起该默认值为 FALSE,需显式传入 stringsAsFactors = TRUE 才能得到下面的结果。
> # Create the vectors for data frame.
> height <- c(132,151,162,139,166,147,122)
> weight <- c(48,49,66,53,67,52,40)
> gender <- c("male","male","female","female","male","female","male")
>
> #CREATE THE DATA FRAME
> input_data <- data.frame(height,weight,gender)
> print(input_data)
height weight gender
1 132 48 male
2 151 49 male
3 162 66 female
4 139 53 female
5 166 67 male
6 147 52 female
7 122 40 male
>
> #test if gender is a factor
> print(is.factor(input_data$gender))
[1] TRUE
>
> #print the gender column to see the levels
> print(input_data$gender)
[1] male male female female male female male
Levels: female male
> #更改级别顺序
> #可以使用新的级别顺序再次应用 factor() 函数,来改变因子中级别的顺序。
>
> data <- c("East","West","East","North","North","East","West","West","West","East","North")
> # Create the factors
> factor_data <- factor(data)
> print(factor_data)
[1] East West East North North East West West West East North
Levels: East North West
> # Apply the factor function with required order of the level.
> new_order_data <-factor(factor_data,levels = c ("West","East","North"))
> print(new_order_data)
[1] East West East North North East West West West East North
Levels: West East North
>
> #生成因子级别:gl(n, k, labels) 生成含 n 个级别、每个级别连续重复 k 次的因子。
> v <- gl(3, 4, labels = c("Tampa", "Seattle","Boston"))
> print(v)
[1] Tampa Tampa Tampa Tampa Seattle Seattle Seattle Seattle Boston Boston Boston Boston
Levels: Tampa Seattle Boston
>
数据帧
> # Create the data frame.
> emp.data <- data.frame(
+ emp_id = c (1:5),
+ emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
+ salary = c(623.3,515.2,611.0,729.0,843.25),
+
+ start_date = as.Date(c("2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11",
+ "2015-03-27")),
+ stringsAsFactors = FALSE
+ )
> # Print the data frame.
> print(emp.data)
emp_id emp_name salary start_date
1 1 Rick 623.30 2012-01-01
2 2 Dan 515.20 2013-09-23
3 3 Michelle 611.00 2014-11-15
4 4 Ryan 729.00 2014-05-11
5 5 Gary 843.25 2015-03-27
> #获取数据帧的结构
> #通过使用str()函数可以看到数据帧的结构。
> str(emp.data)
'data.frame': 5 obs. of 4 variables:
$ emp_id : int 1 2 3 4 5
$ emp_name : chr "Rick" "Dan" "Michelle" "Ryan" ...
$ salary : num 623 515 611 729 843
$ start_date: Date, format: "2012-01-01" "2013-09-23" "2014-11-15" "2014-05-11" ...
> #数据框中的数据摘要
> #可以通过应用summary()函数获取数据的统计摘要和性质。
> summary(emp.data)
emp_id emp_name salary start_date
Min. :1 Length:5 Min. :515.2 Min. :2012-01-01
1st Qu.:2 Class :character 1st Qu.:611.0 1st Qu.:2013-09-23
Median :3 Mode :character Median :623.3 Median :2014-05-11
Mean :3 Mean :664.4 Mean :2014-01-14
3rd Qu.:4 3rd Qu.:729.0 3rd Qu.:2014-11-15
Max. :5 Max. :843.2 Max. :2015-03-27
> #从数据帧提取数据
> #使用列名称从数据框中提取特定列。
>
> #extract specific columns
> result <- data.frame(emp.data$emp_name,emp.data$start_date)
> print(result)
emp.data.emp_name emp.data.start_date
1 Rick 2012-01-01
2 Dan 2013-09-23
3 Michelle 2014-11-15
4 Ryan 2014-05-11
5 Gary 2015-03-27
> #extract first two columns (emp.data[1:2] selects columns, not rows)
> result <- emp.data[1:2]
> print(result)
emp_id emp_name
1 1 Rick
2 2 Dan
3 3 Michelle
4 4 Ryan
5 5 Gary
> #extract first two rows
> result <- emp.data[1:2,]
> print(result)
emp_id emp_name salary start_date
1 1 Rick 623.3 2012-01-01
2 2 Dan 515.2 2013-09-23
> # Create the data frame.
> emp.data <- data.frame(
+ emp_id = c (1:5),
+ emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
+ salary = c(623.3,515.2,611.0,729.0,843.25),
+
+ start_date = as.Date(c("2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11",
+ "2015-03-27")),
+ stringsAsFactors = FALSE
+ )
> #extract 3rd and 5th row with 2nd and 4th column.
> result <- emp.data[c(3,5),c(2,4)]
> print(result)
emp_name start_date
3 Michelle 2014-11-15
5 Gary 2015-03-27
> result <- emp.data[c(3,5),c(2)]
> print(result)
[1] "Michelle" "Gary"
> #扩展数据帧
> #可以通过添加列和行来扩展数据帧。
>
> #添加列
> #只需使用新的列名称添加列向量。
> # Create the data frame.
> emp.data <- data.frame(
+ emp_id = c (1:5),
+ emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
+ salary = c(623.3,515.2,611.0,729.0,843.25),
+
+ start_date = as.Date(c("2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11",
+ "2015-03-27")),
+ stringsAsFactors = FALSE
+ )
> print(emp.data)
emp_id emp_name salary start_date
1 1 Rick 623.30 2012-01-01
2 2 Dan 515.20 2013-09-23
3 3 Michelle 611.00 2014-11-15
4 4 Ryan 729.00 2014-05-11
5 5 Gary 843.25 2015-03-27
> #add the "dept" column
> emp.data$dept <- c("IT","Operations","IT","HR","Finance")
> v <-emp.data
> print(v)
emp_id emp_name salary start_date dept
1 1 Rick 623.30 2012-01-01 IT
2 2 Dan 515.20 2013-09-23 Operations
3 3 Michelle 611.00 2014-11-15 IT
4 4 Ryan 729.00 2014-05-11 HR
5 5 Gary 843.25 2015-03-27 Finance
> print(emp.data)
emp_id emp_name salary start_date dept
1 1 Rick 623.30 2012-01-01 IT
2 2 Dan 515.20 2013-09-23 Operations
3 3 Michelle 611.00 2014-11-15 IT
4 4 Ryan 729.00 2014-05-11 HR
5 5 Gary 843.25 2015-03-27 Finance
> #添加行
> #要将更多行永久添加到现有数据帧,我们需要引入与现有数据帧相同结构的新行,并使用rbind()函数。
> #在下面的示例中,我们创建一个包含新行的数据帧,并将其与现有数据帧合并以创建最终数据帧。
> # Create the first data frame.
> emp.data <- data.frame(
+ emp_id = c (1:5),
+ emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
+ salary = c(623.3,515.2,611.0,729.0,843.25),
+
+ start_date = as.Date(c("2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11",
+ "2015-03-27")),
+ dept = c("IT","Operations","IT","HR","Finance"),
+ stringsAsFactors = FALSE
+ )
> print(emp.data)
emp_id emp_name salary start_date dept
1 1 Rick 623.30 2012-01-01 IT
2 2 Dan 515.20 2013-09-23 Operations
3 3 Michelle 611.00 2014-11-15 IT
4 4 Ryan 729.00 2014-05-11 HR
5 5 Gary 843.25 2015-03-27 Finance
> # Create the second data frame
> emp.newdata <- data.frame(
+ emp_id = c (6:8),
+ emp_name = c("Rasmi","Pranab","Tusar"),
+ salary = c(578.0,722.5,632.8),
+ start_date = as.Date(c("2013-05-21","2013-07-30","2014-06-17")),
+ dept = c("IT","Operations","Finance"),
+ stringsAsFactors = FALSE
+ )
> print(emp.newdata)
emp_id emp_name salary start_date dept
1 6 Rasmi 578.0 2013-05-21 IT
2 7 Pranab 722.5 2013-07-30 Operations
3 8 Tusar 632.8 2014-06-17 Finance
> # Bind the two data frames.
> emp.finaldata <- rbind(emp.data,emp.newdata)
> print(emp.finaldata)
emp_id emp_name salary start_date dept
1 1 Rick 623.30 2012-01-01 IT
2 2 Dan 515.20 2013-09-23 Operations
3 3 Michelle 611.00 2014-11-15 IT
4 4 Ryan 729.00 2014-05-11 HR
5 5 Gary 843.25 2015-03-27 Finance
6 6 Rasmi 578.00 2013-05-21 IT
7 7 Pranab 722.50 2013-07-30 Operations
8 8 Tusar 632.80 2014-06-17 Finance
>
|
请发表评论