《R语言》学习笔记（第6篇）

OStack程序员社区-中国程序员成长平台 › 门户 › 编程› R语言›R语言教程

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本节讲“因子”和“数据帧”

> #因子
> # Create a vector as input.
> data <- c("East","West","East","North","North","East","West","West","West","East","North")

> print(data)
 [1] "East"  "West"  "East"  "North" "North" "East"  "West"  "West"  "West"  "East"  "North"
> print(is.factor(data))
[1] FALSE
> print(is.vector(data))
[1] TRUE


> # Apply the factor function.
> factor_data <- factor(data)
> print(factor_data)
 [1] East  West  East  North North East  West  West  West  East  North
Levels: East North West

> print(is.factor(factor_data))
[1] TRUE


> #数据帧的因子
> #在创建具有文本数据列的任何数据框时，R语言（4.0.0之前的版本默认）将文本列视为分类数据并在其上创建因子；自R 4.0.0起stringsAsFactors默认为FALSE，需显式设置stringsAsFactors = TRUE才会得到下面的结果。
> # Create the vectors for data frame.
> height <- c(132,151,162,139,166,147,122)
> weight <- c(48,49,66,53,67,52,40)
> gender <- c("male","male","female","female","male","female","male")
> 
> #CREATE THE DATA FRAME
> input_data <- data.frame(height,weight,gender)
> print(input_data)
  height weight gender
1    132     48   male
2    151     49   male
3    162     66 female
4    139     53 female
5    166     67   male
6    147     52 female
7    122     40   male
> 
> #test if gender is a factor
> print(is.factor(input_data$gender))
[1] TRUE
> 
> #print the gender column to see the levels
> print(input_data$gender)
[1] male   male   female female male   female male  
Levels: female male


> #更改级别顺序
> #可以按新的级别顺序再次调用factor()函数，从而改变因子中级别的顺序。
> 
> data <- c("East","West","East","North","North","East","West","West","West","East","North")
> # Create the factors
> factor_data <- factor(data)
> print(factor_data)
 [1] East  West  East  North North East  West  West  West  East  North
Levels: East North West

> # Apply the factor function with required order of the level.
> new_order_data <-factor(factor_data,levels = c ("West","East","North"))
> print(new_order_data)
 [1] East  West  East  North North East  West  West  West  East  North
Levels: West East North
>

> #生成因子级别：gl(n, k, labels)生成一个含n个级别、每个级别连续重复k次的因子，labels指定各级别的名称。
> v <- gl(3, 4, labels = c("Tampa", "Seattle","Boston"))
> print(v)
 [1] Tampa   Tampa   Tampa   Tampa   Seattle Seattle Seattle Seattle Boston  Boston  Boston  Boston 
Levels: Tampa Seattle Boston
>

数据帧

> # Create the data frame.
> emp.data <- data.frame(
+    emp_id = c (1:5), 
+    emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
+    salary = c(623.3,515.2,611.0,729.0,843.25), 
+    
+    start_date = as.Date(c("2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11",
+       "2015-03-27")),
+    stringsAsFactors = FALSE
+ )

> # Print the data frame.
> print(emp.data) 
  emp_id emp_name salary start_date
1      1     Rick 623.30 2012-01-01
2      2      Dan 515.20 2013-09-23
3      3 Michelle 611.00 2014-11-15
4      4     Ryan 729.00 2014-05-11
5      5     Gary 843.25 2015-03-27


> #获取数据帧的结构
> #通过使用str()函数可以看到数据帧的结构。
> str(emp.data)
'data.frame':   5 obs. of  4 variables:
 $ emp_id    : int  1 2 3 4 5
 $ emp_name  : chr  "Rick" "Dan" "Michelle" "Ryan" ...
 $ salary    : num  623 515 611 729 843
 $ start_date: Date, format: "2012-01-01" "2013-09-23" "2014-11-15" "2014-05-11" ...


> #数据框中的数据摘要
> #可以通过应用summary()函数获取数据的统计摘要和性质。
> summary(emp.data)
     emp_id    emp_name             salary        start_date        
 Min.   :1   Length:5           Min.   :515.2   Min.   :2012-01-01  
 1st Qu.:2   Class :character   1st Qu.:611.0   1st Qu.:2013-09-23  
 Median :3   Mode  :character   Median :623.3   Median :2014-05-11  
 Mean   :3                      Mean   :664.4   Mean   :2014-01-14  
 3rd Qu.:4                      3rd Qu.:729.0   3rd Qu.:2014-11-15  
 Max.   :5                      Max.   :843.2   Max.   :2015-03-27  


> #从数据帧提取数据
> #使用列名称从数据框中提取特定列。
> 
> #extract specific columns
>  result <- data.frame(emp.data$emp_name,emp.data$start_date)
> print(result)
  emp.data.emp_name emp.data.start_date
1              Rick          2012-01-01
2               Dan          2013-09-23
3          Michelle          2014-11-15
4              Ryan          2014-05-11
5              Gary          2015-03-27


> #extract first two columns (emp.data[1:2] selects columns; use emp.data[1:2,] for the first two rows)
> result <- emp.data[1:2]
> print(result)
  emp_id emp_name
1      1     Rick
2      2      Dan
3      3 Michelle
4      4     Ryan
5      5     Gary

> result <- emp.data[1:2,]
> print(result)
  emp_id emp_name salary start_date
1      1     Rick  623.3 2012-01-01
2      2      Dan  515.2 2013-09-23


> # Create the data frame.
> emp.data <- data.frame(
+    emp_id = c (1:5), 
+    emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
+    salary = c(623.3,515.2,611.0,729.0,843.25), 
+    
+ start_date = as.Date(c("2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11",
+       "2015-03-27")),
+    stringsAsFactors = FALSE
+ )


> #extract 3rd and 5th row with 2nd and 4th column.
> result <- emp.data[c(3,5),c(2,4)]
> print(result)
  emp_name start_date
3 Michelle 2014-11-15
5     Gary 2015-03-27
> result <- emp.data[c(3,5),c(2)]
> print(result)
[1] "Michelle" "Gary"    


> #扩展数据帧
> #可以通过添加列和行来扩展数据帧。
> 
> #添加列
> #只需使用新的列名称添加列向量。
> # Create the data frame.
> emp.data <- data.frame(
+    emp_id = c (1:5), 
+    emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
+    salary = c(623.3,515.2,611.0,729.0,843.25), 
+    
+    start_date = as.Date(c("2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11",
+       "2015-03-27")),
+    stringsAsFactors = FALSE
+ )

> print(emp.data)
  emp_id emp_name salary start_date
1      1     Rick 623.30 2012-01-01
2      2      Dan 515.20 2013-09-23
3      3 Michelle 611.00 2014-11-15
4      4     Ryan 729.00 2014-05-11
5      5     Gary 843.25 2015-03-27

> #add the "dept" column
> emp.data$dept <- c("IT","Operations","IT","HR","Finance")
> v <-emp.data 

> print(v)
  emp_id emp_name salary start_date       dept
1      1     Rick 623.30 2012-01-01         IT
2      2      Dan 515.20 2013-09-23 Operations
3      3 Michelle 611.00 2014-11-15         IT
4      4     Ryan 729.00 2014-05-11         HR
5      5     Gary 843.25 2015-03-27    Finance

> print(emp.data)
  emp_id emp_name salary start_date       dept
1      1     Rick 623.30 2012-01-01         IT
2      2      Dan 515.20 2013-09-23 Operations
3      3 Michelle 611.00 2014-11-15         IT
4      4     Ryan 729.00 2014-05-11         HR
5      5     Gary 843.25 2015-03-27    Finance


> #添加行
> #要将更多行永久添加到现有数据帧，我们需要引入与现有数据帧相同结构的新行，并使用rbind()函数。
> #在下面的示例中，我们创建一个包含新行的数据帧，并将其与现有数据帧合并以创建最终数据帧。
> # Create the first data frame.
> emp.data <- data.frame(
+    emp_id = c (1:5), 
+    emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
+    salary = c(623.3,515.2,611.0,729.0,843.25), 
+    
+    start_date = as.Date(c("2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11",
+       "2015-03-27")),
+    dept = c("IT","Operations","IT","HR","Finance"),
+    stringsAsFactors = FALSE
+ )

> print(emp.data)
  emp_id emp_name salary start_date       dept
1      1     Rick 623.30 2012-01-01         IT
2      2      Dan 515.20 2013-09-23 Operations
3      3 Michelle 611.00 2014-11-15         IT
4      4     Ryan 729.00 2014-05-11         HR
5      5     Gary 843.25 2015-03-27    Finance


> # Create the second data frame
> emp.newdata <- data.frame(
+    emp_id = c (6:8), 
+    emp_name = c("Rasmi","Pranab","Tusar"),
+    salary = c(578.0,722.5,632.8), 
+    start_date = as.Date(c("2013-05-21","2013-07-30","2014-06-17")),
+    dept = c("IT","Operations","Fianance"),
+    stringsAsFactors = FALSE
+ )

> print(emp.newdata)
  emp_id emp_name salary start_date       dept
1      6    Rasmi  578.0 2013-05-21         IT
2      7   Pranab  722.5 2013-07-30 Operations
3      8    Tusar  632.8 2014-06-17   Fianance


> # Bind the two data frames.
> emp.finaldata <- rbind(emp.data,emp.newdata)

> print(emp.finaldata)
  emp_id emp_name salary start_date       dept
1      1     Rick 623.30 2012-01-01         IT
2      2      Dan 515.20 2013-09-23 Operations
3      3 Michelle 611.00 2014-11-15         IT
4      4     Ryan 729.00 2014-05-11         HR
5      5     Gary 843.25 2015-03-27    Finance
6      6    Rasmi 578.00 2013-05-21         IT
7      7   Pranab 722.50 2013-07-30 Operations
8      8    Tusar 632.80 2014-06-17   Fianance
>