R语言处理的数据一般从外部导入,因此需要数据接口来读取各种格式化的数据
CSV
# Load the CSV file; read.csv() returns a data frame.
data <- read.csv("input.csv")
# Keep only the rows that satisfy both conditions, much like a SQL WHERE
# clause. which() drops rows where the condition evaluates to NA.
retval <- data[which(data[["dept"]] == "IT" & data[["salary"]] > 600), ,
               drop = FALSE]
print(retval)
# Write the filtered rows back out.
# row.names = FALSE suppresses the extra row-number column.
write.csv(retval, file = "output.csv", row.names = FALSE)
Excel
安装xlsx包时,需要依赖rJava和xlsxjars这两个包
安装完成之后,还要建立R和Java的连接
# Re-run R's Java configuration step so that R (and rJava) can locate the
# installed Java; executed via sudo.
sudo R CMD javareconf
配置完后在terminal中运行没有问题,但在Rgui和RStudio中均报错
dlopen(/Library/Frameworks/R.framework/Versions/3.4/Resources/library/rJava/libs/rJava.so, 6): Library not loaded: @rpath/libjvm.dylib
Referenced from: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/rJava/libs/rJava.so
Reason: image not found
博主也尚未找到原因
因为Excel效果跟CSV一致,建议使用CSV
# Must be run from an R session started in a terminal.
library(xlsx)
# Read the first worksheet of the workbook.
data <- read.xlsx(file = "data.xlsx", sheetIndex = 1)
print(data)
Binary
二进制文件不能结构化存储,但不容易被查看,也是一种数据接口(虽然博主从没用过)
# Round-trip a character vector through a binary file.
# The data to store: the column names of input.csv.
writeData <- colnames(read.csv("input.csv"))
print(writeData)
# Open a connection to data.dat for binary writing.
writeFile <- file("data.dat", "wb")
# writeBin() only accepts vector objects.
writeBin(writeData, writeFile)
close(writeFile)
# Reopen the same file for binary reading.
readFile <- file("data.dat", "rb")
# `n` is the maximum number of elements (here: strings) to read, not a byte
# count, so request exactly as many strings as were written instead of a
# hard-coded limit that would silently truncate wider inputs.
# Passing character() makes readBin() decode the contents as strings.
readData <- readBin(readFile, character(), n = length(writeData))
print(readData)
close(readFile)
[1] "id" "name" "salary" "start_date" "dept"
[1] "id" "name" "salary" "start_date" "dept"
XML
library("XML")
# Show the first three top-level elements of the document as a nested list.
xmlToList("data.xml")[seq_len(3)]
# Convert the document to a data frame.
xmlToDataFrame("data.xml")
$EMPLOYEE
$EMPLOYEE$ID
[1] "1"
$EMPLOYEE$NAME
[1] "Rick"
$EMPLOYEE$SALARY
[1] "623.3"
$EMPLOYEE$STARTDATE
[1] "1/1/2012"
$EMPLOYEE$DEPT
[1] "IT"
$EMPLOYEE
$EMPLOYEE$ID
[1] "2"
$EMPLOYEE$NAME
[1] "Dan"
$EMPLOYEE$SALARY
[1] "515.2"
$EMPLOYEE$STARTDATE
[1] "9/23/2013"
$EMPLOYEE$DEPT
[1] "Operations"
$EMPLOYEE
$EMPLOYEE$ID
[1] "3"
$EMPLOYEE$NAME
[1] "Michelle"
$EMPLOYEE$SALARY
[1] "611"
$EMPLOYEE$STARTDATE
[1] "11/15/2014"
$EMPLOYEE$DEPT
[1] "IT"
ID NAME SALARY STARTDATE DEPT
1 1 Rick 623.3 1/1/2012 IT
2 2 Dan 515.2 9/23/2013 Operations
3 3 Michelle 611 11/15/2014 IT
4 4 Ryan 729 5/11/2014 HR
5 5 Gary 843.25 3/27/2015 Finance
6 6 Nina 578 5/21/2013 IT
7 7 Simon 632.8 7/30/2013 Operations
8 8 Guru 722.5 6/17/2014 Finance
JSON
library("rjson")
# Parse data.json into an R object.
result <- fromJSON(file = "data.json")
# Display the parsed object as returned by fromJSON().
result
# Display the same data converted to a data frame.
as.data.frame(result)
$ID
[1] "1" "2" "3" "4" "5" "6" "7" "8"
$Name
[1] "Rick" "Dan" "Michelle" "Ryan" "Gary" "Nina"
[7] "Simon" "Guru"
$Salary
[1] "623.3" "515.2" "611" "729" "843.25" "578" "632.8"
[8] "722.5"
$StartDate
[1] "1/1/2012" "9/23/2013" "11/15/2014" "5/11/2014" "3/27/2015"
[6] "5/21/2013" "7/30/2013" "6/17/2014"
$Dept
[1] "IT" "Operations" "IT" "HR" "Finance"
[6] "IT" "Operations" "Finance"
ID Name Salary StartDate Dept
1 1 Rick 623.3 1/1/2012 IT
2 2 Dan 515.2 9/23/2013 Operations
3 3 Michelle 611 11/15/2014 IT
4 4 Ryan 729 5/11/2014 HR
5 5 Gary 843.25 3/27/2015 Finance
6 6 Nina 578 5/21/2013 IT
7 7 Simon 632.8 7/30/2013 Operations
8 8 Guru 722.5 6/17/2014 Finance
Web数据
web中的数据就是要做网络爬虫了,这个以后单独讨论。
数据库
R语言可以连接数据库,连接方式与其他编程语言非常相似,下面以MySQL为例介绍
library(RMySQL)
# Open a connection handle to the MySQL database.
con <- dbConnect(
  MySQL(),
  user = "user",
  password = "password",
  dbname = "dbname",
  host = "host"
)
# Use utf8 for the session to avoid garbled Chinese text.
# The statement yields no rows, so its result set is released immediately
# rather than being left open on the connection.
invisible(dbClearResult(dbSendQuery(con, "SET NAMES utf8")))
# List the tables in the database.
dbListTables(con)
# Run the query; `result` is a handle to the pending result set.
result <- dbSendQuery(con, "select * from card")
# Fetch all remaining rows as a data frame (n = -1 means "everything").
df <- dbFetch(result, n = -1)
# Release the result set once its rows have been retrieved.
invisible(dbClearResult(result))
print(head(df))
# Close the connection so it is not leaked.
invisible(dbDisconnect(con))
card_ID card_name card_workplace card_type
1 CD01 张敏 浙江大学 教师
2 CD02 刘鹏飞 浙江大学 学生
3 CD03 李想 杭州电子科技大学 职工
4 CD04 李四 南京大学 学生
5 CD05 李楠 腾讯 社会人士、
6 CD09 张三 阿里巴巴 社会人士
小结
博主认为,数据接口的优点就是把采集数据和分析数据的过程解耦合,让每个模块处理擅长的问题。
|
请发表评论