数据读取与包的加载
由于最近对文本分析比较感兴趣,所以本文以《三国演义》(白话文版)的文本为例进行分析。
# Session setup: working directory and package attachment.
# Print the current working directory (interactive sanity check only).
getwd()
# Switch to the project folder.
# NOTE(review): absolute Windows path; the script is not portable as written.
setwd("E:\\三国")
# jiebaR supplies worker(), used below to build the segmenter.
library(jiebaR)
# ggplot2 draws the bar chart at the end of the script.
library(ggplot2)
# NOTE(review): jpeg, reshape2 and wordcloud are not used in the part of the
# script visible here -- presumably needed elsewhere; confirm before removing.
library(jpeg)
library(reshape2)
library(wordcloud)
读取前十行
> sanguo <- readLines("E:/三国/三国演义白话文版.txt")# Read the corpus into a character vector, one element per line
> sanguo[1:10]# Show the first ten lines
[1] "三国演义明?罗贯中"
[2] "致读者"
[3] " “大江东去,浪淘尽,千古风流人物……”北宋大文学家苏东坡的一首《念奴娇.赤壁》"
[4] " ,仅用百字,就生动地使三国英雄的形象跃然纸上,再现了三国时火烧赤壁的悲壮惨烈、波澜"
[5] " 壮阔的战争场面。"
[6] " 《三国演义》是中国历史上继《水浒传》之后的又一部伟大的现实主义巨著,是中国古典"
[7] " 文学宝库中的又一灿烂的瑰宝,波澜壮阔,气象万千。《三国演义》全名《三国志通俗演义"
[8] " 》,取材于三国时近百年的历史事实,经作者进行了文学创作,终成一部浩瀚的鸿篇巨帙,"
[9] " 流传至今,脍炙人口。东汉末年是诸侯割据、天下大乱的年代,英雄造时势,时势出英雄,"
[10] " 政"
分词处理
# Build a jiebaR segmenter and tokenise every line of `sanguo`.
# NOTE(review): `dictpath` and `stoppath` are assigned here, but the worker()
# call below passes hard-coded absolute paths instead -- confirm which is meant.
> dictpath <- "三国停词.txt"
> stoppath <- "stopwords.dat" # File name of the stop-word list
> cutter <- worker(user="E:/三国/三国停词.txt", bylines = TRUE, stop_word="E:/三国/stopwords.dat")# Create the segmenter with a user dictionary and a stop-word list
res <- cutter[sanguo]# Segment the text; the result is a list with one token vector per input line
> head(res)# Show the first six elements
[[1]]
[1] "三国演义" "明" "罗贯中"
[[2]]
[1] "读者"
[[3]]
[1] " " "大江东去" "浪淘尽" "千古" "风流人物" "北宋"
[7] "文学家" "苏东坡" "一首" "念奴娇" "赤壁"
[[4]]
[1] " " " " "仅用" "百字" "生动" "地使"
[7] "三国" "英雄" "形象" "跃然纸上" "再现" "三国"
[13] "时" "火烧" "赤壁" "悲壮" "惨烈" "波澜"
[[5]]
[1] " " " " "壮阔" "战争场面"
[[6]]
[1] " " " " "三国演义" "中国" "历史" "上继"
[7] "水浒传" "一部" "现实主义" "巨著" "中国" "古典"
词频统计
> text <- unlist(res)# Flatten the per-line token lists into a single vector
> freq <- data.frame(table(text))# Tabulate token counts into a data frame (columns `text` and `Freq`)
> freq <- freq[nchar(as.character(freq$text))>=2,]# Keep only tokens that are at least two characters long
> freq <- freq[order(-freq$Freq),]# Sort by frequency, highest first
> str(freq)# Inspect the storage types and structure of `freq`
> head(freq)# Show the first six rows
text Freq
1176 曹操 2430
6792 刘备 2266
6130 孔明 1794
3936 关公 862
7023 吕布 692
14075 张飞 602
描绘词频图
# Count how many tokens mention each main character and plot the totals.
# Uses `text`, the character vector built earlier in the script.

# One regular expression per character; the alternatives are the names and
# aliases that should all be attributed to that character.
roles <- c(
  "曹操|孟德|阿瞒",
  "刘备|玄德|刘玄德",
  "孔明|诸葛亮|卧龙|诸葛孔明",
  "关公|关羽|云长|关云长|长生",
  "吕布|奉先|飞将军",
  "翼德|张飞"
)
# Display label for each pattern, in the same order as `roles`.
role_name <- c("曹操", "刘备", "孔明", "关公", "吕布", "张飞")
stopifnot(length(roles) == length(role_name))

# Logical matrix: one row per element of `text`, one column per character,
# TRUE where the element matches that character's pattern.
# vapply() fixes the result type; sapply() would return a list for empty input.
role_paras <- vapply(
  roles,
  function(pattern) grepl(pattern, text),
  FUN.VALUE = logical(length(text)),
  USE.NAMES = FALSE
)
# vapply() collapses to a plain vector when `text` has exactly one element,
# so force the matrix shape and attach the column labels explicitly.
role_paras <- matrix(
  role_paras,
  ncol = length(roles),
  dimnames = list(NULL, role_name)
)

# Matches per character; factor levels follow `role_name` so the bars keep
# that order on the x axis instead of being sorted alphabetically.
role_counts <- data.frame(
  role = factor(role_name, levels = role_name),
  count = colSums(role_paras)
)

# Repeated here so this section can be run on its own.
library(ggplot2)
ggplot(role_counts, aes(x = role, y = count, fill = role)) +
  geom_bar(stat = "identity", width = 0.75) +
  xlab("人物") +
  ylab("频数")
|
请发表评论