#写在前面的话:此教程主要是用R连接了DB2数据库,并进行文本分析,制作了词图
#教程为markdown编写
---
title: "网站留言分析"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, error = FALSE, warning = FALSE,
                      message = FALSE, comment = "")
```

```{r}
# Connect to the DB2 database and read the comment text (column 4).
library(RODBC)
options(scipen = 200)
# FIX: pass the password through the named `pwd` argument.  The original
# relied on "1" falling positionally into `pwd`, which is fragile and
# unreadable.
conn <- odbcConnect(dsn = "DB2", uid = "tjfx", pwd = "1")
myfile <- sqlQuery(conn, "select * from T_from_last_year_liuyan")
close(conn)  # release the ODBC handle once the query is done
myfile <- myfile[, 4]
# head(myfile)

# Word-segmentation libraries.
library("rJava")
library("Rwordseg")
# Word-cloud plotting libraries.
library("RColorBrewer")
library("wordcloud")

# Pre-processing: drop blank records so segmentation has real text to chew on.
myfile.res <- myfile[myfile != " "]
```

```{r, echo=FALSE, error=FALSE, warning=FALSE, message=FALSE, comment="", results='hide'}
# Load the custom user dictionary into memory.
installDict("ciku.txt", dictname = "my dict")
```

```{r}
# Inspect loaded dictionaries with listDict(); remove one with uninstallDict().
myfile.res <- as.character(myfile.res)

# Segment every record and flatten the results into one character vector.
myfile.words <- unlist(lapply(X = myfile.res, FUN = segmentCN))

# Strip URLs, newlines and spaces; add further gsub() calls here for any
# other unwanted patterns.
myfile.words <- gsub(pattern = "http:[a-zA-Z\\/\\.0-9]+", "", myfile.words)
myfile.words <- gsub("\n", "", myfile.words)
myfile.words <- gsub(" ", "", myfile.words)

# Remove stop words.  Read the list as a plain character column and filter
# with a single vectorized %in% test -- replaces the original pair of loops
# (one growing the vector with c(), one subset() per stop word).
data_stw <- read.table(file = "mystopword.txt", colClasses = "character")
stopwords_CN <- data_stw[[1]]
myfile.words <- myfile.words[!(myfile.words %in% stopwords_CN)]

# Drop single-character tokens.
myfile.words <- myfile.words[nchar(myfile.words) > 1]

# Count word frequencies, most frequent first.
myfile.freq <- sort(table(myfile.words), decreasing = TRUE)
# BUG FIX: the original passed the table object itself as `freq`, which made
# data.frame() expand it into columns `freq.Var1` / `freq.Freq`.  The later
# `myfile.freq$freq` then partial-matched ambiguously to NULL, so the
# frequency filter silently dropped every row.  Coerce the counts to integer
# so the data frame really has a `freq` column.
myfile.freq <- data.frame(word = names(myfile.freq),
                          freq = as.integer(myfile.freq),
                          stringsAsFactors = FALSE)

# Keep words that occur at least 5 times; adjust the threshold as needed.
myfile.freq2 <- subset(myfile.freq, freq >= 5)

# Draw the word cloud.
mycolors <- brewer.pal(8, "Dark2")              # colour palette
windowsFonts(myFont = windowsFont("微软雅黑"))  # font that renders Chinese
set.seed(123)                                   # reproducible layout
wordcloud(myfile.freq2$word, myfile.freq2$freq,
          random.order = FALSE, random.color = FALSE,
          colors = mycolors, family = "myFont")

# Emit the frequency table (no row names, no quoting).
write.table(myfile.freq2, row.names = FALSE, quote = FALSE)
```
#结果图:
输出的词频文档删除了行号,详见代码: