When I read the news I often glance at the comments underneath, and just as often find them empty talk, which only makes me want to get to the bottom of it. But my technical skills are crude; not knowing how dynamic web pages work, I could not get at the data. After some poking around I found that the 3G (mobile) pages can be read directly, to my quiet delight. I thereby collected 526 comments on Sina's 《中方反对美涉华军力报告》 ("China opposes the US report on Chinese military power", published 2012-5-20 02:25) and 457 comments on NetEase's 《中方反对美散布"中国威胁论"》 ("China opposes the US spreading the 'China threat theory'", published 2012-5-20 07:54).
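A minimal sketch of the trick, assuming nothing beyond RCurl (this snippet is not part of the appendix below): the 3G version of the article is served as plain HTML, so a single request returns readable text.

library(RCurl)
## The mobile (3G) article page is static HTML; this is the same article URL
## that appears in the appendix below.
page <- getURL("http://3g.163.com/ntes/12/0520/07/81UCOED600963VRO.html",
               .encoding = "UTF-8")
nchar(page)  # a non-trivial length shows the page came back as readable text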
Sina's users divide into mobile and non-mobile, and more than half of the common folk now read the news on their phones. NetEase shows each commenter's region: Guangdong (83), Zhejiang (42), Jiangsu (38), Beijing (33), Shanghai (30) and so on; the number of people in a region reading news online is positively correlated with its economy. And although NetEase published the story nearly five hours after Sina, the two user bases peaked at the same time. Enough idle talk; the charts bear witness:
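For anyone who wants to reproduce the tallies behind those charts, here is a rough sketch (not from the original write-up) built on the NetEase data frame that the appendix constructs; the "%m-%d %H:%M" time format assumed for RCtime is a guess about the page.

## Regional distribution of NetEase commenters
sort(table(NetEase$Zone), decreasing = TRUE)

## Comments per hour, for comparing the activity peak with Sina's;
## the time format of RCtime is an assumption
hr <- format(strptime(as.character(NetEase$RCtime), "%m-%d %H:%M"), "%H")
table(hr)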
As for the content, one can see just how righteously indignant the common folk were, how beside themselves with rage, how certain of their own words, and so on and so forth; readers may judge for themselves.
Although the mainstream sentiment on the two sites is the same, Sina users' wording is noticeably more concentrated, while NetEase users' wording is more scattered: their comments are not confined to this one story but wander into other topics, and among them are ranters, advertisers and other jianghu characters; fish and dragons mixed together, as the saying goes.
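The concentration claim can be checked with a quick word count. The sketch below is not the author's method; it assumes the comments are segmented with segmentCN() from the Rwordseg package, but any Chinese word segmenter would do.

library(Rwordseg)  # Chinese word segmentation; an assumed tool, not named in the post
words <- unlist(segmentCN(as.character(NetEase$Comtxt)))
freq <- sort(table(words), decreasing = TRUE)
head(freq, 20)               # the most frequent words
sum(freq[1:20]) / sum(freq)  # share held by the top 20 words: higher means more concentrated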
Appendix: fetching the NetEase data
library(RCurl)

# Article: http://3g.163.com/ntes/12/0520/07/81UCOED600963VRO.html
myH <- c(Host = "http://3g.163.com",
         `User-Agent` = "Mozilla/5.0 (Windows NT 5.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
         Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
         `Accept-Language` = "zh-cn,zh;q=0.5", `Accept-Encoding` = "gzip, deflate",
         `Accept-Charset` = "GB2312,utf-8;q=0.7,*;q=0.7", `Keep-Alive` = "115",
         Connection = "keep-alive", Referer = "http://3g.163.com",
         `Content-Type` = "application/x-www-form-urlencoded; charset=UTF-8")

# Curl handle with a debug gatherer and a cookie jar, reused for every request
d <- debugGatherer()
cH <- getCurlHandle(debugfunction = d$update, verbose = TRUE,
                    ssl.verifyhost = FALSE, ssl.verifypeer = FALSE,
                    followlocation = TRUE, cookiefile = "cc.txt")
getCurlInfo(cH)[["cookielist"]]

# Base URL of the paged comment list for this article
urlT <- "http://comment.3g.163.com/3g_bbs/81UCOED600963VRO_"

## Data frame holding each commenter's province, comment time and comment text;
## the column names must match those produced by cbind() inside the loop below
NetEase <- as.data.frame(matrix(NA, ncol = 3, nrow = 0))
names(NetEase) <- c("Zone", "RCtime", "Comtxt")

## "Floor splitting": for a comment that quotes earlier floors,
## keep only the newest part after the last ">"
delrep <- function(x) {
  x <- unlist(strsplit(x, ">"))
  y <- x[length(x)]
  return(y)
}

## Loop over the comment pages and collect each one
for (pg in 1:46) {
  www <- paste(urlT, pg, ".html", sep = "")
  txt <- postForm(www, httpheader = myH, curl = cH, style = "post", .encoding = "UTF-8")
  write(txt, "tmp.txt")
  txt <- readLines("tmp.txt")

  ## Keep only the lines carrying a comment block; the tag and class names used in
  ## this grep() and in the Comtxt grep() further down are assumptions about the
  ## 3g.163.com comment markup.
  txt <- txt[grep('div class="item"', txt)]
  # txt <- gsub('<[^>]*|>| ', '', txt)

  ## Commenter region, wrapped in <span class="name">...</span>
  Zone <- txt[grep('span class="name"', txt)]
  Zone <- gsub('<span class="name">|</span>| ', '', Zone)

  ## Comment time, wrapped in <span class="time">...</span>; also drop the word "发表"
  RCtime <- txt[grep('span class="time"', txt)]
  RCtime <- gsub('<span class="time">|</span>|发表| ', '', RCtime)

  ## Comment text (class name assumed, see above); the last two matches on each
  ## page are not comments, so they are dropped
  Comtxt <- txt[grep('div class="text"', txt)]
  Comtxt <- Comtxt[1:(length(Comtxt) - 2)]
  Comtxt <- gsub('<[^>]*| ', '', Comtxt)
  Comtxt <- gsub('>>>>|\\([^\\)]*', '', Comtxt)

  ## Floor splitting: "原贴" marks a quoted original post; keep only the reply itself
  x1 <- Comtxt[grep('原贴', Comtxt)]
  x1 <- mapply(delrep, x1)
  Comtxt[grep('原贴', Comtxt)] <- x1
  Comtxt <- gsub('>', '', Comtxt)

  temp <- cbind(Zone, RCtime, Comtxt)
  cat("Read page", pg, "\n")
  NetEase <- rbind(NetEase, temp)
}
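Once the 46 pages have been read, a small optional finishing step (not in the original) is to sanity-check the row count against the 457 comments quoted above and dump the table to disk for the analyses sketched earlier.

nrow(NetEase)  # compare with the 457 comments mentioned above
write.csv(NetEase, "NetEase_comments.csv", row.names = FALSE)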