IT博客汇 | 链接：人人网关系数据获取

链接：人人网关系数据获取

qxde01发表于 2013-06-13 23:35:28

注：本文完成于2012-10-23，一直在草稿箱里，今天（2013-06-13）发出，代码不知是否还能运行。

现在社交网络分析非常热火，本人也有极大的兴趣，就以人人网为例，学习社交网络分析的过程。

首先是数据获取问题，关于人人网的数据获取可以参考这里的R程序：包括登录、好友获取、好友状态及网络图绘制等。本文主要引用了其中的登录程序。

获取好友数据主要有四个函数：

renren_login 登录函数
renren_detail 获取好友信息函数（只支持部分好友）
renren_friend 获取用户的所有好友的ID
friend_detail 调用 renren_detail ，批量获取好友信息

详细代码如下：

renren_login=function(name="****",pwd="******"){
##登录函数
library(RCurl)
memory.limit(4000)
myH=c(
"User-Agent"="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12",
"Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language"="zh-cn,zh;q=0.5",
"Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7",
"Keep-Alive"="115",
"Connection"="keep-alive",
"Content-Type"="application/x-www-form-urlencoded; charset=UTF-8",
"Pragma"="no-cache",
"Cache-Control"="no-cache")
d=debugGatherer()
cH=getCurlHandle(debugfunction=d$update,verbose=T,
ssl.verifyhost=F,ssl.verifypeer=F,followlocation=T,cookiefile="cc.txt")
pinfo=c("email"=name, "password"=pwd,
"origURL"="http://www.renren.com/Home.do","domain"="renren.com")
pinfo=iconv(pinfo,from="GB18030",to="UTF-8")
ttt=postForm("http://passport.renren.com/PLogin.do",httpheader=myH,.params=pinfo,curl=cH,style="post")
getCurlInfo(cH)[["cookielist"]]
h=getURL("http://www.renren.com",curl=cH,.encoding="gbk")
write(h,"temp.txt")
hh=readLines("temp.txt",encoding="UTF-8")
# file.remove("temp.txt")
rm(h)
hh=hh[grep("user : ",hh)]
hh=gsub("[^0-9]","",hh)
return(list(curl=cH,userid=hh))
}
renren_detail<-function(curl,userid){
##根据用户的ID获取用户的一些基本信息，这个函数虽短，但BUG比行数最多，因很多用户人人的
##页面是个性化设置的，此函数也只能获取部分用户的信息。
Myurl<-paste('http://www.renren.com/',userid,'/profile',sep='')
h=getURL(Myurl,curl=curl,.encoding="gbk")
write(h,'h.txt')
h=readLines("h.txt",encoding="UTF-8")
user=unlist(strsplit(h[grep('',h)],'-'))[2]
hi<-grep('profile-summary',h)[1]
if(length(hi)==0)hi<-grep('profile-information',h)[1]
if(length(hi)==0|is.na(hi))hi=10
h1<-h[seq(hi-1,hi+40)]
h2<-gsub('<[^]*>|\t| ', '', h1)
h3<-h2[nchar(h2)>0]
school<-h3[grep('所在',h3)+1]
if(length(school)==0)school=NA
birthday<-h3[grep('生　　日:',h3)+1]
if(length(birthday)==0)birthday=NA
constellation<-h3[grep('星　　座:',h3)+1]
if(length(constellation)==0)constellation=NA
hometown<-h3[grep('家　　乡:',h3)+1]
if(length(hometown)==0)hometown=NA
level<-h3[grep('等　　级:',h3)+1]
if(length(level)==0)level=NA
cbind(user=user,userid=userid,school=school,birthday=birthday,constellation=constellation, hometown=hometown,level=level)
}
renren_friend<-function(curl,userid){
##根据用户的ID获取该用户的所有好友的ID
myurl=paste("http://friend.renren.com/GetFriendList.do?curpage=0&id;=",userid,sep="")
h=getURL(myurl,curl=curl,.encoding="gbk")
write(h,'h.txt')
h=readLines("h.txt",encoding="UTF-8")
##好友数量
hcount<-as.integer(gsub('[^0-9]','',h[grep('',h)]))
pgs=ceiling(hcount/20)
##获取好友ID,第一页
cat('正在获取',1,'页\n')
h1<-h[grep('http://www.renren.com/profile.do',h)]
h2<-h1[grep('avatar',h1)]
strsplit(h2,'')
h3<-gsub(']*.>|
|
, '', h2)
hid<-gsub('[^0-9]','',h3)
if(pgs>1){
for(pg in 1:(pgs-1)){
cat('正在获取',pg+1,'页\n')
myurl=paste("http://friend.renren.com/GetFriendList.do?curpage=",pg,"&id;=",userid,sep="")
h=getURL(myurl,curl=curl,.encoding="gbk")
write(h,'h.txt')
h=readLines("h.txt",encoding="UTF-8")
##获取好友ID,第2-pgs页
h1<-h[grep('http://www.renren.com/profile.do',h)]
h2<-h1[grep('avatar',h1)]
strsplit(h2,'')
h3<-gsub(']*.>|
|
, '', h2)
hid0<-gsub('[^0-9]','',h3)
hid<-c(hid,hid0)
}
}
return(hid)
}
friend_detail<-function(curl,hid){
##根据好友ID列表，调用renren_detail批量获取好友的一些信息
n=length(hid)
x<-c()
for(i in 1:n){
cat('正在获取第',i,'个',hid[i],'\n')
x1<-renren_detail(curl,hid[i])
x<-rbind(x,x1)
}
x
}

好友数据获取过程如下，txt文件保存userid的好友ID及userid好友的好友ID，csv文件保存userid的好友基本信息及userid好友的好友基本信息。

ch=renren_login(name='***@***',pwd='******')
curl=ch$curl
userid=ch$userid
hids<-renren_friend(curl,userid=userid)
write(hids,file=paste(userid,'.txt',sep=''))
write.csv(d1,file=paste(userid,'.csv',sep=''),row.names=F)
n=length(hids)
for(i in 1:n){
hisid<-renren_friend(curl,userid=hids[i])
write(hisid,file=paste(hids[i],'.txt',sep=''))
d<-friend_detail(curl,hisid)
write.csv(d,file=paste(hids[i],'.csv',sep=''),row.names=F)
}

将所有csv文件保存至同一文件夹下，将所有csv文件合并:

setwd('D:/renren/csv/')
csvnames<-dir()
csvID<-gsub('[^0-9]','',csvnames)
n<-length(csvnames)
MyF<-c()
for(i in 1:n){
csv1<-read.csv(csvnames[i])
nr<-nrow(csv1)
MFID<-rep(csvID[i],nr)
mfrela<-cbind(MFID=MFID,csv1)
MyF<-rbind(MyF,mfrela)
}
write.csv(MyF,file='myfriends.csv',row.names=F)

此时得到了一个好友数据集（MyF），共16042个好友ID，23727好友对，其中我的好友117个，我的好友中最多好友数998个，最少10个,平均203个，中位数164，其直方图有一个长长的尾线，大部分好友的好友数在50--300之间，占75.21%。其中前64位（37.65%）的好友数占80%，与幂律分布的二八定律还差一定距离。

fc<-sort(table(MyF[,1]),decreasing=T)

hist(fc, breaks = 12,freq = F, col=5, border="red",ylim=c(0,0.0035),xlab='好友数',main='好友数直方图')

lines(density(fc),col='blue')

其实在数据集中我的好友及我的好友之间的关系只占很少一部分，将它们提取出来，共有2600好友对。

hids<-readLines('MyfriendId.txt') ##包含自己的ID

onlymy<-MyF[MyF[,3] %in% hids,]

数据集MyF和onlymy如果作为无向图，都有重复的好友对出现，如好友对A--B和B--A实际上是一对好友，造成了数据冗余。使用package:sqldf中的sqldf函数，利用SQL语句进行剔除（这种办法效率比较高），提出1299行数据，剩余22428（myfno）行。

library(sqldf)
ID<-1:nrow(MyF)
myfs<-cbind(ID,MyF)
same<-sqldf('select A.* from myfs A,myfs B
where A.MFID =B.userid and A.userid=B.MFID and A.ID>B.ID')
sid<-same$ID
myfno<-MyF[-sid,]
write.csv(myfno,'myfirends0.csv',row.names=F)

目前得到四个数据集：

MyF，23727对好友，部分是双向关系；
myfno，22428对好友，只有单向关系；
onlymy，2600对好友，只有我的好友及他们之间的关系对。
friendinfo，16042个，好友的基本信息，可作为图的边界。（因数据不是在同一时间段获取，好友信息有变更，要去除重复ID）

MyF<-read.csv('myfriends.csv')
myfno<-read.csv('myfirends0.csv')
onlymy<-read.csv('myonlyfriend.csv')
friendinfo<-read.csv('friendinfo.csv')

另外基本信息中星座中的牧羊座、山羊座统一改为白羊座。获得星座信息的好友数5128个，不足1/3，获得家乡信息的6442个，约40%。

cons<-as.character(friendinfo$constellation)
x<-sort(table(cons),decreasing=T)
a<-x/sum(x)*100
pie(x,labels = paste(names(x),format(a,digits=3),'%',sep=''),col=rainbow(12))

对8307个姓名统计，前15个姓氏占51.25%，前30个占63.81%，前52个占74.14%，频率最高的前30个姓氏分别为"王" 、"张"、 "李" 、"刘" 、"陈"、 "杨"、 "赵" 、"周"、"郭" 、"秦"、 "孙" 、"吴"、 "朱" 、"马" 、"黄"、 "徐"、 "胡"、 "韩" 、"高" 、"宋" 、"许"、 "冯"、 "梁"、 "郑" 、"林"、 "田" 、"杜" 、"吕" 、"袁"、 "崔"。重名最多的几个名字分别为秦海、王静、杨广超、张静。

b<-readLines('name.txt')

b<-substr(a, 1, 1)

bb<-sort(table(b),decreasing=F)

labels = paste(names(bb[bb>50]),format(bb[bb>50]/sum(bb)*100,digits=2),'%',sep=' ')

png('bar1.png',width=1240*0.8,height=1028*0.8)

par(mar=c(3,3,1,1))

barplot(bb[bb>50],col=rainbow(length(bb[bb>50])),cex.names = 1,horiz=T,legend.text=labels, args.legend=list(x=550,y=25,horiz=F,text.font=2.5))

dev.off()