# -*- coding: utf-8 -*-
from urllib2 import urlopen,Request
import urllib
from lxml import *
import lxml.html as HTML
import time
def error(txt):
    """Append ``txt`` as a single line to the ``../it/error.txt`` log.

    The path is relative to the current working directory. The file is
    opened in append mode, so earlier entries are preserved.
    """
    log_path = "../it/error.txt"
    with open(log_path, "a") as log_file:
        log_file.write(txt + '\n')
def con(url, count=4):
    """Download ``url`` and return the raw response body.

    The request carries a fixed ``Referer`` and ``User-Agent`` header and uses
    a 20 second timeout. When an attempt raises, the function waits one second
    and tries again.

    Args:
        url: Address to fetch.
        count: Running attempt counter. Retrying stops once it has reached 10,
            so the default of 4 allows up to seven attempts in total.

    Returns:
        The body exactly as read from the response, or ``None`` when every
        attempt failed. On final failure the last exception is printed and
        ``url`` is recorded through ``error``.
    """
    while True:
        try:
            req = Request(url)
            req.add_header('Referer', 'http://www.baidu.com')
            req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            res = urlopen(req, timeout=20)
            try:
                return res.read()
            finally:
                # Release the connection even if read() fails part-way.
                res.close()
        except Exception as e:
            # Deliberately broad: any failure of an attempt (bad URL, network
            # error, timeout, ...) is treated as retryable, best-effort.
            if count >= 10:
                print(e)
                error(url)
                return None
            count += 1
            time.sleep(1)
def menu(url):
    """Yield a ``{'title': ..., 'url': ...}`` dict for each ``//h5/a`` link.

    The page at ``url`` is downloaded with ``con`` and parsed with lxml.
    Anchors without link text or without an ``href`` attribute are skipped.
    Protocol-relative links (``//host/path``) are completed with the ``http:``
    scheme; any other ``href`` is passed through unchanged.

    Yields nothing when the page could not be downloaded or is empty.
    """
    page = con(url)
    if not page:
        # con() returns None once it has given up; there is nothing to parse.
        return
    dom = HTML.document_fromstring(page)
    for anchor in dom.xpath("//h5/a"):
        title = anchor.text_content()
        href = anchor.get("href")
        # get() returns None for a missing attribute, so test the raw value
        # before building the URL.
        if not title or not href:
            continue
        if href.startswith("//"):
            href = "http:" + href
        yield {'title': title, 'url': href}
def save(title, content):
    """Write ``content`` to ``../it/<title>.html``.

    Args:
        title: Used as the file name (converted to text first). Path
            separators in it are replaced by ``_`` so the title can never be
            interpreted as a directory.
        content: Page body to store, written unchanged.

    The target directory is relative to the current working directory and
    must already exist; an existing file of the same name is overwritten.
    """
    # Same text conversion as unicode(title), spelled so that it does not
    # depend on the Python 2 only builtin.
    name = u"%s" % (title,)
    for separator in (u"/", u"\\"):
        name = name.replace(separator, u"_")
    # Binary mode: the downloaded bytes are stored without newline translation.
    with open(u"../it/" + name + u".html", "wb") as f:
        f.write(content)
def blog():
    """Download every article linked from the tutorial index and store it.

    The index page is scanned with ``menu``; each linked page is fetched with
    ``con`` and written with ``save`` under its link title. A page that could
    not be downloaded is skipped so that one failure does not abort the rest
    of the crawl (``con`` has already logged the URL at that point).
    """
    index_url = ("http://eyehere.net/2011/python-pygame-novice"
                 "-professional-index/")
    for dic in menu(index_url):
        title = dic.get("title", '')
        url = dic.get("url", '')
        if not url:
            continue
        page = con(url)
        if page is None:
            # Download failed after all retries; nothing to save.
            continue
        save(title, page)
        # Two items joined by a space, exactly as the original
        # two-argument print statement did.
        print("%s %s" % ("saved ", unicode(title)))
if __name__ == "__main__":
    # Start the crawl only when run as a script, not on import. Exceptions
    # are left to propagate so a failure shows its full traceback.
    blog()