使用pyqt webkit写headless 动态爬虫,需要捕获到所有的HTTP request和response(包括JS异步请求)。
QNetworkReply 没有返回content-length相关的方法,rawHeaderList实际上并不包括content-length。
一个可行的方法,是在readyRead信号触发时,通过size()方法获取body的大小。示例代码如下,请注意self.reply_lst这个列表是必不可少的:
import sys

from PyQt5.Qt import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.Qt import QWebView, QWebPage
from PyQt5.QtNetwork import QNetworkAccessManager


class Manager(QNetworkAccessManager):
    """Network access manager that reports the body size of every reply.

    Each reply created through this manager gets a ``content_length``
    attribute that is updated on every ``readyRead`` signal. When the
    reply finishes, its size and URL are printed as
    ``[<n> bytes] <url>``.
    """

    def __init__(self, parent=None):
        QNetworkAccessManager.__init__(self, parent)
        self.finished.connect(self._finished)
        # Keep a Python reference to every in-flight reply: the byte counter
        # is stored as an attribute on the Python wrapper object, so the
        # wrapper has to stay alive until ``finished`` fires.
        self.reply_lst = []

    def _finished(self, reply):
        """Print the recorded size and URL of *reply*, then stop tracking it."""
        # Fall back to 0 so replies that never emitted readyRead (empty body,
        # failed request, ...) are still reported instead of raising
        # AttributeError.
        content_length = getattr(reply, 'content_length', 0)
        # Parenthesised single-argument print works on Python 2 and Python 3.
        print('[%s bytes] %s' % (content_length, reply.url().toString()))
        if reply in self.reply_lst:
            self.reply_lst.remove(reply)

    def createRequest(self, operation, request, body=None):
        """Create the reply as usual and start tracking its body size.

        Parameters and return value are those of
        ``QNetworkAccessManager.createRequest``; the returned reply
        additionally carries a ``content_length`` attribute starting at 0.
        """
        reply = super(Manager, self).createRequest(operation, request, body)
        reply.content_length = 0
        reply.readyRead.connect(self.read_read)
        self.reply_lst.append(reply)
        return reply

    def read_read(self):
        """Add the bytes currently waiting on the sending reply to its counter."""
        reply = self.sender()
        # QNetworkReply is a sequential device, for which size() reports the
        # bytes currently available rather than the total body size, so the
        # chunks are summed instead of overwriting the previous value.
        # NOTE(review): assumes the page's own readyRead handler drains the
        # buffer after each signal; if it does not, chunks would be counted
        # more than once - TODO confirm.
        reply.content_length = getattr(reply, 'content_length', 0) + reply.size()


def app_quit():
    """Quit the module-level application once the page has finished loading."""
    app.quit()


if __name__ == "__main__":
    # The first argv entry is the program name; Qt only parses options that
    # follow it, so '-platform minimal' must not occupy the first slot.
    app = QApplication([sys.argv[0], '-platform', 'minimal'])
    browser = QWebView()
    page = QWebPage()
    manager = Manager()
    page.setNetworkAccessManager(manager)
    browser.setPage(page)
    # Connect before starting the load so the handler is registered up front.
    browser.loadFinished.connect(app_quit)
    browser.load(QUrl('http://www.lijiejie.com/'))
    sys.exit(app.exec_())
最后,拿到所有URL的content-length如下,可以把它写到爬虫的结果中:
[499 bytes] http://www.lijiejie.com/ [2082 bytes] https://s6.cnzz.com/stat.php?id=3804994&web_id=3804994&show=pic [11414 bytes] http://www.lijiejie.com/wp-includes/js/wp-emoji-release.min.js?ver=4.7.1 [20172 bytes] http://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/css/min/crayon.min.css?ver=_2.7.2_beta [2850 bytes] http://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/themes/github/github.css?ver=_2.7.2_beta [86 bytes] http://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/fonts/courier-new.css?ver=_2.7.2_beta [374 bytes] http://www.lijiejie.com/wp-content/plugins/wp-pagenavi/pagenavi-css.css?ver=2.70 [27597 bytes] http://www.lijiejie.com/wp-content/themes/retro-fitted/style.min.css?ver=0.4 [10056 bytes] http://www.lijiejie.com/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1 [3678 bytes] http://www.lijiejie.com/wp-content/plugins/dynamic-to-top/js/libs/jquery.easing.js?ver=1.3 [4214 bytes] http://www.lijiejie.com/wp-content/themes/retro-fitted/library/js/drop-downs.min.js?ver=20110920 [22337 bytes] http://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/js/min/crayon.min.js?ver=_2.7.2_beta [1192 bytes] http://www.lijiejie.com/wp-content/plugins/dynamic-to-top/js/dynamic.to.top.min.js?ver=3.5 [1398 bytes] http://www.lijiejie.com/wp-includes/js/wp-embed.min.js?ver=4.7.1 [6108 bytes] http://www.lijiejie.com/wp-includes/js/jquery/jquery.js?ver=1.12.4 [765 bytes] https://c.cnzz.com/core.php?web_id=3804994&show=pic&t=z [3197 bytes] http://www.lijiejie.com/wp-content/themes/retro-fitted/images/bg.jpg [2236 bytes] http://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/css/images/toolbar/buttons.png [576 bytes] http://www.lijiejie.com/wp-content/themes/retro-fitted/images/quotes.png [9766 bytes] http://www.lijiejie.com/wp-content/themes/retro-fitted/images/header.png [7210 bytes] http://www.lijiejie.com/wp-content/uploads/2014/10/stom.tencent.com_.png [719 bytes] 
https://icon.cnzz.com/img/pic.gif [43 bytes] https://hzs9.cnzz.com/stat.htm?id=3804994&r=&lg=zh-cn&ntime=none&cnzz_eid=1834513944-1484813777-&showp=1920x1080&t=李劼杰的博客&h=1&rnd=1792993998 [43 bytes] https://cnzz.mmstat.com/9.gif?abc=1&rnd=2124379566 [142 bytes] http://www.lijiejie.com/wp-content/themes/retro-fitted/images/bullet.png [2516 bytes] http://www.lijiejie.com/wp-content/uploads/2017/01/eping.png [3282 bytes] http://www.lijiejie.com/wp-content/uploads/2014/10/stom.tencent.com_upfile.png