IT博客汇
  • 首页
  • 精华
  • 技术
  • 设计
  • 资讯
  • 扯淡
  • 权利声明
  • 登录 注册

    Python 抓取新闻,放到数据库

    Fish (fsh267@gmail.com)发表于 2014-02-08 00:00:00
    love 0

    使用的是 BeautifulSoup 库。本人比较小白:对于中文页面,有的需要在解析时添加 from_encoding = "GBK";写入数据库时还涉及各种转码和转义,比如 con.escape_string()。这个问题卡了我一下午加一晚上,最后还是在 Stack Overflow 上找到了解决方案。下面以腾讯新闻为例:

        # coding: utf-8
        # 文章信息保存到details_list中,包括标题,作者,发布时间,摘要,内容, 原地址
        # /usr/bin/python
        # author: fish
         
        import sys
        reload(sys)
        sys.setdefaultencoding('utf-8')
        from bs4 import BeautifulSoup
        import urllib2
        import urllib
        import socket
        import re
        import MySQLdb
        # Scrape the Tencent tech front page, follow every article link found in
        # a 'Q-tpListInner' div, and store each article in the `ali_app` MySQL
        # database (tables `jr_category` and `jr_content`).  Rows that already
        # exist for the same source address are deleted first, so re-running the
        # script refreshes articles instead of duplicating them.
        #
        # This is a Python 2 script (urllib.urlopen, unicode()).
        url = 'http://tech.qq.com'
        socket.setdefaulttimeout(200)


        def _fetch_soup(page_url):
            """Download page_url and parse it as GBK-encoded HTML.

            The HTTP response is always closed, even when parsing fails.
            Raises IOError when the page cannot be retrieved.
            """
            response = urllib.urlopen(page_url)
            try:
                return BeautifulSoup(response.read(), from_encoding='GBK')
            finally:
                response.close()


        def _parse_article(detail_soup, href):
            """Extract one article from a parsed detail page.

            Returns the list
            [title, author, pub_date, source_address, description, content],
            or None when the page should be skipped: the expected markup is
            missing, or the page is a video/gallery page.
            """
            try:
                title_text = detail_soup.find('div', 'hd').h1.string
                pub_date_text = detail_soup.find('span', 'pubTime').string
                description = detail_soup.find(attrs={'name': 'Description'}).get('content')
            except AttributeError:
                # find() returned None somewhere: the page does not have the
                # article layout this scraper understands.
                return None

            body = detail_soup.find('div', id='Cnt-Main-Article-QQ')
            if title_text is None or pub_date_text is None or body is None:
                return None

            # Fall back to the site name when the author span is absent or empty.
            author_tag = detail_soup.find('span', 'auth')
            author_text = author_tag.string if author_tag is not None else None
            if author_text:
                article_author = unicode(author_text)
            else:
                article_author = '腾讯科技'

            content_html = unicode(body)
            # Pages embedding a video player or a gallery mask are not stored.
            if 'videoPlayer' in content_html or 'gqMaskshowBT' in unicode(detail_soup):
                return None

            # unicode() turns NavigableString objects into plain strings so
            # they can be handed to the database driver safely.
            return [
                unicode(title_text),
                article_author,
                unicode(pub_date_text),
                href,
                description or '',
                content_html,
            ]


        soup = _fetch_soup(url)
        title = soup.find_all('div', 'Q-tpListInner')

        # Collect the link of every list entry; entries without an anchor or
        # without an href are ignored.
        href_list = []
        for detail_href in title:
            anchor = detail_href.a
            link = anchor.get('href') if anchor is not None else None
            if link:
                href_list.append(link)

        # One [title, author, pub_date, source_address, description, content]
        # entry per article that was written to the database.
        details_list = []

        con = None
        cur = None
        try:
            # NOTE(review): credentials are hard-coded; consider moving them to
            # configuration outside the source file.
            con = MySQLdb.connect(host='localhost', user='root', passwd='baidusql', charset='utf8')
            con.select_db('ali_app')
            cur = con.cursor()

            for href in href_list:
                try:
                    detail_soup = _fetch_soup(href)
                except IOError as fetch_error:
                    # One unreachable article must not abort the whole run.
                    print("Skipping %s: %s" % (href, fetch_error))
                    continue
                print(href)

                sub_details_list = _parse_article(detail_soup, href)
                if sub_details_list is None:
                    continue
                (article_title, article_author, article_pub_date,
                 article_source_address, article_abridgement,
                 article_contents) = sub_details_list

                # Parameterized queries: the driver quotes and escapes every
                # value, so no manual escape_string() call is needed.
                cur.execute('delete from jr_category where source_address = %s',
                            (article_source_address,))
                cur.execute('delete from jr_content where source_address = %s',
                            (article_source_address,))
                cur.execute('insert into jr_category(sid, title, date, author, source_address) '
                            'values(3, %s, %s, %s, %s)',
                            (article_title, article_pub_date, article_author,
                             article_source_address))
                cur.execute('insert into jr_content(source_address, description, content) '
                            'values(%s, %s, %s)',
                            (article_source_address, article_abridgement, article_contents))

                details_list.append(sub_details_list)

            con.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        finally:
            # Release database resources on both the success and error paths.
            if cur is not None:
                cur.close()
            if con is not None:
                con.close()


沪ICP备19023445号-2号
友情链接