A Simple Crawler (Part 2)

This is the second crawler. It handles the daily incremental update: each day we only want the reviews that were newly posted on Meituan since the last run, so reviews that have already been crawled are skipped.
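
The core idea first: for each shop we fetch the newest review page, and we only turn to the next page while the oldest review on the current page still falls inside a short lookback window (the full script below uses five days). A minimal sketch of that decision, with hypothetical names page_dates and should_fetch_next_page:

import datetime

def should_fetch_next_page(page_dates, lookback_days=5):
    '''page_dates: the 'YYYY-MM-DD' date strings of the reviews on the current page.'''
    cutoff = str(datetime.date.today() - datetime.timedelta(days=lookback_days))
    # if even the oldest review on this page is newer than the cutoff,
    # the next page may still hold unseen reviews
    return bool(page_dates) and min(page_dates) >= cutoff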

Here is the core code.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import urllib2
import requests
import time,datetime
import MySQLdb
from lxml import etree
import re
import json
from URL import ShopId
from proxies import proxy
import random
import socket
from UserAgent import user_agent_list

class MtComment:
    # constructor: set up defaults, the DB connection and helper objects
    def __init__(self):
        self.pageIndex = None
        self.Connection = 'keep-alive'
        self.Accept_Language = 'zh-CN,zh;q=0.8'
        # request headers (filled in per request in OpenUrl)
        self.headers = ''
        self.proxy = ''
        # flag for whether the crawler should keep running
        self.enable = False
        # connect MySQLdb
        self.db = MySQLdb.connect("IP", "username", "password", "database")
        # current SQL statement
        self.sql = ''
        # get a cursor via cursor()
        self.cursor = self.db.cursor()
        # log file for URLs that failed to download
        self.of = open('proxy.txt', 'w+')
        # today's date as a string
        self.dates = str(datetime.date.today())
        # helper that reads shop ids from MySQL
        self.YunMei = ShopId()
        # proxy helper
        self.Proxy = proxy()

    def DefineUrl(self):
        '''
        Build the base shop URLs.
        :return:
        '''
        DataUrl = []
        # GetUrlId() returns three lists: shop_no, name and mshopid
        WmIds, ShopNames, ShopIds = self.YunMei.GetUrlId()
        for WmId, shopid in zip(WmIds, ShopIds):
            url = 'http://meituan.com/shop/{}'.format(shopid)
            DataUrl.append(url)
        return DataUrl

    def OpenUrl(self,url,shopid,item):
        '''
        Download the page through a proxy.
        '''
        i2 = random.choice(item)
        ip = self.Proxy.test_proxy(i2)
        referer =  'http://su.meituan.com/shop/' + str(shopid)
        self.user_agent = random.choice(user_agent_list)
        self.headers = {'User-Agent' :self.user_agent,
                        'Accept-Language': self.Accept_Language,
                        'Connection' : self.Connection,
                        'Host' : 'su.meituan.com',
                        'Accept-Encoding' : 'gzip, deflate, sdch',
                        'Referer' : referer,
                        'X-Requested-With' : 'XMLHttpRequest',
                        }
        try:
            # build the request and send it through the chosen proxy
            timeout = 20
            socket.setdefaulttimeout(timeout)
            session = requests.session()
            request = session.get(url, proxies={'http': ip, 'https': ip}, timeout=60, headers=self.headers)

        except (requests.HTTPError, IOError) as e:
            self.of.write('%s\n' % url)
            print u"Failed to reach Meituan: %s" % url
            print u"IOError: %s" % e
            time.sleep(10)
        except UnicodeDecodeError as e:
            print u"-----UnicodeDecodeError url: %s" % url, e
        except requests.exceptions.RequestException:
            print 'Connection aborted'
        except socket.timeout as e:
            print u"-----socket timeout: %s" % url, e
        else:
            # a redirect back to the bare domain means this shop page no longer exists
            if request.url.split('.', 1)[1] == 'meituan.com/':
                print u"Redirected; this shop no longer exists"
            else:
                if request.status_code == 200:
                    pageCode = request.text
                    return pageCode
                else:
                    return None

    def DefineCommentUrl(self):
        '''
        Build the comment-feed URLs. If the program was interrupted, URLs already
        stored in the database for today are skipped, so nothing is fetched twice.
        :return:
        '''
        result = []
        # self.sql = "SELECT DISTINCT openurl FROM gz_comment_groupbuy WHERE dates = '2016-08-01'"
        self.sql = "SELECT DISTINCT openurl FROM gz_comment_groupbuy WHERE dates = '" + self.dates + "'"
        try:
            # run the query
            self.db.set_character_set('utf8')
            self.cursor.execute(self.sql)
            # fetch all rows
            results = self.cursor.fetchall()
        except:
            print u"MySQL read error"
        else:
            for i in results:
                result.append(i[0].strip())
            CommentUrls = []
            CommentShopName = []
            CommentWmId = []
            GroupId = []
            WmIds,ShopNames, ShopIds = self.YunMei.GetUrlId()
            for WmId,ShopName, shopid in zip(WmIds,ShopNames, ShopIds):
            # Urls,MaxPages,Ids = self.GetMaxPageNumber()
            # for Url,MaxPage,Id in zip(Urls,MaxPages,Ids):
                Url = 'http://su.meituan.com/deal/feedbacklist/0/{}/all/0/time/40?limit=10&showpoititle=0&offset=0'.format(shopid)
                url = Url.split('offset=0')[0] + 'offset='
                # only the first page (offset=0) is built here; deeper pages are
                # requested on demand in CheckInfo when new reviews spill over
                for i in range(0, 10, 10):
                    EndUrl = url + str(i)
                    if EndUrl not in result:
                        CommentUrls.append(EndUrl)
                        CommentWmId.append(WmId)
                        CommentShopName.append(ShopName)
                        GroupId.append(str(shopid))
            return CommentUrls,CommentWmId,CommentShopName,GroupId

    def saveImg(self, imageURL):
        '''
        Download an image.
        :param imageURL:
        :return:
        '''
        u = urllib2.urlopen(imageURL)
        data = u.read()
        print u"Quietly saving one of the review images"
        return data

    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]',time.localtime(time.time()))

    # current date (YYYY-MM-DD)
    def getCurrentDate(self):
        return time.strftime('%Y-%m-%d',time.localtime(time.time()))

    def GetComments(self,OpenUrl,Wid,shopname,shopid,items):
        '''
        Fetch and parse one page of reviews.
        :return:
        '''
        # re-read the proxy list on every call (this overrides the items argument)
        items = self.Proxy.IP()
        try:
            JsInfo = self.OpenUrl(OpenUrl, shopid, items)
        except:
            print u"Request timed out, taking a short break before retrying"
            time.sleep(10)
            exit()
        if JsInfo is not None:
            js = json.loads(JsInfo)
            ratelist = js['data']['ratelistHtml']
            selector = etree.HTML(ratelist)
            content_field = selector.xpath('//li[@class="J-ratelist-item rate-list__item cf"]')
            conmentid_field = selector.xpath('//li[@class="J-ratelist-item rate-list__item cf"]/@data-rateid')

            itemlist = []
            imgslist = []
            imgsidlist = []
            conmentdate = []

            for each,conmentid in zip(content_field,conmentid_field):
                item = {}
                content = each.xpath('div[@class="review-content-wrapper"]/div[@class="J-normal-view"]/p/a/strong/text()')
                if len(content) > 0:
                    content2  = '[' + each.xpath('string()').replace('\n','').replace('  ','').split('[')[-1]
                else:
                    content2 =  each.xpath('div[@class="review-content-wrapper"]/div[@class="J-normal-view"]/p/text()')[0].replace('\n','').strip()

                headportrait = each.xpath('div[@class="user-info-block"]/div[@class="avatar-wrapper"]/img/@src')
                if len(headportrait) > 0:
                    headportrait2 = headportrait[0]
                else:
                    headportrait2 = 'NULL'

                nickname = each.xpath('div[@class="user-info-block"]/p[@class="name-wrapper"]/span/text()')
                if len(nickname) > 0:
                    nickname2 = nickname[0]
                else:
                    nickname2 = 'NULL'
                level = each.xpath('div[@class="user-info-block"]/p[@class="name-wrapper"]/span/i/@title')
                if len(level) > 0:
                    level2 = level[0].replace('width:','')
                else:
                    level2 = 'NULL'
                score = each.xpath('div[@class="review-content-wrapper"]/div/div/span/span/@style')
                if len(score) > 0:
                    score2 = score[0].replace('width:','')
                else:
                    score2 = 'NULL'
                date =  each.xpath('div[@class="review-content-wrapper"]/div/span[@class="time"]/text()')
                if len(date) > 0:
                    date2 = date[0]
                else:
                    date2 = 'NULL'

                goods = each.xpath('div[@class="review-content-wrapper"]/p[@class="deal-title"]/a/text()')
                if len(goods) > 0:
                    goods2 = goods[0]
                else:
                    goods2 = 'NULL'
                imgs = each.xpath('div[@class="review-content-wrapper"]/div[@class="J-normal-view"]/div/div/ul/li/a/img/@src')
                # pair each image URL with the id of the review it belongs to
                if len(imgs) > 0:
                    for img in imgs:
                        imgsidlist.append(conmentid)
                        imgslist.append(img)

                item['nickname'] = nickname2.strip()
                item['headportrait'] = str(headportrait2).strip()
                item['userPower'] = level2.strip()
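                # convert the CSS width percentage (e.g. "80%") into a 0-5 star value;
                # this assumes score2 and date2 hold real values rather than the 'NULL' fallback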
                item['star'] = str(int(float(score2.strip().replace('%',''))/100 * 5))
                timeArray = time.strptime(date2.strip(), "%Y-%m-%d")
                timeStamp = int(time.mktime(timeArray))
                item['standardDate'] = str(timeStamp)
                item['reviewContent'] = content2.strip()
                item['project_name'] = goods2.strip()
                item['reviewId'] = conmentid.strip()
                item['dates'] = self.dates
                item['dpShopid'] = shopid.strip()
                item['openurl'] = OpenUrl.strip()
                item['source'] = str(2)
                item['wsc_shop_name'] = shopname.decode('utf-8')
                item['wsc_shop_no'] = Wid
                conmentdate.append(date2)
                itemlist.append(item)
            return itemlist,imgslist,imgsidlist,conmentdate,shopid

    def CheckInfo(self,item, imgslist, imgsidlist,conmentdate,shopid):
        '''
        Decide from the review dates whether the next page still needs to be fetched.
        '''
        proxys = proxy()
        Ipitem = proxys.IP()
        # look back five days; anything older is treated as already crawled
        days = datetime.date.today() + datetime.timedelta(-5)
        print days
        if item:
            if min(conmentdate) >= str(days):
                print u"Inserted this page for shop {}; newer reviews continue on the next page, paging on".format(shopid)
                self.FilterIngo(item, imgslist, imgsidlist)
                print u"Fetching the next page"
                Url = item[0]['openurl']
                Wid = item[0]['wsc_shop_no']
                shopname = item[0]['wsc_shop_name']
                url = Url.split('offset=')[0]  + 'offset=' + str(int(Url.split('offset=')[1]) + 10)
                try:
                    item2, imgslist2, imgsidlist2,conmentdate2, shopid2 = self.GetComments(url,Wid,shopname, shopid, Ipitem)
                except:
                    print u"马上再试一次"
                    time.sleep(30)
                else:
                    self.CheckInfo(item2, imgslist2, imgsidlist2,conmentdate2,shopid2)
            else:
                print u"更新数据就在本页",min(conmentdate)
                self.FilterIngo(item, imgslist, imgsidlist)

    def FilterIngo(self,item, imgslist, imgsidlist):
        '''
        Filter out reviews and images that are already in the database.
        '''
        item2 = []
        imgslist2 = []
        imgsidlist2 = []
        for i in item:
            sql1 = "SELECT reviewId FROM gz_comment_groupbuy WHERE reviewId = '" + i['reviewId'] + "' AND source = 2"
            try:
                self.db.set_character_set('utf8')
                self.cursor.execute(sql1)
            except:
                print u"MYSQL读取存在错误"
            else:
                results = self.cursor.fetchall()
                if len(results) == 0 :
                    item2.append(i)
        for j , z in zip(imgslist, imgsidlist):
            sql2 = "SELECT comment_id FROM gz_comment_photo WHERE comment_id = '" + z + "'"
            try:
                self.db.set_character_set('utf8')
                self.cursor.execute(sql2)
            except:
                print u"MYSQL读取存在错误"
            else:
                results = self.cursor.fetchall()
                if len(results) == 0:
                    imgslist2.append(j)
                    imgsidlist2.append(z)

        self.SaveMysql(item2, imgslist2, imgsidlist2, 'gz_comment_groupbuy')

    def SaveMysql(self, dictlist ,imgs,imgids, table):
        try:
            SqlList = []
            for my_dict in dictlist:
                self.db.set_character_set('utf8')
                cols = ','.join(my_dict.keys())
                values = '","'.join(my_dict.values())
                sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, '"'+values+'"')
                SqlList.append(sql)

            if len(imgs) > 0:
                a = str(2)
                for img,imgid in zip(imgs,imgids):
                    sqlimg = "INSERT INTO gz_comment_photo(comment_id,comment_photo,type) VALUES ('" + imgid + "', '" + img + "', '" + a + "')"
                    SqlList.append(sqlimg)

            for SqlInsert in SqlList:
                self.sql = SqlInsert
                # print self.sql
                try:
                    result = self.cursor.execute(self.sql)
                    insert_id = self.db.insert_id()
                    self.db.commit()
                    # check whether the insert succeeded
                    if result:
                        print "Insert OK: %s" % insert_id
                    else:
                        print "Nothing was inserted"
                except MySQLdb.Error, e:
                    print(e)
                    # roll back on error
                    self.db.rollback()
                    # a duplicate primary key means this row is already stored
                    if "key 'PRIMARY'" in e.args[1]:
                        print "Row already exists, nothing inserted"
                    else:
                        print "Insert failed, reason %d: %s" % (e.args[0], e.args[1])
        except MySQLdb.Error, e:
            print "Database error, reason %d: %s" % (e.args[0], e.args[1])

if __name__ == '__main__':
    print u"爬取初始化....."
    print u"爬取初始化........"
    time.sleep(2)
    print u"---开始爬取评论---"
    InsertId = 1
    proxys = proxy()
    Ipitem = proxys.IP()
    spider = MtComment()
    CommentsUrls,CommentWmIds,CommentShopNames,GroupIds = spider.DefineCommentUrl()
    counts = len(CommentsUrls)
    for OpenUrl,Wid,shopname,GroupId in zip(CommentsUrls,CommentWmIds,CommentShopNames,GroupIds):
        # print OpenUrl,GroupId
        print InsertId
        print counts
        timesO = random.uniform(1, 1.5)
        time.sleep(timesO)
        # every 100 shops, take a longer break to stay under the radar
        if InsertId % 100 == 0:
            time.sleep(random.uniform(10, 30))
        try:
            item, imgslist, imgsidlist, conmentdate, shopid = spider.GetComments(OpenUrl, Wid, shopname, GroupId, Ipitem)
        except:
            print u"Resting 30 seconds before trying the next one"
            time.sleep(30)
        else:
            spider.CheckInfo(item, imgslist, imgsidlist, conmentdate, shopid)
        InsertId += 1
        counts -= 1
    spider.db.close()
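
One caveat on SaveMysql above: building INSERT statements by string concatenation breaks as soon as a review contains a double quote, and it leaves the door open to SQL injection. A sketch of a parameterized alternative (same MySQLdb connection; insert_row is a hypothetical helper, not part of the script above):

def insert_row(cursor, db, table, row):
    # row is a dict of column -> value; %s placeholders let MySQLdb do the escaping
    cols = ', '.join(row.keys())
    placeholders = ', '.join(['%s'] * len(row))
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, placeholders)
    cursor.execute(sql, list(row.values()))
    db.commit()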






The URLs (i.e. the shop IDs) are prepared in advance and stored in MySQL; the ShopId helper below reads them out.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import MySQLdb

class ShopId:
    def __init__(self):
        self.db = MySQLdb.connect("192.168.22.6", "liulin2", "root", "spider")
        self.cursor = self.db.cursor()
        self.sql = ''

    def GetUrlId(self):
        List_shopid = []
        List_shop_id = []
        List_name = []
        self.sql = "SELECT shop_no,name,REPLACE(mshopid,'L','') as '"'mshopid'"' from dimension_table2 where mshopid <> '' and mshopid <> 0 and close = '0' and mshopid not in ('95335250','40973512','40000330','41490570','41059144','42043645','41730837','42748619','42445858','50024048','50257793','50685422','50811352','42709034','6850552','88064482','42054004','64357828','68484797','68677165','6685012','41540120','60896108','86458997','84264183','50715952','69440092','70170017','76671852','41492423','40682157','52975074','90781236','41056058','41917130','79808784','5041075','41385806','65791943','35290034','4799755','40207457','40179897','41260698','6559530','5107260','40324435','66298404','66073336','42378816','74661183','83161706')"
        # print(self.sql)
        try:
            # run the query
            self.db.set_character_set('utf8')
            self.cursor.execute(self.sql)
            # fetch all rows
            results = self.cursor.fetchall()
            for row in results:
                shopid = row[0]
                name = row[1]
                shop_id = row[2]
                List_shopid.append(shopid)
                List_name.append(name)
                List_shop_id.append(shop_id)
        except:
            print "Error: unable to fecth data"
        else:
            return (List_shopid,List_name,List_shop_id)
        finally:
            self.db.close()



Next, the proxy module.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests

class proxy:

    def __init__(self):
        self.pageIndex = None
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        self.Connection = 'keep-alive'
        self.Accept_Language = 'zh-CN,zh;q=0.8'
        # default request headers
        self.headers = {'User-Agent': self.user_agent,
                        'Connection': self.Connection,
                        'Accept-Language': self.Accept_Language,
                        }

    def test_proxy(self,ip):
        '''Check whether a proxy works; free proxies are quite unstable, so the hit rate is low.'''
        try:
            r = requests.get('http://t.dianping.com/', proxies={'http': ip, 'https': ip}, headers=self.headers)
            status = r.status_code
            # judged mainly by the returned status code
        except:
            pass
        else:
            if status == 200:
                return ip
            else:
                return None

    def IP(self):
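        '''Read candidate proxy IPs from proxys.txt; entries are separated by "-".'''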
        item = []
        with open('proxys.txt', 'r') as a:
            x = a.read().strip()
            y = x.split('-')
            for i in y:
                item.append(i)
            return item
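
A quick way to wire the proxy helper up (a sketch; it assumes a proxys.txt file already sits next to the script, as the IP() method above expects):

if __name__ == '__main__':
    p = proxy()
    candidates = p.IP()
    # keep only the proxies that actually answer
    live = [ip for ip in candidates if p.test_proxy(ip)]
    print 'usable proxies: %d of %d' % (len(live), len(candidates))
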
Finally, the user-agent list.

import random


user_agent_list = [\
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "  ,
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"  ,
]



