A Simple Crawler (Part 2)

This is the second crawler. It handles the daily incremental update: each day we only want the reviews that were newly posted on Meituan since the last run, so reviews that have already been crawled are skipped.
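
The core idea first: for each shop we fetch the newest review page, and we only turn to the next page while the oldest review on the current page still falls inside a short lookback window (the full script below uses five days). A minimal sketch of that decision, with hypothetical names page_dates and should_fetch_next_page:

import datetime

def should_fetch_next_page(page_dates, lookback_days=5):
    '''page_dates: the 'YYYY-MM-DD' date strings of the reviews on the current page.'''
    cutoff = str(datetime.date.today() - datetime.timedelta(days=lookback_days))
    # if even the oldest review on this page is newer than the cutoff,
    # the next page may still hold unseen reviews
    return bool(page_dates) and min(page_dates) >= cutoff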

Here is the core code.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import urllib2
import requests
import time,datetime
import MySQLdb
from lxml import etree
import re
import json
from URL import ShopId
from proxies import proxy
import random
import socket
from UserAgent import user_agent_list

class MtComment:
    # constructor: set up defaults, the DB connection and helper objects
    def __init__(self):
        self.pageIndex = None
        self.Connection = 'keep-alive'
        self.Accept_Language = 'zh-CN,zh;q=0.8'
        # request headers (filled in per request in OpenUrl)
        self.headers = ''
        self.proxy = ''
        # flag for whether the crawler should keep running
        self.enable = False
        # connect MySQLdb
        self.db = MySQLdb.connect("IP", "username", "password", "database")
        # current SQL statement
        self.sql = ''
        # get a cursor via cursor()
        self.cursor = self.db.cursor()
        # log file for URLs that failed to download
        self.of = open('proxy.txt', 'w+')
        # today's date as a string
        self.dates = str(datetime.date.today())
        # helper that reads shop ids from MySQL
        self.YunMei = ShopId()
        # proxy helper
        self.Proxy = proxy()

    def DefineUrl(self):
        '''
        Build the base shop URLs.
        :return:
        '''
        DataUrl = []
        # GetUrlId() returns three lists: shop_no, name and mshopid
        WmIds, ShopNames, ShopIds = self.YunMei.GetUrlId()
        for WmId, shopid in zip(WmIds, ShopIds):
            url = 'http://meituan.com/shop/{}'.format(shopid)
            DataUrl.append(url)
        return DataUrl

    def OpenUrl(self,url,shopid,item):
        '''
        Download the page through a proxy.
        '''
        i2 = random.choice(item)
        ip = self.Proxy.test_proxy(i2)
        referer =  'http://su.meituan.com/shop/' + str(shopid)
        self.user_agent = random.choice(user_agent_list)
        self.headers = {'User-Agent' :self.user_agent,
                        'Accept-Language': self.Accept_Language,
                        'Connection' : self.Connection,
                        'Host' : 'su.meituan.com',
                        'Accept-Encoding' : 'gzip, deflate, sdch',
                        'Referer' : referer,
                        'X-Requested-With' : 'XMLHttpRequest',
                        }
        try:
            # build the request and send it through the chosen proxy
            timeout = 20
            socket.setdefaulttimeout(timeout)
            session = requests.session()
            request = session.get(url, proxies={'http': ip, 'https': ip}, timeout=60, headers=self.headers)

        except (requests.HTTPError, IOError) as e:
            self.of.write('%s\n' % url)
            print u"Failed to reach Meituan: %s" % url
            print u"IOError: %s" % e
            time.sleep(10)
        except UnicodeDecodeError as e:
            print u"-----UnicodeDecodeError url: %s" % url, e
        except requests.exceptions.RequestException:
            print 'Connection aborted'
        except socket.timeout as e:
            print u"-----socket timeout: %s" % url, e
        else:
            # a redirect back to the bare domain means this shop page no longer exists
            if request.url.split('.', 1)[1] == 'meituan.com/':
                print u"Redirected; this shop no longer exists"
            else:
                if request.status_code == 200:
                    pageCode = request.text
                    return pageCode
                else:
                    return None

    def DefineCommentUrl(self):
        '''
        Build the comment-feed URLs. If the program was interrupted, URLs already
        stored in the database for today are skipped, so nothing is fetched twice.
        :return:
        '''
        result = []
        # self.sql = "SELECT DISTINCT openurl FROM gz_comment_groupbuy WHERE dates = '2016-08-01'"
        self.sql = "SELECT DISTINCT openurl FROM gz_comment_groupbuy WHERE dates = '" + self.dates + "'"
        try:
            # run the query
            self.db.set_character_set('utf8')
            self.cursor.execute(self.sql)
            # fetch all rows
            results = self.cursor.fetchall()
        except:
            print u"MySQL read error"
        else:
            for i in results:
                result.append(i[0].strip())
            CommentUrls = []
            CommentShopName = []
            CommentWmId = []
            GroupId = []
            WmIds,ShopNames, ShopIds = self.YunMei.GetUrlId()
            for WmId,ShopName, shopid in zip(WmIds,ShopNames, ShopIds):
            # Urls,MaxPages,Ids = self.GetMaxPageNumber()
            # for Url,MaxPage,Id in zip(Urls,MaxPages,Ids):
                Url = 'http://su.meituan.com/deal/feedbacklist/0/{}/all/0/time/40?limit=10&showpoititle=0&offset=0'.format(shopid)
                url = Url.split('offset=0')[0] + 'offset='
                # only the first page (offset=0) is built here; deeper pages are
                # requested on demand in CheckInfo when new reviews spill over
                for i in range(0, 10, 10):
                    EndUrl = url + str(i)
                    if EndUrl not in result:
                        CommentUrls.append(EndUrl)
                        CommentWmId.append(WmId)
                        CommentShopName.append(ShopName)
                        GroupId.append(str(shopid))
            return CommentUrls,CommentWmId,CommentShopName,GroupId

    def saveImg(self, imageURL):
        '''
        Download an image.
        :param imageURL:
        :return:
        '''
        u = urllib2.urlopen(imageURL)
        data = u.read()
        print u"Quietly saving one of the review images"
        return data

    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]',time.localtime(time.time()))

    # current date (YYYY-MM-DD)
    def getCurrentDate(self):
        return time.strftime('%Y-%m-%d',time.localtime(time.time()))

    def GetComments(self,OpenUrl,Wid,shopname,shopid,items):
        '''
        Fetch and parse one page of reviews.
        :return:
        '''
        # re-read the proxy list on every call (this overrides the items argument)
        items = self.Proxy.IP()
        try:
            JsInfo = self.OpenUrl(OpenUrl, shopid, items)
        except:
            print u"Request timed out, taking a short break before retrying"
            time.sleep(10)
            exit()
        if JsInfo is not None:
            js = json.loads(JsInfo)
            ratelist = js['data']['ratelistHtml']
            selector = etree.HTML(ratelist)
            content_field = selector.xpath('//li[@class="J-ratelist-item rate-list__item cf"]')
            conmentid_field = selector.xpath('//li[@class="J-ratelist-item rate-list__item cf"]/@data-rateid')

            itemlist = []
            imgslist = []
            imgsidlist = []
            conmentdate = []

            for each,conmentid in zip(content_field,conmentid_field):
                item = {}
                content = each.xpath('div[@class="review-content-wrapper"]/div[@class="J-normal-view"]/p/a/strong/text()')
                if len(content) > 0:
                    content2  = '[' + each.xpath('string()').replace('\n','').replace('  ','').split('[')[-1]
                else:
                    content2 =  each.xpath('div[@class="review-content-wrapper"]/div[@class="J-normal-view"]/p/text()')[0].replace('\n','').strip()

                headportrait = each.xpath('div[@class="user-info-block"]/div[@class="avatar-wrapper"]/img/@src')
                if len(headportrait) > 0:
                    headportrait2 = headportrait[0]
                else:
                    headportrait2 = 'NULL'

                nickname = each.xpath('div[@class="user-info-block"]/p[@class="name-wrapper"]/span/text()')
                if len(nickname) > 0:
                    nickname2 = nickname[0]
                else:
                    nickname2 = 'NULL'
                level = each.xpath('div[@class="user-info-block"]/p[@class="name-wrapper"]/span/i/@title')
                if len(level) > 0:
                    level2 = level[0].replace('width:','')
                else:
                    level2 = 'NULL'
                score = each.xpath('div[@class="review-content-wrapper"]/div/div/span/span/@style')
                if len(score) > 0:
                    score2 = score[0].replace('width:','')
                else:
                    score2 = 'NULL'
                date =  each.xpath('div[@class="review-content-wrapper"]/div/span[@class="time"]/text()')
                if len(date) > 0:
                    date2 = date[0]
                else:
                    date2 = 'NULL'

                goods = each.xpath('div[@class="review-content-wrapper"]/p[@class="deal-title"]/a/text()')
                if len(goods) > 0:
                    goods2 = goods[0]
                else:
                    goods2 = 'NULL'
                imgs = each.xpath('div[@class="review-content-wrapper"]/div[@class="J-normal-view"]/div/div/ul/li/a/img/@src')
                # pair each image URL with the id of the review it belongs to
                if len(imgs) > 0:
                    for img in imgs:
                        imgsidlist.append(conmentid)
                        imgslist.append(img)

                item['nickname'] = nickname2.strip()
                item['headportrait'] = str(headportrait2).strip()
                item['userPower'] = level2.strip()
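                # convert the CSS width percentage (e.g. "80%") into a 0-5 star value;
                # this assumes score2 and date2 hold real values rather than the 'NULL' fallback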
                item['star'] = str(int(float(score2.strip().replace('%',''))/100 * 5))
                timeArray = time.strptime(date2.strip(), "%Y-%m-%d")
                timeStamp = int(time.mktime(timeArray))
                item['standardDate'] = str(timeStamp)
                item['reviewContent'] = content2.strip()
                item['project_name'] = goods2.strip()
                item['reviewId'] = conmentid.strip()
                item['dates'] = self.dates
                item['dpShopid'] = shopid.strip()
                item['openurl'] = OpenUrl.strip()
                item['source'] = str(2)
                item['wsc_shop_name'] = shopname.decode('utf-8')
                item['wsc_shop_no'] = Wid
                conmentdate.append(date2)
                itemlist.append(item)
            return itemlist,imgslist,imgsidlist,conmentdate,shopid

    def CheckInfo(self,item, imgslist, imgsidlist,conmentdate,shopid):
        '''
        Decide from the review dates whether the next page still needs to be fetched.
        '''
        proxys = proxy()
        Ipitem = proxys.IP()
        # look back five days; anything older is treated as already crawled
        days = datetime.date.today() + datetime.timedelta(-5)
        print days
        if item:
            if min(conmentdate) >= str(days):
                print u"Inserted this page for shop {}; newer reviews continue on the next page, paging on".format(shopid)
                self.FilterIngo(item, imgslist, imgsidlist)
                print u"Fetching the next page"
                Url = item[0]['openurl']
                Wid = item[0]['wsc_shop_no']
                shopname = item[0]['wsc_shop_name']
                url = Url.split('offset=')[0]  + 'offset=' + str(int(Url.split('offset=')[1]) + 10)
                try:
                    item2, imgslist2, imgsidlist2,conmentdate2, shopid2 = self.GetComments(url,Wid,shopname, shopid, Ipitem)
                except:
                    print u"马上再试一次"
                    time.sleep(30)
                else:
                    self.CheckInfo(item2, imgslist2, imgsidlist2,conmentdate2,shopid2)
            else:
                print u"更新数据就在本页",min(conmentdate)
                self.FilterIngo(item, imgslist, imgsidlist)

    def FilterIngo(self,item, imgslist, imgsidlist):
        '''
        Filter out reviews and images that are already in the database.
        '''
        item2 = []
        imgslist2 = []
        imgsidlist2 = []
        for i in item:
            sql1 = "SELECT reviewId FROM gz_comment_groupbuy WHERE reviewId = '" + i['reviewId'] + "' AND source = 2"
            try:
                self.db.set_character_set('utf8')
                self.cursor.execute(sql1)
            except:
                print u"MYSQL读取存在错误"
            else:
                results = self.cursor.fetchall()
                if len(results) == 0 :
                    item2.append(i)
        for j , z in zip(imgslist, imgsidlist):
            sql2 = "SELECT comment_id FROM gz_comment_photo WHERE comment_id = '" + z + "'"
            try:
                self.db.set_character_set('utf8')
                self.cursor.execute(sql2)
            except:
                print u"MYSQL读取存在错误"
            else:
                results = self.cursor.fetchall()
                if len(results) == 0:
                    imgslist2.append(j)
                    imgsidlist2.append(z)

        self.SaveMysql(item2, imgslist2, imgsidlist2, 'gz_comment_groupbuy')

    def SaveMysql(self, dictlist ,imgs,imgids, table):
        try:
            SqlList = []
            for my_dict in dictlist:
                self.db.set_character_set('utf8')
                cols = ','.join(my_dict.keys())
                values = '","'.join(my_dict.values())
                sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, '"'+values+'"')
                SqlList.append(sql)

            if len(imgs) > 0:
                a = str(2)
                for img,imgid in zip(imgs,imgids):
                    sqlimg = "INSERT INTO gz_comment_photo(comment_id,comment_photo,type) VALUES ('" + imgid + "', '" + img + "', '" + a + "')"
                    SqlList.append(sqlimg)

            for SqlInsert in SqlList:
                self.sql = SqlInsert
                # print self.sql
                try:
                    result = self.cursor.execute(self.sql)
                    insert_id = self.db.insert_id()
                    self.db.commit()
                    # check whether the insert succeeded
                    if result:
                        print "Insert OK: %s" % insert_id
                    else:
                        print "Nothing was inserted"
                except MySQLdb.Error, e:
                    print(e)
                    # roll back on error
                    self.db.rollback()
                    # a duplicate primary key means this row is already stored
                    if "key 'PRIMARY'" in e.args[1]:
                        print "Row already exists, nothing inserted"
                    else:
                        print "Insert failed, reason %d: %s" % (e.args[0], e.args[1])
        except MySQLdb.Error, e:
            print "Database error, reason %d: %s" % (e.args[0], e.args[1])

if __name__ == '__main__':
    print u"爬取初始化....."
    print u"爬取初始化........"
    time.sleep(2)
    print u"---开始爬取评论---"
    InsertId = 1
    proxys = proxy()
    Ipitem = proxys.IP()
    spider = MtComment()
    CommentsUrls,CommentWmIds,CommentShopNames,GroupIds = spider.DefineCommentUrl()
    counts = len(CommentsUrls)
    for OpenUrl,Wid,shopname,GroupId in zip(CommentsUrls,CommentWmIds,CommentShopNames,GroupIds):
        # print OpenUrl,GroupId
        print InsertId
        print counts
        timesO = random.uniform(1, 1.5)
        time.sleep(timesO)
        # every 100 shops, take a longer break to stay under the radar
        if InsertId % 100 == 0:
            time.sleep(random.uniform(10, 30))
        try:
            item, imgslist, imgsidlist, conmentdate, shopid = spider.GetComments(OpenUrl, Wid, shopname, GroupId, Ipitem)
        except:
            print u"Resting 30 seconds before trying the next one"
            time.sleep(30)
        else:
            spider.CheckInfo(item, imgslist, imgsidlist, conmentdate, shopid)
        InsertId += 1
        counts -= 1
    spider.db.close()
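
One caveat on SaveMysql above: building INSERT statements by string concatenation breaks as soon as a review contains a double quote, and it leaves the door open to SQL injection. A sketch of a parameterized alternative (same MySQLdb connection; insert_row is a hypothetical helper, not part of the script above):

def insert_row(cursor, db, table, row):
    # row is a dict of column -> value; %s placeholders let MySQLdb do the escaping
    cols = ', '.join(row.keys())
    placeholders = ', '.join(['%s'] * len(row))
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, placeholders)
    cursor.execute(sql, list(row.values()))
    db.commit()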






The URLs (i.e. the shop IDs) are prepared in advance and stored in MySQL; the ShopId helper below reads them out.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import MySQLdb

class ShopId:
    def __init__(self):
        self.db = MySQLdb.connect("192.168.22.6", "liulin2", "root", "spider")
        self.cursor = self.db.cursor()
        self.sql = ''

    def GetUrlId(self):
        List_shopid = []
        List_shop_id = []
        List_name = []
        self.sql = "SELECT shop_no,name,REPLACE(mshopid,'L','') as '"'mshopid'"' from dimension_table2 where mshopid <> '' and mshopid <> 0 and close = '0' and mshopid not in ('95335250','40973512','40000330','41490570','41059144','42043645','41730837','42748619','42445858','50024048','50257793','50685422','50811352','42709034','6850552','88064482','42054004','64357828','68484797','68677165','6685012','41540120','60896108','86458997','84264183','50715952','69440092','70170017','76671852','41492423','40682157','52975074','90781236','41056058','41917130','79808784','5041075','41385806','65791943','35290034','4799755','40207457','40179897','41260698','6559530','5107260','40324435','66298404','66073336','42378816','74661183','83161706')"
        # print(self.sql)
        try:
            # run the query
            self.db.set_character_set('utf8')
            self.cursor.execute(self.sql)
            # fetch all rows
            results = self.cursor.fetchall()
            for row in results:
                shopid = row[0]
                name = row[1]
                shop_id = row[2]
                List_shopid.append(shopid)
                List_name.append(name)
                List_shop_id.append(shop_id)
        except:
            print "Error: unable to fecth data"
        else:
            return (List_shopid,List_name,List_shop_id)
        finally:
            self.db.close()



Next, the proxy module.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests

class proxy:

    def __init__(self):
        self.pageIndex = None
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        self.Connection = 'keep-alive'
        self.Accept_Language = 'zh-CN,zh;q=0.8'
        # default request headers
        self.headers = {'User-Agent': self.user_agent,
                        'Connection': self.Connection,
                        'Accept-Language': self.Accept_Language,
                        }

    def test_proxy(self,ip):
        '''Check whether a proxy works; free proxies are quite unstable, so the hit rate is low.'''
        try:
            r = requests.get('http://t.dianping.com/', proxies={'http': ip, 'https': ip}, headers=self.headers)
            status = r.status_code
            # judged mainly by the returned status code
        except:
            pass
        else:
            if status == 200:
                return ip
            else:
                return None

    def IP(self):
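        '''Read candidate proxy IPs from proxys.txt; entries are separated by "-".'''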
        item = []
        with open('proxys.txt', 'r') as a:
            x = a.read().strip()
            y = x.split('-')
            for i in y:
                item.append(i)
            return item
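
A quick way to wire the proxy helper up (a sketch; it assumes a proxys.txt file already sits next to the script, as the IP() method above expects):

if __name__ == '__main__':
    p = proxy()
    candidates = p.IP()
    # keep only the proxies that actually answer
    live = [ip for ip in candidates if p.test_proxy(ip)]
    print 'usable proxies: %d of %d' % (len(live), len(candidates))
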
Finally, the user-agent list.

import random


user_agent_list = [\
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "  ,
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"  ,
]



