python爬虫学习(下)——爬虫代码实现

上篇分析出了数据获取的完整路径,下面对应介绍具体的代码实现


注:代码说明、我的一些总结心得都放到了代码注释里


整个程序主要由以下几个类组成:

Class Car:汽车模型,存储每个车的信息

Class CarFactory:传入获取的网络数据,生产出汽车模型

Class CarSpider:爬虫的主体类,串联整个业务

Class DataSaver:负责数据库操作,数据存储

Class RequestThread:后期我把请求改成了多线程操作,引入了这个类


具体代码及注释:

# Driver script: crawl everything first, then persist the result.
# The CarSpider object is responsible for fetching the data.
spider = CarSpider()
cars = spider.getDatas()

# The DataSaver object is responsible for writing the cars to the database.
dataSaver = DataSaver()
dataSaver.updateCarsData(cars)
print "finish"

CarSpider:
class CarSpider:
    """Crawler that ties the whole pipeline together.

    The pipeline is: per-letter series list pages -> series home-page URLs
    -> series configuration-page URLs -> downloaded configuration pages ->
    car models built by CarFactory.
    """

    def __init__(self):
        # Instance attributes belong in __init__ so every spider owns its list.
        self.allCars = []

    # Public entry point.
    def getDatas(self):
        """Run the full crawl and return the list of car models."""
        # Fetch the series list page of every initial letter.
        carList = self.__getCarSeriesListByInitialChar()
        # Extract the home-page URL of every series from those pages.
        carUrlList = self.__getAllCarSeriesIndexUrl(carList)
        # Derive the configuration-detail page URL of every series.
        urlQueue = self.__getCarSeriesInfoUrls(carUrlList)
        # Download the detail pages and build the car models.
        cars = self.__getCarsBycarSeriesInfoUrls(urlQueue)
        return cars

    # Private method (the name starts with two underscores).
    def __gzipDecode(self,response):
        """Return the response body, gunzipped when the server compressed it."""
        body = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            compressedstream = StringIO.StringIO(body)
            return gzip.GzipFile(fileobj=compressedstream).read()
        return body

    def __getCarSeriesListByInitialChar(self):
        """Download the series list page for each letter A-Z.

        Returns a list of decoded (unicode) page bodies. A letter whose
        request fails is reported and skipped.
        """
        dataList = []
        for i in range(65,91):
            url = 'http://www.autohome.com.cn/grade/carhtml/%c.html'%(chr(i))
            try:
                response = urllib2.urlopen(url)
            except urllib2.URLError as e:
                # Skip this letter: without a fresh response there is nothing
                # to decode (falling through would hit an unbound or stale
                # `response`).
                print(e.reason)
                continue
            zipDecodeData = self.__gzipDecode(response)
            # The page contains Chinese text encoded as GBK.
            dataList.append(zipDecodeData.decode("gbk"))
        return dataList

    def __getAllCarSeriesIndexUrl(self,dataList):
        """Extract series home-page URLs from the list pages.

        Matches whose captured text contains 'greylink' are dropped: they
        mark series whose information is incomplete or outdated.
        """
        carInfoUrlList = []
        # The parenthesised group captures the href value.
        pattern = re.compile(r'<h4.*><a href="(.*)">.*</a></h4>', re.M)
        for data in dataList:
            for carUrl in re.findall(pattern, data):
                if carUrl.find('greylink') == -1:
                    carInfoUrlList.append(carUrl)
        return carInfoUrlList

    def __getCarSeriesInfoUrls(self,carUrlList):
        """Build a Queue of configuration-page URLs from series home-page URLs.

        The series id is the first run of digits in the home-page URL. URLs
        without any digits are skipped instead of raising.
        """
        urlQueue = Queue()
        # Compiled once; the pattern does not depend on the URL.
        pattern = re.compile(r'[0-9]+',re.M)
        for carUrl in carUrlList:
            match = re.search(pattern,carUrl)
            if match is None:
                continue
            # Assemble the detail-page URL following the observed URL scheme.
            carSeriesInfoUrl = 'http://car.autohome.com.cn/config/series/%d.html'%(int(match.group()))
            urlQueue.put(carSeriesInfoUrl)
        return urlQueue

    def __getCarsBycarSeriesInfoUrls(self,urlQueue):
        """Download every queued detail page and return all parsed cars.

        Only the HTTP requests run in worker threads; parsing stays on the
        calling thread.
        """
        ouputDataQueue = Queue()
        threads = []
        for i in range(0,4):
            requestThread = RequestThread(i,urlQueue,ouputDataQueue)
            threads.append(requestThread)
            # Do not join here: joining inside this loop would block the
            # loop itself and serialise the workers.
            requestThread.start()

        for requestThread in threads:
            requestThread.join()

        return self.__parseOutputQueue(ouputDataQueue)

    def __parseOutputQueue(self,ouputDataQueue):
        """Drain (page_bytes, series_url) tuples and append the cars they describe.

        Pages without a `var config = {...};` assignment are ignored.
        Returns self.allCars.
        """
        pattern = re.compile(r'var config = ({.*};)', re.M)
        while not ouputDataQueue.empty():
            # One get() per item: body and URL must come from the same tuple.
            outputData, seriesInfoUrl = ouputDataQueue.get()
            try:
                data = outputData.decode("gbk")
                result = re.findall(pattern, data)
                if result:
                    # Drop the trailing ';' so the text is plain JSON.
                    infoJsonStr = result[0][0:-1]
                    infoDict = json.loads(infoJsonStr)
                    # CarFactory turns the parsed dict into car models.
                    carFactory = CarFactory(infoDict, seriesInfoUrl)
                    self.allCars.extend(carFactory.analysisData())
            finally:
                # Every get() is balanced by a task_done(), including for
                # pages that carried no config.
                ouputDataQueue.task_done()
        return self.allCars


CarFactory:
class CarFactory:
    """Builds Car models from the parsed configuration data of one series."""

    def __init__(self,carsData,carSeriesUrl):
        """Store the parsed config dict and the URL of the series page.

        carsData is indexed below as carsData["result"]["specsList"] and
        carsData["result"]["paramtypeitems"].
        """
        self.carsData = carsData
        self.carSeriesUrl = carSeriesUrl
        self.cars = []
        self.carsNum = 0

    def __setCarsParam(self,param,values):
        """Set attribute `param` on the i-th car from values[i]["value"].

        NOTE(review): assumes `values` has at least self.carsNum entries,
        in the same order as self.cars -- confirm against the page data.
        """
        for i in range(self.carsNum):
            car = self.cars[i]
            value = values[i]["value"]
            # Reflection: assign the attribute by name (this goes through
            # the car's __setattr__).
            setattr(car,param,value)

    def analysisData(self):
        """Create one Car per spec, fill in its parameters, return self.cars.

        Cars are appended to self.cars without clearing it first, so a
        second call on the same factory adds the specs again.
        """
        # A single series contains several specs (car variants).
        specsList = self.carsData["result"]["specsList"]
        for spec in specsList:
            car = Car();
            car.specid = spec["specid"]
            car.infoUrl = self.carSeriesUrl
            self.cars.append(car)
        self.carsNum = len(self.cars)

        # Parameters are grouped by type; each parameter carries a list of
        # per-car values under "valueitems".
        paramTypeItems = self.carsData["result"]["paramtypeitems"]
        for paramTypeItem in paramTypeItems:
            paramTypeName = paramTypeItem["name"]
            paramItems = paramTypeItem["paramitems"]
            if paramTypeName == u"基本参数":
                for param in paramItems:
                    paramName = param["name"]
                    values = param["valueitems"]
                    if paramName == u"厂商":
                        self.__setCarsParam("family",values)
                    .......

            elif paramTypeName == u"发动机":
                for param in paramItems:
                    paramName = param["name"]
                    values = param["valueitems"]
                    if paramName == u"排量(L)":
                        self.__setCarsParam("sv", values)
                    elif paramName == u"最大马力(Ps)":
                        self.__setCarsParam("hpower", values)
                    ..........

                    ..........

                    ..........

        return self.cars

Car:

# One way to emulate an enum in Python: a class holding integer constants.
class FuelType:
    """Fuel kinds, represented as small integers."""
    Gasoline, Diesel = 0, 1

class GearType:
    """Transmission kinds, represented as small integers."""
    MT, AUTO, DCT, CVT = 0, 1, 2, 3
........

class Car:
    """Model of a single car spec; raw scraped values are normalised on assignment."""

    def __init__(self):
        # Instance variables are defined in __init__.
        # Every `self.xxx = ...` below also goes through __setattr__, so the
        # defaults are normalised the same way as scraped values.
        self.specid = 0L  # id, long
        self.family = None  # manufacturer
        self.name = None  # model name
        self.price = u"0万"  # price; stored as a float by __setattr__
        self.level = None  # class / segment
        self.maxSpeed = 0  # top speed
	..........

    # CarFactory assigns attributes by reflection (setattr), and Python has no
    # separate default setter per attribute, so all conversion logic lives in
    # __setattr__.
    def __setattr__(self, key, value):
        """Convert `value` according to `key` and store it in self.__dict__.

        Keys without a dedicated branch are stored as float when float(value)
        succeeds, otherwise unchanged.
        """
        if key == 'specid':
            self.__dict__[key] = int(value)
        elif key == 'isTurbo':
            # False when the text contains u'自然', True otherwise.
            if value.find(u'自然') != -1:
                # __dict__ is the mapping from attribute name to attribute
                # value; writing to it directly avoids re-entering __setattr__.
                self.__dict__[key] = False
            else:
                self.__dict__[key] = True
        elif key == 'price':
            # Quantifiers are greedy by default in Python: they match as many
            # characters as possible. Appending ? to "*", "?", "+" or "{m,n}"
            # makes them non-greedy, so (.*?) stops at the first u'万'.
            # NOTE(review): float() raises ValueError if the text before
            # u'万' is not numeric -- confirm the possible price formats.
            pattern = re.compile(ur'(.*?)万.*', re.M)
            result = re.findall(pattern,value)
            if len(result) > 0:
                self.__dict__[key] = float(result[0])
            else:
                self.__dict__[key] = float(0)
        # NOTE(review): the key is spelled 'fuleType' (not 'fuelType'); it only
        # takes effect if the caller uses the same spelling -- verify against
        # CarFactory.
        elif key == 'fuleType':
            # Anything that does not mention u'汽油' is stored as Diesel.
            if value.find(u'汽油') != -1:
                self.__dict__[key] = FuelType.Gasoline
            else:
                self.__dict__[key] = FuelType.Diesel
        elif key == 'gearType':
            # Checked in order: manual, dual-clutch, CVT; everything else is AUTO.
            pattern = re.compile(ur'手动|MT', re.M)
            result = re.search(pattern,value)
            if result != None:
                self.__dict__[key] = GearType.MT
            else:
                pattern = re.compile(ur'双离合|DSG|DCT|PDK|tronic|MDKG|power shift', re.M)
                result = re.search(pattern, value)
                if result != None:
                    self.__dict__[key] = GearType.DCT
                else:
                    # NOTE(review): u'无极' may be a typo for u'无级' (as in
                    # 无级变速, CVT); as written, text containing only u'无级'
                    # falls through to AUTO -- confirm against real data.
                    pattern = re.compile(ur'无极|CVT', re.M)
                    result = re.search(pattern, value)
                    if result != None:
                        self.__dict__[key] = GearType.CVT
                    else:
                        self.__dict__[key] = GearType.AUTO
        ..........
        else:
            # Fallback: keep numbers as float, leave everything else as given
            # (float(None) / float(u'text') raise and are caught here).
            try:
                number = float(value)
                self.__dict__[key] = number
            except StandardError:
                self.__dict__[key] = value

RequestThread:

class RequestThread(threading.Thread):
    """Worker thread that downloads URLs taken from a shared queue.

    Each successfully downloaded page is put on `outputQueue` as a
    (body_bytes, url) tuple. The URL travels with the body because it is
    stored alongside the car data later.
    """

    def __init__(self,threadId,queue,outputQueue):
        """Remember the worker id, the URL queue and the result queue."""
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.queue = queue
        self.outputQueue = outputQueue

    def gzipDecode(self,response):
        """Return the response body, gunzipped when Content-Encoding is gzip."""
        # Imported locally so this class does not depend on the file's
        # import block.
        import gzip
        import io
        body = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            return gzip.GzipFile(fileobj=io.BytesIO(body)).read()
        return body

    def run(self):
        """Fetch URLs until the input queue is empty."""
        try:
            from Queue import Empty  # Python 2 module name
        except ImportError:
            from queue import Empty  # Python 3 module name

        while True:
            # get_nowait() instead of qsize()+get(): between a size check and
            # a blocking get() another worker may take the last URL, which
            # would leave this thread blocked forever.
            try:
                url = self.queue.get_nowait()
            except Empty:
                return
            try:
                print("%d: url:%s %d\n" % (self.threadId, url, self.queue.qsize()))
                response = urllib2.urlopen(url)
                try:
                    data = self.gzipDecode(response)
                finally:
                    response.close()
                self.outputQueue.put((data,url))
            except urllib2.URLError as e:
                print(e.reason)
            finally:
                # Mark the URL as processed only once the work on it is over.
                self.queue.task_done()


DataSaver:

class DataSaver:
    """Persists car models into the Cars table of CarDB.sqlite."""

    def __init__(self):
        """Open the SQLite database file in the current working directory."""
        self.db = sqlite3.connect("CarDB.sqlite")
        print(self.db)

    def updateCarsData(self,cars):
        """Replace the whole content of the Cars table with `cars`.

        The delete and all inserts run in one transaction: either the table
        ends up holding exactly `cars`, or (on error) the previous content
        is kept. The connection is closed afterwards in both cases, so this
        object is single-use, as before.
        """
        try:
            rows = [
                (car.specid, car.name, car.family, car.price, car.level,
                 car.maxSpeed, car.accelerate, car.sv, car.hpower, car.mpower)
                for car in cars
            ]
            # Used as a context manager the connection commits once on
            # success and rolls back on an exception, instead of committing
            # after every row.
            with self.db:
                # Simple and crude: drop all previous rows first.
                self.db.execute("delete from Cars")
                self.db.executemany("insert into Cars (specid,name,family,price,level,maxSpeed,accelerate,sv,hpower,mpower) VALUES (?,?,?,?,?,?,?,?,?,?)", rows)
        finally:
            self.db.close()

以上便是爬虫的主要代码,本人刚刚接触python,文中若有错误或不妥之处,望大家多多指教,谢谢。









  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值