# -*- coding: utf-8 -*-
import urllib2
import time
import xlwt
import re
import random
from lxml import etree
from threading import Thread
from Queue import Queue
# csv, socket and the local ip_pool module were only used by the disabled
# proxy-rotation code; re-enable these imports together with it.
# import csv
# import socket
# import ip_pool

# Shutdown flags polled by the worker threads
URL_EXIT = False
PARSE_EXIT = False
COLLECT_EXIT = False
class urlCollect(Thread):
    """Fetches listing pages and extracts the detail-page links."""
    def __init__(self, urlQueue, pageQueue):
        super(urlCollect, self).__init__()
        self.urlQueue = urlQueue
        self.pageQueue = pageQueue
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
        }

    def run(self):
        while not URL_EXIT:
            try:
                url = self.urlQueue.get(False)
                request = urllib2.Request(url, headers=self.headers)
                time.sleep(3)  # throttle: the site bans IPs under dense request load
                response = urllib2.urlopen(request)
                text = response.read()
                # Pull the detail-page link out of each listing entry
                pattern = re.compile(r'<a class="title ell" target="_blank" href="(.*?)"')
                links = pattern.findall(text)
                for link in links:
                    self.pageQueue.put(link)
            except:
                pass
class pageCollect(Thread):
    """Downloads each detail page and queues the raw HTML."""
    def __init__(self, pageQueue, dataQueue):
        super(pageCollect, self).__init__()
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        # self.proxy_pool = proxy_pool
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
        }

    def run(self):
        while not COLLECT_EXIT:
            try:
                time.sleep(4)  # throttle to stay under the IP ban threshold
                url = self.pageQueue.get(False)
                # Proxy rotation was tested but performed poorly, so it stays disabled:
                # proxy_ip = random.choice(self.proxy_pool)
                # proxy_handler = urllib2.ProxyHandler({"http": proxy_ip})
                # opener = urllib2.build_opener(proxy_handler)
                # urllib2.install_opener(opener)
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request)
                self.dataQueue.put(response.read())
                # On a proxy failure the url could be re-queued instead:
                # except:
                #     self.pageQueue.put(url)
            except:
                pass
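
# If proxy rotation were revived: the attempt commented out above passed a
# proxies= keyword to urllib2.Request, which urllib2 does not accept, and
# urllib2.install_opener swaps the proxy globally across all threads. A
# minimal per-request sketch instead (the proxy_pool list and its "host:port"
# entry format are assumptions; this helper is illustrative, not wired into
# the crawler):
def fetch_via_proxy(url, headers, proxy_pool):
    # A private opener keeps the chosen proxy local to this one request,
    # so concurrent threads cannot clobber each other's settings.
    proxy_ip = random.choice(proxy_pool)
    opener = urllib2.build_opener(urllib2.ProxyHandler({"http": proxy_ip}))
    request = urllib2.Request(url, headers=headers)
    return opener.open(request, timeout=10).read()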
class contentParse(Thread):
    """Parses each detail page with XPath and queues a result dict."""
    def __init__(self, dataQueue, resultQueue):
        super(contentParse, self).__init__()
        self.dataQueue = dataQueue
        self.resultQueue = resultQueue

    def run(self):
        i = 1
        while not PARSE_EXIT:
            try:
                result = {}
                text = self.dataQueue.get(False)
                html = etree.HTML(text)
                # These absolute XPaths are tied to the current page layout
                # and will break if the site changes its markup.
                # loan amount
                result['price'] = html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/dl[1]/dd/text()')[0]
                # annual interest rate
                result['rate'] = html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/dl[2]/dd/text()')[0]
                # term
                result['date'] = html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/dl[3]/dd/text()')[0] + html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/dl[3]/dd/em/text()')[0]
                # gender
                result['sex'] = html.xpath('/html/body/div[3]/div[3]/div[1]/div/div[1]/p[1]/span/text()')[0]
                # age
                result['age'] = html.xpath('/html/body/div[3]/div[3]/div[1]/div/div[1]/p[2]/span/text()')[0]
                # education level
                result['wenhua'] = html.xpath('/html/body/div[3]/div[3]/div[1]/div/div[2]/p[1]/span/text()')[0]
                # repayment method, stripped of surrounding whitespace
                result['func'] = html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[3]/div[1]/div/text()')[0].strip()
                # verification: only the first list item is checked
                result['renzheng'] = html.xpath('/html/body/div[3]/div[3]/div[2]/ul/li[1]/text()')[0]
                # number of successful loans
                result['times'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[1]/p[1]/span/text()')[0]
                # date of the first successful loan
                result['first'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[1]/p[2]/span/text()')[0]
                # history
                result['history'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/p[2]/span/text()')[0]
                # number of successful repayments
                result['succ'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/p[4]/span/text()')[0]
                # The repayment-statistics blocks shift down by one div when an
                # extra section is present, so probe div[4] to find the offset.
                xianshi = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[4]/span/text()')
                if len(xianshi) == 0:
                    # repaid on time / overdue 0-15 days / overdue over 15 days
                    result['huanqing'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[4]/p[1]/span/text()')[0]
                    result['yuqi1'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[4]/p[2]/span/text()')[0]
                    result['yuqi2'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[4]/p[3]/span/text()')[0]
                    weilai = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/text()')
                    guoqu = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/text()')
                    if len(weilai) == 2:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[2]/span/text()')[0]
                    elif len(guoqu) == 2:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[2]/span/text()')[0]
                    else:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[2]/span/text()')[0]
                else:
                    result['huanqing'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/p[1]/span/text()')[0]
                    result['yuqi1'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/p[2]/span/text()')[0]
                    result['yuqi2'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/p[3]/span/text()')[0]
                    weilai = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/text()')
                    guoqu = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/text()')
                    if len(weilai) == 2:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[2]/span/text()')[0]
                    elif len(guoqu) == 2:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[2]/span/text()')[0]
                    else:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[10]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[10]/p[2]/span/text()')[0]
                i = i + 1
                print i  # progress counter
                self.resultQueue.put(result)
            except:
                pass
class parseData:
    """Drains the result queue and writes everything to an .xls workbook."""
    def __init__(self, resultQueue):
        self.resultQueue = resultQueue

    def parse(self):
        i = 1
        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet('data', cell_overwrite_ok=True)
        columns = ['Loan amount', 'Annual interest rate', 'Term', 'Gender',
                   'Age', 'Education', 'Repayment method', 'Degree verified',
                   'ID verified', 'Phone verified', 'Successful loans',
                   'First successful loan date', 'History',
                   'Successful repayments', 'Loans repaid on time',
                   'Overdue 0-15 days', 'Overdue over 15 days',
                   'Total amount borrowed', 'Outstanding repayments',
                   'Outstanding collections', 'Largest single loan',
                   'Highest historical debt']
        for col, title in enumerate(columns):
            sheet.write(0, col, title)
        while True:
            try:
                # get(False) raises Empty once the queue is drained
                result = self.resultQueue.get(False)
                sheet.write(i, 0, result['price'])
                sheet.write(i, 1, result['rate'])
                sheet.write(i, 2, result['date'])
                sheet.write(i, 3, result['sex'])
                sheet.write(i, 4, result['age'])
                sheet.write(i, 5, result['wenhua'])
                sheet.write(i, 6, result['func'])
                # '学历认证' is the site's label for degree verification; it is
                # listed first when present, so only the first item is compared
                if result['renzheng'].encode('utf-8') == '学历认证':
                    sheet.write(i, 7, 'Degree verified')
                else:
                    sheet.write(i, 7, " ")
                # ID and phone verification are recorded for every row
                sheet.write(i, 8, 'ID verified')
                sheet.write(i, 9, 'Phone verified')
                sheet.write(i, 10, result['times'])
                sheet.write(i, 11, result['first'])
                sheet.write(i, 12, result['history'])
                sheet.write(i, 13, result['succ'])
                sheet.write(i, 14, result['huanqing'])
                sheet.write(i, 15, result['yuqi1'])
                sheet.write(i, 16, result['yuqi2'])
                sheet.write(i, 17, result['leiji'])
                sheet.write(i, 18, result['daihuan'])
                sheet.write(i, 19, result['daishou'])
                sheet.write(i, 20, result['danbi'])
                sheet.write(i, 21, result['zuigao'].strip())
                i += 1
            except:
                break
        book.save('result1.xls')
def main(pn):
    global URL_EXIT, COLLECT_EXIT, PARSE_EXIT
    # Loading a proxy pool (via ip_pool / ips.csv) was tested but did not
    # help much, so it stays disabled:
    # proxy_pool = []
    # ip_pool.IPspider(10)
    # reader = csv.reader(open('ips.csv'))
    # for row in reader:
    #     proxy_pool.append(row[0] + ':' + row[1])

    # stage 1: listing-page URLs to crawl
    urlQueue = Queue()
    # detail-page links extracted from each listing
    pageQueue = Queue()
    # raw HTML of each detail page
    dataQueue = Queue()
    # parsed result dicts
    resultQueue = Queue()
    for i in range(1, int(pn) + 1):
        # PageIndex must walk 1..pn, hence str(i) rather than pn
        fullurl = "http://invest.ppdai.com/loan/listnew?LoanCategoryId=4&SortType=0&PageIndex=" + str(i) + "&MinAmount=0&MaxAmount=0"
        urlQueue.put(fullurl)
    uThread = []
    for threadname in range(1, 3):
        thread = urlCollect(urlQueue, pageQueue)
        thread.start()
        uThread.append(thread)
    cThread = []
    for threadname in range(1, 3):
        thread = pageCollect(pageQueue, dataQueue)
        thread.start()
        cThread.append(thread)
    pThread = []
    for threadname in range(1, 2):
        thread = contentParse(dataQueue, resultQueue)
        thread.start()
        pThread.append(thread)
    # Wait for each stage's queue to drain, then signal its workers to exit.
    while not urlQueue.empty():
        time.sleep(0.5)  # sleep instead of busy-spinning
    URL_EXIT = True
    for thread in uThread:
        thread.join()
    while not pageQueue.empty():
        time.sleep(0.5)
    COLLECT_EXIT = True
    for thread in cThread:
        thread.join()
    while not dataQueue.empty():
        time.sleep(0.5)
    PARSE_EXIT = True
    for thread in pThread:
        thread.join()
    print 'Exporting data...'
    parse = parseData(resultQueue)
    parse.parse()


if __name__ == "__main__":
    print '============================================='
    print '|  Output is saved to the current directory |'
    print '|  Data source: ppdai.com                   |'
    print '============================================='
    pn = raw_input('Number of pages to crawl: ')
    # start_time = time.time()
    main(pn)
    # end_time = time.time()
    # print '----ok----'
    # print 'Elapsed: %.2f s' % (end_time - start_time)
Written in Python 2. The multithreading here is mostly for show: this site bans your IP when requests come in too densely, so the thread count has to be kept low anyway (I also tested an IP proxy pool, and it did not work well).
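
Since the real constraint is the per-IP rate limit rather than CPU, a single-threaded fetcher with a fixed delay and exponential backoff on failures is arguably a better fit than tuning thread counts. A minimal sketch under that assumption (the delay and retry values below are guesses, not measured limits of the site):

import time
import urllib2

def polite_fetch(url, headers, base_delay=4, retries=3):
    # Wait before every attempt, doubling the pause after each failure
    # (4 s, 8 s, 16 s with the defaults above).
    for attempt in range(retries):
        time.sleep(base_delay * (2 ** attempt))
        try:
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=15).read()
        except urllib2.URLError:
            continue  # probably throttled or banned; back off and retry
    return None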