# -*- coding: utf-8 -*-
import urllib2
import time
import xlwt
import re
import random
from lxml import etree
from threading import Thread
from Queue import Queue
# csv, socket and the local ip_pool module were only used by the disabled
# proxy-rotation code; re-enable these imports together with it.
# import csv
# import socket
# import ip_pool

# Shutdown flags polled by the worker threads
URL_EXIT = False
PARSE_EXIT = False
COLLECT_EXIT = False
class urlCollect(Thread):
    """Fetches listing pages and extracts the detail-page links."""
    def __init__(self, urlQueue, pageQueue):
        super(urlCollect, self).__init__()
        self.urlQueue = urlQueue
        self.pageQueue = pageQueue
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
        }

    def run(self):
        while not URL_EXIT:
            try:
                url = self.urlQueue.get(False)
                request = urllib2.Request(url, headers=self.headers)
                time.sleep(3)  # throttle: the site bans IPs under dense request load
                response = urllib2.urlopen(request)
                text = response.read()
                # Pull the detail-page link out of each listing entry
                pattern = re.compile(r'<a class="title ell" target="_blank" href="(.*?)"')
                links = pattern.findall(text)
                for link in links:
                    self.pageQueue.put(link)
            except:
                pass
class pageCollect(Thread):
    """Downloads each detail page and queues the raw HTML."""
    def __init__(self, pageQueue, dataQueue):
        super(pageCollect, self).__init__()
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        # self.proxy_pool = proxy_pool
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
        }

    def run(self):
        while not COLLECT_EXIT:
            try:
                time.sleep(4)  # throttle to stay under the IP ban threshold
                url = self.pageQueue.get(False)
                # Proxy rotation was tested but performed poorly, so it stays disabled:
                # proxy_ip = random.choice(self.proxy_pool)
                # proxy_handler = urllib2.ProxyHandler({"http": proxy_ip})
                # opener = urllib2.build_opener(proxy_handler)
                # urllib2.install_opener(opener)
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request)
                self.dataQueue.put(response.read())
                # On a proxy failure the url could be re-queued instead:
                # except:
                #     self.pageQueue.put(url)
            except:
                pass
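
# If proxy rotation were revived: the attempt commented out above passed a
# proxies= keyword to urllib2.Request, which urllib2 does not accept, and
# urllib2.install_opener swaps the proxy globally across all threads. A
# minimal per-request sketch instead (the proxy_pool list and its "host:port"
# entry format are assumptions; this helper is illustrative, not wired into
# the crawler):
def fetch_via_proxy(url, headers, proxy_pool):
    # A private opener keeps the chosen proxy local to this one request,
    # so concurrent threads cannot clobber each other's settings.
    proxy_ip = random.choice(proxy_pool)
    opener = urllib2.build_opener(urllib2.ProxyHandler({"http": proxy_ip}))
    request = urllib2.Request(url, headers=headers)
    return opener.open(request, timeout=10).read()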
class contentParse(Thread):
    """Parses each detail page with XPath and queues a result dict."""
    def __init__(self, dataQueue, resultQueue):
        super(contentParse, self).__init__()
        self.dataQueue = dataQueue
        self.resultQueue = resultQueue

    def run(self):
        i = 1
        while not PARSE_EXIT:
            try:
                result = {}
                text = self.dataQueue.get(False)
                html = etree.HTML(text)
                # These absolute XPaths are tied to the current page layout
                # and will break if the site changes its markup.
                # loan amount
                result['price'] = html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/dl[1]/dd/text()')[0]
                # annual interest rate
                result['rate'] = html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/dl[2]/dd/text()')[0]
                # term
                result['date'] = html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/dl[3]/dd/text()')[0] + html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/dl[3]/dd/em/text()')[0]
                # gender
                result['sex'] = html.xpath('/html/body/div[3]/div[3]/div[1]/div/div[1]/p[1]/span/text()')[0]
                # age
                result['age'] = html.xpath('/html/body/div[3]/div[3]/div[1]/div/div[1]/p[2]/span/text()')[0]
                # education level
                result['wenhua'] = html.xpath('/html/body/div[3]/div[3]/div[1]/div/div[2]/p[1]/span/text()')[0]
                # repayment method, stripped of surrounding whitespace
                result['func'] = html.xpath('/html/body/div[3]/div[2]/div/div[1]/div[3]/div[1]/div/text()')[0].strip()
                # verification: only the first list item is checked
                result['renzheng'] = html.xpath('/html/body/div[3]/div[3]/div[2]/ul/li[1]/text()')[0]
                # number of successful loans
                result['times'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[1]/p[1]/span/text()')[0]
                # date of the first successful loan
                result['first'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[1]/p[2]/span/text()')[0]
                # history
                result['history'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/p[2]/span/text()')[0]
                # number of successful repayments
                result['succ'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/p[4]/span/text()')[0]
                # The repayment-statistics blocks shift down by one div when an
                # extra section is present, so probe div[4] to find the offset.
                xianshi = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[4]/span/text()')
                if len(xianshi) == 0:
                    # repaid on time / overdue 0-15 days / overdue over 15 days
                    result['huanqing'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[4]/p[1]/span/text()')[0]
                    result['yuqi1'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[4]/p[2]/span/text()')[0]
                    result['yuqi2'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[4]/p[3]/span/text()')[0]
                    weilai = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/text()')
                    guoqu = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/text()')
                    if len(weilai) == 2:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[2]/span/text()')[0]
                    elif len(guoqu) == 2:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[2]/span/text()')[0]
                    else:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[2]/span/text()')[0]
                else:
                    result['huanqing'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/p[1]/span/text()')[0]
                    result['yuqi1'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/p[2]/span/text()')[0]
                    result['yuqi2'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[5]/p[3]/span/text()')[0]
                    weilai = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/text()')
                    guoqu = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/text()')
                    if len(weilai) == 2:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[6]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[2]/span/text()')[0]
                    elif len(guoqu) == 2:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[7]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[2]/span/text()')[0]
                    else:
                        result['leiji'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[8]/p/span/text()')[0]
                        result['daihuan'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[1]/span/text()')[0]
                        result['daishou'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[9]/p[2]/span/text()')[0].strip()
                        result['danbi'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[10]/p[1]/span/text()')[0]
                        result['zuigao'] = html.xpath('/html/body/div[3]/div[3]/div[3]/div/div[10]/p[2]/span/text()')[0]
                i = i + 1
                print i  # progress counter
                self.resultQueue.put(result)
            except:
                pass
class parseData:
    """Drains the result queue and writes everything to an .xls workbook."""
    def __init__(self, resultQueue):
        self.resultQueue = resultQueue

    def parse(self):
        i = 1
        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet('data', cell_overwrite_ok=True)
        columns = ['Loan amount', 'Annual interest rate', 'Term', 'Gender',
                   'Age', 'Education', 'Repayment method', 'Degree verified',
                   'ID verified', 'Phone verified', 'Successful loans',
                   'First successful loan date', 'History',
                   'Successful repayments', 'Loans repaid on time',
                   'Overdue 0-15 days', 'Overdue over 15 days',
                   'Total amount borrowed', 'Outstanding repayments',
                   'Outstanding collections', 'Largest single loan',
                   'Highest historical debt']
        for col, title in enumerate(columns):
            sheet.write(0, col, title)
        while True:
            try:
                # get(False) raises Empty once the queue is drained
                result = self.resultQueue.get(False)
                sheet.write(i, 0, result['price'])
                sheet.write(i, 1, result['rate'])
                sheet.write(i, 2, result['date'])
                sheet.write(i, 3, result['sex'])
                sheet.write(i, 4, result['age'])
                sheet.write(i, 5, result['wenhua'])
                sheet.write(i, 6, result['func'])
                # '学历认证' is the site's label for degree verification; it is
                # listed first when present, so only the first item is compared
                if result['renzheng'].encode('utf-8') == '学历认证':
                    sheet.write(i, 7, 'Degree verified')
                else:
                    sheet.write(i, 7, " ")
                # ID and phone verification are recorded for every row
                sheet.write(i, 8, 'ID verified')
                sheet.write(i, 9, 'Phone verified')
                sheet.write(i, 10, result['times'])
                sheet.write(i, 11, result['first'])
                sheet.write(i, 12, result['history'])
                sheet.write(i, 13, result['succ'])
                sheet.write(i, 14, result['huanqing'])
                sheet.write(i, 15, result['yuqi1'])
                sheet.write(i, 16, result['yuqi2'])
                sheet.write(i, 17, result['leiji'])
                sheet.write(i, 18, result['daihuan'])
                sheet.write(i, 19, result['daishou'])
                sheet.write(i, 20, result['danbi'])
                sheet.write(i, 21, result['zuigao'].strip())
                i += 1
            except:
                break
        book.save('result1.xls')
def main(pn):
    global URL_EXIT, COLLECT_EXIT, PARSE_EXIT
    # Loading a proxy pool (via ip_pool / ips.csv) was tested but did not
    # help much, so it stays disabled:
    # proxy_pool = []
    # ip_pool.IPspider(10)
    # reader = csv.reader(open('ips.csv'))
    # for row in reader:
    #     proxy_pool.append(row[0] + ':' + row[1])

    # stage 1: listing-page URLs to crawl
    urlQueue = Queue()
    # detail-page links extracted from each listing
    pageQueue = Queue()
    # raw HTML of each detail page
    dataQueue = Queue()
    # parsed result dicts
    resultQueue = Queue()
    for i in range(1, int(pn) + 1):
        # PageIndex must walk 1..pn, hence str(i) rather than pn
        fullurl = "http://invest.ppdai.com/loan/listnew?LoanCategoryId=4&SortType=0&PageIndex=" + str(i) + "&MinAmount=0&MaxAmount=0"
        urlQueue.put(fullurl)
    uThread = []
    for threadname in range(1, 3):
        thread = urlCollect(urlQueue, pageQueue)
        thread.start()
        uThread.append(thread)
    cThread = []
    for threadname in range(1, 3):
        thread = pageCollect(pageQueue, dataQueue)
        thread.start()
        cThread.append(thread)
    pThread = []
    for threadname in range(1, 2):
        thread = contentParse(dataQueue, resultQueue)
        thread.start()
        pThread.append(thread)
    # Wait for each stage's queue to drain, then signal its workers to exit.
    while not urlQueue.empty():
        time.sleep(0.5)  # sleep instead of busy-spinning
    URL_EXIT = True
    for thread in uThread:
        thread.join()
    while not pageQueue.empty():
        time.sleep(0.5)
    COLLECT_EXIT = True
    for thread in cThread:
        thread.join()
    while not dataQueue.empty():
        time.sleep(0.5)
    PARSE_EXIT = True
    for thread in pThread:
        thread.join()
    print 'Exporting data...'
    parse = parseData(resultQueue)
    parse.parse()


if __name__ == "__main__":
    print '============================================='
    print '|  Output is saved to the current directory |'
    print '|  Data source: ppdai.com                   |'
    print '============================================='
    pn = raw_input('Number of pages to crawl: ')
    # start_time = time.time()
    main(pn)
    # end_time = time.time()
    # print '----ok----'
    # print 'Elapsed: %.2f s' % (end_time - start_time)
Written in Python 2. The multithreading here is mostly for show: this site bans your IP when requests come in too densely, so the thread count has to be kept low anyway (I also tested an IP proxy pool, and it did not work well).
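
Since the real constraint is the per-IP rate limit rather than CPU, a single-threaded fetcher with a fixed delay and exponential backoff on failures is arguably a better fit than tuning thread counts. A minimal sketch under that assumption (the delay and retry values below are guesses, not measured limits of the site):

import time
import urllib2

def polite_fetch(url, headers, base_delay=4, retries=3):
    # Wait before every attempt, doubling the pause after each failure
    # (4 s, 8 s, 16 s with the defaults above).
    for attempt in range(retries):
        time.sleep(base_delay * (2 ** attempt))
        try:
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=15).read()
        except urllib2.URLError:
            continue  # probably throttled or banned; back off and retry
    return None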