翻到一个两年前写的爬虫小框架
# coding=utf-8
import tushare as ts
import pandas as pd
import requests
import json
import re
import time
from retrying import retry
from concurrent.futures import ThreadPoolExecutor
import random
def get_pro():
    """Return the static candidate HTTP proxy pool as 'host:port' strings."""
    # Renamed the local from `list`, which shadowed the builtin of the same name.
    proxies = ['122.114.31.177:808', '61.135.217.7:80', '113.121.243.109:808', '171.39.40.5:8123',
               '121.31.199.30:8123', '111.155.116.240:8123', '125.121.121.171:808',
               '115.213.178.192:808']
    return proxies
start = time.perf_counter()  # timing start; time.clock() was removed in Python 3.8
urlnum = range(8)
# Pending task ids. Must be a real (mutable) list: crawl() calls .remove() on it,
# which a Python 3 range object does not support.
listdo = list(urlnum)

# Keep launching passes over the remaining ids until every one has succeeded.
while True:
    listye = []  # ids fetched successfully in this pass
    listno = []  # ids that failed in this pass (they stay in listdo and are retried)
    event = []   # raw response bodies collected by multithreading()

    @retry(stop_max_attempt_number=8)  # cap retries at 8 attempts per call
    def crawl(n):
        """Fetch httpbin.org/ip through a random proxy and record task *n*'s outcome.

        On success appends n to listye, removes it from listdo and returns the
        response text; on a network error appends n to listno and returns None.
        """
        pro_list = get_pro()
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
        # random.choice replaces randint(0, len(pro_list)): randint's upper bound
        # is inclusive, so the old code raised IndexError with probability 1/9.
        proxies_l = {'http': random.choice(pro_list),
                     }
        print(proxies_l['http'])
        try:
            # A timeout keeps a dead proxy from hanging the worker thread forever.
            req = requests.get('http://httpbin.org/ip', headers=header,
                               proxies=proxies_l, timeout=10)
            print('finish')
            listye.append(n)
            listdo.remove(n)
            print(listdo)
            return req.text
        except requests.RequestException:
            # Narrowed from a bare except: only network/proxy failures mean
            # "bad proxy"; KeyboardInterrupt and programming errors now propagate.
            print('no proxies')
            listno.append(n)

    # multithreading
    def multithreading():
        """Fan crawl() out over the pending ids with a 10-worker thread pool."""
        # Snapshot listdo: crawl() mutates it concurrently, and iterating a list
        # while removing from it skips elements.
        number = list(listdo)
        with ThreadPoolExecutor(max_workers=10) as executor:
            for result in executor.map(crawl, number, chunksize=10):
                event.append(result)
        return event

    event = multithreading()
    print('listye')
    print(listye)
    print('listno')
    print(listno)
    print('listdo')
    print(listdo)
    if len(listdo) == 0:
        break

end = time.perf_counter()  # timing end
print("爬取完成 用时:")
print(end - start)