python多线程queue_python之多线程 queue 实践 筛选有效url

import json
import re
import threading
import time
import traceback

# Module-level lock; nothing in the visible code acquires it.
lock = threading.Lock()

try:
    import Queue  # Python 2 module name
except ImportError:  # Python 3 renamed the module to ``queue``
    import queue as Queue

# URLs waiting to be probed by the get_nums() workers.
task_queue = Queue.Queue()
# Result tuples waiting to be written to disk by write_result().
write_queue = Queue.Queue()

import requests
from requests.exceptions import (ConnectionError, ConnectTimeout, ReadTimeout,
                                 SSLError, ProxyError, RetryError,
                                 InvalidSchema)

# Direct (non-proxied) session; get_num_info() uses it.
s = requests.Session()
s.headers.update({'user-agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36 MicroMessenger/6.5.12 NetType/4G'})
# Referer detail hidden; in practice the requests work without it.
# s.headers.update({'Referer':'https://servicewechat.com/xxxxxxxx'})
s.verify = False  # skip TLS certificate verification
s.mount('https://', requests.adapters.HTTPAdapter(pool_connections=1000, pool_maxsize=1000))

import copy

# Copy of the session above that additionally routes through a local proxy;
# get_nums() uses it.
sp = copy.deepcopy(s)
# NOTE(review): the 'https' entry uses an https:// proxy URL; confirm the
# proxy on port 3128 really speaks TLS, otherwise it should be http://.
proxies = {'http': 'http://127.0.0.1:3128', 'https': 'https://127.0.0.1:3128'}
sp.proxies = proxies

from urllib3.exceptions import InsecureRequestWarning
from warnings import filterwarnings

# The sessions are created with verify=False, so silence the resulting
# InsecureRequestWarning noise.
filterwarnings('ignore', category=InsecureRequestWarning)

from bs4 import BeautifulSoup as BS
from pypinyin import lazy_pinyin
import pickle
import logging


def get_logger():
    """Return the ``threading_example`` logger, configured for console output.

    The logger is set to DEBUG and writes through a ``StreamHandler`` whose
    format includes the timestamp, thread name, level and message.  The
    handler is attached only on the first call, so calling this function
    again does not duplicate every log line.
    """
    logger = logging.getLogger("threading_example")
    logger.setLevel(logging.DEBUG)
    if not logger.handlers:
        # To log to a file instead: logging.FileHandler("d:/threading.log")
        handler = logging.StreamHandler()
        fmt = '%(asctime)s - %(threadName)-10s - %(levelname)s - %(message)s'
        handler.setFormatter(logging.Formatter(fmt))
        logger.addHandler(handler)
    return logger

logger = get_logger()

# Even for a URL that does not match anything, the service answers normally,
# just with empty lists, e.g.:
#   In [88]: r.text
#   Out[88]: u'jsonp_queryMoreNums({"numRetailList":[],"code":"M1","uuid":"a95ca4c6-957e-462a-80cd-0412bd5672df","numArray":[]});'

# Every hit found so far, as
# (num, province_zh, city_zh, province_pinyin, city_pinyin, url) tuples.
results = []


def _requeue(url):
    """Put *url* back on ``task_queue`` and retire the current attempt.

    The put must happen before task_done(): doing it the other way round can
    momentarily drop the queue's unfinished-task count to zero, which lets
    ``task_queue.join()`` return while this URL still needs to be retried.
    """
    task_queue.put(url)
    task_queue.task_done()


def get_nums():
    """Worker loop: probe URLs from ``task_queue`` and record the valid ones.

    For each URL the current time in milliseconds is appended and the result
    is fetched through the proxied session ``sp``.  The first ``{...}`` group
    of the response is parsed as JSON; if its ``numArray`` contains a value
    above 10000, the last such value is looked up with ``get_num_info`` and
    the resulting tuple is appended to ``results`` and put on ``write_queue``.
    Responses without such a value are skipped silently.

    Fetch failures put the URL back on the queue for another attempt.
    NOTE(review): retries are unbounded, so a URL that fails permanently
    (e.g. ``InvalidSchema``) keeps ``task_queue.join()`` from ever returning.

    The loop never returns; the caller runs it in daemon threads.
    """
    # Bytes pattern so it matches resp.content on Python 3 as well
    # (on Python 2 a bytes literal is a plain str).
    pattern = re.compile(br'({.*?})')
    while True:
        _url = task_queue.get()
        url = _url + str(int(time.time() * 1000))

        # Keep the try block as small as possible: only the request itself.
        try:
            resp = sp.get(url, timeout=10)
        except (ConnectionError, ConnectTimeout, ReadTimeout, SSLError,
                ProxyError, RetryError, InvalidSchema):
            _requeue(_url)
            continue
        except Exception as err:
            # No response object exists here, so only the URL and error are
            # logged.
            logger.debug('\nurl:{}\nerr: {}\ntraceback: {}'.format(
                url, err, traceback.format_exc()))
            _requeue(_url)
            continue

        try:
            m = pattern.search(resp.content)
            rst = json.loads(m.group())
            nums = [num for num in rst['numArray'] if num > 10000]
            num = nums[-1]
            province_zh, city_zh, province_pinyin, city_pinyin = get_num_info(num)
            result = (str(num), province_zh, city_zh, province_pinyin, city_pinyin, _url)
            results.append(result)
            write_queue.put(result)
            logger.debug(u'results:{} threads: {} task_queue: {} {} {} {} {}'.format(
                len(results), threading.active_count(), task_queue.qsize(),
                num, province_zh, city_zh, _url))
        except (ValueError, AttributeError, IndexError):
            # No JSON object in the body, unparsable JSON, or no qualifying
            # number: this URL is simply not a hit.
            pass
        except Exception as err:
            logger.debug('\nstatus_code:{}\nurl:{}\ncontent:{}\nerr: {}\ntraceback: {}'.format(
                resp.status_code, url, resp.content, err, traceback.format_exc()))
        finally:
            task_queue.task_done()

def get_num_info(num):
    """Look up the province and city for *num* on ip138.com.

    Args:
        num: value substituted into the ``mobile=`` query parameter.

    Returns:
        ``(province_zh, city_zh, province_pinyin, city_pinyin)``.  When the
        result cell holds a single word it is used for both province and
        city.  On any failure all four values are ``'xxx'``; the exception is
        logged, never raised.
    """
    try:
        url = 'http://www.ip138.com:8080/search.asp?action=mobile&mobile=%s' % num
        # Bounded wait: without a timeout a stalled lookup would block the
        # calling worker forever.
        resp = s.get(url, timeout=10)
        soup = BS(resp.content, 'lxml')
        rst = soup.select('tr td.tdc2')[1].text.split()
        if len(rst) == 2:
            province_zh, city_zh = rst
        else:
            province_zh = city_zh = rst[0]
        province_pinyin = ''.join(lazy_pinyin(province_zh))
        city_pinyin = ''.join(lazy_pinyin(city_zh))
    except Exception:
        logger.exception('get_num_info failed for %s', num)
        province_zh = city_zh = province_pinyin = city_pinyin = 'xxx'
    return province_zh, city_zh, province_pinyin, city_pinyin


def write_result():
    """Writer loop: append each tuple from ``write_queue`` to ``10010temp.txt``.

    Each tuple becomes one space-separated, UTF-8 encoded line.  The file is
    flushed after every line and only then is the item marked done, so once
    ``write_queue.join()`` returns every line has been handed to the OS.
    ``task_done()`` is called even when a write fails; otherwise
    ``write_queue.join()`` would block forever.

    The loop never returns; the caller runs it in a daemon thread.
    """
    import io

    # 'w' truncates the previous content on open, hence it sits above the loop.
    with io.open('10010temp.txt', 'w', encoding='utf-8') as f:
        while True:
            row = write_queue.get()
            try:
                f.write(u' '.join(row) + u'\n')
                f.flush()
            except Exception:
                logger.exception('failed to write result %r', row)
            finally:
                write_queue.task_done()


if __name__ == '__main__':
    # (provinceCode, groupKey) pairs to enumerate.
    province_groupkey_list = [
        ('18', '15237219'),
        ('51', '21236872'),
        ('31', '34236498'),
        ('87', '43236612'),
        ('10', '8400250331'),
        ('89', '90242110'),
        ('83', '99250240'),
        ('19', '59237227'),
        ('36', '60236866'),
        ('97', '49236584'),
        ('79', '13238152'),
        ('34', '33236916'),
        ('71', '40236873'),
        ('88', '9100283297'),
        ('50', '27237168'),
        ('59', '6800258755'),
        ('74', '71237034'),
        ('11', '85236889'),
        ('84', '13236970'),
        ('76', '85236973'),
        ('13', '36236594'),
        ('85', '53237275'),
        ('86', '79237759'),
        ('90', '19236614'),
        ('30', '2400265649'),
        ('38', '12236361'),
        ('17', '17236695'),
        ('70', '4900281779'),
        ('75', '67237076'),
        ('91', '19236028'),
        ('81', '20236750'),
    ]
    # For a quick trial run, restrict the list to a single province:
    # province_groupkey_list = [('51', '21236872')]

    import itertools

    # URL details have been redacted.
    fmt = 'https://m.1xxxx.com/xxxxx&provinceCode={provinceCode}&cityCode={cityCode}&xxxxx&groupKey={groupKey}&xxxxx'

    # Queue one URL per province and per three-digit city code ('000'..'999').
    for provinceCode, groupKey in province_groupkey_list:
        for digits in itertools.product('0123456789', repeat=3):
            cityCode = ''.join(digits)
            url = fmt.format(provinceCode=provinceCode, cityCode=cityCode, groupKey=groupKey)
            task_queue.put(url)

    # 300 fetch workers plus one writer.
    threads = [threading.Thread(target=get_nums) for _ in range(300)]
    t_write_result = threading.Thread(target=write_result)
    threads.append(t_write_result)

    # The worker loops never return, so the threads are daemons and the main
    # thread waits on the queues instead of joining the threads.
    for t in threads:
        t.daemon = True
        t.start()

    task_queue.join()
    print('task done')
    write_queue.join()
    print('write done')

    # Pickle data is binary, so the file must be opened in binary mode.
    with open('10010temp', 'wb') as f:
        pickle.dump(results, f)
    print('all done')

#while True:

#pass

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值