# -*- coding: utf-8 -*-
# Multithreaded phone-number scraper (Python 2): module-level setup.
# Fix: re/time/json/traceback are used further down in this file but were
# never imported; they are added here.
import threading
import Queue
import copy
import re
import time
import json
import traceback
import pickle
import logging
from warnings import filterwarnings

import requests
from requests.exceptions import (ConnectionError, ConnectTimeout, ReadTimeout, SSLError,
                                 ProxyError, RetryError, InvalidSchema)
from urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup as BS
from pypinyin import lazy_pinyin

lock = threading.Lock()
task_queue = Queue.Queue()    # urls waiting to be fetched by worker threads
write_queue = Queue.Queue()   # parsed results waiting to be written to disk

# Shared direct (non-proxied) session with a mobile WeChat user-agent.
s = requests.Session()
s.headers.update({'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36 MicroMessenger/6.5.12 NetType/4G'})
# Referer detail hidden; tested to be unnecessary:
#s.headers.update({'Referer':'https://servicewechat.com/xxxxxxxx'})
s.verify = False
s.mount('https://', requests.adapters.HTTPAdapter(pool_connections=1000, pool_maxsize=1000))

# Proxied copy of the session: all requests go through a local proxy on :3128.
sp = copy.deepcopy(s)
proxies = {'http': 'http://127.0.0.1:3128', 'https': 'https://127.0.0.1:3128'}
sp.proxies = proxies

# We set verify=False on purpose, so silence the TLS warning spam.
filterwarnings('ignore', category=InsecureRequestWarning)


def get_logger():
    """Return a DEBUG-level logger that writes thread-tagged lines to stderr."""
    logger = logging.getLogger("threading_example")
    logger.setLevel(logging.DEBUG)
    #fh = logging.FileHandler("d:/threading.log")
    fh = logging.StreamHandler()
    fmt = '%(asctime)s - %(threadName)-10s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(fmt)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger

logger = get_logger()

# NOTE: a wrong url still returns HTTP 200 with an empty payload, e.g.:
#   jsonp_queryMoreNums({"numRetailList":[],"code":"M1","uuid":"...","numArray":[]});
results = []


def get_nums():
    """Worker loop: fetch urls from task_queue via the proxied session and parse numbers.

    On success, the parsed record is appended to the shared ``results`` list
    (list.append is atomic in CPython) and pushed onto ``write_queue``.
    Transient network errors re-queue the url.  Runs forever; intended to be
    the target of a daemon thread.

    Fix vs. original: the generic ``except Exception`` handler referenced
    ``resp.status_code`` even when the request itself failed and ``resp`` was
    never bound, raising a NameError that killed the worker thread.
    """
    global results
    pattern = re.compile(r'({.*?})')  #, re.S | re.I | re.X)
    while True:
        resp = None  # sentinel: lets the generic handler know no response exists yet
        try:  # keep the try block as small as possible
            _url = task_queue.get()
            url = _url + str(int(time.time() * 1000))
            resp = sp.get(url, timeout=10)
        except (ConnectionError, ConnectTimeout, ReadTimeout, SSLError,
                ProxyError, RetryError, InvalidSchema) as err:
            # task_done() must come before the re-put, otherwise
            # task_queue.join() can never be released.
            task_queue.task_done()
            task_queue.put(_url)
        except Exception as err:
            status = resp.status_code if resp is not None else None
            logger.debug('\nstatus_code:{}\nurl:{}\nerr: {}\ntraceback: {}'.format(
                status, url, err, traceback.format_exc()))
            # task_done() before re-put, same reason as above.
            task_queue.task_done()
            task_queue.put(_url)
        else:
            try:
                # Response body is a JSONP wrapper; grab the inner {...} object.
                m = pattern.search(resp.content)
                match = m.group()
                rst = json.loads(match)
                nums = [num for num in rst['numArray'] if num > 10000]
                nums_len = len(nums)
                #assert nums_len == 10
                num = nums[-1]
                province_zh, city_zh, province_pinyin, city_pinyin = get_num_info(num)
                result = (str(num), province_zh, city_zh, province_pinyin, city_pinyin, _url)
                results.append(result)
                write_queue.put(result)
                logger.debug(u'results:{} threads: {} task_queue: {} {} {} {} {}'.format(
                    len(results), threading.activeCount(), task_queue.qsize(),
                    num, province_zh, city_zh, _url))
            except (ValueError, AttributeError, IndexError) as err:
                # No JSON match / empty numArray -- expected for bad pages.
                pass
            except Exception as err:
                logger.debug('\nstatus_code:{}\nurl:{}\ncontent:{}\nerr: {}\ntraceback: {}'.format(
                    resp.status_code, url, resp.content, err, traceback.format_exc()))
            finally:
                task_queue.task_done()
defget_num_info(num):try:
url= 'http://www.ip138.com:8080/search.asp?action=mobile&mobile=%s' %num
resp=s.get(url)
soup= BS(resp.content, 'lxml')#pro, cit = re.findall(r'
(.*?)rst = soup.select('tr td.tdc2')[1].text.split()if len(rst) == 2:
province_zh, city_zh=rstelse:
province_zh= city_zh =rst[0]
province_pinyin= ''.join(lazy_pinyin(province_zh))
city_pinyin= ''.join(lazy_pinyin(city_zh))exceptException as err:printerr,traceback.format_exc()
province_zh= city_zh = province_pinyin = city_pinyin = 'xxx'
returnprovince_zh, city_zh, province_pinyin, city_pinyindefwrite_result():
with open('10010temp.txt','w',0) as f: #'w' open时会截去之前内容,所以放在 while True 之上
whileTrue:try:
rst= ' '.join(write_queue.get()) + '\n'f.write(rst.encode('utf-8'))
write_queue.task_done()exceptException as err:printerr,traceback.format_exc()if __name__ == '__main__':
province_groupkey_list=[
('18', '15237219'),
('51', '21236872'),
('31', '34236498'),
('87', '43236612'),
('10', '8400250331'),
('89', '90242110'),
('83', '99250240'),
('19', '59237227'),
('36', '60236866'),
('97', '49236584'),
('79', '13238152'),
('34', '33236916'),
('71', '40236873'),
('88', '9100283297'),
('50', '27237168'),
('59', '6800258755'),
('74', '71237034'),
('11', '85236889'),
('84', '13236970'),
('76', '85236973'),
('13', '36236594'),
('85', '53237275'),
('86', '79237759'),
('90', '19236614'),
('30', '2400265649'),
('38', '12236361'),
('17', '17236695'),
('70', '4900281779'),
('75', '67237076'),
('91', '19236028'),
('81', '20236750')]#province_groupkey_list = [('51', '21236872')]
importitertoolsfor (provinceCode, groupKey) inprovince_groupkey_list:#for cityCode in range(1000):
for cityCode in [''.join(i) for i in itertools.product('0123456789',repeat=3)]:
fmt= 'https://m.1xxxx.com/xxxxx&provinceCode={provinceCode}&cityCode={cityCode}&xxxxx&groupKey={groupKey}&xxxxx' # url 细节已被隐藏url= fmt.format(provinceCode=provinceCode, cityCode=cityCode, groupKey=groupKey)#, now=int(float(time.time())*1000))
task_queue.put(url)
threads=[]for i in range(300):
t= threading.Thread(target=get_nums) #args接收元组,至少(a,)
threads.append(t)
t_write_result= threading.Thread(target=write_result)
threads.append(t_write_result)#for t in threads:
#t.setDaemon(True)
#t.start()
#while True:
#pass
for t inthreads:
t.setDaemon(True)
t.start()#for t in threads:
#t.join()
task_queue.join()print 'task done'write_queue.join()print 'write done'with open('10010temp','w') as f:
pickle.dump(results, f)print 'all done'
#while True:
#pass