多线程爬取腾讯招聘信息，并存放在 MongoDB 中
# User-agent string identifying the client as desktop Chrome 87 on Windows 10.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/87.0.4280.66 Safari/537.36'
)

# HTTP headers passed as the ``headers`` argument of the API request.
header = {'user-agent': _USER_AGENT}
def deal_params(p_q, first_page=1, last_page=50, page_size=10):
    """Enqueue one query-parameter dict per result page onto *p_q*.

    With the default arguments this produces pages 1 through 50 with a
    page size of 10, one dict per page.

    Args:
        p_q: Queue-like object with a ``put`` method that receives the
            parameter dicts.
        first_page: Index of the first page to enqueue (inclusive).
        last_page: Index of the last page to enqueue (inclusive).
        page_size: Number of posts requested per page.

    Note:
        ``p_q.put`` is called without a timeout, so on a bounded queue this
        call blocks if more pages are requested than the queue can hold and
        nothing is consuming it yet.
    """
    for page in range(first_page, last_page + 1):
        p_q.put({
            # Millisecond timestamp taken at the moment the dict is built.
            'timestamp': str(round(time.time() * 1000)),
            # All filters are left empty, i.e. no filtering is requested.
            'countryId': '',
            'cityId': '',
            'bgIds': '',
            'productId': '',
            'categoryId': '',
            'parentCategoryId': '',
            'attrId': '',
            'keyword': '',
            'pageIndex': str(page),
            'pageSize': str(page_size),
            'language': 'zh-cn',
            'area': 'cn',
        })
# MongoDB handles. NOTE: despite its name, ``cursor`` is the MongoClient
# connected to the local server, not a query cursor.
cursor = pymongo.MongoClient(port=27017, host='127.0.0.1')
db = cursor.get_database('class')
col = db.get_collection('student')

# Completion flag read by save_message(): 0 while fetching is in progress;
# the main block sets it to 1 after all fetcher threads have been joined.
switch = 0
def get_mess(q, p_q):
    """Fetch job posts for every queued page and push them onto *q*.

    Repeatedly takes one query-parameter dict from *p_q*, requests the
    corresponding page from the Tencent careers API, and enqueues a
    ``{'title': ..., 'target': ...}`` dict on *q* for each post returned.
    Returns once *p_q* has no more items.

    Args:
        q: Output queue receiving one dict per job post.
        p_q: Input queue of query-parameter dicts, one per page.

    Raises:
        requests.RequestException: If the HTTP request fails or exceeds
            the 10-second timeout.
        ValueError: If the response body is not valid JSON.
    """
    # Local import keeps this function self-contained.
    from queue import Empty

    url = 'https://careers.tencent.com/tencentcareer/api/post/Query'
    while True:
        # Take the next page atomically. Checking empty() and then calling a
        # blocking get() lets another worker grab the last item in between,
        # leaving this thread blocked forever.
        try:
            params = p_q.get_nowait()
        except Empty:
            break

        # A timeout prevents a stalled connection from hanging the worker.
        response = requests.get(
            url=url, headers=header, params=params, timeout=10
        ).json()

        # Treat a missing or null Data/Posts field as "no posts on this page".
        posts = (response.get('Data') or {}).get('Posts') or []
        for post in posts:
            q.put({
                'title': post['RecruitPostName'],
                'target': post['Responsibility'],
            })
def save_message(q):
    """Drain *q* and insert each item into the ``col`` MongoDB collection.

    The loop ends when either:
      * the queue is empty and the module-level ``switch`` flag equals 1, or
      * no item arrives within 10 seconds.

    Args:
        q: Queue of documents (dicts) to store, one document per item.

    Note:
        Database errors raised by the insert are no longer swallowed; they
        propagate and end this worker with a visible traceback.
    """
    # Local import keeps this function self-contained.
    from queue import Empty

    while True:
        if q.empty() and switch == 1:
            break
        try:
            document = q.get(timeout=10)
        except Empty:
            # Nothing arrived within the timeout; stop waiting.
            break
        # insert_one replaces Collection.insert, which is deprecated and was
        # removed in PyMongo 4.
        col.insert_one(document)
if __name__ == '__main__':
    # Bounded queues: q carries scraped posts, p_q carries per-page params.
    q = Queue(maxsize=500)
    p_q = Queue(maxsize=100)

    # Fill the page queue before any worker starts.
    deal_params(p_q)

    # Launch ten fetcher/saver pairs; only the fetchers are kept for joining.
    fetchers = []
    for _ in range(10):
        fetcher = threading.Thread(target=get_mess, args=(q, p_q))
        saver = threading.Thread(target=save_message, args=(q,))
        fetcher.start()
        saver.start()
        fetchers.append(fetcher)

    # Wait until every fetcher has finished.
    for fetcher in fetchers:
        fetcher.join()

    # Tell the savers that no further posts will be produced.
    switch = 1