Chinaz网站排名爬取

在这里插入图片描述
在这里插入图片描述

#! /bin/python
import datetime
import json
import os
import re
import sys

import requests
from bs4 import BeautifulSoup

def excuteSpider(url, headers, session, timeout=30):
    """Download one ranking page and pull out its four per-site columns.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers forwarded to ``session.get``.
        session: Object exposing a requests-style ``get`` method
            (the caller passes a ``requests.Session``).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A 4-tuple ``(ranks, links, names, descriptions)`` of string lists,
        each in document order:
          * ranks        - text of every ``<strong class="col-red02">``
          * links        - text of every ``<span class="col-gray">`` that
                           contains no ``(``
          * names        - text of every ``<a class="pr10 fz14">``
          * descriptions - text of every ``<p class="RtCInfo">`` with the
                           leading ``label:`` prefix removed
    """
    req = session.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 rather than trusting the encoding guessed from the headers.
    req.encoding = "utf-8"
    bsObj = BeautifulSoup(req.text, 'html.parser')

    rankList = bsObj.find_all("strong", {"class": "col-red02"})
    linkList = bsObj.find_all("span", {"class": "col-gray"})
    nameList = bsObj.find_all("a", {"class": "pr10 fz14"})
    infoList = bsObj.find_all("p", {"class": "RtCInfo"})

    lranksub = [rank.get_text() for rank in rankList]

    # Spans whose text contains '(' are skipped.
    # NOTE(review): presumably those are auxiliary counters rather than
    # domains - confirm against the live page markup.
    llinksub = []
    for link in linkList:
        text = link.get_text()
        if '(' not in text:
            llinksub.append(text)

    lnamesub = [name.get_text() for name in nameList]

    lnfosub = []
    for info in infoList:
        text = info.get_text()
        # Cut only at the FIRST colon so descriptions that themselves contain
        # ':' (e.g. a URL) are kept whole; if there is no colon at all, keep
        # the full text instead of raising IndexError, which also keeps this
        # list aligned with the other three.
        _label, sep, rest = text.partition(':')
        lnfosub.append(rest if sep else text)

    return lranksub, llinksub, lnamesub, lnfosub
    
if __name__ == '__main__':
    # Script entry point: scrape the first N ranking pages (N = first CLI
    # argument) and dump the combined result as a timestamped JSON file under
    # ../db/chinaz-com/ relative to the script directory.

    # Validate the page count up front instead of crashing with a bare
    # IndexError/ValueError traceback.
    try:
        pages = int(sys.argv[1])
    except (IndexError, ValueError):
        sys.exit("usage: " + sys.argv[0] + " <number-of-pages>")

    lrank = []
    llink = []
    lname = []
    lnfo = []
    session = requests.Session()

    # NOTE(review): the Cookie below embeds what look like live session
    # identifiers captured from a browser; consider loading it from the
    # environment or a config file instead of committing it.
    # NOTE(review): "br" is advertised in Accept-Encoding; requests can only
    # decode Brotli when a brotli package is installed - confirm, or drop "br".
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type":"application/x-www-form-urlencoded",
        "Cookie": "Hm_lvt_e2d6533b8d3c86a8202250d4989a2fe5=1594263790; UM_distinctid=1733186e35dd07-066df823586632-7a1437-e1000-1733186e35ecbe; PHPSESSID=qohkl9cgabsv14f7jatrl1sl42; exi_query_history=IxGjvC2Bm3NLwvy4Ox7IfCz7gKGkEcjHRWkIemRtAGmDhK5e4qLILAxGJWCr-D8Kgah4qhzBsotZIJTYc5jTpq63nCfBRMk176e4Fdm3D6ymNGYy18XvnqDXYq0BVIka-GrurNBIsAbnF0IN67JA2rID3YWrxyjV-GNxQvBLh4VneY-L; exi_users=PTXl5X0WGo6-F9-FwneiCpyphXanNLejfpxEvRZEfDRVotQMletHig6hXEmBnKANQPXVDi3UM48PgVHwAhus-FKHQKfiE2rEjBpVuUFyD8w5nf-FjaIvI-ENMXXpBlyqINgTY6ljhwmRl5nxbslFoPdFz58NfJjrpLjoJhV-Exb1Ltnfrrk2J1WyYTdCeEbdperVl4pcB2uiQCSI8BSjeJ-EitboSzQ0-FqdUmc61z9aoYNkLlq7DIBsi33V5EIQDWHbLWqnqhS8FDF3es6r5Fl7SgUUmWQ-EXhF5AKPH6vMIXzmQQOZCYVM548Qz4W9qqGA5aptXzE0-E2R-Fv5RlXH8WXVUgjQdeRMkTo8FqwKTtj9EuRF2ZFFHnt-ExHu5bmIMPiYXcF-FX3V5vHIstJtub2-E-FIhuKTyKRHwgxjqmEVu1jtsv1HH4-O; CNZZDATA1276361993=1264455532-1594260046-%7C1594346791; Hm_lpvt_e2d6533b8d3c86a8202250d4989a2fe5=1594347558",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2",
    }

    for page in range(1, pages + 1):
        # The first page has no numeric suffix; later pages are index_br_<n>.
        if page == 1:
            url = "https://top.chinaz.com/all/index_br.html"
        else:
            url = "https://top.chinaz.com/all/index_br_" + str(page) + ".html"

        # Fetch and accumulate INSIDE the loop so every page is collected,
        # not just the last URL that was built.
        lranksub, llinksub, lnamesub, lnfosub = excuteSpider(url, headers, session)
        lrank += lranksub
        llink += llinksub
        lname += lnamesub
        lnfo += lnfosub

    # The four columns are scraped independently; if the markup changes they
    # can come back with different lengths. Warn and pair up only complete
    # rows rather than dying with IndexError halfway through.
    if not (len(lrank) == len(llink) == len(lname) == len(lnfo)):
        sys.stderr.write(
            "warning: column lengths differ (rank=%d, link=%d, name=%d, info=%d); "
            "truncating to the shortest\n"
            % (len(lrank), len(llink), len(lname), len(lnfo))
        )

    jsontext = {"version": 0, "domains": []}
    for rank, domain, title, description in zip(lrank, llink, lname, lnfo):
        jsontext["domains"].append(
            {"id": rank, "domain": domain, "name": title, "description": description}
        )

    # sys.path[0] is the directory containing the script being run.
    work_dir = os.path.join(sys.path[0], '..', 'db', 'chinaz-com')
    os.makedirs(work_dir, exist_ok=True)
    name = datetime.datetime.now().strftime('%Y%m%d%H%M%S.json')
    chinazfilepath = os.path.join(work_dir, name)

    # ensure_ascii=False emits raw non-ASCII text, so the file encoding must
    # be explicit; the platform default may be unable to encode it.
    with open(chinazfilepath, 'w', encoding='utf-8') as wf:
        wf.write(json.dumps(jsontext, indent=4, ensure_ascii=False))
    print('ok')
    
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值