#!/usr/bin/env python3
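"""Scrape the top.chinaz.com "index_br" site-ranking pages and dump the results
to a timestamped JSON file under ../db/chinaz-com/ (relative to this script).

The number of ranking pages to fetch is passed as the first command-line
argument, e.g. an argument of 5 fetches pages 1 through 5.
"""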
import requests
import json
import datetime
import re
import sys
from bs4 import BeautifulSoup
def excuteSpider(url, headers, session):
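    """Fetch one ranking page and return four parallel lists:
    ranks, domain strings, site names, and site descriptions.
    """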
    req = session.get(url, headers=headers)
    req.encoding = "utf-8"
    bsObj = BeautifulSoup(req.text, 'html.parser')
    # Rank, domain, name and description are exposed through these CSS classes.
    rankList = bsObj.find_all("strong", {"class": "col-red02"})
    linkList = bsObj.find_all("span", {"class": "col-gray"})
    nameList = bsObj.find_all("a", {"class": "pr10 fz14"})
    infoList = bsObj.find_all("p", {"class": "RtCInfo"})
    lranksub = []
    llinksub = []
    lnamesub = []
    lnfosub = []
    for rank in rankList:
        lranksub.append(rank.get_text())
    for link in linkList:
        # Skip "col-gray" spans whose text is parenthesised extra info rather than a domain.
        if link.get_text().find('(') == -1:
            llinksub.append(link.get_text())
    for name in nameList:
        lnamesub.append(name.get_text())
    for info in infoList:
        # Keep everything after the first colon so descriptions containing ':' are not cut short.
        s = info.get_text().split(':', 1)
        lnfosub.append(s[1])
    return lranksub, llinksub, lnamesub, lnfosub
if __name__ == '__main__':
    lrank = []
    llink = []
    lname = []
    lnfo = []
    # Reuse one Session so the TCP connection (and any cookies the site sets) persists across pages.
    session = requests.Session()
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
"Cookie": "Hm_lvt_e2d6533b8d3c86a8202250d4989a2fe5=1594263790; UM_distinctid=1733186e35dd07-066df823586632-7a1437-e1000-1733186e35ecbe; PHPSESSID=qohkl9cgabsv14f7jatrl1sl42; exi_query_history=IxGjvC2Bm3NLwvy4Ox7IfCz7gKGkEcjHRWkIemRtAGmDhK5e4qLILAxGJWCr-D8Kgah4qhzBsotZIJTYc5jTpq63nCfBRMk176e4Fdm3D6ymNGYy18XvnqDXYq0BVIka-GrurNBIsAbnF0IN67JA2rID3YWrxyjV-GNxQvBLh4VneY-L; exi_users=PTXl5X0WGo6-F9-FwneiCpyphXanNLejfpxEvRZEfDRVotQMletHig6hXEmBnKANQPXVDi3UM48PgVHwAhus-FKHQKfiE2rEjBpVuUFyD8w5nf-FjaIvI-ENMXXpBlyqINgTY6ljhwmRl5nxbslFoPdFz58NfJjrpLjoJhV-Exb1Ltnfrrk2J1WyYTdCeEbdperVl4pcB2uiQCSI8BSjeJ-EitboSzQ0-FqdUmc61z9aoYNkLlq7DIBsi33V5EIQDWHbLWqnqhS8FDF3es6r5Fl7SgUUmWQ-EXhF5AKPH6vMIXzmQQOZCYVM548Qz4W9qqGA5aptXzE0-E2R-Fv5RlXH8WXVUgjQdeRMkTo8FqwKTtj9EuRF2ZFFHnt-ExHu5bmIMPiYXcF-FX3V5vHIstJtub2-E-FIhuKTyKRHwgxjqmEVu1jtsv1HH4-O; CNZZDATA1276361993=1264455532-1594260046-%7C1594346791; Hm_lpvt_e2d6533b8d3c86a8202250d4989a2fe5=1594347558",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2"
}
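    # NOTE: the Cookie header above was captured from a past browser session;
    # it may be stale, so refresh or remove it if requests start failing.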
    # Number of ranking pages to scrape, passed as the first command-line argument.
    pages = int(sys.argv[1])
    for i in range(1, pages + 1):
        # Page 1 has no page suffix in its URL; later pages use index_br_<n>.html.
        if i == 1:
            url = "https://top.chinaz.com/all/index_br.html"
        else:
            url = "https://top.chinaz.com/all/index_br_" + str(i) + ".html"
        lranksub, llinksub, lnamesub, lnfosub = excuteSpider(url, headers, session)
        lrank += lranksub
        llink += llinksub
        lname += lnamesub
        lnfo += lnfosub
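    # Collected records are emitted as
    # {"version": 0, "domains": [{"id": ..., "domain": ..., "name": ..., "description": ...}, ...]}.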
    jsontext = {"version": 0, "domains": []}
    for i in range(len(lrank)):
        jsontext["domains"].append({"id": lrank[i], "domain": llink[i], "name": lname[i], "description": lnfo[i]})
    # Write the result to ../db/chinaz-com/ (relative to the script; the directory is assumed to exist),
    # using the current timestamp as the file name.
    cur_dir = sys.path[0]
    work_dir = cur_dir + '/../db/chinaz-com/'
    now = datetime.datetime.now()
    name = datetime.datetime.strftime(now, '%Y%m%d%H%M%S.json')
    chinazfilepath = work_dir + name
    with open(chinazfilepath, 'w', encoding='utf-8') as wf:
        wf.write(json.dumps(jsontext, indent=4, ensure_ascii=False))
    print('ok')