import requests
from lxml.etree import HTML
from redis import StrictRedis
# Shared Redis client for the whole script; getTotal() writes the generated
# page URLs through it. decode_responses=True asks redis-py to hand replies
# back as str instead of bytes.
r = StrictRedis(host='localhost', decode_responses=True, port=6379)
def getrst(url: str) -> str:
    """Download *url* and return the response body as text.

    The request is sent with a fixed cookie and user-agent header and a
    10 second timeout. The body is decoded with the encoding that requests
    detects from the content (``apparent_encoding``) rather than the one
    declared by the server.

    Errors raised by ``requests.get`` (timeouts, connection failures, ...)
    are not handled here; callers are expected to catch them.
    """
    # NOTE(review): the cookie looks like a one-off capture from a browser
    # session and is presumably stale by now -- consider loading it from
    # configuration instead of hard-coding it.
    headers = {
        'cookie': 'f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; userid360_xml=B8955B52EAC6F483BB1F2450101DE23E; time_create=1559436741241; xxzl_deviceid=uT86T8Rgjf%2B9T35MDHrFe3YcMerBrOdob9tYtDOdJBJnNldtBlmZh5s%2F7lbOKVcP; id58=c5/njVzETqN7W6kOCe9UAg==; 58tj_uuid=921753d1-e5be-4089-afe6-b5afc50bc534; als=0; wmda_uuid=20c219fb0038da645ebdfa5921f3011a; wmda_new_uuid=1; show_zcm_banner=true; wmda_visited_projects=%3B2385390625025%3B1731916484865%3B6333604277682%3B6289197098934; __xsptplus8=8.1.1556528603.1556528603.1%234%7C%7C%7C%7C%7C%23%234_cLV4eO4YDTDH9iT7EH2iOYIt1Llxn2%23; ppStore_fingerprint=82E1F0D9ECF41F343CB3C483409B26EB12F99E8C78E56A8E%EF%BC%BF1556585062184; city=bj; 58home=bj; f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; commontopbar_ipcity=bj%7C%E5%8C%97%E4%BA%AC%7C0; new_uv=15; utm_source=; spm=; init_refer=https%253A%252F%252Fwww.baidu.com%252Flink%253Furl%253DsSv7ucYsW6w55S3XFiHnEclPK-e8ui0q-DdGN8lhUmi%2526wd%253D%2526eqid%253Dcc47cfeb0012336e000000035ccb905e; new_session=0; wmda_session_id_2385390625025=1556844673131-7857751b-910e-c106; xzfzqtoken=TvjnNJky4gEHEz0nNDxNmVX8TGioQPao2736MtxB2OFwquS44svZbxJlmog7ogPxin35brBb%2F%2FeSODvMgkQULA%3D%3D',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=10)
    # Trust the content-based guess over the declared charset before decoding.
    response.encoding = response.apparent_encoding
    return response.text
def getUrl() -> str:  # all districts
    """Fetch the fixed Chaoyang rental listing page used as the crawl's entry point.

    Returns the page text, or an empty string when the request fails, so the
    caller can tell "no page" apart from a page with content.
    """
    url = 'https://bj.58.com/chaoyang/chuzu/f1/?PGTID=0d3090a7-0047-63b1-3bbf-e307e62ae2ea&ClickID=2'
    try:
        return getrst(url)
    except requests.RequestException as err:
        # Report the failure the same way the other crawl steps do instead of
        # swallowing it silently; the empty-string contract is unchanged.
        print(err.args)
        return ''
def getQuyu(rst):
    """Extract the district slugs from the entry page's HTML text.

    Reads the ``href`` of every link in the district filter row, drops the
    first one, and keeps the fourth ``/``-separated piece of each remaining
    href (for an absolute ``https://host/<slug>/...`` link that is the slug).

    Args:
        rst: HTML text of the entry page; may be empty when the download failed.

    Returns:
        A list of slug strings; empty when *rst* is empty.
    """
    # getUrl() hands back '' on failure -- there is nothing to parse then.
    if not rst:
        return []
    html = HTML(rst)
    # The `[.]` predicate is always true in XPath, so `a[.]` matches every <a>.
    hrefs = html.xpath('/html/body/div[4]/div/div[3]/dl[1]/dd/a[.]/@href')[1:]  # districts
    return [href.split('/')[3] for href in hrefs]
def getDidian(quyu):  # all locations
    """Collect the location links listed on each district's rental page.

    For every slug in *quyu* the district listing page is downloaded and the
    ``href`` values of its location links are gathered. A district whose page
    cannot be fetched or parsed is reported with ``print`` and skipped.

    Args:
        quyu: iterable of district slugs, as produced by ``getQuyu``.

    Returns:
        A list with one entry per successfully processed district; each entry
        is the list of hrefs found on that district's page.
    """
    didian = []
    for slug in quyu:
        url = 'https://bj.58.com/{}/chuzu/?PGTID=0d3090a7-0000-13c9-d250-69b273834f20&ClickID=2'.format(slug)
        try:
            print(url)
            html = HTML(getrst(url))
            # every location link of the district
            didian.append(html.xpath('/html/body/div[4]/div/div[3]/dl[1]/dd/div/a[.]/@href'))
        except Exception as err:
            # Best effort: one bad district must not stop the others.
            print(err.args)
    return didian
def _page_count(html):
    """Return the number of result pages for a parsed listing page, or None.

    None means the page shows neither pager labels nor any listing titles.
    """
    # `//*[@id=...]`: a location step needs a node test -- `//[@id=...]` is not
    # a valid XPath expression and makes lxml raise instead of matching.
    labels = html.xpath('//*[@id="bottom_ad_li"]/div[2]/a[.]/span/text()')  # total pages
    if labels:
        # Presumably the last label is the "next" link, so the page total sits
        # second from the end; with a single label that label itself is used.
        return int(labels[max(len(labels) - 2, 0)])
    titles = html.xpath('/html/body/div[5]/div/div[5]/div[2]/ul/li[.]/div[2]/h2/a/text()')
    if not titles:
        return None
    # Listings but no pager links: read the count from the <strong> element.
    return int(html.xpath('//*[@id="bottom_ad_li"]/div[2]/strong/text()')[0])


def getTotal(didian):  # total number of result pages
    """Store every paginated listing URL of every location in Redis.

    For each location URL the listing page is downloaded, its page count is
    determined, and ``<url>pn<k>`` for k = 1..count is written as a field of
    the Redis hash ``58url1`` (value 1) through the module-level client ``r``.

    A failure on one URL (download, parsing, missing element, Redis) is
    printed and that URL is skipped; the remaining URLs are still processed.

    Args:
        didian: list of lists of location URLs, as produced by ``getDidian``.
    """
    for hrefs in didian:
        for url in hrefs:
            try:
                print(url)
                total = _page_count(HTML(getrst(url)))
                if total is None:
                    continue
                for page in range(1, total + 1):
                    page_url = url + "pn" + str(page)
                    r.hset('58url1', page_url, 1)
                    print(page_url)
            except Exception as e:
                print(e.args)
def main():
    """Run the crawl: entry page -> districts -> locations -> page URLs in Redis."""
    rst = getUrl()
    quyu = getQuyu(rst)
    didian = getDidian(quyu)
    getTotal(didian)
# Start the crawl.
# NOTE(review): this call is unguarded, so the crawl also starts when the
# module is imported; consider an `if __name__ == "__main__":` guard if the
# file is ever used as a library.
main()