python函数框架 爬数据(58同城租房)

import requests
from lxml.etree import HTML
from redis import StrictRedis
# Shared Redis client for the whole script; getTotal() writes the discovered
# page URLs through it. Connects to a Redis server on localhost:6379 and asks
# the client to decode replies to str (decode_responses=True).
r = StrictRedis(host='localhost', decode_responses=True, port=6379)

def getrst(url):
    """Fetch *url* with browser-like headers and return the body as text.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The response body decoded with the encoding detected from its content.

    Raises:
        requests.RequestException: On connection failure or when the request
            takes longer than the 10 second timeout.
    """
    # NOTE(review): the cookie below is a captured browser session that is
    # hard-coded into the script; it will expire and should not be treated as
    # a secret-free constant. Kept verbatim so requests look unchanged.
    headers = {
        'cookie': 'f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; userid360_xml=B8955B52EAC6F483BB1F2450101DE23E; time_create=1559436741241; xxzl_deviceid=uT86T8Rgjf%2B9T35MDHrFe3YcMerBrOdob9tYtDOdJBJnNldtBlmZh5s%2F7lbOKVcP; id58=c5/njVzETqN7W6kOCe9UAg==; 58tj_uuid=921753d1-e5be-4089-afe6-b5afc50bc534; als=0; wmda_uuid=20c219fb0038da645ebdfa5921f3011a; wmda_new_uuid=1; show_zcm_banner=true; wmda_visited_projects=%3B2385390625025%3B1731916484865%3B6333604277682%3B6289197098934; __xsptplus8=8.1.1556528603.1556528603.1%234%7C%7C%7C%7C%7C%23%234_cLV4eO4YDTDH9iT7EH2iOYIt1Llxn2%23; ppStore_fingerprint=82E1F0D9ECF41F343CB3C483409B26EB12F99E8C78E56A8E%EF%BC%BF1556585062184; city=bj; 58home=bj; f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; commontopbar_ipcity=bj%7C%E5%8C%97%E4%BA%AC%7C0; new_uv=15; utm_source=; spm=; init_refer=https%253A%252F%252Fwww.baidu.com%252Flink%253Furl%253DsSv7ucYsW6w55S3XFiHnEclPK-e8ui0q-DdGN8lhUmi%2526wd%253D%2526eqid%253Dcc47cfeb0012336e000000035ccb905e; new_session=0; wmda_session_id_2385390625025=1556844673131-7857751b-910e-c106; xzfzqtoken=TvjnNJky4gEHEz0nNDxNmVX8TGioQPao2736MtxB2OFwquS44svZbxJlmog7ogPxin35brBb%2F%2FeSODvMgkQULA%3D%3D',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=10)
    # Use the encoding guessed from the body instead of the one declared in
    # the HTTP headers before decoding the text.
    response.encoding = response.apparent_encoding
    return response.text
def getUrl():  # all districts
    """Download the Beijing/Chaoyang rental listing page used as the index.

    Returns:
        The page HTML as text, or an empty string when the download fails.
    """
    url = 'https://bj.58.com/chaoyang/chuzu/f1/?PGTID=0d3090a7-0047-63b1-3bbf-e307e62ae2ea&ClickID=2'
    try:
        return getrst(url)
    except Exception as err:
        # Best-effort fetch: report the failure and hand back an empty page.
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still work.
        print(err.args)
        return ''

def getQuyu(rst):
    """Extract the district path segments from the index page HTML.

    Args:
        rst: HTML text of the index page; may be empty.

    Returns:
        A list with the fourth '/'-separated piece of every district link
        (for an absolute URL that is the first path segment after the host).
        Empty when *rst* is empty.
    """
    if not rst:
        # Nothing to parse, e.g. the index download failed.
        return []
    html = HTML(rst)
    # The first link is skipped -- presumably the "all districts" entry;
    # TODO confirm against the live page.
    hrefs = html.xpath('/html/body/div[4]/div/div[3]/dl[1]/dd/a[.]/@href')[1:]  # districts
    quyu = []
    for href in hrefs:
        parts = href.split('/')
        # Ignore links too short to contain a district segment instead of
        # raising IndexError.
        if len(parts) > 3:
            quyu.append(parts[3])
    return quyu

def getDidian(quyu):  # all locations
    """Collect the sub-location links of every district.

    Args:
        quyu: Iterable of district path segments as produced by getQuyu().

    Returns:
        A list with one entry per successfully fetched district; each entry is
        the list of href values found in that district's location filter.
        Districts whose page cannot be fetched or parsed are reported and
        skipped.
    """
    didian = []
    for area in quyu:
        url = 'https://bj.58.com/{}/chuzu/?PGTID=0d3090a7-0000-13c9-d250-69b273834f20&ClickID=2'.format(area)
        try:
            print(url)
            rst = getrst(url)
            html = HTML(rst)
            lsd = html.xpath('/html/body/div[4]/div/div[3]/dl[1]/dd/div/a[.]/@href')  # all location links
            didian.append(lsd)
        except Exception as err:
            # A failure for one district must not stop the others.
            print(err.args)
    return didian

def _page_count(html):
    """Return the number of result pages of a parsed listing page, or None.

    None means neither pager links nor listing titles were found, i.e. there
    is nothing to enqueue for this page.
    """
    lsz = html.xpath('//*[@id="bottom_ad_li"]/div[2]/a[.]/span/text()')  # pager link labels
    if lsz:
        # Second-to-last pager label -- presumably the last page number, with
        # the final label being the "next page" link; TODO confirm. With a
        # single label the index wraps around to that label.
        return int(lsz[len(lsz) - 2])
    title = html.xpath('/html/body/div[5]/div/div[5]/div[2]/ul/li[.]/div[2]/h2/a/text()')
    if len(title) < 1:
        return None
    # Listings exist but there are no pager links: read the <strong> label.
    return int(html.xpath('//*[@id="bottom_ad_li"]/div[2]/strong/text()')[0])


def getTotal(didian):  # total number of result pages
    """Enqueue every paginated listing URL into the Redis hash '58url1'.

    Args:
        didian: List of lists of location URLs as returned by getDidian().

    For each location URL the page count is determined and
    ``url + "pn" + page_number`` is stored as a hash field with value 1 for
    every page from 1 to the page count. Errors are reported per URL so one
    bad page does not abort the rest of the crawl.
    """
    for hrefs in didian:
        for url in hrefs:
            try:
                print(url)
                rst = getrst(url)
                total = _page_count(HTML(rst))
                if total is None:
                    continue
                for q in range(1, total + 1):
                    w = url + "pn" + str(q)
                    r.hset('58url1', w, 1)
                    print(w)
            except Exception as e:
                print(e.args)
def main():
    """Run the crawl: index page -> districts -> locations -> page URLs."""
    rst = getUrl()
    if not rst:
        # getUrl() returns '' on failure; there is nothing to parse.
        print('failed to fetch the district index page')
        return
    quyu = getQuyu(rst)
    didian = getDidian(quyu)
    getTotal(didian)


# Only start the crawl when executed as a script, not when imported.
if __name__ == '__main__':
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值