# Source code follows (see the inline comments for explanations):
import re
import requests
from my_fake_useragent import UserAgent
class LianJia():
    """Extract listing details from one page of HTML and print them.

    Parsing is done purely with regular expressions.  All whitespace is
    removed from the HTML first, so every pattern below is written against
    whitespace-free markup (e.g. ``<liclass=`` rather than ``<li class=``).
    A field that cannot be found in a listing is printed as ``暂无``.

    Usage: ``LianJia(html)()`` -- calling the instance runs :meth:`main`.
    """

    # Placeholder printed when a field is absent from a listing.
    _MISSING = '暂无'

    # Patterns are compiled once at class level instead of once per listing.
    _WHITESPACE = re.compile(r'\s+')
    # One match per listing <li> element; group 1 is the element's content.
    _ITEM = re.compile('<liclass="res.*?"data.*?>(.*?)</li>')
    _NAME = re.compile('.*?class="resblock-img-wrapper"title="(.*?)"')
    _IMAGE = re.compile('.*?class="lj-lazy"data-original="(.*?)"')
    _STATUS = re.compile(
        'style=.*?>(.*?)</span><spanclass="sale-status"style=.*?>(.*?)</span>'
    )
    _UNIT_PRICE = re.compile('<spanclass="number">(.*?)</span>')
    _TOTAL_PRICE = re.compile('<divclass="second">总价(.*?)</div>')
    _AREA = re.compile('<divclass="resblock-area"><span>建面(.*?)</span>')
    _ADDRESS = re.compile(
        '<span>(.*?)</span><iclass="split">/</i>'
        '<span>(.*?)</span><iclass="split">/</i>'
        '<ahref=.*?}">(.*?)</a>'
    )
    # Only matches when the tag block starts with four consecutive <span>s.
    _TAGS = re.compile(
        '<divclass="resblock-tag">'
        '<span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span>'
    )

    def __init__(self, html):
        """Store the raw page HTML (a ``str``) to be parsed by :meth:`main`."""
        self.html = html

    def __call__(self, *args, **kwargs):
        """Run :meth:`main`; any arguments are accepted and ignored."""
        self.main()

    @classmethod
    def _first(cls, pattern, text):
        """Return group 1 of the first match of *pattern*, or the placeholder."""
        found = pattern.search(text)
        return found.group(1) if found else cls._MISSING

    @classmethod
    def _joined(cls, pattern, text, suffix):
        """Return all groups of the first match, each followed by *suffix*.

        Returns the placeholder when *pattern* does not match *text*.
        """
        found = pattern.search(text)
        if not found:
            return cls._MISSING
        return ''.join(part + suffix for part in found.groups())

    def main(self):
        """Print every listing found in ``self.html`` to standard output.

        The last ``<li>`` match on the page is discarded as a non-listing
        entry, as in the original implementation.  Returns ``None``.
        """
        # Strip all whitespace (spaces and newlines) from the page.
        compact = self._WHITESPACE.sub('', self.html)
        # Collect the content of every listing <li> element.
        items = self._ITEM.findall(compact)
        # Drop the trailing junk match; guard against a page with no matches
        # (an unconditional pop() would raise IndexError).
        if items:
            items.pop()

        for index, item in enumerate(items, 1):
            print(f'第{index}个')
            # Listing name.
            print('名称:', self._first(self._NAME, item))
            # Rendering / preview image URL.
            print('效果图:', self._first(self._IMAGE, item))
            # Sale status (two labels, each followed by a space).
            print('状态:', self._joined(self._STATUS, item, ' '))
            # Unit price: the whole line must fall back to the placeholder
            # when the price is missing, not just the unit suffix.
            unit_price = self._UNIT_PRICE.search(item)
            if unit_price:
                print('均价:', unit_price.group(1), '元/㎡')
            else:
                print('均价:', self._MISSING)
            # Total price.
            print('总价:', self._first(self._TOTAL_PRICE, item))
            # Floor area.
            print('建筑面积:', self._first(self._AREA, item))
            # Address (three parts, each followed by a comma).
            print('地址:', self._joined(self._ADDRESS, item, ','))
            # Feature tags (four parts, each followed by a comma).
            print('优点:', self._joined(self._TAGS, item, ','))
            # Blank line between listings.
            print()
if __name__ == '__main__':
    # Seconds to wait for the server before giving up on a page; without a
    # timeout a single stalled connection would hang the whole crawl.
    request_timeout = 10

    # NOTE(review): this is a hard-coded browser session cookie committed in
    # source; consider loading it from configuration instead -- confirm
    # whether the site actually requires it.
    cookie = (
        'ab_jid_BFESS=b4db74462a11ae46581569a222fcfacbd11f;'
        ' ab_jid=b4db74462a11ae46581569a222fcfacbd11f;'
        ' ab_jid=b4db74462a11ae46581569a222fcfacbd11f; '
        'BIDUPSID=B14AA316D2CDE2A262495641CB162AC7;'
        ' PSTM=1578509640;'
        ' BDUSS_BFESS=hSU0k2eWF-Y0VtOEt1TlRVfk1JMlpxMGpyNHNmQ'
        'ldqalpCTU4yTUo4dkI0SDlnRVFBQUFBJCQAAAAAAAAAAAEAAADv7'
        'Mp2n2~0dWNrAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
        'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMFTWGDBU1hgN2; '
        'BAIDUID_BFESS=4E5DC05E66B65A2F0A1BAA1D6504F37D:FG=1;'
        ' ab_sr=1.0.0_MDRhNWI1MzMxMmVjNjMxZjI5NWFmYjJiOWI5MjdmY'
        '2I4OWE3ZjUxMTNjZjRjODI4NWIxZmY2ODQyZDFlOTEwZjQyZjAxMDY'
        'yMWE5NjhlN2RmOGEyYmI4YzYwMmE5MWJh'
    )

    # The site showed 100 result pages when this was written, so the loop is
    # hard-coded to 100; the page count could instead be read from the site.
    for page in range(1, 101):
        print(f'第{page}页')
        # Build the URL of the current result page.
        url = f'https://xa.fang.lianjia.com/loupan/pg{page}/'
        # Request headers; a fresh random user agent is picked for each page.
        headers = {
            'Cookie': cookie,
            'Host': 'xa.fang.lianjia.com',
            "user-agent": UserAgent().random(),
            # Must be a well-formed URL (the original value contained spaces).
            'Referer': 'https://xa.fang.lianjia.com/',
        }
        # Fetch the page.
        response = requests.get(url=url, headers=headers, timeout=request_timeout)
        html = response.content.decode('utf-8')
        # Parse the page and print its listings.
        LianJia(html)()