import re
import requests
from bs4 import BeautifulSoup


# Main method: crawl the Erdao district second-hand housing listings on Lianjia.
def main():
    # Attach a request header that mimics the Chrome browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
    page_max = 100
    for i in range(1, int(page_max) + 1):
        # Page 1 has no "pg" suffix; later pages are /pg2, /pg3, ...
        if i == 1:
            house = 'https://cc.lianjia.com/ershoufang/erdaoqu/'
        else:
            house = 'https://cc.lianjia.com/ershoufang/erdaoqu/pg' + str(i)
        res = requests.get(house, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        li_max = soup.find('ul', class_='sellListContent').find_all('li')
        for li in li_max:
            try:
                house_param = {}
                # houseInfo is pipe-separated: field 0 is the estate name,
                # field 2 contains the floor area.
                content = li.find('div', class_='houseInfo').text
                content = content.split("|")
                house_param['housing_estate'] = content[0]
                house_param['square_metre'] = re.findall(r'-?\d+\.?\d*e?-?\d*?', content[2])[0]
                # -------------------------------------------------------- #
                position = li.find('div', class_='positionInfo').find('a').text
                house_param['position'] = position
                # -------------------------------------------------------- #
                # Keep only the digits of the total and unit prices.
                totalprice = li.find('div', class_='totalPrice').text
                house_param['total_price'] = re.sub(r"\D", "", totalprice)
                unitprice = li.find('div', class_='unitPrice').text
                house_param['unit_price'] = re.sub(r"\D", "", unitprice)
                # -------------------------------------------------------- #
                # followInfo is slash-separated: follower count / viewing count.
                follow = li.find('div', class_='followInfo').text
                follow = follow.split("/")
                house_param['follow'] = re.sub(r"\D", "", follow[0])
                house_param['take_look'] = re.sub(r"\D", "", follow[1])
                # -------------------------------------------------------- #
                # The title links to the detail page; the digits in that URL are the listing id.
                title_src = li.find('div', class_='title').find('a').attrs['href']
                house_param['url'] = re.sub(r"\D", "", title_src)
                res = requests.get(title_src, headers=headers)
                soup = BeautifulSoup(res.text, 'html.parser')
                # -------------------------------------------------------- #
                # Publication date: second <span> of the first <li> in the transaction block.
                pub_date = soup.find('div', class_='transaction').find_all('li')[0].find_all('span')[1].text
                house_param['pub_date'] = pub_date
                print(house_param)
            except Exception as e:
                print(e)


if __name__ == '__main__':
    main()
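The script above only prints each house_param dict. If you want to keep the results, a minimal sketch (not part of the original code, so the helper name save_to_csv and the output filename are my own) is to collect the dicts into a list and dump them with csv.DictWriter; the field names below simply mirror the keys built in main():

import csv

# Hypothetical helper: persist a list of house_param dicts to a CSV file.
FIELDS = ['housing_estate', 'square_metre', 'position', 'total_price',
          'unit_price', 'follow', 'take_look', 'url', 'pub_date']

def save_to_csv(records, path='ershoufang_erdaoqu.csv'):
    # 'utf-8-sig' adds a BOM so Excel displays the Chinese estate names correctly.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        writer.writeheader()
        writer.writerows(records)

In main(), you would append each house_param to a list instead of (or in addition to) printing it, and call save_to_csv(results) once the loop finishes.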