# Scrape second-hand housing listings from Lianjia (lianjia.com).
import json
import re

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class HomeLinkSpider(object):
    """Scraper that collects listing details from paginated listing pages.

    Each scraped listing is stored as a dict in ``self.data`` with the keys
    ``titile`` (sic), ``house``, ``bedroom``, ``area``, ``direction``,
    ``floor``, ``year``, ``location``, ``total_price`` and ``unit_price``.
    All values are strings; a field that cannot be extracted is set to
    ``UNKNOWN``.
    """

    # Placeholder stored for any field that cannot be extracted ("unknown").
    UNKNOWN = "未知"

    # Compiled once here instead of once per listing inside the parse loop.
    _FLOOR_PATTERN = re.compile(r'.+\)')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _NUMBER_PATTERN = re.compile(r'\d+')

    def __init__(self, url, timeout=10):
        """Create a spider for the listing index at *url*.

        Args:
            url: Base listing URL; page URLs are built by appending
                ``pg<N>/`` to it, so it is expected to end with ``/``.
            timeout: Seconds passed to ``requests.get`` as ``timeout`` so a
                stalled connection cannot hang the crawl forever.
        """
        self.ua = UserAgent()
        # One random User-Agent is picked here and reused for every request.
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.url = url
        self.timeout = timeout

    def get_max_page(self):
        """Return the total number of result pages, or ``None`` if unknown.

        ``None`` is returned when the response status is not 200, when the
        pagination element is absent, or when its ``page-data`` attribute
        cannot be parsed.
        """
        response = requests.get(
            self.url, headers=self.headers, timeout=self.timeout
        )
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        boxes = soup.select('div[class="page-box house-lst-page-box"]')
        if not boxes:
            return None
        return self._parse_total_page(boxes[0].attrs.get('page-data'))

    def parse_page(self):
        """Fetch every result page and collect the listings found on them.

        Returns:
            ``self.data``, the list of listing dicts accumulated so far. It
            is returned unchanged when the page count cannot be determined.
        """
        max_page = self.get_max_page()
        print(max_page)
        if max_page is None:
            # Previously this fell through to ``None + 1`` and raised TypeError.
            return self.data
        for i in range(1, max_page + 1):
            # The original format string was "{}pg()/", which never used the
            # page number, so every iteration requested the same URL.
            # NOTE(review): assumes "<base>pg<N>/" is the URL scheme the site
            # expects; confirm against the live pagination links.
            url = "{}pg{}/".format(self.url, i)
            response = requests.get(
                url, headers=self.headers, timeout=self.timeout
            )
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find_all("ul", class_="sellListContent")
            if not ul:
                continue
            for li in ul[0].select("li"):
                detail = self._parse_listing(li)
                if detail is not None:
                    self.data.append(detail)
        print(self.data)
        return self.data

    @staticmethod
    def _parse_total_page(page_data):
        """Extract ``totalPage`` from the JSON text of a ``page-data`` attribute.

        Returns the page count as an int, or ``None`` when *page_data* is
        missing, is not valid JSON, or has no ``totalPage`` entry.
        """
        # SECURITY: the original called eval() on this attribute, which is
        # text downloaded from the network. json.loads parses the same
        # {"totalPage": N, ...} payload without executing anything.
        try:
            return int(json.loads(page_data)["totalPage"])
        except (TypeError, ValueError, KeyError):
            return None

    @staticmethod
    def _node_text(li, selector):
        """Return the text of the first node matching *selector*, or ``None``."""
        nodes = li.select(selector)
        return nodes[0].get_text() if nodes else None

    @classmethod
    def _parse_listing(cls, li):
        """Build the detail dict for one ``<li>``; ``None`` if it is not a listing.

        An ``<li>`` lacking any of the five expected ``div`` elements is
        skipped rather than aborting the whole crawl with an IndexError.
        """
        title = cls._node_text(li, 'div[class="title"]')
        house_info = cls._node_text(li, 'div[class="houseInfo"]')
        position_info = cls._node_text(li, 'div[class="positionInfo"]')
        total_price = cls._node_text(li, 'div[class="totalPrice"]')
        unit_price = cls._node_text(li, 'div[class="unitPrice"]')
        required = (title, house_info, position_info, total_price, unit_price)
        if any(text is None for text in required):
            return None

        detail = dict()
        # The misspelled key "titile" is kept as-is: it is part of the output
        # data and existing consumers may depend on it.
        detail["titile"] = title
        detail.update(cls._parse_house_info(house_info))
        detail.update(cls._parse_position_info(position_info))
        # e.g. "650万" -> "650"
        detail['total_price'] = cls._first_number(total_price)
        # e.g. "单价64182元/平米" -> "64182"
        detail['unit_price'] = cls._first_number(unit_price)
        return detail

    @classmethod
    def _parse_house_info(cls, text):
        """Split a " | "-separated summary into its first four fields.

        Example input:
        "大华锦绣华城(九街区) | 3室2厅 | 76.9平米 | 南 | 其他 | 无电梯".
        Missing trailing fields are filled with ``UNKNOWN``.
        """
        fields = text.split(" | ")
        fields += [cls.UNKNOWN] * (4 - len(fields))
        return {
            'house': fields[0],
            'bedroom': fields[1],
            'area': fields[2],
            'direction': fields[3],
        }

    @classmethod
    def _parse_position_info(cls, text):
        """Extract floor, build year and location from a position string.

        Example input: "低楼层(共7层)2006年建板楼 - 张江". The part before
        " - " yields the floor (everything up to the last ")") and the year
        (first four-digit run); the part after it is the location. Anything
        that cannot be found is ``UNKNOWN``.
        """
        parts = text.split(" - ")
        head = parts[0]
        floor = cls._FLOOR_PATTERN.search(head)
        year = cls._YEAR_PATTERN.search(head)
        return {
            'floor': floor.group() if floor else cls.UNKNOWN,
            'year': year.group() if year else cls.UNKNOWN,
            'location': parts[1] if len(parts) > 1 else cls.UNKNOWN,
        }

    @classmethod
    def _first_number(cls, text):
        """Return the first run of digits in *text*, or ``UNKNOWN`` if none."""
        match = cls._NUMBER_PATTERN.search(text)
        return match.group() if match else cls.UNKNOWN
# Script entry: crawl the listing index below and report how many
# listings were collected.
url = "https://sh.lianjia.com/ershoufang/minhang/l2p1/"
spider = HomeLinkSpider(url)
listings = spider.parse_page()
print(len(listings))