一.二手房
网站:安居客
工具:开发平台:pycharm,爬取工具:selenium+pyquery
主要爬取信息: town:镇名,flood:小区名称,decoration装修程度,range房屋户型,built建筑年代,area 面积,floor楼层,totalPrice总价,unitprice单价 orientation 朝向
难点:安居客最多只能爬取100页,所以要分批爬取。并且如上图所示,筛选条件只有7个,还缺少装修程度以及镇名,因此我们可以按每个镇进行划分。
# Town slugs (Dongguan districts/towns) used as path segments in Anjuke URLs.
names = [
    'nanchenga', 'dongchengd', 'humen', 'changpingb', 'zhangmutou',
    'tangxia', 'houjiea', 'wanjianga', 'liaobua', 'changanc',
    'fengganga', 'guanchengb', 'dalang', 'songshanhua', 'huangjianga',
    'qingxizhen', 'dalinshanzhen', 'henglizhen', 'shilongzhen',
    'shatianzhen', 'shijiezhen', 'daojiaozhen', 'mayongzhen',
    'dongkengzhen', 'chashanzhen', 'zhongtangzhen', 'qiaotouzhen',
    'gaobuzhen', 'shipaizhen', 'dgwangniudunzhen', 'hongmeizhen',
    'qishizhen', 'xiegangzhen',
]
但是通过这样观察还是可以发现,南城区超过100页,因此,还需要增加一个筛选条件,可以选择影响最小的条件–品牌中介。筛选条件如下:
相关分类实现。尤其需要注意的是 d2_123,也就是简装修(普通装修)的条件,它的下一页跳转是有问题的。
# Brokerage-brand filter codes used in Anjuke URLs:
#   j84442  -> Leyoujia (乐有家)
#   j15236  -> Centaline Property (中原地产)
#   j102476 -> Yufeng Property (裕丰地产)
#   j54471  -> 5i5j (我爱我家)
#   j15693  -> Rongyi Property (融易地产)
#   j19844  -> Jiafeng Property (嘉丰地产)
#   j29181  -> Vanke Property resale (万科物业二手房)
real_estate = ['j84442', 'j15236', 'j102476', 'j54471', 'j15693', 'j19844', 'j29181']
# Decoration-level filter codes (Anjuke URL segment):
#   d1     -> rough/unfinished (毛坯)
#   d2_123 -> basic decoration (普通装修); its "next page" link is broken
#             on Anjuke, so it is paginated by explicit page index elsewhere
#   d3     -> refined decoration (精装修)
#   d4     -> luxury decoration (豪华装修)
decorations = ['d1', 'd2_123', 'd4', 'd3']
开始实现 def main():主要通过三层嵌套循环(镇 × 品牌中介 × 装修程度)进行。
# Crawl every (town, brokerage, decoration) combination so that no single
# result set exceeds Anjuke's 100-page cap.
for name in names:
    for estate in real_estate:
        for decoration in decorations:
            current_url = ('https://dg.anjuke.com/sale/' + quote(name) + '/'
                           + quote(decoration) + '-' + quote(estate) + '-p1')
            if decoration == 'd2_123':
                # d2_123's "next page" link is unreliable, so pages are
                # addressed by an explicit index appended to this prefix.
                special_current_url = ('https://dg.anjuke.com/sale/' + quote(name) + '/'
                                       + quote(decoration) + '-' + quote(estate) + '-p')
                go_jianxiu_next_page(special_current_url, decoration, 1)
                continue
            print('开始一页-------------------------------------------------------------------------------------------')
            go_next_page(current_url, decoration)
browser.close()
获取页面信息
# Mapping from Anjuke decoration URL codes to their Chinese labels; an
# explicit dict is safer than chained str.replace calls, which would
# silently corrupt any code containing another code as a substring.
_DECORATION_LABELS = {
    'd1': '毛坯',
    'd2_123': '普通装修',
    'd4': '豪华装修',
    'd3': '精装修',
}


def get_page_info(decoration):
    """Parse the listing page currently loaded in the global ``browser``
    and append one CSV row per listing.

    Columns: town, flood (community name), decoration, built
    (construction year), house_range (layout), area, floor,
    total_price, unit_price.

    :param decoration: Anjuke decoration filter code ('d1', 'd2_123',
        'd4' or 'd3'); written to the CSV as its Chinese label.
    """
    decoration = _DECORATION_LABELS.get(decoration, decoration)
    doc = pq(browser.page_source)
    infos = doc('#houselist-mod-new .list-item').items()
    # NOTE(review): opened without an explicit encoding, so the platform
    # default applies — confirm it matches rows appended earlier.
    with open('C://Users//Administrator//Desktop//毕设数据//安居客//二手房//安居客.csv', 'a', newline='') as csvFile:
        filename = ['town', 'flood', 'decoration', 'built', 'house_range', 'area', 'floor', 'total_price', 'unit_price']
        writer = csv.DictWriter(csvFile, filename)
        # The town name is a page-level value, not per-listing: read it once
        # instead of re-querying the document for every row.
        town = doc('#filtersort span strong').text()
        for i, info in enumerate(infos, start=1):
            things = {
                'town': town,
                'flood': info.find('div.house-details div:nth-child(2) span:nth-child(5)').text(),
                'decoration': decoration,
                'built': info.find('div.house-details div:nth-child(2) span:nth-child(7)').text(),
                'house_range': info.find('div.house-details div:nth-child(2) span:nth-child(1)').text(),
                'area': info.find('div.house-details div:nth-child(2) span:nth-child(3)').text().strip('m²'),
                # the split separator and the '\u4c9e' repair are site-specific quirks — keep as-is
                'floor': info.find('div.house-details > div:nth-child(3) > span').text().split('\xa0\xa0 ')[0].replace('\u4c9e', '鱼立'),
                'total_price': info.find('div.pro-price span.price-det strong').text(),
                'unit_price': info.find('div.pro-price span.unit-price').text().strip("元/m²"),
            }
            print("第" + str(i) + "条数据")
            print(things)
            writer.writerow(things)
之后需要进行正常装修条件下的页面的跳转
def go_next_page(current_url, decoration):
    """Scrape ``current_url`` and all following pages for a normal
    decoration filter, following the site's "next page" link.

    Iterates instead of recursing, so a long pagination chain (up to
    Anjuke's 100-page cap) cannot grow the call stack one frame per page.

    :param current_url: URL of the first page to scrape.
    :param decoration: decoration filter code, forwarded to get_page_info.
    """
    while True:
        browser.get(current_url)
        # Scroll near the bottom so lazily-loaded listings are rendered.
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight*0.93);")
        time.sleep(5)
        get_page_info(decoration)
        doc = pq(browser.page_source)
        # The a.aNxt element is absent on the last page of results.
        next_page = doc('#content > div.sale-left > div.multi-page > a.aNxt')
        if len(next_page) == 0:
            return
        # Keep the try body minimal: only the lookup can raise.
        try:
            href_url = browser.find_element_by_xpath('//a[contains(text(), "下一页")]')
        except NoSuchElementException:
            print("单页结束")
            return
        print("新一页-------------------------------------------------------------")
        print(href_url.get_attribute('href'))
        current_url = href_url.get_attribute('href')
简修条件下的页面爬取信息
def go_jianxiu_next_page(current_url, decoration, i):
    """Scrape all pages for the d2_123 (basic decoration) filter.

    The "next page" href is broken on Anjuke for this filter, so instead
    of following it, each page is addressed by appending an explicit page
    index to ``current_url``. Iterative rather than recursive, so long
    pagination chains cannot grow the call stack.

    :param current_url: URL prefix ending in '-p'; the page number is appended.
    :param decoration: decoration filter code (expected to be 'd2_123').
    :param i: page number to start from (normally 1).
    """
    page = int(i)
    while True:
        browser.get(current_url + quote(str(page)))
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight*0.93);")
        time.sleep(5)
        get_page_info(decoration)
        doc = pq(browser.page_source)
        # a.aNxt is only used to detect that another page exists; its
        # href is not trusted for this filter.
        next_page = doc('#content > div.sale-left > div.multi-page > a.aNxt')
        if len(next_page) == 0:
            return
        try:
            href_url = browser.find_element_by_xpath('//a[contains(text(), "下一页")]')
        except NoSuchElementException:
            print("单页结束")
            return
        print("新一页-------------------------------------------------------------")
        print(href_url.get_attribute('href'))
        page += 1
        print("i的值为:" + str(page))
tips:爬取过程中会出现滑动验证问题,暂时还没有解决,等解决后再更新。