# Scraper for short-term rental listings on xiaozhu.com
# xiaozhu.com crawler, version 2.0
# Features: crawls multiple result pages and saves the listing data to CSV files
from PIL import Image
import requests
from bs4 import BeautifulSoup
import re
import os
from io import BytesIO
import csv
import pandas as pd
import time
# Find the listed price of every room on a search-results page.
def get_prices(soup):
    """Return the price strings of all listings on a search-results page.

    soup: parsed search-results page (an object with a ``select`` method,
    e.g. BeautifulSoup).
    """
    # Each listing renders its price inside this span.
    prices = soup.select('#page_list > ul > li > div > div > span')
    return [price.text for price in prices]
# Find the image URLs inside the listing anchor tags.
# Compiled once at module level instead of once per link (loop-invariant).
# The leading space in the pattern matches the attribute boundary in raw HTML.
_LAZY_SRC_RE = re.compile(r' lazy_src="(.*?)"')

def get_images(links):
    """Return one list of image URLs per link.

    links: iterable of tag objects (anything whose ``str()`` is HTML).
    Returns a list of lists — each inner list holds the ``lazy_src``
    URLs found in the corresponding link.
    """
    return [_LAZY_SRC_RE.findall(str(link)) for link in links]
# Find the address of each listing on a search-results page.
def get_address(soup, count=24):
    """Return the address text of the first *count* listings on the page.

    soup: parsed search-results page (object with a ``select`` method).
    count: number of listings per page; xiaozhu shows 24, kept as the
        default for backward compatibility.

    Raises IndexError if the page has fewer than *count* listings.
    """
    address_list = []
    for i in range(count):
        # CSS nth-of-type is 1-based, hence i + 1.
        selector = ('#page_list > ul > li:nth-of-type({}) > '
                    'div.result_btm_con.lodgeunitname > div.result_intro > '
                    'a > span').format(i + 1)
        address = soup.select(selector)
        address_list.append(address[0].text)
    return address_list
# Find the room id that links each listing to its detail page.
def get_room_host_info(soup, count=24):
    """Return the room ids of the first *count* listings.

    soup: parsed search-results page (object with a ``select`` method).
    count: number of listings per page (default 24, the site's page size).

    The id is stored in the thumbnail's ``data-growing-title`` attribute
    and is later used to build the room-detail URL.
    Raises IndexError if the page has fewer than *count* listings.
    """
    host_info = []
    for i in range(count):
        title = soup.select(
            '#page_list > ul > li:nth-of-type({}) > a > img'.format(i + 1))
        host_info.append(title[0].get('data-growing-title'))
    return host_info
# Decide the host's gender from the gender-icon CSS class.
def judeg_gender(gender):
    """Map the icon's class list to '女' (female) or '男' (male).

    gender: CSS class list of the gender-icon span; the class
    'member_girl_ico' marks a female host, anything else is treated
    as male.
    """
    # NOTE(review): the name is a typo of "judge_gender"; kept unchanged
    # so existing callers keep working.
    return '女' if gender[0] == 'member_girl_ico' else '男'
# Find the host's gender and name by visiting each room's detail page.
def get_gender_and_name(host_info):
    """Fetch each room page and return ``[gender_list, name_list]``.

    host_info: list of room ids (as produced by get_room_host_info).
    Returns a two-element list of parallel lists: host genders and names.

    Performs one HTTP GET per room with a 2-second pause between requests
    to reduce the chance of triggering the site's anti-bot checks.
    """
    # Loop-invariant: build the headers dict once, not once per request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"}
    gender_list = []
    name_list = []
    # Iterate over the ids themselves instead of a hard-coded range(24),
    # so a page with fewer listings no longer raises IndexError.
    for room_id in host_info:
        url = 'https://sh.xiaozhu.com//fangzi//{}.html'.format(room_id)
        host_res = requests.get(url, headers=headers)
        host_soup = BeautifulSoup(host_res.text, 'lxml')
        gender = host_soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')
        name = host_soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
        # The gender is encoded in the icon's CSS class, not in text.
        gender_list.append(judeg_gender(gender[0].get('class')))
        name_list.append(name[0].text)
        time.sleep(2)  # polite throttle between requests
    return [gender_list, name_list]
# Main routine: scrape one search-results page and write it to CSV.
def main(i):
    """Scrape page ``i + 1`` of the search results and write Room_Data{i+1}.csv.

    i: zero-based page index; the URL targets sh.xiaozhu.com, i.e. the
    Shanghai short-term rental search (the old comment said Beijing,
    which contradicted the URL).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"}
    url = 'https://sh.xiaozhu.com//search-duanzufang-p{}-0//'.format(i + 1)
    # Fetch and parse the page once; every extractor below reuses the soup.
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    links = soup.select('#page_list > ul > li > a')
    print('已获取小猪网的URL{}遍...'.format(i + 1))
    prices_list = get_prices(soup)
    print('正在查找房源价格{}遍...'.format(i + 1))
    images_list = get_images(links)
    print('正在查找图片链接{}遍...'.format(i + 1))
    address_list = get_address(soup)
    print('正在查找房源地址{}遍...'.format(i + 1))
    host_info = get_room_host_info(soup)
    print('搜寻房东页面的URL{}遍'.format(i + 1))
    gender_and_name = get_gender_and_name(host_info)
    print('正在查找房东性别和姓名...{}遍'.format(i + 1))
    gender_list, name_list = gender_and_name
    # Assemble the parallel lists into one table.
    information = pd.DataFrame({
        '地址': address_list,
        '价格': prices_list,
        '名字': name_list,
        '性别': gender_list,
        '图片链接': images_list,
    })
    # index=False replaces the original index=0 — same effect (0 is falsy),
    # spelled as pandas documents it.
    information.to_csv('Room_Data{}.csv'.format(i + 1), index=False, sep=',')
    print('数据已经写入CSV文件{}遍'.format(i + 1))
if __name__ == '__main__':
    # Scrape up to 10 result pages, pausing between pages.
    for i in range(10):
        try:
            main(i)
            print('程序已执行{}遍'.format(i + 1))
            time.sleep(10)  # pause between pages to look less like a bot
            print('sleeping...')
        # except Exception, not a bare except: Ctrl-C / SystemExit must
        # still terminate the program instead of triggering the retry prompt.
        except Exception:
            print('对方反爬虫机制已启动,请前往小猪网滑动验证方块')
            answer = input('是否继续(Y/N):')
            if answer == 'Y':
                # One manual retry after the user solves the captcha.
                main(i)
            else:
                print('程序已退出')
                # Bug fix: previously the loop kept running after printing
                # "exited"; now the user's refusal actually stops the run.
                break