本次的目标是爬取“广州链家新房”前十页的信息,需要爬取的字段为“楼房名字、地址、价格以及是否在售”。注意:下方代码默认实际遍历第 1–73 页,而非仅前十页。具体代码如下。
import requests,time
import pandas as pd
from lxml import etree
# Column accumulators shared by the functions below: get_house_data() appends
# one entry per listing to each list, and save_data() / get_lens() read them.
house_name, location_list, Price_list, Is_it_for_sale = [], [], [], []
def _first_text(node, path, default=''):
    """Return the first result of ``node.xpath(path)``, or *default* if empty."""
    matches = node.xpath(path)
    return matches[0] if matches else default


def get_house_data(start_page=1, end_page=73, *, timeout=10):
    """Scrape listing pages and append the results to the module-level lists.

    For each page number in ``start_page..end_page`` the listing page is
    downloaded and parsed, and for every ``<li>`` under
    ``ul.resblock-list-wrapper`` one entry is appended to each of
    ``house_name``, ``location_list``, ``Price_list`` and ``Is_it_for_sale``.

    Args:
        start_page: First page number to fetch.
        end_page: Last page number to fetch, inclusive. The default of 73
            reproduces the previously hard-coded ``range(1, 74)``.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the crawl forever.

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    for page in range(start_page, end_page + 1):
        print("正在爬取第{}页数据".format(page))
        url = 'http://gz.fang.lianjia.com/loupan/pg{}/'.format(page)
        page_text = requests.get(url=url, timeout=timeout).text
        tree = etree.HTML(page_text)
        # Locate the listing items by the class attribute of their parent list.
        li_list = tree.xpath('//ul[@class="resblock-list-wrapper"]/li')
        for li in li_list:
            # A listing that lacks one of the fields yields '' for that field
            # instead of raising IndexError, so the four lists stay aligned
            # and one incomplete listing does not abort the whole crawl.
            house_name.append(_first_text(li, './div/div[1]/a/text()'))
            location_list.append(_first_text(li, './/div/div[2]/a/text()'))
            Price_list.append(
                _first_text(li, './/div/div[6]/div[1]/span[1]/text()'))
            Is_it_for_sale.append(
                _first_text(li, './div/div[1]/span[2]/text()'))
        # Short pause between page requests.
        time.sleep(0.1)
def save_data(path='data_all.csv'):
    """Write the scraped columns to a CSV file and return them as a DataFrame.

    The module-level lists ``house_name``, ``location_list``, ``Price_list``
    and ``Is_it_for_sale`` become the four columns of the table.

    Args:
        path: Destination CSV file. Defaults to ``'data_all.csv'``.

    Returns:
        The ``pandas.DataFrame`` that was written. The original ended with a
        bare ``data_all`` expression, which has no effect inside a function.

    Raises:
        ValueError: Raised by ``pd.DataFrame`` when the lists differ in length.
    """
    print("————正在保存数据————")
    data = {
        '楼房名字': house_name,
        '地址': location_list,
        '价格': Price_list,
        '是否在售': Is_it_for_sale,
    }
    data_all = pd.DataFrame(data)
    # index=False is the documented way to omit the row index; the previous
    # index=None only worked because None is falsy.
    data_all.to_csv(path, index=False, encoding='utf-8')
    return data_all
def get_lens():
    """Report whether the four scraped columns hold the same number of entries.

    Compares the lengths of the module-level lists ``house_name``,
    ``location_list``, ``Price_list`` and ``Is_it_for_sale`` and prints the
    matching status message.

    The previous version put the mismatch message in a bare ``except``
    clause; comparing lengths never raises, so that message could never be
    printed. An explicit ``else`` path replaces it.

    Returns:
        True when all four lengths are equal, False otherwise.
    """
    lengths = {
        len(house_name),
        len(location_list),
        len(Price_list),
        len(Is_it_for_sale),
    }
    # A single distinct length means every column has the same row count.
    if len(lengths) == 1:
        print("数据无误,保存成功。")
        return True
    print("数据有缺漏,请认真核查")
    return False
# Script entry point: scrape the pages, write the CSV, then report on the
# column lengths, in that order.
if __name__ == '__main__':
    for step in (get_house_data, save_data, get_lens):
        step()