# Scrape housing listings into a CSV file; fairly simple, the site has little anti-scraping protection.
"""
@File : qfang.py
@Time : 2020/6/11 14:44
@Author : ligang
@WeChat : 18233275213
@Software: PyCharm
"""
import requests
import time
from lxml import etree
import csv
def spider_page(url, timeout=10):
    """Download one listing page and return its HTML as text.

    Sends a browser-like User-Agent plus session cookies so the site
    serves the normal page, and sleeps briefly after each request to
    throttle the crawl rate.

    :param url: full URL of the listing page to fetch
    :param timeout: seconds to wait before aborting the request
                    (added so a stalled server cannot hang the scraper)
    :return: response body as text
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
            AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/70.0.3538.110 Safari/537.36',
               'upgrade-insecure-requests': '1',
               'cookie': 'cookieId=4a8744d7-42ab-4567-bb86-3eaea4fab2e1; sid=eacd9f8d-3a3a-4bb0-8a15-8ce9adba2886; qchatid=2ba17dcd-b976-46c6-b02b-3c12323002fd; language=SIMPLIFIED; JSESSIONID=aaavyjKCV6-9H5phIDEkx; cookieId=03a9dc9f-40c1-4d34-890b-0ef91d91d713; cookieId=91044341-5ea6-45fc-95e6-e22c7160c570; CITY_NAME=SHENZHEN; Hm_lvt_4d7fad96f5f1077431b1e8d8d8b0f1ab=1591858478; Hm_lpvt_4d7fad96f5f1077431b1e8d8d8b0f1ab=1591858478; Hm_lvt_de678bd934b065f76f05705d4e7b662c=1591858479; Hm_lpvt_de678bd934b065f76f05705d4e7b662c=1591858479; WINDOW_DEVICE_PIXEL_RATIO=1; _jzqa=1.1115870713157731100.1591858479.1591858479.1591858479.1; _jzqc=1; _jzqckmp=1; _qzja=1.1207994965.1591858478700.1591858478700.1591858478700.1591858478700.1591858478700.0.0.0.1.1; _qzjc=1; _qzjto=1.1.0; _ga=GA1.3.1531234912.1591858479; _gid=GA1.3.676679335.1591858479; _dc_gtm_UA-47416713-1=1; _jzqb=1.1.10.1591858479.1; _qzjb=1.1591858478700.1.0.0.0'}
    # timeout prevents an unbounded hang when the server stops responding
    response = requests.get(url, headers=headers, timeout=timeout)
    time.sleep(2)
    return response.text
def csv_data(item):
    """Append a single row (a sequence of field values) to the output CSV."""
    with open('fangwo_info.csv', 'a+', encoding='utf-8', newline='') as fh:
        csv.writer(fh).writerow(item)
def paser_info(url):
    """Parse one listing page and append each house's fields to the CSV.

    Extracts name, layout, area, decoration, floor, orientation, unit
    price and total price for every listing on the page, then writes
    one CSV row per listing via ``csv_data``.

    :param url: URL of the listing page to scrape
    """
    html = spider_page(url)
    selector = etree.HTML(html)

    def _first(node, xp):
        # First text match of a relative xpath, or '' when the node is absent,
        # so a malformed item cannot raise IndexError.
        found = node.xpath(xp)
        return found[0] if found else ''

    # Iterate over the items actually present instead of assuming exactly 30
    # per page (the original range(1, 31) crashed on shorter final pages).
    for li in selector.xpath('//*[@class="list-result"]/ul/li'):
        xiangq = _first(li, './div[2]/div[1]/a/text()')
        if not xiangq:
            continue  # skip list entries that are not listing cards
        name = xiangq.split(' ')[0]
        style = _first(li, './div[2]/div[2]/p[1]/text()').split(' ', 1)[0]
        area = _first(li, './div[2]/div[2]/p[2]/text()').split(' ', 1)[0]
        decotored = _first(li, './div[2]/div[2]/p[3]/text()').split(' ', 1)[0]
        louceng = _first(li, './div[2]/div[2]/p[4]/text()').split(' ', 1)[0]
        chaoxiang = _first(li, './div[2]/div[2]/p[5]/text()').split(' ', 1)[0]
        total = _first(li, './div[3]/p[1]/span[1]/text()').split(' ', 1)[0]
        price = _first(li, './div[3]/p[2]/text()').split(' ', 1)[0]
        # Field order matches the header row written by main().
        info = [name, style, area, decotored, louceng, chaoxiang, price, total, xiangq]
        csv_data(info)
        print("正在爬取", name)
def main():
    """Write the CSV header row, then scrape listing pages 1 through 9."""
    header = ["名称", "户型", "面积", "装修", "楼层", "朝向", "售价", "总价/万", "详情"]
    csv_data(header)
    for page in range(1, 10):
        paser_info('https://shenzhen.qfang.com/sale/f%s' % page)
# Entry point: run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# Result: