import os

import requests
from lxml import etree
import pandas

# Browser-like User-Agent so ke.com does not reject the requests.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57'
}


def get_html(url, headers):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError on a non-2xx response.
    """
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    # apparent_encoding detects the real charset so Chinese text decodes correctly.
    res.encoding = res.apparent_encoding
    return res.text


def parser(html):
    """Parse a listing page and return the detail-page URLs of every house.

    Returns an empty list when the listing container is not found
    (the original returned from inside the loop on the first match and
    could reference an undefined name when no <ul> matched).
    """
    doc = etree.HTML(html)
    hrefs = []
    for row in doc.xpath('//*[@id="beike"]/div[1]/div[4]/div[1]/div[4]/ul'):
        # './/' keeps the search relative to the current <ul>; the original
        # absolute '//li/...' searched the whole document instead.
        hrefs.extend(row.xpath('.//li/div/div/a/@href'))
    return hrefs


def parser_datali(html):
    """Parse one detail page and return a dict with the fields of interest.

    NOTE(review): these XPaths are tied to the current page layout;
    a layout change makes the `[0]` lookups raise IndexError.
    """
    row = etree.HTML(html)
    houses = row.xpath('//*[@id="beike"]/div[1]/div[4]/div[1]/div[2]/div[4]/div[1]/a[1]/@title')[0]
    locality = row.xpath('//*[@id="beike"]/div[1]/div[4]/div[1]/div[2]/div[4]/div[2]/span[2]/a[1]/text()')[0]
    price = row.xpath('//*[@id="beike"]/div[1]/div[4]/div[1]/div[2]/div[2]/div/span[1]/text()')[0]
    housetype = row.xpath('//*[@id="beike"]/div[1]/div[4]/div[1]/div[2]/div[3]/div[1]/div[1]/text()')[0]
    floor = row.xpath('//*[@id="beike"]/div[1]/div[4]/div[1]/div[2]/div[3]/div[1]/div[2]/text()')[0]
    area = row.xpath('//*[@id="beike"]/div[1]/div[4]/div[1]/div[2]/div[3]/div[3]/div[1]/text()')[0]
    return {
        "楼盘": houses,
        "区域": locality,
        "总价(万元)": price,
        "户型": housetype,
        "楼层": floor,
        "面积": area,
    }


def save_csv(item, path):
    """Append the list of row dicts *item* to the CSV file at *path*.

    Writes the header only when the file does not exist yet (the original
    re-emitted the header on every append) and drops the index column.
    """
    df = pandas.DataFrame(item)
    df.to_csv(path, mode='a', header=not os.path.exists(path), index=False)


def main():
    """Crawl listing pages 1-14 and append up to 30 houses per page to esf.csv."""
    for page in range(1, 15):  # paginated URLs follow the pattern .../pg<N>/
        url = "https://cq.ke.com/ershoufang/pg{0}/".format(page)
        html = get_html(url, headers=HEADERS)
        detail_urls = parser(html)
        out_list = []
        # Iterate over the URLs actually found (capped at 30 per page) instead
        # of a hard-coded range(30): the last page may list fewer than 30
        # houses, which made the original raise IndexError.
        for detail_url in detail_urls[:30]:
            detail_html = get_html(detail_url, headers=HEADERS)
            out_list.append(parser_datali(detail_html))
        save_csv(out_list, "esf.csv")


if __name__ == "__main__":
    main()
爬取时间会比较长（14 页 × 每页约 30 个详情页的请求），请耐心等待。
最终结果：（脚本最后有可能会报错——当某一页的房源不足 30 条时，`url_l[i]` 会索引越界——但此前已爬取的数据仍会保存，能够在你当前目录下找到“esf.csv”文件）