import requests
from lxml import etree
import csv
class JuAnKe:
    """Scrape listing title, estate name and address from Anjuke Suzhou
    second-hand-sale list pages and save them to a CSV file."""

    def __init__(self):
        # Page-number placeholder is filled in by get_url_list().
        self.url_temp = "https://suzhou.anjuke.com/sale/p{}"
        # Plain browser User-Agent so the site serves the normal HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

    def get_url_list(self):
        """Return the list-page URLs to crawl.

        Note: range(1, 3) excludes the upper bound, so this crawls pages
        1 and 2 (the original comment incorrectly claimed pages 1-3).
        """
        return [self.url_temp.format(i) for i in range(1, 3)]

    def pase_url(self, url):
        """Fetch *url* and return the decoded HTML body.

        NOTE: name keeps the original "parse" typo for backward
        compatibility with existing callers.
        """
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Parse one list page and return a list of dicts with keys
        标题 (title), 楼盘名称 (estate name) and 地址 (address)."""
        html = etree.HTML(html_str)
        content_list = []
        for li in html.xpath('//ul[@id="houselist-mod-new"]/li'):
            title = li.xpath('.//div[@class="house-title"]/a/text()')
            # Estate name and address share one span; a non-breaking space
            # (\xa0) separates the name from the address text.
            addr = li.xpath(
                './/div[@class="details-item"]/span[@class="comm-address"]/text()')
            if not title or not addr:
                # Skip ad/placeholder <li> entries missing the expected
                # nodes (the original code raised IndexError on them).
                continue
            parts = addr[0].split("\xa0")
            content_list.append({
                "标题": title[0].strip(),
                "楼盘名称": parts[0].strip(),
                "地址": parts[-1].strip(),
            })
        return content_list

    def save_content_list(self, content_list):
        """Write *content_list* to 信息.csv (overwrites any existing file)."""
        headers = ["标题", "楼盘名称", "地址"]
        # utf-8-sig writes a BOM so Excel displays the Chinese text correctly.
        with open("信息.csv", "w", encoding="utf-8-sig", newline="") as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writeheader()
            writer.writerows(content_list)

    def run(self):
        """Crawl every page and write a single CSV.

        Bug fix: the original called save_content_list() inside the page
        loop, and that method reopens the file in "w" mode every call, so
        each page clobbered the previous one and only the last page was
        kept. Results are now accumulated across pages and written once.
        """
        all_items = []
        for url in self.get_url_list():
            html_str = self.pase_url(url)
            all_items.extend(self.get_content_list(html_str))
        self.save_content_list(all_items)
if __name__ == "__main__":
    # Script entry point: build the crawler and run the full scrape.
    JuAnKe().run()