import requests
from selenium import webdriver
from lxml import etree
class Fantaixia(object):
    """Crawl house listings from zu.fang.com and print them.

    Creating an instance starts the crawl immediately: ``__init__`` stores the
    start URL and calls :meth:`parse`, which collects area links from the start
    page, walks every result page of each area, and prints one dict per
    listing found.
    """

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Store the start ``url`` and run the crawl."""
        self.url = url
        self.parse()

    @staticmethod
    def _is_document(html):
        """Return True when ``html`` is a parsed tree that can be queried."""
        # get_xpath_by_request returns '' on failure, and etree.HTML may
        # return None for an empty body; neither has an ``xpath`` method.
        return html is not None and not isinstance(html, str)

    @staticmethod
    def _page_url(area_url, page):
        """Return the URL of result page number ``page`` of an area."""
        # The page number follows the 'i3' prefix (page 1 is '.../i31/').
        return area_url + 'i3' + str(page) + '/'

    def get_xpath_by_request(self, url):
        """Download ``url`` and return it as an lxml HTML tree.

        Returns an empty string when the request fails or the server answers
        with a status other than 200; the error or status code is printed.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'referer': 'https://zu.fang.com/house/c20-d21000/',
            'upgrade-insecure-requests': '1',
            'cookie': 'city=www; global_cookie=t9mmpqtin4j8nf5tnbx87fwnw18k4i834na; Integrateactivity=notincludemc; integratecover=1; g_sourcepage=zf_fy%5Elb_pc; __utma=147393320.1830412335.1577092390.1577092390.1577953883.2; __utmc=147393320; __utmz=147393320.1577953883.2.2.utmcsr=fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ASP.NET_SessionId=fzfvqfwnuyrsw5fjsjgubbdx; keyWord_recenthousebj=%5b%7b%22name%22%3a%22%e6%b5%b7%e6%b7%80%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a00%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e4%b8%9c%e5%9f%8e%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a02%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e4%b8%b0%e5%8f%b0%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a06%2f%22%2c%22sort%22%3a1%7d%5d; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; unique_cookie=U_bsgj421nek8sgaaqr8a9x0yov1ok4wgzv2m*16; Captcha=2B696F72454245454F626D6D4539526B55736254666E4F4B756A3379594F2B2B7868346A3564557866772B797376306D346D4C45677A6142347A38304C506C7A61354530794E6F336939383D; __utmb=147393320.39.10.1577953883',
        }
        try:
            response = requests.get(
                url, headers=headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(exc)
            return ''
        if response.status_code == 200:
            return etree.HTML(response.text)
        print(response.status_code)
        return ''

    def get_text(self, text):
        """Return the first element of ``text``, or '' when it is empty."""
        if text:
            return text[0]
        return ''

    def parse_page(self, url):
        """Fetch one result page and print a dict for every listing on it.

        Listings whose info line has fewer than four text fragments are
        skipped because their fields cannot be told apart.
        """
        html = self.get_xpath_by_request(url)
        if not self._is_document(html):
            return
        for dl in html.xpath('//div[@class="houseList"]/dl'):
            title = self.get_text(dl.xpath('.//p[@class="title"]/a/@title'))
            price = self.get_text(dl.xpath('.//span[@class="price"]/text()'))
            infos = dl.xpath('.//p[@class="font15 mt12 bold"]/text()')
            # Indices 1..3 are read below, so at least four fragments are
            # required; a merely non-empty list is not enough.
            if len(infos) < 4:
                continue
            item = {
                'title': title,
                'price': price,
                'area_size': infos[2],
                'house_scal': infos[1],
                'location': infos[3],
            }
            print(item)

    def parse_area(self, url):
        """Read the page count of the area at ``url`` and parse every page."""
        html = self.get_xpath_by_request(url)
        if not self._is_document(html):
            return
        max_page = self.get_text(
            html.xpath('//div[@id="rentid_D10_01"]/span/text()')
        )
        print(max_page)
        if not max_page:
            return
        # The pager text wraps the number in one leading and one trailing
        # character, which are stripped here.
        max_page_num = max_page[1:-1]
        print(max_page_num)
        try:
            page_count = int(max_page_num)
        except ValueError:
            print('unexpected page count: ' + repr(max_page))
            return
        for i in range(1, page_count + 1):
            # Use the loop index so each iteration requests a different page
            # instead of requesting page 1 every time.
            page_url = self._page_url(url, i)
            print(page_url)
            self.parse_page(page_url)

    def parse(self):
        """Collect the area links from the start page and crawl each area."""
        html = self.get_xpath_by_request(self.url)
        if not self._is_document(html):
            return
        # position()>1 skips the first link in the area list.
        area_list = html.xpath(
            '//dl[@id="rentid_D04_01"]/dd/a[position()>1]/@href'
        )
        print(area_list)
        for area in area_list:
            # The hrefs are site-relative, so prefix the host.
            area_url = 'https://zu.fang.com' + area
            self.parse_area(area_url)
if __name__ == '__main__':
    # Constructing the crawler is what starts the crawl: __init__ calls
    # parse(), so the instance itself does not need to be kept.
    base_url = 'https://zu.fang.com/'
    Fantaixia(url=base_url)
# 运行结果的一部分 (part of the program's output)