利用scrapy框架获取全国的房价数据
1.首先这里我是利用scrapy框架来进行爬取的。scrapy框架的使用可谓是既简单效率又高,下面来一起爬取。
2.直接上代码:
class Fang2Spider(scrapy.Spider):
    """Spider that crawls nationwide new-home price listings from fang.com.

    Flow: national city index -> each city's homepage -> the city's
    new-home listing pages (paginated), yielding one dict per estate
    plus one summary dict per finished city. Items are handed to the
    pipeline, which writes them out as JSON.
    """
    name = 'fang2'
    # FIX: allowed_domains entries must be bare domain names — the original
    # 'fang.com/' (trailing slash) makes the offsite filter reject requests.
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    # Pagination state for the city currently being crawled.
    # NOTE(review): this is shared class-level state; with Scrapy's default
    # concurrent requests several cities can interleave and corrupt these
    # counters — consider carrying them in request.meta instead. TODO confirm
    # whether the crawl is run with CONCURRENT_REQUESTS=1.
    page_now = 1     # current page number within one city's listing
    page_num = None  # last page number of that listing, parsed on page 1

    def parse(self, response):
        """Entry point: collect every domestic city link from the index."""
        # The <tr id="sffamily_B03_30"> row holds foreign cities — skip it.
        city_links = response.xpath(
            '//table[@id="senfe"]//tr[not(@id="sffamily_B03_30")]/td/a/@href')
        for sel in city_links:
            yield scrapy.Request(url=sel.extract(), callback=self.z_parse,
                                 dont_filter=True)

    def z_parse(self, response):
        """From a city homepage, follow the link(s) into its new-home section."""
        newhouse_links = response.xpath('//div[@id="dsy_H01_03"]/div/a/@href')
        for sel in newhouse_links:
            yield scrapy.Request(url=sel.extract(), callback=self.one_parse,
                                 dont_filter=True)

    def one_parse(self, response):
        """Scrape one page of a city's new-home listing, then paginate."""
        # On the first page, read the "last page" link to learn how many
        # pages this city has; its href looks like '/house/s/b9<N>/'.
        if self.page_now == 1:
            last_links = response.xpath(
                '//div[@id="sjina_C01_47"]/ul/li/a[@class="last"]/@href')
            for sel in last_links:
                last_href = sel.extract().replace('/house/s/b9', '')
                self.page_num = last_href.replace('/', '')

        # One div per listed estate.
        for div in response.xpath('//div[@class="nlc_details"]'):
            name = div.xpath('./div[@class="house_value clearfix"]'
                             '/div[@class="nlcd_name"]/a/text()').extract_first()
            price = div.xpath('./div[@class="nhouse_price"]/span/text()').extract_first()
            price2 = div.xpath('./div[@class="nhouse_price"]/i/text()').extract_first()
            address = div.xpath('./div[@class="relative_message clearfix"]'
                                '//a/@title').extract_first()
            item = {}  # renamed from 'dict' — don't shadow the builtin
            # strip() removes the surrounding tab/newline padding generically
            # (the original replaced two hard-coded whitespace runs and would
            # crash with AttributeError when the name node is missing).
            item['小区'] = name.strip() if name is not None else name
            item['地址'] = address
            if price is not None:
                item['价格'] = price
            else:
                item['价格'] = '价格待定,周边价格:' + price2
            # Hand the record to the pipeline.
            yield item

        # More pages left in this city? Build the next page URL
        # (<base>/house/s/b9<page>) and recurse into one_parse.
        if self.page_num is not None and self.page_now < int(self.page_num):
            self.page_now += 1
            base = re.split(r'/house/', response.url)[0]
            yield scrapy.Request(url=base + '/house/s/b9' + str(self.page_now),
                                 callback=self.one_parse, dont_filter=True)
        else:
            # Finished this city: reset the pagination state and emit a
            # marker record with the city name parsed from the hostname.
            self.page_now = 1
            self.page_num = None
            host = re.split(r'https://', response.url)[1]
            # FIX: escape the dot — r'.newhouse' matched any character
            # before 'newhouse', not a literal '.'.
            city = re.split(r'\.newhouse', host)[0]
            item2 = {}
            item2['所在城市'] = city
            item2['上城市新房的url'] = response.url
            # Hand the summary record to the pipeline.
            yield item2
这里是我在spiders中定义的类的方法,然后获取完数据后将数据交给管道将数据以json的格式写入到文本中。
然后运行指令:scrapy crawl fang2(注意要与代码中 name = 'fang2' 保持一致)
就爬下来了。