要求
1.获取所有城市的url: https://www.fang.com/SoufunFamily.htm
2.获取所有城市的新房url
例如:杭州:http://hz.fang.com/
杭州新房: https://hz.newhouse.fang.com/house/s/
(需要将获得的url先进行拆分)
3.获取所有城市二手房url链接
例如:杭州:http://hz.fang.com/
杭州二手房:https://hz.esf.fang.com/
代码目录
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
class UserAgentDownloadMiddleware(object):
    """Downloader middleware that stamps every outgoing request with a
    randomly chosen User-Agent header to make the crawler look less
    uniform to the target site."""

    # Pool of desktop Chrome User-Agent strings to rotate through.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36'
    ]

    def process_request(self, request, spider):
        """Scrapy downloader-middleware hook, called for each request.

        Mutates the request's User-Agent header in place and returns
        None so the request continues through the remaining middleware
        chain (per the Scrapy process_request contract).
        """
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
class FangPipeline(object):
    """Item pipeline writing new-house and second-hand-house items to
    separate JSON-lines files.

    Fixes over the original:
    - items are routed by type (the original exported EVERY item to
      both files, duplicating all records);
    - the shutdown hook is ``close_spider`` (the original defined
      ``close_item``, which Scrapy never calls, so the files were
      never closed).
    """

    def __init__(self):
        # Exporters require binary file handles; ensure_ascii=False
        # keeps the Chinese text human-readable in the output.
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(
            self.newhouse_fp, ensure_ascii=False
        )
        self.esfhouse_exporter = JsonLinesItemExporter(
            self.esfhouse_fp, ensure_ascii=False
        )

    def process_item(self, item, spider):
        """Export the item to the file matching its type and pass it on."""
        # Route by class name so this module does not need to import
        # the items module (avoids a circular-import risk).
        if type(item).__name__ == 'NewHouseItem':
            self.newhouse_exporter.export_item(item)
        else:
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Scrapy shutdown hook — flush and close both output files."""
        self.newhouse_fp.close()
        self.esfhouse_fp.close()

    def close_item(self, item, spider):
        # Kept for backward compatibility with the original (misnamed)
        # method; delegates to the real hook.
        self.close_spider(spider)
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# New-house (新房) listing item.
class NewHouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Province the city belongs to.
    province = scrapy.Field()
    # City name.
    city = scrapy.Field()
    # Residential-complex name.
    name = scrapy.Field()
    # Price text (number plus unit, scraped as one string).
    price = scrapy.Field()
    # Room layouts such as "2居"; stored as a list.
    rooms = scrapy.Field()
    # Floor area.
    area = scrapy.Field()
    # Street address.
    address = scrapy.Field()
    # Administrative district.
    district = scrapy.Field()
    # Sale status (e.g. "在售" / on sale).
    sale = scrapy.Field()
    # Detail-page URL on fang.com.
    origin_url = scrapy.Field()
# Second-hand-house (二手房) listing item.
class ESFHouseItem(scrapy.Item):
    # Province the city belongs to.
    province = scrapy.Field()
    # City name.
    city = scrapy.Field()
    # Residential-complex name.
    name = scrapy.Field()
    # Layout, e.g. "3室2厅" (rooms and halls).
    rooms = scrapy.Field()
    # Floor information.
    floor = scrapy.Field()
    # Orientation (which way the unit faces).
    toward = scrapy.Field()
    # Construction year.
    year = scrapy.Field()
    # Street address.
    address = scrapy.Field()
    # Built area.
    area = scrapy.Field()
    # Total price (number plus unit).
    price = scrapy.Field()
    # Price per square metre.
    unit = scrapy.Field()
    # Detail-page URL.
    origin_url = scrapy.Field()
sfw.py文件
# -*- coding: utf-8 -*-
import scrapy
import re
from fang.items import NewHouseItem, ESFHouseItem
class SfwSpider(scrapy.Spider):
    """fang.com spider.

    Reads the SoufunFamily city directory, derives each city's
    new-house (``<city>.newhouse.fang.com/house/s/``) and second-hand
    (``<city>.esf.fang.com``) listing URLs, then crawls both listing
    types with pagination.

    Fixes over the original:
    - pagination callback was ``self.parse_newhouse()`` (the method was
      CALLED, raising TypeError) instead of the method object;
    - ``parse_esf`` followed the next-page link unconditionally, so the
      last page yielded a request built from ``None``;
    - the ``[district]`` regex match and the city-URL split are now
      guarded instead of raising on unexpected markup.
    """
    name = 'sfw'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        """Parse the city directory table and emit one new-house and one
        second-hand-house request per city."""
        trs = response.xpath("//div[@class = 'outCont']//tr")
        province = None
        for tr in trs:
            # Each tr holds several td cells; the ones carrying a class
            # attribute contain only whitespace, so keep the rest.
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            # Remove all whitespace; an empty result means this row
            # continues the previous row's province.
            province_text = re.sub(r"\s", "", province_text)
            if province_text:
                province = province_text
            # Skip the "其它" (other/overseas) bucket.
            if province == '其它':
                continue
            city_td = tds[1]  # second cell lists this province's cities
            for city in city_td.xpath(".//a"):
                city_name = city.xpath("./text()").get()
                city_url = city.xpath("./@href").get()
                # Split e.g. http://hz.fang.com/ into
                # ['http://hz', 'fang', 'com/'] so the subdomain can be
                # rewritten.
                url_module = city_url.split(".")
                if len(url_module) < 3:
                    # Unexpected URL shape — skip instead of crashing
                    # on the indexing below.
                    continue
                scheme_city, domain, tail = url_module[0], url_module[1], url_module[2]
                # e.g. https://hz.newhouse.fang.com/house/s/
                newhouse_url = scheme_city + '.newhouse.' + domain + '.' + tail + 'house/s/'
                # e.g. https://hz.esf.fang.com/
                esf_url = scheme_city + '.esf.' + domain + '.' + tail
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                     meta={"info": (province, city_name)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                                     meta={"info": (province, city_name)})

    def parse_newhouse(self, response):
        """Parse one page of a city's new-house listing, then follow
        pagination."""
        province, city = response.meta.get('info')  # (province, city) tuple
        lis = response.xpath("//div[@class='nl_con clearfix']/ul/li")
        for li in lis:
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name is None:  # filler/ad li without a complex name
                continue
            name = name.strip()
            # Keep only layout entries ending in "居" (e.g. "2居"),
            # dropping unrelated anchor texts mixed into the same div.
            rooms = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            rooms = [r for r in rooms if r.endswith("居")]
            # get() would return only leading whitespace here, so join
            # all text nodes, then strip whitespace and -/ separators.
            area = "".join(li.xpath(".//div[@class='house_type clearfix']/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            # <a title="address" ...>
            address = li.xpath(".//div[@class='address']/a/@title").get()
            # The district is embedded as "[district]" in the anchor
            # text; not every entry has one, so guard the match.
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            district_match = re.search(r".*\[(.+)\].*", district_text)
            district = district_match.group(1) if district_match else None
            # Sale status text, e.g. 在售; the span position varies so
            # match the div by class substring.
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            # Price value and unit are separate text nodes — join then strip.
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall()).strip()
            # Detail href is scheme-relative (//xxx.fang.com/) — prepend https:.
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            if origin_url:
                origin_url = 'https:' + origin_url
            yield NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                               area=area, address=address, district=district,
                               sale=sale, price=price, origin_url=origin_url)
        # <a class='next' href='...'>下一页 — follow pagination if present.
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            # Pass the callback as a method object (the original called
            # it: callback=self.parse_newhouse()).
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})

    def parse_esf(self, response):
        """Parse one page of a city's second-hand-house listing, then
        follow pagination."""
        province, city = response.meta.get('info')
        dls = response.xpath("//div[contains(@class,'shop_list ')]/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            item['name'] = dl.xpath(".//p/a/@title").get()
            # tel_shop holds several "/"-separated fragments; strip
            # whitespace, then classify each one by a keyword.
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = [re.sub(r"\s", "", info) for info in infos]
            for info in infos:
                if '厅' in info:        # layout, e.g. 3室2厅
                    item['rooms'] = info
                elif '层' in info:      # floor
                    item['floor'] = info
                elif '㎡' in info:      # built area
                    item['area'] = info
                elif '向' in info:      # orientation
                    item['toward'] = info
                elif '年' in info:      # construction year
                    item['year'] = info
            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            # Total price: number and unit live in separate nodes and
            # either may be missing, so only concatenate when both exist.
            num = dl.xpath(".//span[@class='red']/b/text()").get()
            num_danwei = dl.xpath(".//dd[@class='price_right']/span/text()").get()
            if num and num_danwei:
                item['price'] = num + num_danwei
            # Price per square metre.
            item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
            # Detail href may be missing or relative — guard and resolve.
            detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            if detail_url:
                item['origin_url'] = response.urljoin(detail_url)
            yield item
        # Follow the next page only when a link exists (the original
        # yielded a request built from None on the last page).
        next_url = response.xpath("//div[@class='page_al']//p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})