In the previous section we essentially finished a single-machine spider. The complete code for each module is given below:
ftx.py
# -*- coding: utf-8 -*-
import scrapy
import re
from fang.items import NewHouseItem
from fang.items import EsfItem


class FtxSpider(scrapy.Spider):
    name = 'ftx'
    allowed_domains = ['fang.com']
    start_urls = ['http://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        # Grab every tr in the province/city table
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None  # start with no province; it is filled in as rows are parsed
        for tr in trs:
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            province_text = re.sub(r'\s', "", province_text)
            if province_text:
                province = province_text  # a non-empty cell starts a new province
            # Skip overseas listings ("其它" means "other")
            if province == "其它":
                continue
            city_td = tds[1]
            city_links = city_td.xpath(".//a")
            for city_link in city_links:
                city = city_link.xpath(".//text()").get()    # city name
                city_url = city_link.xpath("./@href").get()  # city link
                url_module = city_url.split(".")
                scheme = url_module[0]
                domain = url_module[1]
                last = url_module[2]
                if "bj" in scheme:  # Beijing uses its own new-house and second-hand URLs
                    newhouse_url = "https://newhouse.fang.com/house/s/"
                    esf_url = "https://esf.fang.com/"
                else:
                    # Build the new-house URL for this city
                    newhouse_url = scheme + ".newhouse." + domain + "." + last + "house/s/"
                    # Build the second-hand-house URL for this city
                    esf_url = scheme + ".esf." + domain + '.' + last
                # Hand the new-house listing page to the next callback
                yield scrapy.Request(
                    url=response.urljoin(newhouse_url),
                    callback=self.parse_newhouse,
                    meta={"info": (province, city)}
                )
                # Hand the second-hand listing page to its callback
                yield scrapy.Request(
                    url=response.urljoin(esf_url),
                    callback=self.parse_esf,
                    meta={"info": (province, city)}
                )

    def parse_newhouse(self, response):
        province, city = response.meta.get("info")
        lis = response.xpath("//div[contains(@class, 'nl_con')]/ul//li")
        for li in lis:
            name = li.xpath(".//div[@class='nlcd_name']/a//text()").get()
            if name is None:  # skip rows without a project name (usually ads)
                continue
            name = name.strip()
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r'\s|广告', '', price)  # strip whitespace and the "广告" (ad) marker
            sale = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
            address = li.xpath(".//div[@class='address']/a/@title").get()
            house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            house_type_list = list(map(lambda x: re.sub(r'\s', '', x), house_type_list))
            # keep entries like "3居" (number of bedrooms)
            rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
            area = "".join(li.xpath(".//div[contains(@class, 'house_type')]/text()").getall())
            area = re.sub(r'\s|-|/', "", area)
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            origin_url = "https:" + origin_url
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            district_match = re.search(r'.*?\[(.*?)\].*?', district_text)
            district = district_match.group(1) if district_match else None
            item = NewHouseItem(
                area=area, rooms=rooms, address=address, origin_url=origin_url,
                name=name, sale=sale, price=price, province=province,
                city=city, district=district
            )
            print(item)
            yield item
        next_page_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_page_url:
            yield scrapy.Request(url=response.urljoin(next_page_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})

    def parse_esf(self, response):
        province, city = response.meta.get("info")
        dls = response.xpath("//div[contains(@class, 'shop_list')]/dl")
        for dl in dls:
            # Create a fresh item per listing so fields do not leak between rows
            item = EsfItem(province=province, city=city)
            item["name"] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            for info in infos:
                if "厅" in info:     # layout, e.g. "3室2厅"
                    item['rooms'] = info
                elif "㎡" in info:   # floor area
                    item["area"] = info
                elif "层" in info:   # floor
                    item["floor"] = info
                elif "向" in info:   # orientation
                    item["toward"] = info
                elif "年" in info:   # year built
                    item["year"] = info
            price_str = "".join(dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall())
            item["unit"] = "".join(dl.xpath(".//dd[@class='price_right']/span[2]//text()").getall())
            item["price"] = re.sub(r"\s", "", price_str)
            item["address"] = dl.xpath(".//p[@class='add_shop']/span//text()").get()
            detail_url = dl.xpath(".//dt[@class='floatl']/a/@href").get()
            item["origin_url"] = response.urljoin(detail_url)
            print(item)
            yield item
        url = response.xpath("//div[@id='list_D10_15']/p[1]/a/@href").get()
        if url:
            next_url = response.urljoin(url)
            yield scrapy.Request(url=next_url, callback=self.parse_esf,
                                 meta={"info": (province, city)})
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class NewHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # price
    price = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # detail-page URL
    origin_url = scrapy.Field()
    # address
    address = scrapy.Field()
    # project name
    name = scrapy.Field()
    # sale status (on sale or not)
    sale = scrapy.Field()
    # number of bedrooms
    rooms = scrapy.Field()
    # district
    district = scrapy.Field()


class EsfItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # residential-compound name
    name = scrapy.Field()
    # address
    address = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # total price
    price = scrapy.Field()
    # price per square metre
    unit = scrapy.Field()
    # year built
    year = scrapy.Field()
    # orientation
    toward = scrapy.Field()
    # layout (rooms and halls)
    rooms = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # detail-page URL
    origin_url = scrapy.Field()
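
Note that scrapy.Item instances behave like dicts but accept only the declared fields, which catches typos early. A quick illustration with made-up values:

item = EsfItem(province="四川", city="成都")
item["rooms"] = "3室2厅"    # declared field: OK
# item["garage"] = "yes"    # undeclared field: raises KeyError
print(dict(item))           # {'province': '四川', 'city': '成都', 'rooms': '3室2厅'}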
middlewares.py is unchanged.
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
from fang.items import NewHouseItem


class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open("newhouse.json", "wb")
        self.esfhouse_fp = open("esfhouse.json", "wb")
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # Route each item to the matching file rather than writing
        # every item to both exporters
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        else:
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
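
JsonLinesItemExporter writes one JSON object per line, so the output files can be read back item by item without loading everything into memory. A small sketch of reading the results, assuming the crawl has produced newhouse.json:

import json

with open("newhouse.json", encoding="utf-8") as f:
    for line in f:
        house = json.loads(line)  # one exported item per line
        print(house["city"], house["name"], house["price"])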
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for fang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'fang'
SPIDER_MODULES = ['fang.spiders']
NEWSPIDER_MODULE = 'fang.spiders'
LOG_LEVEL = "WARNING"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'fang.middlewares.FangSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'fang.middlewares.UserAgentDownloadMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'fang.pipelines.FangPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
You can copy and paste this code and it will run. The next section covers how to turn this single-machine spider into a distributed one.