1. Create the project
scrapy startproject pachong2
Generate the spider file (genspider takes the spider name plus the domain it is allowed to crawl; the project is named pachong2 to match the imports used below):
scrapy genspider woaiwojia bj.5i5j.com
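The genspider command drops a stub into the spiders/ directory that looks roughly like this (the exact boilerplate varies slightly across Scrapy versions):

import scrapy

class WoaiwojiaSpider(scrapy.Spider):
    name = 'woaiwojia'
    allowed_domains = ['bj.5i5j.com']
    start_urls = ['http://bj.5i5j.com/']

    def parse(self, response):
        pass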
2. Edit items.py
Define the fields you want to scrape:
import scrapy

class Pachong2Item(scrapy.Item):
    apartment = scrapy.Field()    # listing title
    total_price = scrapy.Field()  # total price
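As a quick sanity check, an Item behaves like a dict but only accepts the declared fields, so typos in field names fail loudly:

item = Pachong2Item()
item['apartment'] = 'sample listing title'
item['total_price'] = '500'
# Undeclared fields raise KeyError (unit_price here is just an example):
# item['unit_price'] = '5.2'  # KeyError: 'Pachong2Item does not support field: unit_price'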
Edit the settings.py file
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    # HTTP header names use hyphens; 'User_Agent' would be sent as an unrecognized header
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/'
                  '537.36 Core/1.63.6788.400 QQBrowser/10.3.2727.400',
    'Referer': 'https://bj.5i5j.com/ershoufang/n1/?wscckey=3af98ca61d541c63_1550824465',
    # Session-specific cookie captured from the browser; replace with your own value
    'Cookie': 'yfx_c_g_u_id_10000001=_ck19022116084813839206365574151; _ga=GA1.2.172982220.1550736528; _gid=GA1.2.262368544.1550736528; _Jo0OQK=1EF60A430707C39DC66841396A856BB9F1CDAFCCCBE5DD3EF55A648ADA5CBA77AEE43F896CA59E44D089FA0454846BD97D281B9CBA503EBFB1655D4D98FAB359BC6C57212F12283777C840763663251ADEB840763663251ADEB55B9636F868E3ABE2350674422DE2517GJ1Z1Pg==; PHPSESSID=1sa6iff0gbok9mts9t6mgt4l2e; domain=bj; yfx_f_l_v_t_10000001=f_t_1550736528365__r_t_1550736528365__v_t_1550759656352__r_c_0; Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1550736529,1550759657; ershoufang_BROWSES=41857749%2C42331571; _gat=1; Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1550760880'
}
ITEM_PIPELINES = {
    'pachong2.pipelines.Pachong2Pipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
}
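As a side note, if you only want these overrides for one spider rather than the whole project, Scrapy also supports a per-spider custom_settings class attribute; a minimal sketch:

class WoaiwojiaSpider(scrapy.Spider):
    name = 'woaiwojia'
    # Per-spider overrides; these take precedence over settings.py
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'COOKIES_ENABLED': False,
    }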
3. Write the woaiwojia.py file under the spiders directory
import scrapy
from pachong2.items import Pachong2Item

class WoaiwojiaSpider(scrapy.Spider):
    name = 'woaiwojia'
    allowed_domains = ['bj.5i5j.com']
    # Crawl the first 10 pages of second-hand listings
    start_urls = ['https://bj.5i5j.com/ershoufang/n%s/' % x for x in range(1, 11)]

    def parse(self, response):
        # Each <li> under ul.pList is one listing
        house_list = response.xpath('//ul[@class="pList"]/li')
        for house in house_list:
            item = Pachong2Item()
            # extract_first() returns a single string rather than a one-element list
            item['apartment'] = house.xpath('div[2]/h3/a/text()').extract_first()
            item['total_price'] = house.xpath('div[2]/div[1]/div/p[1]/strong/text()').extract_first()
            yield item
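Before running the full crawl, it is worth verifying the XPath expressions interactively with scrapy shell; the selectors above match the page layout at the time of writing and may need adjusting:

scrapy shell 'https://bj.5i5j.com/ershoufang/n1/'
>>> houses = response.xpath('//ul[@class="pList"]/li')
>>> len(houses)   # should equal the number of listings on the page
>>> houses[0].xpath('div[2]/h3/a/text()').extract_first()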
With the extraction working, there are two ways to save the data:
1. Directly from the command line: scrapy crawl woaiwojia -o wo.csv (note that crawl takes the spider's name attribute, not the file name; -o wo.json exports JSON the same way)
2. Or edit the pipelines.py file (a read-back check follows the code below):
import pymongo
from scrapy.item import Item

class Pachong2Pipeline(object):
    def open_spider(self, spider):
        # Connect to MongoDB when the spider starts
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # Use (and lazily create) the myspider database
        self.db = self.client.myspider

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # One collection per spider, named after the spider.
        # (self.db.spider.name would be wrong: it returns the name of a
        # collection literally called "spider", i.e. the string 'spider')
        collection = self.db[spider.name]
        post = dict(item) if isinstance(item, Item) else item
        collection.insert_one(post)
        return item
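To confirm that the inserts landed, here is a minimal read-back sketch with pymongo (the database name myspider and the collection name woaiwojia follow from the pipeline above):

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client.myspider['woaiwojia']   # collection named after the spider
print(collection.count_documents({}))       # total number of scraped listings
for doc in collection.find().limit(3):      # peek at a few documents
    print(doc)
client.close()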
And that is basically everything.
Before crawling, make sure the MongoDB server is running so the pipeline can connect; the myspider database is created automatically on the first insert.
Finally, just run scrapy crawl woaiwojia from the command line
and the data is saved to the database.
For MongoDB operations, see the previous database article.