Scrapy Framework: Crawling Yunqi (yunqi.qq.com)


For more crawler examples, see https://blog.csdn.net/weixin_39777626/article/details/81564819

First, create the crawler project and generate a CrawlSpider template:

scrapy startproject yunqiCrawl
cd yunqiCrawl
scrapy genspider -t crawl yunqi yunqi.qq.com
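These commands create the project skeleton and a CrawlSpider named yunqi. For reference, the generated layout looks roughly like this (the exact contents depend on your Scrapy version):

yunqiCrawl/
    scrapy.cfg
    yunqiCrawl/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            yunqi.py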

yunqi.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from yunqiCrawl.items import YunqiBookListItem, YunqiBookDetailItem


class YunqiSpider(CrawlSpider):
    name = 'yunqi'
    allowed_domains = ['yunqi.qq.com']
    start_urls = ['http://yunqi.qq.com/bk/so2/n10p1']

    # Follow every paginated list page (/bk/so2/n10p1, n10p2, ...) and parse it
    rules = (
        Rule(LinkExtractor(allow=r'/bk/so2/n10p\d+'), callback='parse_book_list', follow=True),
    )

    def parse_book_list(self, response):
        # Each div under #detailedBookList is one book entry on the list page
        books = response.xpath('//*[@id="detailedBookList"]/div')
        for book in books:
            novelImageUrl = book.xpath('./a/img/@src').extract_first()
            novelId = book.xpath('./div/h3/a/@id').extract_first()
            novelLink = book.xpath('./div/h3/a/@href').extract_first()
            novelName = book.xpath('./div/h3/a/text()').extract_first()
            novelAuthor = book.xpath('./div/dl[1]/dd[1]/a/text()').extract_first()
            novelType = book.xpath('./div/dl[1]/dd[2]/a/text()').extract_first()
            novelStatus = book.xpath('./div/dl[1]/dd[3]/text()').extract_first()
            novelUpdateTime = book.xpath('./div/dl[2]/dd[1]/text()').extract_first()
            novelWords = book.xpath('./div/dl[2]/dd[2]/text()').extract_first()
            bookListItem = YunqiBookListItem(
                novelImageUrl=novelImageUrl,
                novelId=novelId,
                novelLink=novelLink,
                novelName=novelName,
                novelAuthor=novelAuthor,
                novelType=novelType,
                novelStatus=novelStatus,
                novelUpdateTime=novelUpdateTime,
                novelWords=novelWords)
            yield bookListItem
            # Follow the book's detail page, passing novelId along in meta
            yield scrapy.Request(url=novelLink, callback=self.parse_book_detail, meta={'novelId': novelId})

    def parse_book_detail(self, response):
        novelId = response.meta['novelId']
        novelLabel = response.xpath('//*[@class="tags"]/text()').extract_first()
        # Click / popularity / comment statistics live in the #novelInfo table
        novelAllClick = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[1]/text()').extract_first()
        novelMonthClick = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[1]/text()').extract_first()
        novelWeekClick = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[1]/text()').extract_first()
        novelAllPopular = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[2]/text()').extract_first()
        novelMonthPopular = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[2]/text()').extract_first()
        novelWeekPopular = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[2]/text()').extract_first()
        novelCommentNum = response.xpath('//*[@id="novelInfo"]/table/tr[5]/td[2]/text()').extract_first()
        novelAllComm = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[3]/text()').extract_first()
        novelMonthComm = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[3]/text()').extract_first()
        novelWeekComm = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[3]/text()').extract_first()
        bookDetailItem = YunqiBookDetailItem(
            novelId=novelId,
            novelLabel=novelLabel,
            novelAllClick=novelAllClick,
            novelMonthClick=novelMonthClick,
            novelWeekClick=novelWeekClick,
            novelAllPopular=novelAllPopular,
            novelMonthPopular=novelMonthPopular,
            novelWeekPopular=novelWeekPopular,
            novelCommentNum=novelCommentNum,
            novelAllComm=novelAllComm,
            novelMonthComm=novelMonthComm,
            novelWeekComm=novelWeekComm)
        yield bookDetailItem
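The XPath selectors above assume the page structure at the time of writing; if the site's markup has changed, they need updating. A quick way to verify them before running the full crawl is scrapy shell, for example:

scrapy shell 'http://yunqi.qq.com/bk/so2/n10p1'
>>> books = response.xpath('//*[@id="detailedBookList"]/div')
>>> len(books)
>>> books[0].xpath('./div/h3/a/text()').extract_first()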

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YunqiBookListItem(scrapy.Item):
    # define the fields for your item here like:
    novelId = scrapy.Field()
    novelName = scrapy.Field()
    novelLink = scrapy.Field()
    novelAuthor = scrapy.Field()
    novelType = scrapy.Field()
    novelStatus = scrapy.Field()
    novelUpdateTime = scrapy.Field()
    novelWords = scrapy.Field()
    novelImageUrl = scrapy.Field()


class YunqiBookDetailItem(scrapy.Item):
    # define the fields for your item here like:
    novelId = scrapy.Field()
    novelLabel = scrapy.Field()
    novelAllClick = scrapy.Field()
    novelMonthClick = scrapy.Field()
    novelWeekClick = scrapy.Field()
    novelAllPopular = scrapy.Field()
    novelMonthPopular = scrapy.Field()
    novelWeekPopular = scrapy.Field()
    novelCommentNum = scrapy.Field()
    novelAllComm = scrapy.Field()
    novelMonthComm = scrapy.Field()
    novelWeekComm = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import re

from yunqiCrawl.items import YunqiBookListItem, YunqiBookDetailItem


class YunqicrawlPipeline(object):

    def __init__(self, mongo_uri, mongo_db, replicaset):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        # replicaset is read from settings for replica-set deployments;
        # it is unused with a single MongoDB instance
        self.replicaset = replicaset

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MongoDB connection settings from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'yunqi'),
            replicaset=crawler.settings.get('REPLICASET')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # List-page items go to the bookInfo collection,
        # detail-page items to the bookhot collection
        if isinstance(item, YunqiBookListItem):
            self._process_booklist_item(item)
        else:
            self._process_bookDetail_item(item)
        return item

    def _process_booklist_item(self, item):
        self.db.bookInfo.insert_one(dict(item))

    def _process_bookDetail_item(self, item):
        pattern = re.compile(r'\d+')
        item['novelLabel'] = item['novelLabel'].strip().replace('\n', '')
        # Keep only the first run of digits in each statistics field
        for field in ('novelAllClick', 'novelMonthClick', 'novelWeekClick',
                      'novelAllPopular', 'novelMonthPopular', 'novelWeekPopular',
                      'novelAllComm', 'novelMonthComm', 'novelWeekComm',
                      'novelCommentNum'):
            match = pattern.search(item[field])
            item[field] = match.group() if match else item[field]
        self.db.bookhot.insert_one(dict(item))
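After a crawl has run, a few lines of pymongo are enough to check what the pipeline wrote. This is a minimal sketch assuming MongoDB is running locally on the default port with the 'yunqi' database from settings.py; check_db.py is just a hypothetical helper name:

# check_db.py - inspect the collections written by YunqicrawlPipeline
import pymongo

client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
db = client['yunqi']
print('bookInfo documents:', db.bookInfo.count_documents({}))  # list-page items
print('bookhot documents:', db.bookhot.count_documents({}))    # detail-page items
print('sample book:', db.bookInfo.find_one())
client.close()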

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for yunqiCrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'yunqiCrawl'

SPIDER_MODULES = ['yunqiCrawl.spiders']
NEWSPIDER_MODULE = 'yunqiCrawl.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'yunqiCrawl.middlewares.YunqicrawlSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
#    'yunqiCrawl.middlewares.RandomUserAgent.RandomUserAgent': 410,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
MONGO_URI = 'mongodb://127.0.0.1:27017/'
MONGO_DATABASE = 'yunqi'
ITEM_PIPELINES = {
    'yunqiCrawl.pipelines.YunqicrawlPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

#SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#SCHEDULER_PERSIST = True
#DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#REDIS_HOST = '127.0.0.1'
#REDIS_PORT = 6379
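The commented block at the end is for scrapy_redis: uncommenting it swaps in a Redis-backed scheduler and duplicate filter shared by every crawler process, which is what would turn this into a distributed crawl. That assumes scrapy-redis is installed (pip install scrapy-redis) and a Redis server is reachable at REDIS_HOST:REDIS_PORT.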


Finally, start the crawler:

scrapy crawl yunqi
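If you just want to eyeball the scraped items without setting up MongoDB, Scrapy's feed export can write them to a file as well; this runs in addition to the configured pipeline:

scrapy crawl yunqi -o books.json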

For more crawler examples, see https://blog.csdn.net/weixin_39777626/article/details/81564819
