Scraping Qiushibaike with Scrapy

First, create the project:

scrapy startproject tut01

Then generate the spider:

scrapy genspider qsbk www.qiushibaike.com
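
If both commands succeed, the generated project should look roughly like this (the exact layout can vary a little between Scrapy versions):

tut01/
	scrapy.cfg
	tut01/
		__init__.py
		items.py
		pipelines.py
		settings.py
		spiders/
			__init__.py
			qsbk.py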

tut01/spiders/qsbk.py

# -*- coding: utf-8 -*-
import logging

import scrapy

QSBK_HOST = "http://www.qiushibaike.com"
COUNT = 0

# route debug output through the standard logging module
logger = logging.getLogger(__name__)

def debug(msg):
	logger.debug(msg)

# A single Qiushibaike entry
"""
author
content
click count
"""
class QBItem(scrapy.Item):
	author = scrapy.Field()
	content = scrapy.Field()
	ctr = scrapy.Field()

	def __str__(self):
		return "%s:%s %s:%s %s:%s" % (
				'author',
				self['author'],
				'content',
				self['content'],
				'clickcount',
				self['ctr'],
				)
"""
糗事百科爬虫
需要定制USER_AGENT和HEADERS,否则服务器不接收
"""

class QsbkSpider(scrapy.Spider):
	name = "qsbk"
	allowed_domains = ["www.qiushibaike.com",]
	start_urls = (
		'http://www.qiushibaike.com/',
	)

	def read_QBItems(self, response):
		global COUNT
		COUNT += 1
		debug("正在爬取第{}页".format(COUNT))

		css_block = "div.article.block"
		css_author = "div.author h2::text"
		css_content = "div.content::text"
		css_ctr = "div > span > i::text"

		for b in response.css(css_block):
			try:
				qb = QBItem()
				qb['author'] = b.css(css_author).extract()[0]
				qb['content'] = b.css(css_content).extract()[0]
				qb['ctr'] = b.css(css_ctr).extract()[0]
				# yield the parsed item so Scrapy can collect/export it
				yield qb
			except Exception as e:
				debug("Failed to extract an item on {}: {}".format(response.url, e))

		# follow the "next page" link, if there is one
		css_next = "div.pageto a.next::attr(href)"
		npage = response.css(css_next).extract_first()
		if npage and npage.startswith('/'):
			next_url = QSBK_HOST + npage
			yield scrapy.Request(next_url, self.read_QBItems)

	def parse(self, response):
		global COUNT
		if not COUNT:
			debug("准备开始解析数据...")
		return self.read_QBItems(response)
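
With the spider saved, run it from the project root. Scrapy's built-in feed export can dump the collected items to a file via the -o flag:

scrapy crawl qsbk -o qsbk.json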


tut01/settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for tut01 project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'tut01'

SPIDER_MODULES = ['tut01.spiders']
NEWSPIDER_MODULE = 'tut01.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# QSBK refuses requests made with the default Scrapy user agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
# Uncomment to send browser-like request headers as well:
#DEFAULT_REQUEST_HEADERS = {
#	'Proxy-Connection': 'keep-alive',
#	'Cache-Control': 'max-age=0',
#	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
#	'Accept-Encoding': 'gzip, deflate, sdch',
#	'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
#}
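
Items yielded by the spider can also be handled by an item pipeline. Below is a minimal, optional sketch, not part of the generated project as-is: the class name QsbkPipeline and the output file name are assumptions. It writes each QBItem as one JSON line.

tut01/pipelines.py

# -*- coding: utf-8 -*-
import json

# hypothetical pipeline: persist every item as one line of JSON
class QsbkPipeline(object):
	def open_spider(self, spider):
		# output file name is an assumption
		self.out = open("qsbk_items.jl", "w", encoding="utf-8")

	def close_spider(self, spider):
		self.out.close()

	def process_item(self, item, spider):
		# ensure_ascii=False keeps the Chinese text readable
		self.out.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
		return item

To enable it, register the pipeline in tut01/settings.py:

ITEM_PIPELINES = {
	'tut01.pipelines.QsbkPipeline': 300,
}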

