之前是爬取单个页面的内容,今天对所有文章进行爬取。
所有文章的地址:http://blog.jobbole.com/all-posts/
对所有文章的URL进行提取
提取第一页URL
F12
可以看到单个文章链接在 #archive .floated-thumb .post-thumb a
标签之下
对其进行提取,在 scrapy shell
中
在 Ipython
进行遍历
可以得到第一页中所有文章的URL。
用 Request 类将提取到的 URL 交给 scrapy 下载,下载完成后调用自定义的解析函数。
from scrapy.http import Request
post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
for post_url in post_urls:
#构建Request
yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)#通过回调提取文章函数
提取下一页URL
这次通过两个类选择器来定位“下一页”的标签(注意两个类名之间没有空格,表示同一元素同时具有这两个类)。
next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
if next_url:
yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse)
整个代码:
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy.http import Request
from urllib import parse
from ArticleSpider.items import ArticleSpiderItem
class JobboleSpider(scrapy.Spider):
    """Crawl blog.jobbole.com: walk every list page and parse each article."""

    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract every article URL on the list page and schedule it for
           download; parse_detail handles the article response.
        2. Extract the next-page URL and feed it back through parse.
        """
        post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
        for post_url in post_urls:
            # One Request per article; parse_detail extracts its fields.
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)

        # Follow pagination. extract_first("") returns "" (falsy) on the last
        # page, so the crawl stops naturally.
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            # BUG FIX: the original yielded post_url here, re-queuing the last
            # article instead of the next list page.
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Extract the concrete fields of one article page via CSS selectors."""
        # Title
        title = response.css(".entry-header h1::text").extract_first()
        # Publish date: extract_first("") avoids an IndexError when the
        # node is missing (the original used extract()[0]).
        create_data = response.css(".entry-meta-hide-on-mobile::text").extract_first("").strip()
        # Tags — drop the "N 评论" (comment-count) link that shares the element.
        tag_list = response.css(".entry-meta-hide-on-mobile a::text").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        tags = ",".join(tag_list)
        # Up-vote count
        praise_nums = response.css(".vote-post-up h10::text").extract_first()
        # Bookmark count: default to "" so re.match never receives None.
        fav_nums = response.css("span.btn-bluet-bigger:nth-child(2)::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", fav_nums)
        fav_nums = int(match_re.group(1)) if match_re else 0
        # Comment count, same None-safety as above.
        comment_nums = response.css("a[href='#article-comment'] span::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", comment_nums)
        comment_nums = int(match_re.group(1)) if match_re else 0
        # Article body HTML (None instead of IndexError when absent).
        content = response.css("div .entry").extract_first()
好,到此为止,整个网站的文章内容都已经爬取完毕了。下面开始对爬取到的数据进行保存(封面图下载到本地,字段导出为 Json 文件)。
把封面图下载下来
配置 items
在 items
封装需要用到的类。
import scrapy
class ArticleSpiderItem(scrapy.Item):
    """Container for one crawled article and its metadata."""

    title = scrapy.Field()             # article headline
    create_data = scrapy.Field()       # publish date string
    url = scrapy.Field()               # canonical article URL
    front_image_url = scrapy.Field()   # list holding the cover-image URL
    front_image_path = scrapy.Field()  # local path once the image is saved
    tags = scrapy.Field()              # comma-joined tag names
    praise_nums = scrapy.Field()       # up-vote count
    fav_nums = scrapy.Field()          # bookmark count
    comment_nums = scrapy.Field()      # comment count
    content = scrapy.Field()           # article body HTML
设置 settings
import os
# Register pipelines; a lower number runs earlier, so ImagesPipeline (1)
# downloads cover images before the project's own pipeline (300) sees the item.
ITEM_PIPELINES = {
'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
'scrapy.pipelines.images.ImagesPipeline': 1,
}
# Item field that holds the list of image URLs to download.
IMAGES_URLS_FIELD = "front_image_url"
# Save downloaded images in an "images" folder next to this settings file.
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")
Spider
文件
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy.http import Request
from urllib import parse
from ArticleSpider.items import ArticleSpiderItem
class JobboleSpider(scrapy.Spider):
    """Crawl blog.jobbole.com: list pages -> article pages -> ArticleSpiderItem."""

    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract each article node (URL + cover image) on the list page and
           schedule it; the image URL travels to parse_detail via meta.
        2. Extract the next-page URL and feed it back through parse.
        """
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            # Cover-image URL and article URL come from the same <a> node.
            image_url = post_node.css("img::attr(src)").extract_first()
            post_url = post_node.css("::attr(href)").extract_first()
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # Follow pagination; "" (falsy) on the last page ends the crawl.
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            # BUG FIX: the original yielded post_url here, re-queuing the last
            # article instead of the next list page.
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Extract one article's fields into an ArticleSpiderItem."""
        article_items = ArticleSpiderItem()

        # Cover image URL forwarded from the list page via Request.meta.
        front_image_url = response.meta.get("front_image_url", "")
        # Title
        title = response.css(".entry-header h1::text").extract_first()
        # Publish date: extract_first("") avoids an IndexError when missing.
        create_data = response.css(".entry-meta-hide-on-mobile::text").extract_first("").strip()
        # Tags — drop the "N 评论" (comment-count) link sharing the element.
        tag_list = response.css(".entry-meta-hide-on-mobile a::text").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        tags = ",".join(tag_list)
        # Up-vote count
        praise_nums = response.css(".vote-post-up h10::text").extract_first()
        # Bookmark count: default "" so re.match never receives None.
        fav_nums = response.css("span.btn-bluet-bigger:nth-child(2)::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", fav_nums)
        fav_nums = int(match_re.group(1)) if match_re else 0
        # Comment count, same None-safety.
        comment_nums = response.css("a[href='#article-comment'] span::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", comment_nums)
        comment_nums = int(match_re.group(1)) if match_re else 0
        # Article body HTML (None instead of IndexError when absent).
        content = response.css("div .entry").extract_first()

        article_items["title"] = title
        article_items["create_data"] = create_data
        article_items["tags"] = tags
        article_items["url"] = response.url
        # ImagesPipeline expects a *list* of URLs in this field.
        article_items["front_image_url"] = [front_image_url]
        article_items["fav_nums"] = fav_nums
        article_items["comment_nums"] = comment_nums
        article_items["praise_nums"] = praise_nums
        article_items["content"] = content
        yield article_items
图片被下载下来了。
通过配置 settings
文件,保存图片地址。
'ArticleSpider.pipelines.ArticleImagePipeline': 1,
在 pipeline
中,重写 ArticleImagePipeline
方法,保存图片路径。
from scrapy.pipelines.images import ImagesPipeline
class ArticlespiderPipeline(object):
    """Default no-op pipeline: hand every item on unchanged."""

    def process_item(self, item, spider):
        # Nothing to do yet; returning the item keeps the pipeline chain going.
        return item
#保存图片的路径
class ArticleImagePipeline(ImagesPipeline):
def item_completed(self, results, item, info):
for ok, value in results:
image_file_path = value["path"]
item["front_image_path"] = image_file_path
return item
保存成 Json
文件
自定义 Json
文件的导出
打开 Json
文件
def __init__(self):
    """Open the JSON output file for writing, encoded as UTF-8."""
    self.file = codecs.open('Article.json', 'w', encoding="utf-8")
写入Json
文件
def process_item(self, item, spider):
    """Serialize one item as a JSON line and append it to the output file."""
    serialized = json.dumps(dict(item), ensure_ascii=False) + "\n"
    self.file.write(serialized)
    return item
调用该函数,结束后关闭
def spider_closed(self, spider):
    """Close the JSON output file once the spider has finished."""
    self.file.close()
设置 settings
'ArticleSpider.pipelines.JsonWithEncodingPipeline': 3,
运行程序,可以得到下载的 Json
文件
用 Scrapy
提供的 json export
导出 json
文件
直接贴代码:
from scrapy.exporters import JsonItemExporter
#调用scrapy提供的json export导出json文件
# Use Scrapy's built-in JSON exporter instead of hand-rolled serialization.
class JsonExporterPipleline(object):
    """Export all items into ArticleExport.json via JsonItemExporter."""

    def __init__(self):
        # JsonItemExporter writes bytes, so the file is opened in binary mode.
        self.file = open('ArticleExport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Emit the closing bracket of the JSON array, then release the handle.
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
设置 settings
'ArticleSpider.pipelines.JsonExporterPipleline': 3,
同样可以得到结果。
欢迎关注我的个人公众号。