First, create a Scrapy project named MyScrapy.
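On the command line this is usually done with Scrapy's built-in templates (assuming Scrapy is already installed):

scrapy startproject MyScrapy
cd MyScrapy
scrapy genspider jobbole blog.jobbole.com

Then create (or edit) the spider file jobbole.py: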
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy import Request

from MyScrapy.items import MyscrapyItem


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    # allowed_domains = ['blog.jobbole.com/114261/']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        post_urls = response.css('#archive .floated-thumb .post-thumb a')
        for link in post_urls:
            url = link.css('::attr(href)').extract_first('')
            ImgUrl = link.css('img::attr(src)').extract_first('')
            yield Request(url=url, meta={'img': ImgUrl}, callback=self.parse_detail)
        nextPage = response.css('.next.page-numbers::attr(href)').extract_first('')
        print(nextPage)
        # Following the next-page link would crawl every listing page, but that
        # takes too long, so it is commented out here.
        # if nextPage:
        #     yield Request(url=nextPage, callback=self.parse)

    def parse_detail(self, response):
        # Title
        title = response.css('.entry-header h1::text').extract()
        # Article body
        context = response.css('.entry p::text').extract()
        # Publication date
        dateTime = response.css('.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '')
        # Number of likes
        goodNum = response.css('.post-adds span h10::text').extract()
        # Number of bookmarks
        ShouCang = response.css('.post-adds span:nth-of-type(2)::text').extract()
        # Number of comments
        comment = response.css('.post-adds a span::text').extract()
        bookMark = response.css('.entry-meta p a::text').extract()
        # ShouCang comes back as something like [' 3 收藏'];
        # pull the number out with a regular expression
        pattern = re.compile(r'.*?(\d+).*?')
        t = pattern.findall(ShouCang[0])
        if t == []:
            sNum = 0
        else:
            sNum = t[0]
        # Cover-image URL passed along from parse() via meta
        img = response.meta.get('img')
        # Fill in the item
        myItem = MyscrapyItem()
        # Keys must match the fields defined in items.MyscrapyItem
        myItem['title'] = title
        myItem['dateTime'] = dateTime
        # The images pipeline expects a list of URLs
        myItem['img'] = [img]
        myItem['sNum'] = sNum
        yield myItem
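As the comment above notes, the bookmark text is scraped as something like [' 3 收藏'], so the count has to be pulled out with a regular expression. A quick standalone check of that pattern:

import re

pattern = re.compile(r'.*?(\d+).*?')
print(pattern.findall(' 3 收藏'))   # ['3']  -> sNum = '3'
print(pattern.findall(' 收藏'))     # []     -> sNum falls back to 0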
Modify items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MyscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    dateTime = scrapy.Field()
    img = scrapy.Field()
    sNum = scrapy.Field()
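The spider assigns these fields by key; a scrapy.Item only accepts keys declared as Field() above, and any other key raises a KeyError. A small standalone illustration:

from MyScrapy.items import MyscrapyItem

item = MyscrapyItem()
item['title'] = ['demo title']   # fine: 'title' is a declared field
# item['author'] = 'x'           # would raise KeyError: undeclared field
print(dict(item))                # {'title': ['demo title']}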
Modify settings.py:
ITEM_PIPELINES = {
    'MyScrapy.pipelines.MyscrapyPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 200,
}

import os

# Field on the item that holds the image URLs to download
IMAGES_URLS_FIELD = 'img'
# Directory where the downloaded images are stored
filep = os.path.dirname(os.path.abspath(__file__))   # project directory (computed but not used below)
IMAGES_STORE = r'C:\Users\17864\Desktop\翡翠培训\python'
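ITEM_PIPELINES also enables MyScrapy.pipelines.MyscrapyPipeline, whose code is not shown in this post; a minimal sketch of what pipelines.py might contain (the default template generated by scrapy startproject looks essentially like this):

class MyscrapyPipeline(object):
    def process_item(self, item, spider):
        # Every item yielded by the spider passes through here;
        # simply return it so later processing is not affected.
        return item

The built-in ImagesPipeline (priority 200, so it runs before the custom pipeline) downloads every URL listed in the field named by IMAGES_URLS_FIELD and saves the files under IMAGES_STORE in a full/ subdirectory; it needs the Pillow library installed.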
Finally, add an entry-point script (run this file to start the crawl):

import os
import sys

from scrapy.cmdline import execute

data = os.path.abspath(__file__)     # absolute path of this file
dir_file = os.path.dirname(data)     # its parent directory (the project root)
sys.path.append(dir_file)
execute(['scrapy', 'crawl', 'jobbole'])
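An alternative entry point, if you prefer not to go through scrapy.cmdline, is to run the spider in-process (a sketch, assuming the script sits in the project root next to scrapy.cfg):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('jobbole')   # spider name, as defined in JobboleSpider.name
process.start()            # blocks until the crawl finishes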