# 新浪新闻逐页爬取标题和链接并保存到txt中
import scrapy  # must be imported before the class statement executes


class itemSpider(scrapy.Spider):
    """Crawl Sina military news page by page, appending each article's
    title and link to ``xinwen.txt``.

    Follows the "next page" link recursively until the last roll page.
    """

    name = 'itemSpider'
    start_urls = ['http://mil.news.sina.com.cn/roll/index.d.html?cid=57918']

    def parse(self, response):
        # Each <li> under .fixList .linkNews holds one article anchor.
        for item in response.css('.fixList .linkNews li'):
            link = item.css('a::attr(href)').extract_first()   # article URL
            title = item.css('a::text').extract_first()        # article title
            if title is None or link is None:
                continue  # skip malformed entries so f.write() never gets None
            # "a+" appends, so results accumulate across pages; utf-8 for
            # the Chinese label text. The with-statement closes the file —
            # no explicit f.close() needed.
            with open('xinwen.txt', 'a+', encoding='utf-8') as f:
                f.write(title)
                f.write('\n')
                f.write('链接:' + link)
                f.write('\n-------\n')
        # Paginate: the selector yields None on the last page, ending recursion.
        next_page = response.css('.pagebox_next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
# 逐页爬取名言
# class itemSpider(scrapy.Spider):
# name = 'itemSpider'
# start_urls = ['http://lab.scrapyd.cn']
# def parse(self, response):
# mingyan = response.css('div.quote') # 提取首页所有名言,保存至变量mingyan
#
# for v in mingyan: # 循环获取每一条名言里面的:名言内容、作者、标签
#
# text = v.css('.text::text').extract_first() # 提取名言
# autor = v.css('.author::text').extract_first() # 提取作者
# tags = v.css('.tags .tag::text').extract() # 提取标签
# tags = ','.join(tags) # 数组转换为字符串
#
# """
# 接下来进行写文件操作,每个名人的名言储存在一个txt文档里面
# """
# fileName = '%s-语录.txt' % autor # 定义文件名,如:木心-语录.txt
#
# with open(fileName, "a+") as f: # 不同人的名言保存在不同的txt文档,“a+”以追加的形式
# f.write(text)
# f.write('\n') # ‘\n’ 表示换行
# f.write('标签:' + tags)
# f.write('\n-------\n')
# f.close()
# next_page = response.css('li.next a::attr(href)').extract_first()
# if next_page is not None:
# next_page = response.urljoin(next_page)
# yield scrapy.Request(next_page, callback=self.parse)
#
# 按照传来的参数爬取名言
import scrapy


class ArgsspiderSpider(scrapy.Spider):
    """Crawl quotes from lab.scrapyd.cn, optionally filtered by a tag.

    Run as ``scrapy crawl argsSpider -a tag=爱情``; the ``-a`` argument
    becomes an attribute on the spider. Quotes are appended to a txt
    file named after their joined tag string.
    """

    name = "argsSpider"

    def start_requests(self):
        url = 'http://lab.scrapyd.cn/'
        # The command-line argument (-a tag=...) arrives as self.tag.
        tag = getattr(self, 'tag', None)
        if tag is not None:
            # e.g. tag=爱情 -> http://lab.scrapyd.cn/tag/爱情
            url = url + 'tag/' + tag
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            text = quote.css('.text::text').extract_first()   # quote body
            tags = ','.join(quote.css('.tags .tag::text').extract())
            if text is None:
                continue  # guard against f.write(None) on malformed entries
            # One file per tag combination, e.g. 爱情-语录.txt; "a+" appends
            # across pages, and the with-statement closes the file for us.
            fileName = '%s-语录.txt' % tags
            with open(fileName, 'a+', encoding='utf-8') as f:
                f.write(text)
                f.write('\n')
                f.write('标签:' + tags)
                f.write('\n-------\n')
        # Paginate until the "next" link disappears on the last page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
# 代码写好之后,那我们要如何传参呢?如何运行呢?比如我们要爬取标签:爱情,我们可以这样:
#   scrapy crawl argsSpider -a tag=爱情