Multi-level parsing: the goal is to crawl all of a blogger's articles.
Reference: https://blog.csdn.net/ck784101777/article/details/105157365
Approach:
1. First crawl the article list page to get each article's link.
2. Hand each link to a second-level parser that extracts the article title and body.
3. Save the data in the item pipeline.
To make writing XPath expressions easier, you can install the XPath Helper browser extension.
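You can also test expressions interactively in Scrapy's shell before committing them to the spider; for example, to try the article-link XPath used later in this post:

scrapy shell "https://blog.csdn.net/ck784101777"
>>> response.xpath('//div[@class="article-item-box csdn-tracking-statistics"]/h4/a/@href').getall()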
Edit the settings in settings.py:

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'spiderAdvanced.pipelines.SpideradvancedPipeline': 300,
}
Spider:

import scrapy
from ..items import CsdnItem

class AdvanceSpider(scrapy.Spider):
    # spider name; must be unique
    name = 'advance'
    # crawl scope; requests outside these domains are skipped
    allowed_domains = ['csdn.net']
    # where the spider starts crawling; a tuple or a list
    start_urls = ('https://blog.csdn.net/ck784101777',)

    # while debugging, crawl just one article
    def parse(self, response):
        yield scrapy.Request(
            url='https://blog.csdn.net/ck784101777/article/details/105157365',
            callback=self.second_page
        )

    def second_page(self, response):
        item = CsdnItem()
        url = response.url
        item['url'] = url
        title = response.xpath('//div[@class="article-header"]/div[@class="article-title-box"]/h1/text()').extract()
        item['title'] = title
        element_list = response.xpath('//div[@class="htmledit_views"]')
        for ele in element_list:
            print(ele)
            content1 = ele.xpath('./p/text()').extract()
            content2 = ele.xpath('./p/a/text()').extract()
            content3 = ele.xpath('./p/strong/text()').extract()
            content4 = ele.xpath('./h3/text()').extract()
            content5 = ele.xpath('./h3/strong/text()').extract()
            print(1, type(content1), content1)
            print(2, type(content2), content2)
            print(3, type(content3), content3)
            print(4, type(content4), content4)
            print(5, type(content5), content5)
        item['content'] = None
        yield item
But crawled this way, the order gets scrambled:
titles end up in one group, paragraph text in another, and code in yet another.
Losing the formatting or even the headings would be tolerable, but the order must not be scrambled.
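A quick illustration of why this happens (hypothetical HTML, runnable on its own): each XPath query returns all of its matches as one grouped list, so the interleaving between different tags is thrown away.

from scrapy import Selector

sel = Selector(text='<div><p>first</p><h3>heading</h3><p>second</p></div>')
print(sel.xpath('//p/text()').extract())   # ['first', 'second'] (grouped together)
print(sel.xpath('//h3/text()').extract())  # ['heading'] (interleaving is lost)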
Going further:
str has a built-in method, str.splitlines(), which splits on '\n'.
Convert the extracted element into a list of lines, then process it line by line:

extract = response.xpath('//div[@class="htmledit_views"]').extract_first()
lines = extract.splitlines()
# then extract each line one at a time
for line in lines:
    # turn the string back into a Selector object
    ele = Selector(text=line)
    content1 = ele.xpath('//p/text()').extract()
    content3 = ele.xpath('//p/strong/text()').extract()
    …
Then join the pieces back together.
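A self-contained sketch of this trick (the HTML string here is a made-up stand-in for CSDN's output, which puts each tag on its own line):

from scrapy import Selector

html = '<div class="htmledit_views"><p>first</p>\n<h3>heading</h3>\n<p>second</p></div>'
contents = []
for line in html.splitlines():
    ele = Selector(text=line)
    contents.extend(ele.xpath('//p/text()').extract())
    contents.extend(ele.xpath('//h3/text()').extract())
print('\n'.join(contents))  # first, heading, second: document order preserved

Note the remaining caveat: if a single line happens to contain both tags, the order within that line can still scramble, just as in the per-tag approach above.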
Spider code:

import scrapy
from scrapy import Selector
from ..items import CsdnItem

class AdvanceSpider(scrapy.Spider):
    # spider name; must be unique
    name = 'advance'
    # crawl scope; requests outside these domains are skipped
    allowed_domains = ['csdn.net']
    # where the spider starts crawling; a tuple or a list
    start_urls = ('https://blog.csdn.net/ck784101777',)
    offset = 1

    def parse(self, response):
        lists = response.xpath('//div[@class="article-list"]/div[@class="article-item-box csdn-tracking-statistics"]')
        for i in lists:
            # link to the article
            link_url = i.xpath('./h4/a/@href').get()
            yield scrapy.Request(
                url=link_url,
                callback=self.second_page
            )
        # queue the next list page (pages 2 through 7)
        if self.offset < 7:
            self.offset += 1
            url = 'https://blog.csdn.net/ck784101777/article/list/{}'.format(str(self.offset))
            yield scrapy.Request(url=url, callback=self.parse)

    def second_page(self, response):
        item = CsdnItem()
        url = response.url
        item['url'] = url
        title = response.xpath('//div[@class="article-header"]/div[@class="article-title-box"]/h1/text()').extract()
        item['title'] = title
        # convert the article body element to a string
        extract = response.xpath('//div[@class="htmledit_views"]').extract_first()
        if extract is None:
            # extract_first() returns None when the container is missing; skip such pages
            return
        lines = extract.splitlines()
        contents = [url]
        # extract the text of each line
        for line in lines:
            ele = Selector(text=line)
            content1 = ele.xpath('//p/text()').extract()
            content2 = ele.xpath('//p/a/text()').extract()
            content3 = ele.xpath('//p/strong/text()').extract()
            content4 = ele.xpath('//h3/text()').extract()
            content5 = ele.xpath('//h3/strong/text()').extract()
            content6 = ele.xpath('//p/span/text()').extract()
            contents.extend(content1)
            contents.extend(content2)
            contents.extend(content3)
            contents.extend(content4)
            contents.extend(content5)
            contents.extend(content6)
        item['content'] = '\n'.join(contents)
        yield item
items.py code:

import scrapy

class CsdnItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()
pipelines.py code:

import os

class SpideradvancedPipeline(object):
    def process_item(self, item, spider):
        path = os.getcwd() + '/csdn'
        if not os.path.exists(path):
            os.makedirs(path)
        # use the article title as the file name
        filename = path + '/' + item['title'][0] + '.txt'
        with open(filename, 'w', encoding="utf-8") as f:
            f.write(item['content'])
        return item

    def open_spider(self, spider):
        print('Spider started!')

    def close_spider(self, spider):
        print('Spider finished')
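One caveat with this pipeline: article titles can contain characters that are illegal in file names, such as '/' or '?', which would make open() fail. A minimal sanitizing helper (my own addition, not in the original code; the replacement character is arbitrary):

import re

def safe_filename(title):
    # replace characters that are invalid in file names on common platforms
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

With this, build the path as path + '/' + safe_filename(item['title'][0]) + '.txt'.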
# launcher script
from scrapy import cmdline

if __name__ == '__main__':
    cmdline.execute('scrapy crawl advance'.split())
This is just one approach, and parts of the articles are still not extracted completely, such as the content inside certain elements. I suspect there is a better way; I just haven't found it, and couldn't find anything online either, so this method will have to do for now.
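For what it's worth, one simpler idea to try (a sketch of my own, not from the reference post): XPath's //text() returns every descendant text node in document order, which avoids both the per-tag queries and the ordering problem entirely:

# every descendant text node of the article body, in document order
texts = response.xpath('//div[@class="htmledit_views"]//text()').getall()
content = '\n'.join(t.strip() for t in texts if t.strip())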