博文配套视频课程:24小时实现从零到AI人工智能
简书URL地址分析
可以指定爬虫抓取的规则,支持正则表达式。目前简书文章页面的 URL 形式举例如下:
- https://www.jianshu.com/p/df7cad4eb8d8
- https://www.jianshu.com/p/07b0456cbadb?*****
- https://www.jianshu.com/p/.*
# Crawl rule for Jianshu article pages: the article slug is a
# 12-character lowercase hex string, e.g. /p/df7cad4eb8d8.
# FIX: the dots in the domain are now escaped — an unescaped `.` matches
# any character, so the original pattern would also accept hosts such as
# "wwwXjianshuYcom". The trailing `.*` keeps URLs with query strings matching.
rules = (
    Rule(LinkExtractor(allow=r'https://www\.jianshu\.com/p/[0-9a-z]{12}.*'),
         callback='parse_item', follow=True),
)
获取简书文章数据
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from items import ArticleItem
from lxml import etree
class JsSpider(CrawlSpider):
    """CrawlSpider that follows Jianshu article links and prints each article's title."""

    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']
    rules = (
        # The allow pattern is a regular expression; the trailing `.*` means
        # anything (or nothing) may follow the 12-character article slug.
        # `callback` names the method used to parse each matched response.
        # `follow=True` means links matching the rule found on crawled pages
        # are followed too, so the crawl keeps feeding itself.
        # Note: the "read more" entries at the bottom of the Jianshu home page
        # are loaded later via AJAX, so they are not in the initial HTML.
        # allow=r'.*/p/[0-9a-z].*'
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_item', follow=True),
    )

    # Can also be tested interactively from the command line:
    # 1: scrapy shell https://www.jianshu.com/p/00b7130b2fad
    # 2: response.xpath("//h1[@class='title']/text()").get()
    def parse_item(self, response):
        """Print the article title parsed from the page's <title> tag."""
        html = etree.HTML(response.text)
        # <title> looks like "article title - 简书"; keep the part before the dash.
        print(html.xpath("//title/text()")[0].split('-')[0])
        return None
根据业务需求创建模型
import scrapy
class JianshuItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
name = scrapy.Field()
collection = scrapy.Field()
url = scrapy.Field()
测试parse_item解析函数
# 也可以通过命令行方式测试:
# 1: 输入要测试的命令:scrapy shell https://www.jianshu.com/p/00b7130b2fad
# 2:交互式命令行中执行:response.xpath("//h1[@class='title']/text()").get()
def parse_item(self, response):
from lxml import etree
html = etree.HTML(response.text)
print(html.xpath("//title/text()")[0].split('-')[0])
time.sleep(1)
item = JianshuItem()
item['title'] = html.xpath("//title/text()")[0].split('-')[0]
item['name'] = html.xpath("//span[@class='name']/a/text()")[0]
item['url'] = response.url.split('?')[0]
collection = html.xpath("//div[@class='include-collection']/a/div[@class='name']/text()")
if collection:
item['collection'] = '|'.join(collection)
yield item