Tested and confirmed working on September 4, 2019.
settings.py
The necessary settings in this file:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}

ITEM_PIPELINES = {
    'jianshu_spider.pipelines.JianshuTwistedPipeline': 300,
}
pipelines.py
Asynchronous storage to MySQL:
import time

import pymysql
from twisted.enterprise import adbapi


class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'cursorclass': pymysql.cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    # Process the data; this method is required for a pipeline
    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    # Insert the data; the cursor argument is supplied automatically by runInteraction
    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['author'], item['pub_time'],
                                  item['origin_url'], item['article_id'],
                                  item['content'], item['avatar']))

    # Handle errors; the error argument is supplied automatically by addErrback
    def handle_error(self, error, item, spider):
        # Open in append mode so earlier errors are not overwritten
        with open("ERROR.txt", 'a', encoding='utf-8') as fp:
            fp.write("=" * 10 + "error" + "=" * 10 + "\n" +
                     str(error) + "\n" +
                     item['origin_url'] + "\n" +
                     time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\n" +
                     "=" * 10 + "error" + "=" * 10 + "\n")

    # Build the SQL statement lazily, on first use
    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id, title, author, pub_time, origin_url, article_id, content, avatar)
                values(null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
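The insert statement assumes an article table already exists in the jianshu database. The schema isn't shown here, so the following is only a sketch of one that matches the columns the pipeline writes; the column types and lengths are my assumptions.

import pymysql

# Sketch of the assumed article table; adjust types/lengths to your data.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', database='jianshu', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        create table if not exists article(
            id int primary key auto_increment,
            title varchar(255),
            author varchar(255),
            pub_time varchar(64),
            origin_url varchar(255),
            article_id varchar(32),
            content longtext,
            avatar varchar(255)
        ) charset=utf8mb4
    """)
conn.commit()
conn.close()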
items.py
Fields for the data that can be obtained without an AJAX request:
import scrapy


class JianshuSpiderItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    pub_time = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    article_id = scrapy.Field()
    avatar = scrapy.Field()
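For reference, the spider and pipeline access these fields with dict-style indexing (item['title'] and so on), which scrapy.Item supports; a quick illustrative sketch, not part of the project itself:

# scrapy.Item behaves like a dict restricted to the declared fields.
from jianshu_spider.items import JianshuSpiderItem

item = JianshuSpiderItem(title='demo', author='someone')
print(item['title'])   # -> demo
item['avatar'] = None  # declared field, so assignment is allowed
# item['foo'] = 1      # an undeclared key like this would raise KeyError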
jianshu.py
The core code. Note that there are still bugs in actual testing, for example the author field sometimes comes back as None. I haven't bothered to track down the cause; as long as the code runs, that's good enough.
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jianshu_spider.items import JianshuSpiderItem


class JianshuSpider(CrawlSpider):
    name = 'jianshu'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    # Article URLs look like /p/<12 hex chars>; follow them and parse each one
    rules = (
        Rule(LinkExtractor(allow=r'.*p\/[a-z0-9]{12}.*'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        title = response.xpath('//title/text()').get().split("- 简书")[0].strip()
        author = response.xpath("//span[@class='name']/text()").get()
        pub_time = response.xpath('//span[@class="publish-time"]/text()').get()
        article_id = response.url.split("?")[0].split("/")[-1]
        origin_url = response.url
        content = response.xpath('//div[@class="show-content"]').get()
        avatar = response.xpath('//div[@class="avatar"]/img/@src').get()

        # Fallback for the newer page layout, which renders the body inside an <article> tag
        if not content:
            content = response.xpath('//article').get()
            author = response.xpath('//header/div[1]/div[2]/div/div[2]/a/span/text()').get()
            pub_time = response.xpath('//*[@id="__next"]/div[1]/div/div/section[1]/div[1]/div/time/text()').get()
            # Pull the avatar URL out of the embedded JSON; guard against a missing author
            if author:
                match = re.search('"nickname":"' + re.escape(author) + '.*?avatar":"(.*?)"', response.text)
                avatar = match.group(1) if match else None

        item = JianshuSpiderItem(
            title=title,
            author=author,
            pub_time=pub_time,
            article_id=article_id,
            origin_url=origin_url,
            content=content,
            avatar=avatar
        )
        yield item
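To run the spider, besides the usual scrapy crawl jianshu from the project directory, a small runner script works too. A minimal sketch, assuming the standard project layout created by scrapy startproject jianshu_spider:

# Runner script placed in the project root.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('jianshu')  # spider name defined in jianshu.py
process.start()           # blocks until the crawl finishes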
Screenshot of the run:
The data collected (4,180 records in 3 minutes):
That's quite fast, and I was on a campus IP; Jianshu doesn't seem to have any anti-crawling mechanism at all.
Bug: sometimes the author extracted from a page is null, for example:
https://www.jianshu.com/p/c8ca65263424?utm_campaign=maleskine&utm_content=note&utm_medium=seo_notes&utm_source=recommendation
I'll look into this when I have time, but for now it's not a big deal: as long as the database allows null to be written, the affected rows can be checked later via origin_url and everything is fine.
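For that origin_url check, something like the following sketch works, assuming the article table described earlier:

import pymysql

# List the rows whose author is null so they can be re-fetched or fixed later.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', database='jianshu')
with conn.cursor() as cursor:
    cursor.execute("select origin_url from article where author is null")
    for (origin_url,) in cursor.fetchall():
        print(origin_url)
conn.close()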