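# Optimizing dynamic data: handle the data that is loaded in via Ajax.
# Handled with selenium + chromedriver.
# Spider file: extract the read count, like count, article word count, subject categories, and comment count fields, and save them to the item.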
def parse_detail(self, response):
    """Extract one article's fields from a detail page and yield an ArticleItem.

    Args:
        response: The detail-page response. Only ``response.xpath()`` and
            ``response.url`` are used.

    Yields:
        ArticleItem: Populated with title, avatar, author, pub_time,
        origin_url, article_id, content, word_count, read_count,
        comment_count, like_count and subject. A single-value field whose
        node is absent from the page is ``None``; ``subject`` is then an
        empty string.
    """

    def first_match(query):
        # First result of the XPath query, or None when nothing matches.
        return response.xpath(query).get()

    def last_token(query):
        # The counters are taken as the last space-separated token of the
        # node text. Keep None when the node is missing rather than
        # raising AttributeError on None.
        text = first_match(query)
        return text.split(" ")[-1] if text is not None else None

    title = first_match(
        "//div[@class='note']/div[@class='post']"
        "/div[@class='article']/h1[@class='title']/text()"
    )
    avatar = first_match("//a[@class='avatar']/img/@src")
    author = first_match("//span[@class='name']/a/text()")

    pub_time = first_match("//span[@class='publish-time']/text()")
    if pub_time is not None:
        pub_time = pub_time.replace("*", "")

    # Under normal circumstances the URL contains only one "?"; the article
    # id is the last path segment before the query string.
    article_id = response.url.split("?")[0].split("/")[-1]

    # Grab the HTML tags together with the text, so the content can be
    # displayed later.
    content = first_match("//div[@class='show-content']")

    # The fields below are obtained dynamically.
    word_count = last_token("//span[@class='wordage']/text()")
    read_count = last_token("//span[@class='views-count']/text()")
    comment_count = last_token("//span[@class='comments-count']/text()")
    like_count = last_token("//span[@class='likes-count']/text()")

    # The subjects come back as a list, which cannot be stored in MySQL
    # directly, so join them into one comma-separated string.
    subjects = response.xpath(
        "//div[@class='include-collection']/a/div/text()"
    ).getall()
    subject = ",".join(subjects)

    item = ArticleItem(
        title=title,
        avatar=avatar,
        author=author,
        pub_time=pub_time,
        origin_url=response.url,
        article_id=article_id,
        content=content,
        word_count=word_count,
        read_count=read_count,
        comment_count=comment_count,
        like_count=like_count,
        subject=subject,
    )
    yield item


# Pipeline file
# The storage above is synchronous and relatively slow; it is now optimized to be asynchronous.
class JianshuTwistedPipeline(object):
    """Item pipeline that inserts articles into MySQL through a Twisted pool.

    Inserts are submitted with ``dbpool.runInteraction`` so that the database
    write does not block item processing; failures are routed to
    ``handle_error``.
    """

    def __init__(self, dbparams=None):
        """Create the connection pool.

        Args:
            dbparams: Optional mapping of connection settings that override
                the built-in defaults below (for example ``host`` or
                ``password``). When omitted, the defaults are used as-is.
        """
        params = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor,
        }
        if dbparams:
            params.update(dbparams)
        self.dbpool = adbapi.ConnectionPool('pymysql', **params)
        self._sql = None

    @property
    def sql(self):
        """Return the INSERT statement, building it on first access."""
        if not self._sql:
            self._sql = '''insert into article2(id,title,content,author,avatar,pub_time,
            origin_url,article_id,read_count, word_count, like_count, comment_count,subject)
            values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        return self._sql

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and pass the item on.

        Args:
            item: Mapping-like item providing the keys read by
                ``insert_item``.
            spider: The spider that produced the item; forwarded to
                ``handle_error`` on failure.

        Returns:
            The same ``item``, so that any later pipeline receives it
            instead of ``None``.
        """
        # runInteraction runs insert_item asynchronously.
        deferred = self.dbpool.runInteraction(self.insert_item, item)
        deferred.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        """Insert one item into the database using ``cursor``.

        The parameter order must match the column order in ``sql``.
        """
        cursor.execute(self.sql, (
            item['title'],
            item['content'],
            item['author'],
            item['avatar'],
            item['pub_time'],
            item['origin_url'],
            item['article_id'],
            item['read_count'],
            item['word_count'],
            item['like_count'],
            item['comment_count'],
            item['subject'],
        ))

    def handle_error(self, error, item, spider):
        """Print the failure reported for an insert, framed by banner lines."""
        print('=' * 20 + 'error' + '=' * 20)
        print("error:", error)
        print('=' * 20 + 'error' + '=' * 20)