tc抓取mysql_简书全站爬取 mysql异步保存

最新推荐文章于 2021-07-01 11:48:02 发布

比个那噶

最新推荐文章于 2021-07-01 11:48:02 发布

阅读量98

点赞数

文章标签： tc抓取mysql

本文链接：https://blog.csdn.net/weixin_36362920/article/details/113400350

版权

# 优化：处理通过 ajax 动态加载进来的数据
# 使用 selenium + chromedriver 处理

# 爬虫文件：获取阅读量、点赞数、文章字数、标题分类、评论数等字段，并保存到 item 中

def parse_detail(self, response):
    """Extract one article detail page into an ``ArticleItem`` and yield it.

    Args:
        response: The response for an article page. Only ``response.url``
            and ``response.xpath(...)`` are used.

    Yields:
        ArticleItem: One item with the fields ``title``, ``avatar``,
        ``author``, ``pub_time``, ``origin_url``, ``article_id``,
        ``content``, ``word_count``, ``read_count``, ``comment_count``,
        ``like_count`` and ``subject``.

    A field whose node is missing from the page is stored as ``None``
    (``subject`` becomes an empty string) instead of raising
    ``AttributeError`` and losing the whole item.
    """

    def first(query):
        # ``.get()`` yields None when the XPath matches nothing.
        return response.xpath(query).get()

    def last_token(query):
        # The counter spans hold text like "<label> <number>"; keep only the
        # part after the last space. A missing node stays None.
        text = first(query)
        return None if text is None else text.split(" ")[-1]

    title = first(
        "//div[@class='note']/div[@class='post']/div[@class='article']"
        "/h1[@class='title']/text()"
    )
    avatar = first("//a[@class='avatar']/img/@src")
    author = first("//span[@class='name']/a/text()")

    pub_time = first("//span[@class='publish-time']/text()")
    if pub_time is not None:
        pub_time = pub_time.replace("*", "")

    # Under normal conditions the URL contains at most one "?", so the part
    # before it is the bare article URL and its last path segment is the id.
    article_id = response.url.split("?")[0].split("/")[-1]

    # Keep the HTML markup of the body so it can be displayed later.
    content = first("//div[@class='show-content']")

    # The values below are loaded dynamically on the page.
    word_count = last_token("//span[@class='wordage']/text()")
    read_count = last_token("//span[@class='views-count']/text()")
    comment_count = last_token("//span[@class='comments-count']/text()")
    like_count = last_token("//span[@class='likes-count']/text()")

    # ``getall()`` returns a list, which cannot be stored in MySQL directly,
    # so it is flattened into one comma-separated string.
    subject = ",".join(
        response.xpath("//div[@class='include-collection']/a/div/text()").getall()
    )

    yield ArticleItem(
        title=title,
        avatar=avatar,
        author=author,
        pub_time=pub_time,
        origin_url=response.url,
        article_id=article_id,
        content=content,
        word_count=word_count,
        read_count=read_count,
        comment_count=comment_count,
        like_count=like_count,
        subject=subject,
    )


# Pipeline file
# The storage shown earlier was synchronous and slow; it is now made asynchronous.

class JianshuTwistedPipeline(object):
    """Item pipeline that inserts articles into MySQL asynchronously.

    Inserts are submitted to a Twisted ``adbapi.ConnectionPool`` through
    ``runInteraction`` so that ``process_item`` does not wait for the
    database write to finish.
    """

    # Item keys in the exact order of the ``%s`` placeholders in the INSERT.
    _FIELDS = (
        'title', 'content', 'author', 'avatar', 'pub_time', 'origin_url',
        'article_id', 'read_count', 'word_count', 'like_count',
        'comment_count', 'subject',
    )

    def __init__(self):
        """Create the connection pool used for all inserts."""
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor,
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        """Return the INSERT statement, building it on first access."""
        if not self._sql:
            self._sql = '''insert into article2(id,title,content,author,avatar,pub_time,
            origin_url,article_id,read_count, word_count, like_count, comment_count,subject)
            values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        return self._sql

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and pass the item on.

        Args:
            item: Mapping-like item providing every key in ``_FIELDS``.
            spider: The spider that produced the item; forwarded to the
                error handler.

        Returns:
            The same ``item``, so that later pipelines still receive it.
        """
        # runInteraction executes insert_item asynchronously.
        deferred = self.dbpool.runInteraction(self.insert_item, item)
        deferred.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        """Insert one item into the database using ``cursor``."""
        cursor.execute(self.sql, tuple(item[field] for field in self._FIELDS))

    def handle_error(self, error, item, spider):
        """Print a failure reported by the asynchronous insert."""
        print('=' * 20 + 'error' + '=' * 20)
        print("error:", error)
        print('=' * 20 + 'error' + '=' * 20)

比个那噶

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫