python微博爬虫——使用selenium爬取关键词下超话内容

28a12e39867d4f42d115a5c9fc459221.png
最近微博手机端的页面发生了些微的变化,导致了我之前的两篇文章微博任意关键词爬虫——使用selenium模拟浏览器和来!用python爬一爬“不知知网翟博士”的微博超话中的代码出现了一些报错情况,这里来修改一下

欢迎关注公众号:老白和他的爬虫

1.微博手机端出现的变化

爬取手机端的微博好处在于能够爬取比网页端更多的数据,因为网页端微博内容一般限定在50页,数据量不够大,所以选择爬取手机端,这样可以一直往下“刷”,出现新的微博

在之前的代码中,微博手机端超话页面是这样的

4834a0de925be1eaefe3dd23b80caae2.png

但是这几天在爬取的过程中发现微博超话的页面变成了这样

5802b7e4e703c942afa2beeb934f739d.png

这样一眼就可以看到区别吧,就是超话的名称、阅读量、讨论数不见了,所以现在运行代码会报错,这一点也很好解决,我们只需要提前在这个页面获取我们需要的超话的名称、阅读量、讨论数就可以了

f8a47b4db1ef94f9ef8e0e68922e292e.png

2.代码修改

代码相比于之前,主要修改了超话的名称、阅读量、讨论数的获取方式,修改后的weiboTest.py代码如下,如遇到问题可到后台留言

  1. import time
  2. import xlrd
  3. from selenium import webdriver
  4. from selenium.webdriver.common.keys import Keys
  5. import os
  6. import excelSave as save
  7. # 用来控制页面滚动
  8. def Transfer_Clicks(browser):
  9. try:
  10. browser.execute_script("window.scrollBy(0,document.body.scrollHeight)", "")
  11. except:
  12. pass
  13. return "Transfer successfully n"
  14. #判断页面是否加载出来
  15. def isPresent():
  16. temp =1
  17. try:
  18. driver.find_elements_by_css_selector('div.line-around.layout-box.mod-pagination > a:nth-child(2) > div > select > option')
  19. except:
  20. temp =0
  21. return temp
  22. #把超话页面滚动到底
  23. def SuperwordRollToTheEnd():
  24. before = 0
  25. after = 0
  26. n = 0
  27. timeToSleep = 50
  28. while True:
  29. before = after
  30. Transfer_Clicks(driver)
  31. time.sleep(3)
  32. elems = driver.find_elements_by_css_selector('div.m-box')
  33. print("当前包含超话最大数量:%d,n当前的值为:%d,当n为5无法解析出新的超话" % (len(elems),n))
  34. after = len(elems)
  35. if after > before:
  36. n = 0
  37. if after == before:
  38. n = n + 1
  39. if n == 5:
  40. print("当前包含最大超话数为:%d" % after)
  41. break
  42. if after > timeToSleep:
  43. print("抓取到%d多条超话,休眠30秒" % timeToSleep)
  44. timeToSleep = timeToSleep + 50
  45. time.sleep(30)
  46. #插入数据
  47. def insert_data(elems,path,name,yuedu,taolun):
  48. for elem in elems:
  49. workbook = xlrd.open_workbook(path) # 打开工作簿
  50. sheets = workbook.sheet_names() # 获取工作簿中的所有表格
  51. worksheet = workbook.sheet_by_name(sheets[0]) # 获取工作簿中所有表格中的的第一个表格
  52. rows_old = worksheet.nrows # 获取表格中已存在的数据的行数
  53. rid = rows_old
  54. #用户名
  55. weibo_username = elem.find_elements_by_css_selector('h3.m-text-cut')[0].text
  56. weibo_userlevel = "普通用户"
  57. #微博等级
  58. try:
  59. weibo_userlevel_color_class = elem.find_elements_by_css_selector("i.m-icon")[0].get_attribute("class").replace("m-icon ","")
  60. if weibo_userlevel_color_class == "m-icon-yellowv":
  61. weibo_userlevel = "黄v"
  62. if weibo_userlevel_color_class == "m-icon-bluev":
  63. weibo_userlevel = "蓝v"
  64. if weibo_userlevel_color_class == "m-icon-goldv-static":
  65. weibo_userlevel = "金v"
  66. if weibo_userlevel_color_class == "m-icon-club":
  67. weibo_userlevel = "微博达人"
  68. except:
  69. weibo_userlevel = "普通用户"
  70. #微博内容
  71. weibo_content = elem.find_elements_by_css_selector('div.weibo-text')[0].text
  72. shares = elem.find_elements_by_css_selector('i.m-font.m-font-forward + h4')[0].text
  73. comments = elem.find_elements_by_css_selector('i.m-font.m-font-comment + h4')[0].text
  74. likes = elem.find_elements_by_css_selector('i.m-icon.m-icon-like + h4')[0].text
  75. #发布时间
  76. weibo_time = elem.find_elements_by_css_selector('span.time')[0].text
  77. print("用户名:"+ weibo_username + "|"
  78. "微博等级:"+ weibo_userlevel + "|"
  79. "微博内容:"+ weibo_content + "|"
  80. "转发:"+ shares + "|"
  81. "评论数:"+ comments + "|"
  82. "点赞数:"+ likes + "|"
  83. "发布时间:"+ weibo_time + "|"
  84. "话题名称" + name + "|"
  85. "话题讨论数" + yuedu + "|"
  86. "话题阅读数" + taolun)
  87. value1 = [[rid, weibo_username, weibo_userlevel,weibo_content, shares,comments,likes,weibo_time,keyword,name,yuedu,taolun],]
  88. print("当前插入第%d条数据" % rid)
  89. save.write_excel_xls_append_norepeat(book_name_xls, value1)
  90. #获取当前页面的数据
  91. def get_current_weibo_data(elems,book_name_xls,name,yuedu,taolun,maxWeibo):
  92. #开始爬取数据
  93. before = 0
  94. after = 0
  95. n = 0
  96. timeToSleep = 300
  97. while True:
  98. before = after
  99. Transfer_Clicks(driver)
  100. time.sleep(3)
  101. elems = driver.find_elements_by_css_selector('div.card.m-panel.card9')
  102. print("当前包含微博最大数量:%d,n当前的值为:%d, n值到5说明已无法解析出新的微博" % (len(elems),n))
  103. after = len(elems)
  104. if after > before:
  105. n = 0
  106. if after == before:
  107. n = n + 1
  108. if n == 5:
  109. print("当前关键词最大微博数为:%d" % after)
  110. insert_data(elems,book_name_xls,name,yuedu,taolun)
  111. break
  112. if len(elems)>maxWeibo:
  113. print("当前微博数以达到%d条"%maxWeibo)
  114. insert_data(elems,book_name_xls,name,yuedu,taolun)
  115. break
  116. if after > timeToSleep:
  117. print("抓取到%d多条,插入当前新抓取数据并休眠30秒" % timeToSleep)
  118. timeToSleep = timeToSleep + 300
  119. insert_data(elems,book_name_xls,name,yuedu,taolun)
  120. time.sleep(30)
  121. #点击超话按钮,获取超话页面
  122. def get_superWords():
  123. time.sleep(5)
  124. elem = driver.find_element_by_xpath("//*[@class='scroll-box nav_item']/ul/li/span[text()='话题']")
  125. elem.click()
  126. #获取所有超话
  127. SuperwordRollToTheEnd()
  128. elemsOfSuper = driver.find_elements_by_css_selector('div.card.m-panel.card26')
  129. return elemsOfSuper
  130. #获取超话链接、名称、讨论量、阅读量
  131. def get_superwordsUrl():
  132. elemsOfSuper = get_superWords()
  133. superWords_url = []
  134. for i in range(0,len(elemsOfSuper)):
  135. superwordsInfo = []
  136. print("当前获取第%d个超话链接,共有%d个超话"% (i+1,len(elemsOfSuper)))
  137. time.sleep(1)
  138. element = driver.find_elements_by_css_selector('div.card.m-panel.card26')[i]
  139. name = driver.find_elements_by_css_selector('div.card.m-panel.card26 h3')[i].text
  140. yuedu_taolun = driver.find_elements_by_css_selector('div.card.m-panel.card26 h4:nth-last-child(1)')[i].text
  141. yuedu = yuedu_taolun.split(" ")[0]
  142. taolun = yuedu_taolun.split(" ")[1]
  143. #获取话题名称,话题讨论数,阅读数
  144. print(name)
  145. print(taolun)
  146. print(yuedu)
  147. #获取超话链接
  148. driver.execute_script('arguments[0].click()',element)
  149. time.sleep(3)
  150. print(driver.current_url)
  151. #把链接和超话信息一起存放于列表中
  152. superwordsInfo = [driver.current_url,name,taolun,yuedu]
  153. superWords_url.append(superwordsInfo)
  154. driver.back()
  155. return superWords_url
  156. #爬虫运行
  157. def spider(username,password,driver,book_name_xls,sheet_name_xls,keyword,maxWeibo):
  158. #创建文件
  159. if os.path.exists(book_name_xls):
  160. print("文件已存在")
  161. else:
  162. print("文件不存在,重新创建")
  163. value_title = [["rid", "用户名称", "微博等级", "微博内容", "微博转发量","微博评论量","微博点赞","发布时间","搜索关键词","话题名称","话题讨论数","话题阅读数"],]
  164. save.write_excel_xls(book_name_xls, sheet_name_xls, value_title)
  165. #加载驱动,使用浏览器打开指定网址
  166. driver.set_window_size(452, 790)
  167. driver.get("https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=https%3A%2F%2Fm.weibo.cn%2F")
  168. time.sleep(3)
  169. #登陆
  170. elem = driver.find_element_by_xpath("//*[@id='loginName']");
  171. elem.send_keys(username)
  172. elem = driver.find_element_by_xpath("//*[@id='loginPassword']");
  173. elem.send_keys(password)
  174. elem = driver.find_element_by_xpath("//*[@id='loginAction']");
  175. elem.send_keys(Keys.ENTER)
  176. time.sleep(5)
  177. #判断页面是否加载出
  178. while 1: # 循环条件为1必定成立
  179. result = isPresent()
  180. print ('判断页面1成功 0失败 结果是=%d' % result )
  181. if result == 1:
  182. elems = driver.find_elements_by_css_selector('div.line-around.layout-box.mod-pagination > a:nth-child(2) > div > select > option')
  183. #return elems #如果封装函数,返回页面
  184. break
  185. else:
  186. print ('页面还没加载出来呢')
  187. time.sleep(20)
  188. time.sleep(5)
  189. #搜索关键词
  190. elem = driver.find_element_by_xpath("//*[@class='m-text-cut']").click();
  191. time.sleep(5)
  192. elem = driver.find_element_by_xpath("//*[@type='search']");
  193. elem.send_keys(keyword)
  194. elem.send_keys(Keys.ENTER)
  195. superWords_url = get_superwordsUrl()
  196. print("超话链接获取完毕,休眠5秒")
  197. time.sleep(5)
  198. for url in superWords_url:
  199. driver.get(url[0])
  200. time.sleep(3)
  201. name = url[1]
  202. taolun = url[2]
  203. yuedu = url[3]
  204. get_current_weibo_data(elems,book_name_xls,name,yuedu,taolun,maxWeibo) #爬取综合
  205. time.sleep(3)
  206. shishi_element = driver.find_element_by_xpath("//*[@class='scroll-box nav_item']/ul/li/span[text()='实时']")
  207. driver.execute_script('arguments[0].click()',shishi_element)
  208. get_current_weibo_data(elems,book_name_xls,name,yuedu,taolun,maxWeibo) #爬取实时
  209. time.sleep(5)
  210. remen_element = driver.find_element_by_xpath("//*[@class='scroll-box nav_item']/ul/li/span[text()='热门']")
  211. driver.execute_script('arguments[0].click()',remen_element)
  212. get_current_weibo_data(elems,book_name_xls,name,yuedu,taolun,maxWeibo) #爬取热门
  213. if __name__ == '__main__':
  214. username = "" #你的微博登录名
  215. password = "" #你的密码
  216. driver = webdriver.Chrome('/Users/Desktop/python/weibo_keyword/chromedriver')#你的chromedriver的地址
  217. book_name_xls = "/Users/Desktop/weibo.xls" #填写你想存放excel的路径,没有文件会自动创建
  218. sheet_name_xls = '微博数据' #sheet表名
  219. maxWeibo = 1000 #设置最多多少条微博,如果未达到最大微博数量可以爬取当前已解析的微博数量
  220. keywords = ["翟天临学术",] #输入你想要的关键字,可以是多个关键词的列表的形式
  221. for keyword in keywords:
  222. spider(username,password,driver,book_name_xls,sheet_name_xls,keyword,maxWeibo)

数据存储的excelSave.py没有变

  1. import xlrd
  2. import xlwt
  3. from xlutils.copy import copy
  4. def write_excel_xls(path, sheet_name, value):
  5. index = len(value) # 获取需要写入数据的行数
  6. workbook = xlwt.Workbook() # 新建一个工作簿
  7. sheet = workbook.add_sheet(sheet_name) # 在工作簿中新建一个表格
  8. for i in range(0, index):
  9. for j in range(0, len(value[i])):
  10. sheet.write(i, j, value[i][j]) # 像表格中写入数据(对应的行和列)
  11. workbook.save(path) # 保存工作簿
  12. print("xls格式表格写入数据成功!")
  13. def read_excel_xls(path):
  14. data = []
  15. workbook = xlrd.open_workbook(path) # 打开工作簿
  16. sheets = workbook.sheet_names() # 获取工作簿中的所有表格
  17. worksheet = workbook.sheet_by_name(sheets[0]) # 获取工作簿中所有表格中的的第一个表格
  18. if worksheet.nrows == 1:
  19. print("目前是第一行")
  20. else:
  21. for i in range(1, worksheet.nrows): #从第二行取值
  22. dataTemp = []
  23. for j in range(0, worksheet.ncols):
  24. #print(worksheet.cell_value(i, j), "t", end="") # 逐行逐列读取数据
  25. dataTemp.append(worksheet.cell_value(i, j))
  26. data.append(dataTemp)
  27. return data
  28. def write_excel_xls_append_norepeat(path, value):
  29. workbook = xlrd.open_workbook(path) # 打开工作簿
  30. sheets = workbook.sheet_names() # 获取工作簿中的所有表格
  31. worksheet = workbook.sheet_by_name(sheets[0]) # 获取工作簿中所有表格中的的第一个表格
  32. rows_old = worksheet.nrows # 获取表格中已存在的数据的行数
  33. new_workbook = copy(workbook) # 将xlrd对象拷贝转化为xlwt对象
  34. new_worksheet = new_workbook.get_sheet(0) # 获取转化后工作簿中的第一个表格
  35. rid = 0
  36. for i in range(0, len(value)):
  37. data = read_excel_xls(path)
  38. data_temp = []
  39. for m in range(0,len(data)):
  40. data_temp.append(data[m][1:len(data[m])])
  41. value_temp = []
  42. for m in range(0,len(value)):
  43. value_temp.append(value[m][1:len(value[m])])
  44. if value_temp[i] not in data_temp:
  45. for j in range(0, len(value[i])):
  46. new_worksheet.write(rid+rows_old, j, value[i][j]) # 追加写入数据,注意是从i+rows_old行开始写入
  47. rid = rid + 1
  48. new_workbook.save(path) # 保存工作簿
  49. print("xls格式表格【追加】写入数据成功!")
  50. else:
  51. print("数据重复")

完整代码及驱动,后台回复“20190414”获取

最后关注一波吧

48da7907efb53824b2e81d24e744ef96.png
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值