由于松爱协会的小伙伴邀请,我研究了一下如何爬取 B 站上协会的一些精彩评论。
由于评论是动态加载的,所以要用到 selenium;之前的博文里已经有关于 selenium 的安装注意事项。
还要用到 Firefox 的 Firebug 去获取 XPath 信息。
# Excerpt: scroll the page until a chosen element is visible.
# `app` is the WebDriver instance created in the full listing.
target = app.find_element_by_xpath(".//*[@id='recommend_report']/div[1]/span")
app.execute_script("arguments[0].scrollIntoView();", target)  # scroll to the located element
time.sleep(3)  # pause 3 seconds before the next step
这里注意一下:由于有些信息要下拉滚动条才可以获取到,所以上面给出了一个下拉滚动条、定位到某一元素的方法。
贴上代码
#coding=utf-8
"""Print one page of comments from a bilibili video page using Selenium.

The script opens the video page in Firefox, scrolls two elements into view
(content further down the page is only reachable after scrolling), clicks
the second one, and then prints the author, the text and the visible
replies of each comment slot on the page.

Notes carried over from the original listing:
  * A PhantomJS driver with stylesheets and images disabled was tried and
    left disabled.
  * Scrolling by setting ``document.documentElement.scrollTop`` was tried
    and left disabled in favour of ``scrollIntoView``.
  * All comment bodies can alternatively be fetched in one call with
    ``find_elements`` and the CSS selector ``p.text``.
"""
import sys
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

VIDEO_URL = "https://www.bilibili.com/video/av3553625/?from=search&seid=10292605247919873793"

# Element scrolled into view first, and the link that is scrolled to and clicked.
FIRST_SCROLL_XPATH = ".//*[@id='recommend_report']/div[1]/span"
CLICK_TARGET_XPATH = ".//*[@id='bbComment']/div[1]/div[4]/div[4]/span/a"

# Container whose div[1], div[2], ... children are the comment slots.
COMMENT_LIST_XPATH = ".//*[@id='bbComment']/div[1]/div[4]"

# The indices below are hard-coded for this particular page.
# NOTE(review): presumably slot 7 is not a regular comment and slots 12, 13
# and 17 have fewer than three replies -- confirm against the live page.
COMMENT_SLOTS = 20
SKIPPED_SLOTS = frozenset([7])
REPLY_COUNTS = {12: 2, 13: 0, 17: 0}
DEFAULT_REPLY_COUNT = 3

PAUSE_SECONDS = 3


def _force_utf8_default_encoding():
    """Apply the original Python 2 default-encoding workaround; no-op on Python 3."""
    if sys.version_info[0] == 2:
        reload(sys)  # noqa: F821 -- builtin on Python 2 only
        sys.setdefaultencoding("utf-8")


def _scroll_into_view(driver, xpath):
    """Find the element at *xpath*, scroll it into view, pause, and return it."""
    element = driver.find_element(By.XPATH, xpath)
    driver.execute_script("arguments[0].scrollIntoView();", element)
    time.sleep(PAUSE_SECONDS)
    return element


def _print_comment(driver, slot):
    """Print author, text and replies of the comment in zero-based *slot*.

    Every element is looked up before anything is printed, so a missing
    element raises before partial output is produced.
    """
    base = COMMENT_LIST_XPATH + "/div[" + str(slot + 1) + "]/div[2]"
    author = driver.find_element(By.XPATH, base + "/div[1]/a[1]")
    body = driver.find_element(By.XPATH, base + "/p")
    reply_count = REPLY_COUNTS.get(slot, DEFAULT_REPLY_COUNT)
    replies = [
        driver.find_element(
            By.XPATH, base + "/div[3]/div[" + str(n) + "]/div/div[1]/span"
        )
        for n in range(1, reply_count + 1)
    ]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("作者:")
    print(author.text.strip())
    print("内容:")
    print(body.text.strip())
    print("后续:")
    for reply in replies:
        print(reply.text.strip())
    print("\n")


def main():
    """Run the scrape and always close the browser afterwards."""
    _force_utf8_default_encoding()
    app = webdriver.Firefox()
    try:
        app.get(VIDEO_URL)
        _scroll_into_view(app, FIRST_SCROLL_XPATH)
        _scroll_into_view(app, CLICK_TARGET_XPATH).click()

        for slot in range(COMMENT_SLOTS):
            if slot in SKIPPED_SLOTS:
                continue
            _print_comment(app, slot)

        # The collapsed original does not show whether this pause sat inside
        # the loop; it is kept once here because the loop only reads elements
        # of the already-loaded page.
        time.sleep(PAUSE_SECONDS)
    finally:
        # Close the browser even if an element lookup raised.
        app.quit()


if __name__ == "__main__":
    main()
这里只抓取了一页的评论,还可以再完善,抓取更多页面。