这里用了《爬虫从入门到实践》(第二版)中的例子,爬取作者博客前 3 页的评论。书中的代码与现在的页面不一致:之前点击按钮可以把所有评论加载到同一页上,现在按钮的作用是翻页,所以这里我采用 for 循环逐页遍历并输出的方式。
from selenium import webdriver
import time
# Print the comments from the first pages of the blog post's comment widget.
# The widget is rendered inside an iframe titled 'livere' and is paginated
# with "button.page-btn" elements, so every page is visited in turn.
#
# NOTE: this uses the Selenium 3 helpers (executable_path=...,
# find_element_by_css_selector) as the rest of the file does; newer Selenium
# releases replace them with Service(...) and find_element(By.CSS_SELECTOR, ...).
driver = webdriver.Firefox(executable_path=r'C:\Users\lenovo\Desktop\geckodriver.exe')
try:
    driver.implicitly_wait(20)  # implicit wait: element lookups retry for up to 20 seconds
    driver.get("http://www.santostang.com/2018/07/04/hello-world/")

    last_page = 3
    for page in range(1, last_page + 1):
        # Scroll to the bottom of the document before looking for the widget.
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

        # Lookups below must run inside the iframe that hosts the comments.
        driver.switch_to.frame(driver.find_element_by_css_selector("iframe[title='livere']"))

        for comment in driver.find_elements_by_css_selector('div.reply-content'):
            print(comment.find_element_by_css_selector('p').text)

        # Only move on when another page is still wanted; clicking past the
        # last requested page is wasted work and fails if no such button exists.
        if page < last_page:
            load_more = driver.find_element_by_css_selector(
                "button.page-btn[data-page='" + str(page + 1) + "']")
            load_more.click()
            # Give the widget a moment to swap in the next page; otherwise the
            # next iteration may read the previous page's comments again.
            time.sleep(2)

        # Return to the top-level document exactly once per iteration so the
        # next iteration can locate the iframe again (the iframe element is
        # not reachable from inside itself).
        driver.switch_to.default_content()
finally:
    # Always shut the browser and the geckodriver process down, even on error.
    driver.quit()
也可以直接用浏览器的“检查”(开发者工具)功能找到评论数据的真实请求链接,再用 json 库解析返回的内容。
import requests
import json
def singlepa(link):
    """Fetch one page of comments from *link* and print each comment's text.

    The response body is treated as JSON wrapped in a JavaScript callback
    (JSONP-style).  The wrapper is stripped, the JSON is parsed, and the
    ``content`` field of every entry under ``results -> parents`` is printed
    on its own line.

    Args:
        link: Full URL of the comment-list request.

    Raises:
        json.JSONDecodeError: If no JSON object can be extracted from the body.
        KeyError: If the parsed JSON lacks ``results``, ``parents`` or
            ``content``.
        requests.RequestException: On network failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36',
        'Host': 'api-zero.livere.com'
    }
    # Bound the wait so a stalled connection cannot hang the crawl forever.
    r = requests.get(link, headers=headers, timeout=10)
    json_string = r.text

    # Keep only the outermost {...}.  Slicing to the last '}' instead of
    # chopping a fixed two characters keeps this working whether the wrapper
    # ends with ");", ")" or is followed by trailing whitespace.
    start = json_string.find('{')
    end = json_string.rfind('}')
    # With no usable braces, hand json.loads an empty string so the failure
    # is still reported as a JSONDecodeError.
    json_string = json_string[start:end + 1] if 0 <= start < end else ''

    json_data = json.loads(json_string)
    for eachone in json_data['results']['parents']:
        print(eachone['content'])
# The two fixed halves of the comment-list URL; only the value placed after
# "offset=" changes between requests, so they are built once up front.
link1 = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery1124022919375016779964_1574004898283&limit=10&offset='
link2 = '&repSeq=4272904&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&_=1574004898289'

# Request offsets 1 through 10, echoing each URL before printing its comments.
for page in range(1, 11):
    link = '{}{}{}'.format(link1, page, link2)
    print(link)
    singlepa(link)