# Crawling is done with Selenium.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
import time
class Selectors(object):
    """Container for the JavaScript snippets and XPath expressions in use.

    Everything lives in the class-level ``selectors`` dict so callers can
    look an entry up by name, either on the class or on an instance.
    """

    selectors = {
        # JavaScript that returns the current scroll height of <body>.
        "height_script": "return document.body.scrollHeight",
        # JavaScript that scrolls the window to the bottom of <body>.
        "scroll_script": "window.scrollTo(0, document.body.scrollHeight);",
        # XPath for elements whose class attribute is exactly
        # 'commentable_item'.
        "comment_section": ".//*[@class='commentable_item']",
        # XPath for <a> elements whose class attribute contains the
        # substring '_4sxc _42ft'.
        "more_comment_replies": ".//a[contains(@class,'_4sxc _42ft')]",
        # XPath for <a> elements whose class attribute contains the
        # substring '_5v47 fss'.
        "comment_see_more_link": ".//a[contains(@class,'_5v47 fss')]",
    }
class FBSpider(object):
    """Capture a Facebook page with Selenium using cookie-based login.

    The spider starts Chrome, installs the ``c_user`` and ``xs`` cookies,
    loads ``spider_url``, scrolls to the bottom repeatedly, clicks the
    comment expansion links and writes the resulting page source to disk.

    NOTE(review): ``webdriver.Chrome(executable_path=...)`` and
    ``find_elements_by_xpath`` are Selenium 3 style calls that later
    Selenium 4 releases removed -- confirm the pinned Selenium version
    before upgrading.
    """

    def __init__(self, c_user, xs, total_scrolls, scroll_time, spider_url):
        """Store the crawl settings; no browser is started here.

        :param c_user: value used for the ``c_user`` cookie.
        :param xs: value used for the ``xs`` cookie.
        :param total_scrolls: number of scroll steps after which scrolling
            stops. The counter is compared with ``==``, so a value that is
            never reached (for example a negative number) keeps scrolling
            until the page height stops changing.
        :param scroll_time: timeout, in seconds, to wait after each scroll
            for the page height to change.
        :param spider_url: URL of the page to capture.
        """
        self.c_user = c_user
        self.xs = xs
        self.total_scrolls = total_scrolls
        self.scroll_time = scroll_time
        self.spider_url = spider_url
        self.selectors = Selectors().selectors
        # Created by cookie_login(); None while no browser is running.
        self.driver = None
        self.cookies_list = [
            {"value": f"{self.c_user}", "name": "c_user", "domain": "facebook.com", "path": "/"},
            {"value": f"{self.xs}", "name": "xs", "domain": "facebook.com", "path": "/"},
        ]

    def cookie_login(self):
        """Start Chrome, open Facebook and install the login cookies.

        The started browser is stored in ``self.driver``.

        :return: None
        """
        options = Options()
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-infobars")
        options.add_argument("--mute-audio")
        self.driver = webdriver.Chrome(
            executable_path=ChromeDriverManager().install(), options=options
        )
        login_url = "https://www.facebook.com"
        # The site is opened before the cookies are added so that the
        # browser is already on the cookies' domain.
        self.driver.get(login_url)
        time.sleep(5)
        for cookie in self.cookies_list:
            self.driver.add_cookie(cookie)

    def run(self, output_path='posts.html'):
        """Crawl the page, save its source, then shut the browser down.

        :param output_path: file that receives the UTF-8 encoded page
            source; defaults to ``posts.html`` in the working directory.
        :return: None
        """
        try:
            self.work()
            # Binary mode keeps the bytes exactly as encoded (no newline
            # translation).
            with open(output_path, 'wb') as f:
                f.write(self.driver.page_source.encode('utf8'))
        finally:
            # Always release the browser, even when the crawl failed.
            self.close()

    def close(self):
        """Quit the browser if one is running; safe to call repeatedly.

        :return: None
        """
        driver, self.driver = self.driver, None
        if driver is not None:
            driver.quit()

    def work(self):
        """Log in, open ``spider_url``, scroll, and expand the comments.

        The browser is left open afterwards; ``run()`` or ``close()``
        shuts it down.

        :return: None
        """
        print('开始登录Facebook')
        self.cookie_login()
        print('登录Facebook成功')
        self.driver.get(self.spider_url)
        time.sleep(5)
        print('开始滑动滚动条')
        self.sliding_scroll_bar(self.driver)
        print('开始将隐藏的评论逐一展开')
        self.expand_comments(self.driver)

    def sliding_scroll_bar(self, driver):
        """Scroll to the bottom of the page until a stop condition is met.

        Scrolling ends when ``total_scrolls`` steps have completed or when
        a step raises ``TimeoutException`` (typically because the page
        height did not change within ``scroll_time`` seconds).

        :param driver: WebDriver instance showing the page.
        :return: None
        """
        height_script = self.selectors.get('height_script')
        scroll_script = self.selectors.get('scroll_script')
        current_scrolls = 0
        while current_scrolls != self.total_scrolls:
            try:
                old_height = driver.execute_script(height_script)
                driver.execute_script(scroll_script)
                # Poll every 0.05 s until the page has a different height.
                WebDriverWait(driver, self.scroll_time, 0.05).until(
                    lambda drv: self.check_height(drv, old_height)
                )
            except TimeoutException:
                break
            current_scrolls += 1
            time.sleep(1)

    def check_height(self, driver, old_height):
        """Report whether the page height differs from ``old_height``.

        :param driver: WebDriver instance showing the page.
        :param old_height: height measured before the scroll.
        :return: True when the current height is different.
        """
        new_height = driver.execute_script(self.selectors.get('height_script'))
        return new_height != old_height

    def expand_comments(self, driver):
        """Click every "more replies" link, then every "see more" link.

        The two passes are independent: a failure while handling one kind
        of link does not prevent the other kind from being handled.

        :param driver: WebDriver instance showing the page.
        :return: None
        """
        for key in ('more_comment_replies', 'comment_see_more_link'):
            self._click_all(driver, self.selectors.get(key))

    def _click_all(self, driver, xpath):
        """Click each element matching ``xpath`` through JavaScript.

        This is best-effort: lookup and click errors are reported on
        stdout instead of being raised.

        :param driver: WebDriver instance showing the page.
        :param xpath: XPath expression selecting the elements to click.
        :return: number of elements clicked without an error.
        """
        try:
            links = driver.find_elements_by_xpath(xpath)
        except Exception as exc:
            # Message: "failed to find elements".
            print(f'查找元素失败: {xpath} ({exc})')
            return 0
        failed = 0
        for link in links:
            try:
                driver.execute_script("arguments[0].click();", link)
            except Exception:
                # One link that cannot be clicked must not stop the rest.
                failed += 1
        if failed:
            # Message: "<n> links could not be clicked".
            print(f'有 {failed} 个链接点击失败: {xpath}')
        return len(links) - failed