FB贴文采集

使用 Selenium 驱动 Chrome 浏览器的方式爬取。

# -*- coding: utf-8 -*-

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
import time


class Selectors:
    """Lookup table of the JavaScript snippets and XPath expressions used by the spider.

    ``selectors`` maps a short name to either a script string that is handed
    to the browser for execution (``*_script`` entries) or an XPath
    expression used to locate comment-related elements.
    """

    selectors = dict(
        # JavaScript: report the current document height.
        height_script="return document.body.scrollHeight",
        # JavaScript: jump to the bottom of the document.
        scroll_script="window.scrollTo(0, document.body.scrollHeight);",
        # XPath: elements carrying the 'commentable_item' class.
        comment_section=".//*[@class='commentable_item']",
        # XPath: anchors that load further comment replies.
        more_comment_replies=".//a[contains(@class,'_4sxc _42ft')]",
        # XPath: anchors that un-truncate a long comment.
        comment_see_more_link=".//a[contains(@class,'_5v47 fss')]",
    )


class FBSpider(object):
    """Selenium-driven scraper that scrolls a Facebook page and saves its HTML.

    The session is authenticated by injecting the ``c_user`` and ``xs``
    cookies into the browser instead of submitting the login form.
    """

    def __init__(self, c_user, xs, total_scrolls, scroll_time, spider_url):
        """Store the scrape configuration; no browser is started here.

        :param c_user: value used for the ``c_user`` cookie.
        :param xs: value used for the ``xs`` cookie.
        :param total_scrolls: number of successful scrolls after which
            scrolling stops.
        :param scroll_time: seconds to wait, after each scroll, for the page
            height to change before scrolling is abandoned.
        :param spider_url: URL of the page to scrape.
        """
        self.c_user = c_user
        self.xs = xs
        self.total_scrolls = total_scrolls
        self.scroll_time = scroll_time
        self.spider_url = spider_url
        self.selectors = Selectors().selectors
        # Created lazily by cookie_login(); released again by run().
        self.driver = None
        self.cookies_list = [
            {"value": f"{self.c_user}", "name": "c_user", "domain": "facebook.com", "path": "/"},
            {"value": f"{self.xs}", "name": "xs", "domain": "facebook.com", "path": "/"}
        ]

    def cookie_login(self):
        """Start Chrome, open facebook.com and inject the login cookies.

        The created driver is stored on ``self.driver``.

        :return: None
        """
        options = Options()
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-infobars")
        options.add_argument("--mute-audio")

        driver_path = ChromeDriverManager().install()
        try:
            # Newer Selenium releases take the driver path through a Service
            # object; the ``executable_path`` keyword was removed there.
            from selenium.webdriver.chrome.service import Service

            self.driver = webdriver.Chrome(service=Service(driver_path), options=options)
        except (ImportError, TypeError):
            # Older Selenium releases reject ``service=``; use the legacy keyword.
            self.driver = webdriver.Chrome(executable_path=driver_path, options=options)

        login_url = "https://www.facebook.com"
        # The cookies are scoped to facebook.com, so that site is opened
        # before they are added.
        self.driver.get(login_url)
        time.sleep(5)  # presumably gives the landing page time to load

        for cookie in self.cookies_list:
            self.driver.add_cookie(cookie)

    def run(self):
        """Scrape the page, write its HTML to ``posts.html`` and close the browser.

        The browser is quit even when scraping or writing fails, so no
        Chrome/chromedriver session is left behind. After this method
        returns, ``self.driver`` is ``None``.

        :return: None
        """
        try:
            self.work()
            with open('posts.html', 'wb') as f:
                f.write(self.driver.page_source.encode('utf8'))
        finally:
            if self.driver is not None:
                try:
                    self.driver.quit()
                finally:
                    self.driver = None

    def work(self):
        """Log in, open the target URL, scroll it and expand hidden comments.

        :return: None
        """
        print('开始登录Facebook')
        self.cookie_login()
        # NOTE: printed unconditionally; the login result is not verified.
        print('登录Facebook成功')
        self.driver.get(self.spider_url)
        time.sleep(5)
        print('开始滑动滚动条')
        self.sliding_scroll_bar(self.driver)
        print('开始将隐藏的评论逐一展开')
        self.expand_comments(self.driver)

    def sliding_scroll_bar(self, driver):
        """Scroll to the bottom of the page repeatedly to load more content.

        Scrolling stops once ``self.total_scrolls`` scrolls have succeeded,
        or as soon as the page height does not change within
        ``self.scroll_time`` seconds after a scroll.

        :param driver: the WebDriver instance to scroll.
        :return: None
        """
        current_scrolls = 0

        # Equality (not >=) is kept on purpose: a ``total_scrolls`` that is
        # never reached means "scroll until the page stops growing".
        while current_scrolls != self.total_scrolls:
            try:
                old_height = driver.execute_script(self.selectors.get('height_script'))
                driver.execute_script(self.selectors.get('scroll_script'))
                # Poll every 0.05 s until the document height differs.
                WebDriverWait(driver, self.scroll_time, 0.05).until(
                    lambda driver: self.check_height(driver, old_height)
                )
            except TimeoutException:
                break
            current_scrolls += 1
            time.sleep(1)

    def check_height(self, driver, old_height):
        """Return True when the current document height differs from ``old_height``.

        :param driver: the WebDriver instance to query.
        :param old_height: height measured before the scroll.
        :return: bool
        """
        new_height = driver.execute_script(self.selectors.get('height_script'))
        return new_height != old_height

    def expand_comments(self, driver):
        """Click every "more replies" link, then every "see more" link.

        This is best-effort: a failure to look up one kind of link is
        reported and does not prevent the other kind from being processed.

        :param driver: the WebDriver instance showing the page.
        :return: None
        """
        # First load additional replies, then un-truncate long comments.
        for key in ('more_comment_replies', 'comment_see_more_link'):
            try:
                self._click_all(driver, self.selectors.get(key))
            except Exception as exc:
                # Report instead of silently hiding the failure.
                print(f'展开评论失败({key}): {exc!r}')

    def _click_all(self, driver, xpath):
        """Click, via JavaScript, every element matched by ``xpath``."""
        # "xpath" is the locator-strategy string behind selenium's By.XPATH;
        # find_elements() is used because the find_elements_by_xpath()
        # shortcut no longer exists in newer Selenium releases.
        for element in driver.find_elements("xpath", xpath):
            try:
                driver.execute_script("arguments[0].click();", element)
            except Exception:
                # Best-effort: an individual link may fail to click (for
                # example if it is no longer attached); skip it.
                continue
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值