这是利用业余时间编写的,基于selenium的微博关键字搜索结果全自动爬虫,支持自定义搜索关键字、搜索起始时间、爬取起始页数(以实现中断后接上次继续爬取)。爬取内容包括微博账号、发文时间、发送平台、微博内容、转发次数、评论次数、点赞次数、原博地址,并导出为CSV文件。自己测试持续稳定运行24小时以上,连续爬取微博数量3W以上。
下面放上目前版本v1.0.3的代码,仅作为个人练手项目,随缘更新:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from lxml import etree
import pandas
import requests
import json
import datetime
import random
import time
import re
import selenium.common.exceptions
# from get_cookie import get_cookie
class GetWeibo:
    """Selenium-driven crawler for Weibo keyword-search results.

    Interactively asks for a keyword, scope (original/all posts), a cut-off
    time and a start page, then walks the search-result list pages, opens
    every post and appends account / post time / platform / text / repost /
    comment / like counts and the post URL to ``weibo_spider.csv``.
    """

    browser_options = Options()
    # Run Chrome headless (no visible browser window).
    browser_options.add_argument("--headless")
    # Sandbox stays enabled; uncomment only when running as root (e.g. Docker).
    # browser_options.add_argument("--no-sandbox")
    # FIX: the deprecated `chrome_options=` keyword (removed in Selenium 4.10)
    # is replaced by `options=` (supported since Selenium 3.8).
    browser = webdriver.Chrome(options=browser_options)
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
                             '/103.0.0.0 Safari/537.36'}
    print("浏览器已成功创建。")

    def __init__(self):
        # Search landing page; all result URLs are derived from it.
        self.base_url = 'https://s.weibo.com/weibo'
        self.keywords = None    # keyword entered by the user
        self.origin = None      # '&scope=ori' (original only) or '&typeall=1' (all)
        self.time_judge = None  # datetime cut-off; newer posts signal rate limiting
        # If the saved cookie has expired, rerun get_cookie() first.
        # get_cookie()
        self.main()

    @staticmethod
    def _normalize_counts(values, placeholder):
        """Normalize raw count texts: placeholder label -> 0, '1.2万' -> 12000.

        FIX: the original tested ``'万' in list`` which checks list membership
        of the exact string '万', not substring presence, so 万-suffixed counts
        were never expanded.
        """
        result = []
        for value in values:
            if value == placeholder:
                result.append(0)
            elif '万' in value:
                result.append(int(float(value[:value.index('万')]) * 10000))
            else:
                result.append(value)
        return result

    def _sleep_past_limit(self, search_times, interval):
        """Sleep just over one hour so Weibo's per-hour request limit resets.

        Pings browser.back() every `interval` seconds to keep the session warm.
        """
        next_time = (datetime.datetime.now() + datetime.timedelta(seconds=3601)).strftime('%Y-%m-%d %H:%M:%S')
        target_time = datetime.datetime.strptime(next_time, '%Y-%m-%d %H:%M:%S')
        now_text = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
        print(f'达到单时段最大次数限制。当前时间是{now_text},目前已抓取{search_times}条微博,下次抓取时间:{next_time},现在睡眠中...')
        while datetime.datetime.now() < target_time:
            time.sleep(interval)
            self.browser.back()

    def _wait_for_next_button(self, search_times, interval=60):
        """Locate the "next page" link, sleeping through rate limits until it appears."""
        while True:
            try:
                return self.browser.find_element(By.XPATH,
                                                 '//div[@class="m-page"]/div/a[@class="next"]')
            except selenium.common.exceptions.NoSuchElementException as error:
                # Button missing usually means the rate-limit interstitial page.
                print(repr(error))
                self._sleep_past_limit(search_times, interval)

    def open_search(self):
        """Ask the user for search options and build the first list-page URL.

        Prompts for keyword, original/all scope, the end time of the 31-day
        search window and the start page, restores the saved login cookies,
        and composes the search URL.

        Returns:
            tuple[str, int]: the first list-page URL and the post counter (0).
        """
        self.browser.get(self.base_url)
        self.browser.delete_all_cookies()
        time.sleep(8)
        print(f'微博搜索页面{self.browser.current_url}已成功打开...')
        kw = self.browser.find_element(By.XPATH, ('//div[@class="searchbox"]/div[@class="search-input"]/'
                                                  'input[@type="text"]'))
        self.keywords = input('请输入微博搜索的关键词,按回车键确认:')
        print(f'搜索关键词为:{self.keywords}。')
        while True:
            self.origin = input('搜索所有微博请输入1,按回车键确认,直接按回车键则只搜索原创微博:')
            if self.origin == '':
                self.origin = '&scope=ori'
                print('仅搜索原创微博。')
                break
            elif self.origin == '1':
                self.origin = '&typeall=1'
                print('搜索全部微博。')
                break
            else:
                print('输入错误,请重新输入。')
                continue
        while True:
            date_time = input('请按年-月-日-时的格式输入抓取微博的发布截止时间(示例:2022-08-03-07),按回车键确认,直接按回车键则截止时间为当前时间:')
            if date_time == '':
                date_format = '%Y-%m-%d-%H'
                # Round up one hour so posts from the current hour are included.
                date_time = datetime.datetime.now().strftime(date_format)
                date_time = (datetime.datetime.strptime(date_time, date_format) + (
                    datetime.timedelta(hours=+1))).strftime(date_format)
                print('截止时间为:当前时间。')
                break
            # Validate YYYY-MM-DD-HH with real month lengths (Feb capped at 28).
            # FIX: the original pattern had a malformed character class for
            # February days (`1[\d|2[0-8]`) and only required the hour part
            # after the February branch; it was also unanchored.
            elif re.match(r'(2\d{3})-'
                          r'('
                          r'((0[13578]|1[02])-(0[1-9]|[12]\d|3[01]))'
                          r'|'
                          r'((0[469]|11)-(0[1-9]|[12]\d|30))'
                          r'|'
                          r'(02-(0[1-9]|1\d|2[0-8]))'
                          r')-'
                          r'([01]\d|2[0-3])$', date_time) is None:
                print('时间格式输入错误,请重新输入!')
                continue
            else:
                print(f'截止时间为:{date_time}。')
                break
        self.time_judge = datetime.datetime.strptime(date_time, '%Y-%m-%d-%H')
        while True:
            page_begin = input('请输入微博列表的抓取起始页(0至50之间),按回车键确认,直接按回车键从第1页开始:')
            if page_begin == '':
                # Empty page parameter => Weibo serves page 1.
                print('抓取起始页为:第1页。')
                break
            # FIX: the original pattern `([1-4]\d|50)` rejected single-digit
            # pages 1-9 and, being unanchored, accepted e.g. '500'.
            elif re.match(r'([1-9]|[1-4]\d|50)$', page_begin) is None:
                print('抓取起始页输入错误,请重新输入!')
                continue
            else:
                print(f'抓取起始页为:第{page_begin}页。')
                page_begin = '&page=' + str(page_begin)
                break
        kw.send_keys(self.keywords)
        click_search = self.browser.find_element(By.XPATH, '//div[@class="searchbox"]/button[@class="s-btn-b"]')
        click_search.click()
        time.sleep(1)
        # Second nav tab switches to the time-ordered post list.
        click_list = self.browser.find_element(By.XPATH, '//div[@class ="m-main-nav"]/ul/li[2]/a')
        click_list.click()
        time.sleep(1)
        print(f'微博列表页面{self.browser.current_url}已成功打开,列表按时间倒序排序。')
        # Restore login cookies saved by get_cookie(); Selenium rejects float
        # 'expiry' values, so coerce them to int first.
        with open('cookies.txt', 'r') as f:
            cookies_list = json.load(f)
            for cookie in cookies_list:
                if isinstance(cookie.get('expiry'), float):
                    cookie['expiry'] = int(cookie['expiry'])
                self.browser.add_cookie(cookie)
        self.browser.refresh()
        date_format = '%Y-%m-%d-%H'
        # Weibo's custom timescope spans at most about one month: start 31 days back.
        date_past = (datetime.datetime.strptime(date_time, date_format) + datetime.timedelta(days=-31)).strftime(
            date_format)
        url = self.browser.current_url
        url_change = re.search(r'(.*)(?=q=)', url)
        # FIX: the query string had '&timescope' HTML-mangled into '×cope'
        # ('&times' rendered as '×'), which broke the time filter entirely.
        url = url_change.group() + (f'q={self.keywords}{self.origin}&suball=1'
                                    f'&timescope=custom:{date_past}:{date_time}&Refer=g{page_begin}')
        print(f'本次抓取的开始时间是:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
        search_times = 0
        return url, search_times

    def auto_search(self, url, search_times):
        """Scrape one result page: visit every post, save rows, move to the next page.

        Args:
            url: the search-result list page to scrape.
            search_times: running count of posts scraped so far.

        Returns:
            tuple[str, int]: the next list-page URL and the updated counter.
        """
        if url != self.browser.current_url:
            self.browser.get(url)
        print(f'微博列表页面{self.browser.current_url}已打开,抓取中...')
        time.sleep(1)
        data = etree.HTML(self.browser.page_source)
        # The permalink node moved from <p class="from"> to <div class="from">
        # in newer layouts; try both.
        post_url = data.xpath('//p[@class="from"]/a[1]/@href')
        if len(post_url) == 0:
            post_url = data.xpath('//div[@class="from"]/a[1]/@href')
        df = pandas.DataFrame(
            columns=['微博账号', '发文时间', '发送平台', '微博内容', '转发次数', '评论次数', '点赞次数', '原博地址'])
        for index, url_single in enumerate(post_url):
            url = 'https:' + url_single
            print(url)
            while True:
                self.browser.get(url)
                time.sleep(1)
                post = etree.HTML(self.browser.page_source)
                names = post.xpath('//a[@usercard]/span[@title]/text()')
                print(names)
                # Post time renders as 'yy-mm-dd hh:mm'; prepend the century.
                time_ = post.xpath('//a[@title][@href][@class][1]/text()')
                time_ = f'20{"".join(time_).strip()}'
                if time_ == '20':
                    # Empty text => page did not render properly; wait and retry.
                    print('解析错误,正在处理...')
                    time.sleep(60)
                    self.browser.back()
                    continue
                elif index == 0:
                    try:
                        time_mark = datetime.datetime.strptime(time_, '%Y-%m-%d %H:%M')
                        if time_mark > self.time_judge:
                            # First post newer than the requested cut-off means
                            # Weibo is ignoring the timescope filter — the
                            # signature of the hourly rate limit. Cool down,
                            # then continue from the next list page.
                            self._sleep_past_limit(search_times, 60)
                            click_next = self._wait_for_next_button(search_times, 120)
                            click_next.click()
                            url = self.browser.current_url
                            return url, search_times
                    except ValueError as VE:
                        print(repr(VE))
                        time.sleep(60)
                        self.browser.back()
                        continue
                break
            print(time_)
            from1 = post.xpath('//div[@class="woo-box-flex"]/div[@title]/text()')
            from2 = post.xpath('//div[@class="woo-box-flex"]/div[contains(@class, "head-info_cut")]/text()')
            from_all = ''.join(from1) + ''.join(from2)
            blogs = ''.join(post.xpath('//div[contains(@class, "detail_text")]/div/text()'))
            forward = post.xpath('//span[@class="woo-pop-ctrl"]/div/span/text()')
            forward = self._normalize_counts(forward, ' 转发 ')
            comments = post.xpath('//div[contains(@class, "woo-box-item-flex toolbar_item")]'
                                  '/div[contains(@class, "woo-box-flex")]/span/text()')
            comments = self._normalize_counts(comments, ' 评论 ')
            likes = post.xpath('//div[contains(@class, "toolbar_likebox")]/button/'
                               'span[@class="woo-like-count"]/text()')
            likes = self._normalize_counts(likes, '赞')
            key_list = ['微博账号', '发文时间', '发送平台', '微博内容', '转发次数', '评论次数', '点赞次数', '原博地址']
            info_list = [names, time_, from_all, blogs, forward, comments, likes, url]
            csv_info = dict(zip(key_list, info_list))
            df1 = pandas.DataFrame(csv_info, columns=key_list)
            df = pandas.concat([df, df1])
            time.sleep(.5)
            self.browser.back()
            search_times += 1
        # Append this page's rows to the CSV (no header on append).
        df.to_csv('weibo_spider.csv', mode='a', encoding='utf_8_sig', header=False, index=False)
        url = self.browser.current_url
        page_num = url.split('page=')[-1]
        if page_num.isdigit() is False:
            page_num = '1'
        page_num = int(page_num)
        _time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
        print(f'已成功提取第{page_num}页微博信息并追加写入CSV文件!当前时间是{_time},目前已抓取{search_times}条微博。')
        if page_num == 50:
            # Weibo search exposes at most 50 pages; restart the search with a
            # timescope ending at the last post seen on this page.
            post = etree.HTML(self.browser.page_source)
            time_last = post.xpath('//p[@class="from"]/a[1]/text()')
            if len(time_last) == 0:
                # FIX: the original line was missing its closing parenthesis,
                # which made the whole module a SyntaxError.
                time_last = post.xpath('//div[@class="from"]/a[1]/text()')
            time_last = time_last[-1]
            nums = re.findall(r'(\d+)', time_last)
            if '年' in time_last:
                # e.g. '2021年07月03日 12:34'
                year_num, mon_num, day_num, hour_num, min_num = nums[0], nums[1], nums[2], nums[3], nums[4]
            elif '今天' in time_last:
                today = datetime.datetime.today()
                year_num = str(today.year)
                mon_num = str(today.month)
                day_num = str(today.day)
                # NOTE(review): indices 2/3 preserved from the original; they
                # assume the '今天 …' text still carries two leading numbers
                # before the hh:mm part — confirm against the live markup.
                hour_num = nums[2]
                min_num = nums[3]
            else:
                # e.g. '07月03日 12:34' (current year implied)
                year_num = str(datetime.datetime.today().year)
                mon_num, day_num, hour_num, min_num = nums[0], nums[1], nums[2], nums[3]
            time_last = year_num + '-' + mon_num + '-' + day_num + ' ' + hour_num + ':' + min_num
            time_last = datetime.datetime.strptime(time_last, '%Y-%m-%d %H:%M')
            time_last = datetime.datetime.strftime(time_last, '%Y-%m-%d-%H')
            date_format = '%Y-%m-%d-%H'
            time_begin = (datetime.datetime.strptime(time_last, date_format) +
                          datetime.timedelta(days=-31)).strftime(date_format)
            time_end = (datetime.datetime.strptime(time_last, date_format) +
                        datetime.timedelta(hours=+1)).strftime(date_format)
            print(f'这是第50页,本页最后一条微博时间为{time_last}。当前时间是{_time},目前已抓取{search_times}条微博。准备跳转页面...')
            url = self.browser.current_url
            url_change = re.search(r'(.*)(?=q=)', url)
            # FIX: '&timescope' had been HTML-mangled to '×cope' here too.
            url = url_change.group() + (f'q={self.keywords}{self.origin}&suball=1'
                                        f'&timescope=custom:{time_begin}:{time_end}&Refer=g&page=1')
            return url, search_times
        click_next = self._wait_for_next_button(search_times, 60)
        click_next.click()
        url = self.browser.current_url
        return url, search_times

    def main(self):
        """Run the interactive setup once, then crawl list pages forever."""
        url, search_times = self.open_search()
        while True:
            url, search_times = self.auto_search(url, search_times)
# Script entry point: constructing GetWeibo immediately launches the whole
# interactive crawl loop from its __init__.
if __name__ == '__main__':
    gt = GetWeibo()
获取cookie,需要登录自己的微博账号,建议手机APP扫码:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
import time
# Visible (non-headless) Chrome instance: the user must scan the login QR code.
browser_options = Options()
# FIX: the deprecated `chrome_options=` keyword (removed in Selenium 4.10)
# is replaced by `options=` (supported since Selenium 3.8).
browser = webdriver.Chrome(options=browser_options)
# Desktop Chrome user-agent, kept for parity with the crawler module.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
                         '/103.0.0.0 Safari/537.36'}
print("浏览器已成功创建。")
def get_cookie(url='https://weibo.com/login.php'):
    """Open the Weibo login page, wait for a manual QR-code login, save cookies.

    Args:
        url: login page to open; defaults to the Weibo desktop login URL.

    Side effects:
        Writes the session cookies as JSON to ``cookies.txt`` in the
        working directory (read back later by the crawler).
    """
    browser.get(url)
    print('请在25秒内,使用微博APP扫码登录你的账号...')
    # Fixed grace period for the user to scan the QR code with the Weibo app.
    time.sleep(25)
    # The with-block closes the file; the original also called f.close()
    # redundantly and round-tripped through json.dumps + write.
    with open('cookies.txt', 'w') as f:
        json.dump(browser.get_cookies(), f)
    print('已成功保存cookie信息。')
# Run the interactive cookie grab when executed as a standalone script.
if __name__ == '__main__':
    get_cookie()