python+selenium爬虫搜索今日头条文章并爬取文章相关数据（点赞、评论等）

最新推荐文章于 2025-03-10 09:53:41 发布

淅淅的雨声

最新推荐文章于 2025-03-10 09:53:41 发布

阅读量4.9k

点赞数 5

分类专栏：爬虫　文章标签：python

本文链接：https://blog.csdn.net/weixin_44879975/article/details/119514478

版权

爬虫专栏收录该内容

4 篇文章

订阅专栏

该博客详细介绍了如何使用Python的Selenium库自动化爬取今日头条的文章信息，包括登录模块、判断元素是否存在、处理评论数量等复杂情况。通过编写不同函数分别处理文章的原创声明、评论状态，并对页面信息进行提取，最终将数据存储到CSV文件中。整个过程展示了爬虫技术在信息抓取中的应用和问题解决策略。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

首先需要一个登录模块。由于今日头条不需要登录就可以搜索，因此这里不登录，只打开浏览器并获取页面：

# Open the browser (without logging in)
def login():
    """Start a Chrome session on the Toutiao home page and return the driver.

    No credentials are entered; the function only opens the page.

    Returns:
        The `Chrome` driver instance, maximized and already navigated to
        https://www.toutiao.com/.
    """
    chrome_opts = ChromeOptions()
    # Exclude the "enable-automation" switch (presumably so the session looks
    # less like an automated browser -- confirm against the site's behaviour).
    chrome_opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = Chrome(options=chrome_opts)
    browser.maximize_window()
    browser.get('https://www.toutiao.com/')
    return browser

获取时间的部分，有些文章有原创声明，有些没有，导致xpath路径不一致，因此写一个函数来判断是否有原创声明

# Check whether the article carries an "original content" badge
def is_Elementexist(web):
    """Return True when the article meta line contains an "原创" badge.

    The badge, when present, occupies the first <span> of the meta line, so
    callers need this answer to know which <span> holds the publish time.

    Args:
        web: Selenium driver focused on an article detail page.

    Returns:
        bool: True if at least one matching <span> is present, else False.

    Driver errors (anything other than "no such element") propagate to the
    caller instead of being reported as "badge absent".
    """
    # find_elements() returns an empty list when nothing matches, so no
    # exception handling is needed. 'xpath' is the value of selenium's
    # By.XPATH; the generic (by, value) form is used because the
    # find_element_by_* shortcuts are not available in Selenium >= 4.3.
    badges = web.find_elements(
        'xpath',
        '//*[@id="root"]/div[@class="article-detail-container"]/div[@class="main"]/div[@class="article-content"]/div[@class="article-meta"]/span[contains(text(),"原创")]')
    return bool(badges)

判断评论的部分就比较复杂了，分为三种情况：
第一种：没有评论；
第二种：有三条及三条以下评论，此时会在文章下方完全显示；
第三种：有三条以上评论，需要点击更多来获取剩余的评论，此类又有两种情况：剩余评论较少，一次完全展开；剩余评论较多，还需要展开多次；

因此使用三个函数来判断三种情况：

# Check whether the article has any comment shown inline
def isno_Commentexist(web):
    """Return True when at least one comment is listed under the article.

    NOTE: despite the "isno_" prefix, True means comments DO exist.

    Args:
        web: Selenium driver focused on an article detail page.

    Returns:
        bool: True if the inline comment list holds one or more <li> items,
        else False.

    Driver errors propagate to the caller instead of being reported as
    "no comments".
    """
    # find_elements() yields [] when nothing matches, so no try/except is
    # required. 'xpath' is the value of selenium's By.XPATH; the generic
    # (by, value) form is used because the find_element_by_* shortcuts are
    # not available in Selenium >= 4.3.
    inline_items = web.find_elements(
        'xpath',
        '//*[@id="comment-area"]/ul[@class="comment-list"]/li')
    return bool(inline_items)

# Check whether the article has more than three comments
def is_Commentexist(web):
    """Return True when the "open comment drawer" button is present.

    Per the accompanying article text, the button appears only when an
    article has more than three comments; the remaining comments are then
    read from the side drawer it opens.

    Args:
        web: Selenium driver focused on an article detail page.

    Returns:
        bool: True if the drawer button is found, else False.

    Driver errors propagate to the caller instead of being reported as
    "button absent".
    """
    # find_elements() yields [] when nothing matches, so no try/except is
    # required. 'xpath' is the value of selenium's By.XPATH; the generic
    # (by, value) form is used because the find_element_by_* shortcuts are
    # not available in Selenium >= 4.3.
    drawer_buttons = web.find_elements(
        'xpath',
        '//*[@id="root"]/div[@class="article-detail-container"]/div[@class="main"]/div[3]/div[@class="ttp-comment-block"]/button[@class="side-drawer-btn"]')
    return bool(drawer_buttons)

# Check whether the comment drawer still offers a "load more" button
def is_CommentExists(web):
    """Return True while the open comment drawer shows a "load more" button.

    Args:
        web: Selenium driver with the comment side drawer open.

    Returns:
        bool: True if the drawer's load-more button is found, else False.

    Driver errors propagate to the caller instead of being reported as
    "nothing more to load".
    """
    # find_elements() yields [] when nothing matches, so no try/except is
    # required. 'xpath' is the value of selenium's By.XPATH; the generic
    # (by, value) form is used because the find_element_by_* shortcuts are
    # not available in Selenium >= 4.3.
    load_more_buttons = web.find_elements(
        'xpath',
        '/html/body/div[@class="ttp-portal-wrapper ttp-drawer-wrapper ttp-comment-drawer ttp-portal-show ttp-portal-anime-show"]/div[@class="ttp-drawer"]/div[@class="body"]/button[@class="load-more-btn"]')
    return bool(load_more_buttons)

各个函数完成以后，就可以写函数对页面信息进行提取了：

def login():
    """Open Toutiao's home page in a maximized Chrome window; no sign-in.

    This repeats the `login` definition that appears earlier in the file;
    being the later of the two, it is the one bound to the name at import.

    Returns:
        The `Chrome` driver after it has loaded https://www.toutiao.com/.
    """
    home_page = 'https://www.toutiao.com/'
    opts = ChromeOptions()
    # Leave the "enable-automation" switch out of Chrome's launch switches.
    opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = Chrome(options=opts)
    driver.maximize_window()
    driver.get(home_page)
    return driver

# Scrape every article on the current search-result page; `count` is the
# running total of scraped rows.

# Locator strategy; equal to selenium's By.XPATH. The generic
# find_element(by, value) form is used because the find_element_by_*
# shortcuts are not available in Selenium >= 4.3.
_XPATH = 'xpath'

# Shared XPath prefixes for the article detail page.
_ARTICLE_ROOT = '//*[@id="root"]/div[@class="article-detail-container"]'
_ARTICLE_CONTENT = _ARTICLE_ROOT + '/div[@class="main"]/div[@class="article-content"]'
_ARTICLE_META = _ARTICLE_CONTENT + '/div[@class="article-meta"]'
_SIDE_INTERACTION = _ARTICLE_ROOT + '/div[@class="left-sidebar"]/div/div[2]/div[@class="detail-side-interaction"]'
_DRAWER_BUTTON = _ARTICLE_ROOT + '/div[@class="main"]/div[3]/div[@class="ttp-comment-block"]/button[@class="side-drawer-btn"]'
_DRAWER_BODY = '/html/body/div[@class="ttp-portal-wrapper ttp-drawer-wrapper ttp-comment-drawer ttp-portal-show ttp-portal-anime-show"]/div[@class="ttp-drawer"]/div[@class="body"]'
# Prefix relative to one comment <li>.
_COMMENT_INFO = './div[@class="ttp-comment-item"]/div[@class="comment-info"]'

# Upper bound on "load more" clicks; more clicks are easily detected as a bot.
_MAX_LOAD_MORE_CLICKS = 5


def _collect_comments(web):
    """Return a list of (user, text, time) tuples for the visible comments."""
    if is_Commentexist(web):
        # More than three comments: they live in a side drawer that must be
        # opened and then expanded by repeatedly pressing "load more".
        web.find_element(_XPATH, _DRAWER_BUTTON).click()
        time.sleep(1)
        clicks = 0
        while clicks < _MAX_LOAD_MORE_CLICKS and is_CommentExists(web):
            web.find_element(_XPATH, '/html/body/div[2]/div[2]/div[2]/button').click()
            time.sleep(1)
            clicks += 1
        items = web.find_elements(
            _XPATH,
            _DRAWER_BODY + '/div[@class="ttp-comment-wrapper"]/ul[@class="comment-list"]/li')
    elif isno_Commentexist(web):
        # Three comments or fewer: all of them are rendered under the article.
        items = web.find_elements(_XPATH, '//*[@id="comment-area"]/ul[@class="comment-list"]/li')
    else:
        return []

    return [
        (
            li.find_element(_XPATH, _COMMENT_INFO + '/div[@class="header"]/div[@class="user-info"]/a/div[@class="user-name"]/span[@class="name"]').text,
            li.find_element(_XPATH, _COMMENT_INFO + '/div[@class="body"]/p[@class="content"]').text,
            li.find_element(_XPATH, _COMMENT_INFO + '/div[@class="footer"]/div[@class="left-action"]/span[@class="time"]').text,
        )
        for li in items
    ]


def _scrape_article(web):
    """Read the article page currently focused by `web` into one CSV row."""
    author = web.find_element(_XPATH, _ARTICLE_META + '/span[@class="name"]/a').text
    # The "original" badge is sometimes absent; when present it takes the
    # first <span>, which pushes the publish time to the second one.
    if is_Elementexist(web):
        original_flag = web.find_element(_XPATH, _ARTICLE_META + '/span[1]').text
        published = web.find_element(_XPATH, _ARTICLE_META + '/span[2]').text
    else:
        original_flag = '无'
        published = web.find_element(_XPATH, _ARTICLE_META + '/span[1]').text
    title = web.find_element(_XPATH, _ARTICLE_CONTENT + '/h1').text
    body = web.find_element(_XPATH, _ARTICLE_CONTENT + '/article').text
    likes = web.find_element(_XPATH, _SIDE_INTERACTION + '/div[@class="detail-like"]/span').text
    comment_total = web.find_element(_XPATH, _SIDE_INTERACTION + '/div[@class="detail-interaction-comment"]/span').text

    row = [author, published, original_flag, title, body, likes, comment_total]
    comments = _collect_comments(web)
    if comments:
        # Each comment contributes three trailing cells: user, text, time.
        for comment in comments:
            row.extend(comment)
    else:
        row.append('无')
    return row


def _return_to_results(web, results_handle, handles_before):
    """Close windows opened since `handles_before`, then refocus the results."""
    for handle in web.window_handles:
        if handle not in handles_before:
            web.switch_to.window(handle)
            web.close()
    web.switch_to.window(results_handle)


def page_spider(web, write_obj, count):
    """Scrape each article linked from the current search-result page.

    For every result card the article is opened in a new window, one row is
    written to `write_obj`, and the window is closed again. A row holds
    author, publish time, original flag, title, body, like count and comment
    count, followed by (user, text, time) for each collected comment or a
    single '无' when there are none.

    An article that fails to scrape is skipped after a 10-second pause. Its
    window is still closed, so later articles are never read from a stale
    window.

    Args:
        web: Selenium driver whose current window is the search-result page
            (assumed from the result-list lookup below -- confirm with the
            caller).
        write_obj: object with a `writerow(list)` method, e.g. `csv.writer`.
        count: number of rows written before this call.

    Returns:
        int: `count` plus the number of rows written by this call. Skipped
        articles are not counted, and an empty result page leaves `count`
        unchanged.
    """
    div_list = web.find_elements(
        _XPATH,
        '/html/body/div[@class="main hide-side-list"]/div[@class="s-result-list"]/div')
    results_handle = web.current_window_handle
    written = 0
    # The last <div> is skipped: presumably it is the pagination bar rather
    # than an article card -- TODO confirm against the live page.
    for div in div_list[:-1]:
        handles_before = set(web.window_handles)
        try:
            # Open the article; the site opens it in a new window.
            div.find_element(_XPATH, './div[1]/div/div[@class="cs-view cs-view-block cs-card"]/div[@class="cs-view cs-view-block cs-card-content"]/div[@class="cs-view pad-bottom-3 cs-view-block cs-text align-items-center cs-header"]/div[@class="flex-1 text-darker text-xl text-medium d-flex align-items-center overflow-hidden"]/a').click()
            opened = [h for h in web.window_handles if h not in handles_before]
            if not opened:
                raise RuntimeError('article link did not open a new window')
            web.switch_to.window(opened[-1])
            write_obj.writerow(_scrape_article(web))
            written += 1
            # Pause three seconds per article so the site does not block us
            # for browsing too fast.
            time.sleep(3)
        except Exception:
            # Best effort: skip this article and back off before the next one.
            time.sleep(10)
        _return_to_results(web, results_handle, handles_before)

    return count + written

# Crawl the search results page by page (one page holds ten articles)
def spider(web, hot_topic):
    """Scrape a user-chosen number of result pages into `<hot_topic>.csv`.

    The CSV is (re)created in the working directory, a header row is
    written, and the user is asked on stdin how many pages to crawl. For
    each page, `page_spider` writes the article rows; then the "next page"
    link is clicked and the function waits three seconds for the new page.

    NOTE(review): the header names two comment columns, while `page_spider`
    appends three cells (user, text, time) per comment, so comment cells do
    not line up with the header.

    Args:
        web: Selenium driver positioned on the first search-result page.
        hot_topic: search keyword; also used as the CSV file's base name.

    Raises:
        ValueError: if the page count typed by the user is not an integer.
    """
    count = 0
    # The `with` block guarantees the CSV is flushed and closed even when
    # scraping raises part-way through.
    with open(hot_topic + '.csv', mode='w', newline='', encoding='utf-8') as f:
        write_obj = csv.writer(f)
        write_obj.writerow(['用户名','发布时间','原创/无','标题','内容','点赞','评论','评论用户','评论内容'])
        page_num = int(input("需要爬取几页数据："))
        for _ in range(page_num):
            # Scrape the page and pick up the running row total.
            count = page_spider(web, write_obj, count)
            print("已爬取{}条数据！".format(count))
            # Go to the next page. 'xpath' is the value of selenium's
            # By.XPATH; the generic find_element(by, value) form is used
            # because the find_element_by_* shortcuts are not available in
            # Selenium >= 4.3.
            web.find_element(
                'xpath',
                '/html/body/div[@class="main hide-side-list"]/div[@class="s-result-list"]/div[@class="result-content"]/div/div[@class="cs-view pad-bottom-12 cs-view-block bar_3xTirN"]/div[@class="cs-view cs-view-block cs-pagination"]/a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]').click()
            # Give the new page three seconds to load.
            time.sleep(3)