Toutiao Account Scraping in Practice

1. First, set up Selenium with Chrome. Setup guides are easy to find online and the process is straightforward; a quick sanity check is sketched below.
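If you just want to confirm the environment works before writing any scraping logic, something like the following should open a page and print its title. This is a minimal sketch that assumes Selenium 4.6 or newer (which fetches a matching ChromeDriver automatically via Selenium Manager); on older versions you need to install chromedriver yourself.

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

opt = Options()
# opt.add_argument('--headless=new')  # optional: run Chrome without a visible window
driver = Chrome(options=opt)
driver.get('https://www.toutiao.com')
print(driver.title)  # if a title prints, Selenium + Chrome are wired up correctly
driver.quit()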


2. Find the URL of the Toutiao account you want to scrape. This article uses Guangming Online (光明网) as the example:

https://www.toutiao.com/c/user/token/MS4wLjABAAAA9Lz0MeLdJDmqpU26Xi9O_M-cYI9z530wjM7eDKvzZTw/?source=feed&log_from=47e4ed6a059e5_1657954170450


3. Inspect the page's DOM to work out the XPath paths for the data you want.

 

4. Use those XPath paths to scrape each article's title, read count, comment count, publish time, and full text, as in the sketch below.
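A quick way to sanity-check the XPath paths is to count how many article cards they match and read one card's fields with relative paths. A minimal sketch, reusing the paths from the full script in step 5 (they depend on Toutiao's current page layout and may need adjusting), and assuming toutiao is a Chrome driver already showing the account's article list:

from selenium.webdriver.common.by import By

cards = toutiao.find_elements(by=By.XPATH,
                              value='//*[@id="root"]/div/div[3]/div[1]/div/div[2]/div/div/div')
print('article cards matched:', len(cards))  # should be > 0 if the container XPath is right
first = cards[0]
print(first.find_element(by=By.XPATH, value='./div/div/div/a').text)                  # title
print(first.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[1]').text)  # read count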


5. Implementation (with comments)

# Author: 小李同学
# Date: 2022/7/11 9:07

import openpyxl

from selenium.webdriver import Chrome
import pandas as pd
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.options import Options
import re
from datetime import datetime, timedelta
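# openpyxl refuses to write control characters (\x00-\x08, \x0b-\x0c, \x0e-\x1f) into .xlsx cells,
# so they are stripped from the article text before saving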
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

# URL of the Toutiao account to scrape
net = input('请输入要访问的头条号的网址:')
num_1 = int(input('请输入滚动的次数(次数越多爬取数据越多,耗时越久):'))
file_name = input('请输入文件名:')
url = net
opt = Options()
# Hide the "Chrome is being controlled by automated test software" notification bar
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
# Create the browser instance
toutiao = Chrome(options=opt)
toutiao.get(url)
time.sleep(2)
# Click the "Articles" tab on the account page
toutiao.find_element(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div/div[1]/ul/li[2]').click()
time.sleep(2)

# Scroll down to load more articles
count = 0
while count < num_1:
    toutiao.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)
    count += 1

time.sleep(2)
# Scroll back to the top, otherwise the article list cannot be read
js = "window.scrollTo(0,0)"
toutiao.execute_script(js)
time.sleep(1)
# The div containers of all loaded articles
div_list = toutiao.find_elements(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div/div[2]/div/div/div')
time.sleep(2)
date_list = []
# print(div_list)
for div in div_list:
    # Title
    title_name = div.find_element(by=By.XPATH, value='./div/div/div/a').text
    # Read count
    read = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[1]').text
    # Comment count
    comment = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[2]').text
    # Publish time
    timess = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[3]').text
    """ print('标题:', title_name)
    print('阅读数:', read)
    print('评论数:', comment)
    print('发布时间:', timess)"""
    time.sleep(2)
    def parseTime(timess):
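        """Convert Toutiao's relative timestamps ('x分钟前', 'x小时前', 'x天前', 'x周前',
        'x个月前' and their traditional-Chinese / English variants) into an absolute
        'YYYY-MM-DD' string; other date strings fall back to digit extraction."""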
        if ('分钟前' in timess) or ('分鐘前' in timess) or ('minute' in timess) or (
                '分鐘' in timess) or ('分前' in timess):
            try:
                minutes = timess[:timess.find('分鐘')]
                minutes = timedelta(minutes=int(minutes))
            except:
                minutes = timess[:timess.find('分钟前')]
                minutes = timedelta(minutes=int(minutes))
            # FormatedTime = (datetime.now() - minutes).strftime('%Y-%m-%d %H:%M')
            FormatedTime = (datetime.now() - minutes).strftime('%Y-%m-%d')
        elif ('mins ago') in timess:
            minutes = timess[:timess.find('mins ago')]
            minutes = timedelta(minutes=int(minutes))
            FormatedTime = (datetime.now() - minutes).strftime('%Y-%m-%d')
        elif ('hours ag') in timess:
            hour = timess[:timess.find('hours ag')]
            hour = timedelta(hours=int(hour))
            FormatedTime = (datetime.now() - hour).strftime('%Y-%m-%d')
        elif ('小时前' in timess) or ('小時前' in timess) or ('hour' in timess) or (
                '小時' in timess):
            try:
                hour = timess[:timess.find('小时前')]
                hour = timedelta(hours=int(hour))
            except:
                hour = timess[:timess.find('小時')]
                hour = timedelta(hours=int(hour))
            # FormatedTime = (datetime.now() - hour).strftime('%Y-%m-%d %H:%M')
            FormatedTime = (datetime.now() - hour).strftime('%Y-%m-%d')
        elif ('天前' in timess) or ('day' in timess):
            day = timess[:timess.find('天前')]
            day = timedelta(days=int(day))
            FormatedTime = (datetime.now() - day).strftime('%Y-%m-%d')
            FormatedTime = re.findall(r'\d+', str(FormatedTime))
            FormatedTime = FormatedTime[0] + '-' + FormatedTime[1] + '-' + FormatedTime[2]
            # return datetime.strptime(FormatedTime, '%Y-%m-%d')
            return FormatedTime
        elif ('周前' in timess) or ('週前' in timess) or ('week' in timess) or (
                '週' in timess):
            try:
                week = timess[:timess.find('周前')]
                week = timedelta(weeks=int(week))
            except:
                week = timess[:timess.find('週')]
                week = timedelta(weeks=int(week))
            FormatedTime = (datetime.now() - week).strftime('%Y-%m-%d')
            FormatedTime = re.findall(r'\d+', str(FormatedTime))
            FormatedTime = FormatedTime[0] + '-' + FormatedTime[1] + '-' + FormatedTime[2]
            # return datetime.strptime(FormatedTime, '%Y-%m-%d')
            return FormatedTime
        elif ('个月前' in timess) or ('個月前' in timess) or ('month' in timess):
            month = timess[:timess.find('个月前')]
            month = timedelta(days=int(month) * 30)
            FormatedTime = (datetime.now() - month).strftime('%Y-%m-%d')
            FormatedTime = re.findall(r'\d+', str(FormatedTime))
            FormatedTime = FormatedTime[0] + '-' + FormatedTime[1] + '-' + FormatedTime[2]
            # return datetime.strptime(FormatedTime, '%Y-%m-%d')
            return FormatedTime
        else:
            try:
                FormatedTime = re.findall(r'\d+', str(timess))
                FormatedTime = FormatedTime[0] + '-' + FormatedTime[1] + '-' + FormatedTime[2]
                # FormatedTime = datetime.strptime(FormatedTime, '%Y-%m-%d')
                return FormatedTime
            except Exception as e_time:
                print(e_time)
                return timess
        FormatedTime = re.findall(r'\d+', str(FormatedTime))
        # FormatedTime = FormatedTime[0] + '-' + FormatedTime[1] + '-' + FormatedTime[2] + ' ' + FormatedTime[
        #     3] + ':' + FormatedTime[4]
        FormatedTime = FormatedTime[0] + '-' + FormatedTime[1] + '-' + FormatedTime[2]
        # return datetime.strptime(FormatedTime, '%Y-%m-%d %H:%M')
        return FormatedTime
    # Open the article (it opens in a new browser tab)
    div.find_element(by=By.XPATH, value='./div/div/div/a').click()
    time.sleep(2)
    # Switch to the article tab to grab the full text
    toutiao.switch_to.window(toutiao.window_handles[1])
    time.sleep(2)
    page_detail = toutiao.find_element(by=By.XPATH, value='//*[@id="root"]/div[2]/div[2]/div[1]/div/article').text
    # Strip characters that cannot be written to the Excel file
    page_detail = ILLEGAL_CHARACTERS_RE.sub(r'', page_detail)
    # print('原文内容:', page_detail)
    time.sleep(2)
    # Close the article tab
    toutiao.close()
    # Switch back to the original (article list) window
    toutiao.switch_to.window(toutiao.window_handles[0])
    time.sleep(2)
    record = {
        '标题': title_name,
        '阅读数': read,
        '评论数': comment,
        '发布时间': parseTime(timess),
        '原文内容': page_detail
    }
    date_list.append(record)
    try:
        # Save everything collected so far (the file is rewritten on each iteration)
        pd.DataFrame(date_list).to_excel(file_name + '.xlsx', index=False)
    except openpyxl.utils.exceptions.IllegalCharacterError:
        print('出现非法字符')

# Close the browser when done
toutiao.quit()
print('over!!!')

6. Results

7. This is my first scraper, so there is plenty of room for improvement; suggestions and corrections are welcome. One concrete improvement is sketched below.
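One obvious place to harden the script is the fixed time.sleep calls: they waste time when the page loads quickly and can still be too short when it loads slowly. Selenium's explicit waits are the usual replacement. A minimal sketch, reusing the toutiao driver, the By import, and the article XPath from the script above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(toutiao, 15)  # wait at most 15 seconds
article = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="root"]/div[2]/div[2]/div[1]/div/article')))
print(article.text[:100])  # first 100 characters of the article body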
