1. First, set up Selenium with Chrome and a matching ChromeDriver; installation guides are easy to find online, and the process is straightforward (a quick environment check appears right after this list).
2. Find the URL of the Toutiao account (头条号) you want to scrape; this article uses Guangming Online (光明网) as the example.
3. Inspect the page structure to work out an XPath for each piece of data.
4. Use those XPaths to scrape the article title, read count, comment count, publish time, and full text.
5. Code implementation (with comments):
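Before the full script, here is a minimal sketch to confirm the Selenium + Chrome setup from step 1 is working (it assumes chromedriver is installed and on your PATH; the URL is only an example):

# Minimal check that Selenium can drive Chrome
from selenium.webdriver import Chrome
browser = Chrome()
browser.get('https://www.toutiao.com')  # any reachable page will do
print(browser.title)                    # a printed title means the setup works
browser.quit()

If a Chrome window opens and the page title is printed, the environment is ready. The full script follows.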
# Developer: 小李同学
# Date: 2022/7/11 9:07
import openpyxl
from selenium.webdriver import Chrome
import pandas as pd
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.options import Options
import re
from datetime import datetime, timedelta
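# NOTE: openpyxl refuses to write ASCII control characters into .xlsx cells.
# The pattern below matches the control range (octal \000-\037) except tab,
# newline and carriage return, which Excel does allow.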
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
# URL of the Toutiao account to scrape
url = input('Enter the URL of the Toutiao account to scrape: ')
num_1 = int(input('Enter the number of scroll-downs (more scrolls load more articles but take longer): '))
file_name = input('Enter the output file name: ')
opt = Options()
# Hide the "Chrome is being controlled by automated test software" infobar
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
# Create the browser instance
toutiao = Chrome(options=opt)
toutiao.get(url)
time.sleep(2)
# Click the "文章" (Articles) tab on the account page
toutiao.find_element(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div/div[1]/ul/li[2]').click()
time.sleep(2)
# Scroll to the bottom repeatedly so the page lazy-loads more articles
for _ in range(num_1):
    toutiao.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)
time.sleep(2)
# Scroll back to the top, otherwise the article entries cannot be scraped
toutiao.execute_script('window.scrollTo(0, 0)')
time.sleep(1)
# The div elements that wrap each article entry in the list
div_list = toutiao.find_elements(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div/div[2]/div/div/div')
time.sleep(2)
# Convert a relative timestamp such as '3分钟前', '2 hours ago' or '5天前'
# into a 'YYYY-MM-DD' date string; the keys cover both the Chinese and the
# English labels Toutiao uses. Absolute dates pass through normalized.
def parseTime(timess):
    match = re.search(r'\d+', str(timess))
    if not match:  # e.g. a label with no digits; return the raw text unchanged
        return timess
    value = int(match.group())
    if any(k in timess for k in ('分钟前', '分鐘', '分前', 'min')):
        delta = timedelta(minutes=value)
    elif any(k in timess for k in ('小时前', '小時', 'hour')):
        delta = timedelta(hours=value)
    elif any(k in timess for k in ('天前', 'day')):
        delta = timedelta(days=value)
    elif any(k in timess for k in ('周前', '週', 'week')):
        delta = timedelta(weeks=value)
    elif any(k in timess for k in ('个月前', '個月前', 'month')):
        delta = timedelta(days=value * 30)  # approximate one month as 30 days
    else:
        # Absolute dates such as '2022-07-11': normalize the digits to Y-M-D
        parts = re.findall(r'\d+', str(timess))
        if len(parts) >= 3:
            return parts[0] + '-' + parts[1] + '-' + parts[2]
        return timess
    return (datetime.now() - delta).strftime('%Y-%m-%d')

date_list = []
for div in div_list:
    # Title
    title_name = div.find_element(by=By.XPATH, value='./div/div/div/a').text
    # Read count
    read = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[1]').text
    # Comment count
    comment = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[2]').text
    # Publish time (relative text such as '3小时前'; converted by parseTime when the record is built)
    timess = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[3]').text
    time.sleep(2)
    # Click the title link; the article opens in a new tab
    div.find_element(by=By.XPATH, value='./div/div/div/a').click()
    time.sleep(2)
    # Switch to the article tab to grab the full text
    toutiao.switch_to.window(toutiao.window_handles[1])
    time.sleep(2)
    page_detail = toutiao.find_element(by=By.XPATH, value='//*[@id="root"]/div[2]/div[2]/div[1]/div/article').text
    # Strip control characters that openpyxl refuses to write
    page_detail = ILLEGAL_CHARACTERS_RE.sub(r'', page_detail)
    time.sleep(2)
    # Close the article tab and switch back to the list tab
    toutiao.close()
    toutiao.switch_to.window(toutiao.window_handles[0])
    time.sleep(2)
    record = {
        'title': title_name,
        'reads': read,
        'comments': comment,
        'publish_time': parseTime(timess),
        'content': page_detail
    }
    date_list.append(record)
try:
    # Save the results; newer pandas versions no longer accept an `encoding`
    # argument for DataFrame.to_excel, so it is omitted here
    pd.DataFrame(date_list).to_excel(file_name + '.xlsx', index=False)
except openpyxl.utils.exceptions.IllegalCharacterError:
    print('Illegal characters found')
print('over!!!')
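The fixed time.sleep calls above make the run slow and brittle. As one possible refinement (a sketch, not part of the original script; it assumes a Chrome version recent enough to support the --headless=new flag), Selenium's explicit waits block only until an element actually appears, and the browser can run without a visible window. The XPath is the article-list path reused from the script above:

# Sketch: explicit waits and headless mode instead of fixed sleeps
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

opt = Options()
opt.add_argument('--headless=new')         # run Chrome without a window
browser = Chrome(options=opt)
browser.get('https://www.toutiao.com')     # example URL; use your target page
wait = WebDriverWait(browser, timeout=10)  # give up after 10 seconds
# Block only until the article containers exist, instead of sleeping blindly
entries = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, '//*[@id="root"]/div/div[3]/div[1]/div/div[2]/div/div/div')))
print(len(entries), 'entries found')
browser.quit()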
6. Results
7. This is the first scraper I have written, so there is plenty of room for improvement; feedback and suggestions are very welcome.