【爬虫练习】爬取知乎\百度热搜榜

利用requests模块和re正则爬取知乎/百度热搜榜问题,并将数据保存到本地:知乎热榜保存到Excel表格中,百度热搜保存到txt文本文件中。

一、发送get请求

import requests
import re
import datetime
import openpyxl
import os
# Zhihu hot-list page that the script downloads.
url = 'https://www.zhihu.com/hot'
# HTTP headers sent with the request.
# NOTE(review): the cookie below looks like a captured browser session; consider
# loading it from an environment variable or config file instead of hard-coding it.
headers = {
    # The value is split into two adjacent string literals (joined by Python at
    # compile time) because a single-quoted literal cannot contain a raw line break.
    'cookie': (
        '_zap=6d20b1b3-0e83-4a23-86f8-a1a613065e2b; d_c0="ADCfiUSeQBWPTmcpyWjGq2_W-FiigLWZ2hU=|1657893709"; gdxidpyhxdE=n31M0D3qe4ne5k6SSYZO9O0WzZ0ckhGgIAqjB4ZunhaoHJ5eiW1LHMrPw2o3G%5C3B94Amy0PhOZu6Orpgp9zNUglLauXwWu6ob6z98XcPO%2Fqtts3%2FQuUfUT6JOTkAl8w1uUOAnjjMrN5hvEnyvsPjHhq%2FKfvffS%2FO5qnrWuXi8JqhJRbW%3A1657894613843; _9755xjdesxxd_=32; YD00517437729195%3AWM_NI=Pq7KzWorpm7mhgp9ml1HHrNOgnwSTsHIa5CJ95oVSVOcOgz3%2F%2FiqHe6prkvDpLV6InuvhCkO3d46UEltKIb%2ByfK6dkZ%2B0rwGoGbZNYFMSsrjlQ%2BAGYXg4psVsUedbs1aM2w%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eeb1fc4f90bda1d3d97e8bbc8fa2d44f829e9bb1c859928a83b5d06dedeeb9b8d12af0fea7c3b92a94aff9acf95dadaec08eb667acb9c0daf068f3988893cd45a5b2b9b4d7798c97afd6c242fbeba1b7d25e8dbd8188f939edf1a792cb6ff6b0be8cf550b4949ad7cc66b7aaa8a4ea45f1bcfcd7d84fb0ac85aae121bbba85d7e7619af50093f64dad99aebbcd5386939d94f162b08fe5a2c6539cb9fc91f25eb18bab85e179f3bf83d3f637e2a3; YD00517437729195%3AWM_TID=%2BW0lvvbmt7RAREREFEeBXRFMWM062wCR; captcha_session_v2=2|1:0|10:1657893716|18:captcha_session_v2|88:WFlMeDR3TTNnSGIxelhiOWM2VWhtVHNNaHVsdFIvMFJFMTJDemhMUDl3ZEdVeWNJNGVLV0lWZWg5RDJMY29vag==|d5e1889c5712bd714d1c31a39a489cf6f58be0642702be6f6b46f79758866787; z_c0=2|1:0|10:1657893721|4:z_c0|92:Mi4xYThhZkNnQUFBQUFBTUotSlJKNUFGUmNBQUFCZ0FsVk5XY0ctWXdEcUkxQmctT2tCMWlxTzl4TUZUcFdDN3E1eldR|aa82f3dd2a774fa57cc87dcfb07be71040e3a53dd1919313e0a13a50cd544b97; _xsrf=83abd56b-1867-4269-bb57-960cfd5c91a3; q_c1=357fa7b4d4184eaabd568cee051209e2|1658716117000|1658716117000; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1658040783,1658128773,1658388682,1658716119; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1658716119; ariaDefaultTheme=undefined; NOT_UNREGISTER_WAITING=1; BAIDU_SSP_lcr=https://www.baidu.com/link?url=k6d8dLG71FQpb0v6O3n1IhpIx5uPWk8oDctz-vv11ki&wd=&eqid=9bf2497900047a050000000462ddffd3; SESSIONID=BmmDYzkeV0798gpIq5AEdXMDOfjNQs5GnI6luIlzjHB; JOID=Ul8XC0Jlj6ZJkJcMOW2RO3bSAUIuIOfBDMDqYlY6uJR6qfl7db5_cCOelQMyNcV7d5N13UO9qZ1Ax4lO1mvNbnk=; '
        'osd=Vl0XAU5hjaZDnJMOOWedP3TSC04qIufLAMToYlw2vJZ6o_V_d751fCeclQk-Mcd7fZ9x30O3pZlCx4NC0mnNZHU=; tst=h; KLBRSID=37f2e85292ebb2c2ef70f1d8e39c2b34|1658716150|1658716117'
    ),
    'referer': 'https://www.zhihu.com/hot',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
# Download the hot-list HTML. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() aborts on an HTTP error status
# instead of silently running the regex over an error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
page_data = response.text
# Capture the title="..." attribute that follows each HotItem-content div.
# re.S lets '.' match newlines, so the lazy '.*?' can span multiple lines of markup.
ex = r'<div class="HotItem-content">.*?title="(.*?)" target'
# List of question titles in page order.
question_data = re.findall(ex, page_data, re.S)

知乎热榜问题列表在网页源代码中就可以看到,因此直接获取页面数据后利用正则提取所需内容即可。

二、将数据保存到Excel表中

# Build the timestamp labels from the current local time (fields are not zero-padded).
ti = datetime.datetime.now()
# Hour-resolution label.
time_file = f'{ti.month}月{ti.day}日 {ti.hour}时'
# Minute-resolution label: the hour label followed by the minute.
time_hot = f'{time_file}{ti.minute}分'
# Output location: one workbook per hour; each run adds one worksheet named after the minute.
file_path = './知乎热榜/知乎热榜' + time_file + '.xlsx'
# Make sure the target folder exists; saving the workbook fails if it is missing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Reuse this hour's workbook if it is already on disk, otherwise start a new one
# in memory (no need to save an empty file and immediately reload it).
if os.path.exists(file_path):
    wb = openpyxl.load_workbook(file_path)
else:
    wb = openpyxl.Workbook()
ws = wb.create_sheet(title=time_hot)    # worksheet for this run, titled with the timestamp
ws.cell(row=1, column=1, value='排名')    # header row
ws.cell(row=1, column=2, value='问题')
# Data starts on row 2; rank is the 1-based position in the scraped list.
for rank, question in enumerate(question_data, start=1):
    ws.cell(row=rank + 1, column=1, value=rank)
    ws.cell(row=rank + 1, column=2, value=question)
# Drop the default 'Sheet' worksheet that a new Workbook() starts with.
if 'Sheet' in wb.sheetnames:
    del wb['Sheet']
wb.save(file_path)
wb.close()

利用openpyxl模块保存数据,用内置库datetime生成时间,方便给文件命名区分。用datetime.datetime.now().strftime('%Y/%m/%d/%H:%M:%S')也可以格式化输出时间(注意:该格式中含有“/”和“:”,不能直接用作文件名)。

百度热搜爬取代码

百度热搜爬取原理相同,代码如下:

import requests
import re
import datetime

# Baidu real-time hot-search board.
url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
# Download the board HTML. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() aborts on an HTTP error status
# instead of silently running the regex over an error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
page_data = response.text
# Capture the text inside each "c-single-text-ellipsis" div; the pattern expects
# a single space on each side of the text. re.S lets '.' match newlines.
ex = r'<div class="c-single-text-ellipsis"> (.*?) </div>'
data = re.findall(ex, page_data, re.S)      # list of hot-search titles

# 向列表中奇数位插入换行符
# for i in range(len(data)*2-1):
#     if i % 2 != 0 :
#         data.insert(i, '\n')

# Minute-resolution timestamp (fields not zero-padded) used in the output file name.
ti = datetime.datetime.now()
time_hot = f'{ti.month}月{ti.day}日 {ti.hour}时{ti.minute}分'
file_path = f'./百度热搜_{time_hot}.txt'
with open(file_path, 'w', encoding='utf-8') as f:
    # First line: label plus the full timestamp of this run.
    f.write(f'生成时间:{ti}\n')
    # One entry per line: zero-based index immediately followed by the title.
    f.writelines(f'{n}{title}\n' for n, title in enumerate(data))
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值