# Change the dates in the code to fetch data for a specific date range.
# You also need to obtain the cookie yourself from the website.
import random
import time
import pandas as pd
import requests
import datetime
from lxml.etree import HTML
from fake_useragent import UserAgent
# Every scraped row so far; write_excel appends here and re-exports the lot.
all_data = []
# Number of times write_excel has saved the workbook.
count = 0

# Browser-like request headers sent with every page fetch.
headers = {
    "authority": "hot.whutech.com",
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "referer": "https://hot.whutech.com/baidu-day-20230522.html",
    # NOTE(review): the "^" sequences in the two sec-ch-ua values look like
    # shell-escape residue from a copied command line - confirm before
    # changing them, since they are sent to the server as-is.
    "sec-ch-ua": "^\\^Google",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "^\\^Windows^^",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/113.0.0.0 Safari/537.36"
    ),
}

# Cookies sent with every request; replace these with values taken from
# your own browser session on the site.
cookies = {
    "PHPSESSID": "7vrgu6bmvp0snosj6eo7locf1l",
    "Hm_lvt_85559b529acdb69a06dd7ba3ec183ab7": "1684746176,1684809088",
    "Hm_lpvt_85559b529acdb69a06dd7ba3ec183ab7": "1684822064",
}
# Dates up to May 19
def date_time(start='20230101', end='20230519'):
    """Crawl one page per calendar day from *start* through *end*, inclusive.

    Each day is formatted as ``YYYYMMDD``, printed, and handed to ``parse``.
    If *start* is later than *end*, nothing is crawled.

    Args:
        start: First day to crawl, as a ``YYYYMMDD`` string.
        end: Last day to crawl (inclusive), as a ``YYYYMMDD`` string.

    Raises:
        ValueError: If either bound is not a valid ``YYYYMMDD`` date.
    """
    first_day = datetime.datetime.strptime(start, '%Y%m%d')
    last_day = datetime.datetime.strptime(end, '%Y%m%d')
    one_day = datetime.timedelta(days=1)

    current_date = first_day
    while current_date <= last_day:
        day = current_date.strftime('%Y%m%d')
        print(day)
        parse(day)
        current_date += one_day
def parse(data, timeout=10):
    """Fetch the ranking page for one day and save every entry found on it.

    Rows are read from consecutive ``div`` positions starting at 2; the first
    position with no title or no heat value ends the page. Each row is printed
    and passed to ``write_excel``.

    A failed request (connection error, timeout, or HTTP error status) is
    reported and the day is skipped.

    Args:
        data: Day to fetch; ``str(data)`` must be a ``YYYYMMDD`` date.
        timeout: Seconds to wait for the server before giving up on this day.

    Raises:
        ValueError: If ``data`` is not a valid ``YYYYMMDD`` date. This is
            checked before any network traffic.
    """
    # The output date is the same for every row on the page, so convert once.
    day = datetime.datetime.strptime(str(data), '%Y%m%d')
    date_str = day.strftime('%Y-%m-%d')

    url = f"https://hot.whutech.com/baidu-day-{data}.html"
    # Random 1-3 second pause between requests.
    time.sleep(random.randint(1, 3))

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(
            url, headers=headers, cookies=cookies, timeout=timeout
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"request failed for {url}: {exc}")
        return

    html = HTML(response.text)
    if html is None:
        # Guard against a missing tree (e.g. an empty body) before using .xpath.
        print(f"no parsable HTML at {url}")
        return

    for i in range(2, 120):
        row = f'/html/body/div[2]/div/div/div[{i}]'
        names = html.xpath(f'{row}/div[2]/a/text()')
        hots = html.xpath(f'{row}/div[3]/text()')
        if not names or not hots:
            # First incomplete row marks the end of the list.
            break

        item = {
            "标题": names[0],
            "热度": hots[0],
            "日期": date_str
        }
        print(item)
        write_excel(item)
def write_excel(item):
    """Record *item* and re-export every collected row to ``baidu.xlsx``.

    The row is appended to the module-level ``all_data`` list, the whole list
    is written to sheet ``Sheet1`` without a header row or index column, and
    the module-level ``count`` is incremented.

    Args:
        item: One scraped row (a dict) to add to the export.
    """
    global count

    all_data.append(item)
    frame = pd.DataFrame(all_data)

    # The writer is opened without a mode argument, so the workbook is
    # rewritten from the full list on every call rather than appended to.
    with pd.ExcelWriter('baidu.xlsx', engine='openpyxl') as writer:
        frame.to_excel(
            writer,
            sheet_name='Sheet1',
            header=False,
            index=False,
        )

    count += 1
    print(f'保存完毕,已保存{count}次')
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    date_time()