百度热搜爬虫,存到excel表格中

修改代码中的起止日期,即可获取指定时间范围内的热搜数据

还需要自己去网页获取cookie

import random
import time

import pandas as pd
import requests
import datetime
from lxml.etree import HTML
from fake_useragent import UserAgent

# Every row scraped so far; write_excel() appends to this list and exports
# the whole list on each call.
all_data = []

# Number of completed write_excel() calls; used for the progress message.
count = 0
# HTTP headers sent with every page request made by parse().
# NOTE(review): the "sec-ch-ua" and "sec-ch-ua-platform" values contain
# "^\^" sequences that look like leftover shell escaping rather than real
# header values - confirm and replace with the browser's actual values.
# NOTE(review): "referer" always points at the 2023-05-22 page, whatever
# date is being requested - confirm the site does not care.
headers = {
    "authority": "hot.whutech.com",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "referer": "https://hot.whutech.com/baidu-day-20230522.html",
    "sec-ch-ua": "^\\^Google",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "^\\^Windows^^",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
# Cookies sent with every page request made by parse().
# NOTE(review): these look like session values captured from a browser and
# presumably expire - obtain fresh ones from the site before running.
cookies = {
    "PHPSESSID": "7vrgu6bmvp0snosj6eo7locf1l",
    "Hm_lvt_85559b529acdb69a06dd7ba3ec183ab7": "1684746176,1684809088",
    "Hm_lpvt_85559b529acdb69a06dd7ba3ec183ab7": "1684822064"
}


def date_time(start='20230101', end='20230519'):
    """Crawl the hot-search page of every day in an inclusive date range.

    The defaults reproduce the window that used to be hard-coded here
    (1 January 2023 through 19 May 2023), so calling ``date_time()`` with
    no arguments behaves as before.

    Args:
        start: First day to fetch, as a ``YYYYMMDD`` string.
        end: Last day to fetch (inclusive), as a ``YYYYMMDD`` string.

    Raises:
        ValueError: If ``start`` or ``end`` does not match ``%Y%m%d``.
    """
    day_format = '%Y%m%d'
    start_date = datetime.datetime.strptime(start, day_format)
    end_date = datetime.datetime.strptime(end, day_format)

    current_date = start_date
    while current_date <= end_date:
        # Format once and reuse for both the progress line and the request.
        data = current_date.strftime(day_format)
        print(data)
        parse(data)
        current_date += datetime.timedelta(days=1)


def parse(data):
    """Fetch one day's hot-search page and save every ranked entry on it.

    Each entry found is printed and passed to ``write_excel``. A malformed
    date, a failed request, or an unparsable page is reported and the day
    is skipped, so that one bad day does not abort a long crawl.

    Args:
        data: The day to fetch as a ``YYYYMMDD`` string. It is inserted
            into the page URL and reformatted as ``YYYY-MM-DD`` for the
            "日期" field of each saved row.
    """
    try:
        # The date is identical for every row of the page: convert it once.
        new_date_str = datetime.datetime.strptime(
            str(data), '%Y%m%d'
        ).strftime('%Y-%m-%d')
    except ValueError as e:
        print(e)
        return

    url = f"https://hot.whutech.com/baidu-day-{data}.html"
    # Random 1-3 second pause before each request.
    time.sleep(random.randint(1, 3))
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(
            url, headers=headers, cookies=cookies, timeout=10
        )
    except requests.RequestException as e:
        print(f'请求失败,跳过 {data}: {e}')
        return

    html = HTML(response.text)
    if html is None:
        # Defensive: no element tree to query, so there is nothing to save.
        print(f'页面解析失败,跳过 {data}')
        return

    # Entries are read from the 2nd through 119th child <div> of the list
    # container; the first position without a title/heat ends the scan.
    for i in range(2, 120):
        row = f'/html/body/div[2]/div/div/div[{i}]'
        try:
            name = html.xpath(f'{row}/div[2]/a/text()')[0]
            hot = html.xpath(f'{row}/div[3]/text()')[0]
        except IndexError as e:
            # An empty XPath result means there are no more entries.
            print(e)
            break

        item = {
            "标题": name,
            "热度": hot,
            "日期": new_date_str,
        }
        print(item)
        write_excel(item)


def write_excel(item):
    """Record one scraped row and re-export all rows to ``baidu.xlsx``.

    The row is appended to the module-level ``all_data`` list, and the
    whole list is then written to sheet ``Sheet1`` without a header row or
    an index column. No append mode is requested from the writer, so the
    workbook is produced afresh from ``all_data`` on every call. The
    module-level ``count`` is incremented and a progress line is printed.

    Args:
        item: Mapping describing a single row.
    """
    global count

    all_data.append(item)
    frame = pd.DataFrame(all_data)

    workbook = pd.ExcelWriter('baidu.xlsx', engine='openpyxl')
    with workbook:
        frame.to_excel(workbook, sheet_name='Sheet1', header=False, index=False)

    count = count + 1
    print(f'保存完毕,已保存{count}次')


# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    date_time()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值