百度贴吧数据提取并保存为html文件

要求

网址: https://tieba.baidu.com/f?ie=utf-8&kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&fr=search

需求: 使用 requests 模块爬取此贴吧前 3 页的数据，每一页保存为一个 html 文件，例如 1.html、2.html、3.html。

代码

import os

import requests

# Target board listing URL; the `kw` query parameter is the URL-encoded board name.
url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&fr=search'

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43'
)

# The `pn` offset advances by 50 per page: page 1 -> 0, page 2 -> 50, page 3 -> 100.
POSTS_PER_PAGE = 50

# Seconds to wait for the server before giving up, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 10


def build_headers(cookie=None, acs_token=None):
    """Return the HTTP request headers.

    Args:
        cookie: Optional value for the ``Cookie`` header. Session cookies are
            credentials, so they are passed in rather than hard-coded.
        acs_token: Optional value for the ``Acs-Token`` header.

    Returns:
        A dict that always contains ``User-Agent`` and contains ``Cookie`` /
        ``Acs-Token`` only when a non-empty value was supplied.
    """
    headers = {'User-Agent': USER_AGENT}
    if acs_token:
        headers['Acs-Token'] = acs_token
    if cookie:
        headers['Cookie'] = cookie
    return headers


def page_params(page):
    """Return the query parameters for the zero-based listing page *page*."""
    return {'pn': page * POSTS_PER_PAGE}


def save_pages(page_count=3):
    """Download the first *page_count* listing pages to 1.html, 2.html, ...

    The optional ``TIEBA_COOKIE`` and ``TIEBA_ACS_TOKEN`` environment
    variables are forwarded as the ``Cookie`` and ``Acs-Token`` headers.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    # Headers do not change between pages, so build them once.
    headers = build_headers(
        cookie=os.environ.get('TIEBA_COOKIE'),
        acs_token=os.environ.get('TIEBA_ACS_TOKEN'),
    )
    for page in range(page_count):
        response = requests.get(
            url,
            params=page_params(page),
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        # Fail loudly instead of saving an error page as if it were valid.
        response.raise_for_status()
        filename = f"{page + 1}.html"
        # Write the raw bytes so the saved file matches what the server sent,
        # independent of the encoding requests guesses for `response.text`.
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"{filename} 保存成功.")


if __name__ == "__main__":
    save_pages()

  • 11
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值