一、用Python中requests爬取百度热搜及指数,用到了正则解析
步骤:
- 导入相关的库
import csv
import re

import requests
- 发送请求及响应数据
# Target page and the mobile-browser User-Agent sent with the request.
# NOTE(review): this URL is a zhaopin.com page, while the article and the
# patterns used in the parsing step target Baidu hot-search markup --
# confirm the intended address.
url = 'https://m.zhaopin.com/sou/jl864/kwCLO66RII0PJP0NG8/p1'
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Mobile Safari/537.36',
}
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on HTTP errors instead of parsing an error page.
response.raise_for_status()
# Decode to str (the regexes that consume `content` are str patterns), using
# the encoding detected from the body so Chinese text is not garbled.
response.encoding = response.apparent_encoding
content = response.text
- 用正则解析页面,并将热搜及指数做成列表
# The patterns below are str patterns, so make sure the page is str as well.
if isinstance(content, bytes):
    content = content.decode('utf-8', errors='replace')

# Non-greedy capture of each entry's inner text; re.S lets "." span newlines.
hot_searches = re.findall(r'<div class="c-single-text-ellipsis">(.*?)</div>', content, re.S)
# Strip the whitespace/newlines that surround the text inside the tag.
search_1 = [search.strip() for search in hot_searches]
print(search_1)

hot_indexes = re.findall(r'<div class="hot-index_1Bl1a">(.*?)</div>', content, re.S)
index_1 = [index.strip() for index in hot_indexes]
print(index_1)
- 将热搜及指数生成字典
# Pair each title with its index by position; zip() stops at the shorter list.
hot_sum = [
    {"热搜": search, "热搜指数": index}
    for search, index in zip(search_1, index_1)
]
5.最终将字典保存为csv文件
# Fixed column names: avoids indexing hot_sum[0], which raises IndexError
# when nothing was scraped, and keeps the column order explicit.
fieldnames = ["热搜", "热搜指数"]
# newline='' is what the csv module expects so it can control line endings.
with open(r"C:\Desktop\one\百度热搜词汇_正则爬取.csv", 'w', newline='', encoding='utf-8') as fp:
    writer = csv.DictWriter(fp, fieldnames, delimiter='|')
    writer.writeheader()
    writer.writerows(hot_sum)
6.完整代码汇总
# _*_ coding: utf-8 _*_
# 时间:2024/8/20 21:53
# 1. Import the required libraries.
import csv
import re

# 2. Request settings.
# NOTE(review): this URL is a zhaopin.com page, while the patterns below
# target Baidu hot-search markup -- confirm the intended address.
PAGE_URL = 'https://m.zhaopin.com/sou/jl864/kwCLO66RII0PJP0NG8/p1'
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Mobile Safari/537.36',
}

# 3. Patterns for the hot-search title and its index. The capture is
# non-greedy and re.S lets "." span newlines inside the tag.
TITLE_PATTERN = re.compile(r'<div class="c-single-text-ellipsis">(.*?)</div>', re.S)
INDEX_PATTERN = re.compile(r'<div class="hot-index_1Bl1a">(.*?)</div>', re.S)

# 5. CSV output settings.
CSV_FIELDS = ["热搜", "热搜指数"]
CSV_PATH = r"C:\Desktop\one\百度热搜词汇_正则爬取.csv"


def fetch_html(url, headers, timeout=10):
    """Download *url* and return the page body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait before giving up on the server.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-success HTTP status.
    """
    # Imported here so the parsing/saving helpers stay usable without requests.
    import requests

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Use the encoding detected from the body so Chinese text is not garbled.
    response.encoding = response.apparent_encoding
    return response.text


def parse_hot_searches(html):
    """Extract hot-search titles and their indexes from *html*.

    Args:
        html: Page source as ``str``.

    Returns:
        A ``(titles, indexes)`` tuple of two lists of stripped strings, in
        the order the entries appear in the page.
    """
    titles = [title.strip() for title in TITLE_PATTERN.findall(html)]
    indexes = [index.strip() for index in INDEX_PATTERN.findall(html)]
    return titles, indexes


def build_records(titles, indexes):
    """Pair titles with indexes by position into a list of row dicts.

    ``zip`` stops at the shorter input, so unmatched trailing items are
    dropped.
    """
    return [
        {"热搜": title, "热搜指数": index}
        for title, index in zip(titles, indexes)
    ]


def save_csv(records, path):
    """Write *records* to *path* as a ``|``-delimited UTF-8 CSV with a header row."""
    # newline='' is what the csv module expects so it can control line endings.
    with open(path, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.DictWriter(fp, CSV_FIELDS, delimiter='|')
        writer.writeheader()
        writer.writerows(records)


def main():
    """Fetch the page, parse the hot searches and save them to the CSV file."""
    html = fetch_html(PAGE_URL, REQUEST_HEADERS)

    search_1, index_1 = parse_hot_searches(html)
    print(search_1)
    print(index_1)

    # 4. Turn the titles and indexes into one dict per row.
    hot_sum = build_records(search_1, index_1)
    if not hot_sum:
        # The field names are fixed, so an empty result still yields a valid
        # header-only file instead of an IndexError on hot_sum[0].
        print("No hot-search entries matched; only the header row is written.")

    save_csv(hot_sum, CSV_PATH)


if __name__ == "__main__":
    main()
7.最终结果展示: