线程池获取极目新闻信息,并存入CSV文件
仅供自己练习和学习使用。
请求地址(Base64 编码,使用前需先解码)
aHR0cHM6Ly95dGguY3Rkc2IubmV0L2FtYy9jbGllbnQvbGlzdENvbnRlbnRCeUNvbHVtbg==
分析
1、请求头中需要携带Token值和requestTime值:Token为两次MD5哈希后得到的摘要串(由固定盐值与requestTime拼接计算),requestTime为当前毫秒级时间戳
代码
import base64
import csv
import hashlib
import time
from concurrent.futures import ThreadPoolExecutor

import requests
class JiMuNews:
    """Fetch Jimu News listings with a thread pool and append them to a CSV file.

    Each request carries two extra headers: ``requestTime`` (the current
    time in milliseconds, as a string) and ``Token`` (a two-round MD5 digest
    derived from that timestamp, see :meth:`get_token`).
    """

    # The endpoint is kept Base64-encoded in the source and decoded in __init__.
    ENCODED_URL = 'aHR0cHM6Ly95dGguY3Rkc2IubmV0L2FtYy9jbGllbnQvbGlzdENvbnRlbnRCeUNvbHVtbg=='
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10
    # Output file and the columns written to it, in order.
    CSV_PATH = 'jm_news_info.csv'
    CSV_FIELDS = ['title', 'summary', 'publishTime']

    def __init__(self):
        """Set up the decoded request URL and the base request headers."""
        self.url = self._decode_url(self.ENCODED_URL)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

    @staticmethod
    def _decode_url(value):
        """Return *value* as a usable URL, Base64-decoding it when necessary.

        A value that already starts with ``http://`` or ``https://`` is
        returned unchanged; anything else is treated as Base64 text.
        """
        if value.startswith(('http://', 'https://')):
            return value
        return base64.b64decode(value).decode('utf-8')

    @staticmethod
    def get_token(date_time):
        """Compute the ``Token`` header value for a given timestamp string.

        The token is ``md5("h5Client-id$" + md5("hbrb-app-amc$" + date_time)
        + "$" + date_time)``, with both digests rendered as lowercase hex.

        Args:
            date_time: The ``requestTime`` value sent with the same request.

        Returns:
            A 32-character hexadecimal MD5 digest.
        """
        salt = "hbrb-app-amc"
        h5_str = "h5Client-id"
        inner_digest = hashlib.md5(
            (salt + '$' + date_time).encode('utf-8')
        ).hexdigest()
        outer_input = h5_str + "$" + inner_digest + "$" + date_time
        return hashlib.md5(outer_input.encode('utf-8')).hexdigest()

    def _build_headers(self, date_time):
        """Return a fresh header dict for one request.

        A new dict is built on every call instead of mutating
        ``self.headers``: worker threads share this instance, and writing the
        token and timestamp into the shared dict could pair one request's
        token with another request's timestamp.
        """
        return {
            **self.headers,
            'Token': self.get_token(date_time),
            'requestTime': date_time,
        }

    def get_news_data(self, page):
        """Request one page of the news listing.

        Args:
            page: Page number sent as ``pageNo``.

        Returns:
            The ``data.contentList`` value of the JSON response.

        Raises:
            requests.RequestException: On network failure, timeout or a
                non-2xx HTTP status.
            KeyError: If the response lacks ``data`` or ``contentList``.
        """
        date_time = str(int(time.time() * 1000))
        # NOTE(review): focusNo / publishFlag / column are fixed values taken
        # from the original request; their meaning is not visible here.
        form_data = {
            'focusNo': 5,
            'publishFlag': 1,
            'pageNo': page,
            'pageSize': 20,
            'column': 1476
        }
        response = requests.post(
            url=self.url,
            headers=self._build_headers(date_time),
            data=form_data,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()['data']['contentList']

    def save_data(self, news_list):
        """Append the given news items to the CSV file.

        The file is opened once per batch and the header row is written only
        when the file is still empty. An empty or ``None`` batch is a no-op
        and does not create the file.

        Args:
            news_list: Iterable of mappings containing at least the keys
                ``title``, ``summary`` and ``publishTime``.
        """
        if not news_list:
            return
        with open(self.CSV_PATH, 'a', encoding='utf-8', newline='') as f:
            csv_obj = csv.DictWriter(f, self.CSV_FIELDS)
            # In append mode the position starts at the end of the file, so
            # 0 means nothing has been written yet.
            if f.tell() == 0:
                csv_obj.writeheader()
            for news in news_list:
                news_dict = {field: news[field] for field in self.CSV_FIELDS}
                print('保存数据:', news_dict)
                csv_obj.writerow(news_dict)

    def main(self, first_page=1, last_page=4, max_workers=5):
        """Fetch pages concurrently and save them in page order.

        All pages are submitted to the pool up front so the downloads
        overlap; results are consumed in submission order and written from
        the calling thread, so only one thread ever touches the CSV file.

        Args:
            first_page: First page number to fetch (inclusive).
            last_page: Last page number to fetch (inclusive).
            max_workers: Size of the thread pool.
        """
        pages = range(first_page, last_page + 1)
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            for news_list in pool.map(self.get_news_data, pages):
                self.save_data(news_list)
# Script entry point: run the crawler only when this file is executed directly.
if __name__ == '__main__':
    JiMuNews().main()