项目场景:
爬取微博主页项目导航栏:
导入一些模块:
import requests
import csv
import numpy as np
import os
问题描述:
爬取每个导航项的三个参数:
title(导航名称)
gid
containerid
并将结果保存到 csv 文件。
def parse_json(response):
    """Write one CSV row per navigation entry found in the API response.

    Reads the ``group`` lists of ``response['groups'][3]`` and
    ``response['groups'][4]`` and passes ``[title, gid, containerid]`` of
    every entry to ``writerRow``.

    Args:
        response: Decoded JSON mapping, or ``None`` when there is nothing to
            parse, in which case the call is a no-op.

    Raises:
        KeyError, IndexError: If the mapping lacks the expected
            ``groups`` / ``group`` / per-entry keys.
    """
    if response is None:
        # Subscripting None would raise TypeError; treat it as "no data".
        return
    groups = response['groups']
    # Both lookups are evaluated before any row is written, so a malformed
    # response fails before the CSV file is touched.
    navList = [*groups[3]['group'], *groups[4]['group']]
    for nav in navList:
        writerRow([nav['title'], nav['gid'], nav['containerid']])
完整代码:
import requests
import csv
import numpy as np
import os
def init():
    """Create ``./navData.csv`` containing only the header row.

    If the file already exists it is left untouched.
    """
    try:
        # Mode 'x' creates the file only when it is absent, so the existence
        # check and the creation happen in a single step.
        csvFile = open('./navData.csv', 'x', encoding='utf-8', newline='')
    except FileExistsError:
        return
    with csvFile:
        csv.writer(csvFile).writerow([
            'typeName',
            'gid',
            'containerid',
        ])
def writerRow(row):
    """Append ``row`` as a single record to ``./navData.csv``.

    Args:
        row: Iterable of field values making up one CSV record.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open('./navData.csv', 'a', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(row)
def get_data(url):
    """Request ``url`` with fixed headers and return the decoded JSON body.

    Args:
        url: Endpoint to request. ``is_new_segment=1`` and ``fetch_hot=1``
            are always sent as query parameters.

    Returns:
        The decoded JSON body when the server answers with HTTP 200 and the
        body is valid JSON; otherwise ``None``.

    Raises:
        requests.RequestException: On connection failures or when the
            request exceeds the timeout.
    """
    headers = {
        # NOTE(review): a session cookie is committed in source here. It
        # should be rotated and loaded from configuration instead.
        'Cookie': 'XSRF-TOKEN=zASpYIx0oUosfBlB0MsTSRdi; SSOLoginState=1704083302; SUB=_2A25Ilk82DeThGeBI71US9yzKzzuIHXVr6s7-rDV8PUJbkNB-LWXlkW1NRpId-Znw75c-wagHUOjJucjoob6tHv3U; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZM5jTZLaMAANadOdO6n405NHD95QcSoBNe0MESoBNWs4DqcjPi--Xi-i2iK.4i--NiK.XiKLsS0e4eo-t; WBPSESS=Ii9Wh36g6mj5Z4ggI26vDWjCIui3_Ugbw4SWQGD-3thTaFTWO4WfBvG6bThO4kGKymgzVpGAtZV7ECafvFIdUVzuArqnCejbOvzVVpt49LX2IF7cmIN2gYRZz9Z8CMGcwbkBpKHIXseyKeK-4ee9gw==',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    params = {
        'is_new_segment': '1',
        'fetch_hot': '1'
    }
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # A 200 reply may still carry a non-JSON body; report it the same
        # way as any other failed fetch.
        return None
def parse_json(response):
    """Write one CSV row per navigation entry found in the API response.

    Reads the ``group`` lists of ``response['groups'][3]`` and
    ``response['groups'][4]`` and passes ``[title, gid, containerid]`` of
    every entry to ``writerRow``.

    Args:
        response: Decoded JSON mapping, or ``None`` when there is nothing to
            parse, in which case the call is a no-op.

    Raises:
        KeyError, IndexError: If the mapping lacks the expected
            ``groups`` / ``group`` / per-entry keys.
    """
    if response is None:
        # Subscripting None would raise TypeError; treat it as "no data".
        return
    groups = response['groups']
    # Both lookups are evaluated before any row is written, so a malformed
    # response fails before the CSV file is touched.
    navList = [*groups[3]['group'], *groups[4]['group']]
    for nav in navList:
        writerRow([nav['title'], nav['gid'], nav['containerid']])
if __name__ == '__main__':
    # Entry point: make sure the CSV exists, fetch the group list, store it.
    init()
    # get_data sends is_new_segment / fetch_hot through ``params``; keeping
    # them in the URL as well would put each parameter in the query twice.
    url = 'https://weibo.com/ajax/feed/allGroups'
    response = get_data(url)
    if response is None:
        # get_data returns None when the request did not yield usable data.
        raise SystemExit('Request failed: no data returned from ' + url)
    parse_json(response)