项目场景:
爬取微博文章内容:
导入所需的标准库与第三方模块:
import time
import requests
import csv
import os
from datetime import datetime
问题描述
需要爬取以下 14 个字段:
id
likeNum
commentsLen
reports_count
region
content
contentLen
created_at
type
detailUrl
authorAvatar
authorName
authorDetail
isVip
将爬取结果保存到 CSV 文件:
def init():
    """Create articleData.csv with the 14-column header row, unless it already exists."""
    if os.path.exists('./articleData.csv'):
        return
    header = [
        'id', 'likeNum', 'commentsLen', 'reports_count', 'region',
        'content', 'contentLen', 'created_at', 'type', 'detailUrl',
        'authorAvatar', 'authorName', 'authorDetail', 'isVip',
    ]
    with open('./articleData.csv', 'w', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file).writerow(header)
def writerRow(row):
    """Append a single data row to articleData.csv."""
    with open('./articleData.csv', 'a', encoding='utf-8', newline='') as out:
        csv.writer(out).writerow(row)
完整代码:
import time
import requests
import csv
import os
from datetime import datetime
def init():
    """Initialise the output CSV: write the column header once, on first run only."""
    if not os.path.exists('./articleData.csv'):
        columns = ['id', 'likeNum', 'commentsLen', 'reports_count',
                   'region', 'content', 'contentLen', 'created_at',
                   'type', 'detailUrl', 'authorAvatar', 'authorName',
                   'authorDetail', 'isVip']
        with open('./articleData.csv', 'w', encoding='utf-8', newline='') as fh:
            writer = csv.writer(fh)
            writer.writerow(columns)
def writerRow(row):
    """Append one row of article fields to articleData.csv."""
    with open('./articleData.csv', 'a', encoding='utf-8', newline='') as fh:
        csv.writer(fh).writerow(row)
def get_data(url, params, timeout=10):
    """Fetch one page of hot-timeline articles from the weibo ajax endpoint.

    Args:
        url: the endpoint URL (hottimeline ajax API).
        params: query parameters (group_id, containerid, max_id, count, extparam).
        timeout: seconds to wait for the response; without it a stalled
            connection would hang the crawler forever.

    Returns:
        The list under 'statuses' in the JSON payload, or None when the
        request is not HTTP 200 or the payload carries no 'statuses' key
        (e.g. an expired-cookie error response).
    """
    headers = {
        'Cookie': 'XSRF-TOKEN=zASpYIx0oUosfBlB0MsTSRdi; SSOLoginState=1704083302; SUB=_2A25Ilk82DeThGeBI71US9yzKzzuIHXVr6s7-rDV8PUJbkNB-LWXlkW1NRpId-Znw75c-wagHUOjJucjoob6tHv3U; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZM5jTZLaMAANadOdO6n405NHD95QcSoBNe0MESoBNWs4DqcjPi--Xi-i2iK.4i--NiK.XiKLsS0e4eo-t; WBPSESS=Ii9Wh36g6mj5Z4ggI26vDWjCIui3_Ugbw4SWQGD-3thTaFTWO4WfBvG6bThO4kGKymgzVpGAtZV7ECafvFIdUVzuArqnCejbOvzVVpt49LX2IF7cmIN2gYRZz9Z8CMGcwbkBpKHIXseyKeK-4ee9gw==',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code == 200:
        # .get avoids a KeyError when weibo returns an error JSON body
        # (callers already handle a None result as "page failed").
        return response.json().get('statuses')
    return None
def getAllTypeList():
    """Read navData.csv and return its data rows (header excluded).

    Each row is expected to be [type_name, group_id, containerid] as
    consumed by start() — TODO confirm against the nav crawler that
    writes navData.csv.

    Returns:
        list[list[str]]: all rows after the header; empty list for an
        empty file.
    """
    typeList = []
    with open('./navData.csv', 'r', encoding='utf-8') as reader:
        readerCsv = csv.reader(reader)
        # Skip the header through the csv reader, not the raw file handle:
        # advancing the file object directly desynchronises the csv parser
        # if the header ever contains a quoted newline. The None default
        # makes an empty file return [] instead of raising StopIteration.
        next(readerCsv, None)
        for nav in readerCsv:
            typeList.append(nav)
    return typeList
def parse_json(response, type):
    """Extract the 14 tracked fields from each article dict and append them to the CSV.

    Args:
        response: list of article dicts returned by get_data, or None when
            the request failed.
        type: category name these articles belong to (written into the
            'type' column).
    """
    if not response:
        # get_data returns None on a failed request; nothing to record.
        return
    for article in response:
        article_id = article['id']  # local renamed: don't shadow builtin id()
        likeNum = article['attitudes_count']
        commentsLen = article['comments_count']
        reports_count = article['reposts_count']
        try:
            # region_name carries a '发布于' prefix; it may be missing entirely.
            region = article['region_name'].replace('发布于', '')
        except (KeyError, AttributeError, TypeError):
            region = '无'
        content = article['text_raw']
        contentLen = article['textLength']
        # created_at arrives like 'Mon Jan 01 12:00:00 +0800 2024'
        created_at = datetime.strptime(
            article['created_at'], '%a %b %d %H:%M:%S %z %Y'
        ).strftime('%Y-%m-%d')
        try:
            detailUrl = 'https://www.weibo.com/' + str(article['id']) + '/' + str(article['mblogid'])
        except KeyError:
            detailUrl = '无'
        user = article['user']
        authorAvatar = user['avatar_large']
        authorName = user['screen_name']
        authorDetail = 'https://www.weibo.com/u/' + str(user['id'])
        isVip = user['v_plus']
        writerRow([
            article_id, likeNum, commentsLen, reports_count, region,
            content, contentLen, created_at, type, detailUrl,
            authorAvatar, authorName, authorDetail, isVip,
        ])
def start(typeNum=3, pageNum=2):
    """Crawl the hot-timeline articles for the leading categories and save them to CSV.

    Args:
        typeNum: highest category index to crawl. NOTE(review): the original
            counter processed typeNum + 1 categories (it stopped only when
            the count exceeded typeNum); the slice below preserves that
            behavior rather than silently changing it.
        pageNum: number of pages to fetch per category.
    """
    articleUrl = 'https://weibo.com/ajax/feed/hottimeline'
    init()
    typeList = getAllTypeList()
    # nav row layout: [name, group_id, containerid] as produced by getAllTypeList
    for nav in typeList[:typeNum + 1]:
        time.sleep(1)  # throttle between categories to avoid rate limiting
        for page in range(pageNum):
            print('正在爬取的类型:%s中的第%s页的文章数据' % (nav[0], page + 1))
            time.sleep(1)  # throttle between pages
            params = {
                'group_id': nav[1],
                'containerid': nav[2],
                'max_id': page,
                'count': 10,
                'extparam': 'discover|new_feed'
            }
            response = get_data(articleUrl, params)
            # parse_json tolerates a None response (failed request).
            parse_json(response, nav[0])
# Script entry point: run the crawler with the default category/page limits.
if __name__ == '__main__':
    start()