纸上得来终觉浅, 绝知此事要躬行。
#夏天切记贪凉,光膀子对着空调吹,搞了个肩周炎,耽误事
前一篇几篇都是关于数据爬取的文章。有兴趣的可以去看看
男篮世锦赛开始了,看着中国队为了奥运会入场券苦苦挣扎,哎,没法子呀。好在我有了这篇主题的思路了。
来到nba中国官网,首先想到直接用get请求主页网址,发现页面都是ajax动态请求的,所以开始抓包吧,
在这个playerlist.json文件中,包含了所有球员的很多信息。包括['code', 'playerId', 'displayName', 'country', 'draftYear', 'experience', 'position', 'weight', 'schoolType', 'teamName', 'city'],这些是球员的基本信息。这里切记要获取'code'字段,后面获取球员比赛数据信息时拼接url需要用到它。
一开始我是把这些数据存放在mysql数据库里,后来又存为了csv文件
下一步开始获取球员的具体的信息,再次抓包,
进入球员具体数据页面,抓包发现数据在stats_steven_adams.json文件中。查看多个球员详细数据页面后发现,请求url最后都是在https://china.nba.com/static/data/player/stats_后面拼接了该球员的名字。回头看前面获取到的code字段,正是我们需要拼接的部分,例如:https://china.nba.com/static/data/player/stats_steven_adams.json
###具体的可以看代码
"""
@author: cht
@time: 2019/9/4 19:23
"""
# -*- coding: utf-8 -*-
import requests
import json
import pymysql
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#解决matplotlib画图中文显示乱码的情况
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
class NBA(object):
    """Scrape player rosters and 2018-season stats from the NBA China site.

    Pipeline (see ``workrun``): fetch the full player list JSON, extract each
    player's basic profile plus their ``code`` slug, dump profiles to
    ``player.csv`` (optionally MySQL), then fetch per-player stats JSON via the
    ``stats_<code>.json`` endpoint and write the merged 2018 regular-season
    data to ``nba_player_2018.csv``.
    """

    def __init__(self):
        # NOTE(review): DB credentials are hard-coded; consider env vars.
        self.db = pymysql.connect("localhost", "root", "cht555", "nba", charset="utf8")
        self.cursor = self.db.cursor()

    def get_json(self, url):
        """GET *url* and return the decoded JSON payload, or None on failure."""
        try:
            res = requests.get(url)
            if res.status_code == 200:
                return json.loads(res.text)
            return None
        except Exception as e:
            print(e)
            return None

    def get_data(self, url):
        """Like ``get_json`` but also accepts HTTP 304 (cached stats files)."""
        try:
            req = requests.get(url)
            if req.status_code in (200, 304):
                return json.loads(req.text)
            return None
        except Exception as t:
            print(t)
            return None

    def parse_jsonData(self, jsonfile):
        """Extract basic player info from the playerlist JSON.

        Returns a ``(infoList, codeList)`` pair: ``infoList`` is a header row
        followed by one row per player; ``codeList`` holds each player's
        ``code`` slug, needed later to build the per-player stats URL.
        """
        playersinfos = jsonfile['payload']['players']
        infoList = [
            ['playerId', 'displayName', 'country', 'draftYear', 'experience',
             'position', 'weight', 'schoolType', 'teamName', 'city']]
        codeList = []  # 'code' slugs for the stats URLs; not persisted
        for i in playersinfos:
            profile = i['playerProfile']
            team = i['teamProfile']
            code = profile['code']
            print("code:%s" % code)
            codeList.append(code)
            playinfo = [
                profile['playerId'],              # player id
                profile['displayName'],           # player name
                profile['country'],               # nationality
                int(profile['draftYear']),        # year entered the NBA
                int(profile['experience']),       # seasons played
                profile['position'],              # playing position
                float(profile['weight'].split(' ')[0]),  # "120 kg" -> 120.0
                profile['schoolType'],            # education background
                team['name'],                     # team name
                team['city'],                     # team city
            ]
            # self.write_mysql(playinfo)  # uncomment to persist to MySQL
            infoList.append(playinfo)
        return infoList, codeList

    def writr_csv(self, infolist):
        """Write the player info rows to ./player.csv.

        (Method name kept despite the 'writr' typo for backward compatibility.)
        """
        pd.DataFrame(infolist).to_csv('./player.csv', index=False, encoding="utf-8")

    def write_mysql(self, playinfo):
        """Insert one player row into the ``players`` MySQL table."""
        insert = "insert into players(playerId , name, country, draftYear, experience, position, weight, schoolType, teamName, city)\
            values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # Parameterized execute: playinfo supplies the %s placeholders in order.
        self.cursor.execute(insert, playinfo)
        self.db.commit()

    def workrun(self):
        """Run the full scrape: roster -> player.csv, stats -> nba_player_2018.csv."""
        person_url = "https://china.nba.com/static/data/league/playerlist.json"
        jsonfile = self.get_json(person_url)
        # BUG FIX: parse_jsonData returns an (infoList, codeList) tuple; the
        # original iterated the tuple itself instead of the list of codes,
        # producing two bogus stats URLs instead of one per player.
        infoList, codeList = self.parse_jsonData(jsonfile)
        self.writr_csv(infoList)

        # Per-player DataFrames, created lazily from the first successful
        # response so the column order matches the API's JSON key order.
        p1 = None  # player profile rows
        p2 = None  # team profile rows
        p3 = None  # 2018 per-game averages
        p4 = None  # 2018 season totals
        for code in codeList:
            data_url = "https://china.nba.com/static/data/player/stats_%s.json" % code
            print(data_url)
            dataresult = self.get_data(data_url)
            if dataresult is None:
                # Request failed or non-200/304 status; skip this player.
                continue
            try:
                player = dataresult['payload']['player']
                if p1 is None:
                    stat0 = player['stats']['currentSeasonTypeStat'][
                        'currentSeasonTypePlayerTeamStats'][0]
                    p1 = pd.DataFrame(columns=list(player['playerProfile']))
                    p2 = pd.DataFrame(columns=list(player['teamProfile']))
                    p3 = pd.DataFrame(columns=list(stat0['statAverage']))
                    p4 = pd.DataFrame(columns=list(stat0['statTotal']))
                # DataFrame.append was removed in pandas 2.0 -> use pd.concat.
                p1 = pd.concat([p1, pd.DataFrame([player['playerProfile']])],
                               ignore_index=True)
                p2 = pd.concat([p2, pd.DataFrame([player['teamProfile']])],
                               ignore_index=True)
                for season in player['stats']['regularSeasonStat']['playerTeams']:
                    if season["season"] == '2018':
                        p3 = pd.concat([p3, pd.DataFrame([season['statAverage']])],
                                       ignore_index=True)
                        p4 = pd.concat([p4, pd.DataFrame([season['statTotal']])],
                                       ignore_index=True)
                        break
            except Exception as e:
                # Players with no stats payload (never played) land here.
                print(e)
                continue
        if p1 is None:
            # Nothing fetched at all; original code would NameError here.
            print("no player stats fetched")
            return
        # Pairwise merge on the row index, then merge the two halves.
        p6 = pd.merge(p1, p2, left_index=True, right_index=True)
        p7 = pd.merge(p3, p4, left_index=True, right_index=True)
        p5 = pd.merge(p6, p7, left_index=True, right_index=True)
        p5.to_csv('./nba_player_2018.csv', index=False, encoding="utf-8")
if __name__ == "__main__":
    # Entry point: build the scraper and run the full pipeline.
    NBA().workrun()
发现只有401位球员信息,和全部球员基本信息的524位球员来说差了100多位,这是由于,很多人是长期饮水机守护神,很可能一场球都没有上场,更不要说得分数据了,有了数据要开始数据分析了。请看下一篇