NBA 球员数据采集测验
为了完成本关任务,你需要掌握:
数据获取简介
数据采集
本关采用 requests 库完成数据采集。
数据获取简介
url 地址: https://www.basketball-reference.com/players/a/
请求头:
进入网站后,等待网页加载完毕,点击 F12 或右击选择检查,搜索找到页面 a/,设置请求头信息。
请求头图
网页主界面如下图所示:
网页主界面
在本次教学中,我们需要获取所有 A 姓球员的基本数据和一些详细数据。
基本数据就是从网页主界面中获取的数据,如下图所示:
基本数据图
我们需要在网页主界面中获取球员的姓名、位置、身高。由于体重列表栏中部分球员存在空值,我们使用 Xpath 解析后会导致排列顺序混乱,所以体重数据我们在球员详情页中获取。
我们在网页主界面中点击 F12 或右击选择检查,查看球员详情页链接:
球员详情页链接获取图
可以直接获取到球员详情页链接的后缀。
在球员详情页中,我们需要获取球员的详细数据:
详细数据图
所有需要获取的字段信息如下:
字段名 解释 获取信息说明
id 球员 id 解析球员详情页链接获取,如:https://www.basketball-reference.com/players/a/abdelal01.html,则 id 为:abdelal01
info_url 球员详情信息网址 网站首页表格中 Player 列
player_name 球员姓名 网站首页表格中 Player 列
player_pos 战术位置 网站首页表格中 Pos 列
player_ht 身高(英尺) 网站首页表格中 Ht 列
player_wt 体重(磅) 球员详情页(网站首页的体重列中存在空值)
player_age 球员年龄 球员详情页,动态加载数据,需要手动计算。(格式与网址保持一致)
country 国籍 球员详情页(大写)
college 就读大学 球员详情页
high_school 就读高中 球员详情页
rank_year 同届排名 球员详情页
draft 选秀信息 球员详情页
draft_date 选秀日期 球员详情页
work_year 经验 球员详情页
team_count 效力球队数量 球员详情页
last_team_name 最后效力球队 球员详情页
season 赛季 球员详情页
games_count 场次 球员详情页
PTS 场均得分 球员详情页
TRB 场均篮板 球员详情页
AST 场均助攻 球员详情页
FG 投篮命中率 球员详情页
FG3 三分球命中率 球员详情页
FT 罚球命中率 球员详情页
EFG 有效命中率 球员详情页
PER 效率值 球员详情页
WS 胜利贡献值(Win Shares) 球员详情页
firstTime 首秀时间 球员详情页中表格内的season列,其第一个赛季链接中第一场比赛上场时间。
lastTime 退役时间 球员详情页中表格内的season列,其最后一个赛季链接中最后一场比赛上场时间。
数据采集
首先,我们定义相关全局变量,建立初始化函数和入口函数。
全局变量
save_fp = open("./nba_data.csv", "w", encoding="utf-8-sig", newline="")  # output file (utf-8-sig so spreadsheet apps detect the encoding)
csv_writer = csv.writer(save_fp)  # shared CSV writer used by every save
start_time = int(time.time())  # crawl start timestamp, for the duration report
main_response = None  # parsed roster page, kept so parse() can resume after an error
main_count = 0  # index of the detail page currently being processed
total_count = 0  # total number of detail pages found on the roster page
初始化函数
写入 csv 文件表头。
def open_spider(csv_writer):
    """Announce the crawl and write the CSV header row.

    The column order here defines the order in which parse()/parse_detail()
    must append values to every player record.
    """
    print("--------------------------开始爬取--------------------------")
    columns = [
        "id", "info_url", "player_name", "player_pos", "player_ht",
        "player_wt", "player_age", "country", "college", "high_school",
        "rank_year", "draft", "draft_date", "work_year", "team_count",
        "last_team_name", "season", "games_count", "PTS", "TRB", "AST",
        "FG", "FG3", "FT", "EFG", "PER", "WS", "firstTime", "lastTime",
    ]
    csv_writer.writerow(columns)
入口函数
if __name__ == '__main__':
    # Write the CSV header before any data rows.
    open_spider(csv_writer=csv_writer)
    # Roster page for every player whose surname starts with "a".
    start_urls = 'https://www.basketball-reference.com/players/a/'
    # Browser-like request headers. Sending them is the fix for the
    # ConnectTimeout described later in this exercise; without them the site
    # may refuse the connection.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language":"zh-CN,zh;q=0.9,en;q=0.8",
        "cache-control": "max-age=0",
        "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
        "cookie": "__qca=P0-1886065764-1658383903526; hubspotutk=c69f86272b8d6b7951413faad4bbba8e; _gcl_au=1.1.1479764202.1658383906; _fssid=4eb62bce-9135-4575-8c6c-5d8ad0cbf2f4; _cc_id=183aa1de207640f3684240685f5542ec; _gid=GA1.2.241408601.1665543062; fs.session.id=28b82855-63f1-4e4d-91e6-7b89700f6372; _pbjs_userid_consent_data=3524755945110770; cookie=0f478402-4db9-4a70-b701-97f74220b4bc; _lr_env_src_ats=false; _fbp=fb.1.1665624118904.1085698668; meta_more_button=1; sr_note_box_countdown=0; srcssfull=yes; is_live=true; __gpi=UID=000007ecf062bd3b:T=1658383923:RT=1665709635:S=ALNI_Mb2bDIonimZiSivHyVtJXd-izqHOA; fs.bot.check=true; __hssrc=1; __cf_bm=EmPpD3muqzz_31sAtEV0s4gGLvIdFhi6Shpzyu6SgfY-1665729535-0-AWm21C7A17no++kUeumsEqsLsuuBhWdoJfZOSjm84UE4KRKzbBnHrW7CjXx09VK80YmgWIrmVGgGufu+RC7CiiM=; __hstc=180814520.c69f86272b8d6b7951413faad4bbba8e.1658383903774.1665709602912.1665729511448.44; __hssc=180814520.1.1665729511448; _ga_NR1HN85GXQ=GS1.1.1665729503.38.1.1665730082.0.0.0; _ga=GA1.2.1217857762.1658383900; _gat_gtag_UA_1890630_2=1; _gat_gtag_UA_1890630_9=1; __gads=ID=8491b8023e0f5ed5:T=1658383923:S=ALNI_MaR4IK8peM0OjZI9ohiD4Ayvp-YjQ; _lr_retry_request=true; cto_bundle=KErEG19BMG1DTUFvZTB1ZURsWXlUTFc4RTRJUkEwWlI2MlBnYTdkVXhNQ3F2ekNWYkJ3QUhJc3N0V0RqYm9YRkU1T1p2eUZVUXd2MmQ1c1RCJTJCV3ZUNzBVQlFra2VPc0s5amlob1RRZG9yd3JhJTJGOXpyVHhGd1JIaDdvNm1yMVAlMkI0M21RSXI4dU9GU25MY1RkSTdWU2dsSU8wYTZYOTNkUlhUUyUyRlVPZDAxd1Y5Vzd0dyUzRA; cto_bidid=zXPnyV8xOVFMMlJzTHRsak4lMkJiemlZc0tQdG44WTNWdTI3a25qNG9YcU1QTURSJTJGd0dWOHdiWUVXVnp5Tm1qellmWGZUREppVU11Y0FpNDhzQUtJallkd3ViUU5lNkFoTTk5OENDWXpDZEtZY2U1RkV0UjlaRWhvNTNMcmM1dU9XY0RkUUc",
        "if-modified-since": "Fri, 14 Oct 2022 06:38:55 GMT",
        "sec-ch-ua": '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1"
    }
    # Fetch the roster page. BUG FIX: this previously passed headers=None,
    # which silently discarded the dict defined above.
    page = requests.get(url=start_urls, headers=headers, timeout=30)
    # Response body as text.
    page_text = page.text
    response = etree.HTML(page_text)  # wrap for XPath processing
    main_response = response  # keep the roster tree for resume-on-error in parse()
    parse(response, main_count)  # start crawling from index 0
现在我们来根据获取的字段对主界面进行解析,创建主界面解析函数。
主界面解析函数
解析传入的主界面内容,获取球员详情页链接、名称、位置、身高,由于首页的体重可能为空,所以不在首页这里获取体重。
通过获取的球员详情页链接列表来循环发起请求。
def parse(response, count):
    """Parse the roster page, then crawl each player's detail page.

    response: lxml tree of the roster (players index) page.
    count: detail-page index to resume from (used after an exception).
    """
    # BUG FIX: main_count / total_count were previously plain locals, so the
    # module globals read by saveData() (completion check) and by the except
    # handler below (resume position) never changed. Declare them global so
    # both mechanisms actually work.
    global main_count, total_count
    try:
        print("【主页面解析】")
        # Detail-page href of every player row.
        list_href = response.xpath("//table[@id='players']/tbody/tr/th//a/@href")
        # Player names.
        list_name = response.xpath("//table[@id='players']/tbody/tr/th//a/text()")
        # Playing positions.
        list_pos = response.xpath("//table[@id='players']/tbody/tr/td[@data-stat='pos']/text()")
        # Heights.
        list_ht = response.xpath("//table[@id='players']/tbody/tr/td[@data-stat='height']/text()")
        # The four lists can only be zipped positionally when their lengths
        # match; a shorter list means that column has blanks and must be
        # fetched from the detail page instead (this is why weight is not
        # read here).
        print("长度校验:")
        print(len(list_href))
        print(len(list_name))
        print(len(list_pos))
        print(len(list_ht))
        # One record per player holding the roster-page fields.
        list_res = []
        # The player id is the file stem of the detail-page href,
        # e.g. ".../abdelal01.html" -> "abdelal01".
        rex = re.compile(r".*/(.*).html")
        for i in range(0, len(list_name)):
            temp = [rex.findall(list_href[i])[0], "https://www.basketball-reference.com" + list_href[i],
                    list_name[i],
                    list_pos[i], list_ht[i]]
            list_res.append(temp)
        total_count = len(list_res)  # total number of detail pages to visit
        for m in range(count, total_count):
            main_count = m  # progress marker for error recovery / completion check
            page_text = requests.get(url=list_res[m][1], timeout=20).text
            response = etree.HTML(page_text)
            # Hand off to the detail-page parser.
            parse_detail(response, list_res[m])
            time.sleep(0.3)  # throttle to avoid an IP ban
    except Exception as ex:
        # Skip the failing link and resume from the next player.
        print("---------------发生异常---------------")
        print(ex)
        parse(main_response, main_count + 1)
球员详情页解析
解析球员详情页数据,根据获取字段信息来获取数据。
def parse_detail(response, data):
    """Scrape one player's detail page and append the detail columns to `data`.

    response: lxml tree of the player's detail page.
    data: list already holding [id, info_url, name, pos, height] from the
          roster page; the remaining columns are appended here in the exact
          order of the CSV header, then the row is handed to saveData().
    """
    try:
        print("【球员详情页解析】")
        playerData = data  # alias — the appends below mutate the caller's list
        # Bio fields default to None because many players are missing some.
        college = None      # college(s) attended
        high_school = None  # place parsed from the "High School" line
        rank_year = None    # recruiting rank
        draft = None        # draft summary: team + pick + year
        draft_date = None   # NBA debut date
        work_year = None    # experience ("Career Length"/"Experience" share one column)
        country = None      # country (upper-cased)
        age = None          # "<years>-<days>d", computed by hand (page value is JS-rendered)
        wt = None           # weight
        # The <p> tags in #meta carry no identifying attributes and their
        # number varies per player, so a fixed positional XPath would
        # mis-assign data. Each paragraph does carry a unique <strong> label,
        # so probe p[1]..p[14] (a safe upper bound on the paragraph count)
        # and dispatch on that label.
        for i in range(1, 15):
            print(i)
            # <strong> label plus every text fragment of the i-th paragraph.
            temp = response.xpath(
                "//div[@id='info']/div[@id='meta']//p[{}]/strong/text() | //div[@id='info']/div[@id='meta']//p[{}]//text()".format(
                    i, i))
            print(temp)
            # No such paragraph — try the next index.
            if temp is None or temp == [] or temp == "":
                continue
            else:
                # Strip spaces/newlines/colons so label checks are exact matches.
                for j in range(0, len(temp)):
                    temp[j] = temp[j].replace(" ", "").replace("\n", "").replace(":", "")
                # TODO extract the labelled fields
                # College name(s); multiple entries are comma-joined.
                if "Colleges" in temp or "College" in temp:
                    college = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/a//text()".format(i))
                    if college:
                        college = ",".join(college)
                # Home town: the text after "in" on the "High School" line.
                elif "HighSchool" in temp:
                    high_school = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]//text()".format(i))
                    if high_school:
                        s = ",".join(high_school)
                        high_school = re.findall(r".*in\s(.*?),.*", s)[0]
                # Recruiting rank: fragments 3 and 4 hold rank and year —
                # assumes the paragraph always splits this way; TODO confirm.
                elif "RecruitingRank" in temp:
                    rank_year = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]//text()".format(i))
                    if rank_year:
                        rank_year = str(rank_year[3]) + str(rank_year[4].replace('\n', ''))
                # Draft line: team (1st link) + pick text + year (2nd link).
                elif "Draft" in temp:
                    draft_team = response.xpath("//div[@id='info']/div[@id='meta']//p[{}]/a[1]/text()".format(i))
                    draft_rank = response.xpath("//div[@id='info']/div[@id='meta']//p[{}]/text()[3]".format(i))
                    draft_year = response.xpath("//div[@id='info']/div[@id='meta']//p[{}]/a[2]/text()".format(i))
                    if draft_team and draft_rank and draft_year:
                        draft = (draft_team[0] + draft_rank[0] + draft_year[0]).replace('\n', '')
                # Debut date from the "NBA Debut" line.
                elif "NBADebut" in temp:
                    draft_date = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/a/text()".format(i))
                    if draft_date:
                        draft_date = draft_date[0]
                # "Born" paragraph: country plus the birth date used for age.
                elif "Born" in temp:
                    country = response.xpath("//div[@id='info']/div[@id='meta']//p[{}]/span[@class]/text()".format(i))
                    if country:
                        country = country[-1].upper()
                    # The rendered age is JS-generated, so compute it from the
                    # machine-readable @data-birth attribute instead.
                    born = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/span[@id='necro-birth']/@data-birth".format(
                            i))
                    if not born:
                        continue
                    else:
                        born = born[0]
                    # Label of the NEXT paragraph reveals a deceased player.
                    is_died = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/strong/text()".format(i + 1))
                    if is_died:
                        is_died = is_died[0]
                    # Only proceed with a well-formed YYYY-MM-DD birth date.
                    if born is not None and len(born.split('-')) == 3:
                        # Deceased: age at death = death date - birth date.
                        if "Died" in is_died:
                            died = response.xpath(
                                "//div[@id='info']/div[@id='meta']//p[{}]/span[@id='necro-death']/@data-death".format(
                                    i + 1))[0]
                            # Malformed death date — give up on the age field.
                            if len(died.split('-')) != 3:
                                continue
                            # Whole years, deliberately one short; the day
                            # count below rolls the missing year back in.
                            year = int(died.split('-')[0]) - int(born.split('-')[0]) - 1
                            # Days since the birthday in the year before death.
                            d = born.split('-')
                            # A Feb-29 birthday does not exist in non-leap
                            # years: pin it to Feb-28 and shift the death date
                            # back one day to compensate.
                            if int(d[1]) == 2 and int(d[2]) == 29:
                                d[2] = 28
                                died = died.split('-')
                                died = datetime.date(int(died[0]), int(died[1]), int(died[2]))
                                died += datetime.timedelta(-1)
                            d_temp = str(int(str(died).split('-')[0]) - 1) + '-' + str(d[1]) + '-' + str(d[2])
                            days = abs(
                                datetime.datetime.strptime(str(died), "%Y-%m-%d") - datetime.datetime.strptime(d_temp,
                                                                                                               "%Y-%m-%d")).days
                            # A full year elapsed: fold it into the year count.
                            if days >= 365:
                                year += 1
                                days -= 365
                            # Final format: "<years>-<days>d".
                            age = str(year) + "-" + str(days) + "d"
                        # Still alive: same computation against today's date.
                        else:
                            year = datetime.date.today().year - int(born.split('-')[0]) - 1
                            today = datetime.date.today()
                            # Days since the birthday in the previous year.
                            d = born.split('-')
                            # Same Feb-29 normalisation as above.
                            if int(d[1]) == 2 and int(d[2]) == 29:
                                d[2] = 28
                                today += datetime.timedelta(-1)
                            d_temp = str(datetime.date.today().year - 1) + '-' + str(d[1]) + '-' + str(d[2])
                            days = abs(
                                datetime.datetime.strptime(str(today), "%Y-%m-%d") - datetime.datetime.strptime(d_temp,
                                                                                                                "%Y-%m-%d")).days
                            if days >= 365:
                                days -= 365
                                year += 1
                            age = str(year) + "-" + str(days) + "d"
                # Weight: read from the detail page because the roster column
                # has blanks. NOTE(review): targets span[2] of the paragraph
                # AFTER the "Position" one — presumably the height/weight
                # line; confirm against the live page.
                elif "Position" in temp:
                    wt = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/span[2]/text()".format(i + 1))
                    if wt:
                        wt = wt[0]
                # Experience: raw second text node of the paragraph.
                elif "CareerLength" in temp or "Experience" in temp:
                    work_year = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/text()[2]".format(i))
                    if work_year:
                        work_year = work_year[0]
        # Number of franchises played for = number of team logos.
        team_count = response.xpath("//div[@class='uni_holder bbr']/a")
        if team_count:
            team_count = len(team_count)
        else:
            team_count = None
        # Last team: tooltip of the final logo, trimmed to the last letter.
        # NOTE(review): the class "[a-zA-z]" also matches "[\]^_`" — probably
        # meant "[a-zA-Z]".
        last_team_name = response.xpath("//div[@class='uni_holder bbr']/a/@data-tip")
        if last_team_name:
            last_team_name = re.findall(".*[a-zA-z]", last_team_name[-1])[0]
        else:
            last_team_name = None
        # TODO career stat summary (the "stats_pullout" banner, career column)
        # Seasons played.
        season = response.xpath("//div[@class='stats_pullout']/div[1]//p[2]//text()")
        if season:
            season = season[0]
        else:
            season = None
        # Career games.
        games_count = response.xpath("//div[@class='stats_pullout']/div[@class='p1']/div[1]/p[2]//text()")
        if games_count:
            games_count = games_count[0]
        else:
            games_count = None
        # Points per game.
        PTS = response.xpath("//div[@class='stats_pullout']/div[@class='p1']/div[2]/p[2]//text()")
        if PTS:
            PTS = PTS[0]
        else:
            PTS = None
        # Rebounds per game.
        TRB = response.xpath("//div[@class='stats_pullout']/div[@class='p1']/div[3]/p[2]//text()")
        if TRB:
            TRB = TRB[0]
        else:
            TRB = None
        # Assists per game.
        AST = response.xpath("//div[@class='stats_pullout']/div[@class='p1']/div[4]/p[2]//text()")
        if AST:
            AST = AST[0]
        else:
            AST = None
        # The second banner column normally holds FG, FG3, FT, eFG in that
        # order, but some players lack FG3 and eFG — branch on the count.
        count = len(response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div/span"))
        # Field-goal percentage is always first.
        FG = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[1]/p[2]//text()")
        if FG:
            FG = FG[0]
        else:
            FG = None
        FT = None
        FG3 = None
        EFG = None
        if count == 2:
            # Only FG + FT are present; free-throw percentage is second.
            FT = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[2]/p[2]//text()")
            if FT:
                FT = FT[0]
        elif count == 4:
            # Three-point percentage.
            FG3 = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[2]/p[2]//text()")
            if FG3:
                FG3 = FG3[0]
            # Free-throw percentage.
            FT = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[3]/p[2]//text()")
            if FT:
                FT = FT[0]
            # Effective field-goal percentage.
            EFG = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[4]/p[2]//text()")
            if EFG:
                EFG = EFG[0]
        # Player efficiency rating.
        PER = response.xpath("//div[@class='stats_pullout']/div[@class='p3']/div[1]/p[2]//text()")
        if PER:
            PER = PER[0]
        # Win shares.
        WS = response.xpath("//div[@class='stats_pullout']/div[@class='p3']/div[2]/p[2]//text()")
        if WS:
            WS = WS[0]
        # Append in the exact CSV-header order.
        playerData.append(wt)
        playerData.append(age)
        playerData.append(country)
        playerData.append(college)
        playerData.append(high_school)
        playerData.append(rank_year)
        playerData.append(draft)
        playerData.append(draft_date)
        playerData.append(work_year)
        playerData.append(team_count)
        playerData.append(last_team_name)
        playerData.append(season)
        playerData.append(games_count)
        playerData.append(PTS)
        playerData.append(TRB)
        playerData.append(AST)
        playerData.append(FG)
        playerData.append(FG3)
        playerData.append(FT)
        playerData.append(EFG)
        playerData.append(PER)
        playerData.append(WS)
        # TODO first-appearance date
        # hrefs of the season game logs. NOTE(review): this local shadows the
        # `time` module for the rest of the function body.
        time = response.xpath("//div[@class='table_wrapper tabbed'][1]//tbody/tr/th/a/@href")
        # No game-log link: record an empty first-appearance date.
        if not time:
            playerData.append(None)
        else:
            # First season's game-log URL.
            url = "https://www.basketball-reference.com" + str(time[0])
            # A few hrefs are already absolute (e.g. G-League pages); if the
            # concatenation produced two "https" parts, use the href as-is.
            if len(url.split("https")) != 2:
                url = str(time[0])
            # Fetch the first season's game log.
            page_text = requests.get(url=url, timeout=20).text
            first_response = etree.HTML(page_text)
            print("【首秀时间界面解析】")
            # Date of the first listed game.
            firstTime = first_response.xpath("//tbody/tr[1]/td[@data-stat='date_game']/a/text()")
            if not firstTime:
                firstTime = None
            else:
                firstTime = firstTime[0]
            # Record the first-appearance date.
            playerData.append(firstTime)
        # TODO last-appearance date (date of the final game played)
        # Season links of the last stats table.
        last_time = response.xpath('//tbody[last()]/tr/th[@data-stat="season"]/a/@href')
        # No season links: record None and persist the row as-is.
        if not last_time:
            playerData.append(None)
            saveData(csv_writer, playerData)
        else:
            # Find the most recent season (largest four-digit year).
            lastTempList = []  # years pulled from each season href
            for l_time in last_time:
                lastTempList.append(re.findall(r"[0-9]{4}", l_time)[0])
            max_time = max(lastTempList)
            # First href containing that year is the final season's game log.
            lastTime_url = None
            for l_time in last_time:
                if max_time in l_time:
                    lastTime_url = l_time
                    break
            lastTime_url = "https://www.basketball-reference.com" + lastTime_url
            # Fetch the final season's game log.
            page_text = requests.get(url=lastTime_url, timeout=20).text
            last_response = etree.HTML(page_text)
            print("【退役时间解析】")
            # Date of the last listed game.
            lastTime = last_response.xpath("//tbody[last()]/tr/td[@data-stat='date_game']/a/text()")
            if lastTime:
                lastTime = lastTime[-1]
            else:
                lastTime = None
            playerData = data  # re-alias (no-op: playerData already is `data`)
            playerData.append(lastTime)
            # Persist the finished row.
            saveData(csv_writer, playerData)
    except Exception as ex:
        print(ex)
数据保存函数
保存数据到 csv 文件中。
def saveData(csv_writer, data):
    """Write one finished player record to the CSV and close up when done."""
    try:
        print(data)
        csv_writer.writerow(data)
        # The last roster index has been processed -> the crawl is complete.
        if main_count == total_count - 1:
            close_spider(save_fp, start_time)
    except Exception as ex:
        print(ex)
任务结束函数
def close_spider(fp, start_time):
    """Close the output file and report how long the whole crawl took."""
    fp.close()
    elapsed = int(time.time()) - start_time
    print("--------------------------爬取完成--------------------------")
    print("用时:{}s".format(elapsed))
编程要求
本关作为教学示例关卡,请进入下一关 NBA 球员数据采集实战 中完成任务。
如果运行后抛出以下异常信息:
requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='www.basketball-reference.com', port=443):
Max retries exceeded with url: /players/a/ (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x7f43a73029e8>,
'Connection to www.basketball-reference.com timed out. (connect timeout=30)'))
解决方法:
在入口函数的请求中添加 headers 参数来设置请求头。请求头内容请打开网站进入调试模式后查询页面 a/ 中的 Requests Headers 获取。
import sys
import csv
import datetime
import re
import string
import time
from lxml import etree
import requests
# TODO global crawl state
save_fp = open("./nba_data.csv", "w", encoding="utf-8-sig", newline="")  # output file (utf-8-sig so spreadsheet apps detect the encoding)
csv_writer = csv.writer(save_fp)  # shared CSV writer used by every save
start_time = int(time.time())  # crawl start timestamp, for the duration report
main_response = None  # parsed roster page, kept so parse() can resume after an error
main_count = 0  # index of the detail page currently being processed
total_count = 0  # total number of detail pages found on the roster page
# Roster-page parser: collects detail-page link, name, position and height.
# Weight is deliberately NOT taken here because the roster column has blanks.
def parse(response, count):
    """Parse the roster page, then crawl each player's detail page.

    response: lxml tree of the roster (players index) page.
    count: detail-page index to resume from (used after an exception).
    """
    # BUG FIX: main_count / total_count were previously plain locals, so the
    # module globals read by saveData() (completion check) and by the except
    # handler below (resume position) never changed. Declare them global so
    # both mechanisms actually work.
    global main_count, total_count
    try:
        print("【主页面解析】")
        # Detail-page href of every player row.
        list_href = response.xpath("//table[@id='players']/tbody/tr/th//a/@href")
        # Player names.
        list_name = response.xpath("//table[@id='players']/tbody/tr/th//a/text()")
        # Playing positions.
        list_pos = response.xpath("//table[@id='players']/tbody/tr/td[@data-stat='pos']/text()")
        # Heights.
        list_ht = response.xpath("//table[@id='players']/tbody/tr/td[@data-stat='height']/text()")
        # The four lists can only be zipped positionally when their lengths
        # match; a shorter list means that column has blanks and must be
        # fetched from the detail page instead.
        print("长度校验:")
        print(len(list_href))
        print(len(list_name))
        print(len(list_pos))
        print(len(list_ht))
        # One record per player holding the roster-page fields.
        list_res = []
        # The player id is the file stem of the detail-page href,
        # e.g. ".../abdelal01.html" -> "abdelal01".
        rex = re.compile(r".*/(.*).html")
        for i in range(0, len(list_name)):
            temp = [rex.findall(list_href[i])[0], "https://www.basketball-reference.com" + list_href[i],
                    list_name[i],
                    list_pos[i], list_ht[i]]
            list_res.append(temp)
        total_count = len(list_res)  # total number of detail pages to visit
        for m in range(count, total_count):
            main_count = m  # progress marker for error recovery / completion check
            page_text = requests.get(url=list_res[m][1], timeout=20).text
            response = etree.HTML(page_text)
            # Hand off to the detail-page parser.
            parse_detail(response, list_res[m])
            time.sleep(0.3)  # throttle to avoid an IP ban
    except Exception as ex:
        # Skip the failing link and resume from the next player.
        print("---------------发生异常---------------")
        print(ex)
        parse(main_response, main_count + 1)
# Detail-page parser
def parse_detail(response, data):
    """Scrape one player's detail page and append the detail columns to `data`.

    response: lxml tree of the player's detail page.
    data: list already holding [id, info_url, name, pos, height] from the
          roster page; the remaining columns are appended here in the exact
          order of the CSV header, then the row is handed to saveData().
    """
    try:
        print("【球员详情页解析】")
        playerData = data  # alias — the appends below mutate the caller's list
        # Bio fields default to None because many players are missing some.
        college = None      # college(s) attended
        high_school = None  # place parsed from the "High School" line
        rank_year = None    # recruiting rank
        draft = None        # draft summary: team + pick + year
        draft_date = None   # NBA debut date
        work_year = None    # experience ("Career Length"/"Experience" share one column)
        country = None      # country (upper-cased)
        age = None          # "<years>-<days>d", computed by hand (page value is JS-rendered)
        wt = None           # weight
        # The <p> tags in #meta carry no identifying attributes and their
        # number varies per player, so a fixed positional XPath would
        # mis-assign data. Each paragraph does carry a unique <strong> label,
        # so probe p[1]..p[14] (a safe upper bound on the paragraph count)
        # and dispatch on that label.
        for i in range(1, 15):
            # <strong> label plus every text fragment of the i-th paragraph.
            temp = response.xpath(
                "//div[@id='info']/div[@id='meta']//p[{}]/strong/text() | //div[@id='info']/div[@id='meta']//p[{}]//text()".format(
                    i, i))
            # No such paragraph — try the next index.
            if temp is None or temp == [] or temp == "":
                continue
            else:
                # Strip spaces/newlines/colons so label checks are exact matches.
                for j in range(0, len(temp)):
                    temp[j] = temp[j].replace(" ", "").replace("\n", "").replace(":", "")
                # TODO extract the labelled fields
                # College name(s); multiple entries are comma-joined.
                if "Colleges" in temp or "College" in temp:
                    college = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/a//text()".format(i))
                    if college:
                        college = ",".join(college)
                # Home town: the text after "in" on the "High School" line.
                elif "HighSchool" in temp:
                    high_school = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]//text()".format(i))
                    if high_school:
                        s = ",".join(high_school)
                        high_school = re.findall(r".*in\s(.*?),.*", s)[0]
                # Recruiting rank: fragments 3 and 4 hold rank and year —
                # assumes the paragraph always splits this way; TODO confirm.
                elif "RecruitingRank" in temp:
                    rank_year = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]//text()".format(i))
                    if rank_year:
                        rank_year = str(rank_year[3]) + str(rank_year[4].replace('\n', ''))
                # Draft line: team (1st link) + pick text + year (2nd link).
                elif "Draft" in temp:
                    draft_team = response.xpath("//div[@id='info']/div[@id='meta']//p[{}]/a[1]/text()".format(i))
                    draft_rank = response.xpath("//div[@id='info']/div[@id='meta']//p[{}]/text()[3]".format(i))
                    draft_year = response.xpath("//div[@id='info']/div[@id='meta']//p[{}]/a[2]/text()".format(i))
                    if draft_team and draft_rank and draft_year:
                        draft = (draft_team[0] + draft_rank[0] + draft_year[0]).replace('\n', '')
                # Debut date from the "NBA Debut" line.
                elif "NBADebut" in temp:
                    draft_date = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/a/text()".format(i))
                    if draft_date:
                        draft_date = draft_date[0]
                # "Born" paragraph: country plus the birth date used for age.
                elif "Born" in temp:
                    country = response.xpath("//div[@id='info']/div[@id='meta']//p[{}]/span[@class]/text()".format(i))
                    if country:
                        country = country[-1].upper()
                    # The rendered age is JS-generated, so compute it from the
                    # machine-readable @data-birth attribute instead.
                    born = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/span[@id='necro-birth']/@data-birth".format(
                            i))
                    if not born:
                        continue
                    else:
                        born = born[0]
                    # Label of the NEXT paragraph reveals a deceased player.
                    is_died = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/strong/text()".format(i + 1))
                    if is_died:
                        is_died = is_died[0]
                    # Only proceed with a well-formed YYYY-MM-DD birth date.
                    if born is not None and len(born.split('-')) == 3:
                        # Deceased: age at death = death date - birth date.
                        if "Died" in is_died:
                            died = response.xpath(
                                "//div[@id='info']/div[@id='meta']//p[{}]/span[@id='necro-death']/@data-death".format(
                                    i + 1))[0]
                            # Malformed death date — give up on the age field.
                            if len(died.split('-')) != 3:
                                continue
                            # Whole years, deliberately one short; the day
                            # count below rolls the missing year back in.
                            year = int(died.split('-')[0]) - int(born.split('-')[0]) - 1
                            # Days since the birthday in the year before death.
                            d = born.split('-')
                            # A Feb-29 birthday does not exist in non-leap
                            # years: pin it to Feb-28 and shift the death date
                            # back one day to compensate.
                            if int(d[1]) == 2 and int(d[2]) == 29:
                                d[2] = 28
                                died = died.split('-')
                                died = datetime.date(int(died[0]), int(died[1]), int(died[2]))
                                died += datetime.timedelta(-1)
                            d_temp = str(int(str(died).split('-')[0]) - 1) + '-' + str(d[1]) + '-' + str(d[2])
                            days = abs(
                                datetime.datetime.strptime(str(died), "%Y-%m-%d") - datetime.datetime.strptime(d_temp,
                                                                                                               "%Y-%m-%d")).days
                            # A full year elapsed: fold it into the year count.
                            if days >= 365:
                                year += 1
                                days -= 365
                            # Final format: "<years>-<days>d".
                            age = str(year) + "-" + str(days) + "d"
                        # Still alive: same computation against today's date.
                        else:
                            year = datetime.date.today().year - int(born.split('-')[0]) - 1
                            today = datetime.date.today()
                            # Days since the birthday in the previous year.
                            d = born.split('-')
                            # Same Feb-29 normalisation as above.
                            if int(d[1]) == 2 and int(d[2]) == 29:
                                d[2] = 28
                                today += datetime.timedelta(-1)
                            d_temp = str(datetime.date.today().year - 1) + '-' + str(d[1]) + '-' + str(d[2])
                            days = abs(
                                datetime.datetime.strptime(str(today), "%Y-%m-%d") - datetime.datetime.strptime(d_temp,
                                                                                                                "%Y-%m-%d")).days
                            if days >= 365:
                                days -= 365
                                year += 1
                            age = str(year) + "-" + str(days) + "d"
                # Weight: read from the detail page because the roster column
                # has blanks. NOTE(review): targets span[2] of the paragraph
                # AFTER the "Position" one — presumably the height/weight
                # line; confirm against the live page.
                elif "Position" in temp:
                    wt = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/span[2]/text()".format(i + 1))
                    if wt:
                        wt = wt[0]
                # Experience: "Career Length" and "Experience" share a column.
                elif "CareerLength" in temp or "Experience" in temp:
                    work_year = response.xpath(
                        "//div[@id='info']/div[@id='meta']//p[{}]/text()[2]".format(i))
                    if work_year:
                        # Plural suffix "years" vs singular "year".
                        years = re.findall(r"[0-9].*years", work_year[0])
                        if not years:
                            work_year = re.findall(r"[0-9].*year", work_year[0])[0]
                        else:
                            work_year = years[0]
        # Number of franchises played for = number of team logos.
        team_count = response.xpath("//div[@class='uni_holder bbr']/a")
        if team_count:
            team_count = len(team_count)
        else:
            team_count = None
        # Last team: tooltip of the final logo, trimmed to the last letter.
        # NOTE(review): the class "[a-zA-z]" also matches "[\]^_`" — probably
        # meant "[a-zA-Z]".
        last_team_name = response.xpath("//div[@class='uni_holder bbr']/a/@data-tip")
        if last_team_name:
            last_team_name = re.findall(".*[a-zA-z]", last_team_name[-1])[0]
        else:
            last_team_name = None
        # TODO career stat summary (the "stats_pullout" banner, career column)
        # Seasons played.
        season = response.xpath("//div[@class='stats_pullout']/div[1]//p[2]//text()")
        if season:
            season = season[0]
        else:
            season = None
        # Career games.
        games_count = response.xpath("//div[@class='stats_pullout']/div[@class='p1']/div[1]/p[2]//text()")
        if games_count:
            games_count = games_count[0]
        else:
            games_count = None
        # Points per game.
        PTS = response.xpath("//div[@class='stats_pullout']/div[@class='p1']/div[2]/p[2]//text()")
        if PTS:
            PTS = PTS[0]
        else:
            PTS = None
        # Rebounds per game.
        TRB = response.xpath("//div[@class='stats_pullout']/div[@class='p1']/div[3]/p[2]//text()")
        if TRB:
            TRB = TRB[0]
        else:
            TRB = None
        # Assists per game.
        AST = response.xpath("//div[@class='stats_pullout']/div[@class='p1']/div[4]/p[2]//text()")
        if AST:
            AST = AST[0]
        else:
            AST = None
        # The second banner column normally holds FG, FG3, FT, eFG in that
        # order, but some players lack FG3 and eFG — branch on the count.
        count = len(response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div/span"))
        # Field-goal percentage is always first.
        FG = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[1]/p[2]//text()")
        if FG:
            FG = FG[0]
        else:
            FG = None
        FT = None
        FG3 = None
        EFG = None
        if count == 2:
            # Only FG + FT are present; free-throw percentage is second.
            FT = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[2]/p[2]//text()")
            if FT:
                FT = FT[0]
        elif count == 4:
            # Three-point percentage.
            FG3 = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[2]/p[2]//text()")
            if FG3:
                FG3 = FG3[0]
            # Free-throw percentage.
            FT = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[3]/p[2]//text()")
            if FT:
                FT = FT[0]
            # Effective field-goal percentage.
            EFG = response.xpath("//div[@class='stats_pullout']/div[@class='p2']/div[4]/p[2]//text()")
            if EFG:
                EFG = EFG[0]
        # Player efficiency rating.
        PER = response.xpath("//div[@class='stats_pullout']/div[@class='p3']/div[1]/p[2]//text()")
        if PER:
            PER = PER[0]
        # Win shares.
        WS = response.xpath("//div[@class='stats_pullout']/div[@class='p3']/div[2]/p[2]//text()")
        if WS:
            WS = WS[0]
        # Append in the exact CSV-header order.
        playerData.append(wt)
        playerData.append(age)
        playerData.append(country)
        playerData.append(college)
        playerData.append(high_school)
        playerData.append(rank_year)
        playerData.append(draft)
        playerData.append(draft_date)
        playerData.append(work_year)
        playerData.append(team_count)
        playerData.append(last_team_name)
        playerData.append(season)
        playerData.append(games_count)
        playerData.append(PTS)
        playerData.append(TRB)
        playerData.append(AST)
        playerData.append(FG)
        playerData.append(FG3)
        playerData.append(FT)
        playerData.append(EFG)
        playerData.append(PER)
        playerData.append(WS)
        # TODO first-appearance date
        # hrefs of the season game logs. NOTE(review): this local shadows the
        # `time` module for the rest of the function body.
        time = response.xpath("//div[@class='table_wrapper tabbed'][1]//tbody/tr/th/a/@href")
        # No game-log link: record an empty first-appearance date.
        if not time:
            playerData.append(None)
        else:
            # First season's game-log URL.
            url = "https://www.basketball-reference.com" + str(time[0])
            # A few hrefs are already absolute (e.g. G-League pages); if the
            # concatenation produced two "https" parts, use the href as-is.
            if len(url.split("https")) != 2:
                url = str(time[0])
            # Fetch the first season's game log.
            page_text = requests.get(url=url, timeout=20).text
            first_response = etree.HTML(page_text)
            print("【首秀时间界面解析】")
            # Date of the first listed game.
            firstTime = first_response.xpath("//tbody/tr[1]/td[@data-stat='date_game']/a/text()")
            if not firstTime:
                firstTime = None
            else:
                firstTime = firstTime[0]
            # Record the first-appearance date.
            playerData.append(firstTime)
        # TODO last-appearance date (date of the final game played)
        # Season links of the last stats table.
        last_time = response.xpath('//tbody[last()]/tr/th[@data-stat="season"]/a/@href')
        # No season links: record None and persist the row as-is.
        if not last_time:
            playerData.append(None)
            saveData(csv_writer, playerData)
        else:
            # Find the most recent season (largest four-digit year).
            lastTempList = []  # years pulled from each season href
            for l_time in last_time:
                lastTempList.append(re.findall(r"[0-9]{4}", l_time)[0])
            max_time = max(lastTempList)
            # First href containing that year is the final season's game log.
            lastTime_url = None
            for l_time in last_time:
                if max_time in l_time:
                    lastTime_url = l_time
                    break
            lastTime_url = "https://www.basketball-reference.com" + lastTime_url
            # Fetch the final season's game log.
            page_text = requests.get(url=lastTime_url, timeout=20).text
            last_response = etree.HTML(page_text)
            print("【退役时间解析】")
            # Date of the last listed game.
            lastTime = last_response.xpath("//tbody[last()]/tr/td[@data-stat='date_game']/a/text()")
            if lastTime:
                lastTime = lastTime[-1]
            else:
                lastTime = None
            playerData = data  # re-alias (no-op: playerData already is `data`)
            playerData.append(lastTime)
            # Persist the finished row.
            saveData(csv_writer, playerData)
    except Exception as ex:
        print(ex)
# Crawler initialisation
def open_spider(csv_writer):
    """Announce the crawl and write the CSV header row.

    The column order here defines the order in which parse()/parse_detail()
    must append values to every player record.
    """
    print("--------------------------开始爬取--------------------------")
    columns = [
        "id", "info_url", "player_name", "player_pos", "player_ht",
        "player_wt", "player_age", "country", "college", "high_school",
        "rank_year", "draft", "draft_date", "work_year", "team_count",
        "last_team_name", "season", "games_count", "PTS", "TRB", "AST",
        "FG", "FG3", "FT", "EFG", "PER", "WS", "firstTime", "lastTime",
    ]
    csv_writer.writerow(columns)
# Row persistence
def saveData(csv_writer, data):
    """Write one finished player record to the CSV.

    Also decides when to stop: either every roster entry has been processed,
    or the 30-second demo budget for this exercise has been spent.
    """
    try:
        print(data)
        csv_writer.writerow(data)
        # The last roster index has been processed -> the crawl is complete.
        if main_count == total_count - 1:
            close_spider(save_fp, start_time)
        # Cap the teaching/test run at 30 seconds.
        elapsed = int(time.time()) - start_time
        if elapsed > 30:
            sys.exit(0)
    except Exception as ex:
        print(ex)
# Crawl shutdown
def close_spider(fp, start_time):
    """Close the output file and report how long the whole crawl took."""
    fp.close()
    elapsed = int(time.time()) - start_time
    print("--------------------------爬取完成--------------------------")
    print("用时:{}s".format(elapsed))
# Entry point
if __name__ == '__main__':
    # Write the CSV header before any data rows.
    open_spider(csv_writer=csv_writer)
    # Roster page for every player whose surname starts with "a".
    start_urls = 'https://www.basketball-reference.com/players/a/'
    # Browser-like request headers. Sending them is the fix for the
    # ConnectTimeout described in the exercise notes; without them the site
    # may refuse the connection.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language":"zh-CN,zh;q=0.9,en;q=0.8",
        "cache-control": "max-age=0",
        "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
        "cookie": "__qca=P0-1886065764-1658383903526; hubspotutk=c69f86272b8d6b7951413faad4bbba8e; _gcl_au=1.1.1479764202.1658383906; _fssid=4eb62bce-9135-4575-8c6c-5d8ad0cbf2f4; _cc_id=183aa1de207640f3684240685f5542ec; _gid=GA1.2.241408601.1665543062; fs.session.id=28b82855-63f1-4e4d-91e6-7b89700f6372; _pbjs_userid_consent_data=3524755945110770; cookie=0f478402-4db9-4a70-b701-97f74220b4bc; _lr_env_src_ats=false; _fbp=fb.1.1665624118904.1085698668; meta_more_button=1; sr_note_box_countdown=0; srcssfull=yes; is_live=true; __gpi=UID=000007ecf062bd3b:T=1658383923:RT=1665709635:S=ALNI_Mb2bDIonimZiSivHyVtJXd-izqHOA; fs.bot.check=true; __hssrc=1; __cf_bm=EmPpD3muqzz_31sAtEV0s4gGLvIdFhi6Shpzyu6SgfY-1665729535-0-AWm21C7A17no++kUeumsEqsLsuuBhWdoJfZOSjm84UE4KRKzbBnHrW7CjXx09VK80YmgWIrmVGgGufu+RC7CiiM=; __hstc=180814520.c69f86272b8d6b7951413faad4bbba8e.1658383903774.1665709602912.1665729511448.44; __hssc=180814520.1.1665729511448; _ga_NR1HN85GXQ=GS1.1.1665729503.38.1.1665730082.0.0.0; _ga=GA1.2.1217857762.1658383900; _gat_gtag_UA_1890630_2=1; _gat_gtag_UA_1890630_9=1; __gads=ID=8491b8023e0f5ed5:T=1658383923:S=ALNI_MaR4IK8peM0OjZI9ohiD4Ayvp-YjQ; _lr_retry_request=true; cto_bundle=KErEG19BMG1DTUFvZTB1ZURsWXlUTFc4RTRJUkEwWlI2MlBnYTdkVXhNQ3F2ekNWYkJ3QUhJc3N0V0RqYm9YRkU1T1p2eUZVUXd2MmQ1c1RCJTJCV3ZUNzBVQlFra2VPc0s5amlob1RRZG9yd3JhJTJGOXpyVHhGd1JIaDdvNm1yMVAlMkI0M21RSXI4dU9GU25MY1RkSTdWU2dsSU8wYTZYOTNkUlhUUyUyRlVPZDAxd1Y5Vzd0dyUzRA; cto_bidid=zXPnyV8xOVFMMlJzTHRsak4lMkJiemlZc0tQdG44WTNWdTI3a25qNG9YcU1QTURSJTJGd0dWOHdiWUVXVnp5Tm1qellmWGZUREppVU11Y0FpNDhzQUtJallkd3ViUU5lNkFoTTk5OENDWXpDZEtZY2U1RkV0UjlaRWhvNTNMcmM1dU9XY0RkUUc",
        "if-modified-since": "Fri, 14 Oct 2022 06:38:55 GMT",
        "sec-ch-ua": '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1"
    }
    # Fetch the roster page. BUG FIX: this previously passed headers=None,
    # which silently discarded the dict defined above.
    page = requests.get(url=start_urls, headers=headers, timeout=30)
    # Response body as text.
    page_text = page.text
    response = etree.HTML(page_text)  # wrap for XPath processing
    main_response = response  # keep the roster tree for resume-on-error in parse()
    parse(response, main_count)  # start crawling from index 0