Scraping a subset of Dongqiudi player data with Python

Our final project requires us to find a dataset on Tianchi or scrape one ourselves and analyze it, so I tried writing a small Python crawler to collect part of the player data on Dongqiudi. Only part, because crawling every player would take a very long time. The code is attached below; I am a beginner, so it may not be the prettiest.

The Dongqiudi player page:

I tried it out: the index after /player/ starts at 50000001 and runs up to around 5064****, which suggests Dongqiudi holds over 600,000 player records.
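For reference, each player page URL is just this numeric ID dropped into a fixed template. A minimal sketch (the ID range here is only an example, not the full crawl):

# Build player page URLs from numeric IDs (example range only).
BASE = "https://www.dongqiudi.com/player/%s.html"
for player_id in range(50000001, 50000006):
    print(BASE % player_id)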

The fields collected are:

  1. Player name
  2. Club
  3. Nationality
  4. Height
  5. Position (some entries are coaches or referees)
  6. Age
  7. Weight
  8. Shirt number
  9. Birthday
  10. Preferred foot
  11. Career length (years)
  12. Total appearances
  13. Total goals
  14. Total assists
  15. Total yellow cards
  16. Total red cards
  17. Overall rating
  18. Speed rating
  19. Strength rating
  20. Defending rating
  21. Dribbling rating
  22. Passing rating
  23. Shooting rating

The full script:
import urllib.request
import urllib.error
import csv
from bs4 import BeautifulSoup
from lxml import etree

# Check whether a player page exists for the given ID
def checkHtml(num):
    url = "https://www.dongqiudi.com/player/%s.html" % num
    html = askURL(url)
    soup = BeautifulSoup(html, "html.parser")
    name = soup.find('p', attrs={'class': 'china-name'})
    if name is None:
        print('invalid page')
        return 'none'
    else:
        return soup

# Parse the player page and append one row to the CSV file
def getData(soup):

# Name
    name = soup.find('p', attrs={'class': 'china-name'})
    name = str(name)
    con = etree.HTML(name)
    namestr = con.xpath("//p/text()")
    name = namestr[0]
    print(name)


# Collect the detail-info list items (club, nationality, height, etc.)
    detail_list = []
    detail_info_div = soup.find('div', attrs={'class': 'detail-info'})
    detail_info_ul = detail_info_div.find_all('li')
    for each in detail_info_ul:
        detail = each.text.strip()
        detail_list.append(detail)
    # print(detail_list)

# Club
    club = str(detail_list[0]).replace('俱乐部:', '')
    # print('club', club)
# Nationality
    country = str(detail_list[1]).replace('国   籍:', '')
    # print('nationality', country)
# Height
    height = 0
    heightstr = str(detail_list[2]).replace('CM', '')
    heightstr = heightstr.replace('身   高:', '')
    if heightstr != '':
        height = int(heightstr)
    # print('height', height)
# Position
    location = str(detail_list[3]).replace('位   置:', '')
    # print('position', location)
# Age
    age = 0
    agestr = str(detail_list[4]).replace('年   龄:', '')
    agestr = agestr.replace('岁', '')
    if agestr != '':
        age = int(agestr)
    # print('age', age)
# Weight
    weight = 0
    weightstr = str(detail_list[5]).replace('体   重:', '')
    weightstr = weightstr.replace('KG', '')
    if weightstr != '':
        weight = int(weightstr)
    # print('weight', weight)
# Shirt number
    number = 0
    numberstr = str(detail_list[6]).replace('号   码:', '')
    numberstr = numberstr.replace('号', '')
    if numberstr != '':
        number = int(numberstr)
    # print('number', number)
# Birthday
    birth = str(detail_list[7]).replace('生   日:', '')
    # print(birth)
# Preferred foot
    foot = str(detail_list[8]).replace('惯用脚:', '')
    # print(foot)

# Collect the club career statistics table
    total_con_wrap_div = soup.find('div', attrs={'class': 'total-con-wrap'})
    total_con_wrap_td = str(total_con_wrap_div.find_all('p', attrs={'class': 'td'}))
    con3 = etree.HTML(total_con_wrap_td)
    detail_info_list = con3.xpath("//p//span/text()")
    detail_info_list_years = con3.xpath("//p")

# Time in the first team (years): one table row per season, minus the header row
    years = len(detail_info_list_years) - 1
    # print('career length', years)
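# The stats table flattens to 9 cells per season row; the loops below step
# through detail_info_list in strides of 9, reading offset 2 (appearances),
# 4 (goals), 5 (assists), 6 (yellow cards) and 7 (red cards), and treating
# a '~' placeholder cell as 0.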

# Total appearances
    total_session = 0
    for i in range(2, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_session = total_session + int(detail_info_list[i])
    # print('total appearances', total_session)

# Total goals
    total_goals = 0
    for i in range(4, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_goals = total_goals + int(detail_info_list[i])
    # print('total goals', total_goals)

# Total assists
    total_assist = 0
    for i in range(5, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_assist = total_assist + int(detail_info_list[i])
    # print('total assists', total_assist)

# Total yellow cards
    total_yellow_card = 0
    for i in range(6, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_yellow_card = total_yellow_card + int(detail_info_list[i])
    # print('total yellow cards', total_yellow_card)

# Total red cards
    total_red_card = 0
    for i in range(7, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_red_card = total_red_card + int(detail_info_list[i])
    # print('total red cards', total_red_card)


# Overall rating
    average = 0
    speed = 0
    power = 0
    guard = 0
    dribbling = 0
    passing = 0
    shooting = 0
    grade_average = soup.find('p', attrs={'class': 'average'})
    if grade_average is not None:
        con4 = etree.HTML(str(grade_average))
        average = con4.xpath("//p//b/text()")
        average = int(average[0])
        # print('overall rating', average)
# Detailed ratings
    grade_detail_div = soup.find('div', attrs={'class': 'box_chart'})
    if grade_detail_div is not None:
        con5 = etree.HTML(str(grade_detail_div))
        grade_detail = con5.xpath("//div//span/text()")
    # Speed
        speed = int(grade_detail[0])
    # Strength
        power = int(grade_detail[1])
    # Defending
        guard = int(grade_detail[2])
    # Dribbling
        dribbling = int(grade_detail[3])
    # Passing
        passing = int(grade_detail[4])
    # Shooting
        shooting = int(grade_detail[5])


# Append one row to the CSV file
    csv.writer(f).writerow([name, club, country, height, location, age, weight, number, birth, foot, years, total_session,
                            total_goals, total_assist, total_yellow_card, total_red_card, average, speed, power,
                            guard, dribbling, passing, shooting])

# Fetch the HTML of a given URL
def askURL(url):
    head = {
        # Pretend to be a normal browser so the Dongqiudi server does not reject the request
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29"
    }

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html



f = open("足球运动员.csv", mode="a", encoding='utf-8', newline='')
# Header row: uncomment and run once when the file is first created.
# csv.writer(f).writerow(["姓名","俱乐部","国籍","身高(CM)","位置","年龄(岁)","体重(KG)","号码","生日","惯用脚","职业生涯(年)",
#                             "累计出场","累计进球","累计助攻","累计黄牌","累计红牌","综合能力","速度","力量","防守","盘带","传球","射门"])
for num in range(50184113, 50184150):
    print(num)
    soup = checkHtml(num)
    if soup != 'none':
        getData(soup)
f.close()




Screenshot of the results:

It took four or five hours and collected a little over 30,000 records in total.
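Since the point of the project is to analyze the data afterwards, here is a minimal sketch of loading the CSV with pandas (it assumes the commented-out header row in the script above was written when the file was first created):

import pandas as pd

# Load the scraped CSV; column names come from the header row written once at creation time.
df = pd.read_csv("足球运动员.csv", encoding="utf-8")
print(df.shape)   # number of players and columns scraped
print(df.head())  # quick sanity check of the first few rows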

I have also put the source code and the .txt output on Gitee (lzk); here is my repository: https://gitee.com/lizengkunnb666/lzk.git

This is the first time I have tried writing a slightly larger crawler, so it is still fairly rough.
