Scraping a subset of Dongqiudi player data with Python

Our final project requires us to find a dataset on Tianchi or scrape one ourselves and analyze it, so I tried writing a small Python crawler to collect part of the player data on Dongqiudi. Only part, because crawling every player would take a very long time. The code is attached below; I am a beginner, so it may not be the prettiest.

The Dongqiudi player page:

I tried it out: the index after /player/ starts at 50000001 and runs up to around 5064****, which suggests Dongqiudi holds over 600,000 player records.
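For reference, each player page URL is just this numeric ID dropped into a fixed template. A minimal sketch (the ID range here is only an example, not the full crawl):

# Build player page URLs from numeric IDs (example range only).
BASE = "https://www.dongqiudi.com/player/%s.html"
for player_id in range(50000001, 50000006):
    print(BASE % player_id)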

The fields collected are:

  1. Player name
  2. Club
  3. Nationality
  4. Height
  5. Position (some entries are coaches or referees)
  6. Age
  7. Weight
  8. Shirt number
  9. Birthday
  10. Preferred foot
  11. Career length (years)
  12. Total appearances
  13. Total goals
  14. Total assists
  15. Total yellow cards
  16. Total red cards
  17. Overall rating
  18. Speed rating
  19. Strength rating
  20. Defending rating
  21. Dribbling rating
  22. Passing rating
  23. Shooting rating

The full script:
import urllib.request
import urllib.error
import csv
from bs4 import BeautifulSoup
from lxml import etree

# Check whether a player page exists for the given ID
def checkHtml(num):
    url = "https://www.dongqiudi.com/player/%s.html" % num
    html = askURL(url)
    soup = BeautifulSoup(html, "html.parser")
    name = soup.find('p', attrs={'class': 'china-name'})
    if name is None:
        print('invalid page')
        return 'none'
    else:
        return soup

# Parse the player page and append one row to the CSV file
def getData(soup):

# Name
    name = soup.find('p', attrs={'class': 'china-name'})
    name = str(name)
    con = etree.HTML(name)
    namestr = con.xpath("//p/text()")
    name = namestr[0]
    print(name)


# Collect the detail-info list items (club, nationality, height, etc.)
    detail_list = []
    detail_info_div = soup.find('div', attrs={'class': 'detail-info'})
    detail_info_ul = detail_info_div.find_all('li')
    for each in detail_info_ul:
        detail = each.text.strip()
        detail_list.append(detail)
    # print(detail_list)

# Club
    club = str(detail_list[0]).replace('俱乐部:', '')
    # print('club', club)
# Nationality
    country = str(detail_list[1]).replace('国   籍:', '')
    # print('nationality', country)
# Height
    height = 0
    heightstr = str(detail_list[2]).replace('CM', '')
    heightstr = heightstr.replace('身   高:', '')
    if heightstr != '':
        height = int(heightstr)
    # print('height', height)
# Position
    location = str(detail_list[3]).replace('位   置:', '')
    # print('position', location)
# Age
    age = 0
    agestr = str(detail_list[4]).replace('年   龄:', '')
    agestr = agestr.replace('岁', '')
    if agestr != '':
        age = int(agestr)
    # print('age', age)
# Weight
    weight = 0
    weightstr = str(detail_list[5]).replace('体   重:', '')
    weightstr = weightstr.replace('KG', '')
    if weightstr != '':
        weight = int(weightstr)
    # print('weight', weight)
# Shirt number
    number = 0
    numberstr = str(detail_list[6]).replace('号   码:', '')
    numberstr = numberstr.replace('号', '')
    if numberstr != '':
        number = int(numberstr)
    # print('number', number)
# Birthday
    birth = str(detail_list[7]).replace('生   日:', '')
    # print(birth)
# Preferred foot
    foot = str(detail_list[8]).replace('惯用脚:', '')
    # print(foot)

# Collect the club career statistics table
    total_con_wrap_div = soup.find('div', attrs={'class': 'total-con-wrap'})
    total_con_wrap_td = str(total_con_wrap_div.find_all('p', attrs={'class': 'td'}))
    con3 = etree.HTML(total_con_wrap_td)
    detail_info_list = con3.xpath("//p//span/text()")
    detail_info_list_years = con3.xpath("//p")

# Time in the first team (years): one table row per season, minus the header row
    years = len(detail_info_list_years) - 1
    # print('career length', years)
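# The stats table flattens to 9 cells per season row; the loops below step
# through detail_info_list in strides of 9, reading offset 2 (appearances),
# 4 (goals), 5 (assists), 6 (yellow cards) and 7 (red cards), and treating
# a '~' placeholder cell as 0.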

# Total appearances
    total_session = 0
    for i in range(2, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_session = total_session + int(detail_info_list[i])
    # print('total appearances', total_session)

# Total goals
    total_goals = 0
    for i in range(4, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_goals = total_goals + int(detail_info_list[i])
    # print('total goals', total_goals)

# Total assists
    total_assist = 0
    for i in range(5, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_assist = total_assist + int(detail_info_list[i])
    # print('total assists', total_assist)

# Total yellow cards
    total_yellow_card = 0
    for i in range(6, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_yellow_card = total_yellow_card + int(detail_info_list[i])
    # print('total yellow cards', total_yellow_card)

# Total red cards
    total_red_card = 0
    for i in range(7, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_red_card = total_red_card + int(detail_info_list[i])
    # print('total red cards', total_red_card)


# Overall rating
    average = 0
    speed = 0
    power = 0
    guard = 0
    dribbling = 0
    passing = 0
    shooting = 0
    grade_average = soup.find('p', attrs={'class': 'average'})
    if grade_average is not None:
        con4 = etree.HTML(str(grade_average))
        average = con4.xpath("//p//b/text()")
        average = int(average[0])
        # print('overall rating', average)
# Detailed ratings
    grade_detail_div = soup.find('div', attrs={'class': 'box_chart'})
    if grade_detail_div is not None:
        con5 = etree.HTML(str(grade_detail_div))
        grade_detail = con5.xpath("//div//span/text()")
    # Speed
        speed = int(grade_detail[0])
    # Strength
        power = int(grade_detail[1])
    # Defending
        guard = int(grade_detail[2])
    # Dribbling
        dribbling = int(grade_detail[3])
    # Passing
        passing = int(grade_detail[4])
    # Shooting
        shooting = int(grade_detail[5])


# Append one row to the CSV file
    csv.writer(f).writerow([name, club, country, height, location, age, weight, number, birth, foot, years, total_session,
                            total_goals, total_assist, total_yellow_card, total_red_card, average, speed, power,
                            guard, dribbling, passing, shooting])

# Fetch the HTML of a given URL
def askURL(url):
    head = {
        # Pretend to be a normal browser so the Dongqiudi server does not reject the request
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29"
    }

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html



f = open("足球运动员.csv", mode="a", encoding='utf-8', newline='')
# Header row: uncomment and run once when the file is first created.
# csv.writer(f).writerow(["姓名","俱乐部","国籍","身高(CM)","位置","年龄(岁)","体重(KG)","号码","生日","惯用脚","职业生涯(年)",
#                             "累计出场","累计进球","累计助攻","累计黄牌","累计红牌","综合能力","速度","力量","防守","盘带","传球","射门"])
for num in range(50184113, 50184150):
    print(num)
    soup = checkHtml(num)
    if soup != 'none':
        getData(soup)
f.close()




Screenshot of the results:

It took four or five hours and collected a little over 30,000 records in total.
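Since the point of the project is to analyze the data afterwards, here is a minimal sketch of loading the CSV with pandas (it assumes the commented-out header row in the script above was written when the file was first created):

import pandas as pd

# Load the scraped CSV; column names come from the header row written once at creation time.
df = pd.read_csv("足球运动员.csv", encoding="utf-8")
print(df.shape)   # number of players and columns scraped
print(df.head())  # quick sanity check of the first few rows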

I have also put the source code and the .txt output on Gitee (lzk); here is my repository: https://gitee.com/lizengkunnb666/lzk.git

This is the first time I have tried writing a slightly larger crawler, so it is still fairly rough.
