# Python scraper for 2018 FIFA World Cup group-stage match statistics (per-match PDF reports)

from bs4 import BeautifulSoup
import re
import urllib
import urllib.request;
import sys
import io
import os
import time

def getFile(url):
    """Download the file at *url* into the current working directory.

    The local file name is the last path segment of the URL with any
    query string stripped.  The request uses the module-level ``headers``
    dict (browser User-Agent).  An HTTP error (e.g. 404) is reported and
    skipped instead of raised, so the caller's download loop keeps going.
    """
    file_name = url.split('?')[0].split('/')[-1]
    try:
        request = urllib.request.Request(url=url, headers=headers)
        u = urllib.request.urlopen(request)
    except urllib.error.HTTPError:
        print(url, "url file not found")
        return

    # Close the response as well as the file (the original leaked the
    # HTTP connection); stream to disk in 8 KiB chunks.
    with u, open(file_name, 'wb') as f:
        while True:
            buffer = u.read(8192)
            if not buffer:
                break
            f.write(buffer)
        

fifaUrl = "https://www.fifa.com"
url = "https://www.fifa.com/worldcup/matches/#groupphase"

# Pretend to be a desktop Firefox so fifa.com serves the normal page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

# Everything is downloaded under ./download; create it if missing and
# make it the working directory.
os.makedirs('download', exist_ok=True)
os.chdir('download')

# Fetch the group-phase match list page and parse it.
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
data = response.read().decode(encoding='utf-8', errors='strict')

soup = BeautifulSoup(data, "html.parser")

# One <div class="fi-mu-list "> per match day (note the trailing space
# in the class string, matching the site's markup).
lists = soup.find_all('div', 'fi-mu-list ')

root = os.getcwd()
# Each entry in `lists` groups the matches played on a single date.
# Renamed from `list`/`time` to stop shadowing the builtin and the
# imported `time` module.
for match_list in lists:
    match_date = match_list.get('data-matchesdate')
    os.chdir(root)
    if not os.path.exists(match_date):
        os.mkdir(match_date)
    # Always enter the date directory.  The original only chdir'd when
    # the directory was freshly created, so on a re-run `timedir` stayed
    # at the root and files landed in the wrong place.
    os.chdir(match_date)
    timedir = os.getcwd()
    for match in match_list.find_all('a', 'fi-mu__link'):
        home = match.find('div', 'home').find('span', 'fi-t__nText')
        away = match.find('div', 'away').find('span', 'fi-t__nText')
        score = match.find('span', 'fi-s__scoreText')

        # Directory name like "France  4-3  Argentina"; the raw score
        # text is padded with CR/LF/spaces that must be stripped.
        score_text = score.string.replace('\r', '').replace('\n', '').replace(' ', '')
        result = home.string + "  " + score_text + "  " + away.string

        os.chdir(timedir)
        # Skipping matches whose directory already exists acts as a
        # resume mechanism: interrupted runs don't re-download.
        if not os.path.exists(result):
            os.mkdir(result)
            os.chdir(result)

            # The "_matchfacts" library page lists the per-match PDFs
            # (heat maps, passing distribution, line-ups, ...).
            href = fifaUrl + match.get('href') + "_libraries/_matchfacts?qs=1"
            request = urllib.request.Request(url=href, headers=headers)
            response = urllib.request.urlopen(request)
            data = response.read().decode(encoding='utf-8', errors='strict')
            # Local name: don't clobber the module-level `soup`.
            match_soup = BeautifulSoup(data, "html.parser")

            ullist = match_soup.find('ul', 'fi-doclist').find_all('a')
            for doc in ullist:
                pdfUrl = doc.get('href')
                getFile(pdfUrl)

FIFA官网地址: https://www.fifa.com/worldcup/

每场比赛爬取内容:

  • Players Heat Map
  • Passing Distribution
  • Tracking Statistics
  • Tracking Statistics
  • Player Statistics
  • Player Statistics
  • Actual Formation
  • Match Report
  • Half-time
  • Tactical line-up
  • Line-ups


  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
好的,以下是一个示例代码,展示了如何使用 Python 爬取2018黄蜂的台风数据并进行路径可视化处理: ```python import requests from bs4 import BeautifulSoup import pandas as pd import matplotlib.pyplot as plt import cartopy.crs as ccrs # 爬取数据 url = 'http://typhoon.nmc.cn/weatherservice/typhoon/jsons/list_2018.json' response = requests.get(url) data = response.json() # 解析数据 typhoon = None for item in data: if item['name'] == 'Huangjian': typhoon = item break if typhoon is None: print('未找到指定台风数据!') exit() # 转换为 DataFrame 格式 df = pd.DataFrame(typhoon['data'], columns=['time', 'lat', 'lon', 'wind', 'pressure']) df['time'] = pd.to_datetime(df['time']) # 转换日期格式 df['lat'] = pd.to_numeric(df['lat']) # 转换纬度格式 df['lon'] = pd.to_numeric(df['lon']) # 转换经度格式 # 绘制地图 fig = plt.figure(figsize=(10, 10)) ax = plt.axes(projection=ccrs.PlateCarree()) ax.stock_img() ax.coastlines() # 绘制路径 ax.plot(df['lon'], df['lat'], label='Huangjian') # 添加图例 ax.legend(loc='upper left', fontsize='large') # 显示图形 plt.show() ``` 这段代码会爬取2018所有台风的数据,并从筛选出名称为“Huangjian”的台风数据。然后将该台风的路径绘制在地图上。需要注意的是,这里使用了 cartopy 库进行地图投影和绘制,需要提前安装。另外,由于数据来源可能会有更新或者变动,这段代码的运行结果可能与实际情况略有出入。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值