Python爬取2018世界杯小组赛统计数据

最新推荐文章于 2022-12-10 10:46:39 发布

alanv007

最新推荐文章于 2022-12-10 10:46:39 发布

阅读量1.9k

点赞数 1

本文链接：https://blog.csdn.net/alanv007/article/details/80885818

版权

from bs4 import BeautifulSoup
import re
import urllib
import urllib.request;
import sys
import io
import os
import time

def getFile(url):
    """Download ``url`` into the current working directory.

    The local file name is the last path segment of the URL with any query
    string removed. The request is sent with the module-level ``headers``
    dict.

    If the server answers with an HTTP error, a message is printed and the
    function returns ``None`` without creating a file. Other exceptions
    (e.g. connection failures) propagate to the caller.
    """
    file_name = url.split('?')[0].split('/')[-1]
    try:
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError:
        print(url, "url file not found")
        return

    # Manage the response as a context manager so the connection is released
    # even if writing the file fails part-way through.
    with response, open(file_name, 'wb') as f:
        # Stream in 8 KiB chunks; read() returns b'' at end of stream.
        for chunk in iter(lambda: response.read(8192), b''):
            f.write(chunk)
        

# Site root, used to turn the relative match links into absolute URLs.
fifaUrl = "https://www.fifa.com"
# Group-phase match listing page that is scraped for match links.
url = "https://www.fifa.com/worldcup/matches/#groupphase"

# Browser-like User-Agent sent with every request (also read by getFile()).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

# Everything is stored under ./download. getFile() writes into the current
# working directory, so the script navigates the tree with os.chdir().
if not os.path.exists('download'):
    os.mkdir('download')
os.chdir(os.path.join(os.getcwd(), 'download'))

request = urllib.request.Request(url=url, headers=headers)
with urllib.request.urlopen(request) as response:
    data = response.read().decode(encoding='utf-8', errors='strict')

soup = BeautifulSoup(data, "html.parser")

# NOTE(review): the class string has a trailing space; presumably it mirrors
# the exact class attribute value in the page markup -- confirm before
# changing it.
lists = soup.find_all('div', 'fi-mu-list ')

root = os.getcwd()
for day_list in lists:
    match_date = day_list.get('data-matchesdate')
    if not match_date:
        # No date attribute means no directory name to file the matches under.
        continue

    os.chdir(root)
    if not os.path.exists(match_date):
        os.mkdir(match_date)
    # Enter the date directory whether or not it was just created; otherwise
    # a re-run would create the match folders directly under the root.
    os.chdir(os.path.join(root, match_date))
    timedir = os.getcwd()

    for match in day_list.find_all('a', 'fi-mu__link'):
        home = match.find('div', 'home').find('span', 'fi-t__nText')
        away = match.find('div', 'away').find('span', 'fi-t__nText')
        score = match.find('span', 'fi-s__scoreText')

        # Directory name: "<home>  <score>  <away>", with the score text
        # stripped of CR, LF and spaces.
        result = home.string + "  " + score.string.replace('\r', '').replace('\n', '').replace(' ', '') + "  " + away.string

        os.chdir(timedir)
        if os.path.exists(result):
            # An existing match directory is treated as already processed.
            continue
        os.mkdir(result)
        os.chdir(os.path.join(timedir, result))

        # Fetch the match-facts fragment that lists the downloadable documents.
        href = fifaUrl + match.get('href') + "_libraries/_matchfacts?qs=1"
        match_request = urllib.request.Request(url=href, headers=headers)
        with urllib.request.urlopen(match_request) as match_response:
            match_data = match_response.read().decode(encoding='utf-8', errors='strict')
        match_soup = BeautifulSoup(match_data, "html.parser")

        doc_list = match_soup.find('ul', 'fi-doclist')
        if doc_list is None:
            # Keep crawling the remaining matches instead of aborting the run.
            print(href, "document list not found")
            continue

        for doc in doc_list.find_all('a'):
            pdfUrl = doc.get('href')
            getFile(pdfUrl)

FIFA官网地址: https://www.fifa.com/worldcup/

每场比赛爬取内容:

Players Heat Map
Passing Distribution
Tracking Statistics
Tracking Statistics
Player Statistics
Player Statistics
Actual Formation
Match Report
Half-time
Tactical line-up
Line-ups

alanv007

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
Python爬取2018世界杯小组赛统计数据

from bs4 import BeautifulSoup; import re; import urllib; import urllib.request; import sys; import io; import os; import time; def getFile(url): file_name = url.split('?')[0].split('/')[-1]; try: ...
复制链接

扫一扫