from bs4 import BeautifulSoup
import re
import urllib
import urllib.request;
import sys
import io
import os
import time
def getFile(url):
    """Download ``url`` into the current working directory.

    The local file name is the last path segment of ``url`` with any query
    string removed. The request is sent with the module-level ``headers``
    dict. If the server answers with an HTTP error, a message is printed and
    nothing is written.

    Args:
        url: Absolute URL of the file to fetch.

    Returns:
        None.
    """
    file_name = url.split('?')[0].split('/')[-1]
    request = urllib.request.Request(url=url, headers=headers)
    try:
        u = urllib.request.urlopen(request)
    except urllib.error.HTTPError:
        print(url, "url file not found")
        return
    # Close the HTTP response as well as the output file, even when opening
    # the file or a read/write fails part-way through.
    with u, open(file_name, 'wb') as f:
        while True:
            buffer = u.read(8192)
            if not buffer:
                break
            f.write(buffer)
# Crawl the FIFA World Cup match list and download every document linked from
# each match's "match facts" fragment into download/<match date>/<result>/.
fifaUrl = "https://www.fifa.com"
url = "https://www.fifa.com/worldcup/matches/#groupphase"
# Sent with every request, including those made by getFile().
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

os.makedirs('download', exist_ok=True)
os.chdir(os.path.join(os.getcwd(), 'download'))

request = urllib.request.Request(url=url, headers=headers)
# Close the response once the page body has been read.
with urllib.request.urlopen(request) as response:
    data = response.read().decode(encoding='utf-8', errors='strict')
soup = BeautifulSoup(data, "html.parser")
# NOTE(review): the class string carries a trailing space; confirm that it
# really matches the page markup.
lists = soup.find_all('div', 'fi-mu-list ')
root = os.getcwd()

# Loop variables are deliberately not named `list` / `time`, which would
# shadow the builtin and the imported `time` module.
for day_block in lists:
    match_date = day_block.get('data-matchesdate')
    os.chdir(root)
    if not match_date:
        # Without a date there is no directory name to file the matches under.
        print("match list without data-matchesdate, skipped")
        continue
    os.makedirs(match_date, exist_ok=True)
    os.chdir(os.path.join(os.getcwd(), match_date))
    timedir = os.getcwd()

    for match in day_block.find_all('a', 'fi-mu__link'):
        home = match.find('div', 'home').find('span', 'fi-t__nText')
        away = match.find('div', 'away').find('span', 'fi-t__nText')
        score = match.find('span', 'fi-s__scoreText')
        # Directory name: "<home> <score with CR, LF and spaces removed> <away>".
        result = home.string + " " + score.string.replace('\r','').replace('\n','').replace(' ','') + " " + away.string

        # getFile() writes into the current working directory, so move into
        # this match's own directory before downloading.
        os.chdir(timedir)
        os.makedirs(result, exist_ok=True)
        os.chdir(os.path.join(os.getcwd(), result))

        href = fifaUrl + match.get('href') + "_libraries/_matchfacts?qs=1"
        request = urllib.request.Request(url=href, headers=headers)
        with urllib.request.urlopen(request) as response:
            data = response.read().decode(encoding='utf-8', errors='strict')
        soup = BeautifulSoup(data, "html.parser")

        doc_list = soup.find('ul', 'fi-doclist')
        if doc_list is None:
            # No document list in this fragment: skip this match instead of
            # aborting the whole crawl with an AttributeError.
            print(href, "document list not found")
            continue
        for doc in doc_list.find_all('a'):
            pdfUrl = doc.get('href')
            if pdfUrl:
                getFile(pdfUrl)
FIFA官网地址: https://www.fifa.com/worldcup/
每场比赛爬取内容:
- Players Heat Map
- Passing Distribution
- Tracking Statistics
- Tracking Statistics
- Player Statistics
- Player Statistics
- Actual Formation
- Match Report
- Half-time
- Tactical line-up
- Line-ups