1.所需库
1.requests
pip install requests
2.beautifulsoup4
pip install beautifulsoup4
3.openpyxl
pip install openpyxl
4.pillow
pip install pillow
2.爬取TFTS1-S10费卡数据
从https://lolchess.gg/champions/set爬取数据,set1代表赛季一,以此类推。
在S盘创建‘The editing resources’文件夹,或更换path路径。
S10出现的akali名字中包含'\',需要进行特判。
#! python3
# crawler_s1-s10.py - downloads s1-s10 TFT champions image.
import requests, os, bs4
from pathlib import Path
# Base URL of the champion listing pages; the set number is appended to it.
urlbase = 'https://lolchess.gg/champions/set'
# download images.
def search_and_download(i, url):
    """Download every champion image listed on one set page.

    The page at ``url`` is fetched and parsed; each champion image is saved
    as ``S:/The editing resources/S1-S10_cost<cost>/<i>/<name>.png``.

    Args:
        i: Set identifier (e.g. ``1`` or ``3.5``); ``str(i)`` names the
            per-set directory.
        url: Full URL of the set's champion listing page.

    Raises:
        requests.HTTPError: If the page request or any image request
            returns an error status (raised by ``raise_for_status``).
    """
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # The three selectors share the same path down to the champion list.
    listSelector = ('#wrapper > div.container-full.mt-4.px-0.px-md-3 div > '
                    'div.col-xl-3.mb-3 > div > div.guide-champion-list__content ')
    imageElems = soup.select(listSelector + 'div > img')
    costElems = soup.select(listSelector + 'div > span')
    nameElems = soup.select(listSelector + 'a')

    for imageElem, costElem, nameElem in zip(imageElems, costElems, nameElems):
        # The src attribute is concatenated after the scheme.
        imageUrl = 'https:' + imageElem.get('src')
        # Drop the first character of the cost label; the rest is parsed as
        # an integer below.
        coststr = costElem.getText()[1:]
        name = nameElem.get('data-keyword')
        # Special case: any keyword containing 'akali' is saved as plain
        # 'akali' (presumably because the raw keyword is not a valid file
        # name - see the notes accompanying this script).
        if 'akali' in name:
            name = 'akali'
        name = name + '.png'
        # Costs above 5 are filed under the cost-5 directory.
        if int(coststr) > 5:
            coststr = str(5)

        imageRes = requests.get(imageUrl)
        imageRes.raise_for_status()

        path = Path('S:/The editing resources', 'S1-S10_cost' + coststr)
        path = path / str(i) / name
        # Make sure the target directory exists so the function also works
        # when the directory tree has not been created beforehand.
        path.parent.mkdir(parents=True, exist_ok=True)
        # 'with' guarantees the file is closed even if a write fails.
        with open(path, 'wb') as imageFile:
            for data in imageRes.iter_content(100000):
                imageFile.write(data)
path = Path('S:/The editing resources')
# Create the <cost>/<set> directory tree that search_and_download() writes to.
for cost in range(1, 6):
    # str() is required here: concatenating the int directly raises TypeError.
    costDir = 'S1-S10_cost' + str(cost)
    newpath = path / costDir
    for i in range(1, 11):
        # exist_ok lets the script be re-run when the directories already exist.
        os.makedirs(newpath / str(i), exist_ok=True)
        # Sets 3 through 9 additionally have a half-numbered set (3.5 ... 9.5).
        if 3 <= i < 10:
            os.makedirs(newpath / str(i + 0.5), exist_ok=True)

# Crawl every set page; i is the set number.
for i in range(1, 11):
    search_and_download(i, urlbase + str(i))
    # Sets 3 through 9 are each followed by their half-numbered page.
    if 3 <= i < 10:
        search_and_download(i + 0.5, urlbase + str(i + 0.5))
3.调整爬取图片的大小和命名格式
将爬取的图片大小都调整为[120,120],并去除图片名字中的后缀,名字全部转换为小写。
#! python3
# imageNameSize.py - Uniform picture size and remove image name suffix, use all picture names in lower case
import os
from pathlib import Path
from PIL import Image
import shutil
p_base = Path('S:/The editing resources')
for cost in range(1, 6):
    # NOTE(review): cost 1 is skipped, although the accompanying notes say
    # all crawled images are processed - confirm whether this is intended.
    if cost == 1:
        continue
    p = p_base / ('S1-S10_cost' + str(cost))
    for dirIndex in os.listdir(p):
        imageDir = p / dirIndex
        if not imageDir.is_dir():
            continue
        # The relative file names used below are resolved against imageDir.
        os.chdir(imageDir)

        # Pass 1: convert every file name to lower case.
        for imageIndex in os.listdir(imageDir):
            newImageIndex = imageIndex.lower()
            if newImageIndex != imageIndex:
                shutil.move(imageIndex, newImageIndex)

        # Pass 2: resize to 120x120 and strip the '_...' name suffix.
        for imageIndex in os.listdir(imageDir):
            # Close the source handle before the same file is overwritten
            # and possibly renamed; resize() returns an independent image.
            with Image.open(imageIndex) as image:
                newimg = image.resize((120, 120))
            newimg.save(imageIndex)
            # Keep only the part before the first '_' and save it as .png.
            strlist = imageIndex.split('_', 1)
            if len(strlist) == 2:
                shutil.move(imageIndex, strlist[0] + '.png')
4.爬取英雄中文和英文名字
直接从攻略中心-英雄联盟官方网站-腾讯游戏进行爬取会爬不到元素,所以先将其网页进行下载。
将名字按英文-中文存储为键值对作为字典存储在name.py文件中。
注意字符集utf-8,将文件都以utf-8进行编码和解码。
#! python3
# crawlerName.py - downloads champions name.
import os, bs4, pprint, requests
# Address of the hero page; it is not requested by this script, which reads
# a locally saved copy of the page instead.
url = 'https://101.qq.com/#/hero'
# 'with' closes the saved page as soon as it has been parsed.
with open('攻略中心-英雄联盟官方网站-腾讯游戏.html', encoding='utf-8') as file:
    soup = bs4.BeautifulSoup(file.read(), 'html.parser')

# Both selectors share the same path down to the hero list.
listSelector = '#app > div > div.app-main > div > div.app-main-container.infomation-overview > ul '
enNameElems = soup.select(listSelector + 'div > div > img')
chNameElems = soup.select(listSelector + 'div > p')

# Map the lower-cased 'alt' text of each image to the text of the matching
# <p> element (English name -> Chinese name).
name = {}
for enElem, chElem in zip(enNameElems, chNameElems):
    name[enElem.get('alt').lower()] = chElem.getText()

# Store the dictionary as an importable module: name.py defines `name`.
with open('name.py', 'w', encoding='utf-8') as fileObj:
    fileObj.write('name = ' + pprint.pformat(name) + '\n')
5.按照图片名字和爬取的名字进行统计并写入Excel
根据图片的名字统计各个赛季出现次数,并进行累加。
注意特判'vi','viktor','viego','sivir','anivia'。
打印未出现在爬取字典中的名字,并将其英文名同时写入Excel的中文名列和英文名列。
#! python3
# excel.py - Update excel according to the picture name
import openpyxl, os
from pathlib import Path
from openpyxl.utils import get_column_letter
import name, string
def delete_extra_zero(n):
    """Return ``n`` with a redundant fractional part removed.

    Args:
        n: The number to normalise.

    Returns:
        ``n`` itself when it is an ``int``; an ``int`` when ``n`` is a float
        with an integral value (``4.0`` -> ``4``); the float unchanged
        otherwise (``3.5`` -> ``3.5``); ``None`` for any other type.
    """
    if isinstance(n, int):
        return n
    if isinstance(n, float):
        # is_integer() avoids a string round-trip, which corrupts floats
        # printed in exponent form (str(1e+20).rstrip('0') == '1e+2').
        return int(n) if n.is_integer() else n
    return None
p_base = Path('S:/The editing resources')

# Names that must not go through the substring matching: 'vi' is a substring
# of each of the other four.
SPECIAL_NAMES = ('viktor', 'vi', 'viego', 'sivir', 'anivia')
# Translation table that deletes every ASCII punctuation character.
STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)
# Columns 1 and 2 hold the two name columns; set columns start at column 3.
FIRST_SET_COLUMN = 3


def set_labels():
    """Return the set directory names in order: '1', '2', '3', '3.5', ..., '9.5', '10'."""
    labels = []
    for setNum in range(1, 11):
        labels.append(str(setNum))
        # Sets 3 through 9 are each followed by a half-numbered set.
        if 3 <= setNum < 10:
            labels.append(str(setNum + 0.5))
    return labels


def normalize_name(stem):
    """Turn an image file stem into the English name used as the row key.

    Punctuation is removed and the result lower-cased; then every key of
    ``name.name`` contained in it replaces it, unless it is a special name.
    """
    enName = stem.translate(STRIP_PUNCTUATION).lower()
    for en in name.name:
        # Checked on every iteration because enName may just have been
        # replaced by one of the special names.
        if enName in SPECIAL_NAMES:
            break
        if en in enName:
            enName = en
    return enName


def lookup_names(enName):
    """Return ``(english, chinese)`` for a new row, or ``None`` if unknown.

    An exact key of ``name.name`` wins; otherwise the last key (in dict
    order) that is a substring of ``enName`` is used.
    """
    if enName in name.name:
        return enName, name.name[enName]
    match = None
    for en, ch in name.name.items():
        if en in enName:
            match = (en, ch)
    return match


def record_appearance(sheet, enName, nowColumn):
    """Count one appearance of ``enName`` in the set stored in ``nowColumn``.

    An existing row gets the previous set's cumulative total plus one;
    otherwise a new row is appended with 0 in the set columns and 1 in
    ``nowColumn``.
    """
    for rowNum in range(2, sheet.max_row + 1):
        if sheet.cell(row=rowNum, column=2).value == enName:
            # The first set column has no previous total: the column to its
            # left holds the name, which int() cannot convert.
            previous = 0
            if nowColumn > FIRST_SET_COLUMN:
                previous = int(sheet.cell(row=rowNum, column=nowColumn - 1).value)
            sheet.cell(row=rowNum, column=nowColumn).value = previous + 1
            return

    names = lookup_names(enName)
    if names is None:
        # Not in the crawled dictionary: use the English name in both columns.
        names = (enName, enName)
        print(enName + ' new cham ')
    newRow = sheet.max_row + 1
    sheet.cell(row=newRow, column=2).value = names[0]
    sheet.cell(row=newRow, column=1).value = names[1]
    # Zero-fill the set columns (all but the last one); later columns are
    # overwritten when the totals are carried forward.
    for beforeColumn in range(FIRST_SET_COLUMN, sheet.max_column):
        sheet.cell(row=newRow, column=beforeColumn).value = 0
    sheet.cell(row=newRow, column=nowColumn).value = 1


labels = set_labels()
for cost in range(1, 6):
    # NOTE(review): cost 1 is skipped - confirm whether this is intended.
    if cost == 1:
        continue
    p = p_base / ('S1-S10_cost' + str(cost))
    # The workbook is saved relative to the cost directory.
    os.chdir(p)
    xl = openpyxl.Workbook()
    sheet = xl['Sheet']
    sheet['B1'] = 'name'
    sheet['A1'] = '名称'
    # Header row: one column per set, in the same order as processed below.
    for offset, label in enumerate(labels):
        sheet.cell(row=1, column=FIRST_SET_COLUMN + offset).value = 'S' + label

    for offset, label in enumerate(labels):
        # The column always advances with the set so data stays under its
        # header even when a set directory is missing.
        nowColumn = FIRST_SET_COLUMN + offset
        # Carry the cumulative totals of the previous set forward.
        if nowColumn > FIRST_SET_COLUMN:
            for rowNum in range(2, sheet.max_row + 1):
                sheet.cell(row=rowNum, column=nowColumn).value = sheet.cell(row=rowNum, column=nowColumn - 1).value
        imageDir = p / label
        if not imageDir.is_dir():
            continue
        # Every file in the set directory counts as one champion appearance.
        for imageIndex in os.listdir(imageDir):
            record_appearance(sheet, normalize_name(Path(imageIndex).stem), nowColumn)

    # Set column width
    sheet.column_dimensions[get_column_letter(2)].width = 12
    for columnIndex in range(3, sheet.max_column + 1):
        sheet.column_dimensions[get_column_letter(columnIndex)].width = 5
    xl.save('cost' + str(cost) + '.xlsx')
6.统计所有费卡在各个赛季的出场次数
在爬取的数据中先根据赛季再根据cost进行遍历。
#! python3
# excel_all.py - Statistical s1-s10 appearances
import openpyxl, os
from pathlib import Path
from openpyxl.utils import get_column_letter
import name, string
def delete_extra_zero(n):
    """Return ``n`` with a redundant fractional part removed.

    Args:
        n: The number to normalise.

    Returns:
        ``n`` itself when it is an ``int``; an ``int`` when ``n`` is a float
        with an integral value (``4.0`` -> ``4``); the float unchanged
        otherwise (``3.5`` -> ``3.5``); ``None`` for any other type.
    """
    if isinstance(n, int):
        return n
    if isinstance(n, float):
        # is_integer() avoids a string round-trip, which corrupts floats
        # printed in exponent form (str(1e+20).rstrip('0') == '1e+2').
        return int(n) if n.is_integer() else n
    return None
p_base = Path('S:/The editing resources')

# Names that must not go through the substring matching: 'vi' is a substring
# of each of the other four.
SPECIAL_NAMES = ('viktor', 'vi', 'viego', 'sivir', 'anivia')
# Translation table that deletes every ASCII punctuation character.
STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)
# Columns 1 and 2 hold the two name columns; set columns start at column 3.
FIRST_SET_COLUMN = 3


def set_labels():
    """Return the set directory names in order: '1', '2', '3', '3.5', ..., '9.5', '10'."""
    labels = []
    for setNum in range(1, 11):
        labels.append(str(setNum))
        # Sets 3 through 9 are each followed by a half-numbered set.
        if 3 <= setNum < 10:
            labels.append(str(setNum + 0.5))
    return labels


def normalize_name(stem):
    """Turn an image file stem into the English name used as the row key.

    Punctuation is removed and the result lower-cased; then every key of
    ``name.name`` contained in it replaces it, unless it is a special name.
    """
    enName = stem.translate(STRIP_PUNCTUATION).lower()
    for en in name.name:
        # Checked on every iteration because enName may just have been
        # replaced by one of the special names.
        if enName in SPECIAL_NAMES:
            break
        if en in enName:
            enName = en
    return enName


def lookup_names(enName):
    """Return ``(english, chinese)`` for a new row, or ``None`` if unknown.

    An exact key of ``name.name`` wins; otherwise the last key (in dict
    order) that is a substring of ``enName`` is used.
    """
    if enName in name.name:
        return enName, name.name[enName]
    match = None
    for en, ch in name.name.items():
        if en in enName:
            match = (en, ch)
    return match


def record_appearance(sheet, enName, nowColumn):
    """Count one appearance of ``enName`` in the set stored in ``nowColumn``.

    An existing row gets the previous set's cumulative total plus one;
    otherwise a new row is appended with 0 in the set columns and 1 in
    ``nowColumn``.
    """
    for rowNum in range(2, sheet.max_row + 1):
        if sheet.cell(row=rowNum, column=2).value == enName:
            # The first set column has no previous total: the column to its
            # left holds the name, which int() cannot convert.
            previous = 0
            if nowColumn > FIRST_SET_COLUMN:
                previous = int(sheet.cell(row=rowNum, column=nowColumn - 1).value)
            sheet.cell(row=rowNum, column=nowColumn).value = previous + 1
            return

    names = lookup_names(enName)
    if names is None:
        # Not in the crawled dictionary: use the English name in both columns.
        names = (enName, enName)
        print(enName + ' new cham ')
    newRow = sheet.max_row + 1
    sheet.cell(row=newRow, column=2).value = names[0]
    sheet.cell(row=newRow, column=1).value = names[1]
    # Zero-fill the set columns (all but the last one); later columns are
    # overwritten when the totals are carried forward.
    for beforeColumn in range(FIRST_SET_COLUMN, sheet.max_column):
        sheet.cell(row=newRow, column=beforeColumn).value = 0
    sheet.cell(row=newRow, column=nowColumn).value = 1


# The workbook is saved relative to this directory.
os.chdir(p_base / 'S1-S10_出场次数')
xl = openpyxl.Workbook()
sheet = xl['Sheet']
sheet['B1'] = 'name'
sheet['A1'] = '名称'
labels = set_labels()
# Header row: one column per set, in the same order as processed below.
for offset, label in enumerate(labels):
    sheet.cell(row=1, column=FIRST_SET_COLUMN + offset).value = 'S' + label

# Outer loop over sets, inner loop over costs, so every cost directory of a
# set contributes to the same column.
for offset, label in enumerate(labels):
    nowColumn = FIRST_SET_COLUMN + offset
    # Carry the cumulative totals of the previous set forward.
    if nowColumn > FIRST_SET_COLUMN:
        for rowNum in range(2, sheet.max_row + 1):
            sheet.cell(row=rowNum, column=nowColumn).value = sheet.cell(row=rowNum, column=nowColumn - 1).value
    for costNum in range(1, 6):
        imageDir = p_base / ('S1-S10_cost' + str(costNum)) / label
        if not imageDir.is_dir():
            continue
        for imageIndex in os.listdir(imageDir):
            # Skip non-hero files: the '<set>.png' image and any .psd file.
            if imageIndex == (label + '.png') or imageIndex.endswith('.psd'):
                continue
            record_appearance(sheet, normalize_name(Path(imageIndex).stem), nowColumn)
xl.save('S1-S10_出场次数.xlsx')
7.数据可视化
利用flourish进行数据可视化,导入excel文件,加载图片。