Python项目：利用爬虫爬取TFTS1-S10各费卡数据，用excel统计，并进行数据可视化

__渺如星辰

已于 2024-01-10 18:40:19 修改

阅读量396

点赞数 10

文章标签： python 爬虫 excel

于 2023-12-24 17:31:19 首次发布

本文链接：https://blog.csdn.net/m0_67673651/article/details/135142110

版权

实现效果：S1-S10 二费卡_哔哩哔哩_bilibili

1.所需库

1.requests

pip install requests

2.beautifulsoup4

pip install beautifulsoup4

3.openpyxl

pip install openpyxl

4.pillow

pip install pillow

2.爬取TFTS1-S10费卡数据

从 https://lolchess.gg/champions/set 爬取数据：在该地址后拼接赛季编号即可，例如 set1 代表第一赛季，以此类推（set3.5 等半赛季同理）。

在S盘创建‘The editing resources’文件夹，或更换代码中的path路径。

S10出现的akali名字中包含'\'，需要进行特判。

#! python3
# crawler_s1-s10.py - downloads s1-s10 TFT champions image.
import requests, os, bs4
from pathlib import Path

urlbase = 'https://lolchess.gg/champions/set'

# download images.
def search_and_download(i, url):
    """Download every champion portrait listed on one set page.

    Each image is written to
    ``S:/The editing resources/S1-S10_cost<cost>/<i>/<name>.png``; the
    target directory is created when it does not exist yet.

    Args:
        i: Set identifier used as the sub-directory name (e.g. ``1`` or ``3.5``).
        url: Address of the champion list page for that set.

    Raises:
        requests.HTTPError: If the page or one of the image requests fails.
    """
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # The three element lists share one container selector.
    listSelector = ('#wrapper > div.container-full.mt-4.px-0.px-md-3 div > '
                    'div.col-xl-3.mb-3 > div > div.guide-champion-list__content ')
    imageElems = soup.select(listSelector + 'div > img')
    costElems = soup.select(listSelector + 'div > span')
    nameElems = soup.select(listSelector + 'a')
    for imageElem, costElem, nameElem in zip(imageElems, costElems, nameElems):
        imageUrl = 'https:' + imageElem.get('src')
        # The first character of the cost label is dropped; the rest is the number.
        coststr = costElem.getText()[1:]
        name = nameElem.get('data-keyword')
        # Special case: every akali variant is stored under the plain name.
        if 'akali' in name:
            name = 'akali'
        name = name + '.png'
        # Costs above 5 are filed under the cost-5 directory.
        if int(coststr) > 5:
            coststr = str(5)
        imageRes = requests.get(imageUrl)
        imageRes.raise_for_status()
        path = Path('S:/The editing resources', 'S1-S10_cost' + coststr) / str(i) / name
        path.parent.mkdir(parents=True, exist_ok=True)
        # The context manager closes the file even if a write fails.
        with open(path, 'wb') as imageFile:
            for data in imageRes.iter_content(100000):
                imageFile.write(data)

path = Path('S:/The editing resources')
# Create one directory per cost tier (1-5) and, inside it, one per set.
for cost in range(1, 6):
    # str() is required here: 'S1-S10_cost' + <int> raises TypeError.
    costDir = 'S1-S10_cost' + str(cost)
    newpath = path / costDir
    for setIndex in range(1, 11):
        # exist_ok lets the script be re-run without failing on existing folders.
        os.makedirs(newpath / str(setIndex), exist_ok=True)
        # Sets 3 to 9 also get a half-set directory (3.5, 4.5, ...).
        if 3 <= setIndex < 10:
            os.makedirs(newpath / str(setIndex + 0.5), exist_ok=True)

# Crawl every set page; i is the set number.
for i in range(1, 11):
    search_and_download(i, urlbase + str(i))
    # From set 3 up to set 9 there is also a x.5 page.
    if 3 <= i < 10:
        search_and_download(i + 0.5, urlbase + str(i + 0.5))

3.调整爬取图片的大小和命名格式

将爬取的图片大小都调整为[120,120]，并去除图片名字中的后缀，名字全部转换为小写。

#! python3
# imageNameSize.py - Uniform picture size and remove image name suffix, use all picture names in lower case

import os
from pathlib import Path
from PIL import Image
import shutil

# Normalise every downloaded image: lower-case file name, 120x120 pixels,
# and no '_suffix' part in the name.
p_base = Path('S:/The editing resources')
for cost in range(1, 6):
    # NOTE(review): cost tier 1 is skipped - presumably it was processed
    # separately; confirm before relying on this script for all tiers.
    if cost == 1:
        continue
    p = p_base / ('S1-S10_cost' + str(cost))
    for dirIndex in os.listdir(p):
        imageDir = p / dirIndex
        if not imageDir.is_dir():
            continue
        # Work inside the set directory so bare file names resolve.
        os.chdir(imageDir)
        # Pass 1: use lower-case names for all pictures.
        for imageIndex in os.listdir(imageDir):
            newImageIndex = imageIndex.lower()
            # Nothing to rename when the name is already lower-case.
            if newImageIndex != imageIndex:
                shutil.move(imageIndex, newImageIndex)
        # Pass 2: resize to 120x120 and drop everything after the first '_'.
        for imageIndex in os.listdir(imageDir):
            # Close the source handle before the same file is overwritten.
            with Image.open(imageIndex) as image:
                newimg = image.resize((120, 120))
            newimg.save(imageIndex)
            strlist = imageIndex.split('_', 1)
            if len(strlist) == 2:
                shutil.move(imageIndex, strlist[0] + '.png')

4.爬取英雄中文和英文名字

直接从攻略中心-英雄联盟官方网站-腾讯游戏进行爬取会爬不到元素，所以先将其网页进行下载。

将名字按英文-中文存储为键值对作为字典存储在name.py文件中。

注意字符集utf-8，将文件都以utf-8进行编码和解码。

#! python3
# crawlerName.py - downloads champions name.
import os, bs4, pprint, requests

# Source page of the champion list; it is parsed from a locally saved copy
# below rather than requested from this address.
url = 'https://101.qq.com/#/hero'
with open('攻略中心-英雄联盟官方网站-腾讯游戏.html', encoding='utf-8') as file:
    soup = bs4.BeautifulSoup(file.read(), 'html.parser')
# Both element lists live under the same <ul>.
listSelector = '#app > div > div.app-main > div > div.app-main-container.infomation-overview > ul '
enNameElems = soup.select(listSelector + 'div > div > img')
chNameElems = soup.select(listSelector + 'div > p')
# Lower-cased English name (the image alt text) -> Chinese name.
name = {
    enElem.get('alt').lower(): chElem.getText()
    for enElem, chElem in zip(enNameElems, chNameElems)
}
# Store the mapping as an importable module: name.py defines ``name``.
with open('name.py', 'w', encoding='utf-8') as fileObj:
    fileObj.write('name = ' + pprint.pformat(name) + '\n')

5.按照图片名字和爬取的名字进行统计并写入Excel

根据图片的名字统计各个赛季出现次数，并进行累加。

注意特判'vi','viktor','viego','sivir','anivia'。

打印未出现在爬取字典中的名字，并用其英文名计入excel中英文名。

#! python3
# excel.py - Update excel according to the picture name
import openpyxl, os
from pathlib import Path
from openpyxl.utils import get_column_letter
import name, string

def delete_extra_zero(n):
    """Return *n* as an int when it is a whole number, otherwise unchanged.

    Args:
        n: An int or float, e.g. ``3.0`` or ``3.5``.

    Returns:
        ``n`` itself for ints, ``int(n)`` for floats with no fractional
        part (``3.0`` -> ``3``), the float unchanged otherwise
        (``3.5`` -> ``3.5``), and ``None`` for any other type.
    """
    if isinstance(n, int):
        return n
    if isinstance(n, float):
        # float.is_integer() avoids stripping zeros from the string form,
        # which corrupted exponent notation (1e-10 became 0.1).
        return int(n) if n.is_integer() else n
    return None

# Build one workbook per cost tier. Each row is a champion (column A: Chinese
# name, column B: English name) and each set column holds a running count
# carried over from the previous set column.
p_base = Path('S:/The editing resources')
for cost in range(1,6):
	# NOTE(review): cost tier 1 is skipped - presumably deliberate; confirm.
	if cost == 1:
		continue
	s = 'S1-S10_cost' + str(cost)
	p = p_base / s
	# Change the current working directory to this tier's folder;
	# the workbook is saved with a relative name at the end.
	os.chdir(p)
	xl = openpyxl.Workbook()
	sheet = xl['Sheet']
	sheet['B1'] = 'name'
	sheet['A1'] = '名称'
	columnName = 3
	# Header row: one column per set starting at column 3, plus an extra
	# half-set column (S3.5 ... S9.5) after each of sets 3 to 9.
	for setNum in range(1, 11):
		columnLetter = get_column_letter(columnName)
		columnName = columnName + 1
		sheet[columnLetter + '1'] = 'S' + str(setNum)
		if setNum >= 3 and setNum < 10:
			setNum = setNum + 0.5
			columnLetter = get_column_letter(columnName)
			columnName = columnName + 1
			sheet[columnLetter + '1'] = 'S' + str(setNum)
	nowColumn = 2
	# Walk the set directories in order: 1, 2, 3, then in steps of 0.5.
	dirIndex = 0
	while dirIndex <= 10:
		if dirIndex <= 2:
			dirIndex = dirIndex + 1
		else:
			dirIndex = dirIndex + 0.5
		# Turn 4.0 into 4 so str(dirIndex) matches the directory name.
		dirIndex = delete_extra_zero(dirIndex)
		imageDir = p / str(dirIndex)
		# Only existing directories advance the column (10.5 is reached
		# by the loop as well and is only processed if such a folder exists).
		if imageDir.is_dir():
			nowColumn = nowColumn + 1
			# Carry the previous column's values forward so counts accumulate.
			for rowNum in range(2, sheet.max_row + 1):
				sheet.cell(row=rowNum, column=nowColumn).value = sheet.cell(row=rowNum, column=nowColumn-1).value
			# Visit every file in the set directory.
			for imageIndex in os.listdir(imageDir):
				enName = Path(imageDir / imageIndex).stem
				# Strip every ASCII punctuation character from the name.
				for i in string.punctuation:
					enName = enName.replace(i, '')
				enName = enName.lower()
				# Map the file name onto a key of name.name that it contains.
				# The five names tested below are left untouched (the loop
				# exits on its first iteration for them). enName is reassigned
				# inside the loop, so later keys are tested against the
				# already replaced name.
				for en,ch in name.name.items():
					if enName == 'viktor' or enName == 'vi' or enName == 'viego' or enName == 'sivir' or enName == 'anivia':
						break
					if en in enName:
						enName = en
				# Determine whether a new row is required
				flag = False
				# Skip the header row and look for this champion in column 2.
				for rowNum in range(2, sheet.max_row + 1):
					if enName == sheet.cell(row=rowNum, column=2).value:
						# Previous column's value plus one.
						# NOTE(review): when nowColumn is 3 the previous column
						# is column 2 (the name), so int() would receive a
						# string here - verify this cannot happen in practice.
						sheet.cell(row=rowNum, column=nowColumn).value = int(sheet.cell(row=rowNum, column=nowColumn-1).value) + 1
						flag = True
						break
				if not flag:
					# No row yet: append one. flag2 records whether an entry
					# of name.name supplied the names.
					flag2 = False
					newRow = sheet.max_row+1
					for en,ch in name.name.items():
						if en == enName:
							# Exact dictionary hit: write both names and stop.
							sheet.cell(row=newRow, column=2).value = en 
							# presumably sheet.max_row now equals newRow, so
							# the cells below land in the same row - TODO
							# confirm against openpyxl's max_row behaviour.
							sheet.cell(row=sheet.max_row, column=1).value = ch
							# Zero-fill from column 3 up to, but not including,
							# the last column.
							for beforeColumn in range(3, sheet.max_column):
								sheet.cell(row=sheet.max_row, column=beforeColumn).value = 0
							sheet.cell(row=sheet.max_row, column=nowColumn).value = 1
							flag2 = True
							break
						if en in enName:
							# Substring hit: same writes, but no break, so a
							# later key can overwrite these cells.
							sheet.cell(row=newRow, column=2).value = en 
							sheet.cell(row=sheet.max_row, column=1).value = ch
							for beforeColumn in range(3, sheet.max_column):
								sheet.cell(row=sheet.max_row, column=beforeColumn).value = 0
							sheet.cell(row=sheet.max_row, column=nowColumn).value = 1
							flag2 = True
					if not flag2:
						# Not in the dictionary: use the English name in both
						# name columns and report it on stdout.
						sheet.cell(row=newRow, column=2).value = enName
						sheet.cell(row=sheet.max_row, column=1).value = enName
						for beforeColumn in range(3, sheet.max_column):
							sheet.cell(row=sheet.max_row, column=beforeColumn).value = 0
						sheet.cell(row=sheet.max_row, column=nowColumn).value = 1
						print(enName + ' new cham ')
	# Set column width: 12 for the English name column, 5 for each set column.
	sheet.column_dimensions[get_column_letter(2)].width = 12
	for columnIndex in range(3, sheet.max_column + 1):
		sheet.column_dimensions[get_column_letter(columnIndex)].width = 5
	# Saved relative to the working directory set by os.chdir(p) above.
	xl.save('cost' + str(cost) + '.xlsx')

6.统计所有费卡在各个赛季的出场次数

在爬取的数据中先根据赛季再根据cost进行遍历。

#! python3
# excel_all.py - Statistical s1-s10 appearances
import openpyxl, os
from pathlib import Path
from openpyxl.utils import get_column_letter
import name, string

def delete_extra_zero(n):
    """Return *n* as an int when it is a whole number, otherwise unchanged.

    Args:
        n: An int or float, e.g. ``3.0`` or ``3.5``.

    Returns:
        ``n`` itself for ints, ``int(n)`` for floats with no fractional
        part (``3.0`` -> ``3``), the float unchanged otherwise
        (``3.5`` -> ``3.5``), and ``None`` for any other type.
    """
    if isinstance(n, int):
        return n
    if isinstance(n, float):
        # float.is_integer() avoids stripping zeros from the string form,
        # which corrupted exponent notation (1e-10 became 0.1).
        return int(n) if n.is_integer() else n
    return None

# Build a single workbook covering all cost tiers. Each row is a champion
# (column A: Chinese name, column B: English name) and each set column holds
# a running count carried over from the previous set column.
p_base = Path('S:/The editing resources')
# Change the current working directory; the workbook is saved there with a
# relative name at the end of the script.
os.chdir(p_base / 'S1-S10_出场次数')
xl = openpyxl.Workbook()
sheet = xl['Sheet']
sheet['B1'] = 'name'
sheet['A1'] = '名称'
columnName = 3
# Header row: one column per set starting at column 3, plus an extra
# half-set column (S3.5 ... S9.5) after each of sets 3 to 9.
for setNum in range(1, 11):
	columnLetter = get_column_letter(columnName)
	columnName = columnName + 1
	sheet[columnLetter + '1'] = 'S' + str(setNum)
	if setNum >= 3 and setNum < 10:
		setNum = setNum + 0.5
		columnLetter = get_column_letter(columnName)
		columnName = columnName + 1
		sheet[columnLetter + '1'] = 'S' + str(setNum)
nowColumn = 2
# Walk the sets in order: 1, 2, 3, then in steps of 0.5 up to 10.
setNum = 0
while setNum <= 10:
	if setNum <= 2:
		setNum = setNum + 1
	else:
		setNum = setNum + 0.5
	# Turn 4.0 into 4 so str(setNum) matches the directory name.
	setNum = delete_extra_zero(setNum)
	# The step after 10 yields 10.5; stop before processing it.
	if setNum > 10:
		break
	nowColumn = nowColumn + 1
	# Carry the previous column's values forward so counts accumulate.
	for rowNum in range(2, sheet.max_row + 1):
		sheet.cell(row=rowNum, column=nowColumn).value = sheet.cell(row=rowNum, column=nowColumn-1).value
	# Every cost tier contributes to the same set column.
	for costNum in range(1, 6):
		imageDir = p_base / ('S1-S10_cost' + str(costNum)) / str(setNum)
		if imageDir.is_dir():
			for imageIndex in os.listdir(imageDir):
				# Skip files that are not champion images: '<set>.png' and
				# any '.psd' file.
				if imageIndex == (str(setNum) + '.png') or imageIndex.endswith('.psd'):
					continue
				enName = Path(imageDir / imageIndex).stem
				# Strip every ASCII punctuation character from the name.
				for i in string.punctuation:
					enName = enName.replace(i, '')
				enName = enName.lower()
				# Map the file name onto a key of name.name that it contains.
				# The five names tested below are left untouched (the loop
				# exits on its first iteration for them). enName is reassigned
				# inside the loop, so later keys are tested against the
				# already replaced name.
				for en,ch in name.name.items():
					if enName == 'viktor' or enName == 'vi' or enName == 'viego' or enName == 'sivir' or enName == 'anivia':
						break
					if en in enName:
						enName = en
				# Determine whether a new row is required
				flag = False
				# Skip the header row and look for this champion in column 2.
				for rowNum in range(2, sheet.max_row + 1):
					if enName == sheet.cell(row=rowNum, column=2).value:
						# Previous column's value plus one.
						# NOTE(review): when nowColumn is 3 the previous column
						# is column 2 (the name), so int() would receive a
						# string here - verify this cannot happen in practice.
						sheet.cell(row=rowNum, column=nowColumn).value = int(sheet.cell(row=rowNum, column=nowColumn-1).value) + 1
						flag = True
						break
				if not flag:
					# No row yet: append one. flag2 records whether an entry
					# of name.name supplied the names.
					flag2 = False
					newRow = sheet.max_row+1
					for en,ch in name.name.items():
						if en == enName:
							# Exact dictionary hit: write both names and stop.
							sheet.cell(row=newRow, column=2).value = en 
							# presumably sheet.max_row now equals newRow, so
							# the cells below land in the same row - TODO
							# confirm against openpyxl's max_row behaviour.
							sheet.cell(row=sheet.max_row, column=1).value = ch
							# Zero-fill from column 3 up to, but not including,
							# the last column.
							for beforeColumn in range(3, sheet.max_column):
								sheet.cell(row=sheet.max_row, column=beforeColumn).value = 0
							sheet.cell(row=sheet.max_row, column=nowColumn).value = 1
							flag2 = True
							break
						if en in enName:
							# Substring hit: same writes, but no break, so a
							# later key can overwrite these cells.
							sheet.cell(row=newRow, column=2).value = en 
							sheet.cell(row=sheet.max_row, column=1).value = ch
							for beforeColumn in range(3, sheet.max_column):
								sheet.cell(row=sheet.max_row, column=beforeColumn).value = 0
							sheet.cell(row=sheet.max_row, column=nowColumn).value = 1
							flag2 = True
					if not flag2:
						# Not in the dictionary: use the English name in both
						# name columns and report it on stdout.
						sheet.cell(row=newRow, column=2).value = enName
						sheet.cell(row=sheet.max_row, column=1).value = enName
						for beforeColumn in range(3, sheet.max_column):
							sheet.cell(row=sheet.max_row, column=beforeColumn).value = 0
						sheet.cell(row=sheet.max_row, column=nowColumn).value = 1
						print(enName + ' new cham ')


# Saved relative to the working directory set by os.chdir above.
xl.save('S1-S10_出场次数.xlsx')

7.数据可视化

利用flourish进行数据可视化，导入excel文件，加载图片。

__渺如星辰

关注

10
点赞
踩
7

收藏

觉得还不错? 一键收藏
1
评论
Python项目：利用爬虫爬取TFTS1-S10各费卡数据，用excel统计，并进行数据可视化

将爬取的图片大小都调整为[120,120]，并去除图片名字中的后缀，名字全部转换为小写。注意特判'vi','viktor','viego','sivir','anivia'。打印未出现在爬取字典中的名字，并用其英文名计入excel中英文名。利用flourish进行数据可视化，导入excel文件，加载图片。注意字符集utf-8，将文件都以utf-8进行编码和解码。S10出现的akali名字中包含'\'，需要进行特判。根据图片的名字统计各个赛季出现次数，并进行累加。爬取数据，set1代表第一赛季，以此类推。
复制链接

扫一扫