I recently started learning web scraping and decided to try scraping NBA player data myself. This post is a record of my first scraper.
There is one problem I couldn't solve: when columns are aligned with the tab character "\t", very long names throw the alignment off. A tab's width depends on how many characters come before it; from what I found online, it expands to 8 - n % 8 spaces, where n is the current column position.
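To see that rule in action, here is a tiny illustration using Python's built-in str.expandtabs; it is not part of the scraper below:

# Illustration: expandtabs(8) shows how many spaces "\t" actually becomes.
for name in ['Kobe', 'Giannis Antetokounmpo']:
    print(repr((name + '\t|').expandtabs(8)))
# 'Kobe' (4 characters) is followed by 4 spaces (8 - 4 % 8),
# while a 21-character name only gets 3 (24 is the next multiple of 8),
# so columns drift apart once a name approaches a tab stop.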
The data is saved as .txt files.
Since this is my first scraper, the code feels rough and long-winded to me. I'm posting it here as-is and may try to optimize it later.
import requests
from bs4 import BeautifulSoup
import traceback
import re


def getHtml(url):
    # Fetch a page and return its text; return '' on any request error.
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''
def get_NBAplayer(title1, title2, title3, list1, list2, list3, url):
    html = getHtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    try:
        x_list_all = soup.find_all('div', attrs={'class': 'x_list'})
        x_title_all = soup.find_all('div', attrs={'class': 'x_title'})
        # Column headers: the c1/c2 spans go into title1, c3/c4 into title2, c5 into title3.
        for x in x_title_all:
            c1 = x.find_all('span', attrs={'class': 'c1'})
            c2 = x.find_all('span', attrs={'class': 'c2'})
            c3 = x.find_all('span', attrs={'class': 'c3'})
            c4 = x.find_all('span', attrs={'class': 'c4'})
            c5 = x.find_all('span', attrs={'class': 'c5'})
            for one in c1:
                if len(one.text) != 0:
                    title1.append(one.text)
            for two in c2:
                if len(two.text) != 0:
                    title1.append(two.text)
            for thr in c3:
                if len(thr.text) != 0:
                    title2.append(thr.text)
            for fou in c4:
                if len(fou.text) != 0:
                    title2.append(fou.text)
            for fiv in c5:
                if len(fiv.text) != 0:
                    title3.append(fiv.text)
        # Row data, first group: the c1 spans plus the text of each <a target="_blank"> link.
        for i in x_list_all:
            c2 = i.find_all('a', attrs={'target': '_blank'})
            c1 = i.find_all('span', attrs={'class': 'c1'})
            list_1 = []
            for one in c1:
                if len(one.text) != 0:
                    list_1.append(one.text)
            for two in c2:
                if len(two.text) != 0:
                    list_1.append(two.text.split('\n')[1])
            if len(list_1) != 0:
                list1.append(list_1)
        # Row data, second group: the c3 and c4 spans.
        for j in x_list_all:
            c3 = j.find_all('span', attrs={'class': 'c3'})
            c4 = j.find_all('span', attrs={'class': 'c4'})
            list_2 = []
            for three in c3:
                if len(three.text) != 0:
                    list_2.append(three.text)
            for four in c4:
                if len(four.text) != 0:
                    list_2.append(four.text)
            if len(list_2) != 0:
                list2.append(list_2)
        # Row data, third group: the c5 spans.
        for k in x_list_all:
            c5 = k.find_all('span', attrs={'class': 'c5'})
            list_3 = []
            for five in c5:
                if len(five.text) != 0:
                    list_3.append(five.text)
            if len(list_3) != 0:
                list3.append(list_3)
    except Exception:
        traceback.print_exc()
def printf(fpath, title1, title2, title3, list1, list2, list3):
    # Each field is left-justified to 24 characters and followed by a tab;
    # names longer than the field width push the later columns out of alignment.
    tplt = '{:<24}\t'
    with open(fpath, 'a', encoding='utf-8') as f:
        # Header row.
        for i in range(len(title1)):
            f.write(tplt.format(str(title1[i])))
        for j in range(len(title2)):
            f.write(tplt.format(str(title2[j])))
        for k in range(len(title3)):
            f.write(tplt.format(str(title3[k])))
        f.write('\n\n')
        # One line per player: the x-th entry of each of the three groups.
        for x in range(len(list1)):
            for one in range(len(list1[0])):
                f.write(tplt.format(str(list1[x][one])))
            for two in range(len(list2[0])):
                f.write(tplt.format(str(list2[x][two])))
            for three in range(len(list3[0])):
                f.write(tplt.format(str(list3[x][three])))
            f.write('\n\n')
def main():
    r_url = 'https://nba.hupu.com/teams'
    list_name = []
    # Collect every team link from the teams index page; the last path segment
    # (the team's short name) is used for both the roster URL and the file name.
    r_html = getHtml(r_url)
    r_soup = BeautifulSoup(r_html, 'html.parser')
    team = r_soup.find_all('a', attrs={'target': '_blank', 'class': 'a_teamlink'})
    team_name = re.findall(r'https://nba\.hupu\.com/teams/[a-z]+', str(team))
    for j in range(len(team_name)):
        name = team_name[j].split('/')[-1]
        list_name.append(name)
    # Scrape each team's roster page and write it to its own txt file.
    for i in range(len(list_name)):
        title1 = []
        title2 = []
        title3 = []
        list1 = []
        list2 = []
        list3 = []
        url = 'https://nba.hupu.com/teams/' + list_name[i]
        fpath = 'E:/python/NBA球员数据/' + list_name[i] + '.txt'
        get_NBAplayer(title1, title2, title3, list1, list2, list3, url)
        printf(fpath, title1, title2, title3, list1, list2, list3)


main()
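As for the "\t" alignment problem mentioned at the top, one possible direction is to stop relying on tab stops and instead compute each column's width from the data, padding with spaces. The format_rows helper below is only a rough sketch of that idea, not part of the script above: it assumes rows is a list of equal-length rows of strings (for example list1[x] + list2[x] + list3[x] for each player), and note that len() does not account for full-width Chinese characters, which render two columns wide in most terminals.

def format_rows(rows):
    # Width of each column = length of its longest cell.
    widths = [max(len(str(cell)) for cell in col) for col in zip(*rows)]
    lines = []
    for row in rows:
        # Left-justify every cell to its column width, joined by two spaces.
        lines.append('  '.join(str(cell).ljust(w) for cell, w in zip(row, widths)))
    return '\n'.join(lines)

# Example:
# print(format_rows([['Player', 'Pos'], ['Giannis Antetokounmpo', 'PF'], ['Kobe', 'SG']]))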