I recently started learning web scraping and decided to try scraping NBA player data myself. This post is a record of my first scraper.
There is one problem I couldn't solve: when columns are aligned with the tab character "\t", very long names throw the alignment off. A tab's width depends on how many characters come before it; from what I found online, it expands to 8 - n % 8 spaces, where n is the current column position.
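To see that rule in action, here is a tiny illustration using Python's built-in str.expandtabs; it is not part of the scraper below:

# Illustration: expandtabs(8) shows how many spaces "\t" actually becomes.
for name in ['Kobe', 'Giannis Antetokounmpo']:
    print(repr((name + '\t|').expandtabs(8)))
# 'Kobe' (4 characters) is followed by 4 spaces (8 - 4 % 8),
# while a 21-character name only gets 3 (24 is the next multiple of 8),
# so columns drift apart once a name approaches a tab stop.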
The data is saved as .txt files.
Since this is my first scraper, the code feels rough and long-winded to me. I'm posting it here as-is and may try to optimize it later.
import requests
from bs4 import BeautifulSoup
import traceback
import re


def getHtml(url):
    # Fetch a page and return its text; return '' on any request error.
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''
def get_NBAplayer(title1, title2, title3, list1, list2, list3, url):
    html = getHtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    try:
        x_list_all = soup.find_all('div', attrs={'class': 'x_list'})
        x_title_all = soup.find_all('div', attrs={'class': 'x_title'})
        # Column headers: the c1/c2 spans go into title1, c3/c4 into title2, c5 into title3.
        for x in x_title_all:
            c1 = x.find_all('span', attrs={'class': 'c1'})
            c2 = x.find_all('span', attrs={'class': 'c2'})
            c3 = x.find_all('span', attrs={'class': 'c3'})
            c4 = x.find_all('span', attrs={'class': 'c4'})
            c5 = x.find_all('span', attrs={'class': 'c5'})
            for one in c1:
                if len(one.text) != 0:
                    title1.append(one.text)
            for two in c2:
                if len(two.text) != 0:
                    title1.append(two.text)
            for thr in c3:
                if len(thr.text) != 0:
                    title2.append(thr.text)
            for fou in c4:
                if len(fou.text) != 0:
                    title2.append(fou.text)
            for fiv in c5:
                if len(fiv.text) != 0:
                    title3.append(fiv.text)
        # Row data, first group: the c1 spans plus the text of each <a target="_blank"> link.
        for i in x_list_all:
            c2 = i.find_all('a', attrs={'target': '_blank'})
            c1 = i.find_all('span', attrs={'class': 'c1'})
            list_1 = []
            for one in c1:
                if len(one.text) != 0:
                    list_1.append(one.text)
            for two in c2:
                if len(two.text) != 0:
                    list_1.append(two.text.split('\n')[1])
            if len(list_1) != 0:
                list1.append(list_1)
        # Row data, second group: the c3 and c4 spans.
        for j in x_list_all:
            c3 = j.find_all('span', attrs={'class': 'c3'})
            c4 = j.find_all('span', attrs={'class': 'c4'})
            list_2 = []
            for three in c3:
                if len(three.text) != 0:
                    list_2.append(three.text)
            for four in c4:
                if len(four.text) != 0:
                    list_2.append(four.text)
            if len(list_2) != 0:
                list2.append(list_2)
        # Row data, third group: the c5 spans.
        for k in x_list_all:
            c5 = k.find_all('span', attrs={'class': 'c5'})
            list_3 = []
            for five in c5:
                if len(five.text) != 0:
                    list_3.append(five.text)
            if len(list_3) != 0:
                list3.append(list_3)
    except Exception:
        traceback.print_exc()
def printf(fpath, title1, title2, title3, list1, list2, list3):
    # Each field is left-justified to 24 characters and followed by a tab;
    # names longer than the field width push the later columns out of alignment.
    tplt = '{:<24}\t'
    with open(fpath, 'a', encoding='utf-8') as f:
        # Header row.
        for i in range(len(title1)):
            f.write(tplt.format(str(title1[i])))
        for j in range(len(title2)):
            f.write(tplt.format(str(title2[j])))
        for k in range(len(title3)):
            f.write(tplt.format(str(title3[k])))
        f.write('\n\n')
        # One line per player: the x-th entry of each of the three groups.
        for x in range(len(list1)):
            for one in range(len(list1[0])):
                f.write(tplt.format(str(list1[x][one])))
            for two in range(len(list2[0])):
                f.write(tplt.format(str(list2[x][two])))
            for three in range(len(list3[0])):
                f.write(tplt.format(str(list3[x][three])))
            f.write('\n\n')
def main():
    r_url = 'https://nba.hupu.com/teams'
    list_name = []
    # Collect every team link from the teams index page; the last path segment
    # (the team's short name) is used for both the roster URL and the file name.
    r_html = getHtml(r_url)
    r_soup = BeautifulSoup(r_html, 'html.parser')
    team = r_soup.find_all('a', attrs={'target': '_blank', 'class': 'a_teamlink'})
    team_name = re.findall(r'https://nba\.hupu\.com/teams/[a-z]+', str(team))
    for j in range(len(team_name)):
        name = team_name[j].split('/')[-1]
        list_name.append(name)
    # Scrape each team's roster page and write it to its own txt file.
    for i in range(len(list_name)):
        title1 = []
        title2 = []
        title3 = []
        list1 = []
        list2 = []
        list3 = []
        url = 'https://nba.hupu.com/teams/' + list_name[i]
        fpath = 'E:/python/NBA球员数据/' + list_name[i] + '.txt'
        get_NBAplayer(title1, title2, title3, list1, list2, list3, url)
        printf(fpath, title1, title2, title3, list1, list2, list3)


main()
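As for the "\t" alignment problem mentioned at the top, one possible direction is to stop relying on tab stops and instead compute each column's width from the data, padding with spaces. The format_rows helper below is only a rough sketch of that idea, not part of the script above: it assumes rows is a list of equal-length rows of strings (for example list1[x] + list2[x] + list3[x] for each player), and note that len() does not account for full-width Chinese characters, which render two columns wide in most terminals.

def format_rows(rows):
    # Width of each column = length of its longest cell.
    widths = [max(len(str(cell)) for cell in col) for col in zip(*rows)]
    lines = []
    for row in rows:
        # Left-justify every cell to its column width, joined by two spaces.
        lines.append('  '.join(str(cell).ljust(w) for cell, w in zip(row, widths)))
    return '\n'.join(lines)

# Example:
# print(format_rows([['Player', 'Pos'], ['Giannis Antetokounmpo', 'PF'], ['Kobe', 'SG']]))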