Crawling NBA player data with Python and writing it to MySQL

After studying Python for a while, here is a simple crawler example: it scrapes NBA player data from Baidu Baike, writes it to MySQL, and then moves it into HDFS with Sqoop for some simple analysis in Hive.

Process:
- Open the Baidu Baike NBA page and inspect the HTML source
- Write a Python program to fetch the HTML content of that URL
- Parse the HTML to collect the URL of each team page and loop over them
- Walk each team page's HTML structure to extract the player table
- Store the player fields in a list of dicts
- Write a persistence method that inserts the crawled player data into the MySQL table nbaplayers (the assumed schema is sketched below)
- Use Sqoop to import the nbaplayers table into the Hive warehouse on the Hadoop cluster
- Run some simple analysis on the player data with Hive
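The original post does not show how the nbaplayers table is created. A minimal sketch of what it might look like, inferred from the insert statement further down; the column names come from the code, but the VARCHAR sizes and the idea of creating the table from Python are assumptions:

import pymysql

# hypothetical DDL; every column is kept as a string because the crawler
# scrapes all fields as text
DDL = """
create table if not exists nbaplayers (
    team     varchar(64),
    name     varchar(64),
    area     varchar(64),
    age      varchar(16),
    high     varchar(16),
    weight   varchar(16),
    birthday varchar(32)
) default charset=utf8
"""

conn = pymysql.connect(user='root', password='hadoop',
                       host='192.168.100.20', db='hadoop', charset='utf8')
with conn.cursor() as cur:
    cur.execute(DDL)
conn.commit()
conn.close()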


Key code

  • Fetching the page content

import random
import time

import requests

def getHtmlContent(url):
    # fetch the raw HTML of a page, retrying on timeouts and connection errors
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json; charset=utf-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
    }
    timeout = random.choice(range(80, 180))
    while True:
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
            rep.encoding = 'utf-8'
            break
        except requests.exceptions.Timeout as e:
            # requests raises its own Timeout rather than socket.timeout
            print('3:', e)
            time.sleep(random.choice(range(8, 15)))
        except requests.exceptions.RequestException as e:
            print('4:', e)
            time.sleep(random.choice(range(20, 60)))
    return rep.text

# entry point; in the full script this sits below all the function definitions
if __name__ == '__main__':
    html = getHtmlContent('http://baike.baidu.com/item/NBA')
    teams = crawl(html, type='team')  # crawl() is sketched below
    save_data2mysql(teams)
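The crawl() dispatcher called above is not shown in the original. A minimal sketch of what it could look like, reusing getHtmlContent, getTeamData, and getPlayersData from the sections below; the use of urljoin assumes the team links scraped from Baidu Baike are relative paths:

from urllib.parse import urljoin

def crawl(html, type='team'):
    # hypothetical dispatcher: resolve every team link on the overview page,
    # fetch the team page, and collect one player list per team
    data = []
    if type == 'team':
        teams = getTeamData(html)        # {team name: [href]}
        for name, (href,) in teams.items():
            team_url = urljoin('http://baike.baidu.com/', href)
            team_html = getHtmlContent(team_url)
            data.append(getPlayersData(team_html, name))
    return data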
  • Saving data to MySQL

import pymysql

def save_data2mysql(data):
    # persist the crawled players to the database
    conn = pymysql.connect(user='root', password='hadoop',
                           host='192.168.100.20', db='hadoop',
                           charset='utf8')  # the charset must match the MySQL table
    print('connection opened')
    cur = conn.cursor()
    # parameterized insert; pymysql quotes and escapes the values itself
    sql = ("insert into nbaplayers (team, name, area, age, high, weight, birthday) "
           "values (%s, %s, %s, %s, %s, %s, %s)")
    for team_players in data:
        for player in team_players:
            print(player)
            cur.execute(sql, (player['team'], player['name'], player['area'],
                              player['age'], player['high'], player['weight'],
                              player['birthday']))
    # commit once at the end, then close the connection
    conn.commit()
    print('connection closed')
    cur.close()
    conn.close()
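Binding the values through %s placeholders rather than concatenating them into the SQL string lets pymysql handle the quoting and escaping, so a name containing an apostrophe, such as O'Neal, cannot break the statement.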
  • Fetching team information

from bs4 import BeautifulSoup

def getTeamData(html):
    # parse the team table on the NBA overview page into {team name: [link]}
    print('fetching team information')
    bs = BeautifulSoup(html, 'html.parser')
    teams = {}
    body = bs.body
    table = body.find('table', class_='table-view log-set-param')
    aTags = table.find_all('a')
    for a in aTags:
        teams[a.string] = [a['href']]
    return teams
  • Fetching player data

def getPlayersData(html, team):
    # parse the roster table on a team page into a list of player dicts
    print('fetching player information')
    bs = BeautifulSoup(html, 'html.parser')
    body = bs.body
    divs = body.find('div', class_='main-content')
    # the roster is the first table on the page with at least 12 rows
    tableSize = 0
    i = 0
    while tableSize < 12:
        table = divs.find_all(attrs={"log-set-param": "table_view"})[i]
        trs = table.find_all('tr')
        tableSize = len(trs)
        i += 1
    players = []
    for idx, tr in enumerate(trs):
        # skip the header row
        if idx == 0:
            continue
        player = {}
        tds = tr.find_all('td')
        try:
            name = parserDes(tds[1])
            area = parserDes(tds[2])
            high = parserDes(tds[3])
            weight = parserDes(tds[4])
            age = parserDes(tds[5])
            birthday = parserDes(tds[6])
        except Exception:
            # malformed row (e.g. too few cells); log it and move on
            print(tds)
            continue
        if name != '球员':  # '球员' ("player") marks a repeated header cell
            player['team'] = team
            player['name'] = name
            player['area'] = area
            player['high'] = high.replace('米', '')        # strip the unit (meters)
            player['weight'] = weight.replace('公斤', '')  # strip the unit (kg)
            player['age'] = age.replace('岁', '')          # strip the unit (years)
            player['birthday'] = birthday
            players.append(player)  # only append real player rows
    return players
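parserDes() is used above but never shown in the original. A minimal sketch, assuming it does nothing more than pull the visible text out of a table cell (cells on Baidu Baike may wrap their value in nested tags such as <a> or <div>):

def parserDes(td):
    # hypothetical helper: flatten a table cell to its visible text,
    # stripping surrounding whitespace; nested tags are collapsed by get_text()
    return td.get_text(strip=True)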
  • Sqoop connection test

# use sqoop to import the MySQL data into HDFS
# check the connection first
./sqoop list-databases --connect jdbc:mysql://192.168.100.20:3306/ --username root --password hadoop
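If the connection works, sqoop prints every database visible to that user; an authentication or communications-link error here means the JDBC URL or credentials need fixing before any import can succeed.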
  • Importing data into Hive
# create a Hive table with the same structure as the MySQL table
sqoop create-hive-table --connect jdbc:mysql://192.168.100.20:3306/hadoop --table nbaplayers --username root --password hadoop --hive-table nbaplayers
# create the table and import the data
sqoop import --connect jdbc:mysql://192.168.100.20:3306/hadoop --username root --password hadoop --table nbaplayers --hive-import -m 1
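The simple Hive analysis promised at the start is not shown in the original. One possible example, assuming the columns arrived in Hive as strings (Sqoop's default mapping for VARCHAR), hence the casts; the query itself is only illustrative:

# example analysis: average height and weight per team
hive -e "select team, avg(cast(high as double)) as avg_high, avg(cast(weight as double)) as avg_weight from nbaplayers group by team order by avg_high desc;"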

