Scraping Baidu Music with Python (Part 2): Saving the Data to MySQL

Copyright notice: this is an original article by the blogger, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/smallpizza/article/details/77425205

The previous post only scraped the data from the page and printed it, but in a real application data needs to be persisted. It can be stored in a relational database such as MySQL, or in a NoSQL database such as MongoDB.
This post saves the scraped data into a MySQL database.

The module used is pymysql; a usage guide: http://www.runoob.com/python3/python3-mysql.html
The data-persistence code of this program:

# save one music record to the database
def savaMusicToDB(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect):
    print 'savaMusicToDB start'
    DBConnection=getDBConnection()
    print 'dbconnection='+str(DBConnection)
    # create a cursor
    cursor=DBConnection.cursor()
    # parameterized INSERT; pymysql escapes the values for us
    sql='insert into orgmusic(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect) values(%s,%s,%s,%s,%s,%s,%s,%s)'
    cursor.execute(sql,(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect))
    DBConnection.commit()
    closeDBConnection(DBConnection,cursor)

# open a database connection
def getDBConnection():
    print 'getDBConnection start'
    host = '182.254.220.188'
    port = 3306
    user = 'root'
    password = 'ldy123456'
    db = 'music'
    charset = 'utf8'
    # connect via pymysql
    DBConnection=pymysql.connect(host=host,port=port,user=user,passwd=password,db=db,charset=charset)
    return DBConnection

# close the cursor (if any) and then the database connection
def closeDBConnection(DBConnection,cursor=None):
    if cursor:
        cursor.close()
    DBConnection.close()
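
The INSERT above assumes that an orgmusic table already exists in the music database. The post does not show its schema, so below is a minimal sketch of one that matches the INSERT statement; the column types and lengths are assumptions inferred from the code, not the author's actual definition:

# a sketch: create the orgmusic table that savaMusicToDB expects
# (column types/lengths are assumptions inferred from the INSERT)
def createMusicTable():
    DBConnection=getDBConnection()
    cursor=DBConnection.cursor()
    sql='''create table if not exists orgmusic(
        m_id varchar(32) primary key,
        m_name varchar(255),
        m_link varchar(512),
        m_type varchar(64),
        m_singer varchar(255),
        m_album varchar(255),
        m_click int,
        m_collect int
    ) default charset=utf8'''
    cursor.execute(sql)
    DBConnection.commit()
    closeDBConnection(DBConnection,cursor)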

The complete crawler code:

#coding=utf-8  # set the source-file encoding
# scrape Baidu Music

import urllib2
from bs4 import BeautifulSoup
import pymysql
import datetime
import random

# root URL of Baidu Music
baiduMusicRootURL='http://music.baidu.com'
# base URL of the Baidu Music category tags
baiduMusicTagURL='http://music.baidu.com/tag'

# fetch the music category tags
def getMusicTags(musicTagURL):
    print 'getMusicTags='+musicTagURL
    musicTags={}
    htmlContent=getHTML(musicTagURL)
    print 'getMusicTags='+htmlContent
    # parse the page and extract the category tags
    soup=BeautifulSoup(htmlContent,'lxml')
    Tags=soup.find_all('span','tag-list clearfix')
    #print Tags
    for tag in Tags:
        # the link text is the tag name
        tagName=tag.get_text()
        # get the link itself (tag is already a bs4 Tag, no re-parse needed)
        a=tag.select_one('a[href]')
        tagLink=a.get('href')
        # store tagName as the key and tagLink as the value
        musicTags[tagName]=tagLink
    return musicTags

# download a web page and return its HTML
def getHTML(musicTagURL):
    print 'getHTML= '+musicTagURL
    # no headers are set here; see the note after this function
    headers={}
    request=urllib2.Request(musicTagURL,headers=headers)
    response=urllib2.urlopen(request)
    htmlContent=response.read()
    return htmlContent
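
# note (a sketch, not part of the original code): getHTML sends no headers,
# and urllib2's default Python User-Agent is sometimes rejected; if Baidu
# blocks the requests, a browser-like header can be supplied instead, e.g.:
#   headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
#   request=urllib2.Request(musicTagURL,headers=headers)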

# fetch every song in one category, page by page
def getAllMusic(sourceURL):
    print 'getAllMusic start sourceURL='+sourceURL
    noData='#'  # placeholder for fields that come back empty
    try:
        # fetch the first page just to read the total page count
        sURL=sourceURL+'?start=0&size=20&third_type=0'
        htmlContent=getHTML(sURL)
        soup=BeautifulSoup(htmlContent,'lxml')
        aLists=soup.find_all('div','page-inner')
        if aLists:
            aSoup = BeautifulSoup(str(aLists), 'lxml')
            pageNumberLists = aSoup.find_all('a')
            #print pageNumberLists
            if pageNumberLists:
                # the second-to-last <a> in the pager holds the last page number
                aStr=pageNumberLists[-2]
                #print 'aStr='+str(aStr)
                pageNumber=int(aStr.get_text())
            else:
                pageNumber=0
        else:
            pageNumber=0
        print 'pageNumber='+str(pageNumber)
        # walk through every page of this category, 20 songs per page
        for i in range(0,pageNumber+1):
            print 'i='+str(i)
            try:
                # the start offset is the page index times the page size
                sURL=sourceURL+'?start='+str(i*20)+'&size=20&third_type=0'
                print 'sURL='+sURL
                # fetch and parse this page
                htmlContent = getHTML(sURL)
                soup = BeautifulSoup(htmlContent, 'lxml')
                # song category (type)
                m_type= soup.find('span', 'title').get_text()
                print 'm_type=' + m_type
                # the song list lives in the main body container
                musicList=soup.find('div','main-body-cont')
                #print 'musicListSoup='+str(musicList)
                musicListSoup=BeautifulSoup(str(musicList),'lxml')
                musicsLists=musicListSoup.find_all('div','song-item')
                print 'musicsLists='+ str(musicsLists)
                print 'musicsLists len='+str(len(musicsLists))
                for music in musicsLists:
                    #print 'music='+str(music)
                    # generate an id for this record
                    m_id = setMusicID()
                    print 'm_id=' + m_id
                    musicSoup=BeautifulSoup(str(music),'lxml')
                    # the song title and link live in the 'song-title' span
                    spanStr = musicSoup.find('span', 'song-title')
                    spanSoup = BeautifulSoup(str(spanStr), 'lxml')
                    # song title
                    m_name = spanSoup.find('a').get_text()
                    if not m_name:
                        m_name=noData
                    print 'm_name=' + m_name
                    # song link; the page stores a relative path, so prefix the site root
                    m_link = baiduMusicRootURL + spanSoup.select_one('a[href]').get('href')
                    if not m_link:
                        m_link=noData
                    print 'm_link=' + m_link
                    # singer name; the scraped text begins with a '\n'
                    m_singer = musicSoup.find('span', 'author_list').get_text()
                    if m_singer and len(m_singer)>1:
                        if m_singer[0]=='\n':
                            m_singer = m_singer[1:]
                    else:
                        m_singer=noData
                    print 'm_singer len=' + str(len(m_singer))
                    print 'm_singer=' + m_singer
                    # album title
                    m_album = musicSoup.find('span', 'album-title').get_text()
                    if not m_album:
                        m_album=noData
                    print 'm_album=' + m_album
                    # set the play count and collect count to 0 (not scraped here)
                    m_click = 0
                    m_collect = 0
                    # persist the record to the database
                    savaMusicToDB(m_id, m_name, m_link, m_type, m_singer, m_album, m_click, m_collect)

            except Exception as e:
                # report and skip this page instead of failing silently
                print 'page error: '+str(e)
    except Exception as e:
        print 'getAllMusic error: '+str(e)

# generate a music id: the current timestamp plus a random number
def setMusicID():
    print 'setMusicID start'
    nowTime=datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    randomNum=random.randint(0,1000)
    m_id=str(nowTime)+str(randomNum)
    return m_id
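
# note (a sketch, not part of the original code): a timestamp plus
# randint(0,1000) can collide when many songs are saved within the same
# second; the standard-library uuid module gives a collision-free id:
#   import uuid
#   m_id = uuid.uuid4().hex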

# save one music record to the database
def savaMusicToDB(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect):
    print 'savaMusicToDB start'
    DBConnection=getDBConnection()
    print 'dbconnection='+str(DBConnection)
    # create a cursor
    cursor=DBConnection.cursor()
    # parameterized INSERT; pymysql escapes the values for us
    sql='insert into orgmusic(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect) values(%s,%s,%s,%s,%s,%s,%s,%s)'
    cursor.execute(sql,(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect))
    DBConnection.commit()
    closeDBConnection(DBConnection,cursor)

# open a database connection
def getDBConnection():
    print 'getDBConnection start'
    host = '182.254.220.188'
    port = 3306
    user = 'root'
    password = 'ldy123456'
    db = 'music'
    charset = 'utf8'
    # connect via pymysql
    DBConnection=pymysql.connect(host=host,port=port,user=user,passwd=password,db=db,charset=charset)
    return DBConnection

# close the cursor (if any) and then the database connection
def closeDBConnection(DBConnection,cursor=None):
    if cursor:
        cursor.close()
    DBConnection.close()

# main entry point
if __name__ == '__main__':
    print 'Music Spider start'

    # fetch the Baidu Music category tags
    musicTags=getMusicTags(baiduMusicTagURL)
    print musicTags
    # crawl the music category by category
    for k,v in musicTags.items():
        print 'k='+k
        print 'v='+str(v)
        # tag links may be absolute or relative to the site root
        if str(v).startswith('http://'):
            sourceURL=str(v)
        else:
            sourceURL=baiduMusicRootURL+str(v)
        print 'sourceURL='+sourceURL
        # fetch all songs of this category
        getAllMusic(sourceURL)
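
One design note: savaMusicToDB opens and closes a fresh connection for every single row, which is slow when a category has thousands of songs. Below is a minimal sketch of a batch variant that reuses one connection and inserts a whole page of songs at once; saveMusicListToDB and its musicRows argument are hypothetical names, not part of the original code:

# a sketch: reuse one connection and insert many rows in one call
# (saveMusicListToDB is a hypothetical helper, not in the original code)
def saveMusicListToDB(musicRows):
    # musicRows: a list of 8-tuples in the same column order as the INSERT
    DBConnection=getDBConnection()
    cursor=DBConnection.cursor()
    sql='insert into orgmusic(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect) values(%s,%s,%s,%s,%s,%s,%s,%s)'
    cursor.executemany(sql,musicRows)
    DBConnection.commit()
    closeDBConnection(DBConnection,cursor)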

Don't like it? Please don't flame!!!
