多线程爬取某网站歌手、MV及歌曲信息,解析网页元素;实现异步下载图片及记录入库操作
爬虫脚本
# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import time
import _thread
import requests
import pymysql
import threading
import difflib
import asyncio
import aiomysql
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# 禁用安全请求警告
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Worker thread: crawls one (tag, type, first-letter) slice of the singer list.
class myThread(threading.Thread):
    def __init__(self, singerTag, singerType, singerLetter):
        threading.Thread.__init__(self)
        self.singerTag = singerTag
        self.singerType = singerType
        self.singerLetter = singerLetter

    def run(self):
        print("开始线程:{}_{}_{}".format(self.singerTag, self.singerType, self.singerLetter))
        # BUG FIX: the original paired acquire()/release() manually, so an
        # exception inside getSingerList() left the global lock held forever
        # and deadlocked every other thread.  The with-statement guarantees
        # release even on error.
        # NOTE(review): holding one global lock for the whole crawl means the
        # threads effectively run one at a time.
        with threadLock:
            getSingerList(self.singerTag, self.singerType, self.singerLetter, 1)
        print("退出线程:{}_{}_{}".format(self.singerTag, self.singerType, self.singerLetter))
# Event loop shared by the async DB-insert and image-download helpers.
# new_event_loop() avoids the deprecated implicit-creation behaviour of
# asyncio.get_event_loop() on recent Python versions; behaviour is the same
# because the loop is only ever used via this module-level variable.
loop = asyncio.new_event_loop()
# Disable certificate verification (target site has certificate problems).
context = ssl._create_unverified_context()
# Common request headers.
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# Proxy endpoints (update host/port before running).
# BUG FIX: the original dict used the key 'http' twice, so the first proxy
# was silently discarded and no HTTPS proxy was ever registered; the second
# entry is now keyed 'https'.
proxies = {
    'http': 'http://localhost:12345/',
    'https': 'http://localhost:54321/',
}
# Local storage paths for downloaded images.
filePath = 'E:/Reptilian/music/xxx/'
singerImagePath = filePath + 'singer/img/'
songImagePath = filePath + 'song/img/'
mvImagePath = filePath + 'mv/img/'
# Base URL template (relative paths from the site are substituted in).
baseUrl = 'https://xxx.xxx.xxx{}'
# Singer list page URL template.
singerListUrl = 'https://xxx.xxx.xxx/xx/music/artist?tagId={}&type={}&firstLetter={}&page={}'
# Singers per list page (a full page signals more pages may follow).
singerListPageSize = 30
# MVs per MV-list page.
mvListSize = 30
# Singer attributes.
# Tags (1: Chinese, 2: Western, 3: Japanese/Korean).
singerTags = [1, 2, 3]
# Types (A: male, B: female, C: group).
singerTypes = ['A', 'B', 'C']
# First letters (the 26 letters of the alphabet).
letters = [
    'A', 'B', 'C',
    'D', 'E', 'F',
    'G', 'H', 'I',
    'J', 'K', 'L',
    'M', 'N', 'O',
    'P', 'Q', 'R',
    'S', 'T', 'U',
    'V', 'W', 'X',
    'Y', 'Z'
]
def getSingerList(singerTag, singerType, singerLetter, page):
    """Crawl the singer-list pages for one (tag, type, letter) combination.

    Starts at `page` and keeps fetching successive pages as long as a full
    page (singerListPageSize entries) comes back, which signals that more
    pages may exist.  Rewritten from per-page recursion to a loop so a long
    category cannot exhaust the recursion limit.
    """
    while True:
        singerSubListUrl = singerListUrl.format(singerTag, singerType, singerLetter, page)
        rsp = requests.get(url=singerSubListUrl, headers=headers, proxies=proxies, timeout=15)
        singerNum = bs4ParaserSingerList(rsp.text, singerTag, singerType)
        # A short page means this was the last one.
        if singerNum < singerListPageSize:
            break
        page += 1
def bs4ParaserSingerList(html, singerTag, singerType):
    """Parse one singer-list page and crawl every singer entry on it.

    Returns the number of entries found so the caller can decide whether
    another page should be requested.
    """
    soup = BeautifulSoup(html, 'html.parser')
    entries = soup.find_all('div', attrs={'class': 'thumbnail'}, limit=30)
    if not entries:
        # Nothing parsed (blocked request / empty page): skip per-singer work.
        return 0
    for entry in entries:
        getSinger(entry.a.get('href'), singerTag, singerType)
    # The caller compares this count against the page size to decide whether
    # to fetch the next page.
    return len(entries)
def getSinger(url, tag, type):
    """Fetch a singer's detail page and hand the HTML to the parser."""
    response = requests.get(url=baseUrl.format(url), headers=headers, proxies=proxies, timeout=15)
    bs4ParaserSinger(response.text, tag, type)
# Parse a singer detail page: download the avatar, insert the singer row,
# then walk the page sections looking for the hot-MV block and crawl it.
def bs4ParaserSinger(html, tag, type):
    soup = BeautifulSoup(html, 'html.parser')
    singer = soup.find_all('div', attrs = {'class': 'artist-info'}, limit = 1)
    if (len(singer) < 1):
        return
    # Avatar URL from the avatar div (the site serves protocol-relative URLs).
    singerImageUrl = 'https:' + singer[0].find_all('div', attrs = {'class': 'artist-avatar'}, limit = 1)[0].img.get('src')
    # The singer code is the last path segment of the profile link.
    singerCode = singer[0].find_all('div', attrs = {'class': 'artist-name'}, limit = 1)[0].a.get('href').split('/')[-1]
    singerName = singer[0].find_all('div', attrs = {'class': 'artist-name'}, limit = 1)[0].a.string.strip()
    # Singer biography (long) — deliberately not collected.
    # singerDesc = singer[0].find_all('div', attrs = {'class': 'content'}, limit = 1)[0].string.strip()
    # 1. Download the singer's avatar image.
    downloadPic(singerImageUrl, singerCode + '.' + singerImageUrl.split('.')[-1], singerImagePath);
    # 2. Insert the singer row.
    # NOTE(review): singerName is spliced directly into the SQL text — a name
    # containing a double quote breaks the statement (injection risk).
    singerSql = ("insert into t_singer values(null, '{}', \"" + singerName + "\", '{}', '{}', '')").format(singerCode, tag, type)
    insertToDB(singerSql)
    # 3. Walk the page sections; only the one whose title mentions "MV" is used.
    singerSections = soup.find_all('div', attrs = {'class': 'artist-section container'})
    for singerSection in singerSections:
        singerSectionTitle = singerSection.find_all('div', attrs = {'class': 'artist-section-title'}, limit = 1)
        # if (singerSection.span.string):
        isExistMV = 'MV' in singerSectionTitle[0].span.string
        if (isExistMV == False):
            continue
        # The "view all" link text looks like "全部N支"; strip the surrounding
        # words to isolate the MV count N.
        mvListSizeStr = str(singerSectionTitle[0].a.next_element).strip()
        mvListSizeStr = mvListSizeStr.strip('全部')
        mvListSizeStr = mvListSizeStr.strip('支')
        # NOTE(review): this local shadows the module-level mvListSize (page
        # size constant) — looks accidental but is harmless inside this scope.
        mvListSize = 0
        try:
            mvListSize = int(mvListSizeStr)
        except:
            # Unparseable count: give up on this singer's MV section.
            return
    # More than 5 MVs are only shown on a separate list page; 5 or fewer can
    # be read directly from this section.
        if (mvListSize > 5):
            mvListUrl = singerSectionTitle[0].a.get('href')
            getMVList(mvListUrl, 1)
        else:
            # MV link list (NOTE(review): mvUrlList is never used).
            mvUrlList = []
            mvList = singerSection.find_all('a', attrs = {'class': 'thumb-link'}, limit = 5)
            for mv in mvList:
                mvUrl = mv.get('href')
                mvCode = mvUrl.split('/')[-1]
                mvImageUrl = 'https:' + mv.img.get('data-original')
                # 1. Download the MV cover image.
                downloadPic(mvImageUrl, mvCode + '.' + mvImageUrl.split('.')[-1], mvImagePath)
                # print('歌手{}的MV《{}》的地址:{}'.format(singerName, mvCode, mvImageUrl))
                # 2. Crawl the MV detail page.
                getMV(mvUrl)
def getMVList(url, page):
    """Crawl a singer's paginated MV list, starting at `page`.

    Keeps requesting pages while a full page (mvListSize entries) comes
    back.  Rewritten from per-page recursion to a loop, consistent with
    the singer-list pagination, so deep lists cannot exhaust the recursion
    limit.
    """
    while True:
        pageUrl = baseUrl.format(url) + '?page={}'.format(page)
        rsp = requests.get(url=pageUrl, headers=headers, verify=False, proxies=proxies, timeout=15)
        mvNum = bs4ParaserMVList(rsp.text)
        # A short page means this was the last one.
        if mvNum < mvListSize:
            break
        page += 1
def bs4ParaserMVList(html):
    """Parse one MV-list page, crawl each MV, and return the entry count.

    BUG FIX: the original tested `mvList` in the guard *before* assigning
    it (the check and the assignment were swapped), which raised
    UnboundLocalError on every call.  The list container is now located
    first, with an explicit guard so pages without the container return 0
    instead of raising IndexError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    containers = soup.find_all('div', attrs={'class': 'artist-mv-list'}, limit=1)
    if not containers:
        return 0
    mvList = containers[0].find_all('li', limit=mvListSize)
    if not mvList:
        return 0
    for mv in mvList:
        mvUrl = mv.a.get('href')
        mvCode = mvUrl.split('/')[-1]
        mvImageUrl = 'https:' + mv.find_all('a', attrs={'class': 'thumb-link'}, limit=1)[0].img.get('data-original').strip()
        singerName = mv.find_all('div', attrs={'class': 'mv-singers'}, limit=1)[0].a.string.strip()
        # 1. Download the MV cover image.
        downloadPic(mvImageUrl, mvCode + '.' + mvImageUrl.split('.')[-1], mvImagePath)
        # 2. Crawl the MV detail page.
        getMV(mvUrl)
    # The caller compares this count against mvListSize to decide whether to
    # request the next page.
    return len(mvList)
def getMV(url):
    """Fetch an MV detail page and hand the HTML to the parser."""
    response = requests.get(url=baseUrl.format(url), headers=headers, proxies=proxies, timeout=15)
    bs4ParaserMV(response.text)
def bs4ParaserMV(html):
    """Parse an MV detail page and insert a row into t_mv.

    Extracts the MV name, owning singer, play count, duration and — when a
    download link is present — the associated song, which is crawled too.
    """
    soup = BeautifulSoup(html, 'html.parser')
    mv = soup.find_all('div', attrs={'class': 'mv-info'}, limit=1)
    if len(mv) < 1:
        return
    # Basic MV / singer info.
    mvName = mv[0].find_all('div', attrs={'class': 'title'}, limit=1)[0].string.strip()
    mvDetailInfo = mv[0].find_all('div', attrs={'class': 'mv-detail-info'}, limit=1)[0]
    singerName = mvDetailInfo.a.string
    singerCode = mvDetailInfo.a.get('href').split('/')[-1]
    mvPlayNum = 0
    if mvDetailInfo.span:
        try:
            # BUG FIX: the original stored the raw string even though
            # t_mv.play_num is an int column and the except message speaks of
            # a conversion error; convert explicitly and keep 0 on failure.
            mvPlayNum = int(mvDetailInfo.span.string.strip())
        except:
            print('MV《{}》的播放量转换错误!'.format(mvName))
    # The MV id and the duration live in the watch container
    # (duration may be absent, in which case '00:00' is stored).
    mvWatchInfo = soup.find_all('div', attrs={'id': 'J_WatchContainer'}, limit=1)[0]
    mvCode = mvWatchInfo.find_all('input', attrs={'id': 'J_MvContent'}, limit=1)[0].get('value').strip()
    fpDuration = mvWatchInfo.find_all('div', attrs={'class': 'fp-duration'}, limit=1)
    duration = '00:00'
    if len(fpDuration) > 0:
        duration = fpDuration[0].string.strip()
    # Associated song (the download link), crawled when present.
    song = mv[0].find_all('a', attrs={'class': 'download'}, limit=1)
    songCode = ''
    if len(song) > 0:
        songUrl = song[0].get('href')
        songCode = songUrl.split('/')[-1]
        getSong(songUrl)
    # Insert the MV row.  NOTE(review): mvName is spliced directly into the
    # SQL text — a name containing a double quote breaks the statement.
    mvSql = ("insert into t_mv values(null, '{}', \"" + mvName + "\", '{}', '{}', '{}', '{}')").format(
        mvCode, singerCode, songCode, mvPlayNum, duration)
    insertToDB(mvSql)
def getSong(url):
    """Fetch a song detail page and hand the HTML to the parser."""
    response = requests.get(url=baseUrl.format(url), headers=headers, proxies=proxies, timeout=15)
    bs4ParaserSong(response.text)
def bs4ParaserSong(html):
    """Parse a song detail page: download the cover, insert a row into t_song.

    BUG FIXES vs. the original:
      * the info/image guards tested `find_all(...) is None`, which is never
        true (find_all returns a list), so pages lacking these blocks crashed
        with IndexError — the guards now test emptiness;
      * the lyric loop contained a bare expression where an `if` was intended
        (a SyntaxError); lines whose .string is None are now skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    songInfo = soup.find_all('div', attrs={'class': 'info_contain'}, limit=1)
    songImage = soup.find_all('div', attrs={'class': 'img_contain'}, limit=1)
    # Default cover image, overwritten once the real one is parsed.
    songImageUrl = 'https://xxx.xxx.xxx/picture/2019/1217/2156/AS7be98b7ae42644bbb5e21f8a6b5167b8.jpg'
    if len(songInfo) < 1 or len(songImage) < 1:
        return
    songImageUrl = 'https:' + songImage[0].img.get('data-original').strip()
    songName = songInfo[0].h2.string.strip()
    songCode = soup.find_all('input', attrs={'id': 'songcid'}, limit=1)[0].get('value').strip()
    singer = songInfo[0].find_all('div', attrs={'class': 'info_singer'}, limit=1)[0]
    singerName = singer.a.string.strip()
    singerCode = singer.a.get('href').split('/')[-1]
    # Comma-joined tag list ('' when the song has no tags).
    tags = songInfo[0].find_all('span', attrs={'class': 'songtag'})
    songTag = ','.join(tag.string.strip() for tag in tags)
    # Newline-joined lyric body; paragraphs without text are skipped.
    lyricTexts = soup.find_all('p', attrs={'class': 'lyric-text'})
    lyric = '\n'.join(t.string.strip() for t in lyricTexts if t.string is not None)
    # 1. Download the song cover image.
    downloadPic(songImageUrl, songCode + '.' + songImageUrl.split('.')[-1], songImagePath)
    # 2. Insert the song row.  NOTE(review): songName and lyric are spliced
    # directly into the SQL text — quotes in either break the statement.
    songSql = ("insert into t_song values(null, '{}', \"" + songName + "\", '{}', '{}', \"" + lyric + "\")").format(
        songCode, singerCode, songTag)
    insertToDB(songSql)
# # 同步插入记录到数据库
# def insertToDB(sql):
# try:
# cursor.execute(sql)
# db.commit()
# except:
# db.rollback()
# Synchronous facade: run the async insert to completion on the shared
# module-level event loop.  NOTE(review): callers hold the global thread
# lock while this runs, so DB writes are effectively serialized.
def insertToDB(sql):
    loop.run_until_complete(asyncInsertToDB(sql))
# 异步插入记录到数据库
# async def asyncInsertToDB(sql):
# try:
# conn = await aiomysql.connect(
# host = '127.0.0.1',
# port = 3306,
# user = 'test',
# password = 'xxxxx',
# db = 'test',
# charset = 'utf8',
# loop = loop
# )
# cursor = await conn.cursor()
# await cursor.execute(sql)
# await conn.commit()
# print('sql语句: {} 执行成功!'.format(sql))
# except Exception as e:
# print('sql语句: {} 执行失败!'.format(sql))
# finally:
# await cursor.close()
# conn.close()
# Asynchronously insert one record into the database (connection-pool flavour).
async def asyncInsertToDB(sql):
    """Insert one row via a short-lived aiomysql connection pool.

    BUG FIX: `pool` is pre-initialised to None and guarded in `finally`, so
    a failure inside create_pool() no longer raises NameError while cleaning
    up.  The pointless ensure_future/gather wrapper around a single
    coroutine was replaced with a direct await (identical semantics).
    """
    pool = None
    try:
        # NOTE(review): creating a pool per statement is expensive; a single
        # module-level pool would be preferable.  Kept per-call to preserve
        # the original behaviour.
        pool = await aiomysql.create_pool(
            host='127.0.0.1',
            port=3306,
            user='test',
            password='xxxxx',
            db='test',
            charset='utf8',
            loop=loop,
        )
        await insert(pool, sql)
        print('sql语句: {} 执行成功!'.format(sql))
    except Exception:
        print('sql语句: {} 执行失败!'.format(sql))
    finally:
        if pool is not None:
            pool.close()
            # await pool.wait_closed()
# Execute one INSERT statement on a pooled connection and commit it.
async def insert(pool, sql):
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute(sql)
            await conn.commit()
async def select(pool, sql):
    """Run a query on a pooled connection and return the first row.

    BUG FIX: the original fetched the row into an unused local and
    implicitly returned None, making the helper useless.  The row (or None
    for an empty result set) is now returned; callers that ignored the
    return value are unaffected.
    """
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute(sql)
            return await cursor.fetchone()
# 同步下载单个图片的方法
# def downloadPic(imgUrl, imgName, imagePath):
# try:
# r = requests.get(url = imgUrl, headers = headers, verify = False, proxies = proxies, timeout = 15)
# with open(imagePath + imgName, 'wb') as f:
# f.write(r.content)
# except requests.exceptions.ConnectionError:
# print('图片{}请求错误!'.format(imgUrl))
# return
# f.close()
# Synchronous facade: run the async image download to completion on the
# shared module-level event loop.
def downloadPic(imgUrl, imgName, imagePath):
    loop.run_until_complete(asyncDownloadPic(imgUrl, imgName, imagePath))
# Asynchronously download a single image.
async def asyncDownloadPic(imgUrl, imgName, imagePath):
    """Download one image to imagePath + imgName.

    NOTE(review): requests.get() blocks, so despite the `async def` this
    coroutine does not overlap I/O; a true async client (e.g. aiohttp)
    would be needed for that.

    FIXES: the stray f.close() after the with-block was removed (the
    context manager already closed the file), and the except clause was
    widened from ConnectionError to RequestException so timeouts and other
    request failures are logged instead of crashing the crawl.
    """
    try:
        r = requests.get(url=imgUrl, headers=headers, verify=False, proxies=proxies, timeout=15)
        with open(imagePath + imgName, 'wb') as f:
            f.write(r.content)
    except requests.exceptions.RequestException:
        print('图片{}请求错误!'.format(imgUrl))
# Create the given directory (and any missing parents).
def mkDir(path):
    """Create directory `path` if it does not already exist.

    Uses os.makedirs(..., exist_ok=True) instead of the original
    exists()-then-create pair, which removes the check/create race when
    several threads request the same directory.  Trailing backslashes are
    stripped as before.
    """
    path = path.strip().rstrip('\\')
    os.makedirs(path, exist_ok=True)
# Entry point.
if __name__ == '__main__':
    # Create the image directories up front.
    mkDir(singerImagePath)
    mkDir(songImagePath)
    mkDir(mvImagePath)
    # Multi-threaded crawl: one thread per (tag, type, first-letter) combo
    # (3 * 3 * 26 = 234 threads, all started at once).
    # NOTE(review): each thread holds the single global lock for its entire
    # run, so the crawl is effectively serialized despite the threads.
    threadLock = threading.Lock()
    threads = []
    for singerTag in singerTags:
        for singerType in singerTypes:
            for letter in letters:
                thread = myThread(singerTag, singerType, letter)
                threads.append(thread)
    # # Single-thread test
    # thread = myThread(1, 'A', 'A')
    # threads.append(thread)
    # # Song test
    # getSong('/xx/music/song/xxxxxxxxxx')
    # Start everything, then wait for completion.
    for th in threads:
        th.start()
    for th in threads:
        th.join()
数据库脚本
-- ----------------------------
-- Table structure for t_mv
-- ----------------------------
-- MV table: one row per crawled MV, linked to its singer and (optionally)
-- its song by the site's string codes rather than by foreign keys.
DROP TABLE IF EXISTS `t_mv`;
CREATE TABLE `t_mv` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
  `mv_code` varchar(32) NOT NULL COMMENT 'MV编码',
  `mv_name` varchar(64) NOT NULL COMMENT 'MV名称',
  `singer_code` varchar(32) NOT NULL COMMENT '歌手编码',
  `song_code` varchar(32) DEFAULT NULL COMMENT '歌曲编码',
  `play_num` int(11) DEFAULT NULL COMMENT '播放量',
  `duration` varchar(16) DEFAULT NULL COMMENT '播放时长',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 0 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for t_singer
-- ----------------------------
-- Singer table: one row per crawled singer, keyed by the site's singer code.
-- `desc` is a reserved word in MySQL, hence the mandatory backquotes.
DROP TABLE IF EXISTS `t_singer`;
CREATE TABLE `t_singer` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
  `singer_code` varchar(32) NOT NULL COMMENT '歌手编码',
  `singer_name` varchar(64) NOT NULL COMMENT '歌手名称',
  `singer_tag` int(2) NOT NULL COMMENT '歌手标签(1:xxx、2:yyy、3:zzz韩)',
  `singer_type` varchar(8) NOT NULL COMMENT '歌手类型(A:xx、B:yy、C:zz)',
  `desc` text DEFAULT NULL COMMENT '歌手介绍',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 0 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for t_song
-- ----------------------------
-- Song table: one row per crawled song, keyed by the site's song code.
DROP TABLE IF EXISTS `t_song`;
CREATE TABLE `t_song` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
  `song_code` varchar(32) NOT NULL COMMENT '歌曲编码',
  -- BUG FIX: the column comment was a copy-paste of song_code's ('歌曲编码');
  -- this column holds the song name.
  `song_name` varchar(64) NOT NULL COMMENT '歌曲名称',
  `singer_code` varchar(32) NOT NULL COMMENT '歌手编码',
  `song_tag` varchar(128) DEFAULT NULL COMMENT '歌曲标签',
  `lyric` text DEFAULT NULL COMMENT '歌词',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 0 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;