通过爬虫获取博客上发布的数据集。
从博客中爬取得到的数据集如下:
代码:
import requests
import json
from bs4 import BeautifulSoup
import logging
import time
# Spoof a desktop-browser User-Agent so the endpoints serve normal pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
}

# Log every request and any error to a file so failures can be diagnosed later.
logging.basicConfig(
    filename='qqM.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
)
class QQMusic:
    """Crawl album metadata from the QQ Music web pages.

    ``getAlbumLinks`` walks the paginated album-library JSONP endpoint and
    collects the public album-page URLs; ``getAlbumInfo`` scrapes one album
    page into a flat dict.  When a page cannot be scraped (copyright block,
    404, network error) ``getAlbumInfo`` returns a dict whose values are all
    the string ``"null"``, matching the original crawler's contract.
    """

    # Number of library pages to walk (the API serves 20 albums per page).
    PAGE_COUNT = 100
    # Polite delay (seconds) between successive HTTP requests.
    REQUEST_DELAY = 2
    # Abort a stalled connection instead of hanging forever.
    REQUEST_TIMEOUT = 10
    # Keys of the result dict; also used for the all-"null" failure fallback.
    _RESULT_KEYS = ('album_name', 'singer', 'music_genre', 'language',
                    'pub_time', 'pub_company', 'album_type', 'album_intro',
                    'album_info2')

    def __init__(self):
        pass

    def getAlbumLinks(self):
        """Return album page URLs collected from the first PAGE_COUNT pages.

        The endpoint answers with JSONP (``MusicJsonCallback({...})``); the
        wrapper is stripped before ``json.loads``.

        Returns:
            list[str]: ``https://y.qq.com/n/yqq/album/<album_mid>.html`` URLs.
        """
        album_links = []
        print("qqMusicSpider starts············")
        for page_num in range(self.PAGE_COUNT):  # the API pages from 0
            base_url = ('https://c.y.qq.com/v8/fcg-bin/album_library'
                        '?cmd=get_album_info&page=' + str(page_num) +
                        '&pagesize=20&sort=1')
            logging.info(base_url)
            time.sleep(self.REQUEST_DELAY)  # throttle to avoid hammering the API
            page = requests.get(base_url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
            # Strip exactly the leading callback name and the single trailing
            # ")".  The old code blindly replaced the FIRST occurrence of "})"
            # anywhere in the body, which could corrupt a payload that happened
            # to contain "})" inside a string value.
            payload = page.text.strip()
            prefix = 'MusicJsonCallback('
            if payload.startswith(prefix) and payload.endswith(')'):
                payload = payload[len(prefix):-1]
            html_dict = json.loads(payload)
            album_list = html_dict['data']['albumlist']
            # album_mid is the path component of the public album page URL.
            for album in album_list:
                link_id = album['album_mid']
                album_links.append(
                    'https://y.qq.com/n/yqq/album/' + str(link_id) + '.html')
            print("Get album_urls of Page " + str(page_num + 1))
        return album_links

    def getAlbumInfo(self, url):
        """Scrape one album page into a flat dict.

        Args:
            url: album page URL as produced by ``getAlbumLinks``.

        Returns:
            dict with keys ``album_name``, ``singer``, ``music_genre``,
            ``language``, ``pub_time``, ``pub_company``, ``album_type``,
            ``album_intro`` and ``album_info2`` (list of per-track dicts).
            On any scraping failure every value is the string ``"null"``.
        """
        logging.info(url)
        album_dict = {}
        try:
            time.sleep(self.REQUEST_DELAY)  # throttle requests
            info_page = requests.get(url, headers=headers,
                                     timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(info_page.text, 'html.parser')
            album_dict['album_name'] = soup.find_all(
                'h1', class_='data__name_txt')[0].get_text(strip=True)
            # Some albums list no singer at all; those pages use different
            # markup for the genre/date/company block.  The old code indexed
            # find_all(...)[0] unconditionally, so a missing singer raised
            # IndexError and the "no singer" branch below was unreachable.
            singer_nodes = soup.find_all('a', class_='data__singer_txt')
            singer = singer_nodes[0].get_text(strip=True) if singer_nodes else ''
            if singer:
                album_dict['singer'] = singer
                self._parse_with_singer(soup, album_dict)
            else:
                self._parse_without_singer(soup, album_dict)
            album_dict['album_intro'] = soup.find_all(
                'div', class_='about__cont')[0].get_text(strip=True)
            album_dict['album_info2'] = self._parse_tracks(soup)
            return album_dict
        except Exception:
            # Some pages fail (copyright block or page not found); per the
            # original contract return all-"null" values.  logging.exception
            # keeps the traceback (the old logging.info(e) dropped it), and
            # 'album_intro' is now included (the old fallback forgot it).
            logging.exception('failed to scrape %s', url)
            return {key: "null" for key in self._RESULT_KEYS}

    def _parse_without_singer(self, soup, album_dict):
        """Fill publication fields for an album page that lists no singer.

        The info block may have 3, 4 or 5 items depending on which of
        performer / release date / company / translated name / type exist.
        """
        info_all = soup.find_all('ul', class_='data_info__item')
        album_dict['singer'] = "null"
        album_dict['music_genre'] = "null"
        album_dict['language'] = "null"
        if len(info_all) == 4:
            album_dict['pub_time'] = info_all[1].get_text(strip=True)
            album_dict['pub_company'] = info_all[2].get_text(strip=True)
            album_dict['album_type'] = 'null'  # type item absent on these pages
        elif len(info_all) == 3:
            # Only release date, company and type are present.
            album_dict['pub_time'] = info_all[0].get_text(strip=True)
            album_dict['pub_company'] = info_all[1].get_text(strip=True)
            album_dict['album_type'] = info_all[2].get_text(strip=True)
        else:
            album_dict['pub_time'] = info_all[1].get_text(strip=True)
            album_dict['pub_company'] = info_all[2].get_text(strip=True)
            album_dict['album_type'] = info_all[4].get_text(strip=True)

    def _parse_with_singer(self, soup, album_dict):
        """Fill genre/language/publication fields when a singer is present.

        With 4 info items only one of company/type exists; they are told
        apart by the label text inside the item.
        """
        info_all = soup.find_all('li', class_='data_info__item_song')
        album_dict['music_genre'] = info_all[0].get_text(strip=True)
        album_dict['language'] = info_all[1].get_text(strip=True)
        album_dict['pub_time'] = info_all[2].get_text(strip=True)
        if len(info_all) == 4:
            unknown = info_all[3].get_text(strip=True)
            if '发行公司' in unknown:
                album_dict['pub_company'] = unknown
                album_dict['album_type'] = "无"
            if '类型' in unknown:
                album_dict['album_type'] = unknown
                album_dict['pub_company'] = "无"
        else:
            album_dict['pub_company'] = info_all[3].get_text(strip=True)
            album_dict['album_type'] = info_all[4].get_text(strip=True)

    @staticmethod
    def _parse_tracks(soup):
        """Return the track list as ``{'song', 'who_sing', 'time'}`` dicts."""
        songs = [i.get_text(strip=True)
                 for i in soup.find_all('span', class_='songlist__songname_txt')]
        artists = [i.get_text(strip=True)
                   for i in soup.find_all('div', class_='songlist__artist')]
        durations = [i.get_text(strip=True)
                     for i in soup.find_all('div', class_='songlist__time')]
        return [{'song': s, 'who_sing': a, 'time': t}
                for s, a, t in zip(songs, artists, durations)]