获取热门歌手信息:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import csv
def get_html_src(url):
    """Load *url* in Chrome and return the page source of its ``g_iframe`` frame.

    Args:
        url: Address of the page to open.

    Returns:
        ``driver.page_source`` as read after switching into the frame named
        ``g_iframe`` and waiting three seconds.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # ``switch_to_frame`` is the deprecated spelling and is gone from
        # Selenium 4; ``switch_to.frame`` is the supported API.
        driver.switch_to.frame("g_iframe")
        # Sleep three seconds to let the frame finish loading.
        time.sleep(3)
        return driver.page_source
    finally:
        # Always release the browser, even when loading or the frame switch
        # fails. quit() closes every window and ends the driver session,
        # whereas close() only closes the current window.
        driver.quit()
def parse_html_page(html):
    """Parse the page and return the ``<li>`` tags of the artist list.

    Args:
        html: Page source to parse.

    Returns:
        The ``find_all('li')`` result for the ``<ul id="m-artist-box">``
        element, i.e. the entries where the popular artists are located.
    """
    document = BeautifulSoup(html, "lxml")
    artist_box = document.find('ul', attrs={'id': 'm-artist-box'})
    return artist_box.find_all('li')
def write_to_csv(items):
    """Write one ``[artist_name, artist_id]`` row per item to ``music163_artists.csv``.

    The file is created (or overwritten) in the current working directory and
    each ``artist_id artist_name`` pair is also printed.

    Args:
        items: Iterable of objects whose ``.a`` supports ``['href']`` and
            ``['title']`` lookups (the ``<li>`` tags from the artist list).
    """
    id_prefix = '/artist?id='
    title_suffix = "的音乐"
    # newline='' leaves line endings to the csv module and avoids blank rows.
    with open("music163_artists.csv", 'w', newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        for item in items:
            link = item.a
            # Strip stray whitespace so the id can be appended to a URL as-is.
            artist_id = link['href'].replace(id_prefix, '').strip()
            artist_name = link['title']
            # Remove the suffix as a whole. str.rstrip() treats its argument
            # as a set of characters and would also eat a name's own trailing
            # characters (e.g. "快乐的音乐" would become "快").
            if artist_name.endswith(title_suffix):
                artist_name = artist_name[:-len(title_suffix)]
            print(artist_id, artist_name)
            writer.writerow([artist_name, artist_id])
    # No explicit close(): the with-block already closed the file.
# Script 1: load the artist discovery page, pull out the artist entries and
# store them as CSV.
artist_page = get_html_src('https://music.163.com/#/discover/artist')
write_to_csv(parse_html_page(artist_page))
获取热门歌手歌曲:
def get_html_src(url):
    """Load *url* in Chrome and return the page source of its ``g_iframe`` frame.

    Args:
        url: Address of the page to open.

    Returns:
        ``driver.page_source`` as read after switching into the frame named
        ``g_iframe`` and waiting three seconds.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # ``switch_to_frame`` is the deprecated spelling and is gone from
        # Selenium 4; ``switch_to.frame`` is the supported API.
        driver.switch_to.frame("g_iframe")
        # Sleep three seconds to let the frame finish loading.
        time.sleep(3)
        return driver.page_source
    finally:
        # Always release the browser, even when loading or the frame switch
        # fails. quit() closes every window and ends the driver session,
        # whereas close() only closes the current window.
        driver.quit()
#get_html_src('https://music.163.com/#/artist?id=6731')
def parse_html_page(html):
    """Parse the page and return every ``<span class="txt">`` tag in it.

    Args:
        html: Page source to parse.

    Returns:
        The ``find_all`` result for ``span`` tags with CSS class ``txt``.
    """
    # The lxml parser is fast and tolerant of malformed documents; html5lib
    # could be used instead.
    document = BeautifulSoup(html, "lxml")
    return document.find_all('span', class_='txt')
def write_to_csv(items, artist_name):
    """Append an artist header row and one row per song to ``music163_songs.csv``.

    The file lives in the current working directory and is opened in append
    mode, so repeated calls accumulate. The first row written is
    ``["歌手名字", artist_name]``, followed by ``[song_id, song_name]`` rows.
    Each song id and name is also printed.

    Args:
        items: Iterable of objects whose ``.a`` supports ``['href']`` and
            whose ``.b`` supports ``['title']`` (the ``<span class="txt">``
            tags from an artist page).
        artist_name: Name written into the header row.
    """
    # newline='' leaves line endings to the csv module; without it every row
    # is followed by a blank line on Windows. The encoding must be explicit.
    with open("music163_songs.csv", "a", newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["歌手名字", artist_name])
        for item in items:
            song_id = item.a['href'].replace('/song?id=', '')
            song_name = item.b['title']
            writer.writerow([song_id, song_name])
            print('歌曲ID:', song_id)
            print('歌曲名字:', song_name)
    # No explicit close(): the with-block already closed the file.
def read_csv(path=r"E:\pyobject\music\music163_artists.csv"):
    """Yield ``(artist_name, artist_id)`` pairs from the artist CSV file.

    Args:
        path: CSV file to read. Defaults to the location this script
            originally hard-coded, so existing calls without arguments
            behave as before.

    Yields:
        ``(artist_name, artist_id)`` tuples of strings, one per non-empty row.

    Raises:
        ValueError: If a non-empty row does not have exactly two columns.
    """
    # newline='' is what the csv module expects for files it parses.
    with open(path, "r", newline='', encoding="utf-8") as csvfile:
        for row in csv.reader(csvfile):
            # Blank lines (e.g. from a file written without newline='' on
            # Windows) come back as empty rows; skip them instead of failing
            # the unpack below.
            if not row:
                continue
            artist_name, artist_id = row
            yield artist_name, artist_id
# Main program: for each (name, id) pair yielded by read_csv(), load that
# artist's page, parse it and pass the parsed items to write_to_csv().
for artist_name, artist_id in read_csv():
    print("正在加载{}的音乐".format(artist_name))
    artist_url = 'https://music.163.com/#/artist?id=' + str(artist_id)
    song_items = parse_html_page(get_html_src(artist_url))
    write_to_csv(song_items, artist_name)
    print("{}的歌曲写入完成".format(artist_name))