以网易云音乐为例,提取歌手陈奕迅eason所有专辑信息(包括专辑名称、专辑图片网址、专辑网址、发布时间),并保存到txt文件中
链接为:网易云陈奕迅所有专辑
1. 导入相关模块以及初始化
在这里面我们利用lxml.etree模块下面的xpath()方法提取信息
import requests
from urllib.parse import urlencode
from lxml import etree
import time
# Album-listing endpoint for an artist; get_page() appends the encoded query string.
base_url = 'https://music.163.com/artist/album?'

# Headers sent with every request. The User-Agent is a desktop Firefox string
# (presumably so the site serves the regular HTML page -- not verified here).
headers = dict([
    ('Host', 'music.163.com'),
    ('Referer', 'https://music.163.com/'),
    (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:65.0) '
        'Gecko/20100101 Firefox/65.0',
    ),
])

# Number of listing pages the main loop iterates over.
max_page = 9
2. 利用Ajax获取网站信息
def get_page(page):
    """Fetch one page of the artist's album listing.

    Args:
        page: Zero-based page index. It is turned into the ``offset`` query
            parameter as ``12 * page`` (12 albums are requested per page).

    Returns:
        A ``(text, page)`` tuple. ``text`` is the response body when the
        server answers with HTTP 200, and ``None`` when the request fails or
        any other status code is returned, so callers can always unpack the
        result.
    """
    page_size = 12
    params = {
        'id': '2116',  # artist id used throughout this tutorial
        'limit': str(page_size),
        'offset': page_size * page,
    }
    url = base_url + urlencode(params)
    try:
        # A timeout prevents the crawl from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # RequestException is the base class of ConnectionError and Timeout.
        print('Error', e.args)
        return None, page
    if response.status_code == 200:
        return response.text, page
    return None, page
3. 解析网页,提取信息
def parse_page(text):
    """Extract album records from the HTML of an album-listing page.

    Args:
        text: HTML source of a listing page; may be ``None`` or empty.

    Returns:
        ``None`` when ``text`` is falsy. Otherwise a list with one
        ``[title, image_url, album_url, release_time]`` list per album. The
        list is empty when the page does not contain the album container
        (``<ul id="m-song-module">``).
    """
    if not text:
        return None

    html = etree.HTML(text)
    if html is None:
        # The parser produced no document, so there is nothing to extract.
        return []

    items = html.xpath('//html//ul[@id="m-song-module"]')
    print(len(items))
    if not items:
        return []

    album_list = items[0]
    # The leading "." keeps each query inside the album <ul>. A path starting
    # with "//" is absolute and searches the whole document even when it is
    # evaluated on an element, which can misalign the four lists.
    image_url = album_list.xpath('.//img/@src')
    album_time = album_list.xpath('.//p//span[@class="s-fc3"]/text()')
    album_title = album_list.xpath('.//p[@class="dec dec-1 f-thide2 f-pre"]/@title')
    album_url = album_list.xpath('.//div//a[@class="msk"]/@href')

    # zip stops at the shortest list instead of raising IndexError when the
    # page yields an unequal number of matches per field.
    return [
        [title, image, url, released]
        for title, image, url, released in zip(
            album_title, image_url, album_url, album_time
        )
    ]
4. 将提取信息写入文本文件中
def save_to_txt(result, filename='album_result.txt'):
    """Append one record to a text file as a single comma-separated line.

    Args:
        result: Iterable holding the fields of one album record. Each field
            is converted with ``str()`` before joining.
        filename: Path of the output file, opened in append mode. Defaults to
            ``'album_result.txt'`` in the current working directory.

    NOTE: fields are joined with a bare comma and are not quoted, so a field
    that itself contains a comma is indistinguishable from two fields.
    """
    result_str = ",".join(str(field) for field in result)
    with open(filename, 'a', encoding='utf-8', errors='ignore') as f:
        f.write(result_str + '\n')
5. 主程序
if __name__ == '__main__':
    # Crawl the listing page by page and append every album record to the
    # output file.
    for page in range(0, max_page):
        print('****爬取第{}页****'.format(page+1))
        fetched = get_page(page)
        # get_page may return nothing usable (request error or non-200
        # response); take the text only when a result tuple came back.
        text = fetched[0] if fetched else None
        results = parse_page(text)
        # parse_page returns None when there is no page text to parse.
        for result in results or []:
            print(result)
            save_to_txt(result)
        # NOTE(review): the original indentation was lost; one pause per page
        # request is assumed here.
        time.sleep(1)
*运行结果如下所示