接触爬虫差不多一个月,上班时候看视频学习,学了大概五六十个小时的样子,代码写得烂,希望别见怪。后期继续努力,以下是我抓取网易云音乐
抓取的页面
https://music.163.com/#/discover/artist/cat?id=4003&initial=0
import urllib.request
import random
from lxml import etree
from openpyxl import workbook,load_workbook
# Pool of desktop User-Agent strings; one is chosen at random per session
# so requests look like they come from an ordinary browser.
head = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
]
def get_head(head):
    """Disguise the crawler as a browser.

    Picks one User-Agent string at random from *head*, builds a urllib
    opener carrying it, and installs that opener process-wide, so every
    later ``urllib.request.urlopen`` call sends the chosen User-Agent.

    Returns None; the effect is the global opener installation.
    """
    user_agent = random.choice(head)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', user_agent)]
    # install_opener makes this the default opener for the whole process.
    urllib.request.install_opener(opener)
def getlink(url):
get_head(head)
data = urllib.request.urlopen(url).read().decode()
texy = etree.HTML(data)
aa = texy.xpath('//a[@class="nm nm-icn f-thide s-fc0"]/@href')
# print(aa) #元素带有空格
shankong=[]
for i in aa: #这个for是删除列表里面空格
i=i.strip()
shankong.append(i)
hechengdizhi = [url] #这里把地址添加上是为了方便写入到execl表格,可以看图
for h in shankong: #合成地址
xianglink = 'https://music.163.com'+h
hechengdizhi