# 这个实例用正则表达式最为简单。
import re
import requests
import xlwt
# HTTP headers sent with every request; the User-Agent string identifies the
# client as desktop Chrome 70 on Windows 10.
key_value = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    ),
}
def get_html(url, timeout=10):
    """Download *url* and return the decoded response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response text, decoded with the encoding detected from the body,
        or ``None`` when the request fails (connection error, timeout, or an
        HTTP error status).
    """
    try:
        response = requests.get(url, headers=key_value, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return None
    # Prefer the encoding sniffed from the body over the one declared in the
    # response headers.
    response.encoding = response.apparent_encoding
    return response.text
def get_info(html, mylist):
    """Extract chart rows from *html* and append them to *mylist*.

    Regular expressions are used because the page structure proved awkward
    to query with XPath or CSS selectors.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) is accepted and yields no rows.
        mylist: List that receives one ``[rank, title, singer]`` entry per
            matched song; it is modified in place.

    Returns:
        None. Results are delivered through *mylist*.
    """
    if not html:
        return

    # Raw strings keep ``\?`` as a regex escape instead of an invalid
    # string escape sequence.
    ranks = re.findall(r'<p class="num">(.*?)</p>', html, re.S)
    titles = re.findall(r'2016" target="_blank">(.*?)</a>', html, re.S)
    singers = re.findall(r'<a href="/artist/content\?name=(.*?)">', html, re.S)

    # zip() stops at the shortest of the three lists, so a partially matched
    # trailing entry is dropped rather than misaligned.
    mylist.extend(
        [rank, title, singer]
        for rank, title, singer in zip(ranks, titles, singers)
    )
def getExel(mylist, path='D:/酷我音乐排行榜前200.xls'):
    """Write the scraped rows to an ``.xls`` workbook.

    Args:
        mylist: Iterable of rows, each an iterable of cell values written
            left to right starting at column 0.
        path: Destination file for the workbook. Defaults to the location
            the script has always written to.

    Returns:
        None. The workbook is saved to *path*.
    """
    header = ['排名', '歌名', '歌手']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')

    for col, caption in enumerate(header):
        sheet.write(0, col, caption)

    # Data rows start at row 1 because the header occupies row 0.
    for row_idx, row in enumerate(mylist, start=1):
        for col, value in enumerate(row):
            sheet.write(row_idx, col, value)

    book.save(path)
# Script entry point: download the chart page, parse it, and export the rows
# to an Excel workbook.
if __name__ == '__main__':
    mylist = []
    url = 'http://www.kuwo.cn/bang/index'
    html = get_html(url)
    if html is None:
        # get_html returns None when the download fails; stop here rather
        # than handing None to the parser.
        raise SystemExit('Failed to download ' + url)
    get_info(html, mylist)
    getExel(mylist)
# 每天坚持。
# Result!
# 热爱生活,热爱编程。