Python Web Scraping Examples

Case 1: Scraping the Kugou TOP500 chart

Analysis:

1. The requests library fetches the page, BeautifulSoup parses the returned HTML (see the parsing sketch after this list), and time.sleep() pauses the program between requests.

2. A User-Agent header is added so the requests look like they come from a browser, which makes the scraper less likely to be blocked.

3. get_info() fetches one chart page and prints the extracted records.

4. The program entry point builds the 23 page URLs with a list comprehension and calls get_info() on each one.
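
Before the full program, here is a minimal, self-contained sketch of how BeautifulSoup's select() pulls the fields out of a page. The HTML fragment below is invented for illustration; only its class names mirror the ones get_info() targets on the real Kugou page:

from bs4 import BeautifulSoup

# Invented fragment shaped like one row of the Kugou chart page.
html = '''
<div class="pc_temp_songlist"><ul><li>
    <a>Some Singer - Some Song</a>
    <span class="pc_temp_num"> 1 </span>
    <span class="pc_temp_tips_r"><span> 3:45 </span></span>
</li></ul></div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.select('span.pc_temp_num')[0].get_text().strip())            # 1
print(soup.select('div.pc_temp_songlist > ul > li > a')[0].get_text())  # Some Singer - Some Song
print(soup.select('span.pc_temp_tips_r > span')[0].get_text().strip())  # 3:45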

Code:

import requests
from bs4 import BeautifulSoup
import time  # import the required libraries

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, song_time in zip(ranks, titles, times):  # 'song_time' avoids shadowing the time module
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],  # split() separates the singer from the song title
            'time': song_time.get_text().strip()
        }
        print(data)  # print each scraped record as a dict

if __name__ == '__main__':  # program entry point
    urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 24)]  # build the 23 page URLs
    for url in urls:
        get_info(url)  # scrape each page in turn
        time.sleep(1)  # pause for 1 second between requests
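
One detail worth noting: the singer/song split relies on Kugou formatting chart titles as 'singer - song', so index [0] is the singer and index [1] is the song. A quick illustration with an invented title string:

title = 'Some Singer - Some Song'     # hypothetical title in Kugou's format
singer = title.split('-')[0].strip()  # 'Some Singer'
song = title.split('-')[1].strip()    # 'Some Song'
print(singer, '|', song)

If a song title can itself contain a hyphen, split('-', 1) is the safer choice, since it splits only on the first one.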

Case 2: Downloading an artist's top 50 from NetEase Cloud Music

This script reads the song list that the artist page embeds as JSON inside a <textarea> element, then fetches each track through the music.163.com outer/url media endpoint.

from bs4 import BeautifulSoup
import requests
import json
import os
import time

class Music:
    def __init__(self, init_url, download):
        self.init_url = init_url
        self.download = download

    def mkdir(self, path):
        path = path.strip()
        if not os.path.exists(path):  # create the folder only if it does not exist yet
            print('Creating folder', path)
            os.makedirs(path)
            return True
        else:
            print(path, 'already exists, nothing to create')
            return False

    def download_video(self, video_url, name):
        path = os.path.join(self.download, name + '.mp3')  # target file path (os.path.join handles the separator)
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36",
        }
        header = {
            "Origin": "http://music.163.com/",
            "Referer": video_url,  # the request must carry a Referer header
        }
        headers.update(header)  # merge the two header dicts
        size = 0
        start = time.time()
        try:
            result = requests.get(video_url, headers=headers, stream=True, verify=False)
            with open(path, "wb") as f:
                for chunk in result.iter_content(1024):
                    f.write(chunk)
                    f.flush()  # flush the write buffer
                    size = size + len(chunk)
                    print("Downloaded: %0.2f MB" % (size / (1024 * 1024)))
        except Exception as e:
            print("Download failed for URL: %s" % video_url)
            print(e)
        stop = time.time()
        print("Finished, took %0.2f seconds" % (stop - start))

    def spider(self):
        r = requests.get(self.init_url).text
        soupObj = BeautifulSoup(r, 'lxml')
        song_ids = soupObj.find('textarea').text  # the artist page embeds its song list as JSON in a <textarea>
        jobj = json.loads(song_ids)
        list01 = []
        for item in jobj:
            dict01 = {}
            dict01['name'] = item['name']  # song title
            dict01['id'] = item['id']      # song id
            list01.append(dict01)
        print(list01)
        len_list = len(list01)
        print("Found", len_list, "songs in total")
        self.mkdir(self.download)
        print('Switching into the download folder')
        os.chdir(self.download)
        for i in list01:
            name = i['name']
            song_id = i['id']  # 'song_id' avoids shadowing the built-in id()
            song_url = "http://music.163.com/song/media/outer/url?id=" + str(song_id) + ".mp3"
            print(song_url)  # the final download link
            self.download_video(song_url, name)  # download the track
            len_list = len_list - 1
            print(len_list, "songs left to download")

if __name__ == '__main__':
    init_url = 'https://music.163.com/artist?id=30284835'  # artist page (枯木逢春)
    download = r'D:\Music\downloads'  # save location; raw string keeps the backslashes literal
    s = Music(init_url, download)
    s.spider()
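
A note on the endpoint used above: music.163.com/song/media/outer/url does not serve the file directly; it answers with a redirect to the actual media location, which requests follows automatically. Assuming the endpoint still behaves that way, the redirect target can be inspected like this (the song id is a made-up placeholder):

import requests

url = 'http://music.163.com/song/media/outer/url?id=123456.mp3'  # placeholder id
r = requests.get(url, allow_redirects=False)  # do not follow the redirect
print(r.status_code)              # typically 302
print(r.headers.get('Location'))  # the real media URL, when one is available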
