1. Example: scraping the first page of the Kugou Top 500
import requests
from bs4 import BeautifulSoup
# 1. Analyze the target page and determine the URL to crawl and the request headers
url = 'https://www.kugou.com/yy/rank/home/1-8888.html'
headers = {
    'User-Agent': 'replace with your own User-Agent'
}
# 2. Send the request -- use requests to mimic a browser and fetch the response
response = requests.get(url=url, headers=headers)
html = response.text
# 3. Parse the data -- BeautifulSoup parses the HTML (available parsers include 'html.parser' and 'lxml'); extract elements with the select method and a CSS selector (in Chrome, find the element in DevTools, then right-click - Copy - Copy selector)
soup = BeautifulSoup(html, 'html.parser')
# Extract the rank numbers
ranks = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > span.pc_temp_num')
# for rank in ranks:
#     print(rank.get_text())
# Extract the 'singer - song' titles
names = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')
# for name in names:
#     print(name.get_text())
# Extract the durations
times = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > span.pc_temp_tips_r > span')
# for time in times:
#     print(time.get_text())
# Pack the fields into one record per song
for rank, name, time in zip(ranks, names, times):
    title = name.get_text()
    data = {
        'rank': rank.get_text().strip(),
        # Split 'singer - song' at the first hyphen only, so hyphens inside the song title survive
        'singer': title.split('-', 1)[0].strip(),
        'song': title.split('-', 1)[1].strip(),
        'time': time.get_text().strip()
    }
    print(data)
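If the long Copy-selector paths look opaque, here is a minimal, self-contained sketch of what select is matching. The HTML fragment below is a simplified stand-in shaped like the Kugou list markup (the singer, song, and duration values are made up for illustration), not the live page:

from bs4 import BeautifulSoup

# Toy fragment with the same nesting the selectors above rely on (assumed structure)
demo_html = '''
<div id="rankWrap">
  <div class="pc_temp_songlist">
    <ul>
      <li>
        <span class="pc_temp_num">1</span>
        <a href="#">Some Singer - Some Song</a>
        <span class="pc_temp_tips_r"><span>3:45</span></span>
      </li>
    </ul>
  </div>
</div>
'''
demo_soup = BeautifulSoup(demo_html, 'html.parser')
# The same child-combinator path as above; each '>' requires a direct child
print(demo_soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')[0].get_text())
# -> Some Singer - Some Song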
2. Scraping all pages of the Kugou Top 500
The only change is generating every page's URL; the parsing and extraction code stays the same (just substitute the page number into the URL). The ranking spans 23 pages, so the loop runs from 1 to 23.
import requests
from bs4 import BeautifulSoup
for i in range(1, 24):  # pages 1-23 cover the full Top 500
    # 1. Build this page's URL; the headers are unchanged
    url = 'https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i)
    headers = {
        'User-Agent': 'replace with your own User-Agent'
    }
    # 2. Send the request -- use requests to mimic a browser and fetch the response
    response = requests.get(url=url, headers=headers)
    html = response.text
    # 3. Parse the data -- same BeautifulSoup setup and select calls as before
    soup = BeautifulSoup(html, 'html.parser')
    ranks = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > span.pc_temp_num')
    # for rank in ranks:
    #     print(rank.get_text())
    names = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')
    # for name in names:
    #     print(name.get_text())
    times = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > span.pc_temp_tips_r > span')
    # for time in times:
    #     print(time.get_text())
    for rank, name, time in zip(ranks, names, times):
        title = name.get_text()
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.split('-', 1)[0].strip(),
            'song': title.split('-', 1)[1].strip(),
            'time': time.get_text().strip()
        }
        print(data)
        # 4. Save to file -- append one record per line
        with open('top_500.txt', mode='a', encoding='utf-8') as f:
            f.write(str(data) + '\n')
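str(data) writes Python dict literals, which are awkward to parse back later. If another program is meant to consume the file, a simple alternative is to write each record as one JSON object per line; the top_500.jsonl file name here is my own choice, not from the original:

import json

# Inside the inner loop, in place of f.write(str(data) + '\n'):
with open('top_500.jsonl', mode='a', encoding='utf-8') as f:
    # ensure_ascii=False keeps Chinese titles human-readable in the file
    f.write(json.dumps(data, ensure_ascii=False) + '\n')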
3. Crawling in parallel with a process pool
Wrap the request and parsing steps in functions so that each page can be handled by a single worker.
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool  # pool of worker processes for parallel crawling
headers = {
    'User-Agent': 'replace with your own User-Agent'
}
# Fetch one ranking page and extract its records
def get_info(url):
    # Request the page
    response = requests.get(url, headers=headers)
    html = response.text
    # Parse the page
    soup = BeautifulSoup(html, 'html.parser')
    # Extract rank, title, and duration
    ranks = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > span.pc_temp_num')
    names = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')
    times = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > span.pc_temp_tips_r > span')
    for rank, name, time in zip(ranks, names, times):
        data = {
            'rank': rank.get_text().strip(),
            'name': name.get_text(),
            'time': time.get_text().strip()
        }
        print(data)
# Build the URL for one page and crawl it
def get_url(page):
    url = f'https://www.kugou.com/yy/rank/home/{page}-8888.html'
    get_info(url)
# Sequential version for comparison:
# if __name__ == '__main__':
#     for i in range(1, 24):
#         get_url(i)
# Parallel crawl: map the 23 page numbers across a pool of worker processes
if __name__ == '__main__':
    with Pool() as pool:
        pool.map(get_url, range(1, 24))
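One terminology note: multiprocessing.Pool runs worker processes, not threads. Since this crawl is I/O-bound (workers mostly wait on the network), a thread pool from the standard library is a reasonable alternative; a minimal sketch reusing the same get_url, where max_workers=8 is an arbitrary tuning choice:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    # Threads suit I/O-bound work and avoid the overhead of spawning processes
    with ThreadPoolExecutor(max_workers=8) as executor:
        executor.map(get_url, range(1, 24))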