# 简单多线程,利用地址池创建多线程,n倍提高爬虫速度,获取音乐文件破解中
# (Simple multithreading: one thread per URL to speed the crawler up n-fold;
#  downloading the actual music files is still a work in progress.)
import re
import time
import json
import datetime
import threading
import requests_html
path = 'json.txt'
session = requests_html.HTMLSession()
data_urls = []
get_all_song_url = []
class Wangyi(object):
def __init__(self):
# 所有歌单页面的url
self.list_urls = [
f'https://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset={offset*35}'
for offset in range(38)
]
self.head = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
self.proxies = {
'http': '171.15.66.177:9999'
}
# 获取所有页面的url
def get_all_page_urls(self):
self.many_thread()
return data_urls
# 获取一个歌单page的所有url
def get_urls(self, url):
rous = session.get(url, headers=self.head, proxies=self.proxies)
html = requests_html.etree.HTML(rous.text)
lis = html.xpath('//*[@id="m-pl-container"]/li')
for li in lis:
# 歌单详情url
url_id = 'https://music.163.com' + li.xpath('./div[1]/a/@href')[0]
# 歌单名
song_name = li.xpath('./div[1]/a/@title')[0]
urls_data = {
"url_id": url_id,
"song_name": song_name,
}
data_urls.append(urls_data)
# 获取所有歌单page的url
def many_thread(self):
threads = []
for url in self.list_urls:
t = threading.Thread(target=Wangyi.get_urls, args=(self, url,))
threads.append(t)
for thread in threads:
thread.start()
for thread in threads:
thread.join()
# 获取一个歌单里所有歌曲的url
def get_song_url(self, url, song_sheet):
head = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
rous = session.get(url, headers=head)
# print(rous.text)
html = requests_html.etree.HTML(rous.text)
links = html.xpath('//*[@class="f-hide"]/li')
for link in links:
song_name = link.xpath('./a/text()')[0]
link = 'https://music.163.com' + link.xpath('./a/@href')[0]
song_id = re.findall('song\?id=(.*)', link)[0]
song_dic = {
"song_sheet": song_sheet, # 歌单
"song_name": song_name, # 歌名
"link": link, # 链接
"song_id": song_id, # 歌曲id
}
get_all_song_url.append(song_dic)
# 多线程获取所有歌单页面里歌曲url
def many_thread_get_song_urls(self):
threads_songs = []
for urls in self.get_all_page_urls():
url = urls['url_id']
song_sheet = urls['song_name']
t = threading.Thread(target=Wangyi.get_song_url, args=(self, url, song_sheet,))
threads_songs.append(t)
for threads_song in threads_songs:
threads_song.start()
for threads_song in threads_songs:
threads_song.join()
if __name__ == "__main__":
Wangyi().many_thread_get_song_urls()
string = json.dumps(get_all_song_url)
with open(path, 'w') as f:
f.write(string)
print(string)
# 有12980首歌 (a full run collected 12,980 songs in total)