【Python——爬取网易云相关歌曲信息(多线程)】

import re
import requests
import Proxy
import time
from threading import Thread
# Request headers and per-stage cookies shared by every request below.
# NOTE(review): `省略` means "omitted" -- the author elided the real values.
# As written, `{省略}` is a set literal over an undefined name and raises
# NameError; replace each placeholder with a real dict before running.
headers = {省略}
cookies1 = {省略} 
cookies2 = {省略} 
cookies3 = {省略} 
cookies4 = {省略}
# Proxy settings passed as `proxies=` to every request; task() rebinds this
# global when a request fails.
# NOTE(review): presumably a requests-style proxies mapping -- confirm in Proxy.
proxies = Proxy.get_ip()
# Entry page that is fetched once at import time, before any thread starts.
url1 = 'https://music.163.com/discover/artist'
response1 = requests.get(url=url1, proxies=proxies, headers=headers, cookies=cookies1)
# bytes.decode() with no argument decodes the body as UTF-8.
content1 = response1.content.decode()
# Debug aid (disabled): print the fetched page or dump it to wyy.txt.
# print(content)
# with open('wyy.txt', 'w',encoding='utf8') as f:
#     f.write(content)
# Each match is an (href, text) pair: the href attribute value and the text
# that directly follows the tag's closing '>'. The slice keeps matches 6..20,
# i.e. at most 15 entries; each is later handed to task() as `data`.
# NOTE(review): presumably these indices are the artist-category links on the
# page -- the offsets depend on the page layout, verify against live HTML.
datas = re.findall('href="([^"]+)"[^>]*>([^<]+)<', content1)[6:21]
print(datas)


_BASE_URL = 'https://music.163.com'


def _fetch_page(url, cookies, attempts=2, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8, or None on failure.

    The request uses the module-level ``proxies`` and ``headers``. After each
    failed attempt the global ``proxies`` is replaced with a fresh value from
    ``Proxy.get_ip()`` so that the next request (here or in another call)
    goes through a different proxy.

    Args:
        url: Absolute URL to request.
        cookies: Cookies to send with the request.
        attempts: Maximum number of tries before giving up.
        timeout: Seconds to wait on the connection/read before treating the
            attempt as failed; without it a stalled proxy blocks the thread
            indefinitely.

    Returns:
        The decoded page text, or None when every attempt raised a
        ``requests.RequestException``.
    """
    global proxies
    for _ in range(attempts):
        try:
            response = requests.get(url=url, proxies=proxies, headers=headers,
                                    cookies=cookies, timeout=timeout)
        except requests.RequestException:
            # Only network-level failures trigger a proxy swap; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            proxies = Proxy.get_ip()
        else:
            return response.content.decode()
    return None


def task(data):
    """Print the song titles found for every singer linked from one page.

    Args:
        data: An ``(href, text)`` pair. ``href`` is a path relative to
            https://music.163.com and ``text`` is the label printed in front
            of every result line.

    For each singer link on that page, the function follows the singer link,
    looks for the "查看歌手页" button on the resulting page, follows it, and
    prints ``<text>---<singer>---<list of song titles>``. A page that cannot
    be fetched is reported and skipped instead of crashing the thread.
    """
    category_name = data[1]
    category_html = _fetch_page(_BASE_URL + data[0], cookies2)
    if category_html is None:
        print(f'{category_name}: failed to fetch page, skipping')
        return
    singers = re.findall('<a class="f-tdn" href="(.*?)" title="(.*?)">', category_html)

    for singer_path, singer_name in singers:
        singer_html = _fetch_page(_BASE_URL + singer_path, cookies3)
        if singer_html is None:
            print(f'{category_name}---{singer_name}: failed to fetch page, skipping')
            continue
        links = re.findall('<a href="(.*?)" hidefocus="true" class="u-btn2 u-btn2-1"><i>查看歌手页</i></a>',
                           singer_html)

        songs = []
        # Without a link there is nothing to follow; requesting the bare site
        # root would only yield unrelated content. With several matches, the
        # first one is used rather than gluing the hrefs into one invalid URL.
        if links:
            artist_html = _fetch_page(_BASE_URL + links[0], cookies4)
            if artist_html is None:
                print(f'{category_name}---{singer_name}: failed to fetch page, skipping')
                continue
            songs = re.findall(r'<li><a href="/song\?id=[0-9]+">(.*?)</a></li>', artist_html)
        print(f'{category_name}---{singer_name}---{songs}\n')


if __name__ == '__main__':
    # Run task() for every scraped entry in its own thread and report the
    # total wall-clock time once all of them have finished.
    start_time = time.time()
    t_list = []
    # Iterate over what was actually scraped: the slice that builds `datas`
    # may hold fewer than 15 entries, and indexing a fixed range(15) would
    # then raise IndexError before all threads are started.
    for data in datas:
        t = Thread(target=task, args=(data,))
        t.start()
        t_list.append(t)
    for t in t_list:
        t.join()
    print("cost time: ", time.time() - start_time)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值