python 多线程抓取xunlei磁力下载链接

weixin_53748624

于 2022-10-12 16:24:56 发布

阅读量1.4k

点赞数

文章标签： python pycharm

本文链接：https://blog.csdn.net/weixin_53748624/article/details/127282929

版权

import urllib.request
import re
import time
import threading


class Spider(object):
    """Multi-threaded scraper for film download links on ygdy8.net list pages.

    One worker thread is started per list page. Each worker follows every film
    link found on its page, takes the first download link that appears on the
    film's detail page and records it in ``films_dict`` (film name -> link).
    """

    # Site root; list pages yield site-relative detail-page paths.
    BASE_URL = "https://www.ygdy8.net"
    # List-page URL; ``%d`` is replaced by the page number.
    LIST_URL_TEMPLATE = "https://www.ygdy8.net/html/gndy/china/list_4_%d.html"
    # Seconds before a stalled request is abandoned, so that one dead
    # connection cannot keep start() waiting forever.
    REQUEST_TIMEOUT = 30

    # ``[^"]*`` keeps the href inside its own attribute and ``.*?`` stops at
    # the first ``</a>``, so several anchors on one line are not merged.
    _FILM_ENTRY_RE = re.compile(r'<a href="([^"]*)" class="ulink">(.*?)</a>')
    # Two alternative shapes of a download link; exactly one group is filled.
    _DOWNLOAD_LINK_RE = re.compile(r' href="([^"]*)"><strong>|磁力链  (.*)</font>')

    def __init__(self):
        # Maps film name -> download link; written by workers under lock1.
        self.films_dict = {}
        # 1-based ordinal of the next film to be recorded (progress output).
        self.i = 1
        # Guards films_dict and i against concurrent updates from workers.
        self.lock1 = threading.Lock()

    def start(self, pages=range(1, 3)):
        """Scrape the given list pages concurrently, then print every result.

        Args:
            pages: Iterable of list-page numbers, one worker thread each.
                Defaults to pages 1 and 2.
        """
        workers = [
            threading.Thread(target=self.get_movie_links, args=(page,))
            for page in pages
        ]
        for worker in workers:
            worker.start()
        # Join exactly the threads started here; counting live threads via
        # threading.enumerate() breaks as soon as any unrelated thread exists.
        for worker in workers:
            worker.join()

        for file_name, film_link in self.films_dict.items():
            print("%s | %s" % (file_name, film_link))

    def get_movie_links(self, page):
        """Collect the download link of every film listed on one list page.

        Pages that cannot be fetched are reported and skipped so that a single
        network failure does not discard the rest of the work.

        Args:
            page: Number of the list page to scrape.
        """
        film_list_url = self.LIST_URL_TEMPLATE % page
        try:
            list_html = self._fetch_text(film_list_url)
        except OSError as err:
            print("列表页抓取失败，已跳过：%s (%s)" % (film_list_url, err))
            return

        for content_path, film_name in self._parse_film_list(list_html):
            content_url = self.BASE_URL + content_path
            try:
                content_html = self._fetch_text(content_url)
            except OSError as err:
                print("内容页抓取失败，已跳过：%s (%s)" % (content_url, err))
                continue

            film_link = self._extract_download_link(content_html)
            if film_link is not None:
                self._record_film(film_name, film_link)

    def _fetch_text(self, url):
        """Download ``url`` and return its body decoded as GBK text.

        Raises:
            OSError: If the request fails or times out.
        """
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(url, timeout=self.REQUEST_TIMEOUT) as response:
            raw = response.read()
        # Undecodable bytes become U+FFFD instead of aborting the whole page.
        return raw.decode("GBK", errors="replace")

    @classmethod
    def _parse_film_list(cls, list_html):
        """Return ``(detail_page_path, film_name)`` pairs found in a list page."""
        return cls._FILM_ENTRY_RE.findall(list_html)

    @classmethod
    def _extract_download_link(cls, content_html):
        """Return the first download link in a detail page, or None if absent."""
        match = cls._DOWNLOAD_LINK_RE.search(content_html)
        if match is None:
            return None
        # Only one side of the alternation participates in a match, so take
        # whichever group was filled rather than always group 1.
        return match.group(match.lastindex)

    def _record_film(self, film_name, film_link):
        """Store one result and print progress, atomically across workers."""
        # The counter is read, printed and incremented under the same lock as
        # the dict write so concurrent workers cannot interleave or lose counts.
        with self.lock1:
            self.films_dict[film_name] = film_link
            print("已经成功爬取%d个影片地址！" % self.i)
            self.i += 1


def main():
    """Create a Spider and run a full scrape."""
    Spider().start()


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()