说明
python练手,爬取电影天堂的新电影,获取到磁力链接,输出到日志文件,使用transmissionrpc下载,
涉及知识点:
1、python 操作MongoDB,参考文档
2、BeautifulSoup解析html文档,参考官方开发文档
爬取电影磁力地址
配置文件setting.py
# Database connection configuration.
db_config = {
    "host": "localhost",   # MongoDB host
    "port": 27017,         # default MongoDB port
    "db_name": "spider"    # database used by this spider project
}
# Mapping of logical collection names to MongoDB collection names.
db_collections = {
    # one document per crawled movie
    "movies": "movies",
    # bookkeeping for the URL crawl queue (de-duplication + crawled flag)
    "urlManager": "urlManager"
}
使用了MongoDB存储数据,db_utils.py
import conf.settings as settings
import pymongo
# Return the project's MongoDB database handle.
def _get_db():
    """Return the project database.

    The MongoClient is created once and cached on the function itself:
    MongoClient maintains its own connection pool and is designed to be
    long-lived, so opening a brand-new client on every call (the previous
    behaviour) churned connections needlessly.
    """
    client = getattr(_get_db, "_client", None)
    if client is None:
        url = "mongodb://" + settings.db_config["host"] + ":" + str(settings.db_config["port"])
        client = _get_db._client = pymongo.MongoClient(url)
    return client[settings.db_config["db_name"]]
# Insert a single document.
def insert_one(col_name, entry):
    """Insert ``entry`` into the collection named ``col_name``."""
    _get_db()[col_name].insert_one(entry)
# Bulk insert.
def insert_list(col_name, entry_list):
    """Insert every document in ``entry_list`` into ``col_name``."""
    _get_db()[col_name].insert_many(entry_list)
# Look up a single document.
def find_one(col_name, param):
    """Return the first document in ``col_name`` matching ``param``, or None."""
    return _get_db()[col_name].find_one(param)
def find(col_name, param):
    """Return a cursor over all documents in ``col_name`` matching ``param``."""
    return _get_db()[col_name].find(param)
# Update a single document.
def update_one(col_name, param, update):
    """Apply ``update`` to the first document in ``col_name`` matching ``param``."""
    _get_db()[col_name].update_one(param, update)
# Update all matching documents.
def update_many(col_name, param, update):
    """Apply ``update`` to every document in ``col_name`` matching ``param``."""
    _get_db()[col_name].update_many(param, update)
使用数据库制作简单网址爬取链,去重并获取下一个执行网址,url_manager.py
import conf.settings as settings
import libs.db_utils as db_utils
class UrlManager:
    """Database-backed URL queue with de-duplication.

    Each URL lives in one document ``{"url": ..., "isCrawl": bool}``.
    A URL is inserted at most once; ``isCrawl`` flips to True after the
    page has been processed, so crawled pages are never revisited.
    """

    def __init__(self):
        self.col_name = settings.db_collections["urlManager"]

    def add_url(self, url):
        """Queue ``url`` unless it is already known (crawled or pending)."""
        if db_utils.find_one(self.col_name, {"url": url}) is None:
            db_utils.insert_one(self.col_name, {"url": url, "isCrawl": False})

    def has_crawl(self, url):
        """Return True when ``url`` has already been crawled."""
        return db_utils.find_one(self.col_name, {"url": url, "isCrawl": True}) is not None

    # NOTE: despite the name, the document is kept — it is only flagged
    # as crawled, which is what keeps the de-duplication working.
    def remove_url(self, url):
        """Mark ``url`` as crawled."""
        db_utils.update_one(self.col_name,
                            {"url": url, "isCrawl": False},
                            {"$set": {"isCrawl": True}})

    def get_next_url(self):
        """Return an arbitrary uncrawled URL document, or None when done."""
        return db_utils.find_one(self.col_name, {"isCrawl": False})
日志工具类,log_utils.py
import logging
class Logger:
    """Wrap a configured ``logging.Logger`` that writes to ``file_path``
    and to the console.

    Bug fixed: the original grabbed the ROOT logger via ``logging.getLogger()``
    and appended handlers unconditionally, so every ``Logger(...)``
    instantiation stacked another pair of handlers and each record was then
    emitted multiple times. We now use a logger named after the file path and
    only attach handlers the first time that logger is configured.
    """

    def __init__(self, file_path):
        # Step 1: a per-path named logger (NOT the root logger), so separate
        # log files get separate loggers and re-instantiation is idempotent.
        self.logger = logging.getLogger(file_path)
        self.logger.setLevel(logging.INFO)  # master level switch
        # Guard: do not stack duplicate handlers on repeated construction.
        if not self.logger.handlers:
            formatter = logging.Formatter(
                "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s")
            # Step 2: file handler, appending to file_path.
            fh = logging.FileHandler(file_path, mode='a')
            fh.setLevel(logging.INFO)
            fh.setFormatter(formatter)
            # Step 3: console handler.
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            ch.setFormatter(formatter)
            # Step 4: attach both handlers.
            self.logger.addHandler(fh)
            self.logger.addHandler(ch)
以下为爬取核心代码,关于使用BeautifulSoup获取下载地址连接,
可以参考官方开发文档
import libs.db_utils as db_utils
import conf.settings as settings
from libs.log_utils import Logger
from libs.url_manager import UrlManager
import requests
import re
from bs4 import BeautifulSoup
class Movies:
    """Spider for dytt8.net: crawls the new-movie list pages, extracts the
    ftp/magnet download links from each detail page and upserts one MongoDB
    document per movie."""

    def __init__(self):
        # Browser-like headers so the site serves normal pages.
        self.headers = {
            "Host": "www.dytt8.net",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/76.0.3809.132 Safari/537.36",
            "DNT": "1",
            "Referer": "http://dytt8.net/",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,ja;q=0.6"
        }
        self.domain = "https://www.dytt8.net"
        self.url_domain = "https://www.dytt8.net/html/gndy/dyzz/"
        self.url_manager = UrlManager()
        # Seed the crawl chain with the newest-movies index page.
        self.url_manager.add_url("https://www.dytt8.net/html/gndy/dyzz/index.html")
        # Download links are either plain ftp URLs or magnet URIs.
        self.download_url1_reg = re.compile(r'ftp://\S+')
        self.download_url2_reg = re.compile(r'magnet:\S+')
        self.logger = Logger("../log/movie_service.log").logger

    def spider_movies(self):
        """Crawl list pages until the URL manager runs out of uncrawled URLs."""
        while True:
            url_doc = self.url_manager.get_next_url()
            if url_doc is None:
                return
            self.single_spider(url_doc["url"])

    def single_spider(self, url):
        """Crawl one list page: queue its pagination links, then upsert every
        movie table found on the page."""
        response = requests.get(url=url, headers=self.headers)
        # The site serves GBK-encoded pages.
        response.encoding = "GBK"
        html_str = response.text
        # Name the parser explicitly: relying on the default both raises a
        # GuessedAtParserWarning and may pick a different parser per machine.
        soup = BeautifulSoup(html_str, "html.parser")
        tables = soup.select(".co_content8 table")
        list_a = soup.select(".x td a")
        for a in list_a:
            # Pagination links are relative to the list directory.
            a_url = self.url_domain + a["href"]
            self.url_manager.add_url(a_url)
        for table_tag in tables:
            move_doc = self.get_move_doc(table_tag)
            if move_doc is None:
                continue
            # Upsert keyed by movie name so re-crawls refresh existing entries.
            param = {"name": str(move_doc["name"])}
            select = db_utils.find_one(settings.db_collections["movies"], param)
            if select is None:
                db_utils.insert_one(settings.db_collections["movies"], move_doc)
            else:
                db_utils.update_many(settings.db_collections["movies"], param, {"$set": move_doc})
        # Flag this page as crawled only after it was fully processed.
        self.url_manager.remove_url(url)

    def get_move_doc(self, table_tag):
        """Fetch one movie's detail page and build its MongoDB document.

        Returns None when the detail page does not match the expected layout
        (missing #Zoom section, missing image, ...).
        """
        try:
            name = table_tag.select_one(".ulink").contents[0]
            des = table_tag.select("tr")[3].select_one("td").contents[0]
            detail_url = self.domain + table_tag.select_one(".ulink")["href"]
            response = requests.get(url=detail_url, headers=self.headers)
            response.encoding = "GBK"
            html_str = response.text
            soup = BeautifulSoup(html_str, "html.parser")
            zoom = soup.select_one("#Zoom")
            # ftp download link (present on most pages)
            download_url1 = ""
            table_a1_List = zoom.select("table a")
            if table_a1_List is not None:
                for table_a1 in table_a1_List:
                    if table_a1.get("href") is not None and self.download_url1_reg.match(
                            table_a1.get("href")) is not None:
                        download_url1 = table_a1.get("href")
            # magnet link — some pages do not have one
            download_url2 = ""
            a2_List = zoom.select("p a")
            if a2_List is not None:
                for a2 in a2_List:
                    if a2.get("href") is not None and self.download_url2_reg.match(a2.get("href")) is not None:
                        download_url2 = a2.get("href")
            icon = zoom.select_one("img").get("src")
            self.logger.info("电影名称:" + name)
            self.logger.info("ftp下载地址:" + download_url1)
            self.logger.info("磁力链接地址:" + download_url2)
            self.logger.info("电影详情" + detail_url)
            self.logger.info(icon)
            self.logger.info("========================================================================================")
            return {
                "name": name,
                "des": des,
                "icon": icon,
                "detailUrl": detail_url,
                "downloadUrl1": download_url1,
                "download_url2": download_url2
            }
        except Exception:
            # Was a bare ``except:`` (also swallowed KeyboardInterrupt/SystemExit).
            # Layout mismatches are expected for some pages: log and skip this
            # movie rather than abort the whole crawl.
            self.logger.exception("failed to parse movie detail page")
            return None
# Run the spider only when executed as a script, not when imported.
if __name__ == "__main__":
    movie = Movies()
    movie.spider_movies()
使用transmissionrpc来下载磁力链接地址,以下为python操作transmissionrpc代码
请先下载Transmission Qt Client并安装具体配置如下
#安装
pip install transmissionrpc
#开发代码
import transmissionrpc
#有帐号密码的使用:
tc = transmissionrpc.Client(address='127.0.0.1', port=9091, user='test', password='abcdefg123')
#无帐号密码的使用:
tc = transmissionrpc.Client(address='127.0.0.1', port=9091)
#添加下载任务(torrent文件或torrent url)
tc.add_torrent(torrent=r"/data/1.torrent")
tc.add_torrent(torrent="magnet:?xt=urn:btih:...")
#操作
#获取torrent_id
tc.get_torrents()
#删除下载任务,需要先获取torrent_id
tc.remove_torrent(torrent_id)
#获取任务对象:
tr1 = tc.get_torrent(1)
#torrent对象的控制:开始、暂停、状态、更新分别用start(),stop(),status(),update()
tr1.start()
tr1.stop()
tr1.status()
#注意:每次调用start()、stop()后都要再调用一次update(),否则不会生效。
tr1.update()
操作示例代码
class MovieDownload:
    """Thin wrapper around a transmissionrpc client used to queue downloads."""

    def __init__(self):
        # Connect to the local Transmission daemon's RPC interface.
        # (Removed a dead ``pass`` that followed this statement.)
        self.tc = transmissionrpc.Client(address='127.0.0.1', port=9091, user='test', password='123456')

    def download_movie(self, torrent_url):
        """Queue a torrent (file path, torrent URL or magnet link) for download."""
        self.tc.add_torrent(torrent=torrent_url)


# Demo usage; guarded so importing this module does not start a download.
if __name__ == "__main__":
    movie_download = MovieDownload()
    torrent = "magnet:?xt=..."
    movie_download.download_movie(torrent)