使用python爬取电影下载地址并使用transmissionrpc下载

说明

python练手,爬取电影天堂的新电影,获取到磁力链接,输出到日志文件,使用transmissionrpc下载,
涉及知识点:
1、python 操作MongoDB,参考文档
2、BeautifulSoup解析html文档,参考官方开发文档

爬取电影磁力地址

配置文件settings.py

# MongoDB connection settings.
db_config = {
    "host": "localhost",
    "port": 27017,
    "db_name": "spider",
}

# Logical name -> MongoDB collection name.
db_collections = {
    "movies": "movies",          # crawled movie documents
    "urlManager": "urlManager",  # URL crawl-state queue
}

使用了MongoDB存储数据,db_utils.py

import conf.settings as settings
import pymongo


# Cached client: pymongo.MongoClient manages its own connection pool, so
# constructing a fresh client on every call (as the original did) leaks
# connections and defeats pooling — create it once and reuse it.
_client = None


# Return the project's database handle, connecting lazily on first use.
def _get_db():
    global _client
    if _client is None:
        url = "mongodb://%s:%s" % (settings.db_config["host"], settings.db_config["port"])
        _client = pymongo.MongoClient(url)
    return _client[settings.db_config["db_name"]]


# Insert a single document into the named collection.
def insert_one(col_name, entry):
    _get_db()[col_name].insert_one(entry)


# Bulk-insert a list of documents into the named collection.
def insert_list(col_name, entry_list):
    _get_db()[col_name].insert_many(entry_list)


# Return the first document matching `param`, or None if nothing matches.
def find_one(col_name, param):
    return _get_db()[col_name].find_one(param)


# Return a cursor over every document matching `param`.
def find(col_name, param):
    return _get_db()[col_name].find(param)


# Apply `update` to the first document matching `param`.
def update_one(col_name, param, update):
    _get_db()[col_name].update_one(param, update)


# Apply `update` to every document matching `param`.
def update_many(col_name, param, update):
    _get_db()[col_name].update_many(param, update)

使用数据库制作简单网址爬取链,去重并获取下一个执行网址,url_manager.py

import conf.settings as settings
import libs.db_utils as db_utils


class UrlManager:
    """Simple MongoDB-backed crawl queue that de-duplicates URLs.

    Each URL is stored once with an ``isCrawl`` flag; "removing" a URL
    just flips the flag so it is never handed out again.
    """

    def __init__(self):
        self.col_name = settings.db_collections["urlManager"]

    def add_url(self, url):
        # Insert only if this URL has never been seen before.
        existing = db_utils.find_one(self.col_name, {"url": url})
        if existing is None:
            db_utils.insert_one(self.col_name, {"url": url, "isCrawl": False})

    def has_crawl(self, url):
        # True when the URL exists and is already marked crawled.
        crawled = db_utils.find_one(self.col_name, {"url": url, "isCrawl": True})
        return crawled is not None

    # Mark a pending URL as crawled (the document stays in the collection).
    def remove_url(self, url):
        db_utils.update_one(
            self.col_name,
            {"url": url, "isCrawl": False},
            {"$set": {"isCrawl": True}},
        )

    # Return an arbitrary not-yet-crawled URL document, or None when drained.
    def get_next_url(self):
        return db_utils.find_one(self.col_name, {"isCrawl": False})

日志工具类,log_utils.py

import logging


class Logger:
    """Configure a logger that writes INFO+ to a file and to the console.

    Bug fix: the original grabbed the *root* logger (``logging.getLogger()``)
    and attached two new handlers on every instantiation, so each
    ``Logger(...)`` duplicated every subsequent log line. We now use a
    logger named after the file path and attach handlers only once.
    """

    _FORMAT = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s"

    def __init__(self, file_path):
        # Named logger: repeated instantiation for the same path yields
        # the same logger object instead of re-configuring the root.
        self.logger = logging.getLogger(file_path)
        self.logger.setLevel(logging.INFO)

        # Guard: only attach handlers the first time this logger is built,
        # otherwise each message would be emitted once per instantiation.
        if not self.logger.handlers:
            formatter = logging.Formatter(self._FORMAT)

            # Handler writing to the log file (append mode).
            file_handler = logging.FileHandler(file_path, mode='a')
            file_handler.setLevel(logging.INFO)
            file_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)

            # Handler echoing to the console.
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)
            console_handler.setFormatter(formatter)
            self.logger.addHandler(console_handler)

以下为爬取核心代码,关于使用BeautifulSoup获取下载地址连接,
可以参考官方开发文档

import libs.db_utils as db_utils
import conf.settings as settings
from libs.log_utils import Logger
from libs.url_manager import UrlManager
import requests
import re
from bs4 import BeautifulSoup


class Movies:
    """Crawler for the dytt8.net "latest movies" listing.

    Walks listing pages via UrlManager (a MongoDB-backed de-dup queue),
    extracts each movie's name, description, poster and its ftp + magnet
    download links from the detail page, and upserts one document per
    movie into the "movies" collection.
    """

    # Seconds before an HTTP request is abandoned. The original had no
    # timeout, so a single stalled connection could hang the crawl forever.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        # Browser-like request headers, copied from a real Chrome session.
        self.headers = {
            "Host": "www.dytt8.net",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/76.0.3809.132 Safari/537.36",
            "DNT": "1",
            "Referer": "http://dytt8.net/",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,ja;q=0.6"
        }
        self.domain = "https://www.dytt8.net"
        self.url_domain = "https://www.dytt8.net/html/gndy/dyzz/"
        self.url_manager = UrlManager()
        # Seed the crawl queue with the first listing page.
        self.url_manager.add_url("https://www.dytt8.net/html/gndy/dyzz/index.html")
        # ftp links live in the detail table; magnet URIs in <p> anchors.
        self.download_url1_reg = re.compile(r'ftp://\S+')
        self.download_url2_reg = re.compile(r'magnet:\S+')
        self.logger = Logger("../log/movie_service.log").logger

    def spider_movies(self):
        """Drain the URL queue: crawl until no un-crawled URL remains."""
        while True:
            url_doc = self.url_manager.get_next_url()
            if url_doc is None:
                return
            self.single_spider(url_doc["url"])

    def _fetch_soup(self, url):
        """Fetch `url` and parse it into a BeautifulSoup document.

        Centralizes HTTP details: explicit timeout and an explicit parser
        (the original omitted the parser argument, which emits a warning
        and may pick different parsers on different machines).
        """
        response = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.encoding = "GBK"  # the site serves GBK-encoded pages
        return BeautifulSoup(response.text, "html.parser")

    def single_spider(self, url):
        """Crawl one listing page: queue every link found on it, then
        upsert a movie document for each movie table on the page."""
        soup = self._fetch_soup(url)
        tables = soup.select(".co_content8 table")
        # Pagination/detail links live under ".x td a"; queue them.
        for anchor in soup.select(".x td a"):
            self.url_manager.add_url(self.url_domain + anchor["href"])
        movies_col = settings.db_collections["movies"]
        for table_tag in tables:
            move_doc = self.get_move_doc(table_tag)
            if move_doc is None:
                continue
            # Upsert keyed on movie name: insert if new, overwrite if seen.
            param = {"name": str(move_doc["name"])}
            if db_utils.find_one(movies_col, param) is None:
                db_utils.insert_one(movies_col, move_doc)
            else:
                db_utils.update_many(movies_col, param, {"$set": move_doc})
        # Mark this listing page as crawled so it is not fetched again.
        self.url_manager.remove_url(url)

    def get_move_doc(self, table_tag):
        """Build a movie document from one listing-table entry.

        Fetches the detail page to extract the ftp and magnet download
        links plus the poster image. Returns None when the entry cannot
        be parsed (e.g. a non-movie table on the listing page).
        """
        try:
            link = table_tag.select_one(".ulink")
            name = link.contents[0]
            des = table_tag.select("tr")[3].select_one("td").contents[0]
            detail_url = self.domain + link["href"]
            zoom = self._fetch_soup(detail_url).select_one("#Zoom")
            # ftp download address: last matching anchor wins, as before.
            download_url1 = ""
            for a_tag in zoom.select("table a"):
                href = a_tag.get("href")
                if href is not None and self.download_url1_reg.match(href):
                    download_url1 = href
            # Magnet link is optional — some detail pages do not have one.
            download_url2 = ""
            for a_tag in zoom.select("p a"):
                href = a_tag.get("href")
                if href is not None and self.download_url2_reg.match(href):
                    download_url2 = href
            icon = zoom.select_one("img").get("src")
            self.logger.info("电影名称:" + name)
            self.logger.info("ftp下载地址:" + download_url1)
            self.logger.info("磁力链接地址:" + download_url2)
            self.logger.info("电影详情" + detail_url)
            self.logger.info(icon)
            self.logger.info("========================================================================================")
            # NOTE(review): key naming is inconsistent ("downloadUrl1" vs
            # "download_url2") but kept as-is — stored documents and any
            # consumers may rely on these exact keys.
            return {
                "name": name,
                "des": des,
                "icon": icon,
                "detailUrl": detail_url,
                "downloadUrl1": download_url1,
                "download_url2": download_url2
            }
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit) and now logs the traceback
            # instead of failing silently.
            self.logger.exception("failed to parse movie entry")
            return None


# Start the crawl only when executed as a script, not when imported —
# the original ran unconditionally at import time.
if __name__ == "__main__":
    movie = Movies()
    movie.spider_movies()

使用transmissionrpc来下载磁力链接地址,以下为python操作transmissionrpc的代码
请先下载Transmission Qt Client并安装具体配置如下
使用python

#安装
pip install transmissionrpc

#开发代码
import transmissionrpc
# Client with username/password authentication:
tc = transmissionrpc.Client(address='127.0.0.1', port=9091, user='test', password='abcdefg123')
# Client without authentication:
tc = transmissionrpc.Client(address='127.0.0.1', port=9091)
# Add a download task (torrent file path, torrent URL, or magnet URI):
tc.add_torrent(torrent=r"/data/1.torrent")
tc.add_torrent(torrent="magnet:?xt=urn:btih:...")	
# Operations
# List torrents (used to obtain a torrent_id):
tc.get_torrents()
# Remove a download task; requires a torrent_id obtained above.
# NOTE(review): `{torrent_id}` is a Python set literal — this likely should
# be `tc.remove_torrent(torrent_id)`; confirm against the transmissionrpc docs.
tc.remove_torrent({torrent_id})
# Get a single torrent object:
tr1 = tc.get_torrent(1)
# Control the torrent object: start(), stop(), status(), update()
tr1.start()
tr1.stop()
tr1.status()
# Note: call update() after each start()/stop(),
# otherwise the change does not take effect.
tr1.update()

操作示例代码

class MovieDownload:
    """Submits torrent/magnet links to a local Transmission daemon."""

    def __init__(self):
        # Connect to the Transmission RPC endpoint on this machine.
        self.tc = transmissionrpc.Client(address='127.0.0.1', port=9091, user='test', password='123456')

    def download_movie(self, torrent_url):
        # Queue the torrent (file path, torrent URL, or magnet URI).
        self.tc.add_torrent(torrent=torrent_url)


# Example usage: queue one magnet link ("magnet:?xt=..." is a placeholder URI).
movie_download = MovieDownload()
torrent = "magnet:?xt=..."
movie_download.download_movie(torrent)
  • 0
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值