说明
python练手,爬取电影天堂的新电影,获取到磁力链接,输出到日志文件,使用transmissionrpc下载,
涉及知识点:
1、python 操作MongoDB,参考文档
2、BeautifulSoup解析html文档,参考官方开发文档
爬取电影磁力地址
配置文件setting.py
# Database connection configuration.
db_config = {
    "host": "localhost",   # MongoDB host
    "port": 27017,         # default MongoDB port
    "db_name": "spider"    # database used by this spider project
}
# Mapping of logical collection names to MongoDB collection names.
db_collections = {
    # one document per crawled movie
    "movies": "movies",
    # bookkeeping for the URL crawl queue (de-duplication + crawled flag)
    "urlManager": "urlManager"
}
使用了MongoDB存储数据,db_utils.py
import conf.settings as settings
import pymongo
# Return the project's MongoDB database handle.
def _get_db():
    """Return the project database.

    The MongoClient is created once and cached on the function itself:
    MongoClient maintains its own connection pool and is designed to be
    long-lived, so opening a brand-new client on every call (the previous
    behaviour) churned connections needlessly.
    """
    client = getattr(_get_db, "_client", None)
    if client is None:
        url = "mongodb://" + settings.db_config["host"] + ":" + str(settings.db_config["port"])
        client = _get_db._client = pymongo.MongoClient(url)
    return client[settings.db_config["db_name"]]
# Insert a single document.
def insert_one(col_name, entry):
    """Insert ``entry`` into the collection named ``col_name``."""
    _get_db()[col_name].insert_one(entry)
# Bulk insert.
def insert_list(col_name, entry_list):
    """Insert every document in ``entry_list`` into ``col_name``."""
    _get_db()[col_name].insert_many(entry_list)
# Look up a single document.
def find_one(col_name, param):
    """Return the first document in ``col_name`` matching ``param``, or None."""
    return _get_db()[col_name].find_one(param)
def find(col_name, param):
    """Return a cursor over all documents in ``col_name`` matching ``param``."""
    return _get_db()[col_name].find(param)
# Update a single document.
def update_one(col_name, param, update):
    """Apply ``update`` to the first document in ``col_name`` matching ``param``."""
    _get_db()[col_name].update_one(param, update)
# Update all matching documents.
def update_many(col_name, param, update):
    """Apply ``update`` to every document in ``col_name`` matching ``param``."""
    _get_db()[col_name].update_many(param, update)
使用数据库制作简单网址爬取链,去重并获取下一个执行网址,url_manager.py
import conf.settings as settings
import libs.db_utils as db_utils
class UrlManager:
    """Database-backed URL queue with de-duplication.

    Each URL lives in one document ``{"url": ..., "isCrawl": bool}``.
    A URL is inserted at most once; ``isCrawl`` flips to True after the
    page has been processed, so crawled pages are never revisited.
    """

    def __init__(self):
        self.col_name = settings.db_collections["urlManager"]

    def add_url(self, url):
        """Queue ``url`` unless it is already known (crawled or pending)."""
        if db_utils.find_one(self.col_name, {"url": url}) is None:
            db_utils.insert_one(self.col_name, {"url": url, "isCrawl": False})

    def has_crawl(self, url):
        """Return True when ``url`` has already been crawled."""
        return db_utils.find_one(self.col_name, {"url": url, "isCrawl": True}) is not None

    # NOTE: despite the name, the document is kept — it is only flagged
    # as crawled, which is what keeps the de-duplication working.
    def remove_url(self, url):
        """Mark ``url`` as crawled."""
        db_utils.update_one(self.col_name,
                            {"url": url, "isCrawl": False},
                            {"$set": {"isCrawl": True}})

    def get_next_url(self):
        """Return an arbitrary uncrawled URL document, or None when done."""
        return db_utils.find_one(self.col_name, {"isCrawl": False})
日志工具类,log_utils.py
import logging
class Logger:
    """Wrap a configured ``logging.Logger`` that writes to ``file_path``
    and to the console.

    Bug fixed: the original grabbed the ROOT logger via ``logging.getLogger()``
    and appended handlers unconditionally, so every ``Logger(...)``
    instantiation stacked another pair of handlers and each record was then
    emitted multiple times. We now use a logger named after the file path and
    only attach handlers the first time that logger is configured.
    """

    def __init__(self, file_path):
        # Step 1: a per-path named logger (NOT the root logger), so separate
        # log files get separate loggers and re-instantiation is idempotent.
        self.logger = logging.getLogger(file_path)
        self.logger.setLevel(logging.INFO)  # master level switch
        # Guard: do not stack duplicate handlers on repeated construction.
        if not self.logger.handlers:
            formatter = logging.Formatter(
                "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s")
            # Step 2: file handler, appending to file_path.
            fh = logging.FileHandler(file_path, mode='a')
            fh.setLevel(logging.INFO)
            fh.setFormatter(formatter)
            # Step 3: console handler.
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            ch.setFormatter(formatter)
            # Step 4: attach both handlers.
            self.logger.addHandler(fh)
            self.logger.addHandler(ch)
以下为爬取核心代码,关于使用BeautifulSoup获取下载地址连接,
可以参考官方开发文档
import libs.db_utils as db_utils
import conf.settings as settings
from libs.log_utils import Logger
from libs.url_manager import UrlManager
import requests
import re
from bs4 import BeautifulSoup
class Movies:
    """Spider for dytt8.net: crawls the new-movie list pages, extracts the
    ftp/magnet download links from each detail page and upserts one MongoDB
    document per movie."""

    def __init__(self):
        # Browser-like headers so the site serves normal pages.
        self.headers = {
            "Host": "www.dytt8.net",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/76.0.3809.132 Safari/537.36",
            "DNT": "1",
            "Referer": "http://dytt8.net/",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,ja;q=0.6"
        }
        self.domain = "https://www.dytt8.net"
        self.url_domain = "https://www.dytt8.net/html/gndy/dyzz/"
        self.url_manager = UrlManager()
        # Seed the crawl chain with the newest-movies index page.
        self.url_manager.add_url("https://www.dytt8.net/html/gndy/dyzz/index.html")
        # Download links are either plain ftp URLs or magnet URIs.
        self.download_url1_reg = re.compile(r'ftp://\S+')
        self.download_url2_reg = re.compile(r'magnet:\S+')
        self.logger = Logger("../log/movie_service.log").logger

    def spider_movies(self):
        """Crawl list pages until the URL manager runs out of uncrawled URLs."""
        while True:
            url_doc = self.url_manager.get_next_url()
            if url_doc is None:
                return
            self.single_spider(url_doc["url"])

    def single_spider(self, url):
        """Crawl one list page: queue its pagination links, then upsert every
        movie table found on the page."""
        response = requests.get(url=url, headers=self.headers)
        # The site serves GBK-encoded pages.
        response.encoding = "GBK"
        html_str = response.text
        # Name the parser explicitly: relying on the default both raises a
        # GuessedAtParserWarning and may pick a different parser per machine.
        soup = BeautifulSoup(html_str, "html.parser")
        tables = soup.select(".co_content8 table")
        list_a = soup.select(".x td a")
        for a in list_a:
            # Pagination links are relative to the list directory.
            a_url = self.url_domain + a["href"]
            self.url_manager.add_url(a_url)
        for table_tag in tables:
            move_doc = self.get_move_doc(table_tag)
            if move_doc is None:
                continue
            # Upsert keyed by movie name so re-crawls refresh existing entries.
            param = {"name": str(move_doc["name"])}
            select = db_utils.find_one(settings.db_collections["movies"], param)
            if select is None:
                db_utils.insert_one(settings.db_collections["movies"], move_doc)
            else:
                db_utils.update_many(settings.db_collections["movies"], param, {"$set": move_doc})
        # Flag this page as crawled only after it was fully processed.
        self.url_manager.remove_url(url)

    def get_move_doc(self, table_tag):
        """Fetch one movie's detail page and build its MongoDB document.

        Returns None when the detail page does not match the expected layout
        (missing #Zoom section, missing image, ...).
        """
        try:
            name = table_tag.select_one(".ulink").contents[0]
            des = table_tag.select("tr")[3].select_one("td").contents[0]
            detail_url = self.domain + table_tag.select_one(".ulink")["href"]
            response = requests.get(url=detail_url, headers=self.headers)
            response.encoding = "GBK"
            html_str = response.text
            soup = BeautifulSoup(html_str, "html.parser")
            zoom = soup.select_one("#Zoom")
            # ftp download link (present on most pages)
            download_url1 = ""
            table_a1_List = zoom.select("table a")
            if table_a1_List is not None:
                for table_a1 in table_a1_List:
                    if table_a1.get("href") is not None and self.download_url1_reg.match(
                            table_a1.get("href")) is not None:
                        download_url1 = table_a1.get("href")
            # magnet link — some pages do not have one
            download_url2 = ""
            a2_List = zoom.select("p a")
            if a2_List is not None:
                for a2 in a2_List:
                    if a2.get("href") is not None and self.download_url2_reg.match(a2.get("href")) is not None:
                        download_url2 = a2.get("href")
            icon = zoom.select_one("img").get("src")
            self.logger.info("电影名称:" + name)
            self.logger.info("ftp下载地址:" + download_url1)
            self.logger.info("磁力链接地址:" + download_url2)
            self.logger.info("电影详情" + detail_url)
            self.logger.info(icon)
            self.logger.info("========================================================================================")
            return {
                "name": name,
                "des": des,
                "icon": icon,
                "detailUrl": detail_url,
                "downloadUrl1": download_url1,
                "download_url2": download_url2
            }
        except Exception:
            # Was a bare ``except:`` (also swallowed KeyboardInterrupt/SystemExit).
            # Layout mismatches are expected for some pages: log and skip this
            # movie rather than abort the whole crawl.
            self.logger.exception("failed to parse movie detail page")
            return None
# Run the spider only when executed as a script, not when imported.
if __name__ == "__main__":
    movie = Movies()
    movie.spider_movies()
使用transmissionrpc来下载磁力链接地址,以下为python操作transmissionrpc代码
请先下载Transmission Qt Client并安装具体配置如下
#安装
pip install transmissionrpc
#开发代码
import transmissionrpc
#有帐号密码的使用:
tc = transmissionrpc.Client(address='127.0.0.1', port=9091, user='test', password='abcdefg123')
#无帐号密码的使用:
tc = transmissionrpc.Client(address='127.0.0.1', port=9091)
#添加下载任务(torrent文件或torrent url)
tc.add_torrent(torrent=r"/data/1.torrent")
tc.add_torrent(torrent="magnet:?xt=urn:btih:...")
#操作
#获取torrent_id
tc.get_torrents()
#删除下载任务,需要先获取torrent_id
tc.remove_torrent(torrent_id)
#获取任务对象:
tr1 = tc.get_torrent(1)
#torrent对象的控制:开始、暂停、状态、更新分别用start(),stop(),status(),update()
tr1.start()
tr1.stop()
tr1.status()
#注意:每次调用start()、stop()后都要再调用一次update(),否则不会生效。
tr1.update()
操作示例代码
class MovieDownload:
    """Thin wrapper around a transmissionrpc client used to queue downloads."""

    def __init__(self):
        # Connect to the local Transmission daemon's RPC interface.
        # (Removed a dead ``pass`` that followed this statement.)
        self.tc = transmissionrpc.Client(address='127.0.0.1', port=9091, user='test', password='123456')

    def download_movie(self, torrent_url):
        """Queue a torrent (file path, torrent URL or magnet link) for download."""
        self.tc.add_torrent(torrent=torrent_url)


# Demo usage; guarded so importing this module does not start a download.
if __name__ == "__main__":
    movie_download = MovieDownload()
    torrent = "magnet:?xt=..."
    movie_download.download_movie(torrent)