A Download-Link Search Engine for Movies, TV Series, Variety Shows, Anime, and Games
Preface
This article shows how to crawl download links for movies, TV series, variety shows, anime, and games with a Python web crawler. The user enters a search keyword and picks a category, and the program returns the matching download links, which can then be handed to Thunder (迅雷) or any other download tool to fetch the video itself.
I. Design Approach
- First, use the requests module to send a request to the Movie Heaven site (电影天堂, ygdy8.com) and obtain the corresponding HTML.
- Then parse that HTML with re and BeautifulSoup to extract the title of each result and its download links; a minimal sketch of both steps follows.
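To make the two steps concrete, here is a minimal sketch of one fetch-and-parse cycle. The endpoint and the width="55%" attribute are the same ones the full program below relies on; the keyword and the timeout are just illustrative choices.

import requests
from bs4 import BeautifulSoup

# Step 1: request the search page (the same endpoint the full program uses).
resp = requests.get(
    "http://s.ygdy8.com/plus/s0.php",
    params={"typeid": "1", "keyword": "Avatar".encode("gb2312")},
    timeout=3,
)
resp.encoding = "gb2312"  # the site declares charset=gb2312

# Step 2: parse the HTML; each search result sits in a <td width="55%"> cell.
soup = BeautifulSoup(resp.text, "lxml")
for td in soup.find_all("td", attrs={"width": "55%"}):
    print(td.a.get_text(), "http://www.ygdy8.com" + td.a["href"])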
二、源码展示
1. Auto-detect and install the required libraries
The code is as follows (example):
import os

try:
    import requests
    from bs4 import BeautifulSoup
    import lxml  # imported only to confirm the lxml parser is available
except ImportError:
    print("Installing the required libraries automatically, please wait...")
    os.popen("pip install requests -i https://pypi.doubanio.com/simple/").read()
    os.popen("pip install bs4 -i https://pypi.doubanio.com/simple/").read()
    os.popen("pip install lxml -i https://pypi.doubanio.com/simple/").read()
    print("Libraries installed!\n\n")
    # Import again now that the packages exist.
    import requests
    from bs4 import BeautifulSoup
    import lxml

# Standard-library modules used by the functions below.
import re
import sys
import time
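As an aside, os.popen works here but swallows failures silently. A more robust pattern (a sketch only, not part of the original tool; the helper name is my own) is to invoke pip through the running interpreter with subprocess, so a failed install raises instead of passing unnoticed:

import subprocess
import sys

def pip_install(package: str) -> None:
    # Run pip with the same interpreter executing this script;
    # raises CalledProcessError if the install fails.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

for pkg in ("requests", "bs4", "lxml"):
    pip_install(pkg)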
2. Writing the function modules
The code is as follows (example):
1) Crawl the main search page, returning the result names and sub-page URLs
def crawl_main_interface(typeid, keyword, pagesize=10000):
    """ Crawl the main search page; return the result names and sub-page URLs """
    # Expiry check (deobfuscated from the original): the tool is meant to
    # stop working after May 2021.
    now = time.localtime()
    if (now.tm_year, now.tm_mon) > (2021, 5):
        print('\n\tThe tool has expired. Please contact <shusheng.yuan@foxmail.com>.\n')
        input('Press Enter to exit!')
        sys.exit()
    url = "http://s.ygdy8.com/plus/s0.php"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    # The site expects the query string in gb2312, so encode the keyword.
    params = {"typeid": str(typeid), "keyword": str(keyword).encode('gb2312'), "pagesize": str(pagesize)}
    while True:
        # Work around timeouts by retrying until the request succeeds.
        try:
            response = requests.get(url=url, headers=headers, params=params, timeout=3)
        except requests.RequestException:
            continue
        else:
            break
    print("\tSearch response code:", response.status_code)
    response.encoding = 'gb2312'  # Cause: <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
    text_data = response.text
    response.close()  # release the connection
    soup = BeautifulSoup(text_data, "lxml")
    # Each search result sits in a <td width="55%"> cell.
    list_info = soup.find_all(name="td", attrs={"width": "55%"})
    list_movie_name = []
    list_sub_url = []
    for info in list_info:
        list_sub_url.append(str("http://www.ygdy8.com" + info.a["href"]))
        list_movie_name.append(info.a.get_text())
    print(f"\tNumber of results found: {len(list_movie_name)}")
    return list_movie_name, list_sub_url
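A quick sanity check of the function (assuming the site is reachable; the keyword is an arbitrary example):

names, urls = crawl_main_interface(typeid=1, keyword="流浪地球")
for name, url in zip(names[:3], urls[:3]):
    print(name, "->", url)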
2) Crawl each sub-page URL, returning the download links found there
def crawl_sub_interface(sub_url):
    """ Crawl a detail page; return its download links """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    while True:
        # Work around timeouts by retrying until the request succeeds.
        try:
            response = requests.get(url=sub_url, headers=headers, timeout=3)
        except requests.RequestException:
            continue
        else:
            break
    response.encoding = "gb2312"
    sub_text_data = response.text
    response.close()  # release the connection
    # Extract links with regular expressions: magnet links appear inside an
    # href attribute, while ftp links appear as the text of an <a> tag.
    list_download = []
    pattern1 = re.compile(r'href="(?P<link1>magnet:.*?)">', re.S)
    pattern2 = re.compile(r'>(?P<link2>ftp://.*?)<', re.S)
    for link1 in re.findall(pattern1, str(sub_text_data)):
        list_download.append(str(link1))
    for link2 in re.findall(pattern2, str(sub_text_data)):
        list_download.append(str(link2))
    return list_download
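To see what the two patterns actually capture, you can run them against a hand-written snippet of HTML (both the snippet and the magnet hash are made up for illustration):

sample = ('<a href="magnet:?xt=urn:btih:0000000000000000000000000000000000000000">magnet</a>'
          '<a>ftp://example.com/movie.mkv</a>')
print(re.findall(r'href="(?P<link1>magnet:.*?)">', sample))  # the magnet URI
print(re.findall(r'>(?P<link2>ftp://.*?)<', sample))         # ['ftp://example.com/movie.mkv']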
3) Daily recommendation: return one movie and one TV series
def daily_recommendations():
    """ Daily recommendation: return one movie and one TV series (name, synopsis, and download links for each) """
    url_movie = "https://www.ygdy8.com/html/gndy/dyzz/index.html"
    url_tv = "http://www.ygdy8.com/html/tv/hytv/index.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    while True:
        # Work around timeouts by retrying until both requests succeed.
        try:
            response_movie = requests.get(url=url_movie, headers=headers, timeout=3)
            response_tv = requests.get(url=url_tv, headers=headers, timeout=3)
        except requests.RequestException:
            continue
        else:
            break
    response_movie.encoding = "gb2312"  # Cause: <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
    response_tv.encoding = "gb2312"
    movie_text = response_movie.text  # movie listing HTML
    tv_text = response_tv.text        # TV listing HTML
    response_movie.close()  # release the connections
    response_tv.close()
    # Parse the movie listing: the first .tbspan table on the index page
    # is the newest release, which serves as the daily pick.
    soup_movie = BeautifulSoup(movie_text, "lxml")
    movie_info = soup_movie.find("table", {"width": "100%", "border": "0", "cellspacing": "0",
                                           "cellpadding": "0", "class": "tbspan", "style": "margin-top:6px"})
    movie_name = movie_info.b.a.get_text()
    movie_description = movie_info.find("td", {"colspan": "2", "style": "padding-left:3px"}).get_text()
    # The detail page holds the links; crawl_sub_interface already does
    # exactly this fetch-and-extract work, so reuse it instead of repeating it.
    list_download = crawl_sub_interface("https://www.ygdy8.com" + movie_info.b.a['href'])
    # Parse the TV listing the same way.
    soup_tv = BeautifulSoup(tv_text, "lxml")
    tv_info = soup_tv.find("table", {"width": "100%", "border": "0", "cellspacing": "0",
                                     "cellpadding": "0", "class": "tbspan", "style": "margin-top:6px"})
    tv_name = tv_info.b.a.get_text()
    tv_description = tv_info.find("td", {"colspan": "2", "style": "padding-left:3px"}).get_text()
    list_tv_download = crawl_sub_interface("https://www.ygdy8.com" + tv_info.b.a['href'])
    return movie_name, movie_description, list_download, tv_name, tv_description, list_tv_download
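A note on why this works: BeautifulSoup's find returns the first match in document order, and these index pages list the newest release first, so the first .tbspan table is always the daily pick. A toy demonstration (synthetic HTML with made-up titles; <div> is used here so the snippet survives the parser's table clean-up):

html = ('<div class="tbspan"><b><a href="/html/a.html">Newest Movie</a></b></div>'
        '<div class="tbspan"><b><a href="/html/b.html">Older Movie</a></b></div>')
soup = BeautifulSoup(html, "lxml")
first = soup.find("div", {"class": "tbspan"})
print(first.b.a.get_text())  # -> Newest Movie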
4) The main routine, which ties the function modules together
if __name__ == '__main__':
    Author()  # defined elsewhere in the full script
    movie_name, movie_description, list_download, tv_name, tv_description, list_tv_download = daily_recommendations()
    print("-----------------------------------------------------------")
    print("- Today's recommended movie:")
    print(f"- Title: {movie_name}")
    print(f"- Synopsis: {movie_description}")
    for link in list_download:
        print(f"- Download link: {link}")
    print("-----------------------------------------------------------")
    print("- Today's recommended TV series:")
    print(f"- Title: {tv_name}")
    print(f"- Synopsis: {tv_description}")
    for link in list_tv_download:
        print(f"- Download link: {link}")
    print("-----------------------------------------------------------")
    print('\n')
    while True:
        keyword = str(input("Enter a search keyword: ")).strip()
        while True:
            print("*****************")
            print("* 1: Movie      *")
            print("* 2: TV series  *")
            print("* 3: Variety    *")
            print("* 4: Anime      *")
            print("* 5: Game       *")
            print("*****************")
            try:
                choice = int(input("Choose a category (1/2/3/4/5): "))
                # Map the menu numbers to the site's internal type ids
                # (variety = 99, anime = 16, game = 19).
                typeid = {1: 1, 2: 2, 3: 99, 4: 16, 5: 19}[choice]
            except (ValueError, KeyError):
                print("Invalid input, please try again! (1/2/3/4/5)")
                continue
            else:
                break
        print()
        print("\t************************")
        list_movie_name, list_sub_url = crawl_main_interface(typeid=typeid, keyword=keyword)
        print("\t************************")
        if len(list_movie_name) > 0 and len(list_sub_url) > 0:
            for i, (movie_name, sub_url) in enumerate(zip(list_movie_name, list_sub_url), start=1):
                print("\t----------------------------------------------------------->")
                print(f"\tResult {i}")
                print(f"\tName: {movie_name}")
                for download in crawl_sub_interface(sub_url=sub_url):
                    print("\tDownload link:", download)
            print("\t----------------------------------------------------------->")
        else:
            print("\n\tSearched all over the web and found nothing~")
        select = str(input("\n\t\tKeep searching? (y/n): "))
        if select.lower() in ("n", "no"):
            break
        print()
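A closing design note: the while True / try / except retry loops above never give up, so an unreachable server means the program spins forever. If you would rather have the crawler fail fast, a bounded retry helper is easy to substitute; this is only a sketch (the helper name and the retry count are my own, not part of the original tool):

def get_with_retry(url, max_retries=5, **kwargs):
    """ Retry a GET a bounded number of times instead of forever """
    for attempt in range(max_retries):
        try:
            return requests.get(url, timeout=3, **kwargs)
        except requests.RequestException:
            continue  # try again until the attempts run out
    raise RuntimeError(f"{url} unreachable after {max_retries} attempts")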