爬取某电影网站(未写完)

最新推荐文章于 2021-03-11 12:25:23 发布

angyangguai2766

最新推荐文章于 2021-03-11 12:25:23 发布

阅读量5.6k

点赞数

文章标签： php python

原文链接：http://www.cnblogs.com/Ly-233/p/11205661.html

版权

 
import requests
import bs4
import lxml
import re
import time
from bs4 import BeautifulSoup
# Base URL of the site being scraped
url = 'https://www.88ys.cc'
# Title of the film or TV series to search for
film = '家有女友'
# Proxy address as host:port (original author's note: expired, needs renewal)
proxy = '120.24.245.33:16818'
# Per-scheme proxy URLs, shaped for the ``proxies`` argument of requests
proxies = {scheme: scheme + '://' + proxy for scheme in ('http', 'https')}
# Ask for gzip-compressed responses (original author's note: gzip is faster)
headers = {"Accept-Encoding": "gzip"}
# ---- Search results ----
def search(timeout=10):
    """Search the site for ``film`` and return the links found in the result page.

    POSTs the module-level ``film`` title to ``url + '/index.php?m=vod-search'``
    and collects the ``href`` value of every ``<a class="link-hover" href="...">``
    occurrence in the returned HTML.

    Args:
        timeout: Seconds to wait for the server before requests gives up and
            raises a timeout error. Without a timeout a stalled server would
            block the script indefinitely.

    Returns:
        A list of href strings in document order; empty when nothing matches.
        Callers in this file prepend ``url`` to each entry, so they are
        presumably site-relative paths.
    """
    url_search = url + '/index.php?m=vod-search'
    # Form fields submitted by the search POST
    data = {
        'wd': film,
        'submit': '',
    }
    # Direct connection. To route through the proxy instead, additionally pass
    # proxies=proxies and headers=headers to this call.
    r_s = requests.post(url_search, data=data, timeout=timeout)
    # Force UTF-8 decoding of the response body
    r_s.encoding = 'utf-8'
    return re.findall(r'<a class="link-hover" href="(.*?)"', r_s.text)
# ---- Episode list ----
def List(pat_search, timeout=10):
    """Return every link found in the first video source of a result page.

    Fetches ``url + pat_search``, locates the first ``<div id="stab81">``
    (the original author's comments call it the first video source) and
    extracts every ``href="..."`` value from that div's markup.

    Args:
        pat_search: Path of a search hit; it is appended to ``url``.
        timeout: Seconds to wait for the server before requests gives up and
            raises a timeout error, so a stalled server cannot hang the script.

    Returns:
        A list of href strings in document order. An empty list is returned
        when the page has no ``div#stab81`` (previously this raised
        ``IndexError``).
    """
    # To route through the proxy, additionally pass proxies=proxies and
    # headers=headers to this call.
    r_list = requests.get(url + pat_search, timeout=timeout)
    # Force UTF-8 decoding of the response body
    r_list.encoding = 'utf-8'
    soup = BeautifulSoup(r_list.text, 'lxml')
    source_div = soup.find(name='div', attrs={'id': 'stab81'})
    if source_div is None:
        # Page layout differs or the title has no such source: nothing to list
        return []
    return re.findall(r'href="(.*?)"', str(source_div))
# ---- Text details of a search hit ----
def _first_match(pattern, text, default=''):
    """Return the first capture group of ``pattern`` in ``text``, or ``default``."""
    found = re.search(pattern, text)
    return found.group(1) if found else default


def search_news(pat_search, timeout=10):
    """Fetch a result page and return its title, type and language.

    Args:
        pat_search: Path of a search hit; it is appended to ``url``.
        timeout: Seconds to wait for the server before requests gives up and
            raises a timeout error, so a stalled server cannot hang the script.

    Returns:
        A three-element list ``[title, type, language]``. A field that cannot
        be found in the page is returned as ``''`` so the positions stay
        stable (previously a missing field raised ``IndexError``).
    """
    r = requests.get(url + pat_search, timeout=timeout)
    # Force UTF-8 decoding of the response body
    r.encoding = 'utf-8'
    # Markup of every <div class="ct-c">, which holds the film details
    bs = str(BeautifulSoup(r.text, 'lxml').find_all('div', class_='ct-c'))
    # Title: the text between the <h1> opening tag and the next '<'
    h1_bs = BeautifulSoup(bs, 'lxml')
    return [
        _first_match(r'>(.*?)<', str(h1_bs.h1)),
        # Type: the text following the "类型：" label
        _first_match(r'类型：</span>(.*?)</dd>', bs),
        # Language: the text following the "语言：" label
        _first_match(r'语言：</span>(.*?)</dd>', bs),
    ]
# ---- Main routine (loops over the search hits and their episode links) ----
def _for_():
    """Print the details and the episode links of every search hit.

    Runs ``search()`` once, then for each returned path calls
    ``search_news`` and ``List`` and prints both results, pausing three
    seconds after each hit.
    """
    for result_path in search():
        details = search_news(result_path)
        episode_links = List(result_path)
        print(details)
        print(episode_links)
        # Throttle: the original author notes the site drops the connection
        # when requests arrive too quickly
        time.sleep(3)
# Run the main routine; this executes whenever the module is run or imported
_for_()
# ---- Unfinished: the browser inspector and the fetched source do not match ----
def a(timeout=10):
    """Fetch one hard-coded player page and print its prettified HTML.

    Debugging aid left by the original author for comparing the fetched
    markup with what the browser inspector shows.

    Args:
        timeout: Seconds to wait for the server before requests gives up and
            raises a timeout error, so a stalled server cannot hang the script.
    """
    _url_ = url + '/vod-play-id-56106-src-1-num-1.html'
    r = requests.get(_url_, timeout=timeout)
    # Force UTF-8 decoding of the response body
    r.encoding = 'utf-8'
    print(BeautifulSoup(r.text, 'lxml').prettify())
  
尚未解决的问题：
- 浏览器查看器(开发者工具)中显示的页面内容与爬取到的网页源码不一致
- 获取视频链接
- 电影下载操作

转载于:https://www.cnblogs.com/Ly-233/p/11205661.html

angyangguai2766

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬取某电影网站(未写完)

1 import requests 2 import bs4 3 import lxml 4 import re 5 import time 6 from bs4 import BeautifulSoup 7 #网站 8 url = 'https://www.88ys.cc' 9 #电影或电视剧的名字 10 film = '...
复制链接

扫一扫