#!/usr/bin/env python
# coding=utf-8
'''获取页面的所有电影名字及其对应的href,加下载链接
用正则表达式取数据
'''
import requests
import re
import time
import json
from bs4 import BeautifulSoup
class downMovie(object):
    """Scraper for the movie listing on http://tv.efoxconn.com.

    Workflow: ``get_html`` collects movie names and detail-page URLs from the
    listing page, ``get_content`` extracts the metadata and the video file
    address from one detail page, and ``write_m`` appends a record to a text
    file.
    """

    # Seconds to wait on a request before giving up; without a timeout
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    # id of the <div> on a detail page whose first <script> holds the video
    # file address.
    PLAYER_DIV_ID = 'ef03a1d2e52780a945f4e24a703fd4ad'

    # Captures the value assigned to ``vMp4url`` inside the player script.
    _VIDEO_URL_RE = re.compile(
        r'var vid.*?vMp4url = "(.*?)"; //用户自己的视频文件地址', re.S)

    # Captures seven (label, value) pairs from the information panel of a
    # detail page; compiled once here instead of on every call.
    _INFO_RE = re.compile(
        r'<div.*?width:305px; height:389px; margin: 0 auto.*?style="">'
        r'(.*?)<span>(.*?)</span>.*?<p>'  # rating
        r'(.*?)<span>(.*?)</span>.*?<p>'  # duration
        r'(.*?)<span>(.*?)</span>.*?<p>'  # producer
        r'(.*?)<span>(.*?)</span>.*?<p>'  # editor
        r'(.*?)<span>(.*?)</span>.*?<p>'  # screenplay
        r'(.*?)<span>(.*?)</span>.*?class="div180">'  # actor
        r'(.*?)<span.*?div180 word0_223">(.*?)</span>',  # introduction
        re.S)

    def __init__(self):
        self.server = 'http://tv.efoxconn.com'
        self.Start_url = 'http://tv.efoxconn.com/Video/List/99?page=1'
        # NOTE(review): this looks like a placeholder -- fill in the real
        # proxy credentials/host before running.
        self.proxy = 'username:password@host:port'
        self.proxies = {
            'http': 'http://' + self.proxy,
            'https': 'https://' + self.proxy,
        }
        self.names = []    # movie names
        self.urls = []     # movie detail-page URLs
        self.content = {}  # metadata of the most recently parsed movie
        self.nums = 0      # number of movies collected so far

    def _fetch(self, url):
        """Return the body text of *url*, requested through ``self.proxies``.

        Raises whatever ``requests`` raises on timeout/connection failure,
        and ``requests.HTTPError`` for a 4xx/5xx response.
        """
        response = requests.get(url, proxies=self.proxies,
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.text

    def get_html(self):
        """Collect movie names and detail-page URLs from ``self.Start_url``.

        Every ``<a style="font-size: 12px;">`` element is treated as a movie
        link.  Results are appended to ``self.names`` / ``self.urls`` and
        ``self.nums`` is set to the total number of collected entries, so the
        three attributes stay consistent even across repeated calls.

        Returns:
            None
        """
        soup = BeautifulSoup(self._fetch(self.Start_url), 'lxml')
        for anchor in soup.find_all('a', {'style': 'font-size: 12px;'}):
            href = anchor.get('href')
            if href is None:
                # An anchor without a target cannot be followed; skip it
                # rather than fail on ``str + None``.
                continue
            # get_text() also copes with nested markup, where ``.string``
            # would be None.
            self.names.append(anchor.get_text(strip=True))
            self.urls.append(self.server + href)
        self.nums = len(self.names)

    @classmethod
    def _extract_video_url(cls, script_text):
        """Return the ``vMp4url`` value found in the player script text."""
        match = cls._VIDEO_URL_RE.search(script_text)
        if match is None:
            raise ValueError('video address (vMp4url) not found in script')
        return match.group(1)

    @classmethod
    def _parse_info(cls, html):
        """Return a new dict of stripped label -> value pairs from *html*."""
        match = cls._INFO_RE.search(html)
        if match is None:
            raise ValueError('movie information panel not found in page')
        fields = match.groups()
        # Groups alternate label, value, label, value, ...
        return {
            label.strip(): value.strip()
            for label, value in zip(fields[0::2], fields[1::2])
        }

    def get_content(self, url):
        """Fetch a movie detail page and extract its metadata and video URL.

        Parameters:
            url - detail-page URL of the movie

        Returns:
            (content, userUrl) where ``content`` is a freshly built dict of
            the movie's metadata (also stored on ``self.content``) and
            ``userUrl`` is the video file address found in the player script.

        Raises:
            ValueError - if the page does not contain the expected player
            script or information panel.
        """
        html = self._fetch(url)
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', {'id': self.PLAYER_DIV_ID})
        script = container.find('script') if container is not None else None
        if script is None or script.string is None:
            raise ValueError('player script not found on {}'.format(url))
        userUrl = self._extract_video_url(script.string)
        # A new dict per call: results of different movies must not alias
        # each other or inherit keys from a previously parsed page.
        self.content = self._parse_info(html)
        return self.content, userUrl

    def write_m(self, filename, movie_name, content, dlurl):
        """Append one movie record to *filename* (UTF-8).

        The record consists of the movie name, the metadata serialized as
        JSON, and ``self.server + dlurl``, followed by a blank line.

        Parameters:
            filename - path of the output file (string)
            movie_name - movie name (string)
            content - movie metadata (JSON-serializable)
            dlurl - video file address, appended to ``self.server`` (string)

        Returns:
            None
        """
        with open(filename, 'a', encoding='UTF-8') as fp:
            fp.write(movie_name + '\n')
            fp.write(json.dumps(content, ensure_ascii=False) + '\n')
            fp.write(self.server + dlurl)
            fp.write('\n\n')
if __name__ == '__main__':
    # Script entry point: read the listing page, then fetch every movie's
    # detail page in turn and append its record to movies.txt.
    scraper = downMovie()
    scraper.get_html()
    print('电影信息开始下载...')
    for title, detail_url in zip(scraper.names, scraper.urls):
        info, video_path = scraper.get_content(detail_url)
        scraper.write_m('movies.txt', title, info, video_path)
        # Pause for one second between detail-page requests.
        time.sleep(1)
    print('下载已完成!!!')
# 爬取foxconn传媒网获取电影信息
# 最新推荐文章于 2024-11-13 10:45:03 发布