def qqVipParse(url):
    """Resolve a qq.com VIP video page into ``(fileName, [videoUrl])``.

    Fetches the page itself to extract a title, then asks the
    vip.zhanjh.com resolver service for the real stream URL.

    Raises:
        ValueError: when the resolver response contains no usable video
            URL (previously a bare ``raise``, which only produced an
            opaque ``RuntimeError: No active exception to re-raise``).
    """
    # Don't use 'lxml': by default lxml's parser strips CDATA sections from
    # the tree and replaces them with their plain-text content.  See
    # https://groups.google.com/forum/?fromgroups=#!topic/beautifulsoup/whLj3jMRq7g
    # requests' automatic decoding is unreliable here, so hand the raw bytes
    # (.content) to BeautifulSoup and let it detect the encoding.
    soup = BeautifulSoup(requests.get(url, timeout=30).content, "html.parser")
    videoTitle = soup.title.text.split(' -', maxsplit=1)[0]
    # url.split('?', ...) strips the query string before passing the page
    # URL to the resolver.
    r = requests.get('http://vip.zhanjh.com/qq.asp',
                     params={'url': url.split('?', maxsplit=1)[0], 'hao': '456jh'},
                     headers={'Referer': 'http://www.yaokanla.com'},
                     timeout=30)  # never hang forever on the resolver
    soup = BeautifulSoup(r.content, "html.parser")
    iframe = soup.find('iframe')
    if iframe is None:
        # Resolver layout changed or it returned an error page; the old
        # code would crash with AttributeError on NoneType here.
        raise ValueError('no <iframe> in resolver response for ' + url)
    videoUrl = iframe.get('src').split('=', maxsplit=1)[-1]
    if '"\n' == videoUrl:
        # The resolver emits this sentinel when it cannot resolve the page;
        # raise so callers can handle failure uniformly.
        raise ValueError('resolver returned no video url for ' + url)
    fileExt = os.path.splitext(urllib.parse.urlparse(videoUrl).path)[-1]
    # Fall back to '.mp4' when the stream URL carries no extension.
    fileName = videoTitle + (fileExt or '.mp4')
    return (fileName, [videoUrl])
def parse(queryDict):
    """Resolve ``queryDict['sourceUrl']`` and always return a result dict.

    Returns:
        ``{'parseResult': (fileName, [videoUrl])}`` on success, or
        ``{'parseResult': None, 'reason': ...}`` on any failure.

    Note: the original version re-raised the exception inside ``except``,
    but its ``return`` inside ``finally`` silently swallowed that raise
    (Python discards an in-flight exception when ``finally`` returns), so
    the function never actually propagated errors.  This rewrite keeps
    that observable behavior — always return a dict — without the dead
    ``raise`` / return-in-finally pitfall.
    """
    try:
        resultDict = {'parseResult': qqVipParse(queryDict['sourceUrl'])}
    except Exception:
        # Best-effort: report any failure (missing key, network error,
        # unresolvable page) in-band rather than crashing the caller.
        resultDict = {'parseResult': None, 'reason': '该页面无法找到视频'}
    print('Parsing resultDict------------', resultDict)
    return resultDict
if __name__ == '__main__':
    # Smoke test: feed the parser a mix of resolvable and unresolvable
    # pages and let it print each result.
    testUrls = (
        'http://v.qq.com/cover/w/wussyc1mi5f6fkx.html?vid=m0014ofzky5',  # Condor Heroes animated compilation, ep. 72
        'https://v.qq.com/x/cover/d85gvl439e98ih0.html?ptag=baidu.aladdin.movie.pay',  # VIP title: October Sky
        'http://v.qq.com/error.html',  # qq's 404 "page lost" page
        'http://v.qq.com/detail/w/wussyc1mi5f6fkx.html',  # album/detail page — not parseable either
        'http://www.iqiyi.com/v_19rr9s226s.html?pltfm=11&pos=title&flashvars=videoIsFromQidan%3Ditemviewclk_a#vfrm=5-6-0-1',
    )
    for sourceUrl in testUrls:
        parse({'sourceUrl': sourceUrl})