'''
网站网址:https://www.mh263.com/dzm_show/dy1--score------1---.html
name:电影名称
actor:电影演员
videoHref:带有简介详情的页面链接
openHref:电影视频链接
derector:导演
jianjie:简介
imgHref:电影封面图片超链接
'''
# 代码
import os
import urllib.request as request
from urllib.request import Request,urlopen,urlretrieve
'''
解压缩或压缩
'''
import gzip
'''
ssl协议验签
'''
import ssl
'''
解析数据
'''
from lxml import etree
'''
连接数据库
'''
import pymysql
import time
def getURLData(path,charset):
    '''
    Download a page over HTTP(S) and return its body as text.

    The request advertises gzip and deflate support, so the body is
    decompressed according to the response's Content-Encoding header
    before it is decoded.

    :param path: URL to fetch
    :param charset: character encoding used to decode the body
    :return: decoded page text, or "" when the status code is not 200
    '''
    # Local import: zlib is only needed for "deflate" bodies and the
    # file-level import block does not provide it.
    import zlib

    # NOTE(review): this replaces the default HTTPS context for the whole
    # process and turns certificate verification off. Kept because callers
    # may rely on it, but it is unsafe outside of throw-away scraping.
    ssl._create_default_https_context = ssl._create_unverified_context
    headers = {
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
    }
    req = Request(url=path, headers=headers)
    # The context manager closes the response even if reading fails.
    with urlopen(req) as conn:
        if conn.code != 200:
            return ""
        data = conn.read()
        # Header values are compared case-insensitively; a missing header
        # means the body is not compressed.
        encoding = (conn.headers.get("Content-Encoding") or "").strip().lower()

    if encoding in ("gzip", "x-gzip"):
        data = gzip.decompress(data)
    elif encoding == "deflate":
        try:
            # Standard form: zlib-wrapped deflate stream.
            data = zlib.decompress(data)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            data = zlib.decompress(data, -zlib.MAX_WBITS)
    return data.decode(encoding=charset)
def getDBConn(host="127.0.0.1", port=3306, database="video", user="root", password="root", charset="utf8"):
    '''
    Open a new MySQL connection through pymysql.

    Every setting defaults to the value that used to be hard-coded, so a
    call with no arguments behaves exactly as before; callers may override
    any of them (for example to point at another server or schema).

    :param host: MySQL server address
    :param port: MySQL server port
    :param database: name of the database (schema) to use
    :param user: login user
    :param password: login password
    :param charset: connection character set
    :return: the connection object returned by pymysql.connect; the caller
             is responsible for closing it
    '''
    # NOTE(review): the default credentials are development values kept for
    # backward compatibility; pass real ones explicitly outside local use.
    return pymysql.connect(
        host=host,
        port=port,
        database=database,
        user=user,
        password=password,
        charset=charset,
    )
def saveVideo(name="", actor="", imgPath="", videoHref="",openHref="",derector="",jianjie=""):
    '''
    Insert one scraped movie into the `video` table.

    The values are passed to the driver as query parameters instead of
    being spliced into the SQL text, so titles or synopses that contain
    quotes or backslashes are stored correctly and cannot alter the
    statement.

    :param name: movie title
    :param actor: cast text
    :param imgPath: cover image URL
    :param videoHref: URL of the detail page
    :param openHref: URL of the playback page
    :param derector: director name
    :param jianjie: synopsis text
    :return: None
    '''
    # `video` is the table name; placeholders are filled in by the driver.
    sql = (
        "INSERT INTO video "
        "(videoname,actorname,imgPath,daoyan,videoHref,openHref,jianjie) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s)"
    )
    # Order must match the column list above (daoyan <- derector).
    params = (name, actor, imgPath, derector, videoHref, openHref, jianjie)

    conn = getDBConn()
    try:
        cursor = conn.cursor()
        try:
            affected = cursor.execute(sql, params)
            if affected > 0:
                conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()
'''
网站网址:https://www.mh263.com/dzm_show/dy1--score------1---.html
name:电影名称
actor:电影演员
videoHref:带有简介详情的页面链接
openHref:电影视频链接
derector:导演
jianjie:简介
imgHref:电影封面图片超链接
'''
def firstOrDefault(nodes, default=""):
    '''
    Return the first item of an XPath result list.

    :param nodes: list returned by an xpath() call (may be empty)
    :param default: value returned when the list is empty
    :return: nodes[0], or default when there is no match
    '''
    return nodes[0] if nodes else default


def buildOpenHref(videoHref):
    '''
    Derive the playback-page URL from a detail-page URL.

    The detail URL is expected to look like <site>/<section>/<file>.<ext>;
    the section is replaced by "dzm_play" and "-1-1" is inserted before the
    file extension.

    :param videoHref: absolute URL of the detail page
    :return: absolute URL of the playback page
    '''
    pageDir, _, fileName = videoHref.rpartition('/')
    siteRoot = pageDir.rpartition('/')[0]
    stem, dot, ext = fileName.rpartition('.')
    if not dot:
        # No extension at all: just append the episode suffix.
        return siteRoot + '/dzm_play/' + fileName + '-1-1'
    return siteRoot + '/dzm_play/' + stem + '-1-1.' + ext


def parseDetailPage(pageText):
    '''
    Pull the synopsis, cover URL and director out of a detail page.

    :param pageText: HTML text of the detail page
    :return: (jianjie, imgPath, derector), or None when the page is empty
             or cannot be parsed; fields missing from the page fall back to
             "" (director falls back to '未知')
    '''
    tree = etree.HTML(pageText) if pageText else None
    if tree is None:
        return None
    jianjie = firstOrDefault(tree.xpath("//div[@id='content']/text()"))
    imgPath = firstOrDefault(tree.xpath("//div[@class='detail-info-pic']/img/@src"))
    derector = firstOrDefault(tree.xpath("//i[@class='col33']/span/a/text()"), '未知')
    return jianjie, imgPath, derector


def crawlListPage(pageNo):
    '''
    Scrape one listing page and store every movie found on it.

    For each list entry the title, cast and detail link are read from the
    listing, the detail page is downloaded for synopsis/cover/director, and
    the record is written with saveVideo(). Entries that lack a link or a
    title, or whose detail page cannot be read, are skipped so that one bad
    entry does not abort the rest of the page.

    :param pageNo: page number substituted into the listing URL
    :return: None
    '''
    path = "https://www.mh263.com/dzm_show/dy1--score------%s---.html" % (str(pageNo))
    data = getURLData(path=path, charset="utf-8")
    listing = etree.HTML(data) if data else None
    containers = listing.xpath("//ul[@class='c2_list']") if listing is not None else []
    if not containers:
        print("列表页无法解析,已跳过:", path)
        return

    for d, video in enumerate(containers[0].xpath("./li"), 1):
        blocks = video.xpath("./div")
        if len(blocks) < 2:
            # The second <div> carries the link, title and cast.
            continue
        info = blocks[1]
        href = firstOrDefault(info.xpath("./a/@href"), None)
        name = firstOrDefault(info.xpath("./a/text()"), None)
        if href is None or name is None:
            continue
        actor = firstOrDefault(info.xpath("./p/text()"))

        videoHref = "https://www.mh263.com" + str(href)  # detail page URL
        openHref = buildOpenHref(videoHref)  # playback page URL
        print("第%d个:" % d, name)
        print("演员:", actor)
        print("带有简介详情页面链接:", videoHref)
        print("视频播放链接:", openHref)

        detail = parseDetailPage(getURLData(path=videoHref, charset="utf-8"))
        if detail is None:
            print("详情页无法解析,已跳过:", videoHref)
            continue
        jianjie, imgPath, derector = detail
        print("导演:", derector)
        print("封面图片超链接:", imgPath)
        print("简介:", jianjie)
        saveVideo(name=name, actor=actor, imgPath=imgPath, videoHref=videoHref,
                  openHref=openHref, derector=derector, jianjie=jianjie)


if __name__ == '__main__':
    # Crawl listing pages 3 and 4 of the score-sorted movie index.
    for i in range(3, 5):
        print("第几页:", i)
        crawlListPage(i)
# 爬虫结果:
# 数据库结果: