使用 urllib.request 爬取电影信息并存入数据库

'''
网站网址:https://www.mh263.com/dzm_show/dy1--score------1---.html
name:电影名称
actor:电影演员
videoHref:带有简介详情的页面链接
openHref:电影视频链接
derector:导演(即 director,此处沿用代码中的变量名拼写)
jianjie:简介
imgHref:电影封面图片超链接
'''

代码

import os

import urllib.request as request
from urllib.request import Request,urlopen,urlretrieve
'''
解压缩或压缩
'''
import gzip
'''
ssl协议验签
'''
import ssl
'''
解析数据
'''
from lxml import etree
'''
连接数据库
'''
import pymysql

import time



def getURLData(path,charset):
    """Download a URL and return its body decoded as text.

    The request advertises ``gzip`` and ``deflate`` support, so the body is
    decompressed according to the ``Content-Encoding`` response header before
    it is decoded.

    :param path: URL to fetch.
    :param charset: Encoding used to decode the (decompressed) body.
    :return: The decoded body, or ``""`` when the response code is not 200.
    :raises UnicodeDecodeError: if the body is not valid in *charset*.

    Any exception raised by ``urlopen`` itself is not caught here.
    """
    import zlib  # local import: only needed for "deflate" responses

    # NOTE: this replaces the module-wide default HTTPS context factory, so
    # certificate verification stays disabled for later urlopen calls too.
    ssl._create_default_https_context = ssl._create_unverified_context
    headers = {
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
    }
    req = Request(url=path, headers=headers)

    # The context manager closes the response even if read() raises.
    with urlopen(req) as conn:
        if conn.code != 200:
            return ""
        data = conn.read()
        # Header values are compared case-insensitively; a missing header
        # is treated as "no compression".
        content_encoding = (conn.headers.get("Content-Encoding") or "").strip().lower()

    if content_encoding == "gzip":
        data = gzip.decompress(data)
    elif content_encoding == "deflate":
        try:
            data = zlib.decompress(data)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            data = zlib.decompress(data, -zlib.MAX_WBITS)
    return data.decode(encoding=charset)

def getDBConn():
    """Open a new connection to the MySQL server used by this script.

    :return: the connection object returned by ``pymysql.connect``
    """
    settings = {
        "host": "127.0.0.1",
        "port": 3306,
        "database": "video",  # database (schema) name
        "user": "root",
        "password": "root",
        "charset": "utf8",
    }
    return pymysql.connect(**settings)


def saveVideo(name="", actor="", imgPath="", videoHref="",openHref="",derector="",jianjie=""):
    """Insert one movie row into the ``video`` table.

    The values are passed to the driver as query parameters instead of being
    formatted into the SQL text, so quotes or other special characters in
    scraped text can neither break the statement nor inject SQL.

    :param name: Movie title (column ``videoname``).
    :param actor: Cast text (column ``actorname``).
    :param imgPath: Cover image URL (column ``imgPath``).
    :param videoHref: Detail page URL (column ``videoHref``).
    :param openHref: Player page URL (column ``openHref``).
    :param derector: Director (column ``daoyan``).
    :param jianjie: Synopsis (column ``jianjie``).
    """
    sql = (
        "INSERT INTO video "
        "(videoname,actorname,imgPath,daoyan,videoHref,openHref,jianjie) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s)"
    )
    params = (name, actor, imgPath, derector, videoHref, openHref, jianjie)

    conn = getDBConn()
    try:
        with conn.cursor() as cursor:
            # execute() returns the number of affected rows; commit only when
            # a row was actually written.
            if cursor.execute(sql, params) > 0:
                conn.commit()
    finally:
        # Always release the connection, even if the insert failed.
        conn.close()
'''
网站网址:https://www.mh263.com/dzm_show/dy1--score------1---.html
name:电影名称
actor:电影演员
videoHref:带有简介详情的页面链接
openHref:电影视频链接
derector:导演(即 director,此处沿用代码中的变量名拼写)
jianjie:简介
imgHref:电影封面图片超链接
'''

def _first_or(nodes, default=""):
    """Return the first XPath match, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _build_play_url(detail_url):
    """Derive the player-page URL from a detail-page URL.

    ``<scheme>://<host>/<section>/<id>.<ext>`` becomes
    ``<scheme>://<host>/dzm_play/<id>-1-1.<ext>``.
    """
    stem, _, ext = detail_url.rpartition('.')
    site = "/".join(stem.split('/')[:3])  # "<scheme>:", "", "<host>"
    page_id = stem.rsplit('/', 1)[-1]
    return "%s/dzm_play/%s-1-1.%s" % (site, page_id, ext)


if __name__ == '__main__':
    # Crawl listing pages 3 and 4; for each movie, fetch its detail page and
    # store the combined record in the database.
    for i in range(3,5):
        print("第几页:",i)
        path = "https://www.mh263.com/dzm_show/dy1--score------%s---.html" % (i)

        # Download the listing page.
        data = getURLData(path=path,charset="utf-8")
        if not data:
            # getURLData returns "" for a non-200 response: nothing to parse.
            continue
        html = etree.HTML(data)
        if html is None:
            continue
        videoLists = html.xpath("//ul[@class='c2_list']")
        if not videoLists:
            # The page does not contain the expected movie list.
            continue
        videos = videoLists[0].xpath("./li")

        for d, video in enumerate(videos, start=1):
            divs = video.xpath("./div")
            if len(divs) < 2:
                continue
            info = divs[1]  # second <div> holds the link, title and cast
            href = _first_or(info.xpath("./a/@href"), None)
            if href is None:
                # Without a link there is no detail page to visit.
                continue

            videoHref = "https://www.mh263.com" + str(href)  # detail page URL
            openHref = _build_play_url(videoHref)  # player page URL
            name = _first_or(info.xpath("./a/text()"))  # movie title
            actor = _first_or(info.xpath("./p/text()"))  # cast
            print("第%d个:" % d, name)
            print("演员:",actor)
            print("带有简介详情页面链接:",videoHref)
            print("视频播放链接:",openHref)

            # Download the detail page for synopsis, cover and director.
            detailData = getURLData(path=videoHref, charset="utf-8")
            if not detailData:
                continue
            detailHtml = etree.HTML(detailData)
            if detailHtml is None:
                continue
            jianjie = _first_or(detailHtml.xpath("//div[@id='content']/text()"))  # synopsis
            imgPath = _first_or(detailHtml.xpath("//div[@class='detail-info-pic']/img/@src"))  # cover image URL
            # Fall back to '未知' when no director link is present.
            derector = _first_or(detailHtml.xpath("//i[@class='col33']/span/a/text()"), '未知')
            print("导演:",derector)
            print("封面图片超链接:",imgPath)
            print("简介:",jianjie)
            saveVideo(name=name, actor=actor, imgPath=imgPath, videoHref=videoHref,openHref=openHref,derector=derector,jianjie=jianjie)







爬虫结果:
(此处原为爬虫运行结果的截图)
数据库结果:
(此处原为数据库中保存结果的截图)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

就躺了吧

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值