bilibili排行榜爬虫

建立数据库,将有关数据库的操作放在一个python文件中

def connect_mysql(self):
        """Open a pymysql connection to the local `test` database (DictCursor rows)."""
        self.connection = pymysql.connect(host='localhost',
                                          user='root',#enter your MySQL user name here
                                          password='0711',#enter your MySQL password here
                                          db='test',#enter your MySQL database name here
                                          charset='utf8mb4',
                                          cursorclass=pymysql.cursors.DictCursor)

    def select_mysql(self):
        """Fetch every proxy row from the `xicidaili` table and return it.

        The connection is always closed, even if the query raises.
        """
        self.connect_mysql()
        try:
            with self.connection.cursor() as cursor:

                sql = "SELECT `country`, `ip`,`port`,`protocol` FROM `xicidaili`"
                cursor.execute(sql)
                result = cursor.fetchall()
        finally:
            self.connection.close()

        return result

建立python键值对,根据键值对查找对应的网页

self.targetUrlMap= {
           # Section name -> URL path fragments; a full ranking URL is
           # built as targetUrlIndex + "index" + one "content" value.
           # (Excerpt — the complete mapping appears in the full listing below.)
           "BiliTrendAll":{
               "index":"/all",
               "content":{
                   "all":"/0/0/3",
                   "cartoon":"/1/0/3",
                   "orgin":"/168/0/3",
                   "music":"/3/0/3",
                   "dance":"/129/0/3",
                   "game":"/4/0/3",
                   "technique":"/36/0/3",
                   "machine":"/188/0/3",
                   "life":"/160/0/3",
                   "guichu":"/119/0/3",
                   "fashion":"/155/0/3",
                   "yuLe":"/5/0/3",
                   "movie":"/181/0/3"
               }

           }

对爬下来的网页用XPath进行路径分析,提取我们需要的元素

       	def __getRankItemInformation(self):
        """Extract the fields of every ranked video from self.html via XPath."""
        try:
            xPathUrl = '//div[@class = "rank-list-wrap"]/ul[@class = "rank-list"]/li[@class = "rank-item"]'
            # video rank number
            self.rank = self.html.xpath(xPathUrl+'/div[@class = "num"]/text()')

            # video title
            self.Title = self.html.xpath(xPathUrl+'/div[@class = "content"]/div[@class = "info"]/a[@class = "title"]/text()')

            # video cover image link
            self.Rankimg = self.html.xpath(xPathUrl+'/div[@class = "content"]/div[@class = "img"]/a/@href')

            # video link
            self.href = self.html.xpath(xPathUrl+'/div[@class = "content"]/div[@class = "info"]/a[@class = "title"]/@href')

            # play count
            self.playCount = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "detail"]/span[1]/text()')

            # view count
            self.view = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "detail"]/span[2]/text()')

            # uploader's page link
            self.upHref = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "detail"]/a/@href')

            # uploader's name
            self.upName = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "detail"]/a/span/text()')

            # overall score
            self.pts = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "pts"]/div/text()')

        except Exception as e:
            print(e)
完整代码
spider.py
# * coding:utf-8 *

from urllib import request
from lxml import etree

class Spider():
    """Crawler for the bilibili ranking pages (https://www.bilibili.com/ranking).

    For every section/category in ``targetUrlMap`` it downloads the ranking
    page, extracts each ranked video with XPath, and hands the rows to
    ``mySQLConnect.MySqlConnection`` (one table per section+category).
    """

    def __init__(self):
        # Browser-like headers so the request is not trivially rejected.
        self.headers = {
            'Host': 'www.bilibili.com',
            'Referer': 'https://www.bilibili.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 '
                          'Safari/537.36 Edge/16.16299'
        }

        # Section name -> {'index': section path, 'content': category -> path}.
        # A full URL is targetUrlIndex + index + content[category].
        self.targetUrlMap = {
            "BiliTrendAll":{
                "index":"/all",
                "content":{
                    "all":"/0/0/3",
                    "cartoon":"/1/0/3",
                    "orgin":"/168/0/3",
                    "music":"/3/0/3",
                    "dance":"/129/0/3",
                    "game":"/4/0/3",
                    "technique":"/36/0/3",
                    "machine":"/188/0/3",
                    "life":"/160/0/3",
                    "guichu":"/119/0/3",
                    "fashion":"/155/0/3",
                    "yuLe":"/5/0/3",
                    "movie":"/181/0/3"
                }

            },
            "BiliTrendOrigin":{
                "index":"/origin",
                "content":{
                        "all": "/0/0/3",
                        "cartoon": "/1/0/3",
                        "orgin": "/168/0/3",
                        "music": "/3/0/3",
                        "dance": "/129/0/3",
                        "game": "/4/0/3",
                        "technique": "/36/0/3",
                        "machine": "/188/0/3",
                        "life": "/160/0/3",
                        "guichu": "/119/0/3",
                        "fashion": "/155/0/3",
                        "yuLe": "/5/0/3",
                        "movie": "/181/0/3"
                }
            },
            "BiliTrendBangumi":{
                "index":"/bangumi",
                "content":{
                    "Japan":"/13/0/3",
                    "China":"/167/0/3"
                }

            },
            "BiliTrendCinema":{
                "index":"/cinema",
                "content":{
                    "newsreel":"/177/0/3",
                    "movie":"/23/0/3",
                    "teleplay":"/11/0/3"

                }
            },
            "BiliTrendRookie":{
                "index":"/rookie",
                "content":{
                        "all": "/0/0/3",
                        "cartoon": "/1/0/3",
                        "music": "/3/0/3",
                        "dance": "/129/0/3",
                        "game": "/4/0/3",
                        "technique": "/36/0/3",
                        "machine": "/188/0/3",
                        "life": "/160/0/3",
                        "guichu": "/119/0/3",
                        "fashion": "/155/0/3",
                        "yuLe": "/5/0/3",
                        "movie": "/181/0/3"
                }

            },
        }

        self.targetUrlIndex = "https://www.bilibili.com/ranking"
        self.content = None
        # Prepare proxy settings but leave the proxy disabled by default.
        self.setProxyIP()
        self.setHttpProxy(False)
        # Parsed rows accumulate here until saveDataToMysql() flushes them.
        self.resultList = []
        self.__setReusltDic()


    def run(self):
        """Public entry point: crawl every configured ranking page."""
        self.__start()

    # Iterate all sections/categories: fetch, parse, persist.
    def __start(self):

        for key in self.targetUrlMap:
            dictMap = self.targetUrlMap[key]
            # Base URL of this section; restored after each category.
            self.targetUrl = self.targetUrlIndex + dictMap['index']
            self.targetUrlS = self.targetUrlIndex + dictMap['index']

            for t in dictMap['content']:
                self.targetUrl = self.targetUrl + dictMap['content'][t]
                print("start to spider:",self.targetUrl)

                self.setRequest()
                self.response = self.opener.open(self.req)
                status = self.response.getcode()

                # Blocked / non-OK response: retry once through a proxy.
                if(status != 200):
                    self.setHttpProxy(True)
                    self.response = self.opener.open(self.req)

                res = self.response.read()
                self.content = res.decode("utf-8")
                self.__getHtml()
                self.__getRankItemInformation()
                self.__createDict()
                # Table name is <section><category>, e.g. "BiliTrendAllgame".
                self.saveDataToMysql(key+t)
                print("table:"+key+t)
                self.targetUrl = self.targetUrlS


    # Build the Request object for the current target URL.
    def setRequest(self):
        self.req = request.Request(self.targetUrl, headers=self.headers)

    # Replace the default request headers.
    def setHeaders(self,headers):
        self.headers = headers

    def setHttpProxy(self,proxySwitch = False,mysqlSource=True):
        """Build ``self.opener``, optionally routed through an HTTP proxy.

        proxySwitch -- when True requests go through a proxy, otherwise a
                       plain opener is built.
        mysqlSource -- when True the proxy IP is loaded from MySQL via the
                       ProxyIP module; otherwise setProxyIP() must have been
                       called before this function.
        """
        # Handler that explicitly disables any proxy.
        nullproxy_handler = request.ProxyHandler({})
        print("proxy",proxySwitch)
        if proxySwitch:
            if(mysqlSource):
                import ProxyIP
                item = ProxyIP.ProxyIP()
                item.readProxyIP()
                proxyIPinformation = item.getProxyIP(self.targetUrl, self.headers)
                print(proxyIPinformation)

                if(proxyIPinformation):
                    # BUGFIX: setProxyIP() previously returned None, so this
                    # assignment clobbered self.proxyIP; it now returns the
                    # mapping it stores, keeping the assignment valid.
                    self.proxyIP = self.setProxyIP(proxyIPinformation['protocol'],proxyIPinformation['ip'],proxyIPinformation['port'])
                else:
                    print("None legally proxy ip or consider to close this function")
                    import sys
                    sys.exit(-1)

            httpproxy_handler = request.ProxyHandler(self.proxyIP)
            self.opener = request.build_opener(httpproxy_handler)

        else:
            self.opener = request.build_opener(nullproxy_handler)

    def setProxyIP(self,protocal = "http",ip = "110.52.235.114",port = "9999"):
        """Store and return the proxy mapping used by setHttpProxy().

        Returning the dict (instead of None) is backward-compatible and lets
        callers write ``self.proxyIP = self.setProxyIP(...)`` safely.
        """
        self.proxyIP = {
            protocal:ip+":"+port
        }
        return self.proxyIP

    # Parse the downloaded page text into an lxml element tree.
    def __getHtml(self):
        self.html = etree.HTML(self.content)

    def __setReusltDic(self,title=None,rank=None,rankimg=None,href =None,
                       playCount= None,view = None,upHref = None,upName = None,pts = None):
        """Build self.resultItem from one video's fields.

        NOTE: the 'herf' key is a historic typo kept on purpose — the MySQL
        layer (mySQLConnect.saveInformation) reads data['herf'].
        """
        self.resultItem  = {
            "rank":rank,
            "title":title,
            "rankimg":rankimg,
            "herf":href,
            "playCount":playCount,
            "view":view,
            "upHref":upHref,
            "upName":upName,
            "pts":pts
        }


    def __getRankItemInformation(self):
        """Extract the fields of every ranked video from self.html via XPath."""
        try:
            xPathUrl = '//div[@class = "rank-list-wrap"]/ul[@class = "rank-list"]/li[@class = "rank-item"]'
            # video rank number
            self.rank = self.html.xpath(xPathUrl+'/div[@class = "num"]/text()')

            # video title
            self.Title = self.html.xpath(xPathUrl+'/div[@class = "content"]/div[@class = "info"]/a[@class = "title"]/text()')

            # video cover image link
            self.Rankimg = self.html.xpath(xPathUrl+'/div[@class = "content"]/div[@class = "img"]/a/@href')

            # video link
            self.href = self.html.xpath(xPathUrl+'/div[@class = "content"]/div[@class = "info"]/a[@class = "title"]/@href')

            # play count (first detail span)
            self.playCount = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "detail"]/span[1]/text()')

            # view count (second detail span)
            self.view = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "detail"]/span[2]/text()')

            # uploader's page link
            self.upHref = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "detail"]/a/@href')

            # uploader's name
            self.upName = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "detail"]/a/span/text()')

            # overall score
            self.pts = self.html.xpath(xPathUrl + '/div[@class = "content"]/div[@class = "info"]/div[@class = "pts"]/div/text()')

        except Exception as e:
            print(e)

    def __createDict(self):
        """Turn the parallel XPath result lists into dicts on resultList.

        zip() stops at the shortest list, so a partially parsed page can no
        longer raise IndexError the way the old index-based loop could.
        """
        for (thisRank, thisTitle, thisImg, thisHref, thisPlayCount,
             thisView, thisUpHref, thisUpName, thisPts) in zip(
                self.rank, self.Title, self.Rankimg, self.href,
                self.playCount, self.view, self.upHref, self.upName, self.pts):
            self.__setReusltDic(thisTitle, thisRank, thisImg, thisHref,
                                thisPlayCount, thisView, thisUpHref, thisUpName, thisPts)
            self.resultList.append(self.resultItem)

        print(len(self.resultList))
        print(self.resultList)

    def saveDataToMysql(self,tableName):
        """Persist resultList into MySQL table `tableName`, then clear it."""
        import mySQLConnect
        this = mySQLConnect.MySqlConnection()
        this.saveInformation(tableName,self.resultList)
        self.resultList = []

if __name__ == '__main__':
    # Crawl every configured bilibili ranking page.
    spider = Spider()
    spider.run()
mySQLConnect.py
# * coding:utf-8 *

import pymysql
import time, datetime

class MySqlConnection():
    """Small pymysql helper for the crawler's local `test` database.

    Connection credentials are hard-coded in connect_mysql(); edit them for
    your own MySQL setup.
    """

    def __init__(self):
        pass

    def connect_mysql(self):
        """Open self.connection to the local `test` database (DictCursor rows)."""
        self.connection = pymysql.connect(host='localhost',
                                          user='root',      # enter your MySQL user name here
                                          password='0711',  # enter your MySQL password here
                                          db='test',        # enter your MySQL database name here
                                          charset='utf8mb4',
                                          cursorclass=pymysql.cursors.DictCursor)



    def select_mysql(self):
        """Fetch every proxy row from `xicidaili` and return it.

        The connection is always closed, even if the query raises.
        """
        self.connect_mysql()
        try:
            with self.connection.cursor() as cursor:

                sql = "SELECT `country`, `ip`,`port`,`protocol` FROM `xicidaili`"
                cursor.execute(sql)
                result = cursor.fetchall()
        finally:
            self.connection.close()

        return result


    def save_myself(self,dataList):
        """Insert proxy-IP record dicts into the `xicidaili` table."""
        self.connect_mysql()
        try:
            with self.connection.cursor() as cursor:
                sql = "INSERT INTO `xicidaili` (`country`, `ip`,`port`,`type`,`protocol`,`speed`,`connectTime`,`aliveTime`,`myaliveTime`,`DatabasecreatedTime`,`DatabaseupdateTime`) " \
                      "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
                for data in dataList:
                    # Created/updated timestamps are epoch milliseconds.
                    DatabasecreatedTime = int(round(time.time() * 1000))
                    cursor.execute(sql, (data['country'], data['ip'], data['port'],
                                         data['type'], data['protocol'], data['speed'],
                                         data['connectTime'], data['aliveTime'],
                                         0,  # myaliveTime always starts at zero
                                         DatabasecreatedTime, DatabasecreatedTime))
                # One commit for the whole batch instead of one per row.
                self.connection.commit()
        finally:
            self.connection.close()

    def saveInformation(self,tableName,dataList):
        """Insert crawled ranking rows into the table named `tableName`.

        NOTE(review): tableName is interpolated into the SQL string (identifiers
        cannot be parameterized); it must only ever come from trusted internal
        keys, never from user input.
        """
        # Reuse the shared connection setup instead of duplicating it here.
        self.connect_mysql()
        try:
            with self.connection.cursor() as cursor:
                sql = "INSERT INTO `"+tableName+"` (`rank`, `title`,`herf`,`playCount`,`view`,`upHerf`,`upName`,`pts`,`DatabasecreatedTime`,`DatabaseupdateTime`) " \
                      "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
                for data in dataList:
                    # The crawler's result dicts use key 'herf' (historic typo)
                    # for the video link and 'upHref' for the uploader link.
                    DatabasecreatedTime = int(round(time.time() * 1000))  # epoch millis
                    cursor.execute(sql, (data['rank'], data['title'], data['herf'],
                                         data['playCount'], data['view'],
                                         data['upHref'], data['upName'], data['pts'],
                                         DatabasecreatedTime, DatabasecreatedTime))
                # Single commit for the whole batch.
                self.connection.commit()
        finally:
            self.connection.close()


if __name__ == '__main__':
    # Quick manual check: dump the stored proxy rows.
    conn = MySqlConnection()
    rows = conn.select_mysql()
    print(rows)

  • 1
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值