建立数据库,并将所有与数据库相关的操作放在同一个 Python 文件中:
def connect_mysql(self):
    """Open a MySQL connection and keep it on ``self.connection``.

    The connection targets the ``test`` database on ``localhost`` as user
    ``root`` and is configured with ``pymysql.cursors.DictCursor`` as its
    cursor class.
    """
    settings = {
        'host': 'localhost',
        'user': 'root',  # MySQL user name goes here
        'password': '0711',  # MySQL password goes here
        'db': 'test',  # MySQL database name goes here
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    self.connection = pymysql.connect(**settings)
def select_mysql(self):
    """Fetch the proxy columns of every row in the ``xicidaili`` table.

    A fresh connection is opened through ``connect_mysql`` and is always
    closed again, whether or not the query succeeds.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the query (the
        ``country``, ``ip``, ``port`` and ``protocol`` columns).
    """
    query = "SELECT `country`, `ip`,`port`,`protocol` FROM `xicidaili`"
    self.connect_mysql()
    try:
        with self.connection.cursor() as cur:
            cur.execute(query)
            rows = cur.fetchall()
    finally:
        # Release the connection even when the query raised.
        self.connection.close()
    return rows
建立 Python 字典(键值对),根据键查找对应网页的 URL 路径:
# Excerpt of the ranking map: each board name maps to the URL path of the
# board ("index") and to the URL suffix of every category on it ("content").
# The keys are also used to build table names, so their spelling matters.
self.targetUrlMap = {
    "BiliTrendAll": {
        "index": "/all",
        "content": {
            "all": "/0/0/3",
            "cartoon": "/1/0/3",
            "orgin": "/168/0/3",
            "music": "/3/0/3",
            "dance": "/129/0/3",
            "game": "/4/0/3",
            "technique": "/36/0/3",
            "machine": "/188/0/3",
            "life": "/160/0/3",
            "guichu": "/119/0/3",
            "fashion": "/155/0/3",
            "yuLe": "/5/0/3",
            "movie": "/181/0/3",
        },
    },
}
对爬下来的网页用XPath进行路径分析,提取我们需要的元素
def __getRankItemInformation(self):
    """Run the ranking XPath queries against ``self.html``.

    Each query result is stored on its own attribute (``rank``, ``Title``,
    ``Rankimg``, ``href``, ``playCount``, ``view``, ``upHref``, ``upName``,
    ``pts``). Any exception is printed and otherwise ignored, so attributes
    assigned before the failure keep their new values.
    """
    try:
        # Every query starts from one ranking list item.
        item = '//div[@class = "rank-list-wrap"]/ul[@class = "rank-list"]/li[@class = "rank-item"]'
        content = item + '/div[@class = "content"]'
        info = content + '/div[@class = "info"]'
        detail = info + '/div[@class = "detail"]'

        # Video rank
        self.rank = self.html.xpath(item + '/div[@class = "num"]/text()')
        # Video title
        self.Title = self.html.xpath(info + '/a[@class = "title"]/text()')
        # Video cover
        self.Rankimg = self.html.xpath(content + '/div[@class = "img"]/a/@href')
        # Video link
        self.href = self.html.xpath(info + '/a[@class = "title"]/@href')
        # Play count
        self.playCount = self.html.xpath(detail + '/span[1]/text()')
        # View count
        self.view = self.html.xpath(detail + '/span[2]/text()')
        # Link to the uploader
        self.upHref = self.html.xpath(detail + '/a/@href')
        # Name of the uploader
        self.upName = self.html.xpath(detail + '/a/span/text()')
        # Overall score
        self.pts = self.html.xpath(info + '/div[@class = "pts"]/div/text()')
    except Exception as err:
        print(err)
完整代码
spiderT
# -*- coding: utf-8 -*-
from urllib import request
from lxml import etree
class Spider():
    """Crawl the bilibili ranking pages and save every board to MySQL.

    ``run`` walks ``targetUrlMap``: for each board and category it downloads
    the page, extracts the ranking fields with XPath, turns them into row
    dictionaries and hands them to ``saveDataToMysql`` under the table name
    ``<board key><category key>``.
    """

    def __init__(self):
        self.headers = {
            'Host': 'www.bilibili.com',
            'Referer': 'https://www.bilibili.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 '
                          'Safari/537.36 Edge/16.16299'
        }
        # Category suffixes shared by the general boards. Key spelling
        # (e.g. "orgin") is kept because the keys become table names.
        general = {
            "all": "/0/0/3",
            "cartoon": "/1/0/3",
            "orgin": "/168/0/3",
            "music": "/3/0/3",
            "dance": "/129/0/3",
            "game": "/4/0/3",
            "technique": "/36/0/3",
            "machine": "/188/0/3",
            "life": "/160/0/3",
            "guichu": "/119/0/3",
            "fashion": "/155/0/3",
            "yuLe": "/5/0/3",
            "movie": "/181/0/3",
        }
        # The rookie board has the same categories except "orgin".
        rookie = {name: path for name, path in general.items() if name != "orgin"}
        self.targetUrlMap = {
            "BiliTrendAll": {"index": "/all", "content": dict(general)},
            "BiliTrendOrigin": {"index": "/origin", "content": dict(general)},
            "BiliTrendBangumi": {
                "index": "/bangumi",
                "content": {
                    "Japan": "/13/0/3",
                    "China": "/167/0/3",
                },
            },
            "BiliTrendCinema": {
                "index": "/cinema",
                "content": {
                    "newsreel": "/177/0/3",
                    "movie": "/23/0/3",
                    "teleplay": "/11/0/3",
                },
            },
            "BiliTrendRookie": {"index": "/rookie", "content": rookie},
        }
        self.targetUrlIndex = "https://www.bilibili.com/ranking"
        # setHttpProxy(True) reads targetUrl, so it must exist before run().
        self.targetUrl = self.targetUrlIndex
        self.content = None
        # The proxy is switched off by default.
        self.setProxyIP()
        self.setHttpProxy(False)
        # Rows collected for the page currently being processed.
        self.resultList = []
        self.__setReusltDic()

    def run(self):
        """Crawl every board and category listed in ``targetUrlMap``."""
        self.__start()

    def __start(self):
        """Download, parse and store each ranking page in turn."""
        for key, board in self.targetUrlMap.items():
            self.targetUrlS = self.targetUrlIndex + board['index']
            for t, suffix in board['content'].items():
                # Build the URL from the board base every time so suffixes
                # of earlier categories can never pile up.
                self.targetUrl = self.targetUrlS + suffix
                print("start to spider:", self.targetUrl)
                self.content = self.__fetch()
                self.__getHtml()
                self.__getRankItemInformation()
                self.__createDict()
                self.saveDataToMysql(key + t)
                print("table:" + key + t)
            self.targetUrl = self.targetUrlS

    def __fetch(self):
        """Return the decoded body of ``self.targetUrl``.

        The request is retried once through a proxy when the server answers
        with a status other than 200. ``opener.open`` raises ``HTTPError``
        for non-2xx answers, so that case is caught explicitly; an error on
        the retry propagates to the caller.
        """
        from urllib.error import HTTPError

        self.setRequest()
        try:
            self.response = self.opener.open(self.req)
        except HTTPError as err:
            print(err)
            blocked = True
        else:
            blocked = self.response.getcode() != 200
            if blocked:
                self.response.close()
        if blocked:
            self.setHttpProxy(True)
            self.response = self.opener.open(self.req)
        try:
            return self.response.read().decode("utf-8")
        finally:
            self.response.close()

    def setRequest(self):
        """Build ``self.req`` for ``self.targetUrl`` with the current headers."""
        self.req = request.Request(self.targetUrl, headers=self.headers)

    def setHeaders(self, headers):
        """Replace the HTTP headers sent with every request."""
        self.headers = headers

    def setHttpProxy(self, proxySwitch=False, mysqlSource=True):
        """Build ``self.opener`` with or without a proxy.

        Args:
            proxySwitch: When true, requests go through ``self.proxyIP``;
                otherwise an opener with no proxy is built.
            mysqlSource: Only used when ``proxySwitch`` is true. When true,
                the proxy is obtained from the project's ``ProxyIP`` module
                (per the original note, sourced from MySQL); when false, the
                proxy must already have been set with ``setProxyIP``.

        Exits the process with status -1 when ``ProxyIP`` offers no proxy.
        """
        print("proxy", proxySwitch)
        if not proxySwitch:
            # An empty mapping disables proxies.
            self.opener = request.build_opener(request.ProxyHandler({}))
            return
        if mysqlSource:
            import ProxyIP
            item = ProxyIP.ProxyIP()
            item.readProxyIP()
            proxyIPinformation = item.getProxyIP(self.targetUrl, self.headers)
            print(proxyIPinformation)
            if proxyIPinformation:
                self.setProxyIP(proxyIPinformation['protocol'],
                                proxyIPinformation['ip'],
                                proxyIPinformation['port'])
            else:
                print("None legally proxy ip or consider to close this function")
                import sys
                sys.exit(-1)
        self.opener = request.build_opener(request.ProxyHandler(self.proxyIP))

    def setProxyIP(self, protocal="http", ip="110.52.235.114", port="9999"):
        """Store the proxy mapping on ``self.proxyIP`` and return it.

        Args:
            protocal: Scheme the proxy is used for (mapping key).
            ip: Proxy host.
            port: Proxy port, as a string.

        Returns:
            The ``{protocal: "ip:port"}`` mapping that was stored.
        """
        self.proxyIP = {
            protocal: ip + ":" + port
        }
        return self.proxyIP

    def __getHtml(self):
        """Parse ``self.content`` into ``self.html``."""
        self.html = etree.HTML(self.content)

    def __setReusltDic(self, title=None, rank=None, rankimg=None, href=None,
                       playCount=None, view=None, upHref=None, upName=None, pts=None):
        """Store one row dictionary on ``self.resultItem``.

        The key ``"herf"`` is spelled this way on purpose: the MySQL writer
        reads the row under that key.
        """
        self.resultItem = {
            "rank": rank,
            "title": title,
            "rankimg": rankimg,
            "herf": href,
            "playCount": playCount,
            "view": view,
            "upHref": upHref,
            "upName": upName,
            "pts": pts
        }

    def __getRankItemInformation(self):
        """Run the ranking XPath queries and store one list per field.

        All field attributes are cleared first, so a failed extraction can
        never leave the previous page's values behind. Exceptions are
        printed and the remaining fields stay empty.
        """
        item = '//div[@class = "rank-list-wrap"]/ul[@class = "rank-list"]/li[@class = "rank-item"]'
        content = item + '/div[@class = "content"]'
        info = content + '/div[@class = "info"]'
        detail = info + '/div[@class = "detail"]'
        queries = (
            ("rank", item + '/div[@class = "num"]/text()'),            # video rank
            ("Title", info + '/a[@class = "title"]/text()'),           # video title
            ("Rankimg", content + '/div[@class = "img"]/a/@href'),     # video cover
            ("href", info + '/a[@class = "title"]/@href'),             # video link
            ("playCount", detail + '/span[1]/text()'),                 # play count
            ("view", detail + '/span[2]/text()'),                      # view count
            ("upHref", detail + '/a/@href'),                           # uploader link
            ("upName", detail + '/a/span/text()'),                     # uploader name
            ("pts", info + '/div[@class = "pts"]/div/text()'),         # overall score
        )
        for name, _ in queries:
            setattr(self, name, [])
        try:
            for name, query in queries:
                setattr(self, name, self.html.xpath(query))
        except Exception as e:
            print(e)

    def __createDict(self):
        """Turn the parallel field lists into row dictionaries.

        Rows are appended to ``self.resultList``. When the lists differ in
        length only the complete leading rows are used and a warning is
        printed, instead of failing with ``IndexError``.
        """
        columns = [self.rank, self.Title, self.Rankimg, self.href,
                   self.playCount, self.view, self.upHref, self.upName,
                   self.pts]
        if len({len(column) for column in columns}) > 1:
            print("warning: ranking fields differ in length, keeping complete rows only")
        for rank, title, img, href, play, view, up_href, up_name, pts in zip(*columns):
            self.__setReusltDic(title, rank, img, href, play, view, up_href, up_name, pts)
            self.resultList.append(self.resultItem)
        print(len(self.resultList))
        print(self.resultList)

    def saveDataToMysql(self, tableName):
        """Write the collected rows to ``tableName`` and clear the buffer."""
        import mySQLConnect
        this = mySQLConnect.MySqlConnection()
        this.saveInformation(tableName, self.resultList)
        self.resultList = []
if __name__ == '__main__':
    # Crawl all ranking boards without a proxy; call
    # spider.setHttpProxy(True) before run() to start with a proxy.
    spider = Spider()
    spider.run()
mySQLConnect(文件名的大小写须与 spiderT 中的 `import mySQLConnect` 保持一致)
# -*- coding: utf-8 -*-
import pymysql
import time, datetime
class MySqlConnection():
    """Small helper around pymysql for the proxy and ranking tables.

    Every public method opens its own connection and closes it again when
    it is done.
    """

    def __init__(self, host='localhost', user='root', password='0711', db='test'):
        """Remember the connection settings.

        Args:
            host: MySQL server host.
            user: MySQL user name.
            password: MySQL password.
            db: MySQL database name.

        NOTE(review): the defaults are the credentials that used to be
        hard-coded here; pass real values instead of relying on them.
        """
        self.host = host
        self.user = user
        self.password = password
        self.db = db

    def _open_connection(self):
        """Return a new pymysql connection built from the stored settings."""
        return pymysql.connect(host=self.host,
                               user=self.user,
                               password=self.password,
                               db=self.db,
                               charset='utf8mb4',
                               cursorclass=pymysql.cursors.DictCursor)

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` wrapped in backticks with inner backticks doubled."""
        return "`" + name.replace("`", "``") + "`"

    def connect_mysql(self):
        """Open a connection and store it on ``self.connection``."""
        self.connection = self._open_connection()

    def select_mysql(self):
        """Return the ``country``, ``ip``, ``port`` and ``protocol`` columns
        of every row in ``xicidaili`` (the result of ``fetchall``)."""
        self.connect_mysql()
        try:
            with self.connection.cursor() as cursor:
                sql = "SELECT `country`, `ip`,`port`,`protocol` FROM `xicidaili`"
                cursor.execute(sql)
                result = cursor.fetchall()
        finally:
            self.connection.close()
        return result

    def save_myself(self, dataList):
        """Insert proxy records into ``xicidaili``.

        Args:
            dataList: Iterable of dicts with the keys ``country``, ``ip``,
                ``port``, ``type``, ``protocol``, ``speed``, ``connectTime``
                and ``aliveTime``. Nothing happens when it is empty.

        Raises:
            KeyError: If a record lacks one of the keys (raised before any
                connection is opened).
        """
        if not dataList:
            return
        sql = "INSERT INTO `xicidaili` (`country`, `ip`,`port`,`type`,`protocol`,`speed`,`connectTime`,`aliveTime`,`myaliveTime`,`DatabasecreatedTime`,`DatabaseupdateTime`) " \
              "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
        rows = []
        for data in dataList:
            # Creation and update time start out equal, in milliseconds.
            created = int(round(time.time() * 1000))
            rows.append((data['country'], data['ip'], data['port'],
                         data['type'], data['protocol'], data['speed'],
                         data['connectTime'], data['aliveTime'],
                         0,  # myaliveTime starts at zero
                         created, created))
        self.connect_mysql()
        try:
            with self.connection.cursor() as cursor:
                cursor.executemany(sql, rows)
            self.connection.commit()
        finally:
            self.connection.close()

    def saveInformation(self, tableName, dataList):
        """Insert ranking rows into ``tableName``.

        Args:
            tableName: Target table; it is quoted as an identifier, so a
                backtick inside the name cannot break out of the statement.
            dataList: Iterable of dicts with the keys ``rank``, ``title``,
                ``herf``, ``playCount``, ``view``, ``upHref``, ``upName``
                and ``pts``. Nothing happens when it is empty.

        Raises:
            KeyError: If a row lacks one of the keys (raised before any
                connection is opened).
        """
        if not dataList:
            return
        sql = "INSERT INTO " + self._quote_identifier(tableName) + " (`rank`, `title`,`herf`,`playCount`,`view`,`upHerf`,`upName`,`pts`,`DatabasecreatedTime`,`DatabaseupdateTime`) " \
              "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
        rows = []
        for data in dataList:
            created = int(round(time.time() * 1000))
            rows.append((data['rank'], data['title'], data['herf'],
                         data['playCount'], data['view'], data['upHref'],
                         data['upName'], data['pts'], created, created))
        connection = self._open_connection()
        try:
            with connection.cursor() as cursor:
                cursor.executemany(sql, rows)
            connection.commit()
        finally:
            connection.close()
if __name__ == '__main__':
    # Quick manual check: print every proxy row stored in the database.
    proxy_rows = MySqlConnection().select_mysql()
    print(proxy_rows)