使用库
requests
是用 Python 语言编写的、基于 urllib 的 HTTP 库,可以满足日常的 HTTP 请求与测试需求
安装:pip install requests
函数功能
- 发起HTTP请求,获得url对应的网页内容
# Demo: fetch a page with requests and inspect the Response object.
import requests

resp = requests.get('https://www.baidu.com/')
print(type(resp))        # the requests Response class
print(resp.status_code)  # HTTP status code, e.g. 200
print(type(resp.text))   # body is decoded to str
print(resp.text)         # full page HTML
print(resp.cookies)      # cookies set by the server
其他的一些请求方式
# Demo: the other HTTP verbs supported by requests, driven from a table.
import requests

for verb, url in [
    ('post', 'http://httpbin.org/post'),
    ('put', 'http://httpbin.org/put'),
    ('delete', 'http://httpbin.org/delete'),
    ('head', 'http://httpbin.org/get'),
    ('options', 'http://httpbin.org/get'),
]:
    # requests.<verb>(url); responses are discarded, as in the original demo.
    getattr(requests, verb)(url)
selenium
Selenium是一个自动化测试工具,利用它可以驱动浏览器执行特定的动作,如点击、下拉等操作,同时还可以获取浏览器当前呈现的页面的源代码,做到可见即可爬。对于一些JavaScript动态渲染的页面来说,这种抓取方式十分有效。
使用google浏览器驱动器
chromeDriver 下载地址:https://chromedriver.storage.googleapis.com/index.html(Chrome 115 及以上版本请改从 Chrome for Testing 站点下载对应版本的驱动)
按照图示位置,复制爬取内容的xpath路径
代码示例
爬取LPL的比赛链接
# Scrape the match list from one wanplus.com LPL event page and save
# "Team A VS Team B" + match-detail-link pairs to <root>/<event name>.csv.
import requests
import os
import csv
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.wanplus.com/event/820.html'
root = "D://spidersheet//"  # output directory (Windows path)

r = requests.get(url)
r.raise_for_status()              # abort on HTTP errors
r.encoding = r.apparent_encoding  # let requests guess the real charset
soup = BeautifulSoup(r.text, "html.parser")
# print(soup.prettify())

# BUG FIX: the original put the whole scraping body inside the `else:` of
# `if not os.path.exists(root)`, so the first run created the folder and then
# silently did nothing. Ensure the directory exists, then always scrape.
os.makedirs(root, exist_ok=True)

team = []
link = []
# Event name is the last breadcrumb entry; used as the CSV file name.
matchname = soup.find('div', class_='bread_nav').find_all('a')[-1].text
for match in soup.find('div', class_='new-match end-match').find_all('li'):
    link.append(match.find('a').get('href'))
    spans = match.find_all('span')
    team.append(spans[0].string + ' VS ' + spans[1].string)

# hrefs on the page are site-relative; make them absolute.
link = ['https://www.wanplus.com' + href for href in link]

dataframe = pd.DataFrame({'Team': team, 'Link': link})
print(dataframe)
# utf_8_sig writes a BOM so Excel opens the Chinese text correctly.
dataframe.to_csv(root + matchname + ".csv", index=False, sep=',', encoding='utf_8_sig')
爬取比赛数据内容
import requests
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from openpyxl import Workbook
import os
import numpy as np
from selenium.webdriver.common.by import By
import selenium.webdriver.support.ui as ui
import pandas as pd
import re
#选手数据读取
def player(browser):
    """Read per-player stats from the scoreboard rows on the match page.

    Each element with class ``player`` is one scoreboard row whose text
    holds the values for one player from each team, so every row yields
    two entries per stat. Prints the assembled table and returns
    ``(dataframe, names)``: the stats DataFrame and the flat list of
    player names in reading order.
    """
    # One column list per stat; every scoreboard row appends two entries.
    names, kdas, gold, creeps, dmg, dmg_taken = [], [], [], [], [], []
    # (bucket, left-player offset, right-player offset) within a row's text.
    # NOTE(review): offsets assume the site's current text layout — confirm
    # against a live page if the scoreboard markup changes.
    columns = [
        (names, 0, -4),
        (kdas, 4, 6),
        (gold, 7, 9),
        (creeps, 10, 12),
        (dmg, 13, 15),
        (dmg_taken, 16, 18),
    ]
    for row in browser.find_elements_by_class_name('player'):
        # Drop blank lines left over from the element's text layout.
        fields = [part for part in row.text.split('\n') if part != '']
        # Left-hand player first, then right-hand player, matching the
        # original append order within each stat list.
        for bucket, left, _ in columns:
            bucket.append(fields[left])
        for bucket, _, right in columns:
            bucket.append(fields[right])
    frame = pd.DataFrame({
        'PlayerName': names,
        'PlayerKDA': kdas,
        'PlayerGolden': gold,
        'CreepScore': creeps,
        'PlayerDamage': dmg,
        'PlayerDamageTolerated': dmg_taken,
    })
    print(frame)
    return (frame, names)
#选手百分比数据
def percentage(browser,playername):
#teamposition = [Team1+'Top',Team1+'Jun',Team1+'Mid',Team1+'Bot',Team1+'Sup',Team2+'Top',Team2+'Jun',Team2+'Mid',Team2+'Bot',Team2+'Sup']
teamposition = [playername[0],playername[2],playername[4],playername[6],playername[8],playername[1],playername[3],playername[5],playername[7],playername[9]]
damagepercent = []
damagetoleratedpercent =[]
goldenpercent = []
time.sleep(1)
for a in browser.find_elements_by_class_name('highcharts-text-outline'):
damagepercent