Scraping multi-page data from a website with Python

Libraries used

requests

An HTTP library written in Python on top of urllib; it covers common HTTP testing needs.
Install: pip install requests
Functionality

  1. Send an HTTP request and fetch the page content for a given URL
import requests

response = requests.get('https://www.baidu.com/')
print(type(response))        # <class 'requests.models.Response'>
print(response.status_code)  # 200 on success
print(type(response.text))   # <class 'str'>
print(response.text)         # the decoded HTML body
print(response.cookies)      # cookies set by the server

Some other request methods

import requests
requests.post('http://httpbin.org/post')
requests.put('http://httpbin.org/put')
requests.delete('http://httpbin.org/delete')
requests.head('http://httpbin.org/get')
requests.options('http://httpbin.org/get')
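
Each of these returns the same Response object as get(). For instance, query parameters and form data can be passed in directly (again using httpbin's echo endpoints):

import requests

# query parameters are encoded into the URL for you
r = requests.get('http://httpbin.org/get', params={'page': 1})
print(r.url)  # http://httpbin.org/get?page=1

# form data goes in the request body
r = requests.post('http://httpbin.org/post', data={'key': 'value'})
print(r.json()['form'])  # {'key': 'value'}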

selenium

Selenium is an automated testing tool. It can drive a browser to perform specific actions such as clicking and scrolling, and it can also retrieve the source code of the page the browser is currently rendering, so whatever is visible can be scraped. This makes it very effective for pages rendered dynamically with JavaScript.

Using the Google Chrome driver
ChromeDriver download: http://chromedriver.storage.googleapis.com/index.html
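
A minimal sketch of driving Chrome, assuming chromedriver is on your PATH (Selenium 4 syntax):

from selenium import webdriver

browser = webdriver.Chrome()  # launches Chrome through chromedriver
browser.get('https://www.wanplus.com/event/820.html')
print(browser.page_source[:300])  # JS-rendered HTML, unlike requests' raw response
browser.quit()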

Following the position shown in the screenshot, copy the XPath of the element you want to scrape (in Chrome DevTools, right-click the node, then Copy → Copy XPath). An example follows the screenshot below.
(screenshot omitted: copying an element's XPath in Chrome DevTools)
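
The copied XPath can then be handed to Selenium. The path below is a hypothetical example; replace it with the one you copied:

from selenium.webdriver.common.by import By

# hypothetical XPath; use the one copied from DevTools
element = browser.find_element(By.XPATH, '//*[@id="info"]/div[1]/h1')
print(element.text)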

Code examples

Scraping the LPL match links

import requests
import os
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.wanplus.com/event/820.html'
root = "D://spidersheet//"

r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, "html.parser")
# print(soup.prettify())

# make sure the output directory exists before writing the CSV
if not os.path.exists(root):
	os.mkdir(root)

team = []
link = []
# the last breadcrumb link holds the event name, used as the CSV filename
matchname = soup.find('div', class_='bread_nav').find_all('a')[-1].text

# each <li> in the finished-match list is one match: two team names plus a link
for match in soup.find('div', class_='new-match end-match').find_all('li'):
	link.append(match.find('a').get('href'))
	team.append(match.find_all('span')[0].string + ' VS ' + match.find_all('span')[1].string)

# the hrefs are relative, so prepend the site root
for i in range(len(link)):
	link[i] = 'https://www.wanplus.com' + link[i]

dataframe = pd.DataFrame({'Team': team, 'Link': link})
print(dataframe)
dataframe.to_csv(root + matchname + ".csv", index=False, sep=',', encoding='utf_8_sig')

Scraping the match data

import requests
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from openpyxl import Workbook 
import os
import numpy as np
from selenium.webdriver.common.by import By
import selenium.webdriver.support.ui as ui
import pandas as pd
import re

# read the per-player stats (each 'player' element holds one row covering both teams)
def player(browser):
	playername = []
	playerKDA = []
	playergolden = []
	CreepScore = []
	playerdamage = []
	playerdamagetolerated = []

	for player in browser.find_elements(By.CLASS_NAME, 'player'):
		playertext = player.text.split('\n')
		while '' in playertext:
			playertext.remove('')
		# left-side player: stats sit at fixed offsets in the row text
		playername.append(playertext[0])
		playerKDA.append(playertext[4])
		playergolden.append(playertext[7])
		CreepScore.append(playertext[10])
		playerdamage.append(playertext[13])
		playerdamagetolerated.append(playertext[16])
		# right-side player: the mirrored stats later in the same row
		playername.append(playertext[-4])
		playerKDA.append(playertext[6])
		playergolden.append(playertext[9])
		CreepScore.append(playertext[12])
		playerdamage.append(playertext[15])
		playerdamagetolerated.append(playertext[18])
	# print(playername)
	dataframe = pd.DataFrame({
		'PlayerName': playername, 'PlayerKDA': playerKDA, 'PlayerGolden': playergolden,
		'CreepScore': CreepScore, 'PlayerDamage': playerdamage, 'PlayerDamageTolerated': playerdamagetolerated})
	print(dataframe)
	return dataframe, playername


# per-player percentage stats read from the Highcharts pie-chart labels
def percentage(browser, playername):
	# teamposition = [Team1+'Top',Team1+'Jun',Team1+'Mid',Team1+'Bot',Team1+'Sup',Team2+'Top',Team2+'Jun',Team2+'Mid',Team2+'Bot',Team2+'Sup']
	# reorder the names so they line up with the chart labels: team 1 top..sup, then team 2
	teamposition = [playername[0], playername[2], playername[4], playername[6], playername[8],
		playername[1], playername[3], playername[5], playername[7], playername[9]]
	damagepercent = []
	damagetoleratedpercent = []
	goldenpercent = []
	time.sleep(1)  # give the charts a moment to render
	for a in browser.find_elements(By.CLASS_NAME, 'highcharts-text-outline'):
		damagepercent.append(a.text)
	# ... (the original post is cut off at this point)
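
The original post breaks off in the middle of percentage(). To show how the pieces were presumably meant to fit together, here is a minimal driver sketch, not from the original: it reads the CSV of match links written by the first script (the filename depends on the event name, so the path below is hypothetical), opens each match page in Chrome, and runs the two functions above.

if __name__ == '__main__':
	browser = webdriver.Chrome()  # assumes chromedriver is on PATH
	# hypothetical filename; use the CSV the first script actually wrote
	links = pd.read_csv('D://spidersheet//LPL.csv')
	for url in links['Link']:
		browser.get(url)
		time.sleep(3)  # let the JS-rendered stats load
		dataframe, playername = player(browser)
		percentage(browser, playername)
	browser.quit()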