爬取数据
通过selenium爬取2021年世界500强企业数据
import time
import requests
import csv
from selenium import webdriver
# Target URL and the request headers to send with it.
# NOTE: the `url` value below is placeholder text, not a real address; it must
# be replaced before the script can fetch anything.
url = '版权问题,请查看项目地址'
headers = {
    # Media types advertised to the server, joined into one comma-separated
    # Accept value.
    'Accept': ','.join((
        'text/html',
        'application/xhtml+xml',
        'application/xml;q=0.9',
        'image/webp',
        'image/apng',
        '*/*;q=0.8',
        'application/signed-exchange;v=b3;q=0.9',
    )),
    'Connection': 'keep-alive',
    # Desktop Chrome/Edge 97 user-agent string (the trailing space is kept
    # as in the original value).
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55 '
    ),
}
def crawler():
    """Scrape the ranking table page by page and append it to Fortune500.csv.

    First checks with ``requests`` that ``url`` answers with HTTP 200; if not,
    prints ``'access failed'`` and returns without starting a browser.
    Otherwise drives Chrome through 10 pages of 50 rows each, reading rank,
    company, income, profit and nation from the ``table1`` element, and
    appends one CSV row per table row (each row is also printed).

    Returns:
        None.
    """
    # Function-scope import keeps this block self-contained. ``By`` ships
    # with selenium, and ``find_element(By.XPATH, ...)`` replaces the
    # ``find_element_by_xpath`` helper that was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print('access failed')
        return

    cell_xpath = '//*[@id="table1"]/tbody/tr[{}]/td[{}]'
    company_xpath = '//*[@id="table1"]/tbody/tr[{}]/td[2]/a'

    chrome = webdriver.Chrome(r'chromedriver.exe')
    try:
        chrome.get(url)
        # Hide the navigator.webdriver flag from scripts on the page.
        script = 'Object.defineProperty(navigator,"webdriver",{get:()=>undefined,});'
        chrome.execute_script(script)
        time.sleep(2)

        # Open the output once in append mode. newline='' lets the csv module
        # control line endings (otherwise blank rows appear on Windows).
        with open(r'Fortune500.csv', 'a+', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for _page in range(1, 11):
                for j in range(1, 51):
                    # Locate each cell of table row j via XPath.
                    rank = chrome.find_element(By.XPATH, cell_xpath.format(j, 1)).text
                    company = chrome.find_element(By.XPATH, company_xpath.format(j)).text
                    income = chrome.find_element(By.XPATH, cell_xpath.format(j, 3)).text
                    profit = chrome.find_element(By.XPATH, cell_xpath.format(j, 4)).text
                    nation = chrome.find_element(By.XPATH, cell_xpath.format(j, 5)).text

                    row = [rank, company, income, profit, nation]
                    writer.writerow(row)
                    print(row)
                # Advance the table to the next page.
                next_page = chrome.find_element(By.XPATH, '//*[@id="table1_next"]')
                next_page.click()
    finally:
        # Always shut the browser and driver process down, even on errors.
        chrome.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    crawler()
写入csv后
使用openrefine进行清洗
清洗后得到 Fortune500After.csv(供下面的作图脚本读取)
作图小分析
from pywebio.output import put_html
import csv
from pyecharts.charts import Bar, Pie
from pyecharts import options as opts
def handle():
    """Build every chart and emit each one through ``put_html``.

    Charts are produced and output one at a time, in this order: nation bar
    chart, nation pie chart, profit-margin chart, income/profit chart.
    """
    chart_builders = (proportionBar, proportionPie, Proportion, incomeProfit)
    for build_chart in chart_builders:
        put_html(build_chart())
def proportionBar():
    """Render a bar chart of how many listed companies each nation has.

    Reads ``Fortune500After.csv``, skips its first (header) row and counts
    the values found in the fifth column (index 4). Rows with fewer than
    five columns, such as blank lines, are ignored instead of raising
    ``IndexError``.

    Returns:
        The object returned by ``Bar.render_notebook()``.
    """
    from collections import Counter

    nation_counts = Counter()
    # newline='' is what the csv module expects for files it parses.
    with open(r'Fortune500After.csv', encoding='utf-8', newline='') as jd:
        reader = csv.reader(jd)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if len(row) > 4:
                nation_counts[row[4]] += 1

    # Counter keeps first-seen order, so keys and values stay aligned.
    nation_names = list(nation_counts)
    nation_totals = list(nation_counts.values())

    bar = Bar()
    bar.add_xaxis(nation_names)
    bar.add_yaxis("世界500强数量", nation_totals)
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title="各个国家拥有世界500强企业"),
        xaxis_opts=opts.AxisOpts(name_rotate=60, axislabel_opts={"rotate": 45}),
    )
    return bar.render_notebook()
def proportionPie():
    """Render a pie chart of how many listed companies each nation has.

    Reads ``Fortune500After.csv``, skips its first (header) row and counts
    the values found in the fifth column (index 4). Rows with fewer than
    five columns, such as blank lines, are ignored instead of raising
    ``IndexError``.

    Returns:
        The object returned by ``Pie.render_notebook()``.
    """
    from collections import Counter

    nation_counts = Counter()
    # newline='' is what the csv module expects for files it parses.
    with open(r'Fortune500After.csv', encoding='utf-8', newline='') as jd:
        reader = csv.reader(jd)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if len(row) > 4:
                nation_counts[row[4]] += 1

    # One [name, count] pair per nation, in first-seen order.
    slices = [[nation, total] for nation, total in nation_counts.items()]

    pie = Pie()
    pie.add('数量', slices, radius='45%', center=["50%", "65%"])
    return pie.render_notebook()
def incomeProfit():
    """Render a horizontal bar chart of income and profit per company.

    Reads ``Fortune500After.csv``, skips its first (header) row and keeps
    only rows whose profit (column index 3) is greater than zero. Company
    name comes from column index 1 and income from column index 2.

    A row is added only after both numbers have parsed, so the company,
    income and profit lists always have the same length. Rows that are too
    short or contain non-numeric values are skipped.

    Returns:
        The object returned by ``Bar.render_notebook()``.
    """
    company = []
    income = []
    profit = []
    # newline='' is what the csv module expects for files it parses.
    with open(r'Fortune500After.csv', encoding='utf-8', newline='') as jd:
        reader = csv.reader(jd)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            # Parse everything first; appending before parsing would leave
            # the lists misaligned when a later cell is invalid.
            try:
                row_profit = float(row[3])
                row_income = float(row[2])
                row_company = row[1]
            except (ValueError, IndexError):
                continue
            if row_profit > 0:
                company.append(row_company)
                income.append(row_income)
                profit.append(row_profit)

    bar = Bar(init_opts=opts.InitOpts(width='4000px', height='30000px'))
    bar.add_xaxis(company)
    bar.add_yaxis("营业收入", income)
    bar.add_yaxis("利润", profit)
    bar.reversal_axis()
    bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    bar.set_global_opts(title_opts=opts.TitleOpts(title="营业收入与利润(不包括利润小于0)"))
    return bar.render_notebook()
def Proportion():
    """Render a horizontal bar chart of profit as a percentage of income.

    Reads ``Fortune500After.csv``, skips its first (header) row and keeps
    only rows whose profit (column index 3) is greater than zero. The
    plotted value is ``profit / income * 100`` with income taken from
    column index 2 and the company name from column index 1.

    A row is added only after the ratio has been computed, so the company
    and ratio lists always have the same length. Rows that are too short,
    non-numeric, or have zero income are skipped.

    Returns:
        The object returned by ``Bar.render_notebook()``.
    """
    company = []
    proportion = []
    # newline='' is what the csv module expects for files it parses.
    with open(r'Fortune500After.csv', encoding='utf-8', newline='') as jd:
        reader = csv.reader(jd)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            # Compute the ratio before appending anything; appending the
            # company first would misalign the axes when the ratio fails.
            try:
                row_profit = float(row[3])
                if row_profit <= 0:
                    continue
                ratio = row_profit / float(row[2]) * 100
                row_company = row[1]
            except (ValueError, IndexError, ZeroDivisionError):
                continue
            company.append(row_company)
            proportion.append(ratio)

    bar = Bar(init_opts=opts.InitOpts(width='4000px', height='30000px'))
    bar.add_xaxis(company)
    bar.add_yaxis("利润率", proportion)
    bar.reversal_axis()
    bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    bar.set_global_opts(title_opts=opts.TitleOpts(title="利润率分析(不包括利润小于0)"))
    return bar.render_notebook()
# Build and output the charts only when this file is executed as a script.
if __name__ == '__main__':
    handle()
运行后会在随机端口生成 HTML 页面,可在浏览器中查看
觉得不错请给一个star