爬虫小实战（selenium）与数据小分析（pywebio、pyecharts）：爬取2021年世界500强企业，并用python将分析结果写在网页上

touero

已于 2023-08-18 16:39:05 修改

阅读量1.8k

点赞数 5

文章标签： python 爬虫 selenium

于 2022-01-16 19:00:37 首次发布

本文链接：https://blog.csdn.net/Tlouer_Elle/article/details/122526748

版权

爬取数据

通过selenium爬取2021年世界500强企业数据

import time
import requests
import csv
from selenium import webdriver

# Target address (a placeholder in this article) and the HTTP request headers.
url = '版权问题，请查看项目地址'

headers = {}
headers['Accept'] = (
    'text/html,application/xhtml+xml,application/xml;q=0.9,'
    'image/webp,image/apng,*/*;q=0.8,'
    'application/signed-exchange;v=b3;q=0.9'
)
headers['Connection'] = 'keep-alive'
# The trailing space of the original User-Agent value is kept on purpose.
headers['User-Agent'] = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/97.0.4692.71 Safari/537.36 '
    'Edg/97.0.1072.55 '
)


def crawler():
    """Scrape the ranking table page by page and append its rows to ``Fortune500.csv``.

    The function first probes ``url`` with ``requests``; if the status code is not
    200 it prints ``'access failed'`` and returns without starting a browser.
    Otherwise it drives Chrome through 10 table pages of 50 rows each, reading the
    five cells of every row (rank, company, income, profit, nation) via XPath and
    appending them to the CSV file. Each written row is also printed.

    The browser is always closed, even when an element lookup fails midway.

    Returns:
        None.
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print('access failed')
        return

    chrome = webdriver.Chrome(r'chromedriver.exe')
    try:
        chrome.get(url)
        # Make navigator.webdriver report undefined; note this runs after the page
        # has already been loaded.
        script = 'Object.defineProperty(navigator,"webdriver",{get:()=>undefined,});'
        chrome.execute_script(script)
        time.sleep(2)

        row_xpath = '//*[@id="table1"]/tbody/tr[{}]/'
        # Column suffixes in output order; the company name sits inside a link.
        cell_suffixes = ('td[1]', 'td[2]/a', 'td[3]', 'td[4]', 'td[5]')
        page_count = 10
        rows_per_page = 50

        # Open the file once. newline='' stops the csv module's own line endings
        # from being translated again, which would leave blank rows on Windows.
        with open(r'Fortune500.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for page in range(1, page_count + 1):
                for j in range(1, rows_per_page + 1):
                    # 'xpath' is the locator-strategy string behind By.XPATH; the
                    # generic find_element() exists in both Selenium 3 and 4, unlike
                    # the removed find_element_by_xpath() helper.
                    row = [
                        chrome.find_element('xpath', row_xpath.format(j) + suffix).text
                        for suffix in cell_suffixes
                    ]
                    writer.writerow(row)
                    print(row)
                # There is no further page to load after the last one.
                if page < page_count:
                    chrome.find_element('xpath', '//*[@id="table1_next"]').click()
    finally:
        # Always shut the browser and driver process down.
        chrome.quit()

# Start the crawl only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    crawler()

写入csv后

使用openrefine进行清洗

清洗后得到

作图小分析

from pywebio.output import put_html
import csv
from pyecharts.charts import Bar, Pie
from pyecharts import options as opts


def handle():
    """Build each chart in turn and hand its HTML to ``put_html``.

    The order is: country bar chart, country pie chart, profit-ratio chart,
    income/profit chart.
    """
    chart_builders = (proportionBar, proportionPie, Proportion, incomeProfit)
    for build_chart in chart_builders:
        put_html(build_chart())


def proportionBar():
    """Build a bar chart of how many listed companies each country has.

    Reads ``Fortune500After.csv`` (UTF-8), skips its first line, and counts the
    remaining rows by the value of their fifth column. Rows with fewer than five
    columns, including the empty rows ``csv.reader`` yields for blank lines, are
    skipped instead of raising ``IndexError``.

    Returns:
        The result of ``Bar.render_notebook()`` for the finished chart.
    """
    nationDict = {}
    # newline='' leaves newline handling to the csv module.
    with open(r'Fortune500After.csv', encoding='utf-8', newline='') as jd:
        jd.readline()  # skip the first line
        for row in csv.reader(jd):
            if len(row) < 5:
                continue  # blank or truncated row: nothing to count
            nationDict[row[4]] = nationDict.get(row[4], 0) + 1

    # Keys and values are taken from the same dict, so both lists share one order.
    nationKeyList = list(nationDict)
    nationValueList = list(nationDict.values())

    bar = Bar()
    bar.add_xaxis(nationKeyList)
    bar.add_yaxis("世界500强数量", nationValueList)
    bar.set_global_opts(title_opts=opts.TitleOpts(title="各个国家拥有世界500强企业"),
                        xaxis_opts=opts.AxisOpts(name_rotate=60, axislabel_opts={"rotate": 45}))

    return bar.render_notebook()


def proportionPie():
    """Build a pie chart of how many listed companies each country has.

    Reads ``Fortune500After.csv`` (UTF-8), skips its first line, and counts the
    remaining rows by the value of their fifth column. Rows with fewer than five
    columns, including the empty rows ``csv.reader`` yields for blank lines, are
    skipped instead of raising ``IndexError``.

    Returns:
        The result of ``Pie.render_notebook()`` for the finished chart.
    """
    nationDict = {}
    # newline='' leaves newline handling to the csv module.
    with open(r'Fortune500After.csv', encoding='utf-8', newline='') as jd:
        jd.readline()  # skip the first line
        for row in csv.reader(jd):
            if len(row) < 5:
                continue  # blank or truncated row: nothing to count
            nationDict[row[4]] = nationDict.get(row[4], 0) + 1

    # The series data is a list of [name, count] pairs.
    pairs = [[nation, count] for nation, count in nationDict.items()]
    pie = Pie()
    pie.add('数量', pairs, radius='45%', center=["50%", "65%"])
    return pie.render_notebook()


def incomeProfit():
    """Build a horizontal bar chart of income and profit per company.

    Reads ``Fortune500After.csv`` (UTF-8), skips its first line, and keeps every
    row whose fourth column (profit) is a number greater than 0. The second
    column is used as the company label and the third column as the income.

    Both numbers are parsed before anything is appended, so a row with a missing
    or non-numeric value is dropped as a whole and the three lists always stay
    the same length.

    Returns:
        The result of ``Bar.render_notebook()`` for the finished chart.
    """
    company = []
    income = []
    profit = []
    # newline='' leaves newline handling to the csv module.
    with open(r'Fortune500After.csv', encoding='utf-8', newline='') as jd:
        jd.readline()  # skip the first line
        for row in csv.reader(jd):
            try:
                name = row[1]
                row_income = float(row[2])
                row_profit = float(row[3])
            except (IndexError, ValueError):
                # Short row or non-numeric cell: skip the whole row.
                continue
            if row_profit > 0:
                company.append(name)
                income.append(row_income)
                profit.append(row_profit)

    bar = Bar(init_opts=opts.InitOpts(width='4000px', height='30000px'))
    bar.add_xaxis(company)
    bar.add_yaxis("营业收入", income)
    bar.add_yaxis("利润", profit)
    bar.reversal_axis()
    bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    bar.set_global_opts(title_opts=opts.TitleOpts(title="营业收入与利润（不包括利润小于0）"))
    return bar.render_notebook()


def Proportion():
    """Build a horizontal bar chart of profit as a percentage of income.

    Reads ``Fortune500After.csv`` (UTF-8), skips its first line, and keeps every
    row whose fourth column (profit) is a number greater than 0. The ratio is
    ``profit / income * 100``, with income taken from the third column and the
    company label from the second.

    The ratio is computed before anything is appended, so a row with a missing,
    non-numeric or zero income is dropped as a whole and the label list and the
    ratio list always stay the same length.

    Returns:
        The result of ``Bar.render_notebook()`` for the finished chart.
    """
    company = []
    proportion = []
    # newline='' leaves newline handling to the csv module.
    with open(r'Fortune500After.csv', encoding='utf-8', newline='') as jd:
        jd.readline()  # skip the first line
        for row in csv.reader(jd):
            try:
                name = row[1]
                row_income = float(row[2])
                row_profit = float(row[3])
            except (IndexError, ValueError):
                # Short row or non-numeric cell: skip the whole row.
                continue
            if not row_profit > 0:
                continue
            try:
                ratio = row_profit / row_income * 100
            except ZeroDivisionError:
                # No ratio can be computed for a zero income.
                continue
            company.append(name)
            proportion.append(ratio)

    bar = Bar(init_opts=opts.InitOpts(width='4000px', height='30000px'))
    bar.add_xaxis(company)
    bar.add_yaxis("利润率", proportion)
    bar.reversal_axis()
    bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    bar.set_global_opts(title_opts=opts.TitleOpts(title="利润率分析（不包括利润小于0）"))
    return bar.render_notebook()


# Render the charts only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    handle()

运行后，通过随机端口打开的html页面查看