💗About the author: ✌10K+ followers across the web, CSDN author, blog expert, high-quality full-stack creator, Blog Star, platform quality author, focused on Java, mini-program development, and hands-on graduation projects✌💗
🌟Get the source code + database at the end of the article🌟
Feel free to bookmark this post first. If you have questions about thesis topic selection, the project itself, or writing the paper, leave me a message; I hope to help more people.
Detailed video demo:
Contact me for a more detailed demo video.
Implementation screenshots:
System introduction:
In recent years, with the rapid development of big data technology, power energy consumption analysis systems have received wide attention and adoption in China. Domestic scholars and researchers have carried out extensive work in this field, covering data cleaning, feature extraction, and model training. Spark-based power consumption analysis systems have become a research hotspot: many researchers use Spark to analyze consumption data and derive usage trends across time periods, load distribution, and the structure of energy consumption, providing a scientific basis for production management and decision-making in power enterprises.
Abroad, power consumption data analysis has likewise attracted broad attention, with many research institutions and companies carrying out related work; the United States and Europe currently lead the field. Researchers there apply advanced techniques such as deep learning and neural networks to consumption data, and place strong emphasis on moving research results into production to give power enterprises more intelligent decision support.
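Before the crawler code below, here is a minimal PySpark sketch of the kind of per-period aggregation this analysis relies on. It is only an illustration under stated assumptions: the input file energy_usage.csv and the columns record_time, region, and kwh are hypothetical placeholders, not the project's actual schema.
# A minimal PySpark sketch of the per-period consumption analysis described
# above; file and column names are hypothetical placeholders.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("EnergyConsumptionTrend").getOrCreate()

# Assumed layout: record_time (timestamp), region (string), kwh (double)
df = spark.read.csv("energy_usage.csv", header=True, inferSchema=True)

# Usage trend: total consumption per hour of the day
hourly = (df.withColumn("hour", F.hour("record_time"))
            .groupBy("hour")
            .agg(F.sum("kwh").alias("total_kwh"))
            .orderBy("hour"))

# Load distribution: total consumption per region
by_region = df.groupBy("region").agg(F.sum("kwh").alias("total_kwh"))

hourly.show()
by_region.show()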
Partial code for reference:
# -*- coding: utf-8 -*-
# Data crawling file
import scrapy
import pymysql
import pymssql
from ..items import DianlixinxiItem
import time
from datetime import datetime,timedelta
import datetime as formattime
import re
import random
import platform
import json
import os
import urllib
from urllib.parse import urlparse
import requests
import emoji
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from selenium.webdriver import ChromeOptions, ActionChains
from scrapy.http import TextResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Power information spider
class DianlixinxiSpider(scrapy.Spider):
    name = 'dianlixinxiSpider'
    spiderUrl = 'http://www.chinapower.com.cn/sj/index.html'
    start_urls = spiderUrl.split(";")
    protocol = ''
    hostname = ''
    realtime = False

    def __init__(self, realtime=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.realtime = realtime == 'true'
    def start_requests(self):
        plat = platform.system().lower()
        if not self.realtime and (plat == 'linux' or plat == 'windows'):
            connect = self.db_connect()
            cursor = connect.cursor()
            if self.table_exists(cursor, '5129hf00_dianlixinxi') == 1:
                cursor.close()
                connect.close()
                self.temp_data()
                return
        pageNum = 1 + 1
        for url in self.start_urls:
            if '{}' in url:
                for page in range(1, pageNum):
                    next_link = url.format(page)
                    yield scrapy.Request(
                        url=next_link,
                        callback=self.parse
                    )
            else:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse
                )
    # Parse the listing page
    def parse(self, response):
        _url = urlparse(self.spiderUrl)
        self.protocol = _url.scheme
        self.hostname = _url.netloc
        plat = platform.system().lower()
        # When not in realtime mode, reuse cached data if the staging table exists
        if not self.realtime and (plat == 'linux' or plat == 'windows'):
            connect = self.db_connect()
            cursor = connect.cursor()
            if self.table_exists(cursor, '5129hf00_dianlixinxi') == 1:
                cursor.close()
                connect.close()
                self.temp_data()
                return
        item_list = response.css('div.list div.item')
        for item in item_list:
            fields = DianlixinxiItem()
            # Title, description, publish date and source link from the list item
            try:
                fields["title"] = str(self.remove_html(item.css('h2 a::text').extract_first()))
            except:
                pass
            try:
                fields["miaoshu"] = str(self.remove_html(item.css('div.desc::text').extract_first()))
            except:
                pass
            try:
                fields["faburiqi"] = str(self.remove_html(item.css('div.info span::text').extract_first()))
            except:
                pass
            try:
                fields["laiyuan"] = str('http://www.chinapower.com.cn' + self.remove_html(item.css('h2 a::attr(href)').extract_first()))
            except:
                pass
            # Build an absolute URL for the detail page
            detailUrlRule = 'http://www.chinapower.com.cn' + item.css('h2 a::attr(href)').extract_first()
            if self.protocol in detailUrlRule:
                pass
            elif detailUrlRule.startswith('//'):
                detailUrlRule = self.protocol + ':' + detailUrlRule
            elif detailUrlRule.startswith('/'):
                detailUrlRule = self.protocol + '://' + self.hostname + detailUrlRule
            else:
                detailUrlRule = self.protocol + '://' + self.hostname + '/' + detailUrlRule
            yield scrapy.Request(url=detailUrlRule, meta={'fields': fields}, callback=self.detail_parse, dont_filter=True)
    # Parse the detail page
    def detail_parse(self, response):
        fields = response.meta['fields']
        # Publishing site
        try:
            fields["fabuwang"] = str(response.xpath('''/html/body/div[2]/div[3]/div[1]/div[1]/text()[1]''').extract()[0].strip())
        except:
            pass
        # Author, captured with a regex against the raw page source
        try:
            fields["zuozhe"] = str(re.findall(r'''<small> 作者:(.*?)</small>''', response.text, re.S)[0].strip())
        except:
            pass
        # Article body: keep the HTML of div.content, demojizing any emoji
        try:
            fields["detail"] = str(emoji.demojize(response.css('''div.content''').extract_first()))
        except:
            pass
        return fields
    # Data cleaning (pandas demo)
    def pandas_filter(self):
        engine = create_engine('mysql+pymysql://root:123456@localhost/spider5129hf00?charset=UTF8MB4')
        df = pd.read_sql('select * from dianlixinxi limit 50', con=engine)

        # Drop duplicate rows
        df = df.drop_duplicates()
        # Drop rows containing nulls (or fill them with a placeholder instead)
        df = df.dropna()
        df = df.fillna(value='暂无')

        # Outlier filtering: keep values between 100 and 800
        a = np.random.randint(0, 1000, size=200)
        cond = (a <= 800) & (a >= 100)
        a = a[cond]

        # 3-sigma rule on a standard normal sample (sigma = 1)
        b = np.random.randn(100000)
        cond = np.abs(b) > 3 * 1
        b = b[cond]

        # 3-sigma rule on a DataFrame of normally distributed data
        df2 = pd.DataFrame(data=np.random.randn(10000, 3))
        cond = (df2.abs() > 3 * df2.std()).any(axis=1)
        # Row index of the offending rows, then drop them
        index = df2[cond].index
        df2 = df2.drop(labels=index, axis=0)
    # Strip HTML tags from a string
    def remove_html(self, html):
        if html is None:
            return ''
        pattern = re.compile(r'<[^>]+>', re.S)
        return pattern.sub('', html).strip()
    # Database connection
    def db_connect(self):
        type = self.settings.get('TYPE', 'mysql')
        host = self.settings.get('HOST', 'localhost')
        port = int(self.settings.get('PORT', 3306))
        user = self.settings.get('USER', 'root')
        password = self.settings.get('PASSWORD', '123456')
        try:
            database = self.databaseName
        except:
            database = self.settings.get('DATABASE', '')
        if type == 'mysql':
            connect = pymysql.connect(host=host, port=port, db=database, user=user, passwd=password, charset='utf8')
        else:
            connect = pymssql.connect(host=host, user=user, password=password, database=database)
        return connect
    # Check whether a table exists
    def table_exists(self, cursor, table_name):
        cursor.execute("show tables;")
        tables = [row[0] for row in cursor.fetchall()]
        return 1 if table_name in tables else 0
    # Copy newly crawled rows from the staging table into the main table
    def temp_data(self):
        connect = self.db_connect()
        cursor = connect.cursor()
        sql = '''
insert into `dianlixinxi`(
id
,title
,miaoshu
,faburiqi
,fabuwang
,zuozhe
,laiyuan
,detail
)
select
id
,title
,miaoshu
,faburiqi
,fabuwang
,zuozhe
,laiyuan
,detail
from `5129hf00_dianlixinxi`
where(not exists (select
id
,title
,miaoshu
,faburiqi
,fabuwang
,zuozhe
,laiyuan
,detail
from `dianlixinxi` where
`dianlixinxi`.id=`5129hf00_dianlixinxi`.id
))
        '''
        cursor.execute(sql)
        connect.commit()
        connect.close()
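For reference, here is a minimal sketch of launching the spider above programmatically. It assumes the spider sits in a standard Scrapy project whose settings module provides the TYPE/HOST/PORT/USER/PASSWORD/DATABASE values read by db_connect; those assumptions are noted in the comments.
# Minimal launch sketch (assumption: standard Scrapy project layout with the
# settings described above).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# realtime='true' bypasses the cached staging table and forces a fresh crawl
process.crawl(DianlixinxiSpider, realtime='true')
process.start()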
Thesis reference:
Source code:
Contact me via the card at the bottom of this article~
Please like, bookmark, follow, and comment 👇🏻 see below for contact information 👇🏻