💗About the author: ✌10K+ followers across the web, CSDN author, blog expert, high-quality full-stack creator, Blog Star, platform quality author, focused on Java, mini-program development, and hands-on graduation projects✌💗
🌟Get the source code + database at the end of the article🌟
Feel free to bookmark this post first. If you have questions about thesis topic selection, the project itself, or writing the paper, leave me a message; I hope to help more people.
Detailed video demo:
Contact me for a more detailed demo video.
Implementation screenshots:
System introduction:
In recent years, with the rapid development of big data technology, power energy consumption analysis systems have received wide attention and adoption in China. Domestic scholars and researchers have carried out extensive work in this field, covering data cleaning, feature extraction, and model training. Spark-based power consumption analysis systems have become a research hotspot: many researchers use Spark to analyze consumption data and derive usage trends across time periods, load distribution, and the structure of energy consumption, providing a scientific basis for production management and decision-making in power enterprises.
Abroad, power consumption data analysis has likewise attracted broad attention, with many research institutions and companies carrying out related work; the United States and Europe currently lead the field. Researchers there apply advanced techniques such as deep learning and neural networks to consumption data, and place strong emphasis on moving research results into production to give power enterprises more intelligent decision support.
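Before the crawler code below, here is a minimal PySpark sketch of the kind of per-period aggregation this analysis relies on. It is only an illustration under stated assumptions: the input file energy_usage.csv and the columns record_time, region, and kwh are hypothetical placeholders, not the project's actual schema.
# A minimal PySpark sketch of the per-period consumption analysis described
# above; file and column names are hypothetical placeholders.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("EnergyConsumptionTrend").getOrCreate()

# Assumed layout: record_time (timestamp), region (string), kwh (double)
df = spark.read.csv("energy_usage.csv", header=True, inferSchema=True)

# Usage trend: total consumption per hour of the day
hourly = (df.withColumn("hour", F.hour("record_time"))
            .groupBy("hour")
            .agg(F.sum("kwh").alias("total_kwh"))
            .orderBy("hour"))

# Load distribution: total consumption per region
by_region = df.groupBy("region").agg(F.sum("kwh").alias("total_kwh"))

hourly.show()
by_region.show()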
Partial code for reference:
# -*- coding: utf-8 -*-
# Data crawling file
import scrapy
import pymysql
import pymssql
from ..items import DianlixinxiItem
import time
from datetime import datetime,timedelta
import datetime as formattime
import re
import random
import platform
import json
import os
import urllib
from urllib.parse import urlparse
import requests
import emoji
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from selenium.webdriver import ChromeOptions, ActionChains
from scrapy.http import TextResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Power information spider
class DianlixinxiSpider(scrapy.Spider):
    name = 'dianlixinxiSpider'
    spiderUrl = 'http://www.chinapower.com.cn/sj/index.html'
    start_urls = spiderUrl.split(";")
    protocol = ''
    hostname = ''
    realtime = False

    def __init__(self, realtime=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.realtime = realtime == 'true'
    def start_requests(self):
        plat = platform.system().lower()
        if not self.realtime and (plat == 'linux' or plat == 'windows'):
            connect = self.db_connect()
            cursor = connect.cursor()
            if self.table_exists(cursor, '5129hf00_dianlixinxi') == 1:
                cursor.close()
                connect.close()
                self.temp_data()
                return
        pageNum = 1 + 1
        for url in self.start_urls:
            if '{}' in url:
                for page in range(1, pageNum):
                    next_link = url.format(page)
                    yield scrapy.Request(
                        url=next_link,
                        callback=self.parse
                    )
            else:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse
                )
    # Parse the listing page
    def parse(self, response):
        _url = urlparse(self.spiderUrl)
        self.protocol = _url.scheme
        self.hostname = _url.netloc
        plat = platform.system().lower()
        # When not in realtime mode, reuse cached data if the staging table exists
        if not self.realtime and (plat == 'linux' or plat == 'windows'):
            connect = self.db_connect()
            cursor = connect.cursor()
            if self.table_exists(cursor, '5129hf00_dianlixinxi') == 1:
                cursor.close()
                connect.close()
                self.temp_data()
                return
        item_list = response.css('div.list div.item')
        for item in item_list:
            fields = DianlixinxiItem()
            # Title, description, publish date and source link from the list item
            try:
                fields["title"] = str(self.remove_html(item.css('h2 a::text').extract_first()))
            except:
                pass
            try:
                fields["miaoshu"] = str(self.remove_html(item.css('div.desc::text').extract_first()))
            except:
                pass
            try:
                fields["faburiqi"] = str(self.remove_html(item.css('div.info span::text').extract_first()))
            except:
                pass
            try:
                fields["laiyuan"] = str('http://www.chinapower.com.cn' + self.remove_html(item.css('h2 a::attr(href)').extract_first()))
            except:
                pass
            # Build an absolute URL for the detail page
            detailUrlRule = 'http://www.chinapower.com.cn' + item.css('h2 a::attr(href)').extract_first()
            if self.protocol in detailUrlRule:
                pass
            elif detailUrlRule.startswith('//'):
                detailUrlRule = self.protocol + ':' + detailUrlRule
            elif detailUrlRule.startswith('/'):
                detailUrlRule = self.protocol + '://' + self.hostname + detailUrlRule
            else:
                detailUrlRule = self.protocol + '://' + self.hostname + '/' + detailUrlRule
            yield scrapy.Request(url=detailUrlRule, meta={'fields': fields}, callback=self.detail_parse, dont_filter=True)
    # Parse the detail page
    def detail_parse(self, response):
        fields = response.meta['fields']
        # Publishing site
        try:
            fields["fabuwang"] = str(response.xpath('''/html/body/div[2]/div[3]/div[1]/div[1]/text()[1]''').extract()[0].strip())
        except:
            pass
        # Author, captured with a regex against the raw page source
        try:
            fields["zuozhe"] = str(re.findall(r'''<small> 作者:(.*?)</small>''', response.text, re.S)[0].strip())
        except:
            pass
        # Article body: keep the HTML of div.content, demojizing any emoji
        try:
            fields["detail"] = str(emoji.demojize(response.css('''div.content''').extract_first()))
        except:
            pass
        return fields
    # Data cleaning (pandas demo)
    def pandas_filter(self):
        engine = create_engine('mysql+pymysql://root:123456@localhost/spider5129hf00?charset=UTF8MB4')
        df = pd.read_sql('select * from dianlixinxi limit 50', con=engine)

        # Drop duplicate rows
        df = df.drop_duplicates()
        # Drop rows containing nulls (or fill them with a placeholder instead)
        df = df.dropna()
        df = df.fillna(value='暂无')

        # Outlier filtering: keep values between 100 and 800
        a = np.random.randint(0, 1000, size=200)
        cond = (a <= 800) & (a >= 100)
        a = a[cond]

        # 3-sigma rule on a standard normal sample (sigma = 1)
        b = np.random.randn(100000)
        cond = np.abs(b) > 3 * 1
        b = b[cond]

        # 3-sigma rule on a DataFrame of normally distributed data
        df2 = pd.DataFrame(data=np.random.randn(10000, 3))
        cond = (df2.abs() > 3 * df2.std()).any(axis=1)
        # Row index of the offending rows, then drop them
        index = df2[cond].index
        df2 = df2.drop(labels=index, axis=0)
    # Strip HTML tags from a string
    def remove_html(self, html):
        if html is None:
            return ''
        pattern = re.compile(r'<[^>]+>', re.S)
        return pattern.sub('', html).strip()
    # Database connection
    def db_connect(self):
        type = self.settings.get('TYPE', 'mysql')
        host = self.settings.get('HOST', 'localhost')
        port = int(self.settings.get('PORT', 3306))
        user = self.settings.get('USER', 'root')
        password = self.settings.get('PASSWORD', '123456')
        try:
            database = self.databaseName
        except:
            database = self.settings.get('DATABASE', '')
        if type == 'mysql':
            connect = pymysql.connect(host=host, port=port, db=database, user=user, passwd=password, charset='utf8')
        else:
            connect = pymssql.connect(host=host, user=user, password=password, database=database)
        return connect
    # Check whether a table exists
    def table_exists(self, cursor, table_name):
        cursor.execute("show tables;")
        tables = [row[0] for row in cursor.fetchall()]
        return 1 if table_name in tables else 0
    # Copy newly crawled rows from the staging table into the main table
    def temp_data(self):
        connect = self.db_connect()
        cursor = connect.cursor()
        sql = '''
insert into `dianlixinxi`(
id
,title
,miaoshu
,faburiqi
,fabuwang
,zuozhe
,laiyuan
,detail
)
select
id
,title
,miaoshu
,faburiqi
,fabuwang
,zuozhe
,laiyuan
,detail
from `5129hf00_dianlixinxi`
where(not exists (select
id
,title
,miaoshu
,faburiqi
,fabuwang
,zuozhe
,laiyuan
,detail
from `dianlixinxi` where
`dianlixinxi`.id=`5129hf00_dianlixinxi`.id
))
        '''
        cursor.execute(sql)
        connect.commit()
        connect.close()
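For reference, here is a minimal sketch of launching the spider above programmatically. It assumes the spider sits in a standard Scrapy project whose settings module provides the TYPE/HOST/PORT/USER/PASSWORD/DATABASE values read by db_connect; those assumptions are noted in the comments.
# Minimal launch sketch (assumption: standard Scrapy project layout with the
# settings described above).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# realtime='true' bypasses the cached staging table and forces a fresh crawl
process.crawl(DianlixinxiSpider, realtime='true')
process.start()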
Thesis reference:
Source code:
Contact me via the card at the bottom of this article~
Please like, bookmark, follow, and comment 👇🏻 see below for contact information 👇🏻