一、item编写
import scrapy
class GzweatherItem(scrapy.Item):
    """Item holding one day of Guangzhou weather history.

    All field values are scraped as strings from lishi.tianqi.com.
    """
    date = scrapy.Field()     # date of the record, e.g. '2011-01-01'
    maxtemp = scrapy.Field()  # daily maximum temperature
    mintemp = scrapy.Field()  # daily minimum temperature
    weather = scrapy.Field()  # weather description
    wind = scrapy.Field()     # wind direction
    power = scrapy.Field()    # wind power/strength
二、pipeline编写
from scrapy.exceptions import DropItem
class GzweatherPipeline(object):
    """Pass-through item pipeline.

    The original ``process_item`` wrapped its return in ``if True:`` with an
    ``else`` branch raising ``DropItem('reason')`` that could never execute;
    the dead branch is removed.  Raise ``DropItem`` here if item filtering
    is ever needed.
    """

    def process_item(self, item, spider):
        """Return *item* unchanged so it continues down the pipeline."""
        return item
import scrapy
from gzweather.items import GzweatherItem
class TtffSpider(scrapy.Spider):
    """Spider that crawls historical Guangzhou weather from lishi.tianqi.com.

    Starting at the January 2011 monthly page, :meth:`parse` yields one
    ``GzweatherItem`` per day and follows the last pagination link so the
    following months are visited as well.
    """

    # Unique spider name (required); several spider instances may be created.
    name = 'tianqi'
    # Only URLs under these domains are followed.
    allowed_domains = ['lishi.tianqi.com']
    # Crawling starts from this monthly history page when no URL is given.
    start_urls = ['http://lishi.tianqi.com/guangzhou/201101.html']
    # Per-spider settings; entries here override project-level settings.
    custom_settings = {}

    def parse(self, response):
        """Default callback for downloaded responses.

        Must return an iterable of Request, dict or Item objects; here it
        yields one GzweatherItem per table row plus follow-up Requests.
        """
        self.logger.info('A response from %s just arrived!', response.url)
        # Responses in this project are HTML; wrap in a Selector for XPath.
        sel = scrapy.Selector(response)
        title = sel.xpath('//title/text()').extract_first()
        print('打印输出**************')
        print(title)
        # Each <ul> inside the stats table is one day; index 0 is the header.
        uls = sel.xpath('//div[@class="tqtongji2"]/ul')
        for index, ul in enumerate(uls):
            if index == 0:
                continue  # skip the header row before building an item
            args = ul.xpath('li/text()').extract()
            gzitem = GzweatherItem()
            if len(args) == 5:
                # On these rows the date cell is wrapped in a link.
                gzitem['date'] = ul.xpath('li/a/text()').extract()[0]
                gzitem['maxtemp'] = args[0]
                gzitem['mintemp'] = args[1]
                gzitem['weather'] = args[2]
                gzitem['wind'] = args[3]
                gzitem['power'] = args[4]
                yield gzitem
            elif len(args) == 6:
                # On these rows the date cell is plain text.
                gzitem['date'] = args[0]
                gzitem['maxtemp'] = args[1]
                gzitem['mintemp'] = args[2]
                gzitem['weather'] = args[3]
                gzitem['wind'] = args[4]
                gzitem['power'] = args[5]
                yield gzitem
        print('#####')
        print('#####')
        # Follow the last pagination link ("next month") at the page bottom.
        for url in sel.xpath('//div[contains(@id, "tool_site")]/div[1]/span[1]/a[last()]/@href').extract():
            print(url)
            yield scrapy.Request(url, self.parse)
# log(message[, level, component])
# 使用scrapy.log.msg()方法记录(log)message
# 自动带上该spider的name属性,封装了通过logger来发送log消息的方法,向后兼容
# closed(reason)spider关闭时调用
# 替代调用signals.connect()来监听spider_closed信号的快捷方式
#文档中运行
#from scrapy.cmdline import execute
#execute()
四、获得数据
五、数据处理
# encoding: utf-8
import pandas as pd

# Raw CSV produced by the gzweather spider, and the cleaned output path.
filename = r'E:\Users\3404\gzweather.csv'
outpath = r'E:\Users\3404\newgzweather.csv'

if __name__ == '__main__':
    df = pd.read_csv(filename, header=None)
    print('先看一下')
    print(df.head())
    # Promote the first row to column names, then discard that row.
    df.columns = df.loc[0]
    df = df.drop(0)
    # Index by date so the rows sort chronologically.
    df.index = df['date'].values
    df = df.sort_index()
    print('排序调整一下')
    print(df.head())
    # Drop duplicated rows, the now-redundant date column, and any row
    # with missing values before writing the cleaned file.
    df = df.drop_duplicates().drop('date', axis=1).dropna(how='any')
    print('最后')
    print(df.head())
    df.to_csv(outpath)
六、画图显示
# encoding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Cleaned CSV produced by the preprocessing step.
# NOTE(review): the doubled backslash before 'newgzweather.csv' inside this
# raw string yields a literal '\\' in the path — confirm the file resolves.
inputpath = r'E:\Users\3404\Desktop\\newgzweather.csv'
def converttoint(series11):
    """Convert every value of *series11* to a plain Python int.

    Accepts anything with a ``tolist()`` method (e.g. a pandas Series) and
    returns a list of ints; float values are truncated as ``int`` does.
    """
    return list(map(int, series11.tolist()))
if __name__ == '__main__':
    df = pd.read_csv(inputpath, header=None, encoding="ISO-8859-1",
                     names=['maxtemp', 'power', 'mintemp', 'weather', 'wind'])
    years = ['2011', '2012', '2013', '2014', '2015']
    # December daily low/high temperatures for each year, as lists of ints.
    lowlists = [converttoint(df.loc['%s-12-01' % y:'%s-12-31' % y, 'mintemp'])
                for y in years]
    highlists = [converttoint(df.loc['%s-12-01' % y:'%s-12-31' % y, 'maxtemp'])
                 for y in years]

    # Line chart: one line of December daily lows per year.
    fig = plt.figure()
    for year, lows in zip(years, lowlists):
        plt.plot(range(31), lows, label=year)
    plt.xlabel('12-01 to 12-31')
    plt.ylabel('tempature')
    plt.title('tempature variation in past 5 years')
    plt.legend(loc='best')
    plt.show()

    # Stacked bars: mean December low per year, topped by the mean
    # high-minus-low gap, with standard deviations as error bars.
    meantemps = [np.array(lows).mean() for lows in lowlists]
    meantemphs = [np.array(highs).mean() - mean
                  for highs, mean in zip(highlists, meantemps)]
    stdtemps = [np.array(lows).std() for lows in lowlists]
    stdtemphs = [np.array(highs).std() for highs in highlists]
    ind = np.arange(5)
    width = 0.35
    p1 = plt.bar(ind, meantemps, width, color='r', yerr=stdtemps)
    p2 = plt.bar(ind, meantemphs, width, color='y',
                 bottom=meantemps, yerr=stdtemphs)
    plt.ylabel('tempature')
    plt.title('mean of mintempature and mean of maxtempature in past 5 years')
    plt.xticks(ind + width/2., ('2011', '2012', '2013', '2014', '2015'))
    plt.legend((p1[0], p2[0]), ('mintempature', 'delttempature'))
    plt.show()