Scraping and Visualizing Weather Data with Scrapy

一、Writing the Item

import scrapy


class GzweatherItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title = scrapy.Field()
    date = scrapy.Field()
    maxtemp = scrapy.Field()
    mintemp = scrapy.Field()
    weather = scrapy.Field()
    wind = scrapy.Field()
    power = scrapy.Field()
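
A GzweatherItem behaves much like a Python dict; a quick sketch of how the spider below fills one in (the values here are made up):

from gzweather.items import GzweatherItem

item = GzweatherItem()
item['date'] = '2011-01-01'    # fields are set and read dict-style
item['maxtemp'] = '18'
print(item.get('mintemp'))     # unset fields come back as None via .get()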

二、Writing the Pipeline

from scrapy.exceptions import DropItem


class GzweatherPipeline(object):

	def process_item(self, item, spider):
		# Return the item to keep it; raise DropItem to discard it.
		# Here we drop records that are missing a date.
		if item.get('date'):
			return item
		else:
			raise DropItem('missing date in %s' % item)
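
The pipeline only takes effect once it is enabled in the project settings. A minimal sketch, assuming the default project layout (module path gzweather.pipelines):

# settings.py -- enable the pipeline; the number (0-1000) sets its run order
ITEM_PIPELINES = {
    'gzweather.pipelines.GzweatherPipeline': 300,
}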


三、Writing the Spider

import scrapy
from gzweather.items import GzweatherItem



class TtffSpider(scrapy.Spider):
	# name is the string that uniquely identifies this spider; it is required.
	# Multiple spider instances can be created from the same class.
	name = 'tianqi'
	# allowed_domains is the list of domains this spider is allowed to crawl.
	allowed_domains = ['lishi.tianqi.com']
	# start_urls is a list of URLs.
	# When no particular URLs are specified, the spider starts crawling from this list.
	start_urls = ['http://lishi.tianqi.com/guangzhou/201101.html']
	# custom_settings is a dict; when the spider starts, these settings
	# override the project-wide ones.
	custom_settings = {}
	# crawler is set by the class method from_crawler() after the class is
	# initialized, and links this spider instance to its Crawler object.
	# The Crawler wraps many project components behind a single entry point
	# (extensions, middlewares, the signal manager, and so on).

	# settings is a Settings instance.

	# logger is created with the spider's name and can be used to emit log messages.

	# from_crawler() sets the crawler and settings attributes.

	# start_requests() returns an iterable with the first Requests to crawl.
	# Scrapy calls it only once, so it can be implemented as a generator.

	# make_requests_from_url() returns a Request object used for crawling.

	# parse() is the default callback.
	# It handles the response and returns scraped data and/or follow-up URLs.
	# The spider's other Request callbacks have the same requirements.
	def parse(self, response):
		# A callback must return an iterable of Requests, dicts or Items.
		self.logger.info('A response from %s just arrived!', response.url)
		# In this project every response is an HtmlResponse,
		# so we can instantiate a Selector directly.
		sel = scrapy.Selector(response)
		title = sel.xpath('//title/text()').extract_first()
		print('Page title **************')
		print(title)
		uls = sel.xpath('//div[@class="tqtongji2"]/ul')
		for index, ul in enumerate(uls):
			gzitem = GzweatherItem()
			if index == 0:
				# The first <ul> is the table header row; skip it.
				continue
			args = ul.xpath('li/text()').extract()
			if len(args) == 5:
				# On these rows the date is wrapped in an <a> tag.
				gzitem['date'] = ul.xpath('li/a/text()').extract()[0]
				gzitem['maxtemp'] = args[0]
				gzitem['mintemp'] = args[1]
				gzitem['weather'] = args[2]
				gzitem['wind'] = args[3]
				gzitem['power'] = args[4]
				yield gzitem
			elif len(args) == 6:
				gzitem['date'] = args[0]
				gzitem['maxtemp'] = args[1]
				gzitem['mintemp'] = args[2]
				gzitem['weather'] = args[3]
				gzitem['wind'] = args[4]
				gzitem['power'] = args[5]
				yield gzitem
		# Follow the "next month" link and parse it with the same callback.
		for url in sel.xpath('//div[contains(@id, "tool_site")]/div[1]/span[1]/a[last()]/@href').extract():
			print(url)
			yield scrapy.Request(url, self.parse)
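
	# A sketch of start_requests() as a generator, if one wanted to seed all
	# months up front instead of following "next month" links (the month range
	# is made up for illustration; enabling this would bypass start_urls):
	#
	# def start_requests(self):
	# 	for month in range(1, 13):
	# 		url = 'http://lishi.tianqi.com/guangzhou/2011%02d.html' % month
	# 		yield scrapy.Request(url, callback=self.parse)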

	# log(message[, level, component])
	# Logs a message through scrapy.log.msg(), automatically tagged with the
	# spider's name attribute. It wraps the logger and is kept for backward
	# compatibility.

	# closed(reason) is called when the spider closes.
	# It is a shortcut for connecting to the spider_closed signal
	# via signals.connect().

	# To run from a script instead of the command line:
	# from scrapy.cmdline import execute
	# execute()
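
For example, a minimal runner script might look like this (a sketch; it assumes the script sits in the project root of a Scrapy project named gzweather):

# run.py -- launch the 'tianqi' spider and export the items to CSV
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'tianqi', '-o', 'gzweather.csv'])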

四、Getting the Data
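
With the spider in place, the data can be collected from the project directory using Scrapy's built-in CSV feed export:

scrapy crawl tianqi -o gzweather.csv

Note that -o appends to an existing file, so repeated runs can leave extra header rows and duplicate records in the CSV; this is likely why the processing step below reads with header=None and drops duplicates.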

 
五、Data Processing

# encoding: utf-8
import pandas as pd
filename = r'E:\Users\3404\gzweather.csv'
outpath = r'E:\Users\3404\newgzweather.csv'


if __name__ == '__main__':
	df = pd.read_csv(filename, header=None)
	print('A first look:')
	print(df.head())
	# Row 0 holds the column names because the CSV was read with header=None.
	df.columns = df.loc[0]
	df = df.drop(0)
	df.index = df['date'].values
	df = df.sort_index()
	print('After sorting by date:')
	print(df.head())
	df = df.drop_duplicates()
	df = df.drop('date', axis=1)
	df = df.dropna(how='any')
	print('Final result:')
	print(df.head())
	df.to_csv(outpath)
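
When the cleaned file is read back later, parsing the index as dates makes date-based slicing more robust than plain string matching (a sketch; the temperature columns are still strings at this point, hence the to_numeric call):

import pandas as pd

df = pd.read_csv(outpath, index_col=0, parse_dates=True)
df['mintemp'] = pd.to_numeric(df['mintemp'], errors='coerce')  # strings -> numbers
print(df.dtypes)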

六、Plotting the Results

# encoding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
inputpath = r'E:\Users\3404\Desktop\newgzweather.csv'


def converttoint(series11):
	# Convert a Series of numeric strings to a plain list of ints.
	datalist = [int(x) for x in series11.tolist()]
	return datalist


if __name__ == '__main__':
	# With fewer names than columns, pandas uses the leftmost (date) column
	# as the index, which is what makes the date-string slices below work.
	df = pd.read_csv(inputpath, header=None, encoding='ISO-8859-1',
	                 names=['maxtemp', 'power', 'mintemp', 'weather', 'wind'])
	series11 = df.loc['2011-12-01':'2011-12-31', 'mintemp']
	series12 = df.loc['2012-12-01':'2012-12-31', 'mintemp']
	series13 = df.loc['2013-12-01':'2013-12-31', 'mintemp']
	series14 = df.loc['2014-12-01':'2014-12-31', 'mintemp']
	series15 = df.loc['2015-12-01':'2015-12-31', 'mintemp']
	series11h = df.loc['2011-12-01':'2011-12-31', 'maxtemp']
	series12h = df.loc['2012-12-01':'2012-12-31', 'maxtemp']
	series13h = df.loc['2013-12-01':'2013-12-31', 'maxtemp']
	series14h = df.loc['2014-12-01':'2014-12-31', 'maxtemp']
	series15h = df.loc['2015-12-01':'2015-12-31', 'maxtemp']
	fig = plt.figure()
	list11 = converttoint(series11)
	list12 = converttoint(series12)
	list13 = converttoint(series13)
	list14 = converttoint(series14)
	list15 = converttoint(series15)
	plt.plot(range(31), list11, label='2011')
	plt.plot(range(31), list12, label='2012')
	plt.plot(range(31), list13, label='2013')
	plt.plot(range(31), list14, label='2014')
	plt.plot(range(31), list15, label='2015')
	plt.xlabel('12-01 to 12-31')
	plt.ylabel('temperature')
	plt.title('temperature variation in past 5 years')
	plt.legend(loc='best')
	plt.show()
	# Second figure: stacked bars of December means with std-dev error bars.
	m11 = np.array(list11).mean()
	m12 = np.array(list12).mean()
	m13 = np.array(list13).mean()
	m14 = np.array(list14).mean()
	m15 = np.array(list15).mean()
	meantemps = [m11, m12, m13, m14, m15]
	m11h = np.array(converttoint(series11h)).mean() - m11
	m12h = np.array(converttoint(series12h)).mean() - m12
	m13h = np.array(converttoint(series13h)).mean() - m13
	m14h = np.array(converttoint(series14h)).mean() - m14
	m15h = np.array(converttoint(series15h)).mean() - m15
	meantemphs = [m11h, m12h, m13h, m14h, m15h]
	std11 = np.array(list11).std()
	std12 = np.array(list12).std()
	std13 = np.array(list13).std()
	std14 = np.array(list14).std()
	std15 = np.array(list15).std()
	stdtemps = [std11, std12, std13, std14, std15]
	std11h = np.array(converttoint(series11h)).std()
	std12h = np.array(converttoint(series12h)).std()
	std13h = np.array(converttoint(series13h)).std()
	std14h = np.array(converttoint(series14h)).std()
	std15h = np.array(converttoint(series15h)).std()
	stdtemphs = [std11h, std12h, std13h, std14h, std15h]
	ind = np.arange(5)
	width = 0.35
	p1 = plt.bar(ind, meantemps, width, color='r', yerr=stdtemps)
	p2 = plt.bar(ind, meantemphs, width, color='y',
	             bottom=meantemps, yerr=stdtemphs)
	plt.ylabel('temperature')
	plt.title('mean of min and max temperature in past 5 years')
	plt.xticks(ind + width/2., ('2011', '2012', '2013', '2014', '2015'))
	plt.legend((p1[0], p2[0]), ('min temperature', 'delta temperature'))
	plt.show()
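
The same per-December statistics could also be computed more compactly with pandas, avoiding the converttoint round-trips (a sketch, assuming df indexed by date strings as above; note that pandas .std() uses ddof=1 while numpy's default is ddof=0, so the error bars would differ slightly):

years = ['2011', '2012', '2013', '2014', '2015']
mintemp = pd.to_numeric(df['mintemp'], errors='coerce')
meantemps = [mintemp[y + '-12-01':y + '-12-31'].mean() for y in years]
stdtemps = [mintemp[y + '-12-01':y + '-12-31'].std() for y in years]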


