Scraping and Displaying Weather Data with Scrapy

Posted on 2017-08-07 16:35:09

1. Writing the Item

import scrapy


class GzweatherItem(scrapy.Item):
    # Define the fields for your item here, e.g.:
    # name = scrapy.Field()
    date = scrapy.Field()      # date of the observation
    maxtemp = scrapy.Field()   # daily maximum temperature
    mintemp = scrapy.Field()   # daily minimum temperature
    weather = scrapy.Field()   # weather description
    wind = scrapy.Field()      # wind direction
    power = scrapy.Field()     # wind force
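
Scrapy items behave like dicts, which is how the spider below fills them in; a quick illustrative snippet (not part of the project code):

from gzweather.items import GzweatherItem

item = GzweatherItem(date='2011-01-01', maxtemp='18')
item['mintemp'] = '9'
print(item['date'], dict(item))  # key access, or convert to a plain dict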

2. Writing the Pipeline

from scrapy.exceptions import DropItem


class GzweatherPipeline(object):

    def process_item(self, item, spider):
        # Pass complete records through; drop anything without a date.
        if item.get('date'):
            return item
        raise DropItem('missing date')
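
For the pipeline to actually run, it must be enabled in the project's settings.py (assuming the default gzweather project layout; the number controls execution order across pipelines):

ITEM_PIPELINES = {
    'gzweather.pipelines.GzweatherPipeline': 300,
}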


3. Writing the Spider

import scrapy
from gzweather.items import GzweatherItem



class TtffSpider(scrapy.Spider):
    # name uniquely identifies the spider; it is required.
    # Multiple spider instances can be created from one class.
    name = 'tianqi'
    # allowed_domains lists the domains the spider may crawl.
    allowed_domains = ['lishi.tianqi.com']
    # start_urls is the list of URLs the spider starts from
    # when no specific URLs are given.
    start_urls = ['http://lishi.tianqi.com/guangzhou/201101.html']
    # custom_settings is a dict; when the spider starts, it overrides
    # the project-level settings.
    custom_settings = {}
    # crawler is set by the class method from_crawler() after the class is
    # instantiated, and links this spider instance to its Crawler object.
    # The Crawler wraps many project components (extensions, middlewares,
    # the signal manager, etc.) and serves as their single entry point.

    # settings is a Settings instance.

    # logger is created with the spider's name and can be used to emit log messages.

    # from_crawler() sets the crawler and settings attributes.

    # start_requests() returns an iterable with the first Requests to crawl.
    # Scrapy calls it only once, so it can be written as a generator.

    # make_requests_from_url() returns a Request object used for crawling.

    # parse() is the default callback.
    # It handles the response and returns scraped data and/or URLs to follow.
    # Every other Request callback has the same contract.
    def parse(self, response):
        # A callback must return an iterable of Requests, dicts, or Items.
        self.logger.info('A response from %s just arrived!', response.url)
        # In this project the response is always an HtmlResponse,
        # so a Selector can be instantiated from it directly.
        sel = scrapy.Selector(response)
        title = sel.xpath('//title/text()').extract_first()
        print('Scraped page title:')
        print(title)
        # Each <ul> under this div is one row of the history table.
        uls = sel.xpath('//div[@class="tqtongji2"]/ul')
        for index, ul in enumerate(uls):
            if index == 0:
                # The first <ul> is the table header; skip it.
                continue
            gzitem = GzweatherItem()
            args = ul.xpath('li/text()').extract()
            if len(args) == 5:
                # In this layout the date sits inside an <a> tag.
                gzitem['date'] = ul.xpath('li/a/text()').extract_first()
                gzitem['maxtemp'] = args[0]
                gzitem['mintemp'] = args[1]
                gzitem['weather'] = args[2]
                gzitem['wind'] = args[3]
                gzitem['power'] = args[4]
                yield gzitem
            elif len(args) == 6:
                gzitem['date'] = args[0]
                gzitem['maxtemp'] = args[1]
                gzitem['mintemp'] = args[2]
                gzitem['weather'] = args[3]
                gzitem['wind'] = args[4]
                gzitem['power'] = args[5]
                yield gzitem
        # Follow the "next month" link in the pager to keep crawling.
        for url in sel.xpath('//div[contains(@id, "tool_site")]/div[1]/span[1]/a[last()]/@href').extract():
            print(url)
            yield scrapy.Request(url, self.parse)

    # log(message[, level, component])
    # logs a message via scrapy.log.msg(), automatically tagged with the
    # spider's name; it wraps the logger and exists for backward compatibility.

    # closed(reason) is called when the spider closes; it is a shortcut for
    # connecting to the spider_closed signal via signals.connect().

    # To run from a script:
    # from scrapy.cmdline import execute
    # execute()
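
The last comment refers to running the spider from a script rather than the command line. A minimal sketch, assuming the script sits next to scrapy.cfg in the project root:

from scrapy.cmdline import execute

# Equivalent to typing "scrapy crawl tianqi" in a shell.
execute(['scrapy', 'crawl', 'tianqi'])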

4. Getting the Data
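
With the spider in place, the data is obtained through Scrapy's feed export, which writes every yielded item to a CSV file. Run from the project root (note that -o appends if the file already exists, which is why the next section deduplicates):

scrapy crawl tianqi -o gzweather.csv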

 
5. Data Processing

# encoding: utf-8
import pandas as pd
filename = r'E:\Users\3404\gzweather.csv'
outpath = r'E:\Users\3404\newgzweather.csv'


if __name__ == '__main__':
    df = pd.read_csv(filename, header=None)
    print('Raw data:')
    print(df.head())
    # The first row holds the column names (the feed export repeats the
    # header when it appends to an existing file).
    df.columns = df.loc[0]
    df = df.drop(0)
    # Index by date, sort chronologically, and clean up.
    df.index = df['date'].values
    df = df.sort_index()
    print('After sorting by date:')
    print(df.head())
    df = df.drop_duplicates()
    df = df.drop('date', axis=1)
    df = df.dropna(how='any')
    print('Cleaned result:')
    print(df.head())
    df.to_csv(outpath)
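
The cleaned file keeps the dates as a plain string index, so the date-range slices in the next section rely on the strings sorting lexicographically. An optional variant (a sketch, not in the original) reads the file back with a real DatetimeIndex instead:

import pandas as pd

outpath = r'E:\Users\3404\newgzweather.csv'
df = pd.read_csv(outpath, index_col=0)
df.index = pd.to_datetime(df.index)  # parse 'YYYY-MM-DD' strings into timestamps
df = df.sort_index()
print(df.loc['2011-12'].head())      # partial-string indexing selects a whole month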

6. Plotting the Results

# encoding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
inputpath = r'E:\Users\3404\Desktop\newgzweather.csv'


def converttoint(series11):
    # Convert a Series of temperature strings into a list of ints.
    return [int(x) for x in series11.tolist()]


if __name__ == '__main__':
    # newgzweather.csv was written by to_csv with a header row and the dates
    # as the index, so read it back that way rather than supplying names.
    df = pd.read_csv(inputpath, index_col=0, encoding='ISO-8859-1')
    # December minimum temperatures, one Series per year.
    series11 = df.loc['2011-12-01':'2011-12-31', 'mintemp']
    series12 = df.loc['2012-12-01':'2012-12-31', 'mintemp']
    series13 = df.loc['2013-12-01':'2013-12-31', 'mintemp']
    series14 = df.loc['2014-12-01':'2014-12-31', 'mintemp']
    series15 = df.loc['2015-12-01':'2015-12-31', 'mintemp']
    # December maximum temperatures, one Series per year.
    series11h = df.loc['2011-12-01':'2011-12-31', 'maxtemp']
    series12h = df.loc['2012-12-01':'2012-12-31', 'maxtemp']
    series13h = df.loc['2013-12-01':'2013-12-31', 'maxtemp']
    series14h = df.loc['2014-12-01':'2014-12-31', 'maxtemp']
    series15h = df.loc['2015-12-01':'2015-12-31', 'maxtemp']
    fig = plt.figure()
    list11 = converttoint(series11)
    list12 = converttoint(series12)
    list13 = converttoint(series13)
    list14 = converttoint(series14)
    list15 = converttoint(series15)
    # Line chart; assumes a full 31 days of data for each December.
    plt.plot(range(31), list11, label='2011')
    plt.plot(range(31), list12, label='2012')
    plt.plot(range(31), list13, label='2013')
    plt.plot(range(31), list14, label='2014')
    plt.plot(range(31), list15, label='2015')
    plt.xlabel('12-01 to 12-31')
    plt.ylabel('temperature')
    plt.title('December temperature variation over the past 5 years')
    plt.legend(loc='best')
    plt.show()
    # Each series can also be plotted directly, e.g.:
    # series11.plot(style='b'); fig.autofmt_xdate(); plt.show()
    m11 = np.array(list11).mean()
    m12 = np.array(list12).mean()
    m13 = np.array(list13).mean()
    m14 = np.array(list14).mean()
    m15 = np.array(list15).mean()
    meantemps = [m11, m12, m13, m14, m15]
    # Mean daily spread (max minus min) per December.
    m11h = np.array(converttoint(series11h)).mean() - m11
    m12h = np.array(converttoint(series12h)).mean() - m12
    m13h = np.array(converttoint(series13h)).mean() - m13
    m14h = np.array(converttoint(series14h)).mean() - m14
    m15h = np.array(converttoint(series15h)).mean() - m15
    meantemphs = [m11h, m12h, m13h, m14h, m15h]
    std11 = np.array(list11).std()
    std12 = np.array(list12).std()
    std13 = np.array(list13).std()
    std14 = np.array(list14).std()
    std15 = np.array(list15).std()
    stdtemps = [std11, std12, std13, std14, std15]
    std11h = np.array(converttoint(series11h)).std()
    std12h = np.array(converttoint(series12h)).std()
    std13h = np.array(converttoint(series13h)).std()
    std14h = np.array(converttoint(series14h)).std()
    std15h = np.array(converttoint(series15h)).std()
    stdtemphs = [std11h, std12h, std13h, std14h, std15h]
    ind = np.arange(5)
    width = 0.35
    # Stacked bars: mean minimum temperature, with the mean max-min delta
    # stacked on top, and standard deviations as error bars.
    p1 = plt.bar(ind, meantemps, width, color='r', yerr=stdtemps)
    p2 = plt.bar(ind, meantemphs, width, color='y',
                 bottom=meantemps, yerr=stdtemphs)
    plt.ylabel('temperature')
    plt.title('mean min and max temperature over the past 5 years')
    plt.xticks(ind + width/2., ('2011', '2012', '2013', '2014', '2015'))
    plt.legend((p1[0], p2[0]), ('min temperature', 'delta temperature'))
    plt.show()
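
The five near-identical per-year blocks could be collapsed into a loop; a compact equivalent sketch, reusing the same df and converttoint defined above:

years = ['2011', '2012', '2013', '2014', '2015']
# One list of December minimum temperatures per year, in the same order as years.
minlists = [converttoint(df.loc[y + '-12-01':y + '-12-31', 'mintemp']) for y in years]
meantemps = [np.array(l).mean() for l in minlists]
stdtemps = [np.array(l).std() for l in minlists]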


