scrapy爬取天气预报
items.py
import scrapy
class WeatherItem(scrapy.Item):
    """One city's forecast scraped from tianqi.com.

    Every field holds the raw list of strings produced by
    ``Selector.extract()`` in the spider.
    """
    city = scrapy.Field()    # city name (page heading)
    date = scrapy.Field()    # forecast dates
    week = scrapy.Field()    # weekday labels
    img = scrapy.Field()     # weather-icon image src URLs
    state = scrapy.Field()   # weather condition text
    temmax = scrapy.Field()  # daily maximum temperatures
    temmin = scrapy.Field()  # daily minimum temperatures
    wind = scrapy.Field()    # wind direction / strength text
weathers.py (spiders/)
import scrapy
from weather.items import WeatherItem
class WeathersSpider(scrapy.Spider):
    """Scrape the forecast page of each configured city on tianqi.com.

    One start URL is built per city; ``parse`` yields one ``WeatherItem``
    per ``<div class="right">`` forecast panel found on the page.
    """
    name = 'weathers'
    allowed_domains = ['www.tianqi.com']
    # NOTE(review): this dict is never attached to any Request, so Scrapy's
    # default user agent is still sent — wire it up through
    # DEFAULT_REQUEST_HEADERS / custom_settings if the site needs it.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'}
    citys = ['baoding', 'beijing', 'shijiazhuang']
    start_urls = ['https://www.tianqi.com/' + city for city in citys]

    def parse(self, response):
        """Yield a WeatherItem for every forecast panel in the response."""
        for panel in response.xpath('//div[@class="right"]'):
            item = WeatherItem()
            item['city'] = panel.xpath('.//div[@class="top"]//h1//text()').extract()
            item['date'] = panel.xpath('.//ul[@class="week"]//li//b//text()').extract()
            item['week'] = panel.xpath('.//ul[@class="week"]//li//span//text()').extract()
            item['img'] = panel.xpath('.//ul[@class="week"]//li//img/@src').extract()
            item['state'] = panel.xpath('.//ul[@class="txt txt2"]//li//text()').extract()
            item['temmax'] = panel.xpath('.//div[@class="zxt_shuju"]//ul//li//span//text()').extract()
            item['temmin'] = panel.xpath('.//div[@class="zxt_shuju"]//ul//li//b//text()').extract()
            item['wind'] = panel.xpath('.//ul[@class="txt"]//li//text()').extract()
            # yield, not return: a return inside the loop would exit after
            # the first panel and silently drop any further matches.
            yield item
pipelines.py
import os.path
import urllib
import urllib.request
class WeatherPipeline(object):
    """Append every item to ``weathers.txt`` and download its weather icon.

    The icon is fetched only once: if a file with the icon's basename
    already exists in the working directory the download is skipped.
    """

    def process_item(self, item, spider):
        # utf-8 so the Chinese field values are written portably.
        with open('weathers.txt', 'a+', encoding='utf-8') as fp:
            fp.write(str(item['city']) + '\n\n\t')
            fp.write(str(item['date']) + '\n\n\t')
            fp.write(str(item['week']) + '\n\n\t')
            fp.write(str(item['img']) + '\n\n\t')
            # item['img'] is a LIST of src strings; basename(str(list))
            # would yield garbage like "icon.png']" — take the first URL.
            img_list = item['img']
            img_url = img_list[0] if img_list else ''
            imgname = os.path.basename(img_url)
            fp.write(str(imgname) + '\n\n\t')
            if imgname and not os.path.exists(imgname):
                # Separate handle name: reusing `fp` here would shadow the
                # text file and make the writes below hit a closed file.
                with open(imgname, 'wb') as imgfile:
                    # NOTE(review): src values may be protocol-relative
                    # ('//host/...'); prepend 'https:' if the live markup
                    # confirms that — can't tell from here.
                    response = urllib.request.urlopen(img_url)
                    imgfile.write(response.read())
            fp.write(str(item['state']) + '\n\n\t')
            fp.write(str(item['temmax']) + '\n\n\t')
            fp.write(str(item['temmin']) + '\n\n\t')
            fp.write(str(item['wind']) + '\n\n\t')
        return item
import json
class WeatherPipeline2(object):
    """Append each item to ``weathers.json`` as one JSON object per record."""

    def open_spider(self, spider):
        # encoding='utf-8' is required: ensure_ascii=False below emits raw
        # Chinese text, which crashes on non-UTF-8 default locales (Windows).
        self.filename = open('weathers.json', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + '\n\n\t\t'
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        self.filename.close()
# Disabled draft of a MySQL export pipeline, parked inside a triple-quoted
# string so it never executes. It is incomplete: the connection has no
# password/database, no cursor is opened and no INSERT is issued, and it is
# not registered in ITEM_PIPELINES. Kept for reference only.
'''
import MySQLdb
class WeatherPipeline3(object):
def process_item(self, item, spider):
city=item['city'].encode('utf8')
date=item['date'].encode('utf8')
week=item['week'].encode('utf8')
img=os.path.basename(str(item['img']))
state=item['state'].encode('utf8')
temmax=item['temmax'].encode('utf8')
temmin=item['temmin'].encode('utf8')
wind=item['wind'].encode('utf8')
conn=MySQLdb.connect(
host='localhost',
port=3306,
user=''
)
'''
settings.py
# Scrapy project settings for the weather crawler.
BOT_NAME = 'weather'
SPIDER_MODULES = ['weather.spiders']
NEWSPIDER_MODULE = 'weather.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'weather (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Pipelines run in ascending priority order: the text/icon dump (300)
# first, then the JSON dump (400).
ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300,
    'weather.pipelines.WeatherPipeline2': 400
}