Scraping Weather Forecasts with Scrapy

A small Scrapy project that crawls www.tianqi.com for the forecasts of several cities and saves the results to a text file, a JSON file, and (optionally) MySQL.

items.py

import scrapy


class WeatherItem(scrapy.Item):
    # One forecast per city; every field holds a list, because the
    # spider fills it with extract(), which returns all matches.
    city = scrapy.Field()    # city name
    date = scrapy.Field()    # forecast dates
    week = scrapy.Field()    # days of the week
    img = scrapy.Field()     # weather icon URLs
    state = scrapy.Field()   # weather conditions
    temmax = scrapy.Field()  # daily highs
    temmin = scrapy.Field()  # daily lows
    wind = scrapy.Field()    # wind descriptions

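Item objects behave like dicts, which is what the JSON pipeline below relies on when it calls dict(item). A quick illustration (the sample value is made up):

item = WeatherItem()
item['city'] = ['Baoding']
print(dict(item))  # {'city': ['Baoding']}
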
weather.py

import scrapy
from weather.items import WeatherItem

class WeathersSpider(scrapy.Spider):
    name = 'weathers'
    allowed_domains = ['www.tianqi.com']
    # Send a desktop User-Agent; a bare dict on the class would be
    # ignored by Scrapy, so wire it in through custom_settings.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    }
    cities = ['baoding', 'beijing', 'shijiazhuang']
    start_urls = ['https://www.tianqi.com/' + city for city in cities]

    def parse(self, response):
        # Each city page has a single div.right block with the forecast.
        selector = response.xpath('//div[@class="right"]')
        for weather in selector:
            item = WeatherItem()
            item['city'] = weather.xpath('.//div[@class="top"]//h1//text()').extract()
            item['date'] = weather.xpath('.//ul[@class="week"]//li//b//text()').extract()
            item['week'] = weather.xpath('.//ul[@class="week"]//li//span//text()').extract()
            item['img'] = weather.xpath('.//ul[@class="week"]//li//img/@src').extract()
            item['state'] = weather.xpath('.//ul[@class="txt txt2"]//li//text()').extract()
            item['temmax'] = weather.xpath('.//div[@class="zxt_shuju"]//ul//li//span//text()').extract()
            item['temmin'] = weather.xpath('.//div[@class="zxt_shuju"]//ul//li//b//text()').extract()
            item['wind'] = weather.xpath('.//ul[@class="txt"]//li//text()').extract()
            # yield, not return: return would stop after the first match.
            yield item
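
Every field comes back as a list because extract() returns all matching nodes. When a single value per field is enough, extract_first() returns just the first match (or None), for example:

item['city'] = weather.xpath('.//div[@class="top"]//h1//text()').extract_first()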




pipelines.py

import os.path
import urllib.request


class WeatherPipeline(object):
    def process_item(self, item, spider):
        with open('weathers.txt', 'a+') as fp:
            fp.write(str(item['city']) + '\n\n\t')
            fp.write(str(item['date']) + '\n\n\t')
            fp.write(str(item['week']) + '\n\n\t')
            fp.write(str(item['img']) + '\n\n\t')
            # item['img'] is a list of icon URLs; download each icon
            # once, skipping files that already exist on disk.
            for imgurl in item['img']:
                imgname = os.path.basename(imgurl)
                fp.write(imgname + '\n\n\t')
                if not os.path.exists(imgname):
                    response = urllib.request.urlopen(imgurl)
                    # Separate handle, so the text file's fp is not
                    # shadowed while writing the image.
                    with open(imgname, 'wb') as imgfile:
                        imgfile.write(response.read())
            fp.write(str(item['state']) + '\n\n\t')
            fp.write(str(item['temmax']) + '\n\n\t')
            fp.write(str(item['temmin']) + '\n\n\t')
            fp.write(str(item['wind']) + '\n\n\t')
        return item
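
As an aside, Scrapy ships with an ImagesPipeline that handles downloading, deduplication, and storage for you (it needs Pillow installed and an IMAGES_STORE path in settings.py). A minimal sketch; the class name WeatherImagesPipeline is my own, not part of the original project:

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class WeatherImagesPipeline(ImagesPipeline):
    # Turn each icon URL in item['img'] into a download request;
    # the files end up under the IMAGES_STORE directory.
    def get_media_requests(self, item, info):
        for url in item['img']:
            yield Request(url)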
import json


class WeatherPipeline2(object):
    def open_spider(self, spider):
        # One handle for the whole crawl; closed in close_spider().
        self.file = open('weathers.json', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text readable.
        content = json.dumps(dict(item), ensure_ascii=False) + '\n\n\t\t'
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()
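
For the record, Scrapy's built-in feed export can produce similar JSON output with no custom pipeline at all: run scrapy crawl weathers -o weathers.json from the project root.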
An unfinished MySQL pipeline is left commented out below; a completed sketch follows it.

'''
import MySQLdb
class WeatherPipeline3(object):
    def process_item(self, item, spider):
        city=item['city'].encode('utf8')
        date=item['date'].encode('utf8')
        week=item['week'].encode('utf8')
        img=os.path.basename(str(item['img']))
        state=item['state'].encode('utf8')
        temmax=item['temmax'].encode('utf8')
        temmin=item['temmin'].encode('utf8')
        wind=item['wind'].encode('utf8')
        conn=MySQLdb.connect(
            host='localhost',
            port=3306,
            user=''
        )

'''
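
To finish that idea, here is a minimal sketch using the same MySQLdb driver. The database weather, the table forecast, and the credentials are all hypothetical placeholders; adjust them to your setup:

import MySQLdb

class WeatherPipeline3(object):
    def open_spider(self, spider):
        # Hypothetical connection details; replace with your own.
        self.conn = MySQLdb.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='your_password',
            db='weather',
            charset='utf8',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Store the raw lists as strings, mirroring the txt pipeline.
        self.cursor.execute(
            'INSERT INTO forecast (city, date, week, state, temmax, temmin, wind) '
            'VALUES (%s, %s, %s, %s, %s, %s, %s)',
            (str(item['city']), str(item['date']), str(item['week']),
             str(item['state']), str(item['temmax']), str(item['temmin']),
             str(item['wind'])))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()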

settings.py

BOT_NAME = 'weather'

SPIDER_MODULES = ['weather.spiders']
NEWSPIDER_MODULE = 'weather.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'weather (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
   'weather.pipelines.WeatherPipeline': 300,
   'weather.pipelines.WeatherPipeline2': 400
}
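
The numbers 300 and 400 are priorities: lower runs first, so items pass through the txt pipeline before the JSON one. To launch the crawl from a script instead of the command line, Scrapy's cmdline helper works; the filename run.py is just a suggestion:

# run.py - equivalent to running "scrapy crawl weathers" in the project root
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'weathers'])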