Scrapy: Crawling Nationwide Weather Information

Create a new Scrapy project:

scrapy startproject weather
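This generates the standard Scrapy project skeleton (the exact files may vary slightly by Scrapy version); the files edited below all live under weather/weather/:

weather/
├── scrapy.cfg
└── weather/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py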

Create a new spider inside the project:

/scrapy_project/weather/weather/spiders$ scrapy genspider query_weather weather.com.cn

Edit the generated query_weather.py:

# -*- coding: utf-8 -*-
import scrapy
from weather.city_urls import city_urls_dict
from weather.items import WeatherItem


class QueryWeatherSpider(scrapy.Spider):
    name = 'query_weather'
    allowed_domains = ['weather.com.cn']
    # list() keeps this working on Python 3, where dict.values() is a view
    start_urls = list(city_urls_dict.values())

    def parse(self, response):
        if response.status == 200:
            # e.g. http://www.weather.com.cn/weather/101010100.shtml -> '101010100'
            url = response.url
            city_code = url[url.rfind('/') + 1:url.rfind('.')]
            item = WeatherItem()
            # Codes below 101050100 are municipalities or special administrative
            # regions; 101320000 is Hong Kong and 101330000 is Macau. Their
            # breadcrumbs have one level fewer than ordinary cities.
            if (int(city_code) < 101050100) or (101320000 < int(city_code) < 101340000):
                city = response.xpath('//div[@class="crumbs fl"]/a/text()').extract()[0]
                region = response.xpath('//div[@class="crumbs fl"]/span/text()').extract()[1]
                item['city'] = city + '>' + region
            else:
                province = response.xpath('//div[@class="crumbs fl"]/a/text()').extract()[0]
                city = response.xpath('//div[@class="crumbs fl"]/a/text()').extract()[1]
                region = response.xpath('//div[@class="crumbs fl"]/span/text()').extract()[2]
                item['city'] = province + '>' + city + '>' + region
            item['date'] = response.xpath('//ul[@class="t clearfix"]/li/h1/text()').extract()
            item['weather'] = response.xpath('//ul[@class="t clearfix"]/li/p[@class="wea"]/text()').extract()
            item['top_tem'] = response.xpath('//ul[@class="t clearfix"]/li/p[@class="tem"]/span/text()').extract()
            item['low_tem'] = response.xpath('//ul[@class="t clearfix"]/li/p[@class="tem"]/i/text()').extract()
            yield item
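The WeatherItem imported above is not generated for you; a minimal items.py matching the fields the spider fills in would look like this:

# -*- coding: utf-8 -*-
import scrapy


class WeatherItem(scrapy.Item):
    city = scrapy.Field()      # breadcrumb path, e.g. "广东>深圳>南山"
    date = scrapy.Field()      # list of forecast dates
    weather = scrapy.Field()   # list of weather descriptions
    top_tem = scrapy.Field()   # list of daily highs
    low_tem = scrapy.Field()   # list of daily lows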

The city_urls.py used above maps breadcrumb names to forecast URLs (truncated here):

city_urls_dict = {
"北京>城区":"http://www.weather.com.cn/weather/101010100.shtml",
"北京>通州":"http://www.weather.com.cn/weather/101010600.shtml",
"北京>顺义":"http://www.weather.com.cn/weather/101010400.shtml",
"北京>朝阳":"http://www.weather.com.cn/weather/101010300.shtml",
"北京>怀柔":"http://www.weather.com.cn/weather/101010500.shtml",
"北京>海淀":"http://www.weather.com.cn/weather/101010200.shtml",
......
}
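Since every forecast URL differs only in the city code, the dict does not have to be written out by hand. A sketch, assuming you already have a name-to-code mapping (city_codes below is hypothetical):

# Hypothetical mapping of breadcrumb names to weather.com.cn city codes
city_codes = {
    "北京>城区": "101010100",
    "北京>通州": "101010600",
    # ...
}

# Build the forecast URL for each code
city_urls_dict = {
    name: "http://www.weather.com.cn/weather/%s.shtml" % code
    for name, code in city_codes.items()
}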

Edit pipelines.py to store the scraped items in MongoDB:

# -*- coding: utf-8 -*-
import time
from pymongo import MongoClient


class WeatherPipeline(object):
    def __init__(self):
        client = MongoClient("localhost", 27017)
        db = client.weather
        # replace with real credentials; authenticate() exists in pymongo < 4,
        # newer versions pass username/password to MongoClient instead
        db.authenticate("username", "password")
        # one collection per day, e.g. date_20171028
        collection_name = 'date_' + time.strftime("%Y%m%d")
        self.coll = db[collection_name]

    def process_item(self, item, spider):
        self.coll.insert_one({
            'city_name': item['city'],
            # one [date, weather, low, high] row per forecast day
            'weather_data': [
                [item['date'][i], item['weather'][i],
                 item['low_tem'][i], item['top_tem'][i]]
                for i in range(6)
            ],
        })
        return item

Enable the pipeline in settings.py:

ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300,
}
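With everything in place, run the crawl from the project root:

scrapy crawl query_weather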

Query the data in MongoDB; creating a text index on city_name first makes $text search work:

> db.date_20171028.createIndex({ city_name: "text" })
{
    "createdCollectionAutomatically" : false,
    "numIndexesBefore" : 1,
    "numIndexesAfter" : 2,
    "ok" : 1
}
> db.date_20171028.find( {$text : { $search: "深圳"}})
{ "_id" : ObjectId("59f3c7baf4e4b1608862957b"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>南山" }
{ "_id" : ObjectId("59f3c7e6f4e4b16088629b0e"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>城区" }
{ "_id" : ObjectId("59f3c7c2f4e4b16088629681"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>罗湖" }
{ "_id" : ObjectId("59f3c7ccf4e4b160886297e1"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>龙岗" }
{ "_id" : ObjectId("59f3c7e4f4e4b16088629ac2"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>宝安" }
{ "_id" : ObjectId("59f3c7d6f4e4b1608862993d"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>福田" }
{ "_id" : ObjectId("59f3c7d7f4e4b16088629969"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>盐田" }