新建一个Scrapy工程:
scrapy startproject weather
新建一个爬虫:
/scrapy_project/weather/weather/spiders$ scrapy genspider query_weather weather.com.cn
编辑query_weather.py文件:
import os
import scrapy
from weather.city_urls import city_urls_dict
from weather.items import WeatherItem
class QueryWeatherSpider(scrapy.Spider):
    """Scrape the multi-day forecast from every page listed in ``city_urls_dict``.

    One ``WeatherItem`` is yielded per successfully parsed page. Its ``city``
    field is a ``'>'``-joined breadcrumb path; ``date``, ``weather``,
    ``top_tem`` and ``low_tem`` are lists holding one string per forecast day.
    """

    name = 'query_weather'
    allowed_domains = ['weather.com.cn']
    # Materialise the dict view so start_urls is a real list.
    start_urls = list(city_urls_dict.values())

    def parse(self, response):
        """Build a ``WeatherItem`` from one forecast page.

        Pages that are not HTTP 200, whose URL does not end in a numeric
        city code, or whose breadcrumb is shorter than expected are skipped
        (with a warning for the latter two) instead of raising.
        """
        if response.status != 200:
            return

        url = response.url
        # The city code is the file name of the URL, e.g. ".../101010100.shtml".
        raw_code = url[url.rfind('/') + 1:url.rfind('.')]
        try:
            city_code = int(raw_code)
        except ValueError:
            self.logger.warning('No numeric city code in URL: %s', url)
            return

        crumb_links = response.xpath('//div[@class="crumbs fl"]/a/text()').extract()
        crumb_spans = response.xpath('//div[@class="crumbs fl"]/span/text()').extract()

        item = WeatherItem()
        try:
            # These code ranges are read as a two-level breadcrumb (city > region);
            # every other code is read as three levels (province > city > region).
            # NOTE(review): the span indexes 1 and 2 presumably skip separator
            # spans in the breadcrumb - confirm against the live markup.
            if city_code < 101050100 or 101320000 < city_code < 101340000:
                item['city'] = crumb_links[0] + '>' + crumb_spans[1]
            else:
                item['city'] = (
                    crumb_links[0] + '>' + crumb_links[1] + '>' + crumb_spans[2]
                )
        except IndexError:
            self.logger.warning('Unexpected breadcrumb layout on %s', url)
            return

        # Extract field by field *per day* so the four lists stay index-aligned
        # even if a day lacks one of the elements; a missing value becomes ''.
        days = response.xpath('//ul[@class="t clearfix"]/li')
        item['date'] = [
            day.xpath('h1/text()').extract_first(default='') for day in days
        ]
        item['weather'] = [
            day.xpath('p[@class="wea"]/text()').extract_first(default='')
            for day in days
        ]
        item['top_tem'] = [
            day.xpath('p[@class="tem"]/span/text()').extract_first(default='')
            for day in days
        ]
        item['low_tem'] = [
            day.xpath('p[@class="tem"]/i/text()').extract_first(default='')
            for day in days
        ]
        yield item
上面代码中使用到的city_urls.py内容如下:
# Maps a '>'-separated place label to the URL of its forecast page.
# The spider uses only the values (as its start URLs).
# The "......" line below marks entries omitted from this excerpt.
city_urls_dict = {
"北京>城区":"http://www.weather.com.cn/weather/101010100.shtml",
"北京>通州":"http://www.weather.com.cn/weather/101010600.shtml",
"北京>顺义":"http://www.weather.com.cn/weather/101010400.shtml",
"北京>朝阳":"http://www.weather.com.cn/weather/101010300.shtml",
"北京>怀柔":"http://www.weather.com.cn/weather/101010500.shtml",
"北京>海淀":"http://www.weather.com.cn/weather/101010200.shtml",
......
}
编辑pipelines.py文件,把获取到的信息存入MongoDB:
import time
from pymongo import MongoClient
class WeatherPipeline(object):
    """Item pipeline that stores each scraped forecast in MongoDB.

    Documents go into the ``weather`` database, in a collection named
    ``date_YYYYMMDD`` after the local date on which the pipeline was created.
    """

    # Number of forecast days written per document.
    DAYS_STORED = 6

    def __init__(self):
        # Credentials are passed to MongoClient because Database.authenticate()
        # no longer exists in PyMongo 4. authSource="weather" keeps
        # authentication against the same database the old call used.
        self.client = MongoClient(
            "localhost",
            27017,
            username="username",
            password="password",
            authSource="weather",
        )
        db = self.client.weather
        collection_name = 'date_' + time.strftime("%Y%m%d")
        self.coll = db[collection_name]

    def process_item(self, item, spider):
        """Insert one document for ``item`` and return the item unchanged.

        ``weather_data`` holds ``DAYS_STORED`` rows of
        ``[date, weather, low_tem, top_tem]``.

        Raises:
            IndexError: if any of the four lists has fewer than
                ``DAYS_STORED`` entries.
        """
        rows = [
            [
                item['date'][day],
                item['weather'][day],
                item['low_tem'][day],
                item['top_tem'][day],
            ]
            for day in range(self.DAYS_STORED)
        ]
        self.coll.insert_one({
            'city_name': item['city'],
            'weather_data': rows,
        })
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()
在settings.py中启用item pipeline:
# Enable the MongoDB pipeline. The integer is the value Scrapy uses to order
# pipelines (lower values run first).
ITEM_PIPELINES = {'weather.pipelines.WeatherPipeline': 300}
在MongoDB中查询数据(先为city_name创建文本索引,再按城市名搜索):
> db.date_20171028.createIndex({ city_name: "text" })
{
"createdCollectionAutomatically" : false,
"numIndexesBefore" : 1,
"numIndexesAfter" : 2,
"ok" : 1
}
> db.date_20171028.find( {$text : { $search: "深圳"}})
{ "_id" : ObjectId("59f3c7baf4e4b1608862957b"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>南山" }
{ "_id" : ObjectId("59f3c7e6f4e4b16088629b0e"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>城区" }
{ "_id" : ObjectId("59f3c7c2f4e4b16088629681"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>罗湖" }
{ "_id" : ObjectId("59f3c7ccf4e4b160886297e1"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>龙岗" }
{ "_id" : ObjectId("59f3c7e4f4e4b16088629ac2"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>宝安" }
{ "_id" : ObjectId("59f3c7d6f4e4b1608862993d"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>福田" }
{ "_id" : ObjectId("59f3c7d7f4e4b16088629969"), "weather_data" : [ [ "28日(今天)", "多云", "19℃", "28" ], [ "29日(明天)", "多云", "18℃", "27" ], [ "30日(后天)", "多云", "17℃", "26" ], [ "31日(周二)", "多云", "17℃", "25" ], [ "1日(周三)", "多云", "19℃", "26" ], [ "2日(周四)", "多云", "19℃", "27" ] ], "city_name" : "广东>深圳>盐田" }