Python urllib2实现抓取天气和世界时钟信息

#coding=utf-8
#coding=gbk
import os
import sys
import re
import time
import subprocess
import MySQLdb
import urllib
import urllib2
reload(sys)
def getHtmlData(areaCode):
    """Fetch the mobile weather page for *areaCode* and print what it parses.

    Downloads ``http://m.weather.com.cn/mweather/<areaCode>.shtml``, decodes
    the body as UTF-8 and prints, in this order:

    * the raw list of tuples matched by the forecast pattern,
    * one line per forecast tuple (week, weather, weather2, tmpArea),
    * one ``key value`` line for every entry of the last ``dataSK`` object
      literal found in the page.

    Args:
        areaCode: Value substituted into the page URL with ``%s``.

    Returns:
        None. Everything is written to standard output.

    Raises:
        ``urllib2.URLError`` (and its subclasses) is caught; its ``code``
        and/or ``reason`` attributes are printed instead. Any other
        exception (for example a UTF-8 decode failure) propagates.
    """
    # No leading blank in the URL: this exact string is also sent as Referer.
    url = 'http://m.weather.com.cn/mweather/%s.shtml' % areaCode
    header = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': url,
    }

    try:
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request)
        try:
            content = response.read().decode("utf-8")
        finally:
            # Always release the connection, even if read/decode fails.
            response.close()
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    # Inline script assignment of the form ``dataSK = {... date ... (...)"}``.
    sk_pattern = re.compile(r'dataSK.+=.+(\{.*date.+\d.+\d.+\(.+\)"\})', re.M)
    # Forecast entry: <b>week</b>, two <img alt=...> tags, <span>range</span>.
    forecast_pattern = re.compile(
        r'<b>(?P<week>.+)</b>\n<i>\n<.+alt=(?P<weather>.+)\/\>\n'
        r'<.+alt=(?P<weather2>.+)\/\>\n</i>\n<span>(?P<tmpArea>.+)</span>',
        re.M)

    sk_matches = sk_pattern.findall(content)
    forecasts = forecast_pattern.findall(content)

    # Single-argument print(...) is used throughout so the statement form of
    # print in Python 2 produces the same output as the original code.
    print(forecasts)
    for item in forecasts:
        print("%s %s %s %s" % (item[0], item[1], item[2], item[3]))

    dictTmpWeather = {}
    for item in sk_matches:
        # SECURITY(review): eval() executes text downloaded from the network.
        # Kept to preserve behaviour; consider ast.literal_eval or json.loads
        # once the payload format has been confirmed.
        dictTmpWeather = eval(item)

    for key in dictTmpWeather:
        print("%s %s" % (key, dictTmpWeather[key]))
            
def getwordClockHtmlData(area):
    """Fetch the world-clock result page for *area* and print its timestamp.

    Requests ``http://www.timedate.cn/worldclock/results.asp?query=<area>``
    and looks for the consecutive script assignments ``nyear=..;``,
    ``nmonth=..;``, ``nday=..;``, ``nwday=..;``, ``nhrs=..;``, ``nmin=..;``
    and ``nsec=..;``. The first match is printed as a single line.

    Args:
        area: Query string (for example a city name). It is URL-quoted
            before being placed in the query; ``%`` and ``+`` are left
            untouched so an already-encoded value keeps working.

    Returns:
        None. Output goes to standard output.

    Raises:
        ``urllib2.URLError`` (and its subclasses) is caught; its ``code``
        and/or ``reason`` attributes are printed instead. Other exceptions
        propagate.
    """
    # Quote the query so values containing spaces or other reserved
    # characters still produce a valid URL (urllib.quote is the Python 2 API).
    query = urllib.quote(area, safe='%+')
    url = 'http://www.timedate.cn/worldclock/results.asp?query=%s' % query
    header = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': url,
    }

    try:
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    # ``\r?\n`` accepts both CRLF (what the original pattern required) and
    # bare LF line endings between the assignments.
    pattern = re.compile(
        r"nyear=(\d+);\r?\nnmonth=(\d+);\r?\nnday=(\d+);\r?\nnwday=(\d+);"
        r"\r?\nnhrs=(\d+);\r?\nnmin=(\d+);\r?\nnsec=(\d+);",
        re.M)
    allItems = pattern.findall(content)

    if not allItems:
        # Unknown area or changed page layout: report it instead of failing
        # with IndexError on allItems[0].
        print("no clock data found for %s" % area)
        return

    year, month, nday, nWeek, nhour, nmin, nsec = allItems[0]
    print("year=%s,mounth=%s,nday=%s,nweek=%s,nhour=%s,nmin=%s,nsec=%s"
          % (year, month, nday, nWeek, nhour, nmin, nsec))
            
                     
if __name__ == "__main__":
    # Demo run when executed as a script: print the parsed weather data for
    # one area code, then the world-clock reading for one city.
    demo_area_code = '101010100'
    demo_city = "Auckland"
    getHtmlData(demo_area_code)
    getwordClockHtmlData(demo_city)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值