其实这些数据好像不用这样获取,中国天气网直接提供了天气的 JSON 数据接口,
这里有一篇介绍这些接口的文章:http://blog.csdn.net/hytfly/article/details/20064479
不过最近想弄一个一键新闻或者其它例如糗百、微博热点、QQ热点聚合的东西。
先拿天气来练练手
其实这个不用模拟登录,只需要一些简单的正则就可以,所以非常简单,上代码:
# -*- coding: utf-8 -*-
import urllib
import re
import thread
class Wathereather_Spider_Model:
    """Download a weather.com.cn province page and extract its forecast rows.

    Written for Python 2: it relies on ``urllib.urlopen`` and the ``thread``
    module imported at the top of the file.

    Attributes:
        ok: False until a call to :meth:`getWeather` has finished, whether it
            succeeded or raised. Callers poll this flag to know when
            ``weatherList`` may be read.
        weatherList: list of 4-tuples of strings, one tuple per ``<dl>`` entry
            matched on the page. Empty before the first fetch and after a
            failed one.
    """

    # One <dl> entry of the province page. The four groups capture, in order:
    # the <dt> link text, the <span> text, the raw text between the second
    # and third links, and the <b> text.
    # NOTE(review): presumably city name / first temperature / separator /
    # second temperature -- confirm against the live page markup.
    _ROW_PATTERN = re.compile(
        r'<dl><dt><a title=.*?>(.*?)</a></dt><dd><a href=.*?>.*?</a>'
        r'<a href=.*?><span>(.*?)</span></a>(.*?)'
        r'<a href=.*?><b>(.*?)</b></a></dd></dl>'
    )

    def __init__(self):
        self.ok = False
        # Initialised here so the attribute exists even before any fetch.
        self.weatherList = []

    def getHtml(self, url):
        """Return the raw body of ``url``.

        The connection is closed even when reading fails. Errors raised by
        ``urllib.urlopen`` or ``read`` propagate to the caller.
        """
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            page.close()

    def getWeather(self, url):
        """Fetch ``url`` and store every matched row in ``self.weatherList``.

        ``self.ok`` is always set to True when this method exits, so a caller
        polling the flag is released even when the download fails; in that
        case ``weatherList`` is left empty and the exception still propagates.
        """
        rows = []
        try:
            rows = self._ROW_PATTERN.findall(self.getHtml(url))
        finally:
            # Publish the result before raising the flag so a polling thread
            # never sees ok=True together with stale data.
            self.weatherList = rows
            self.ok = True

    def start(self, pydaihao):
        """Start fetching the page for province ``pydaihao`` in a new thread.

        ``pydaihao`` is the identifier used in the page path, e.g. "beijing"
        gives http://www.weather.com.cn/html/province/beijing.shtml.
        Returns immediately; poll ``self.ok`` for completion.
        """
        url = "".join(["http://www.weather.com.cn/html/province/", pydaihao, ".shtml"])
        thread.start_new_thread(self.getWeather, (url,))
weatherModel = Wathereather_Spider_Model()
pydaihao ="guangdong"
weatherModel.start(pydaihao)
print "now getting weather of ",pydaihao
while True:
if weatherModel.ok:
for weather in weatherModel.weatherList:
print ""
for li in weather:
print str(li).decode('utf-8').encode('gb2312'),
weatherModel.ok=False
break
可以根据省份来获取天气
结果如下