Python爬取国外天气预报网站的方法
来源:中文源码网 浏览: 次 日期:2018年9月2日
【下载文档: Python爬取国外天气预报网站的方法.txt 】
(友情提示:右键点上行txt文档名->目标另存为)
Python爬取国外天气预报网站的方法
本文实例讲述了使用Python爬取国外天气预报网站的方法,分享给大家供大家参考。具体如下:
crawl_weather.py如下:
#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
# Language code. Not referenced by any code visible in this listing.
# NOTE(review): presumably the language of the site being crawled (the output
# files written elsewhere end in ".fr") -- confirm.
lang = "fr"
# Global counter that GetLocationURLs increments (via `global count`) for each
# URL containing "weather-forecast".
count = 0
class Location:
    """Plain record describing one named place.

    The constructor stores its four arguments unchanged as attributes of the
    same names; no validation or conversion is performed.
    """

    # Usage examples carried over from the original listing:
    #   Location(False, "中国", "北京", "zh")   # country "China", place "Beijing"
    #   Location(True, "", "亚洲", "zh")        # empty country, place "Asia"
    # NOTE(review): judging by these examples, is_beyond_country is presumably
    # True for regions above country level (e.g. a continent) -- confirm.

    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        """Store the given values on the instance.

        Args:
            is_beyond_country: Boolean flag (see the review note above).
            country_name: Name of the country; empty in the second example.
            loc_name: Name of the location itself.
            lang: Language code of the names, e.g. "zh".
        """
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country
# Re-entrant lock that GetLocationURLs acquires around its progress print.
prn_lock = threading.RLock()
# GetLocationURLs(url, recursive): return forecast-page URLs reachable from `url`.
# NOTE(review): this listing lost all indentation and, after the page fetch
# below, a chunk of the original script is missing. The code is kept verbatim;
# restore it from the original source before running it.
def GetLocationURLs(url, recursive):
global count
# A URL containing "weather-forecast" bumps the global counter, prints the
# counter under prn_lock on every 500th hit, and -- presumably inside this
# same branch -- is returned as a one-element list.
if url.find("weather-forecast") != -1:
count = count + 1
if count % 500 == 0:
prn_lock.acquire()
print "count:%d" % (count)
prn_lock.release()
return [url]
# Otherwise the page is downloaded, followed by a 0.01 s pause.
page = urllib2.urlopen(url).read()
time.sleep(0.01)
# --- GAP: the rest of GetLocationURLs is missing from this listing. ---
# NOTE(review): the lines below record `start`/`end` indices between an opening
# and a closing marker line and return lines[start:end + 1] joined by newlines.
# They are presumably the tail of FindCountryBreadCrumbs (called by
# ExtractInfo); the marker strings passed to find() were stripped -- confirm.
#"") != -1:
start = count
opened = True
if opened and line.find("") != -1:
end = count
opened = False
count = count + 1
return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
    """Concatenate the HTML-unescaped text of the text nodes in *nodelist*.

    Args:
        nodelist: Iterable of DOM-style nodes exposing ``nodeType``,
            ``TEXT_NODE`` and ``data``.

    Returns:
        The unescaped ``data`` of every text node, joined in order; an empty
        string when there are no text nodes.
    """
    texts = [node.data for node in nodelist
             if node.nodeType == node.TEXT_NODE]
    if not texts:
        return ''
    # Build the parser once per call instead of once per node.
    unescape = HTMLParser.HTMLParser().unescape
    return ''.join(unescape(text) for text in texts)
def FindCondition(page):
    """Return every regex capture found in *page*, unescaped and UTF-8 encoded.

    Args:
        page: Text of the downloaded page.

    Returns:
        A list with one UTF-8 encoded, HTML-unescaped string per match.
    """
    # NOTE(review): the pattern is reproduced exactly as it appears in the
    # listing. It looks truncated (the surrounding HTML tags were presumably
    # stripped when the article was published) -- restore it from the
    # original script before relying on the results.
    pat = "(.*?)"
    # One parser for the whole call instead of one per match.
    unescape = HTMLParser.HTMLParser().unescape
    return [unescape(cd).encode("utf-8") for cd in re.findall(pat, page)]
def ExtractInfo(url):
    """Fetch *url* and extract location names and condition strings from it.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(locs, cds)`` pair. ``locs`` holds the UTF-8 encoded text of the
        first ``<a>`` and first ``<strong>`` of every ``<li>`` in the
        breadcrumb fragment; ``cds`` is whatever FindCondition returns for
        the page. When the download fails, both lists are empty.
    """
    try:
        page = urllib2.urlopen(url).read()
    except Exception:
        # Best effort: an unreachable page contributes nothing. Return a pair
        # so callers that unpack two values do not fail on a bare [].
        return [], []
    text = FindCountryBreadCrumbs(page)
    text = HTMLParser.HTMLParser().unescape(text)
    dom = minidom.parseString(text.encode("utf-8"))
    locs = []
    for li in dom.getElementsByTagName("li"):
        # Only the first element of each tag type is used per list item.
        for tag in ("a", "strong"):
            found = li.getElementsByTagName(tag)
            if found:
                locs.append(GetText(found[0].childNodes).encode("utf-8"))
    cds = FindCondition(page)
    return locs, cds
def AddMap(lst, m):
    """Record every item of *lst* as a key of mapping *m*, in place.

    Keys that are missing from *m*, or whose stored value is ``None``, are set
    to 1; keys with any other value are left untouched.

    Args:
        lst: Iterable of hashable items.
        m: Dict-like object used as a set of seen items; mutated in place.
    """
    for x in lst:
        # Identity test: `== None` could be fooled by a custom __eq__.
        if m.get(x) is None:
            m[x] = 1
def working():
    """Worker loop: take URL batches from the queue and merge their results.

    Runs forever. For each batch taken from ``q`` it collects the locations
    and conditions of every URL into two local dicts, then merges their keys
    into the shared ``locations`` and ``conds`` maps while holding
    ``locks[1]``.

    NOTE(review): the listing lost its indentation; the merge is assumed to
    happen once per batch, after the per-URL loop -- confirm.
    """
    while True:
        urls = q.get()
        try:
            batch_locs = {}
            batch_conds = {}
            for url in urls:
                locs, cds = ExtractInfo(url)
                AddMap(locs, batch_locs)
                AddMap(cds, batch_conds)
            locks[1].acquire()
            try:
                AddMap(batch_locs.keys(), locations)
                AddMap(batch_conds.keys(), conds)
            finally:
                # Release even if the merge raises, so other workers are
                # not blocked forever.
                locks[1].release()
        finally:
            # Always mark the batch as done; otherwise an exception in this
            # worker would leave q.join() waiting indefinitely.
            q.task_done()
def main():
    """Read a URL list, crawl it with worker threads and write the results.

    The file named by ``sys.argv[1]`` is read as one URL per line. The URLs
    are split into at most ``ThreadNumber`` batches, ``ThreadNumber`` daemon
    threads running ``working`` are started, and once the queue has drained
    the keys of ``locations`` and ``conds`` are written, one per line, to
    ``location_name.fr`` and ``conditions.fr``.
    """
    if len(sys.argv) < 2:
        # Report the missing argument instead of exiting silently with
        # a success status.
        sys.stderr.write("usage: %s <url-list-file>\n" % sys.argv[0])
        sys.exit(1)
    loc_path = sys.argv[1]
    with open(loc_path, "r") as fp:
        urls = [line.strip() for line in fp]
    # Floor division plus one yields no more batches than worker threads.
    blocks = len(urls) // ThreadNumber + 1
    for start in range(0, len(urls), blocks):
        # Slicing clamps at the end of the list, so no manual bound is needed.
        q.put(urls[start:start + blocks])
    for _ in range(ThreadNumber):
        t = Thread(target=working)
        # Daemon threads: the endless worker loops must not keep the
        # process alive after main() returns.
        t.daemon = True
        t.start()
    q.join()
    with open("location_name.fr", "w") as fp:
        fp.write("\n".join(locations.keys()))
    with open("conditions.fr", "w") as fp:
        fp.write("\n".join(conds.keys()))
# Run the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

# Closing note from the source article: we hope this write-up helps with your
# Python programming.
亲,试试微信扫码分享本页! *^_^*