Python 爬取天气预报源代码:爬取国外天气预报网站的方法

本文介绍使用Python爬取国外天气预报网站的方法,通过示例代码crawl_weather.py展示如何获取地点和天气条件信息,涉及网络请求、正则表达式和多线程等技术。
摘要由CSDN通过智能技术生成

Python爬取国外天气预报网站的方法

来源:中文源码网    浏览: 次    日期:2018年9月2日

【下载文档:  Python爬取国外天气预报网站的方法.txt 】

(友情提示:右键点上行txt文档名->目标另存为)

Python爬取国外天气预报网站的方法 本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8

import httplib

import urllib2

import time

from threading import Thread

import threading

from Queue import Queue

from time import sleep

import re

import copy

lang = "fr"

count = 0

class Location:
    """A geographic location to be crawled.

    Usage examples (from the original author's notes):
        Location(False, "China", "Beijing", "zh")  -- a place inside a country
        Location(True, "", "Asia", "zh")           -- a region beyond any one country
    """

    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        # Plain value holder: store the caller-supplied fields verbatim,
        # no validation or normalisation is performed.
        self.is_beyond_country = is_beyond_country
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang

# Re-entrant lock that serialises progress printing across crawler threads.
prn_lock = threading.RLock()

# NOTE(review): this region was damaged when the article was scraped -- all
# indentation is lost, HTML-tag string literals were stripped out of the
# comparisons, and at least one "def" line is missing.  GetLocationURLs is
# truncated right after the urlopen() call, and the orphaned statements
# further down appear to be the body of the lost helper
# FindCountryBreadCrumbs(page) that ExtractInfo() calls.  Recover the code
# from the original article before attempting to run it.
def GetLocationURLs(url, recursive):
global count
# Leaf pages carry "weather-forecast" in their URL: count them and return.
if url.find("weather-forecast") != -1:
count = count + 1
# Progress heartbeat every 500 leaf URLs; prn_lock keeps lines intact.
if count % 500 == 0:
prn_lock.acquire()
print "count:%d" % (count)
prn_lock.release()
return [url]
page = urllib2.urlopen(url).read()
time.sleep(0.01)
# --- truncated here: the recursive link-extraction part is missing ---
#"") != -1:
# The lines below likely belong to FindCountryBreadCrumbs(page): they scan
# a `lines` sequence for an opening/closing tag pair (the tag text was
# stripped by the scrape) and return the enclosed slice joined back together.
start = count
opened = True
if opened and line.find("") != -1:
end = count
opened = False
count = count + 1
return "\n".join(lines[start: (end + 1)])

def GetText(nodelist):
    """Return the concatenated, HTML-unescaped text of all text nodes."""
    unescape = HTMLParser.HTMLParser().unescape
    pieces = [unescape(node.data)
              for node in nodelist
              if node.nodeType == node.TEXT_NODE]
    return ''.join(pieces)

def FindCondition(page):
    """Extract weather-condition strings from the raw page via a regex."""
    # NOTE(review): the pattern's surrounding HTML tags were stripped by the
    # article extraction -- '(.*?)' alone is almost certainly not the
    # original pattern; recover it from the source article.
    pat = "(.*?)"
    matches = re.findall(pat, page)
    unescape = HTMLParser.HTMLParser().unescape
    return [unescape(m).encode("utf-8") for m in matches]

def ExtractInfo(url):
    """Fetch `url` and extract (location names, weather conditions).

    Returns a pair of lists of utf-8 byte strings; both lists are empty
    when the page cannot be fetched.
    """
    try:
        page = urllib2.urlopen(url).read()
    except Exception:
        # BUG FIX: the original returned a single [] here, which made every
        # caller doing `locs, cds = ExtractInfo(url)` crash with a
        # ValueError on any fetch failure.  Return an empty pair instead.
        # (Also: `except Exception, e` is Python-2-only syntax and `e` was
        # unused, so the binding is dropped.)
        return [], []
    # Narrow the page down to the country-breadcrumbs fragment, unescape
    # HTML entities, then parse that fragment as XML.
    text = FindCountryBreadCrumbs(page)
    text = HTMLParser.HTMLParser().unescape(text)
    dom = minidom.parseString(text.encode("utf-8"))
    locs = []
    # Breadcrumb entries: ancestor locations are <a> links, the current
    # location is a <strong>; collect the text content of each.
    lis = dom.getElementsByTagName("li")
    for li in lis:
        adr_list = li.getElementsByTagName("a")
        if adr_list:
            locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
        strs = li.getElementsByTagName("strong")
        if strs:
            locs.append(GetText(strs[0].childNodes).encode("utf-8"))
    cds = FindCondition(page)
    return locs, cds

def AddMap(lst, m):
    """Insert every item of `lst` into dict `m` (used as a set) with value 1.

    Existing keys are left untouched, so `m` accumulates the unique items
    seen across repeated calls.
    """
    for x in lst:
        # IDIOM FIX: the original tested `m.get(x) == None`; PEP 8 requires
        # `is None`, and what is actually meant is a membership test --
        # `not in` also avoids misreading an explicitly stored None value
        # as an absent key.
        if x not in m:
            m[x] = 1

def working():
    # Daemon worker: repeatedly takes one batch of URLs from the shared
    # queue `q`, scrapes each URL, and merges the unique location names and
    # weather-condition strings into the global `locations` / `conds` maps.
    # NOTE(review): `q`, `locks`, `locations` and `conds` are module globals
    # whose definitions were lost when the article was scraped (presumably
    # q = Queue(), locks = a list of locks, locations/conds = dicts) --
    # confirm against the original source before running.
    while True:
        urls = q.get()
        #print len(urls)
        # Per-thread staging maps so the shared lock is taken only once per
        # batch rather than once per URL.
        m = {}
        m2 = {}
        # Local batch counter (no `global` declaration), distinct from the
        # module-level `count` used by GetLocationURLs.
        count = 0
        for url in urls:
            count = count + 1
            #print "%d/%d" % (count, len(urls))
            locs, cds = ExtractInfo(url)
            AddMap(locs, m)
            AddMap(cds, m2)
        # Merge this batch's unique keys into the shared result maps.
        locks[1].acquire()
        AddMap(m.keys(), locations)
        AddMap(m2.keys(), conds)
        locks[1].release()
        q.task_done()

def main():
    # Entry point: read a list of weather URLs from the file named by
    # argv[1], split it into roughly ThreadNumber equal batches, scrape the
    # batches on daemon worker threads, then dump the unique location names
    # and condition strings to two output files.
    # NOTE(review): `sys` is used but no `import sys` survives in this
    # scrape, and `ThreadNumber` / `q` / `locations` / `conds` are module
    # globals whose definitions were also lost -- restore them before
    # running.
    if len(sys.argv) < 2:
        exit()
    loc_path = sys.argv[1]
    fp = open(loc_path, "r")
    urls = [line.strip() for line in fp]
    fp.close()
    #urls = urls[0:1000]
    # Python 2 integer division: a batch size large enough to cover all
    # URLs with at most ThreadNumber batches.
    blocks = len(urls) / ThreadNumber + 1
    for start in range(0, len(urls), blocks):
        end = start + blocks
        if end > len(urls):
            end = len(urls)
        q.put(urls[start:end])
    for i in range(ThreadNumber):
        t = Thread(target=working)
        # Daemon threads: the process can exit once q.join() returns even
        # though the workers loop forever.
        t.setDaemon(True)
        t.start()
    q.join()
    # Results are written keys-only; the values are the dummy 1s that
    # AddMap stores.
    fp = open("location_name.fr", "w")
    fp.write("\n".join(locations.keys()))
    fp.close()
    fp = open("conditions.fr", "w")
    fp.write("\n".join(conds.keys()))
    fp.close()

# Standard script guard: run the crawler only when executed directly.
if __name__ == '__main__':
    main()
# FIX: the article's closing sentence was fused onto the `main()` line by
# the scrape, making it a syntax error; it is preserved here as a comment.
# Original text: "Hope this article helps everyone with their Python
# programming." (希望本文所述对大家的python程序设计有所帮助。)

亲,试试微信扫码分享本页! *^_^*

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值