A simple Python program for scraping hydrological data from web pages

A professor in our group recently needed hydrological data for a few gauging stations for his research. I happened to be learning Python at the time and found that it has many convenient libraries for fetching and parsing web pages, so I wrote a scraper myself. It works, more or less, haha.
The implementation makes simple use of a queue shared between threads, worker threads, and web-page parsing; a short sketch of the thread-pool pattern from reference 2 follows the list. My main references were:
1.http://www.ibm.com/developerworks/aix/library/au-threadingpython/
2.http://www.pythoner.cn/home/blog/parallelism-in-one-line/
3.http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
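
Reference 2 condenses the worker-thread boilerplate into essentially one line using multiprocessing.dummy. As a point of comparison with the hand-rolled queues in my code below, a minimal self-contained sketch of that pattern might look like this (the URLs and the fetch function here are placeholders, not part of my program):

# Sketch of the thread-pool pattern from reference 2 (placeholders only)
from multiprocessing.dummy import Pool as ThreadPool
import urllib2

def fetch(url):
    # fetch one page; each call runs in one of the pool's worker threads
    return urllib2.urlopen(url).read()

urls = ['http://example.com/a', 'http://example.com/b']  # placeholder URLs
pool = ThreadPool(4)           # 4 worker threads
pages = pool.map(fetch, urls)  # blocks until every page is fetched
pool.close()
pool.join()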

Here is the code; a brief explanation follows after the listing.

The code:

# -*- coding: utf-8 -*-
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup   # BeautifulSoup 3
from datetime import date
from dateutil.relativedelta import relativedelta
import Queue
import threading

# queue feeds days to the fetcher thread; out_queue carries (day, html)
# pairs from the fetcher to the parser
queue = Queue.Queue()
out_queue = Queue.Queue()

# Date generator: yields every day from start (inclusive) to end (exclusive)
def loopDay(start, end):
    while start < end:
        yield start
        start = start + relativedelta(days=1)
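# Example: loopDay(date(2006, 1, 1), date(2006, 1, 4)) yields the dates
# 2006-01-01, 2006-01-02 and 2006-01-03; the end date itself is excluded.
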
# POST the form data to the target site (the Hunan hydrology query system)
# and return the raw HTML of the response
def post(url, data):
    req = urllib2.Request(url)
    data = urllib.urlencode(data)
    # enable cookie handling so the session survives across requests
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    response = opener.open(req, data)
    return response.read()

# Request the page for one date; the form field names are the site's own:
# nian/yue/ri are year/month/day, and shi is the observation time
def getHtml(time):
    posturl = 'http://61.187.56.156/wap/hnsq_BB2.asp'
    data = {}
    data['nian'] = str(time.year)
    data['yue'] = str(time.month)
    data['ri'] = str(time.day)
    data['shi'] = '08:00'
    html = post(posturl, data)
    return html
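# For example, for date(2006, 1, 1) the body posted to the server is the
# url-encoded form nian=2006&yue=1&ri=1&shi=08%3A00 (field order may vary).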

# Find the position of a station-name keyword in the list of <td> tags;
# return -1 if it is not on the page. Re-encoding the tag text as
# windows-1252 recovers the page's raw GBK bytes when BeautifulSoup has
# fallen back to a windows-1252 decoding, so the match is made byte-for-byte
# against the GBK-encoded keyword.
def findKeyword(keyword, tagList):
    for location, tag in enumerate(tagList):
        if keyword.encode('GBK') in unicode(tag.string).encode("windows-1252"):
            return location
    return -1

# Worker thread: fetches the page for each day taken from the input queue
class ThreadUrl(threading.Thread):
    """Threaded Url Grab"""
    def __init__(self, queue, out_queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        while True:
            oneDay = self.queue.get()
            print oneDay

            # grab the webpage for this day
            webpage = getHtml(oneDay)

            # hand the (day, page) pair to the parsing thread
            self.out_queue.put((oneDay, webpage))

            # mark the task finished so queue.join() can return
            self.queue.task_done()

# Extract the data we need from each fetched page and append it to one
# text file per station
class DatamineThread(threading.Thread):
    """Threaded Url Parsing"""
    def __init__(self, out_queue):
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        while True:
            (oneDay, page) = self.out_queue.get()

            # parse the page
            soup = BeautifulSoup(page, fromEncoding="gb2312")
            tagList = soup.findAll('td')
            locations = []
            for keyword in keywords:
                locations.append(findKeyword(keyword, tagList))
            for i in range(len(keywords)):
                if locations[i] < 0:
                    continue    # station not found on this page
                fileName = keywords[i] + ".txt"
                f = open(fileName, "a+")
                # the cells 2 and 5 positions after the station name hold
                # the water level and the water storage
                waterLevel = unicode(tagList[locations[i] + 2].string).strip().encode("windows-1252")
                waterStorage = unicode(tagList[locations[i] + 5].string).strip().encode("windows-1252")
                f.write(str(oneDay) + "\t" + waterLevel + "\t" + waterStorage + "\n")
                f.close()
            self.out_queue.task_done()

if __name__ == '__main__':
    # station names to search for on each page
    keywords = [u"寸滩", u"万县", u"巫山", u"清溪场", u"忠县", u"武隆"]
    startDay = date(2006, 1, 1)
    endDay = date(2007, 1, 1)

    days = loopDay(startDay, endDay)

    # single fetcher thread; daemon so it exits with the main thread
    t = ThreadUrl(queue, out_queue)
    t.setDaemon(True)
    t.start()

    # enqueue every day in the requested range
    for oneDay in days:
        queue.put(oneDay)

    # single parser thread keeps each output file in date order
    dt = DatamineThread(out_queue)
    dt.setDaemon(True)
    dt.start()

    # wait until every day has been fetched, then until every page is parsed
    queue.join()
    out_queue.join()
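
A brief explanation, as promised: loopDay() yields every day from startDay up to (but not including) endDay; the ThreadUrl worker takes one day at a time off queue, POSTs that date to the query page, and puts the (day, html) pair onto out_queue; the DatamineThread worker parses each page with BeautifulSoup, locates each station name among the <td> cells, and appends that day's water level and water storage (the cells 2 and 5 positions after the name) to the station's own text file as one tab-separated line. The final queue.join() and out_queue.join() calls block until every queued day has been fetched and parsed. A single parser thread is enough here, and it keeps the lines in each output file in date order.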

I registered here ages ago, and this is my first post. The network is slow right now; I'll polish this later.
