Examples of Using the pycURL Module on Ubuntu 14.04 64-bit

PycURL is reputed to be the most efficient solution for multi-threaded page fetching in Python; in essence it is a wrapper around the libcurl C library.

On Linux there is the widely used command curl (extremely handy), and behind curl sits the famous libcurl library. libcurl is powerful and very efficient. Besides its native C API, libcurl has bindings for as many as 40 programming languages; PycURL, introduced here, is libcurl's Python binding.

When issuing GET/POST and other requests from Python with high performance in mind, libcurl is an excellent choice: it is generally quite a bit faster than urllib/urllib2, and may well be more efficient than Requests. It shines in particular when PycURL issues many requests concurrently. Personally, I find its only drawback to be that, since it calls the libcurl C library directly, PycURL's function interfaces still look much like their C counterparts and are not very Pythonic, so the learning curve is a little steeper than urllib's.
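As a rough way to check such claims, the sketch below (my addition, not from the original post) times one GET through urllib2 and one through PycURL; the URL is just an example and the numbers depend heavily on network conditions, so treat it as a starting point rather than a benchmark:

#!/usr/bin/env python
#-*- coding: utf-8 -*-
# Rough timing sketch: one GET via urllib2 vs one via pycurl.
# Not a rigorous benchmark; results vary with network conditions.

import time, urllib2, pycurl, cStringIO

url = 'http://www.dianping.com/shanghai'   # example URL

t0 = time.time()
body1 = urllib2.urlopen(url).read()        # plain urllib2 fetch
t1 = time.time()

b = cStringIO.StringIO()
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, b.write)         # collect the body in memory
c.perform()
c.close()
t2 = time.time()

print 'urllib2: %.3fs (%d bytes)' % (t1 - t0, len(body1))
print 'pycurl : %.3fs (%d bytes)' % (t2 - t1, len(b.getvalue()))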

https://github.com/pycurl/pycurl                               // source code of the pycurl module
Once a Sphinx environment is set up, build the documentation by running make docs from the git checkout;
the docs appear under build/doc, where you can open index.html to browse them.

Below are a few hands-on examples of mine.

1. The simplest page fetch

#!/usr/bin/env python
#-*- coding: utf-8 -*-

import sys, pycurl, time, cStringIO


sys.stderr.write("pycURL version [%s]\n" % pycurl.version)

start_time = time.time()

url = 'http://www.dianping.com/shanghai'
b = cStringIO.StringIO()            # in-memory buffer for the response body
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, b.write)  # libcurl writes the response body into the buffer
c.perform()
end_time = time.time()

content = b.getvalue()

duration = end_time - start_time
print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
c.close()

print 'pycurl takes [%s] seconds to get [%s]' % (duration, url)
print 'length of the content is [%d]' % len(content)
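The example above is Python 2 (print statements, cStringIO); on a newer system the same fetch under Python 3 would look roughly like the sketch below, with io.BytesIO standing in for cStringIO (my adaptation, not from the original post):

#!/usr/bin/env python3
# Python 3 sketch of the same fetch: io.BytesIO replaces cStringIO and
# the response body arrives as bytes that must be decoded if text is needed.

import io
import pycurl

url = 'http://www.dianping.com/shanghai'
buf = io.BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, buf.write)   # pycurl hands the buffer raw bytes
c.perform()
print(c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL))
c.close()
print('length of the content is [%d]' % len(buf.getvalue()))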

2. A simple pycURL wrapper class

#!/usr/bin/env python
#encoding: utf-8

import sys, pycurl, cStringIO, urllib

class Curl:
    def __init__(self):
        self.c = pycurl.Curl()

    def __del__(self):
        self.c.close()

    def init(self, verbose):
        c = self.c
        c.setopt(c.FOLLOWLOCATION, 1)
        c.setopt(c.MAXREDIRS, 5)
        c.setopt(c.CONNECTTIMEOUT, 30)
        c.setopt(c.TIMEOUT, 300)
        c.setopt(c.NOSIGNAL, 1)
        c.setopt(c.USERAGENT, "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36")
        c.setopt(c.VERBOSE, verbose)

    def get(self, url):
        b = cStringIO.StringIO()
        c = self.c
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, b.write)
        c.perform()
        content = b.getvalue()
        print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)
        b.close()
        return content

    def post(self, url, data):
        b = cStringIO.StringIO()
        c = self.c
        c.setopt(c.POSTFIELDS, urllib.urlencode(data))  # setting POSTFIELDS implies an HTTP POST
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, b.write)
        c.perform()
        content = b.getvalue()
        print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)
        b.close()
        return content

    def purge(self, url):
        # PURGE is a custom method understood by caching proxies such as Squid/Varnish
        cmd = 'PURGE'
        proxy = '127.0.0.1:8080'
        c = self.c
        c.setopt(c.URL, url)
        c.setopt(c.PROXY, proxy)
        c.setopt(c.CUSTOMREQUEST, cmd)
        c.perform()
        status = c.getinfo(c.HTTP_CODE)
        print "HTTP CODE: ", status
        return status

if __name__ == '__main__':
    page = 'http://news.sohu.com/'
    c = Curl()
    c.init(True)
    c.get(page)

    page1 = 'http://www.google.com/'
    post_data_dic = {"name":"value"}
    c.post(page1, post_data_dic)

    page2 = 'http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg'
    c.purge(page2)
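One caveat with this wrapper class (my note, not in the original): because a single Curl handle is reused across calls, options set by post(), such as POSTFIELDS, stick to the handle, so a subsequent get() would still issue a POST. libcurl's HTTPGET option switches the handle back to a plain GET; a minimal sketched fix for get() might look like:

    def get(self, url):
        b = cStringIO.StringIO()
        c = self.c
        c.setopt(c.HTTPGET, 1)              # reset to GET in case post() ran earlier
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, b.write)
        c.perform()
        content = b.getvalue()
        print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)
        b.close()
        return content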

3. A simple wrapper around pycURL's multi interface

#!/usr/bin/env python
#encoding: utf-8

import sys, pycurl, cStringIO

class MCurl:
    def __init__(self, tasks, concurrent):
        self.taskQ = tasks
        self.taskQ_size = len(tasks)
        self.max_conn = concurrent
        self.resp_dict = {}
        self.m = pycurl.CurlMulti()

    def __del__(self):
        self.m.close()

    def add_tasks(self):
        self.max_conn = min(self.taskQ_size, self.max_conn)
        assert 1 <= self.max_conn <= 100, "invalid number of concurrent urls"
        print "===Getting %d urls using %d concurrent cURL handle pool===" % (self.taskQ_size, self.max_conn)

        self.m.handles = []
        for i in range(self.max_conn):
            c = pycurl.Curl()
            c.fp = None
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 30)
            c.setopt(pycurl.TIMEOUT, 300)
            c.setopt(pycurl.NOSIGNAL, 1)
            self.m.handles.append(c)
        self.resp_dict['total'] = self.taskQ_size
        self.resp_dict['succ'] = []
        self.resp_dict['fail'] = []

    def process_tasks(self):
        freelist = self.m.handles[:]
        queue = self.taskQ
        num_processed = 0
        while num_processed < self.taskQ_size:
            #if there is a URL to process and a free curl handle, add it to the multi stack
            while queue and freelist:
                url, filename = queue.pop(0)
                c = freelist.pop()
                c.fp = open(filename, "wb")
                c.setopt(pycurl.URL, url)
                c.setopt(pycurl.WRITEDATA, c.fp)
                self.m.add_handle(c)
                #store some info for use later
                c.filename = filename
                c.url = url
            #run the internal curl state machine for the multi stack
            while 1:
                ret, num_handles = self.m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            #check if curl handle has terminated, and add them to the freelist
            while 1:
                num_q, ok_list, err_list = self.m.info_read()
                for c in ok_list:
                    c.fp.close()
                    c.fp = None
                    self.resp_dict['succ'].append(c.url)
                    self.m.remove_handle(c)
                    print ("Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL))
                    freelist.append(c)
                for c, errno, errmsg in err_list:
                    c.fp.close()
                    c.fp = None
                    self.resp_dict['fail'].append(c.url)
                    self.m.remove_handle(c)
                    print("Failed: ", c.filename, c.url, errno, errmsg)
                    freelist.append(c)
                num_processed = num_processed + len(ok_list) + len(err_list)
                if num_q == 0:
                    break
            #currently no more I/O is pending, we just call select() to sleep until some more data is available
            self.m.select(1.0)

    def del_tasks(self):
        for c in self.m.handles:
            if c.fp is not None:
                c.fp.close()
                c.fp = None
            c.close()

    def dump_process(self):
        print self.resp_dict

#========= main entry point ==========
#give tasks info
urls = ["http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg",
"http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg", "", "http://m2.biz.itc.cn/pic/new/n/93/87/Img7798793_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/92/87/Img7798792_n.jpg", "http://m3.biz.itc.cn/pic/new/n/94/91/Img7799194_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/96/87/Img7798796_n.jpg", "http://m2.biz.itc.cn/pic/new/n/97/87/Img7798797_n.jpg",
"http://m1.biz.itc.cn/pic/new/n/16/88/Img7798816_n.jpg", "http://m2.biz.itc.cn/pic/new/n/17/88/Img7798817_n.jpg",
"http://m4.biz.itc.cn/pic/new/n/95/87/Img7798795_n.jpg", "http://m4.biz.itc.cn/pic/new/n/91/91/Img7799191_n.jpg"]

concurr = 6
queue = []
for url in urls:
    url = url.strip()
    if not url or url[0] == "#":
        continue
    filename = "./sohu_%03d.jpg" % (len(queue) + 1)
    queue.append((url, filename))

mc = MCurl(queue, concurr)
mc.add_tasks()
mc.process_tasks()
mc.del_tasks()
mc.dump_process()
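The class above writes each response to a file; if the bodies are wanted in memory instead, a cStringIO buffer can be attached to each handle. A minimal sketch of that variation (mine, not from the original post), using the plain CurlMulti loop from the pycurl documentation:

#!/usr/bin/env python
#encoding: utf-8
# Sketch: fetch several URLs concurrently with CurlMulti, collecting each
# body into an in-memory buffer instead of a file. URLs are examples only.

import pycurl, cStringIO

urls = ['http://news.sohu.com/', 'http://www.dianping.com/shanghai']

m = pycurl.CurlMulti()
handles = []
for url in urls:
    c = pycurl.Curl()
    c.buf = cStringIO.StringIO()             # per-handle response buffer
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, c.buf.write)
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    m.add_handle(c)
    handles.append(c)

# standard multi loop: drive transfers, then wait for socket activity
while 1:
    ret, num_handles = m.perform()
    if ret != pycurl.E_CALL_MULTI_PERFORM:
        break
while num_handles:
    m.select(1.0)
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

for c in handles:
    print c.getinfo(pycurl.EFFECTIVE_URL), c.getinfo(pycurl.HTTP_CODE), len(c.buf.getvalue())
    m.remove_handle(c)
    c.close()
m.close()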

[Run screenshot]


4. Implementing PURGE and other custom request methods

#!/usr/bin/env python
#encoding: utf-8

import sys, pycurl, cStringIO, urllib

url = 'http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg'
cmd = 'PURGE'
#cmd = 'DELETE'
proxy = '127.0.0.1:8080'
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.VERBOSE, 1)
c.setopt(c.PROXY, proxy)
c.setopt(c.CUSTOMREQUEST, cmd)
try:
    c.perform()
except Exception as e:
    print e
status = c.getinfo(c.HTTP_CODE)
print "HTTP CODE: ", status
c.close()

[Run screenshot]



Notes:

1. When posting a form, you only need to set

c.setopt(c.POSTFIELDS, postfields)

This option automatically changes the HTTP request method to POST.

The source file pycurl/examples/quickstart/form_post.py is a canonical example; a minimal sketch follows below.
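For reference, a minimal form post along the lines of that quickstart example might look like this sketch (the endpoint and field names are placeholders):

#!/usr/bin/env python
#encoding: utf-8
# Minimal form-post sketch modeled on pycurl's form_post.py quickstart;
# the URL and form fields below are placeholders, not a real endpoint.

import pycurl, urllib, cStringIO

b = cStringIO.StringIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://example.com/form')          # placeholder endpoint
postfields = urllib.urlencode({'field': 'value'})   # url-encode the form data
c.setopt(c.POSTFIELDS, postfields)                  # setting this implies POST
c.setopt(c.WRITEFUNCTION, b.write)
c.perform()
print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)
c.close()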

2. A representative example of asynchronous batch retrieval is pycurl/examples/retriever-multi.py.

3. To use a custom request method:

c.setopt(pycurl.CUSTOMREQUEST, "DELETE")

See the official documentation for details.
