pycurl的使用

最新推荐文章于 2021-11-04 13:51:47 发布

weixin_34376562

最新推荐文章于 2021-11-04 13:51:47 发布

阅读量522

点赞数

文章标签：数据库爬虫 ldap

原文链接：https://my.oschina.net/u/929415/blog/219850

版权

2019独角兽企业重金招聘Python工程师标准>>>

我使用的是pycurl库是python使用libcurl的接口，官网是http://pycurl.sourceforge.net。类似urllib库，pycurl用来获取通过域名访问的网络资源。它支持多项协议：FTP, FTPS, HTTP, HTTPS, SCP, SFTP, TFTP, TELNET, DICT, LDAP, LDAPS, FILE, IMAP, SMTP, POP3。
下面代码：

c = pycurl.Curl()
    #url = "http://image.baidu.com/i?tn=baiduimage&ct=201326592&lm=-1&cl=2&nc=1&word="
    url = '/duy/d' //地址
    c.setopt(pycurl.URL,url)
    c.setopt(pycurl.USERAGENT,'Mozilla/5.0 (Windows NT 6.1; rv:27.0) Gecko/20100101 Firefox/27.0')//使用的客户端
    c.setopt(pycurl.REFERER,'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q='+rand_str())//上一个网页
    c.setopt(pycurl.HTTPHEADER,['text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'])//http包头
    c.setopt(pycurl.COOKIE,cookie)//使用的cookie格式是字符串:"key=value;key=value".
    c.setopt(pycurl.VERBOSE,1)//输出调试信息
    c.setopt(pycurl.FOLLOWLOCATION, 1)//遇到302时候是否进行自动跳转
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.COOKIEFILE,"cookie_file_name")//使用的cookie的保存的文件名
    c.setopt(pycurl.COOKIEJAR, "cookie_file_name")
    c.setopt(pycurl.POST,1)//是否是post方法，默认是get
    c.setopt(pycurl.POSTFIELDS, urllib.urlencode(post_data))//post的数据，是字典：个字典:{"key":"value"}
    c.setopt(c.WRITEFUNCTION, t.body_callback)//结果写入的回调函数，可以是”
def body_callback(self,buf):
        self.contents = self.contents + buf
“
    c.setopt(pycurl.HEADERFUNCTION, d.body_callback)//同样是结果的回调函数
    c.setopt(pycurl.ENCODING, 'gzip,deflate')//编码

设置完了必要的参数之后可以调用c.perform进行请求。细节很多参数功能可以参照curl库的其他文档，和官网。下面是我自己使用的一个类，封装了pycurl：

class curl_request:
    c=None
    def __init__(self,url,action='get'):
        self.url = url
        self.url_para =None
        self.c = pycurl.Curl()
        print self.url,"     d"
        self.c.setopt(pycurl.URL,self.url)
        self.c.setopt(pycurl.USERAGENT,'Miozilla/4.0 (compatible; MSIE 6.0; WindowsNT 5.1');
        self.c.setopt(pycurl.REFERER,'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q='+rand_str())
        self.c.setopt(pycurl.COOKIE,'Hm_lvt_5251b1b3df8c7fd322ea256727293cf0=1393221156,1393223230,1393223252,1393223985;_jzqa=1.46109393469532')
        self.c.setopt(pycurl.VERBOSE,1)

        self.c.setopt(pycurl.HEADER,1)
        self.c.setopt(pycurl.FOLLOWLOCATION, 1)
        self.c.setopt(pycurl.MAXREDIRS, 5)
        self.c.setopt(pycurl.COOKIEFILE, 'cookie_file_name.txt')
        self.c.setopt(pycurl.COOKIEJAR, 'cookie_file_name.txt')
        if action == 'post':
            self.c.setopt(pycurl.POST,1)
            self.c.setopt(pycurl.POSTFIELDS, post_data = {"noe":"noe"})
        else:
            self.c.setopt(pycurl.HTTPGET,1)

#        c.setopt(c.WRITEFUNCTION, self.write)

#        c.setopt(pycurl.HEADERFUNCTION, d.body_callback)
        self.c.setopt(pycurl.ENCODING, 'gzip,deflate');

    def set_url_para(self,para):
        self.url_para = para
        url = self.url + para
        self.c.setopt(pycurl.URL,url)

    def set_post_para(self,para):
        self.c.setopt(pycurl.POST,1)
        self.c.setopt(pycurl.POSTFIELDS, urllib.urlencode( para))
    def set_cookie(self,cookie):
        self.c.setopt(pycurl.COOKIE,cookie)

    def perform(self,url='',referer=''):
        if url != '':
            self.c.setopt(pycurl.URL,url)
        if referer != '':
            self.c.setopt(pycurl.REFERER,referer)
        self.buf = cStringIO.StringIO()
        self.head = cStringIO.StringIO()
        self.c.setopt(self.c.WRITEFUNCTION, self.buf.write)
        self.c.setopt(pycurl.HEADERFUNCTION, self.head.write)
        try:
            self.c.perform()
        except Exception,e:
            self.c.close()
            self.buf.close()
            self.head.close()
        self.r = self.buf.getvalue()
        self.h = self.head.getvalue()
        self.code = self.c.getinfo(pycurl.HTTP_CODE)
        self.info = self.c.getinfo(pycurl.EFFECTIVE_URL)
        self.cookie = self.c.getinfo(pycurl.INFO_COOKIELIST)

        self.buf.close()
        self.head.close()
    def __del__(self):
        self.c.close()

    def get_body(self):
        return self.r
    def get_head(self):
        return self.h
    def get_code(self):
        return self.code
    def get_info(self):
        return self.info
    def get_cookie(self):
        return self.cookie

在涉及到某些网页需要登录才能访问时，可以设置cookie和post的数据进行登录操作。登录完成之后登录的session信息会保存到cookie文件中，以后的访问都会附带上cookie验证身份。

使用这个请求完网页内容之后，可以使用beautifulsoup来解析网页内容。这个用法类似于xml2的使用方法，可以查找，也可以遍历。

像如下的代码就是分析某个特定网站的html代码之后，进行相应图片的下载：

def get_dynamic_mm(buf):
    root_soup = BeautifulSoup(''.join( buf ),fromEncoding="utf-8")
    div = root_soup.find('div',{ "class":"mm_time"})
    if div:
        for divsub in div.div :
            if str(type(divsub)) == "<class 'BeautifulSoup.Tag'>" and divsub['class'] == "girl_info" :
                name = divsub.a.string.strip().replace(" ","")
                page = divsub.a['href']
        os.makedirs("./照片/"+name)
        img_url = div.img['src']
        get_img(img_url,name,name)
        return page
 

def get_img(url,name,path):
    while 1:
        try :
            r = urllib2.urlopen(url)
            print './照片/'+path+'/'+name+'.gif'
            f = open('./照片/'+path+'/'+name+'.gif','ab+')
            f.write(r.read())
            r.close()
            f.close()
            break
        except Exception,e:
            print 'error'
            continue

beautiful的使用文档也可以在官网找到：http://www.crummy.com/software/BeautifulSoup/

配合这两个3方库，很容易可以实现网络机器人，进行图片的下载，特定内容的监控（例如飞机票的价格）以及各个论坛去发帖。

在这里介绍一本书，比较简易，介绍了作者怎样使用php来编写爬虫，编写爬虫时应该注意的点，例如访问网站时应该间隔多长时间，以及网络机器人的用途，例如可以检测一个网页里的无效链接有多少。《Webbots, Spiders, and Screen Scrapers 2nd Editior》

下面是我自己写的简易的爬虫，爬取链接和相关的内容，把相关内容的网页内容写入sqlite文件。

使用了线程池，在线程池初始化时，启动线程，每个线程循环获取任务队列的数据，获取任务。获取到任务后就进行任务处理（爬取网页）。知道任务结束，设置flag结束所有线程。这个方法很好，以前在工作中居然没有使用过线程（池）（一直都是无尽的fork，两年的工作经验啊，真是坑了公司），可以节省系统资源，又可以灵活的调整任务的效率，相对多进程来说还节省了进程间的数据传递，以及不容易出错。

爬链接的时候只是获取 <a href=""></a>里的href字段。

爬网页的时候还进行了关键字的搜寻，搜寻到关键字后，就把内容写入队列，让主线程进行数据的写入（使用 sqlite）

import sys
import os
import re
import urllib
import urllib2
import time
import random
import pycurl
import Queue
import threading
import logging
from BeautifulSoup import BeautifulSoup
import getopt
import sqlite3
from Request import curl_request


global logger
class MyThread(threading.Thread):
    def __init__(self, workQueue, resultQueue, contentQueue, key, timeout=15):
        threading.Thread.__init__(self)
        self.mutex = threading.Lock()
        self.timeout = timeout
        self.setDaemon(True)
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.contentQueue = contentQueue
        self.start()
        self.flag = False
        self.exit_flag = False
        self.key = key
        
    def run(self):
        while True:
            try:
               # if self.mutex.acquire(1): 
                callable, args, kwargs, deep = self.workQueue.get(timeout=self.timeout)
                #self.mutex.release()
                self.flag = True
                res = callable(args,self.resultQueue,self.contentQueue,kwargs,deep,self.key)
                self.flag = False
            except Queue.Empty:
                logger.debug('queue is emtpy')
                self.flag = False
                if self.exit_flag:
                    logger.info('exit_flag set')
                    break
                continue
            except :
                print sys.exc_info()
                raise
            
class ThreadPool:
    def __init__(self, key, num_of_threads=10):
        self.workQueue = Queue.Queue()
        self.resultQueue = Queue.Queue()
        self.contentQueue = Queue.Queue()
        self.threads = []
        self.key = key
        self.__createThreadPool(num_of_threads)
       
    def __createThreadPool(self, num_of_threads):
        for i in range( num_of_threads ):
            thread = MyThread( self.workQueue, self.resultQueue, self.contentQueue, self.key )
            self.threads.append(thread)
            
    def wait_for_complete(self):
        while len(self.threads):
            thread = self.threads.pop()
            if thread.isAlive():
                thread.join()
    def get_flag(self):
        flag = False
        for thread in self.threads:
            if thread.flag:
                flag = True
        return flag
    def get_num(self):
        num = 0
        for thread in self.threads:
            if thread.flag:
                num += 1
        return num
    def set_flag(self):
        flag = False
        for thread in self.threads:
            thread.exit_flag = True
                
    def add_job(self,callable, args,kwargs, deep):
        self.workQueue.put( (callable, args, kwargs, deep) )

def resovle_address(base_url,link):
    base_url = base_url.strip()
    logger.debug('url base is: '+base_url.encode()+' and link is: '+link.encode())
    link = link.strip()
    link.replace(';','')
    link.replace('\\','')
    link.replace('\'','')
    link.replace('/./','/')
    bash = base_url.rfind('/')
    if len(link) < 1:
        return None
    if bash != -1 and base_url[:bash+1] != "http://":
        base_url = base_url[:base_url.rfind('/')]
    m = re.search("http|www",link)
    if link[0] == '/' and len(link)>1:
        logger.debug('return url is ' + base_url.encode() + link.encode())
        return base_url + link
    elif m is not None:
        logger.debug('return link is' + link.encode()) 
        return link
    return None
    
        
    
        
def crawl_url( url, resultQueue, contentQueue, sleep, deep, key):
    global logger
    logger.debug('start to crawl the url: '+url.encode()+' and deep is: '+str(deep))
    time.sleep(int(sleep[0]))
    home_url = curl_request(url)
    home_url.perform()
    buf = home_url.get_body()
    if buf is None:
        return 
    root_soup = BeautifulSoup(''.join( buf ),fromEncoding="utf-8")
    body = root_soup.body
    u = body
    logger.info('body is '+str(u))
    m = re.findall("<a.*?>",str(u))
    for sub in m:
        if len(sub) < 1:
            continue
        tag_a = BeautifulSoup(''.join( sub ),fromEncoding="utf-8")
        if tag_a.a is not None and tag_a.a.has_key('href'):
            url_s = tag_a.a['href']
            url_s = resovle_address(url,url_s)
         #   print 'geting url and deep is ',url_s,deep
            if url_s is not None:
                #print 'adding iiiiiiiiiiiiiiiiiii',url_s
                logger.info('geting url :'+url_s.encode()+'deep is :'+str(deep))
                resultQueue.put( (url_s, deep+1) )
    if u is None:
        return
    for k in u:
        if re.search(key,str(k)) is not None:
          #  print str(k)
            contentQueue.put( (str(url), str(k) ))

def Usage():
    print 'myspider.py usage:'

def get_rand():
    return random.sample([0.1,0.2,0.3,0.4,0.5],1)
def main(argv):
    global logger
    thread_num=10
    try:
        opts, args = getopt.getopt(argv[1:],'hu:d:t:l:f:i:',['key=','thread=','dbfile='])
    except getopt.GetoptError, err:
        print str(err)
        Usage()
        sys.exit(2)
    for o, a in opts:
        if o in ('-h','--help'):
            Usage()
            sys.exit(1)
        elif o in ('-u',):
            url = a
        elif o in ('-d',):
            scrawl_level = int(a)
        elif o in ('-f',):
            log_file = a
        elif o in ('-l',):
            log_level = int(a)
        elif o in ('--key'):
            key = a
        elif o in ('--thread'):
            thread_num = int(a)
        elif o in ('--dbfile'):
            dbfile = a
        else:
            print 'unhandled option'
            sys.exit(3)

    cu = None
    cx = None
    logger = logging.getLogger()
    hdlr = logging.FileHandler(log_file)
    logger.addHandler(hdlr)
    level = (6-log_level)*10
    logger.setLevel(level)
  #  logger.info("hi")
    if dbfile is not None:
        os.remove(dbfile)
        cx = sqlite3.connect(dbfile)
        cu=cx.cursor()
        cu.execute("""create table content (id INTEGER PRIMARY KEY AUTOINCREMENT,url varchar(100), content varchar(4000)  )""")
          
    logger.debug('thread num is '+str(thread_num))
    logger.debug('scrawl_level is ' + str(scrawl_level))
    
    
    tp = ThreadPool(key,thread_num)
    tp.add_job(crawl_url, url , get_rand() ,1)
    deep = 1
    time_old = time.time()
    count = 0
    while 1:
        time_new = time.time()
        if time_new - time_old > 10:
            print '已经处理链接数：',count,'正在处理链接数',tp.get_num(),'剩余未处理的链接数：',tp.resultQueue.qsize(),'未插入数据：',tp.contentQueue.qsize()
            time_old = time.time()
        try:
            url,deep= tp.resultQueue.get(timeout=0.5)
            if url is not None and int(deep) <= scrawl_level:
               # print "adding  deep",deep
                logger.info('adding url: '+url.encode()+'and deep is: '+str(deep))
                count += 1
                tp.add_job(crawl_url, url, get_rand(), deep)
        except Queue.Empty:
            if not tp.get_flag() and tp.contentQueue.qsize() == 0 and tp.resultQueue.qsize() == 0:
                print 'work done,exiting'
                tp.set_flag()
                break
        try:
            url,content= tp.contentQueue.get(timeout=0)
            if url is not None:
              #  print 'gettingiiiiiiiiii ',content,url
                cu.execute( "insert into content(url,content) values(?,?)", (str(url), content.decode('utf-8')))
        except Queue.Empty:
            continue
            
        
    if cx is not None:
        cx.commit()
        cx.close()
    tp.wait_for_complete()
    #print tp.workQueue.qsize()
    
if __name__ == '__main__':
    main(sys.argv)

转载于:https://my.oschina.net/u/929415/blog/219850