python写一个下载器_用 python 实现一个多线程网页下载器

# -*- coding: utf-8 -*-

import time
import urllib

# The file is written against the Python 2 module names. Each fallback aliases
# the Python 3 equivalent to the same name, so the rest of the file can keep
# referring to httplib / thread / Queue unchanged.
try:
    import httplib
except ImportError:
    import http.client as httplib

try:
    import thread
except ImportError:
    import _thread as thread

try:
    from Queue import Queue, Empty, Full
except ImportError:
    from queue import Queue, Empty, Full

try:
    from urllib import urlencode
except ImportError:
    from urllib.parse import urlencode

# Default request headers; per-task headers are layered on top of these.
HEADERS = {
    "Content-type": "application/x-www-form-urlencoded",
    'Accept-Language': 'zh-cn',
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0;Windows NT 5.0)',
    "Accept": "text/plain",
}

# Status passed to the failure handler when the request raised an exception
# instead of producing an HTTP status code.
UNEXPECTED_ERROR = -1

POST = 'POST'

GET = 'GET'

def base_log(msg):
    """Default logger: write ``msg`` to standard output."""
    print(msg)


def base_fail_op(task, status, log):
    """Default failure handler: report the failed task through ``log``.

    Args:
        task: the task dict that failed.
        status: integer status (an HTTP status code or ``UNEXPECTED_ERROR``);
            it is formatted with ``%d``.
        log: callable taking one message string.
    """
    log('fail op. task = %s, status = %d' % (str(task), status))


def _append_query(url, params):
    """Return ``url`` with the already-encoded query string ``params`` attached.

    A URL that already ends in ``?`` or ``&`` gets ``params`` appended as-is;
    otherwise the missing ``?`` or ``&`` separator is inserted first.
    """
    if not params:
        return url
    if '?' not in url:
        return url + '?' + params
    if url.endswith('?') or url.endswith('&'):
        return url + params
    return url + '&' + params


def get_remote_data(tasks, results, fail_op=base_fail_op, log=base_log):
    """Worker loop: take tasks from ``tasks``, fetch them, push bodies to ``results``.

    Runs forever, so it is meant to be the target of a worker thread. Each
    task is a dict with the required keys ``'id'`` and ``'conn_args'``
    (keyword arguments for ``httplib.HTTPConnection``) and the optional keys
    ``'params'``, ``'method'`` (default ``'GET'``), ``'url'`` (default ``'/'``)
    and ``'headers'``.

    Args:
        tasks: queue-like object; ``tasks.get()`` must return the next task.
        results: queue-like object; on an HTTP 200 response
            ``(task_id, body)`` is put on it.
        fail_op: callable ``(task, status, log)`` invoked when the request
            raises (``status`` is ``UNEXPECTED_ERROR``) or the response
            status is not 200.
        log: callable taking one message string.
    """
    while True:
        task = tasks.get()
        try:
            tid = task['id']
            hpt = task['conn_args']  # hpt <= host:port, timeout
        except KeyError as e:
            # A task without the mandatory keys cannot be processed; report
            # the missing key and move on to the next task.
            log(str(e))
            continue

        log('thread_%s doing task %d' % (thread.get_ident(), tid))

        params = urlencode(task.get('params') or {})
        method = task.get('method', 'GET')
        url = task.get('url', '/')

        # Work on a copy: updating HEADERS itself would leak one task's
        # headers (and its Content-Length) into every later task.
        headers = dict(HEADERS)
        headers.update(task.get('headers') or {})
        headers['Content-Length'] = len(params)

        conn = httplib.HTTPConnection(**hpt)
        try:
            if method == POST:
                conn.request(method, url, params, headers)
            else:
                # Non-POST requests carry the parameters in the URL and are
                # sent without the custom headers.
                conn.request(method, _append_query(url, params))
            response = conn.getresponse()
            status = response.status
            # Read the body inside the guarded region so that a failure while
            # reading is reported through fail_op instead of ending the worker.
            data = response.read() if status == httplib.OK else None
        except Exception as e:
            log('request failed. method = %s, url = %s, params = %s headers = %s' % (
                method, url, params, headers))
            log(str(e))
            fail_op(task, UNEXPECTED_ERROR, log)
            continue
        finally:
            conn.close()

        if status != httplib.OK:
            fail_op(task, status, log)
            continue

        results.put((tid, data), True)

class HttpPool(object):
    """A fixed set of worker threads fed by a task queue.

    Tasks are added with ``add_task``; the bodies of the requests that
    succeeded are collected with ``get_results``.
    """

    def __init__(self, threads_count, fail_op, log):
        """Create the queues and start ``threads_count`` worker threads.

        Args:
            threads_count: number of worker threads to start.
            fail_op: failure callback handed to every worker.
            log: logging callback handed to every worker.
        """
        self._tasks = Queue()
        self._results = Queue()

        for _ in range(threads_count):
            thread.start_new_thread(
                get_remote_data,
                (self._tasks, self._results, fail_op, log))

    def add_task(self, tid, host, url, params, headers=None, method='GET',
                 timeout=None):
        """Queue one request.

        Args:
            tid: identifier returned alongside the response body.
            host: host for the connection.
            url: request path.
            params: mapping of request parameters.
            headers: optional mapping of extra request headers.
            method: HTTP method, ``'GET'`` by default.
            timeout: optional connection timeout; left out of the connection
                arguments when ``None``.

        Returns:
            ``True`` if the task was queued, ``False`` if the queue was full.
        """
        conn_args = {'host': host}
        if timeout is not None:
            conn_args['timeout'] = timeout

        task = {
            'id': tid,
            'conn_args': conn_args,
            # Stored as a private copy so the queued task does not share a
            # dict with the caller (or with a default argument).
            'headers': dict(headers or {}),
            'url': url,
            'params': params,
            'method': method,
        }
        try:
            self._tasks.put_nowait(task)
        except Full:
            return False
        return True

    def get_results(self):
        """Return every ``(tid, data)`` result currently available, without blocking."""
        results = []
        while True:
            try:
                res = self._results.get_nowait()
            except Empty:
                break
            results.append(res)
        return results

def test_google(task_count, threads_count):
    """Demo driver: queue ``task_count`` searches on a pool of ``threads_count`` workers.

    Prints the id and body length of every result as it arrives. The polling
    loop has no exit condition, so this function does not return; interrupt
    the process to stop it.
    """
    # Imported here so the function also works when this module is imported
    # rather than run as a script.
    import random

    hp = HttpPool(threads_count, base_fail_op, base_log)
    for i in range(task_count):
        if hp.add_task(i,
                       'www.google.cn',
                       '/search?',
                       {'q': 'lai'},
                       # method = 'POST'
                       ):
            print('add task successed.')

    while True:
        results = hp.get_results()
        if not results:
            # Nothing ready yet: back off for a random fraction of a second.
            time.sleep(1.0 * random.random())
        for res in results:
            print('%s %s' % (res[0], len(res[1])))

130

if __name__ == '__main__':
    # Both names are bound at module scope, so `random` stays available as a
    # module global for code that looks it up there.
    import random
    import sys

    # Fail with a usage message instead of an IndexError when the two
    # required arguments are missing.
    if len(sys.argv) < 3:
        sys.exit('usage: %s TASK_COUNT THREADS_COUNT' % sys.argv[0])

    task_count, threads_count = int(sys.argv[1]), int(sys.argv[2])
    test_google(task_count, threads_count)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值