在 Python 中使用多线程下载网页

import urllib2
import socket
import threading
import fcntl
#import portalocker

# Serializes every access to the shared file objects across worker threads.
# flock() locks belong to the open file description, so threads that share one
# file object never block each other on it; an in-process lock is needed.
_FILE_LOCK = threading.Lock()


def _append_line(fileobj, url):
    """Write *url* as its own line to *fileobj* while holding the file lock."""
    with _FILE_LOCK:
        fileobj.write(url + "\n")


def _has_expiry(headers):
    """Return True if *headers* has 'Expires' or a Cache-Control max-age."""
    if 'Expires' in headers:
        return True
    cache_control = headers.get('cache-control')
    if not cache_control:
        return False
    # Cache-Control directive names are case-insensitive.
    return 'max-age' in cache_control.lower()


def test_urls(f1, f2, f3, f4, threadnum):
    """Fetch URLs read from *f1* and sort them by their caching headers.

    Lines are read from *f1* one at a time until end of file. Each non-blank
    line is stripped of surrounding whitespace, requested with urllib2, and
    then written (one URL per line) to exactly one output file:

    * f2 -- the response has an 'Expires' header, or a 'cache-control'
      header containing 'max-age';
    * f3 -- the response has neither;
    * f4 -- the request raised IOError (which urllib2.URLError derives
      from) or ValueError (raised for URLs urllib2 cannot interpret).

    Args:
        f1: readable file object with one URL per line; shared by all workers.
        f2, f3, f4: writable file objects shared by all workers.
        threadnum: identifier printed after each processed URL.
    """
    while True:
        with _FILE_LOCK:
            urlstr = f1.readline()
        if not urlstr:
            # readline() returns an empty string only at end of file.
            break

        url = urlstr.strip()
        if not url:
            # Blank line: nothing to fetch, and urllib2 would reject it.
            continue

        try:
            response = urllib2.build_opener().open(urllib2.Request(url))
        except (IOError, ValueError):
            _append_line(f4, url)
        else:
            try:
                if _has_expiry(response.headers):
                    _append_line(f2, url)
                else:
                    _append_line(f3, url)
            finally:
                # Release the connection instead of waiting for garbage
                # collection.
                response.close()

        # Parenthesized form prints the same text under the Python 2 print
        # statement.
        print(threadnum)

# Default timeout (in seconds) for sockets created from here on, so a request
# to an unresponsive server fails instead of waiting without limit.
timeout = 10
socket.setdefaulttimeout(timeout)

# Number of worker threads that share the URL list and the result files.
num = 10

# The `with` statement closes all four files even when opening a later file,
# or starting/joining a thread, raises.
with open("urls.txt", "r") as f1, \
        open('haveExprires.txt', 'w') as f2, \
        open('noExprires.txt', 'w') as f3, \
        open('cantBeOpen_urls.txt', 'w') as f4:
    threads = [
        threading.Thread(target=test_urls, args=(f1, f2, f3, f4, x))
        for x in xrange(num)
    ]
    for t in threads:
        t.start()
    # Wait for every worker before the files are closed.
    for t in threads:
        t.join()
 
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值