在 Python 中使用多线程下载网页

最新推荐文章于 2024-04-15 17:12:45 发布

karlzheng

最新推荐文章于 2024-04-15 17:12:45 发布

阅读量6.3k

点赞数

文章标签：多线程 python cache import socket

本文链接：https://blog.csdn.net/ZhengKarl/article/details/2976736

版权

import urllib2
import socket
import threading
import fcntl
#import portalocker

# Serializes access to the shared file objects between worker threads.
# flock() locks belong to the open file description, so threads of one
# process that share the same file object do not exclude each other through
# flock() alone; the flock calls are kept for their original effect.
_IO_LOCK = threading.Lock()


def _locked_readline(fileobj):
    """Read one line from fileobj while holding the thread lock and an exclusive flock."""
    with _IO_LOCK:
        fcntl.flock(fileobj, fcntl.LOCK_EX)
        try:
            return fileobj.readline()
        finally:
            # Always release, even if readline() raises.
            fcntl.flock(fileobj, fcntl.LOCK_UN)


def _locked_write(fileobj, text):
    """Write text to fileobj while holding the thread lock and an exclusive flock."""
    with _IO_LOCK:
        fcntl.flock(fileobj, fcntl.LOCK_EX)
        try:
            fileobj.write(text)
        finally:
            # Always release, even if write() raises.
            fcntl.flock(fileobj, fcntl.LOCK_UN)


def _has_cache_lifetime(headers):
    """Return True if headers contain 'Expires' or a Cache-Control max-age directive."""
    if 'Expires' in headers:
        return True
    cache_control = headers.get('cache-control')
    # str.find() returns the int -1 when absent; the previous comparison
    # against the string '-1' was always true. Directive names are
    # case-insensitive, hence the lower().
    return bool(cache_control) and 'max-age' in cache_control.lower()


def test_urls(f1, f2, f3, f4, threadnum):
    """Classify URLs read from f1 by the caching headers of their responses.

    Lines are read from f1 one at a time until readline() returns an empty
    string. Each line is opened as a URL through urllib2 and then written,
    unchanged, to exactly one output file:

    f1        -- readable file object with one URL per line (shared by workers)
    f2        -- receives URLs whose response has an 'Expires' header or a
                 'cache-control' header containing 'max-age'
    f3        -- receives URLs whose response has neither
    f4        -- receives URLs that could not be opened (IOError/ValueError)
    threadnum -- identifier printed after each processed URL

    Returns None.
    """
    while True:
        urlstr = _locked_readline(f1)
        if not urlstr:
            # Empty string means end of file: this worker is done.
            break
        datastream = None
        try:
            request = urllib2.Request(urlstr)
            opener = urllib2.build_opener()
            datastream = opener.open(request)
            if datastream:
                if _has_cache_lifetime(datastream.headers):
                    _locked_write(f2, urlstr)
                else:
                    _locked_write(f3, urlstr)
        except (IOError, ValueError):
            # IOError covers URL/socket failures; ValueError is raised for
            # blank or malformed lines ("unknown url type"), which would
            # otherwise terminate the worker thread.
            _locked_write(f4, urlstr)
        finally:
            if datastream is not None:
                datastream.close()
        print(threadnum)

# Default timeout, in seconds, applied to every socket created after this
# call; the urllib2 requests made by the workers pick it up.
timeout = 10
socket.setdefaulttimeout(timeout)

# Number of worker threads sharing the input and output files.
num = 10

# The with-statement closes all four files even if starting or joining a
# thread raises. The output file names are kept exactly as they were.
with open("urls.txt", "r") as f1, \
        open('haveExprires.txt', 'w') as f2, \
        open('noExprires.txt', 'w') as f3, \
        open('cantBeOpen_urls.txt', 'w') as f4:
    threads = [
        threading.Thread(target=test_urls, args=(f1, f2, f3, f4, x))
        for x in range(num)
    ]
    for t in threads:
        t.start()
    # Wait for every worker before the files are closed.
    for t in threads:
        t.join()