Over the National Day holiday I hand-wrote a simple multiprocess Python crawler. It's my first attempt at this sort of thing, so go easy on me. The design is two processes connected by a multiprocessing.Queue: one parses pages and discovers page/image links, the other consumes the queue and downloads the images.
#! /usr/bin/python
import re
import socket
import time
import urllib
import urllib2
import urlparse
import requests  # only needed by the requests-based variant sketched below
from multiprocessing import Process, Queue
# seed URL for the crawl
g_curUrl = 'https://ss.postcc.us/'
get_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36',
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    # no 'Accept-Encoding: gzip' here: urllib2 does not decompress gzip
    # bodies, so requesting it would corrupt the saved images
    'Connection': 'close',
    'Referer': g_curUrl}
class LinkTask:
    """Fetch one page and collect its image and anchor URLs."""
    def __init__(self, url):
        self.TaskType = 0
        self.LinkUrl = url
        self.ImgList = []
        self.LinkList = []

    def doLinkTask(self):
        print "parseLink: %s" % self.LinkUrl
        try:
            # only follow pages under the seed site
            if self.LinkUrl.find(g_curUrl) < 0:
                print 'skipping off-site link'
                return
            page = urllib.urlopen(self.LinkUrl)
            if page.getcode() != 200:
                print 'webpage open failed!'
                return
            html = page.read()
            # collect jpg sources
            imgre = re.compile(r'src="(.+?\.jpg)" ')
            self.ImgList = re.findall(imgre, html)
            print "parsed %s imgs" % len(self.ImgList)
            # collect hrefs of anchor tags
            htmlre = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            self.LinkList = re.findall(htmlre, html)
            print "parsed %s links" % len(self.LinkList)
        except IOError:
            pass
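
# A quick way to sanity-check the parser on its own (the page URL is just a
# hypothetical example; uncomment to try it):
#   task = LinkTask('https://ss.postcc.us/index.html')
#   task.doLinkTask()
#   print task.ImgList[:5]
#   print task.LinkList[:5]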
class ImgTask:
    """Download one image URL to <Count>.jpg."""
    def __init__(self, url, nCount):
        self.TaskType = 1
        self.LinkUrl = url
        self.Count = nCount

    def doImgTask(self):
        try:
            print '**********************'
            print 'download_link %s' % self.LinkUrl
            timeout = 30
            request = urllib2.Request(self.LinkUrl, None, headers=get_headers)
            response = urllib2.urlopen(request, None, timeout)
            data = response.read()  # don't shadow the builtin 'str'
            with open('%s.jpg' % self.Count, "wb") as f:
                f.write(data)
        except Exception as e:
            print 'download failed: %s' % e
        print 'downloaded %s' % self.Count
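
# An earlier draft also experimented with a requests-based download path
# (it sat behind an "if (0):" guard and never ran). Below is a cleaned-up
# sketch of that idea, assuming the requests package is installed; requests
# follows 302 redirects on its own, so the manual Location handling goes
# away, and it also decompresses gzip bodies for us.
def downloadWithRequests(url, path):
    # returns True on success; a sketch, not wired into the pipeline above
    try:
        r = requests.get(url, headers=get_headers, timeout=30)
        if r.status_code == 200:
            with open(path, "wb") as f:
                f.write(r.content)
            return True
        print 'unexpected status %d for %s' % (r.status_code, url)
    except requests.RequestException as e:
        print 'requests download failed: %s' % e
    return False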
unVisitedUrlTaskmap = {}  # url <--> LinkTask, pages not yet parsed
VisitedUrlTaskmap = {}    # url <--> LinkTask, pages already parsed
def parseHtml(ImgQueue):
    unVisitedUrlTaskmap[g_curUrl] = LinkTask(g_curUrl)
    nDownloadNum = 0
    while True:
        # note: Queue.qsize() is unreliable on some platforms (e.g. macOS)
        if ImgQueue.qsize() > 1600:
            # downloader is behind; throttle the producer
            print 'wait for consumer to drain ImgQueue'
            time.sleep(2)
        elif len(unVisitedUrlTaskmap) > 0:
            time.sleep(0.1)
            key, value = unVisitedUrlTaskmap.popitem()
            value.doLinkTask()
            VisitedUrlTaskmap[key] = value
            # queue newly discovered pages, resolving relative links against
            # the page they were found on (not the seed URL)
            for i in value.LinkList:
                i = urlparse.urljoin(value.LinkUrl, i)
                if i not in VisitedUrlTaskmap and i not in unVisitedUrlTaskmap:
                    unVisitedUrlTaskmap[i] = LinkTask(i)
            # queue the images found on this page
            for i in value.ImgList:
                img = urlparse.urljoin(value.LinkUrl, i)
                ImgQueue.put(ImgTask(img, nDownloadNum))
                nDownloadNum += 1
        else:
            time.sleep(0.1)
def downloadImgList(ImgQueue):
print 'process Download Imglist start'
while True:
print 'Download Imgqueue size = %s' % ImgQueue.qsize()
time.sleep(0.1)
if ImgQueue.qsize() >0:
value = ImgQueue.get(True)
imtask = value
imtask.doImgTask()
else:
print 'dowmload_sleep'
time.sleep(2)
print 'Download_Img_thread_exit !!!'
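
# multiprocessing.Queue is already safe to share between processes, so the
# download stage could be scaled out by starting several consumers. A sketch
# (the count of 4 is an arbitrary choice, not from the original):
#   downloaders = [Process(target=downloadImgList, args=(ImgQueue,))
#                  for _ in range(4)]
#   for p in downloaders:
#       p.start()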
if __name__ == '__main__':
    socket.setdefaulttimeout(30)
    ImgQueue = Queue()
    pIn = Process(target=parseHtml, args=(ImgQueue,))
    pOut = Process(target=downloadImgList, args=(ImgQueue,))
    pIn.start()
    pOut.start()
    # parseHtml loops forever, so in practice this join only returns if the
    # parser process dies; the downloader is then killed outright
    pIn.join()
    pOut.terminate()
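
Since parseHtml never returns, pOut.terminate() can kill the downloader mid-write. If you later bound the crawl (say, stop after N pages), a friendlier pattern is a sentinel ("poison pill") on the queue. This is just a sketch of that idea, not part of the script above:

SENTINEL = None  # pushed once the producer is done

def downloadImgListGraceful(ImgQueue):
    while True:
        imtask = ImgQueue.get(True)   # blocks until an item arrives
        if imtask is SENTINEL:        # producer signalled end-of-work
            break
        imtask.doImgTask()

# in __main__, after the (now bounded) parser finishes:
#   pIn.join()
#   ImgQueue.put(SENTINEL)
#   pOut.join()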