Python Crawler


Over the National Day holiday I hand-wrote a simple multi-process crawler in Python. Um, this is my first attempt at this sort of thing, so go easy on me.

#! /usr/bin/python

import urllib2
import urllib
import re
import time
import urlparse
import socket
import requests

from multiprocessing import Process, Queue
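
# Structure: two processes share one Queue. parseHtml (the producer) crawls
# pages and collects image/link URLs; downloadImgList (the consumer) pulls
# image tasks off the Queue and saves them to disk.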


# bookkeeping globals (largely unused in this version)
g_ndownloadedNum = 0
g_nVisitedTime = 0
g_visitedLinks = []
g_unvisitedLinks = []
g_curUrl = 'https://ss.postcc.us/'


get_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36',
	'Accept': 'text/html;q=0.9,*/*;q=0.8',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
	'Accept-Encoding': 'gzip',	# note: urllib2 does not transparently decompress gzip responses
	'Connection': 'close',
	'Referer': g_curUrl}
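
# A LinkTask wraps one page URL; doLinkTask() fetches the page with urllib
# and fills ImgList (image URLs) and LinkList (outbound links) via regexes.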

class LinkTask:
	def __init__(self,url):
		self.TaskType = 0
		self.LinkUrl = url
		self.ImgList = []
		self.LinkList = []
		
	def doLinkTask(self):
		print "parseLink: %s" % self.LinkUrl
		try:
			# only follow links that stay on the target site
			nPos = self.LinkUrl.find(g_curUrl)
			if nPos < 0:
				print 'skipping off-site link'
				return 

			# fetch the page
			page = urllib.urlopen(self.LinkUrl)
			if (page.getcode() != 200):
				print 'webpage open failed!'
				return 
				
			html = page.read()
			# filter for large pics: match .jpg URLs inside src attributes
			reg = r'src="(.+?\.jpg)" '
#			reg = r'img class=.+? src="(.+?\.jpg)" title=.+? '
			imgre = re.compile(reg)
			self.ImgList = re.findall(imgre,html)
			print "parsed %s imgs" % len(self.ImgList)
			
#			reg = r'href="(.+?\.html)" '
			reg = r'<a[^>]+href=["\'](.*?)["\']'
			htmlre = re.compile(reg,re.IGNORECASE)
			self.LinkList = re.findall(htmlre,html)
			print "parse %s Lins" % len(self.LinkList)						
		except IOError:
			pass
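
# An ImgTask wraps one image URL and a running count used as the filename;
# doImgTask() downloads the bytes with urllib2 and writes <Count>.jpg.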
		
class ImgTask:
	def __init__(self,url,nCount):
		self.TaskType = 1
		self.LinkUrl = url
		self.Count = nCount
		
	def doImgTask(self):
		try:
#			urllib.urlretrieve(self.LinkUrl,'%s.jpg' % self.Count)
#			with open('%s.jpg' % self.Count,"wb") as f:
#				f.write(self.LinkUrl)
#				f.close()
			print '**********************'
			print 'download_link %s' % self.LinkUrl			
			timeout = 30
			request = urllib2.Request(self.LinkUrl,None,headers = get_headers)
			response = urllib2.urlopen(request,None,timeout)
			data = response.read()	# renamed from str to avoid shadowing the builtin
			with open('%s.jpg' % self.Count,"wb") as foo:
				foo.write(data)

			if (0):	# disabled alternative download path using requests
				print '**********************'
				response = requests.get(self.LinkUrl,headers = get_headers)
				print 'response.status_code %d' % response.status_code
				if (response.status_code == 404):
					print 'response.status_code %d' % response.status_code
					# a 404 here may hide a redirect; retry without following it
					response = requests.get(self.LinkUrl,headers = get_headers,allow_redirects=False)
					if (response.status_code == 302):
						url = response.headers['location']
						response = requests.get(url)
						with open('%s.jpg' % self.Count,"wb") as foo:
							foo.write(response.content)
				elif (response.status_code == 200):
					with open('%s.jpg' % self.Count,"wb") as foo:
						foo.write(response.content)
				
#			print 'download_link %s' % self.LinkUrl
		except Exception:	# swallow network errors and move on
			pass
		print 'downloaded %s' % self.Count

unVisitedUrlTaskmap = {} #linkmap url <--> task
VisitedUrlTaskmap = {}
ImgTaskList = {}	
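
# Producer: pop an unvisited LinkTask, parse it, push its images onto the
# shared Queue, and schedule any newly discovered links for a later visit.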

def parseHtml(ImgQueue):
	url = g_curUrl
	unVisitedUrlTaskmap[url] = LinkTask(url)
	nDownloadNum = 0
	while True:
		if ImgQueue.qsize() > 1600:	# back-pressure: let the consumer catch up (qsize() is approximate)
			print 'wait for consume ImgQueue'
			time.sleep(2)
		elif len(unVisitedUrlTaskmap) > 0:
			time.sleep(0.1)
			key,value = unVisitedUrlTaskmap.popitem()
			value.doLinkTask()
			VisitedUrlTaskmap[key] = value
			print 'parsed %s' % key
				
			linkList = value.LinkList		
			for i in linkList:
				i = urlparse.urljoin(key,i)	# resolve relative links against the page they came from
				if i in VisitedUrlTaskmap or i in unVisitedUrlTaskmap:	# skip links already seen or queued
					pass
				else:
					unVisitedUrlTaskmap[i] = LinkTask(i)
							
			imgList = value.ImgList
			for i in imgList:
				ImgValue = ImgTask(i,nDownloadNum)
				ImgQueue.put(ImgValue)
				nDownloadNum +=1
		else:
			time.sleep(0.1)
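
# Consumer: drain the shared Queue and download each queued image.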
		
def downloadImgList(ImgQueue):
	print 'process Download Imglist start'
	while True:
		print 'Download Imgqueue size = %s' % ImgQueue.qsize()
		time.sleep(0.1)
		if ImgQueue.qsize() >0:
			value = ImgQueue.get(True)
			imtask = value
			imtask.doImgTask()		
		else:
			print 'download_sleep'
			time.sleep(2)
	print 'Download_Img_thread_exit !!!'	# unreachable: the loop above never breaks

if __name__ == '__main__':
	socket.setdefaulttimeout(30)
	ImgQueue = Queue()
	pIn = Process(target = parseHtml,args= (ImgQueue,))
	pOut = Process(target = downloadImgList, args = (ImgQueue,))

	pIn.start()
	pOut.start()
	pIn.join()
	pOut.terminate()
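
The script above is Python 2 only (urllib2, urlparse and print statements do not exist in Python 3). For anyone on Python 3, here is a minimal sketch of the same producer/consumer pattern using only the standard library; it is my own rough port, untested against the site above, so treat the regexes and error handling as illustrative assumptions rather than a drop-in replacement.

#! /usr/bin/python3
import re
from multiprocessing import Process, Queue
from urllib.parse import urljoin
from urllib.request import Request, urlopen

START = 'https://ss.postcc.us/'	# same start URL as the Python 2 version

def produce(q):
	# crawl loop: fetch a page, queue its .jpg URLs, schedule unseen same-site links
	seen = {START}
	todo = [START]
	count = 0
	while todo:
		url = todo.pop()
		try:
			html = urlopen(Request(url), timeout=30).read().decode('utf-8', 'ignore')
		except OSError:
			continue
		for img in re.findall(r'src="(.+?\.jpg)"', html):
			q.put((urljoin(url, img), count))
			count += 1
		for link in re.findall(r'<a[^>]+href=["\'](.*?)["\']', html, re.IGNORECASE):
			link = urljoin(url, link)
			if link.startswith(START) and link not in seen:
				seen.add(link)
				todo.append(link)

def consume(q):
	# pull (img_url, index) pairs off the queue and write them to disk
	while True:
		img_url, count = q.get()
		try:
			data = urlopen(Request(img_url), timeout=30).read()
			with open('%s.jpg' % count, 'wb') as f:
				f.write(data)
		except OSError:
			pass

if __name__ == '__main__':
	q = Queue()
	p_in = Process(target=produce, args=(q,))
	p_out = Process(target=consume, args=(q,))
	p_in.start()
	p_out.start()
	p_in.join()
	p_out.terminate()

Calling terminate() on the consumer mirrors the original main block: once the producer runs out of links, the downloader is killed rather than signalled to exit cleanly.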

