spider 4.0

# -*- coding: UTF-8 -*-
__author__ = 'apple'

import re
import urllib2
from threading import Thread
from Queue import Queue
from time import ctime

q = Queue()         # initialize the task queue
NUM = 20            # number of worker threads
imgurl = []         # list that stores the picture urls
pages = 1           # used as the filename prefix in grab()

class web_info:
    def __init__(self):
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}

    def get_herf(self, page):                            # fetch the links of the sub-pages
        address = []
        try:
            url = "http://www.meizitu.com/a/list_1_%d.html" % page
            request = urllib2.Request(url, headers=self.headers)
            respond = urllib2.urlopen(request, timeout=10)
            content = respond.read()
            pattern = re.compile('<p><a href="(.*?)"  target=.*?><img.*?src.*?style.*?/></a></p>', re.S)
            result = re.findall(pattern, content)
            for item in result:
                address.append(item)
        except urllib2.URLError, e:
            print e
            print "herf error"
        except StandardError, e:
            print e
            print "herf error"
        return address

    def get_imgurl(self, herf):                          # fetch the img urls from each sub-page
        address = []
        for url in herf:
            try:
                request = urllib2.Request(url, headers=self.headers)
                respond = urllib2.urlopen(request, timeout=10)
                content = respond.read().decode("gbk")   # the sub-pages are gbk-encoded
                pattern = re.compile('<img.*?alt=".*?" src="(.*?)" /><br />', re.S)
                result = re.findall(pattern, content)
                for item in result:
                    address.append(item)
            except urllib2.URLError, e:
                print e
                print "imgurl error"
            except StandardError, e:
                print e
                print "imgurl error"
        return address

class spider:                                            # the spider class: fetches and saves one picture
    def __init__(self):
        self.name = "spider 1"

    def read(self, jpg):                                 # read the raw bytes of a picture url
        content = urllib2.urlopen(jpg, timeout=10).read()
        return content

    def grab(self, content, category_name, name):        # save the picture to the current directory
        with open(str(category_name) + "_" + str(name) + '.jpg', 'wb') as f:   # 'wb': jpeg data is binary
            f.write(content)
            print "finished saving picture:", str(category_name) + "_" + str(name), ctime()

def download(url):                                       # concrete handler: process a single task
    a = spider()
    content = a.read(url)
    a.grab(content, pages, imgurl.index(url))            # file name: <pages>_<position in imgurl>.jpg

def working():                                           # worker thread: keep taking urls off the queue and processing them
    while True:
        arguement = q.get()
        try:
            download(arguement)
        except StandardError, e:                         # keep the worker alive if one download fails,
            print "download error:", e                   # otherwise task_done() would be skipped and
        q.task_done()                                    # q.join() would never return; task_done() signals
                                                         # the queue that this item has been fully processed

for l in xrange(1, 5):                                   # collect the img urls of pages 1-4
    a = web_info()
    herf = a.get_herf(l)
    jpg = a.get_imgurl(herf)
    for i in jpg:
        imgurl.append(i)
print "total:", len(imgurl), "pictures"
print "processing..."


print "++++start saving++++"
for n in range(NUM):                 #初始化线程
        t = Thread(target=working)
        t.setDaemon(True)            #设置守护线程
        t.start()                   #开始
for i in imgurl:                     #导入到queue
    q.put(i)
q.join()        #对队列执行 join 操作,实际上意味着等到队列为空,再退出主程序。!!!!!!!!!有待完善,有bug
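
The q.join() shutdown above is the part flagged as buggy: the workers are daemon threads, so they are killed abruptly the moment the main program exits, and if a worker ever died without calling task_done() the join would block forever. A common alternative is the poison-pill pattern: after the real urls, push one sentinel per worker, let each worker exit cleanly when it sees one, and join the threads themselves. A minimal sketch of that idea, reusing the q, NUM, imgurl and download() defined above (working_v2 is a hypothetical replacement for working()):

SENTINEL = None                                          # poison pill: one per worker thread

def working_v2():                                        # sketch: a worker that shuts down cleanly
    while True:
        url = q.get()
        if url is SENTINEL:                              # pill seen: this worker is done
            q.task_done()
            break
        try:
            download(url)
        except StandardError, e:
            print "download error:", e
        q.task_done()

threads = [Thread(target=working_v2) for n in range(NUM)]
for t in threads:                                        # no setDaemon(): we join them explicitly below
    t.start()
for i in imgurl:
    q.put(i)
for n in range(NUM):                                     # one pill per worker, queued after the real work
    q.put(SENTINEL)
for t in threads:                                        # joining the threads guarantees every download
    t.join()                                             # finished (or failed) before the program exits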
