spider 4.0

# -*- coding: UTF-8 -*-
__author__ = 'apple'

import re
import urllib2
from threading import Thread
from Queue import Queue
from time import ctime

q = Queue()         # initialize the task queue
NUM = 20            # number of worker threads
imgurl = []         # list that stores the picture urls
pages = 1           # used as the filename prefix in grab()

class web_info:
    def __init__(self):
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}

    def get_herf(self, page):                            # fetch the links of the sub-pages
        address = []
        try:
            url = "http://www.meizitu.com/a/list_1_%d.html" % page
            request = urllib2.Request(url, headers=self.headers)
            respond = urllib2.urlopen(request, timeout=10)
            content = respond.read()
            pattern = re.compile('<p><a href="(.*?)"  target=.*?><img.*?src.*?style.*?/></a></p>', re.S)
            result = re.findall(pattern, content)
            for item in result:
                address.append(item)
        except urllib2.URLError, e:
            print e
            print "herf error"
        except StandardError, e:
            print e
            print "herf error"
        return address

    def get_imgurl(self, herf):                          # fetch the img urls from each sub-page
        address = []
        for url in herf:
            try:
                request = urllib2.Request(url, headers=self.headers)
                respond = urllib2.urlopen(request, timeout=10)
                content = respond.read().decode("gbk")   # the sub-pages are gbk-encoded
                pattern = re.compile('<img.*?alt=".*?" src="(.*?)" /><br />', re.S)
                result = re.findall(pattern, content)
                for item in result:
                    address.append(item)
            except urllib2.URLError, e:
                print e
                print "imgurl error"
            except StandardError, e:
                print e
                print "imgurl error"
        return address

class spider:                                            # the spider class: fetches and saves one picture
    def __init__(self):
        self.name = "spider 1"

    def read(self, jpg):                                 # read the raw bytes of a picture url
        content = urllib2.urlopen(jpg, timeout=10).read()
        return content

    def grab(self, content, category_name, name):        # save the picture to the current directory
        with open(str(category_name) + "_" + str(name) + '.jpg', 'wb') as f:   # 'wb': jpeg data is binary
            f.write(content)
            print "finished saving picture:", str(category_name) + "_" + str(name), ctime()

def download(url):                                       # concrete handler: process a single task
    a = spider()
    content = a.read(url)
    a.grab(content, pages, imgurl.index(url))            # file name: <pages>_<position in imgurl>.jpg

def working():                                           # worker thread: keep taking urls off the queue and processing them
    while True:
        arguement = q.get()
        try:
            download(arguement)
        except StandardError, e:                         # keep the worker alive if one download fails,
            print "download error:", e                   # otherwise task_done() would be skipped and
        q.task_done()                                    # q.join() would never return; task_done() signals
                                                         # the queue that this item has been fully processed

for l in xrange(1, 5):                                   # collect the img urls of pages 1-4
    a = web_info()
    herf = a.get_herf(l)
    jpg = a.get_imgurl(herf)
    for i in jpg:
        imgurl.append(i)
print "total:", len(imgurl), "pictures"
print "processing..."


print "++++start saving++++"
for n in range(NUM):                 #初始化线程
        t = Thread(target=working)
        t.setDaemon(True)            #设置守护线程
        t.start()                   #开始
for i in imgurl:                     #导入到queue
    q.put(i)
q.join()        #对队列执行 join 操作,实际上意味着等到队列为空,再退出主程序。!!!!!!!!!有待完善,有bug
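
The q.join() shutdown above is the part flagged as buggy: the workers are daemon threads, so they are killed abruptly the moment the main program exits, and if a worker ever died without calling task_done() the join would block forever. A common alternative is the poison-pill pattern: after the real urls, push one sentinel per worker, let each worker exit cleanly when it sees one, and join the threads themselves. A minimal sketch of that idea, reusing the q, NUM, imgurl and download() defined above (working_v2 is a hypothetical replacement for working()):

SENTINEL = None                                          # poison pill: one per worker thread

def working_v2():                                        # sketch: a worker that shuts down cleanly
    while True:
        url = q.get()
        if url is SENTINEL:                              # pill seen: this worker is done
            q.task_done()
            break
        try:
            download(url)
        except StandardError, e:
            print "download error:", e
        q.task_done()

threads = [Thread(target=working_v2) for n in range(NUM)]
for t in threads:                                        # no setDaemon(): we join them explicitly below
    t.start()
for i in imgurl:
    q.put(i)
for n in range(NUM):                                     # one pill per worker, queued after the real work
    q.put(SENTINEL)
for t in threads:                                        # joining the threads guarantees every download
    t.join()                                             # finished (or failed) before the program exits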
