程序片断4

import os
import sys
import re
import time
import urllib3
import urllib
import hashlib
from bs4 import BeautifulSoup

# Base listing URL; a page number is appended to it to build each page address.
url = "http://www.dbmeinv.com/?pager_offset="
class Spider:
    """Download the images found on a range of paginated listing pages.

    Page URLs are built by appending a page number to the base URL. Every
    ``<img>`` whose ``src`` starts with ``https:`` is fetched and written to
    the current working directory; an image whose MD5 digest has already been
    saved by this instance is skipped.
    """

    # Compiled once; only absolute https image sources are followed.
    _HTTPS_SRC = re.compile(r"^https:")

    def __init__(self, url):
        """Store the base URL and initialise empty crawl state.

        Args:
            url: Base page URL; a page number is appended to it.
        """
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) "
                "AppleWebKit/535.11 (KHTML, like Gecko) "
                "Chrome/17.0.963.56 Safari/535.11"
            )
        }
        self.urlBase = url
        self.url = ""        # most recently generated page URL
        self.urllist = []    # page URLs queued for crawling
        self.md5_list = []   # MD5 hex digests of images already saved
        self._http = None    # shared urllib3 pool, created on first use

    def __str__(self):
        """Return the queued page URLs, each followed by three spaces."""
        return "urllist content " + "".join(u + "   " for u in self.urllist)

    def _get_http(self):
        """Return the shared connection pool, creating it on first use."""
        pool = getattr(self, "_http", None)
        if pool is None:
            pool = self._http = urllib3.PoolManager()
        return pool

    @staticmethod
    def _safe_title(title):
        """Return *title* usable as a single file-name component.

        A missing title becomes ``"untitled"``; path separators are replaced
        with ``_`` so the name cannot point into another directory.
        """
        cleaned = title or "untitled"
        for sep in (os.sep, os.altsep):
            if sep:
                cleaned = cleaned.replace(sep, "_")
        return cleaned

    def mkNewDir(self, newName):
        """Create directory *newName* if needed and make it the working directory.

        Args:
            newName: Directory path to create and enter.

        Raises:
            OSError: If the directory cannot be created or entered.
        """
        # os.makedirs keeps user input away from a shell and handles names
        # containing spaces; an already existing directory is reused.
        os.makedirs(newName, exist_ok=True)
        os.chdir(newName)

    def getImgPageRange(self):
        """Prompt for a start and end page and queue every page URL in between.

        Both bounds are inclusive. ``self.url`` is left at the last URL built.

        Raises:
            ValueError: If either answer is not an integer.
        """
        startPage = input("Please input start page..")
        endPage = input("Please input end page..")
        for page in range(int(startPage), int(endPage) + 1):
            self.url = self.urlBase + str(page)
            self.urllist.append(self.url)

    def getImaFromUrl(self):
        """Fetch every queued page and save the images it references.

        Network errors raised while fetching a page are not caught here.
        """
        http = self._get_http()
        for page_url in self.urllist:
            response = http.request("GET", page_url, headers=self.headers, timeout=10)
            # Pass the raw bytes so BeautifulSoup can detect the page encoding
            # instead of failing on anything that is not strict UTF-8.
            soup = BeautifulSoup(response.data, "lxml")
            self.fineImgUrl(soup)

    def fineImgUrl(self, soup):
        """Save every ``<img>`` in *soup* whose ``src`` starts with ``https:``.

        Args:
            soup: Parsed document exposing ``find_all``.
        """
        for tag in soup.find_all(name="img", attrs={"src": self._HTTPS_SRC}):
            # Handle failures per image so one bad tag does not stop the
            # remaining images on the page from being processed.
            try:
                img_url = tag.get("src")
                img_title = tag.get("title") or "untitled"
                print(img_url + "   " + img_title)
                self.imgSave(img_url, img_title)
            except Exception as err:
                print("OPS***   An err occured!", err)

    def imgSave(self, imgUrl, img_title):
        """Download *imgUrl* and write it to a ``.jpg`` file named after its title.

        The file name is the title followed by the length of the textual
        representation of the downloaded bytes. Failures are reported on
        stdout and never raised, so a crawl continues past a bad image.

        Args:
            imgUrl: Address of the image to download.
            img_title: Title used as the file-name prefix; may be ``None``.
        """
        try:
            response = self._get_http().request(
                "GET", imgUrl, headers=self.headers, timeout=10
            )
        except urllib3.exceptions.HTTPError as err:
            print("OPS.........Save image failed...", err)
            return
        if response.status != 200:
            # Do not store an error page under an image file name.
            print("OPS.........Save image failed...", response.status)
            return

        payload = response.data
        md5_value = hashlib.md5(payload).hexdigest()
        print(md5_value)
        if md5_value in self.md5_list:
            print("This image is exist, pass...")
            return

        # Length of str(bytes), not the byte count; kept so generated file
        # names stay the same as before.
        size_tag = str(len(str(payload)))
        print(size_tag)
        fileName = self._safe_title(img_title) + size_tag + ".jpg"
        print(fileName)
        try:
            with open(fileName, "wb") as f:
                f.write(payload)
        except OSError as err:
            print("OPS.........Save image failed...", err)
            return
        # Record the digest only after a successful write so a failed save
        # can be retried later.
        self.md5_list.append(md5_value)
        print("saving img  " + fileName)

if __name__ == "__main__":
    # Interactive entry point: choose an output folder, then the page range,
    # then crawl. The spider's URL list is printed after every stage.
    crawler = Spider(url)
    crawler.mkNewDir(input("Please input your new file name.."))
    print(crawler)
    for stage in (crawler.getImgPageRange, crawler.getImaFromUrl):
        stage()
        print(crawler)

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值