import os
import sys
import re
import time
import urllib3
import urllib
import hashlib
from bs4 import BeautifulSoup
# Base address of the listing pages; Spider.getImgPageRange appends the page
# number to this string to build each page URL.
url = "http://www.dbmeinv.com/?pager_offset="
class Spider:
    """Download the images found on a range of paginated listing pages.

    Typical use: create the spider with a base URL, call ``mkNewDir`` to
    pick the output directory, ``getImgPageRange`` to build the page list,
    then ``getImaFromUrl`` to fetch every page and save its images.

    Attributes:
        headers: HTTP headers sent with every request.
        urlBase: URL prefix to which a page number is appended.
        url: The most recently generated page URL ("" until pages are built).
        urllist: Page URLs queued for download.
        md5_list: MD5 hex digests of images already saved, used to skip
            byte-identical duplicates.
    """

    def __init__(self, url):
        """Store the base URL and initialise empty crawl state.

        Args:
            url: URL prefix; the page number is appended to it.
        """
        # The original literal contained a stray space ("Chrome/1 7.0...").
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
        self.urlBase = url
        self.url = ""
        self.urllist = []
        self.md5_list = []
        # Connection pool, created on first use by _getPool().
        self._http = None

    def __str__(self):
        """Return the queued page URLs, each followed by a single space."""
        return "urllist content " + "".join(pageUrl + " " for pageUrl in self.urllist)

    def _getPool(self):
        """Return the shared urllib3 pool, creating it on first use."""
        pool = getattr(self, "_http", None)
        if pool is None:
            pool = self._http = urllib3.PoolManager()
        return pool

    @staticmethod
    def _safeFileName(title):
        """Turn an image title into a string that is safe as a file name.

        Path separators, characters reserved on common file systems and
        control characters are replaced with "_". A missing or empty title
        becomes "untitled".
        """
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title or "").strip()
        return cleaned or "untitled"

    def mkNewDir(self, newName):
        """Create directory *newName* if needed and make it the working dir.

        Args:
            newName: Directory path; images are later written relative to it.

        Raises:
            OSError: If the directory cannot be created or entered.
        """
        # No shell is involved, so names with spaces or shell metacharacters
        # are treated as plain text.
        os.makedirs(newName, exist_ok=True)
        os.chdir(newName)

    def getImgPageRange(self):
        """Prompt for a start and end page and queue every page URL between.

        Both bounds are inclusive. Each generated URL is appended to
        ``urllist``; ``url`` is left holding the last one generated.

        Raises:
            ValueError: If either answer is not an integer.
        """
        startPage = input("Please input start page..")
        endPage = input("Please input end page..")
        for page in range(int(startPage), int(endPage) + 1):
            self.url = self.urlBase + str(page)
            self.urllist.append(self.url)

    def getImaFromUrl(self):
        """Fetch every queued page and save the images it references.

        A page that cannot be fetched is reported and skipped so the
        remaining pages are still processed.
        """
        http = self._getPool()
        for pageUrl in self.urllist:
            try:
                response = http.request("GET", pageUrl, headers=self.headers, timeout=10)
            except urllib3.exceptions.HTTPError as err:
                print("OPS*** An err occured!", err)
                continue
            # Hand the raw bytes to the parser so it can detect the page
            # encoding instead of assuming UTF-8.
            soup = BeautifulSoup(response.data, "lxml")
            self.fineImgUrl(soup)

    def fineImgUrl(self, soup):
        """Save every ``<img>`` in *soup* whose ``src`` starts with "https:".

        Args:
            soup: Parsed page, as produced by BeautifulSoup.
        """
        for tag in soup.find_all(name="img", attrs={"src": re.compile(r'^https:')}):
            img_url = tag.get('src')
            img_title = tag.get("title")
            # A tag may have no title; do not let that abort the whole page.
            print(img_url + " " + (img_title or ""))
            self.imgSave(img_url, img_title)

    def imgSave(self, imgUrl, img_title):
        """Download one image and write it to the current directory.

        Images whose MD5 digest has already been saved are skipped. Any
        failure is reported and swallowed so one bad image does not stop
        the crawl; Ctrl+C and interpreter exit are not swallowed.

        Args:
            imgUrl: Address of the image.
            img_title: Title used as the file-name stem; may be None.
        """
        try:
            response = self._getPool().request("GET", imgUrl, headers=self.headers, timeout=10)
            if response.status != 200:
                # An error page is not an image; do not save it as one.
                print("OPS.........Save image failed...", "HTTP status", response.status)
                return
            data = response.data
            md5_value = hashlib.md5(data).hexdigest()
            print(md5_value)
            if md5_value in self.md5_list:
                print("This image is exist, pass...")
                return
            # Length of the bytes repr, kept so generated file names match
            # the ones this script has always produced.
            sizeTag = str(len(str(data)))
            print(sizeTag)
            fileName = self._safeFileName(img_title) + sizeTag + ".jpg"
            print(fileName)
            with open(fileName, "wb") as f:
                f.write(data)
            # Record the digest only after the write succeeded, so a failed
            # save does not block a later attempt at the same image.
            self.md5_list.append(md5_value)
            print("saving img " + fileName)
        except Exception as err:
            print("OPS.........Save image failed...", err)
if __name__ == "__main__":
    # Interactive entry point: choose an output directory, choose the page
    # range, then download. The spider's queued URLs are printed after
    # every stage.
    crawler = Spider(url)
    targetDir = input("Please input your new file name..")
    crawler.mkNewDir(targetDir)
    print(crawler)
    for stage in (crawler.getImgPageRange, crawler.getImaFromUrl):
        stage()
        print(crawler)