import os
import time
import re
import urllib3
from bs4 import BeautifulSoup
import hashlib
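
# Overview: the spider keeps a queue of pages to visit (urllist) and a record of
# every URL already seen (urllistAll). For each page it downloads the HTML,
# saves every image found (skipping duplicates via their MD5 digest), and
# queues any new absolute links for later visits.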
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
url = "http://www.umei.cc//"
urllist = []      # URLs still waiting to be crawled
urllistAll = []   # every URL ever seen, so pages are not visited twice
md5_list = []     # MD5 digests of saved images, so duplicate images are skipped
# A simple crawler class
class Spider:
    def __init__(self, containt, point=0):
        print("HelloWorld")
        self.containt = containt
        self.point = point

    def getURL(self, inputURL):
        urllist.append(inputURL)
        urllistAll.append(inputURL)
        print(len(urllist))
    def findURL(self):
        # Visit every queued URL: harvest its images, then queue unseen links.
        try:
            http = urllib3.PoolManager()
            while urllist:
                url = urllist.pop(0)
                self.saveList()
                r = http.request("GET", url, headers=headers, timeout=10)
                # print(r.status)
                soup = BeautifulSoup(r.data.decode(), "lxml")
                self.fineImgUrl(soup)
                for i in soup.find_all(name='a', attrs={"href": re.compile(r'^http:')}):
                    urlTmp = i.get('href')
                    if urlTmp not in urllistAll:
                        urllist.append(urlTmp)
                        urllistAll.append(urlTmp)
        except Exception as e:
            print("Oops... an error occurred:", e)
    def fineImgUrl(self, soup):
        # Save every image on the page that has an absolute src URL.
        try:
            for i in soup.find_all(name="img", attrs={"src": re.compile(r'^http:')}):
                img_url = i.get('src')
                print(img_url)
                self.imgSave(img_url)
        except Exception as e:
            print("Oops*** An error occurred:", e)
    def imgSave(self, imgUrl):
        # Download an image; skip it if an identical file (same MD5) was already saved.
        try:
            http = urllib3.PoolManager()
            imgData = http.request("GET", imgUrl, headers=headers, timeout=10)
            get_md5 = hashlib.md5()
            get_md5.update(imgData.data)
            md5_value = get_md5.hexdigest()
            print(md5_value)
            if md5_value in md5_list:
                print("This image already exists, skipping...")
            else:
                md5_list.append(md5_value)
                print(len(imgData.data))
                # time.sleep(5)
                # Turn the URL into a usable file name by replacing '/' and ':'.
                fileName = imgUrl.replace("/", "a").replace(":", "b")
                print(fileName)
                with open(fileName, "wb") as f:
                    f.write(imgData.data)
                print("saving img " + fileName)
        except Exception as e:
            print("Oops......... Saving image failed:", e)
    def saveList(self):
        # Write every URL seen so far to AllList.txt.
        try:
            with open("AllList.txt", "w") as fb:
                for i in urllistAll:
                    fb.write(i)
                    fb.write("\r\n")
            print("Saved the full URL list!")
        except Exception as e:
            print("Saving the list failed~~~", e)
    def __str__(self):
        msg = "This is a network spider, nice to meet you!"
        if len(urllist) > 0:
            msg += " My URL list contains " + str(len(urllist))
            msg += " items, they are: "
            for tmp in urllist:
                msg += str(tmp)
                msg += " , "
        return msg
# main entry point
if __name__ == "__main__":
    os.makedirs("pic6", exist_ok=True)   # images are saved into ./pic6
    os.chdir("pic6")
    spider = Spider("hello")
    print(spider)
    spider.getURL(url)
    print(spider)
    while True:
        spider.findURL()
        print(spider)
        spider.saveList()