from urllib import request
from bs4 import BeautifulSoup
import logging
import os

base_url = 'http://www.xgyw.cc'
url_list = [base_url + '/Xgyw']
h_list = []
path = r'E:\python\0425\pics'

# Pages 2-4 of the gallery index follow the page_N.html pattern
for i in range(2, 5):
    url_list.append('http://www.xgyw.cc/Xgyw/page_%s.html' % i)
print(url_list)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'
}

# Parse the index pages and collect links to each gallery
def get_Hrefs():
    maxtrynum = 5
    hrefs_list = []
    for url in url_list:
        print('Parsing index page: [%s]' % url)
        for tries in range(maxtrynum):
            try:
                req = request.Request(url=url, headers=headers)
                res = request.urlopen(req)
                # Drop the stray '\xd0' characters these pages sometimes contain
                html = res.read().decode('utf-8', 'ignore').replace('\xd0', '')
                soup = BeautifulSoup(html, 'html.parser')
                hrefs = soup.select('a[href^="/Xgyw/Xgyw"]')
                for each_href in hrefs:
                    hrefs_list.append(base_url + each_href.get('href'))
                break
            except Exception:
                if tries < maxtrynum - 1:
                    continue
                logging.error("Has tried %d times to access url %s, all failed!", maxtrynum, url)
                break
    return hrefs_list

# Deduplicate a list while preserving order
def dedupe(items):
    seen = []
    for i in items:
        if i not in seen:
            yield i
            seen.append(i)

# Parse one gallery page and collect links to its picture sub-pages
def get_pages_hrefs(href):
    times = 5
    for t in range(times):
        try:
            print('Found page: %s' % href)
            req = request.Request(url=href, headers=headers)
            res = request.urlopen(req)
            html = res.read().decode('utf-8', 'ignore').replace('\xd0', '')
            soup = BeautifulSoup(html, 'html.parser')
            pages = soup.select('a[href^="/Xgyw/Xgyw"]')
            for each in pages:
                h_list.append(base_url + each.get('href'))
            break
        except Exception:
            if t < times - 1:
                continue
            logging.error("Has tried %d times to access url %s, all failed!", times, href)
            break
    return h_list

# Parse each picture page for image URLs and save the images
def parser_pics(page_list):
    n = 5
    for each_page in page_list:
        for t in range(n):
            try:
                print('Parsing picture page: %s' % each_page)
                req = request.Request(url=each_page, headers=headers)
                res = request.urlopen(req)
                html = res.read().decode('utf-8', 'ignore').replace('\xd0', '')
                soup = BeautifulSoup(html, 'html.parser')
                pics = soup.select('img[src^="/uploadfile"]')
                for each_pic in pics:
                    src = each_pic.get('src')
                    print('Parsed image url:', base_url + src)
                    save_pics(base_url + src)
                break
            except Exception:
                if t < n - 1:
                    continue
                logging.error("Has tried %d times to access url %s, all failed!", n, each_page)
                break

# Download one image, skipping files that already exist
def save_pics(pic):
    fileName = path + os.sep + pic.split('/')[-1]
    if not os.path.exists(fileName):
        with open(fileName, 'wb') as f:
            print('Saving:', fileName)
            f.write(request.urlopen(pic).read())

if __name__ == '__main__':
    os.makedirs(path, exist_ok=True)  # make sure the target folder exists before downloading
    for i in dedupe(get_Hrefs()):
        get_pages_hrefs(i)
    print(h_list)
    parser_pics(h_list)
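
# Optional hardening (a sketch, not part of the original script): request.urlopen()
# blocks indefinitely by default, so a single hung connection can stall the whole
# crawl. The helper below shows the same fetch-with-retries pattern with an explicit
# timeout; the names fetch_html and TIMEOUT are made up for this example.

TIMEOUT = 10  # seconds; an illustrative value

def fetch_html(url, retries=5):
    # Try a few times with a timeout, then give up and return None.
    for _ in range(retries):
        try:
            req = request.Request(url=url, headers=headers)
            with request.urlopen(req, timeout=TIMEOUT) as res:
                return res.read().decode('utf-8', 'ignore').replace('\xd0', '')
        except Exception:
            continue
    logging.error("Has tried %d times to access url %s, all failed!", retries, url)
    return None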