from urllib import request
from bs4 import BeautifulSoup
import logging
import os

base_url = 'http://www.xgyw.cc'
url_list = [base_url + '/Xgyw']
h_list = []
path = r'E:\python\0425\pics'

# Pages 2-4 of the gallery index follow the page_N.html pattern
for i in range(2, 5):
    url_list.append('http://www.xgyw.cc/Xgyw/page_%s.html' % i)
print(url_list)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'
}

# Parse the index pages and collect links to each gallery
def get_Hrefs():
    maxtrynum = 5
    hrefs_list = []
    for url in url_list:
        print('Parsing index page: [%s]' % url)
        for tries in range(maxtrynum):
            try:
                req = request.Request(url=url, headers=headers)
                res = request.urlopen(req)
                # Drop the stray '\xd0' characters these pages sometimes contain
                html = res.read().decode('utf-8', 'ignore').replace('\xd0', '')
                soup = BeautifulSoup(html, 'html.parser')
                hrefs = soup.select('a[href^="/Xgyw/Xgyw"]')
                for each_href in hrefs:
                    hrefs_list.append(base_url + each_href.get('href'))
                break
            except Exception:
                if tries < maxtrynum - 1:
                    continue
                logging.error("Has tried %d times to access url %s, all failed!", maxtrynum, url)
                break
    return hrefs_list

# Deduplicate a list while preserving order
def dedupe(items):
    seen = []
    for i in items:
        if i not in seen:
            yield i
            seen.append(i)

# Parse one gallery page and collect links to its picture sub-pages
def get_pages_hrefs(href):
    times = 5
    for t in range(times):
        try:
            print('Found page: %s' % href)
            req = request.Request(url=href, headers=headers)
            res = request.urlopen(req)
            html = res.read().decode('utf-8', 'ignore').replace('\xd0', '')
            soup = BeautifulSoup(html, 'html.parser')
            pages = soup.select('a[href^="/Xgyw/Xgyw"]')
            for each in pages:
                h_list.append(base_url + each.get('href'))
            break
        except Exception:
            if t < times - 1:
                continue
            logging.error("Has tried %d times to access url %s, all failed!", times, href)
            break
    return h_list

# Parse each picture page for image URLs and save the images
def parser_pics(page_list):
    n = 5
    for each_page in page_list:
        for t in range(n):
            try:
                print('Parsing picture page: %s' % each_page)
                req = request.Request(url=each_page, headers=headers)
                res = request.urlopen(req)
                html = res.read().decode('utf-8', 'ignore').replace('\xd0', '')
                soup = BeautifulSoup(html, 'html.parser')
                pics = soup.select('img[src^="/uploadfile"]')
                for each_pic in pics:
                    src = each_pic.get('src')
                    print('Parsed image url:', base_url + src)
                    save_pics(base_url + src)
                break
            except Exception:
                if t < n - 1:
                    continue
                logging.error("Has tried %d times to access url %s, all failed!", n, each_page)
                break

# Download one image, skipping files that already exist
def save_pics(pic):
    fileName = path + os.sep + pic.split('/')[-1]
    if not os.path.exists(fileName):
        with open(fileName, 'wb') as f:
            print('Saving:', fileName)
            f.write(request.urlopen(pic).read())

if __name__ == '__main__':
    os.makedirs(path, exist_ok=True)  # make sure the target folder exists before downloading
    for i in dedupe(get_Hrefs()):
        get_pages_hrefs(i)
    print(h_list)
    parser_pics(h_list)
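
# Optional hardening (a sketch, not part of the original script): request.urlopen()
# blocks indefinitely by default, so a single hung connection can stall the whole
# crawl. The helper below shows the same fetch-with-retries pattern with an explicit
# timeout; the names fetch_html and TIMEOUT are made up for this example.

TIMEOUT = 10  # seconds; an illustrative value

def fetch_html(url, retries=5):
    # Try a few times with a timeout, then give up and return None.
    for _ in range(retries):
        try:
            req = request.Request(url=url, headers=headers)
            with request.urlopen(req, timeout=TIMEOUT) as res:
                return res.read().decode('utf-8', 'ignore').replace('\xd0', '')
        except Exception:
            continue
    logging.error("Has tried %d times to access url %s, all failed!", retries, url)
    return None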