Python Crawler 2

I recently learned a bit of Beautiful Soup 4 and used it to write a small crawler program. It downloads images from a web page and saves them to local disk, automatically extracts the links on the page into a URL queue so crawling can continue, and keeps a record of every page that has already been crawled; images that have already been downloaded are not saved again. The program does not yet support resuming an interrupted crawl; I will add that when I have time.

import os
import time
import re
import hashlib
import urllib3
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
# Fill in the URL of the page you want to crawl
url = "http://www.xxxxxxxxxxxxxxxxx"

urllist = []     # pages still waiting to be crawled
urllistAll = []  # every page already crawled, used to check whether a newly found link has been seen before
md5_list = []    # MD5 hashes of saved files, used to check whether a newly fetched image has already been downloaded
# Define a spider class
class Spider:
    def __init__(self, containt, point=0):
        print("HelloWorld")
        self.containt = containt
        self.point = point



    def getURL(self, inputURL):  # seed the crawler with the initial URL to crawl
        urllist.append(inputURL)
        urllistAll.append(inputURL)
        print(len(urllist))

    def findURL(self):  # crawl the pending pages and collect new links from each one
        try:
            http = urllib3.PoolManager()
            # pop from the front instead of removing while iterating, which would skip items
            while urllist:
                url = urllist.pop(0)
                self.saveList()
                r = http.request("GET", url, headers=headers, timeout=10)
                # print(r.status)
                soup = BeautifulSoup(r.data, "lxml")
                self.fineImgUrl(soup)
                for i in soup.find_all(name='a', attrs={"href": re.compile(r'^http:')}):
                    urlTmp = i.get('href')
                    if urlTmp not in urllistAll:
                        urllist.append(urlTmp)
                        urllistAll.append(urlTmp)
        except Exception:
            print("ops... an error occurred!")

    def fineImgUrl(self, soup):  # extract the image links from the current page
        try:
            for i in soup.find_all(name="img", attrs={"src": re.compile(r'^http:')}):
                img_url = i.get('src')
                print(img_url)
                self.imgSave(img_url)
        except Exception:
            print("OPS***   An error occurred!")

    def imgSave(self, imgUrl):  # save an image to local disk
        try:
            http = urllib3.PoolManager()
            imgData = http.request("GET", imgUrl, headers=headers, timeout=10)
            get_md5 = hashlib.md5()
            get_md5.update(imgData.data)
            md5_value = get_md5.hexdigest()
            print(md5_value)
            if md5_value in md5_list:
                print("This image already exists, skipping...")
            else:
                md5_list.append(md5_value)
                print(len(imgData.data))
                # time.sleep(5)
                # build a legal, unique file name from the image URL
                cnt_str = imgUrl.replace("/", "a")
                cnt_str = cnt_str.replace(":", "b")
                fileName = cnt_str
                print(fileName)
                with open(fileName, "wb") as f:
                    f.write(imgData.data)
                print("saving img  " + fileName)
        except Exception:
            print("OPS.........Save image failed...")

    def saveList(self):  # persist the list of pages that have already been crawled
        try:
            with open("AllList.txt", "w") as fb:
                for i in urllistAll:
                    fb.write(i)
                    fb.write("\r\n")
                print("Saved all URLs!!")
        except Exception:
            print("Save list failed~~~")

    def __str__(self):  # describe the pages still waiting to be crawled
        msg = "This is a network spider, nice to meet you!"
        if len(urllist) > 0:
            msg += " My URL list contains " + str(len(urllist))
            msg += " items, they are: "
            for tmp in urllist:
                msg += str(tmp)
                msg += " , "
        return msg

# Main entry point
if __name__ == "__main__":
    os.makedirs("pic6", exist_ok=True)
    os.chdir("pic6")
    spider = Spider("hello")
    print(spider)
    spider.getURL(url)
    print(spider)
    while urllist:
        spider.findURL()
        print(spider)
        spider.saveList()
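
The resume-from-interruption feature mentioned at the top is still missing. One possible starting point, sketched below under my own assumptions (the helper name loadList and where it is called are not part of the program above), is to reload the AllList.txt file that saveList() already writes, so that pages recorded in a previous run are treated as already crawled:

def loadList():  # hypothetical helper: restore previously crawled URLs, if the record file exists
    if os.path.exists("AllList.txt"):
        with open("AllList.txt", "r") as fb:
            for line in fb:
                saved = line.strip()
                if saved and saved not in urllistAll:
                    urllistAll.append(saved)

Calling loadList() right after os.chdir("pic6") and before spider.getURL(url) would keep findURL() from re-queuing links that were already visited; note that md5_list is not persisted, so duplicate-image detection would still start from scratch after a restart.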

 
