不懂python也能写爬虫之——一晚上搞定爬虫

最新推荐文章于 2021-12-17 07:17:05 发布

HelloMingo

最新推荐文章于 2021-12-17 07:17:05 发布

阅读量469

点赞数 1

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/u010133610/article/details/104665846

版权

python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

先看看成果

本来想写一篇尽可能详细的帖子，但是写了几个小时以后，发现难度有点高，而且很多东西自己也不是很懂，所以直接把代码贴出来吧。

# -*- coding: utf-8 -*-

import os
import re
import requests
import datetime
import time
import random
import ip_proxies


# Root directory under which every album folder is created.
# Written as a raw string: '\T' is not a valid escape sequence, so the
# non-raw literal only produced the right path by accident and triggers an
# invalid-escape warning on current Python versions.
file_Root = r'I:\Test'
# exist_ok removes the check-then-create race; makedirs also creates any
# missing parent directories.
os.makedirs(file_Root, exist_ok=True)

# Request headers that make the crawler present itself as a desktop Chrome browser
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.122 Safari/537.36'
    ),
}

# Current local time, used as a timestamp in the progress output
def getTimeNow():
    """Return the current local time formatted as ``HH:MM:SS``."""
    now = datetime.datetime.now()
    return '{:%H:%M:%S}'.format(now)

# Fetch a URL with the requests library's get()
def req_get(url):
    """Send a GET request for *url* and return the ``requests`` response.

    The module-level ``headers`` dict is sent along and the call uses a
    30 second timeout.
    """
    response = requests.get(url, timeout=30, headers=headers)
    return response

# Fetch the HTML text of a page
def getHtml(page_url):
    """Download *page_url* and return its body as text.

    Returns:
        The response body decoded as UTF-8, or ``False`` when the request
        raised an exception or the server answered with an error status.
    """
    print("<--------------------------正在请求网页： "+page_url+" -------------------------------->start:"+getTimeNow())
    try:
        response = req_get(page_url)
        # Turn HTTP error statuses into exceptions so they take the failure path.
        response.raise_for_status()
        response.encoding = "utf-8"
        page_text = response.text
    except Exception as err:
        print("网页爬取失败:"+page_url)
        print(err)
        return False
    return page_text


# Derive the album (folder) name from the page title
def getAtlasName(html,url):
    """Return the name under which a page's images are stored.

    Args:
        html: HTML text of the product page.
        url: Address of the page; used to build a fallback name.

    Returns:
        The text between ``<title>`` and the `` &#8211`` separator when the
        page has such a title. Otherwise a name derived from *url* in which
        every character that is not allowed in a file name is replaced by
        ``_``.
    """
    found = re.search('<title>(.+) &#8211', html)
    if found:
        return found.group(1)

    print("Html Page Error: can't find title")
    # The result is used as a single directory name. A raw URL contains ':'
    # and '/', so creating that directory always failed and the page could
    # never be completed. Replace the characters that are illegal in file
    # names so the fallback is usable.
    return re.sub(r'[\\/:*?"<>|]', '_', url)


# Download the images referenced by a page
def getImg(html,pro_name):
    """Download every ``.jpg`` referenced by a ``data-src`` attribute in *html*.

    Images are stored in the folder ``file_Root/pro_name`` under the last path
    segment of their URL. Files that already exist are skipped.

    Args:
        html: HTML text of the product page.
        pro_name: Name of the album folder to store the images in.

    Returns:
        ``True`` when every image was saved or already present (also when the
        page contains no matching image), ``False`` when the folder could not
        be created or at least one download failed.
    """
    # Make sure the target folder exists before downloading anything.
    album_dir = file_Root + os.sep + pro_name
    try:
        if not os.path.exists(album_dir):
            os.mkdir(album_dir)
    except Exception as e:
        print(e)
        return False

    # Raw string so that '\.' reaches the regex engine as an escaped dot
    # instead of being an invalid string escape.
    img_urls = re.findall(r'data-src="(.+?\.jpg)"', html)
    if not img_urls:
        print("*********************图片内容查找失败，当前页面不包含目标格式图片，请检查网页和正则表达式的匹配度*****************************")
        return True

    # Stays True only if every image on the page is handled without an error.
    all_ok = True
    for imgurl in img_urls:
        try:
            # The last path segment of the image URL is the local file name.
            filename = album_dir + os.sep + imgurl.split('/')[-1]
            if os.path.exists(filename):
                continue

            print("--start    "+imgurl+"  "+getTimeNow())
            pic = req_get(imgurl)
            # Without this an HTTP error page would be saved as the image and
            # then be skipped as "already downloaded" on every later pass.
            pic.raise_for_status()

            # Write to a temporary name and rename afterwards, so that an
            # interrupted write never leaves a truncated file under the final
            # name. A leftover '.part' file is overwritten on the next attempt.
            part_name = filename + '.part'
            with open(part_name, 'wb') as fp:
                fp.write(pic.content)
            os.replace(part_name, filename)
        except Exception as e:
            print(e)
            # One failed image makes the whole page count as failed, so the
            # page is checked again on the next pass.
            all_ok = False

    return all_ok

# Base address of the paginated listing; the page number is appended to it
page_list_urlRoot = "https://*****/page/"

# Base address of the product detail pages
product_urlRoot = "https://*****/product/"


# Collect the links to all product pages on a listing page
def getProductList(html):
    """Return the product page URLs linked from a listing page.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list with the ``href`` value of every link that directly follows a
        ``<div class="product-header">`` tag and starts with
        ``product_urlRoot``.
    """
    # Build the pattern from product_urlRoot instead of repeating the address
    # inside the regex. The hard-coded copy could not even be compiled with
    # the placeholder host ('****' is a "multiple repeat" error), and with a
    # real host its dots would have matched any character. re.escape makes
    # the address match literally. product_urlRoot ends with '/', so the
    # following '/*' keeps the original "one or more slashes" meaning.
    link_pattern = (
        '<div class="product-header"><a href="('
        + re.escape(product_urlRoot)
        + '/*.+?)"'
    )
    return re.findall(link_pattern, html)

# URLs of product pages whose images have all been handled successfully
complete_pages = list()

# URLs of pages that failed and still have to be tried again
failing_pages = list()


def _mark_failed(page):
    """Record *page* as failed, keeping at most one entry per URL."""
    # Appending unconditionally added the same URL again on every pass, while
    # a later success removed only one occurrence. The stale duplicates kept
    # the failure count high and the retry running.
    if page not in failing_pages:
        failing_pages.append(page)


def _mark_recovered(page):
    """Drop *page* from the failure list once it has been handled."""
    if page in failing_pages:
        failing_pages.remove(page)


def _crawl_once():
    """Walk listing pages 1-31 once and process every product not yet completed."""
    for index in range(1,32):
        list_url = page_list_urlRoot+str(index)  # address of this listing page

        html = getHtml(list_url)
        if html is False:
            # The listing page could not be fetched: remember it and move on.
            _mark_failed(list_url)
            continue
        _mark_recovered(list_url)

        for pro_url in getProductList(html):
            # Pages completed in an earlier pass are not fetched again.
            if pro_url in complete_pages:
                continue

            pro_html = getHtml(pro_url)
            if pro_html is False:
                _mark_failed(pro_url)
                continue

            pro_name = getAtlasName(pro_html,pro_url)
            print("请求商品页面成功："+pro_name)
            if getImg(pro_html,pro_name):
                # Every image was handled, so the page counts as complete.
                _mark_recovered(pro_url)
                complete_pages.append(pro_url)
            else:
                # A problem while handling the images means the page is not done.
                _mark_failed(pro_url)

            # Random pause between product pages.
            wait = random.uniform(0,2)
            print("蜘蛛爬累了，现在要休息"+str(wait)+"秒")
            time.sleep(wait)


# Main routine
# first_loop: whether this is the first pass (kept for callers; not read)
def spider_start(first_loop):
    """Crawl all listing pages, repeating the crawl while many pages fail.

    Failed requests are recorded in ``failing_pages`` and finished product
    pages in ``complete_pages``, so a repeated pass only re-fetches what is
    still missing. This guards against network problems and anti-crawler
    measures on the target site.

    Args:
        first_loop: Accepted for backward compatibility; the value is not used.
    """
    # A loop replaces the former self-call, which added one stack frame per
    # pass and would eventually hit the recursion limit.
    while True:
        _crawl_once()
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>一次循环结束,failing_pages:"+str(len(failing_pages)))
        # NOTE(review): the threshold of 32 is kept from the original code. The
        # original comment speaks of retrying until everything is downloaded,
        # which would suggest "> 0" - confirm the intended value.
        if len(failing_pages) <= 32:
            break
        
        


# Run the crawl, framed by a start banner and a finish banner
_BANNER_EDGE = "+---------------------------------------------------------------+"

print(_BANNER_EDGE)
print("|----------------蜘蛛的要开始工作了，又是辛苦的一天-------------|")
print(_BANNER_EDGE)

spider_start(True)

print(_BANNER_EDGE)
print("|----------------蜘蛛的任务完成了，终于可以回家睡觉了-----------|")
print(_BANNER_EDGE)

被我爬过的网页信息，都使用了***代替，如果你直接复制粘贴，是运行不了的，因为网址错误。找一个你自己中意的网站，然后自己分析页面，修改正则表达式。

以上内容纯属技术交流，不提倡到处爬人家的网站。

欢迎加我qq交流：1009570451

HelloMingo

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
不懂python也能写爬虫之——一晚上搞定爬虫

前言：这篇帖子不会涉及到理论，只要你有一定的编程基础，就一定能看懂。首先说明，距离我最近一次写Python已经是三年半以前了。那时候的Python还没这么火，当时只是因为工作需要，被迫学了点，然后就写了两个月的逻辑。毕竟现在Python这么火，各种以Python之名打的广告铺天盖地，忽悠小孩子的也打上了Python的旗子。没事的时候想想好歹我曾经也是会Python的人啊...说不定哪天辞职回...
复制链接

扫一扫