【python】记录一下常用功能代码功能函数

记录一下我常用的一些功能函数代码。

import os, sys
import re
import time
import random
from datetime import datetime
from functools import wraps
import requests
from docx import Document
from configparser import ConfigParser
from docxcompose.composer import Composer
from selenium import webdriver
from bs4 import BeautifulSoup
# Name of the INI settings file; getWebdriver() reads it from the current working directory.
configFile="CommonSpider.ini"

from PIL import Image
# HTTP request headers carrying an iPhone (mobile Safari) User-Agent string.
headers = {
    "User-Agent":
        "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
}
# ASCII-art banner printed by ShowFaceTitle(). Raw string: the backslashes
# in the artwork must not be interpreted as escape sequences.
logo = r"""
     _______  _______ _________ ______   _______  _______
    (  ____ \(  ____ )\__   __/(  __  \ (  ____ \(  ____ )
    | (    \/| (    )|   ) (   | (  \  )| (    \/| (    )|
    | (_____ | (____)|   | |   | |   ) || (__    | (____)|
    (_____  )|  _____)   | |   | |   | ||  __)   |     __)
          ) || (         | |   | |   ) || (      | (\ (
    /\____) || )      ___) (___| (__/  )| (____/\| ) \ \__
    \_______)|/       \_______/(______/ (_______/|/   \__/"""


##############################################################################################################
# 显示标题头
def ShowFaceTitle():
    """Print the program banner: the ASCII-art logo, then the boxed credits.

    Reads the module-level ``logo`` string and writes to stdout; returns nothing.
    """
    credit_rows = (
        '    │          ★★★    作者:     谢海亮   ★★★           │ ',
        '    │           ----------------------------------------------│ ',
        '    │                      2022 @ All Rights Reserved         │ ',
        '    │                                                         │ ',
        '    │ 本程序用于爬取网页信息,使用前请手动配置信息!          │ ',
        '    ╰─────────────────────────────╯ ',
        '\n',  # print() adds its own newline, so this leaves a blank gap
    )
    print(logo)
    for row in credit_rows:
        print(row)


##################################################
def message(msg):
    """Show ``msg`` on the current console line, overwriting what was there.

    The text is prefixed with a carriage return so successive calls replace
    each other (single-line progress display). No newline is appended.

    :param msg: text to display; must be a ``str``.
    """
    sys.stdout.write('\r' + msg)
    # Flush *after* writing: the text has no trailing newline, so a
    # line-buffered stdout would otherwise keep it in the buffer.
    sys.stdout.flush()


###############################################
def delStrfirstNum(numberStr, charlist=None):
    """Strip leading numbering (digits and separator characters) from a string.

    Characters are removed from the front while they are numeric
    (``str.isnumeric``) or listed in ``charlist``. The last character is
    never removed, so a non-empty input always yields a non-empty result
    (e.g. ``"123"`` becomes ``"3"``).

    :param numberStr: text to clean; ``None`` or ``""`` gives ``""``.
    :param charlist: extra characters to strip; defaults to ``['、']`` when
        omitted or empty.
    :return: the text without its leading numbering.
    """
    if not numberStr:
        return ''
    if not charlist:
        charlist = ['、']
    # Scan with an index instead of re-slicing the string on every step.
    cut = 0
    last = len(numberStr) - 1
    while cut < last and (numberStr[cut].isnumeric() or numberStr[cut] in charlist):
        cut += 1
    return numberStr[cut:]

def isImgUrl(url):
    """Tell whether ``url`` looks like a direct link to a jpg/gif/png image.

    Purely textual check (case-insensitive): the value must start with
    ``http`` and its last four characters must be ``.jpg``, ``.gif`` or
    ``.png``. The argument is converted with ``str()`` first.
    """
    text = str(url)
    has_http_prefix = text[:4].lower() == 'http'
    has_image_suffix = text[-4:].lower() in ('.jpg', '.gif', '.png')
    return has_http_prefix and has_image_suffix

#################################################################################
def filterFileName(filename, prestr=""):
    """Turn arbitrary text into a file-name-safe string.

    Drops every character found in the blocked list, removes non-breaking
    spaces and CR/LF/TAB, trims the ends, strips leading numbering via
    ``delStrfirstNum`` and finally prepends ``prestr``.

    :param filename: raw text to clean.
    :param prestr: prefix placed in front of the cleaned name.
    :return: the cleaned name, or ``""`` when fewer than two characters remain.
    """
    blocked = (
        '?', '?', ' ', '╲', '/', '*', '“', '<', '>', '、', '|', ':', '。', '"', '%', '^', '$', '~', ',', ':', '.', ',',
        '(', ')')
    cleaned = ''.join(ch for ch in filename if ch not in blocked)
    for invisible in ('\xa0', '\r', '\n', '\t'):
        cleaned = cleaned.replace(invisible, '')
    cleaned = cleaned.strip()
    if len(cleaned) < 2:
        return ""
    return prestr + delStrfirstNum(cleaned, ['、', ',', ','])
#截取图片
def image_cut_save(path, maskpos,mask_width,mask_height, save_path):
    """Crop a full-width watermark band off an image and save the result.

    The band is ``mask_height`` pixels tall and sits at the top of the image
    for ``maskpos`` '左上' / '右上', or at the bottom for '左下' / '右下'.
    Nothing is written when the file is not a .png/.jpg/.jpeg, when
    ``maskpos`` is any other value, when the mask is not positive, or when
    the mask exceeds one fifth of the image in either dimension.

    :param path: source image path.
    :param maskpos: watermark position, one of '左上', '右上', '左下', '右下'.
    :param mask_width: watermark width in pixels (only used for the 1/5 size check).
    :param mask_height: watermark height in pixels; this many rows are removed.
    :param save_path: where the cropped image is written (may equal ``path``).
    :return: None. Errors are printed, not raised.
    """
    try:
        if not path.lower().endswith(('.png', '.jpg', '.jpeg')):
            return
        # The context manager closes the file handle even on early return.
        with Image.open(path) as img:
            imgW, imgH = img.size
            # Refuse to cut away more than a fifth of the picture.
            if mask_height > int(imgH / 5) or mask_width > int(imgW / 5):
                return
            if mask_height <= 0:
                return  # nothing to remove
            if maskpos in ('左上', '右上'):
                # Watermark at the top: keep everything BELOW the band.
                box = (0, mask_height, imgW, imgH)
            elif maskpos in ('左下', '右下'):
                # Watermark at the bottom: keep everything ABOVE the band.
                box = (0, 0, imgW, imgH - mask_height)
            else:
                return
            roi = img.crop(box)
            # Force the pixel data into memory before the source is closed,
            # so saving over the same path is safe.
            roi.load()
        roi.save(save_path)
    except Exception as e:
        print(e.__str__())

def isMaskImgFile(img,maskpostion,maskwidth,maskheight):
    """Validate an image file and, if it is valid, crop its watermark in place.

    :param img: path of the file to check.
    :param maskpostion: watermark position passed on to ``image_cut_save``.
    :param maskwidth: watermark width passed on to ``image_cut_save``.
    :param maskheight: watermark height passed on to ``image_cut_save``.
    :return: False when the extension is not a known image type, the file is
        smaller than 5 KB, or it cannot be opened as an image; True after a
        successful integrity check (the crop is then attempted on the file).
    :raises: whatever ``Image.verify()`` raises for a damaged file, and
        ``OSError`` from ``os.path.getsize`` if the file is missing.
    """
    image_exts = ('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')
    if not img.lower().endswith(image_exts):
        return False
    if os.path.getsize(img) / float(1024) < 5:
        return False
    try:
        image = Image.open(img)  # can the file be opened as an image at all?
    except Exception:
        return False
    try:
        image.verify()  # integrity check; errors propagate to the caller
    finally:
        image.close()
    # Remove the watermark, overwriting the file.
    image_cut_save(img, maskpostion,maskwidth,maskheight, img)
    return True

def isUrlStr(url):
    """Return True when ``url`` starts with http:// or https:// followed by
    a word character and at least one more character; False otherwise."""
    return re.match(r'^https?:/{2}\w.+$', url) is not None
##################################################################################
# 更新修复URL列表
def updateUrlList(urllist, prestr=''):
    """Normalise a list of links into unique absolute URLs.

    Entries that do not start with ``http`` get ``prestr`` prepended; entries
    that still fail ``isUrlStr`` are dropped.

    :param urllist: iterable of link strings (may be empty or None).
    :param prestr: prefix (typically scheme + host) for relative links.
    :return: list of unique URLs in first-seen order.
    """
    if not urllist:
        return []
    unique = {}
    for item in urllist:
        if item[:4] != 'http':
            item = prestr + item
        if isUrlStr(item):
            # De-duplicate AFTER prefixing: "/a" and "<prestr>/a" are the
            # same URL. A dict keeps insertion order, unlike set().
            unique[item] = None
    return list(unique)
##################################################################################
# 判断是否存在敏感关键字
def isIncludeRiskWord(contents, riskwordfile):
    """Check whether ``contents`` contains any word listed in a word file.

    The file holds one word per line. If it does not exist, an empty file is
    created (so the user can fill it in later) and False is returned.

    :param contents: text to scan.
    :param riskwordfile: path of the word-list file (read with the platform
        default encoding).
    :return: True if at least one non-empty word occurs in ``contents``.
    """
    if not os.path.exists(riskwordfile):
        with open(riskwordfile, 'w'):
            pass
        return False
    with open(riskwordfile, 'r') as f:
        # Strip the line terminators: with them attached a word only matched
        # when the text happened to contain the same newline.
        words = [line.strip() for line in f]
    # Blank lines are skipped; an empty word would match every text.
    return any(word and word in contents for word in words)


##################################################################################
# 将一段文档文字根据首字符标题规范分成若干列表
def spilitContents(contentlist, spilitchars=('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')):
    """Group paragraphs into sections that start at numbered headings.

    Each paragraph is stripped; blank ones are ignored. A paragraph whose
    leading characters are in ``spilitchars`` is a heading: that prefix is
    removed (the last character is always kept) and a new section begins.
    Paragraphs inside a section are joined with a leading ``'\\r'``.

    :param contentlist: iterable of paragraphs (converted with ``str()``).
    :param spilitchars: characters that mark a heading prefix.
    :return: list of section strings; the first item holds whatever precedes
        the first heading.
    """
    fgf = '_|_'  # section separator inserted in front of every heading
    pieces = []
    for content in contentlist:
        p = str(content).strip()
        if not p:
            continue
        cut = 0
        last = len(p) - 1
        while cut < last and p[cut] in spilitchars:
            cut += 1
        newp = p[cut:]
        # A heading is any paragraph that lost a prefix; comparing only the
        # first characters missed cases such as "11".
        if cut:
            newp = fgf + newp
        pieces.append('\r' + newp)
    return ''.join(pieces).split(fgf)



###################################################################################
def make_dir(path):
    """Create ``path`` and make it the current working directory.

    An existing path means the work was already done, so nothing is created
    and the working directory is left alone.

    :return: True if the directory was created (and entered), False if
        ``path`` already existed.
    """
    if os.path.exists(path):
        print("Folder has existed!")
        return False
    os.makedirs(path)
    print(path)
    os.chdir(path)
    return True


###################################################################################
def delete_empty_dir(dir):
    """Recursively remove empty directories below (and including) ``dir``.

    Useful after an interrupted run that created folders but downloaded
    nothing into them. Sub-directories are processed first, so a directory
    that only contained empty directories is removed as well.

    :param dir: root path to clean. A path that is an existing file is
        ignored; a missing path only prints a notice.
    """
    if not os.path.exists(dir):
        print("Please start your performance!")
        return
    if not os.path.isdir(dir):
        # os.listdir() would raise on a regular file.
        return
    for entry in os.listdir(dir):
        child = os.path.join(dir, entry)
        if os.path.isdir(child):
            delete_empty_dir(child)
    if not os.listdir(dir):
        os.rmdir(dir)
        print("remove the empty dir: {}".format(dir))


###################################################################################
def remove_file(path):
    """Delete ``path`` if it is an existing regular file; otherwise do nothing."""
    if not os.path.isfile(path):
        return
    os.remove(path)

###################################################################################
def check_dir(path):
    """Make sure directory ``path`` exists.

    Paths shorter than three characters are rejected without touching the
    file system.

    :return: True if the path already existed; False if it was just created
        or was rejected as too short.
    """
    if len(path) < 3:
        return False
    already_there = os.path.exists(path)
    if not already_there:
        os.makedirs(path)
    return already_there
###################################################################################
def retry(n=3, delay=0.5):
    """Decorator factory that re-runs a callable when it raises.

    The wrapped callable is tried once and, on ``Exception``, retried up to
    ``n`` more times, sleeping ``delay`` seconds before each retry.

    :param n: number of retries after the first failed attempt.
    :param delay: pause between attempts, in seconds.
    :return: a decorator; the decorated callable returns the wrapped result,
        or False once every attempt has failed.
    """
    def deco(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            attempt = 0
            while True:
                attempt += 1
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt == n + 1:
                        # Initial call plus n retries all failed.
                        print('重试结束,[{}]运行失败'.format(func.__name__))
                        return False
                    print('[{}]运行错误,{}s后进行第{}次重试 Err: {}'.format(func.__name__, delay, attempt, e))
                    time.sleep(delay)

        return wrapper

    return deco


###################################################################################
def download(file_url, file_name=None, file_type=None, save_path="download", headers=None, timeout=15):
    """Download ``file_url`` into ``save_path`` and report whether it is on disk.

    :param file_url: URL of the resource to fetch.
    :param file_name: base name for the saved file, sanitised with
        ``filterFileName``; a timestamp is used when it is omitted or when
        sanitising leaves nothing.
    :param file_type: extension without the dot; taken from the last path
        segment of the URL when omitted ("uknown" if that has none).
    :param save_path: target directory, created if missing.
    :param headers: HTTP request headers; an iPhone Safari User-Agent is
        sent by default.
    :param timeout: request timeout in seconds.
    :return: True if the file already existed or was downloaded; False on a
        non-200 response or any error (a partial file is removed).
    """
    # NOTE(review): the original called ``filter_name``, which is not defined
    # in the visible part of this module; ``filterFileName`` is the sanitiser
    # available here - confirm no other helper was intended.
    cleaned = filterFileName(file_name) if file_name is not None else ""
    if not cleaned:
        cleaned = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    if file_type is None:
        # Ignore query string / fragment and look only at the last path
        # segment, so "a.com/img?id=1.2" does not yield a bogus extension.
        last_segment = file_url.split("?")[0].split("#")[0].rsplit("/", 1)[-1]
        file_type = last_segment.rsplit(".", 1)[-1] if "." in last_segment else "uknown"
    check_dir(save_path)
    file_name = cleaned + "." + file_type
    target = os.path.join(save_path, file_name)
    if headers is None:
        headers = {
            "User-Agent":
                "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
        }
    if os.path.exists(target):
        print(f'\033[33m{file_name}已存在,不再下载!\033[0m')
        return True
    message(f"Downloading {file_name}")
    try:
        with requests.get(file_url, headers=headers, stream=True, timeout=timeout) as rep:
            # Content-Length is deliberately not read: it is absent on
            # chunked responses and was never used.
            if rep.status_code != 200:
                message("\033[31m下载失败\033[0m")
                return False
            with open(target, "wb") as f:
                for chunk in rep.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        message(f"\033[32m{file_name}下载成功\033[0m")
        return True
    except Exception as e:
        message('下载失败: ' + e.__str__())
        remove_file(target)  # do not leave a truncated file behind
        return False


###################################################################################
# 将html代码以图片list进行分割成块
def splitHtmlByImg(html, imglist):
    """Split HTML into the text chunks that lie between the given image tags.

    Tags are consumed in order: the text before the first occurrence of each
    tag becomes a chunk and the search continues after that tag.

    :param html: HTML source (converted with ``str()`` once a tag is processed).
    :param imglist: image tags in document order (e.g. ``bs4`` Tag objects;
        each is converted with ``str()``).
    :return: list of chunks; chunks of at most one character between tags
        are skipped, and the remainder after the last tag is always appended.
    """
    html_parts = []
    for imgtag in imglist:
        html = str(html)
        before, found, after = html.partition(str(imgtag))
        if not found:
            # Tag is not in the remaining HTML: leave the text untouched
            # rather than emitting it twice.
            continue
        if len(before) > 1:
            html_parts.append(before)
        # Drop only THIS occurrence; identical text later on must survive.
        html = after
    html_parts.append(html)
    return html_parts


###################################################################################
## 根据图片url保存图片,填写referer可伪装referer来源下载防盗链图片
def pic_down(referer_url, pic_url, timeout=15):
    """Download an image into ``<cwd>/temp``, sending a custom Referer header.

    Supplying the page URL as referer allows fetching images protected by
    hot-link checks.

    :param referer_url: value for the ``Referer`` request header.
    :param pic_url: image URL; anything after ``@`` or ``?`` is discarded.
    :param timeout: request timeout in seconds.
    :return: path of the saved file, or ``''`` if the download failed.
    """
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
               "Accept-Encoding": "gzip",
               "Accept-Language": "zh-CN,zh;q=0.8",
               "Referer": referer_url,
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
               }
    pic_url = (pic_url.split('@')[0]).split('?')[0]
    img_name = pic_url.split('/')[-1]
    # Some image URLs carry no extension; add .jpg so that later consumers
    # (Word document generation) accept the file.
    if not img_name.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
        img_name = img_name + '.jpg'
    try:
        tempfolder = os.path.join(os.getcwd(), 'temp')
        img_name = os.path.join(tempfolder, img_name)
        # Fetch first, so a failed request does not leave an empty file.
        response = requests.get(pic_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        os.makedirs(tempfolder, exist_ok=True)
        with open(img_name, 'wb') as f:
            f.write(response.content)
        return img_name
    except Exception as e:
        print('pic_down Error:',e.__str__())
    return ''
##################################
# 下载文件
def downloadFile(urlfile,savefile, timeout=30):
    """Download ``urlfile`` and write the body to ``savefile``.

    The file is only written when the server answers with status 200.

    :param urlfile: URL to fetch.
    :param savefile: destination path.
    :param timeout: request timeout in seconds.
    :return: True if the file was written, False otherwise.
    """
    try:
        # One request instead of a connectivity probe followed by a second
        # full download of the same URL.
        response = requests.get(urlfile, timeout=timeout)
        if response.status_code != 200:
            return False
        with open(savefile,'wb') as f:
            f.write(response.content)
        return True
    except Exception as e:
        print("下载错误:",e)
        return False

#########################################
#获取与服务器的链接
# 监测网络
def ifServerConnected(serverweb):
    """Probe ``serverweb`` with a GET request (10 s timeout).

    :return: True only when the response status is 200; False for any other
        status or when the request raises.
    """
    try:
        return requests.get(serverweb, timeout=10).status_code == 200
    except Exception:
        return False

def get_findAll_urls(text, pattern=''):
    """Extract URL-like strings from ``text``.

    :param text: text to search.
    :param pattern: regular expression to use; when empty, a built-in
        pattern matching http(s) URLs and bare host-like strings is used.
    :return: list of matched strings. For patterns with several groups
        (including the built-in one) each match contributes its first
        non-empty group, so the result is always a flat list of strings.
    """
    if pattern == '':
        pattern = r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|([a-zA-Z]+.\w+\.+[a-zA-Z0-9\/_]+)"
    urls = []
    for match in re.findall(pattern, text):
        if isinstance(match, tuple):
            # re.findall yields one tuple per match when the pattern has
            # more than one group; pick the alternative that matched.
            match = next((group for group in match if group), '')
        if match != '':
            urls.append(match)
    return urls
#####################################################################

#################################################
# 检查是否存在记录,不存在则写入
def checkSpiderLog(logfile, url):
    """Record ``url`` in the crawl log unless it is already there.

    The log holds one URL per line and is created on first use.

    :param logfile: path of the log file.
    :param url: URL to look up / record; empty or blank values are ignored.
    :return: True if the URL was already logged; False if it was just added
        (or was empty).
    """
    if not url or not url.strip():
        return False
    target = url.strip()
    if os.path.exists(logfile):
        with open(logfile, 'r') as f:
            # Compare whole lines: a substring test would treat ".../1" as
            # already crawled once ".../12" is in the log.
            if any(line.strip() == target for line in f):
                return True
    with open(logfile, 'a') as f:
        f.write('\n' + target)
    return False


#################################################################################
def getWebdriver():
    """Create a Selenium WebDriver according to the INI settings file.

    Reads section ``浏览器参数`` of ``configFile`` (looked up in the current
    working directory): ``驱动路径`` is the driver executable, ``驱动类型``
    selects ``firefox`` / ``chrome`` (anything else means Internet Explorer)
    and ``是否显示浏览器`` set to ``false`` / ``否`` / ``no`` requests
    headless mode.

    :return: the driver instance, or None if it could not be created (the
        error is shown via ``message``).
    """
    Cfg = ConfigParser()
    Cfg.read(os.path.join(os.getcwd(), configFile))
    try:
        section = Cfg['浏览器参数']
        # The path is NOT lower-cased: file names are case-sensitive on
        # most non-Windows systems.
        driverpath = section['驱动路径'].strip()
        drivertype = section['驱动类型'].strip().lower()
        showbrower = section['是否显示浏览器'].strip().lower()
        headless = showbrower in ('false', '否', 'no')

        if drivertype == 'firefox':
            opts = webdriver.FirefoxOptions()
            if headless:
                opts.add_argument(argument="--headless")
            driver = webdriver.Firefox(executable_path=driverpath, options=opts)
        elif drivertype == 'chrome':
            opts = webdriver.ChromeOptions()
            if headless:
                opts.add_argument(argument="--headless")
            driver = webdriver.Chrome(executable_path=driverpath, options=opts)
        else:
            # Build an options object; the original instantiated
            # webdriver.Ie() here, which launches a browser instead.
            opts = webdriver.IeOptions()
            if headless:
                opts.add_argument(argument="--headless")
            driver = webdriver.Ie(executable_path=r"webdriver\IEDriverServer.exe", options=opts)
        return driver
    except Exception as e:
        msg = "[-]错误提示:getWebdriver--浏览器创建失败!" + e.__str__()
        message(msg)
        return None
################################################################################################################
def getWebdriverElement(driver, typeName, parmStr):
    """Locate a single element with the given locator type.

    :param driver: Selenium WebDriver (anything offering ``find_element``).
    :param typeName: locator type, case-insensitive: ``name``, ``id``,
        ``tag``, ``class``, ``link``, ``xpath`` or ``value``.
    :param parmStr: locator value (name, id, tag name, ...).
    :return: whatever ``driver.find_element`` returns.
    :raises ValueError: if ``typeName`` is not one of the supported types.
    """
    # Locator strategy strings understood by WebDriver.find_element(by, value).
    # Using them directly avoids the find_element_by_* helpers, which newer
    # Selenium releases no longer provide.
    strategies = {
        'name': 'name',
        'id': 'id',
        'tag': 'tag name',
        'class': 'class name',
        'link': 'link text',
        'xpath': 'xpath',
    }
    kind = typeName.lower()
    if kind == 'value':
        # NOTE(review): kept exactly as in the original, which passes
        # parmStr as the locator strategy with no value - confirm the intent.
        return driver.find_element(parmStr)
    if kind not in strategies:
        raise ValueError('unsupported locator type: {!r}'.format(typeName))
    return driver.find_element(strategies[kind], parmStr)
#######################################################################
#######################################################################################################################
#判断文件是否为DOCX文件
def isExtFile(fileext,fileName=None):
    """Tell whether ``fileName`` ends with the extension ``fileext``.

    The comparison is case-insensitive.

    :param fileext: extension to look for, e.g. ``'.docx'``.
    :param fileName: file name to test; ``None``, empty names and names
        shorter than four characters give False.
    :return: True if the name ends with the extension, else False.
    """
    if not fileext or not fileName or len(fileName) < 4:
        return False
    return str(fileName).lower().endswith(str(fileext).lower())

import os, sys
import re
import time
import random
from datetime import datetime
from functools import wraps
import requests
from docx import Document
from configparser import ConfigParser
from docxcompose.composer import Composer
from selenium import webdriver
from bs4 import BeautifulSoup
# Name of the INI settings file; getWebdriver() reads it from the current working directory.
configFile="CommonSpider.ini"

from PIL import Image
# HTTP request headers carrying an iPhone (mobile Safari) User-Agent string.
headers = {
    "User-Agent":
        "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
}
# ASCII-art banner printed by ShowFaceTitle(). Raw string: the backslashes
# in the artwork must not be interpreted as escape sequences.
logo = r"""
     _______  _______ _________ ______   _______  _______
    (  ____ \(  ____ )\__   __/(  __  \ (  ____ \(  ____ )
    | (    \/| (    )|   ) (   | (  \  )| (    \/| (    )|
    | (_____ | (____)|   | |   | |   ) || (__    | (____)|
    (_____  )|  _____)   | |   | |   | ||  __)   |     __)
          ) || (         | |   | |   ) || (      | (\ (
    /\____) || )      ___) (___| (__/  )| (____/\| ) \ \__
    \_______)|/       \_______/(______/ (_______/|/   \__/"""


##############################################################################################################
# 显示标题头
def ShowFaceTitle():
    """Print the program banner: the ASCII-art logo, then the boxed credits.

    Reads the module-level ``logo`` string and writes to stdout; returns nothing.
    """
    credit_rows = (
        '    │          ★★★    作者:     谢海亮   ★★★           │ ',
        '    │           ----------------------------------------------│ ',
        '    │                      2022 @ All Rights Reserved         │ ',
        '    │                                                         │ ',
        '    │ 本程序用于爬取网页信息,使用前请手动配置信息!          │ ',
        '    ╰─────────────────────────────╯ ',
        '\n',  # print() adds its own newline, so this leaves a blank gap
    )
    print(logo)
    for row in credit_rows:
        print(row)


##################################################
def message(msg):
    """Show ``msg`` on the current console line, overwriting what was there.

    The text is prefixed with a carriage return so successive calls replace
    each other (single-line progress display). No newline is appended.

    :param msg: text to display; must be a ``str``.
    """
    sys.stdout.write('\r' + msg)
    # Flush *after* writing: the text has no trailing newline, so a
    # line-buffered stdout would otherwise keep it in the buffer.
    sys.stdout.flush()


###############################################
def delStrfirstNum(numberStr, charlist=None):
    """Strip leading numbering (digits and separator characters) from a string.

    Characters are removed from the front while they are numeric
    (``str.isnumeric``) or listed in ``charlist``. The last character is
    never removed, so a non-empty input always yields a non-empty result
    (e.g. ``"123"`` becomes ``"3"``).

    :param numberStr: text to clean; ``None`` or ``""`` gives ``""``.
    :param charlist: extra characters to strip; defaults to ``['、']`` when
        omitted or empty.
    :return: the text without its leading numbering.
    """
    if not numberStr:
        return ''
    if not charlist:
        charlist = ['、']
    # Scan with an index instead of re-slicing the string on every step.
    cut = 0
    last = len(numberStr) - 1
    while cut < last and (numberStr[cut].isnumeric() or numberStr[cut] in charlist):
        cut += 1
    return numberStr[cut:]

def isImgUrl(url):
    """Tell whether ``url`` looks like a direct link to a jpg/gif/png image.

    Purely textual check (case-insensitive): the value must start with
    ``http`` and its last four characters must be ``.jpg``, ``.gif`` or
    ``.png``. The argument is converted with ``str()`` first.
    """
    text = str(url)
    has_http_prefix = text[:4].lower() == 'http'
    has_image_suffix = text[-4:].lower() in ('.jpg', '.gif', '.png')
    return has_http_prefix and has_image_suffix

#################################################################################
def filterFileName(filename, prestr=""):
    """Turn arbitrary text into a file-name-safe string.

    Drops every character found in the blocked list, removes non-breaking
    spaces and CR/LF/TAB, trims the ends, strips leading numbering via
    ``delStrfirstNum`` and finally prepends ``prestr``.

    :param filename: raw text to clean.
    :param prestr: prefix placed in front of the cleaned name.
    :return: the cleaned name, or ``""`` when fewer than two characters remain.
    """
    blocked = (
        '?', '?', ' ', '╲', '/', '*', '“', '<', '>', '、', '|', ':', '。', '"', '%', '^', '$', '~', ',', ':', '.', ',',
        '(', ')')
    cleaned = ''.join(ch for ch in filename if ch not in blocked)
    for invisible in ('\xa0', '\r', '\n', '\t'):
        cleaned = cleaned.replace(invisible, '')
    cleaned = cleaned.strip()
    if len(cleaned) < 2:
        return ""
    return prestr + delStrfirstNum(cleaned, ['、', ',', ','])
#截取图片
def image_cut_save(path, maskpos,mask_width,mask_height, save_path):
    """Crop a full-width watermark band off an image and save the result.

    The band is ``mask_height`` pixels tall and sits at the top of the image
    for ``maskpos`` '左上' / '右上', or at the bottom for '左下' / '右下'.
    Nothing is written when the file is not a .png/.jpg/.jpeg, when
    ``maskpos`` is any other value, when the mask is not positive, or when
    the mask exceeds one fifth of the image in either dimension.

    :param path: source image path.
    :param maskpos: watermark position, one of '左上', '右上', '左下', '右下'.
    :param mask_width: watermark width in pixels (only used for the 1/5 size check).
    :param mask_height: watermark height in pixels; this many rows are removed.
    :param save_path: where the cropped image is written (may equal ``path``).
    :return: None. Errors are printed, not raised.
    """
    try:
        if not path.lower().endswith(('.png', '.jpg', '.jpeg')):
            return
        # The context manager closes the file handle even on early return.
        with Image.open(path) as img:
            imgW, imgH = img.size
            # Refuse to cut away more than a fifth of the picture.
            if mask_height > int(imgH / 5) or mask_width > int(imgW / 5):
                return
            if mask_height <= 0:
                return  # nothing to remove
            if maskpos in ('左上', '右上'):
                # Watermark at the top: keep everything BELOW the band.
                box = (0, mask_height, imgW, imgH)
            elif maskpos in ('左下', '右下'):
                # Watermark at the bottom: keep everything ABOVE the band.
                box = (0, 0, imgW, imgH - mask_height)
            else:
                return
            roi = img.crop(box)
            # Force the pixel data into memory before the source is closed,
            # so saving over the same path is safe.
            roi.load()
        roi.save(save_path)
    except Exception as e:
        print(e.__str__())

def isMaskImgFile(img,maskpostion,maskwidth,maskheight):
    """Validate an image file and, if it is valid, crop its watermark in place.

    :param img: path of the file to check.
    :param maskpostion: watermark position passed on to ``image_cut_save``.
    :param maskwidth: watermark width passed on to ``image_cut_save``.
    :param maskheight: watermark height passed on to ``image_cut_save``.
    :return: False when the extension is not a known image type, the file is
        smaller than 5 KB, or it cannot be opened as an image; True after a
        successful integrity check (the crop is then attempted on the file).
    :raises: whatever ``Image.verify()`` raises for a damaged file, and
        ``OSError`` from ``os.path.getsize`` if the file is missing.
    """
    image_exts = ('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')
    if not img.lower().endswith(image_exts):
        return False
    if os.path.getsize(img) / float(1024) < 5:
        return False
    try:
        image = Image.open(img)  # can the file be opened as an image at all?
    except Exception:
        return False
    try:
        image.verify()  # integrity check; errors propagate to the caller
    finally:
        image.close()
    # Remove the watermark, overwriting the file.
    image_cut_save(img, maskpostion,maskwidth,maskheight, img)
    return True

def isUrlStr(url):
    """Return True when ``url`` starts with http:// or https:// followed by
    a word character and at least one more character; False otherwise."""
    return re.match(r'^https?:/{2}\w.+$', url) is not None
##################################################################################
# 更新修复URL列表
def updateUrlList(urllist, prestr=''):
    """Normalise a list of links into unique absolute URLs.

    Entries that do not start with ``http`` get ``prestr`` prepended; entries
    that still fail ``isUrlStr`` are dropped.

    :param urllist: iterable of link strings (may be empty or None).
    :param prestr: prefix (typically scheme + host) for relative links.
    :return: list of unique URLs in first-seen order.
    """
    if not urllist:
        return []
    unique = {}
    for item in urllist:
        if item[:4] != 'http':
            item = prestr + item
        if isUrlStr(item):
            # De-duplicate AFTER prefixing: "/a" and "<prestr>/a" are the
            # same URL. A dict keeps insertion order, unlike set().
            unique[item] = None
    return list(unique)
##################################################################################
# 判断是否存在敏感关键字
def isIncludeRiskWord(contents, riskwordfile):
    """Check whether ``contents`` contains any word listed in a word file.

    The file holds one word per line. If it does not exist, an empty file is
    created (so the user can fill it in later) and False is returned.

    :param contents: text to scan.
    :param riskwordfile: path of the word-list file (read with the platform
        default encoding).
    :return: True if at least one non-empty word occurs in ``contents``.
    """
    if not os.path.exists(riskwordfile):
        with open(riskwordfile, 'w'):
            pass
        return False
    with open(riskwordfile, 'r') as f:
        # Strip the line terminators: with them attached a word only matched
        # when the text happened to contain the same newline.
        words = [line.strip() for line in f]
    # Blank lines are skipped; an empty word would match every text.
    return any(word and word in contents for word in words)


##################################################################################
# 将一段文档文字根据首字符标题规范分成若干列表
def spilitContents(contentlist, spilitchars=('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')):
    """Group paragraphs into sections that start at numbered headings.

    Each paragraph is stripped; blank ones are ignored. A paragraph whose
    leading characters are in ``spilitchars`` is a heading: that prefix is
    removed (the last character is always kept) and a new section begins.
    Paragraphs inside a section are joined with a leading ``'\\r'``.

    :param contentlist: iterable of paragraphs (converted with ``str()``).
    :param spilitchars: characters that mark a heading prefix.
    :return: list of section strings; the first item holds whatever precedes
        the first heading.
    """
    fgf = '_|_'  # section separator inserted in front of every heading
    pieces = []
    for content in contentlist:
        p = str(content).strip()
        if not p:
            continue
        cut = 0
        last = len(p) - 1
        while cut < last and p[cut] in spilitchars:
            cut += 1
        newp = p[cut:]
        # A heading is any paragraph that lost a prefix; comparing only the
        # first characters missed cases such as "11".
        if cut:
            newp = fgf + newp
        pieces.append('\r' + newp)
    return ''.join(pieces).split(fgf)



###################################################################################
def make_dir(path):
    """Create ``path`` and make it the current working directory.

    An existing path means the work was already done, so nothing is created
    and the working directory is left alone.

    :return: True if the directory was created (and entered), False if
        ``path`` already existed.
    """
    if os.path.exists(path):
        print("Folder has existed!")
        return False
    os.makedirs(path)
    print(path)
    os.chdir(path)
    return True


###################################################################################
def delete_empty_dir(dir):
    """Recursively remove empty directories below (and including) ``dir``.

    Useful after an interrupted run that created folders but downloaded
    nothing into them. Sub-directories are processed first, so a directory
    that only contained empty directories is removed as well.

    :param dir: root path to clean. A path that is an existing file is
        ignored; a missing path only prints a notice.
    """
    if not os.path.exists(dir):
        print("Please start your performance!")
        return
    if not os.path.isdir(dir):
        # os.listdir() would raise on a regular file.
        return
    for entry in os.listdir(dir):
        child = os.path.join(dir, entry)
        if os.path.isdir(child):
            delete_empty_dir(child)
    if not os.listdir(dir):
        os.rmdir(dir)
        print("remove the empty dir: {}".format(dir))


###################################################################################
def remove_file(path):
    """Delete ``path`` if it is an existing regular file; otherwise do nothing."""
    if not os.path.isfile(path):
        return
    os.remove(path)

###################################################################################
def check_dir(path):
    """Make sure directory ``path`` exists.

    Paths shorter than three characters are rejected without touching the
    file system.

    :return: True if the path already existed; False if it was just created
        or was rejected as too short.
    """
    if len(path) < 3:
        return False
    already_there = os.path.exists(path)
    if not already_there:
        os.makedirs(path)
    return already_there
###################################################################################
def retry(n=3, delay=0.5):
    """Decorator factory that re-runs a callable when it raises.

    The wrapped callable is tried once and, on ``Exception``, retried up to
    ``n`` more times, sleeping ``delay`` seconds before each retry.

    :param n: number of retries after the first failed attempt.
    :param delay: pause between attempts, in seconds.
    :return: a decorator; the decorated callable returns the wrapped result,
        or False once every attempt has failed.
    """
    def deco(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            attempt = 0
            while True:
                attempt += 1
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt == n + 1:
                        # Initial call plus n retries all failed.
                        print('重试结束,[{}]运行失败'.format(func.__name__))
                        return False
                    print('[{}]运行错误,{}s后进行第{}次重试 Err: {}'.format(func.__name__, delay, attempt, e))
                    time.sleep(delay)

        return wrapper

    return deco


###################################################################################
def download(file_url, file_name=None, file_type=None, save_path="download", headers=None, timeout=15):
    """Download ``file_url`` into ``save_path`` and report whether it is on disk.

    :param file_url: URL of the resource to fetch.
    :param file_name: base name for the saved file, sanitised with
        ``filterFileName``; a timestamp is used when it is omitted or when
        sanitising leaves nothing.
    :param file_type: extension without the dot; taken from the last path
        segment of the URL when omitted ("uknown" if that has none).
    :param save_path: target directory, created if missing.
    :param headers: HTTP request headers; an iPhone Safari User-Agent is
        sent by default.
    :param timeout: request timeout in seconds.
    :return: True if the file already existed or was downloaded; False on a
        non-200 response or any error (a partial file is removed).
    """
    # NOTE(review): the original called ``filter_name``, which is not defined
    # in the visible part of this module; ``filterFileName`` is the sanitiser
    # available here - confirm no other helper was intended.
    cleaned = filterFileName(file_name) if file_name is not None else ""
    if not cleaned:
        cleaned = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    if file_type is None:
        # Ignore query string / fragment and look only at the last path
        # segment, so "a.com/img?id=1.2" does not yield a bogus extension.
        last_segment = file_url.split("?")[0].split("#")[0].rsplit("/", 1)[-1]
        file_type = last_segment.rsplit(".", 1)[-1] if "." in last_segment else "uknown"
    check_dir(save_path)
    file_name = cleaned + "." + file_type
    target = os.path.join(save_path, file_name)
    if headers is None:
        headers = {
            "User-Agent":
                "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
        }
    if os.path.exists(target):
        print(f'\033[33m{file_name}已存在,不再下载!\033[0m')
        return True
    message(f"Downloading {file_name}")
    try:
        with requests.get(file_url, headers=headers, stream=True, timeout=timeout) as rep:
            # Content-Length is deliberately not read: it is absent on
            # chunked responses and was never used.
            if rep.status_code != 200:
                message("\033[31m下载失败\033[0m")
                return False
            with open(target, "wb") as f:
                for chunk in rep.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        message(f"\033[32m{file_name}下载成功\033[0m")
        return True
    except Exception as e:
        message('下载失败: ' + e.__str__())
        remove_file(target)  # do not leave a truncated file behind
        return False


###################################################################################
# 将html代码以图片list进行分割成块
def splitHtmlByImg(html, imglist):
    """Split HTML into the text chunks that lie between the given image tags.

    Tags are consumed in order: the text before the first occurrence of each
    tag becomes a chunk and the search continues after that tag.

    :param html: HTML source (converted with ``str()`` once a tag is processed).
    :param imglist: image tags in document order (e.g. ``bs4`` Tag objects;
        each is converted with ``str()``).
    :return: list of chunks; chunks of at most one character between tags
        are skipped, and the remainder after the last tag is always appended.
    """
    html_parts = []
    for imgtag in imglist:
        html = str(html)
        before, found, after = html.partition(str(imgtag))
        if not found:
            # Tag is not in the remaining HTML: leave the text untouched
            # rather than emitting it twice.
            continue
        if len(before) > 1:
            html_parts.append(before)
        # Drop only THIS occurrence; identical text later on must survive.
        html = after
    html_parts.append(html)
    return html_parts


###################################################################################
## 根据图片url保存图片,填写referer可伪装referer来源下载防盗链图片
def pic_down(referer_url, pic_url, timeout=15):
    """Download an image into ``<cwd>/temp``, sending a custom Referer header.

    Supplying the page URL as referer allows fetching images protected by
    hot-link checks.

    :param referer_url: value for the ``Referer`` request header.
    :param pic_url: image URL; anything after ``@`` or ``?`` is discarded.
    :param timeout: request timeout in seconds.
    :return: path of the saved file, or ``''`` if the download failed.
    """
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
               "Accept-Encoding": "gzip",
               "Accept-Language": "zh-CN,zh;q=0.8",
               "Referer": referer_url,
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
               }
    pic_url = (pic_url.split('@')[0]).split('?')[0]
    img_name = pic_url.split('/')[-1]
    # Some image URLs carry no extension; add .jpg so that later consumers
    # (Word document generation) accept the file.
    if not img_name.lower().endswith(('.jpg', '.png', '.gif', '.jpeg')):
        img_name = img_name + '.jpg'
    try:
        tempfolder = os.path.join(os.getcwd(), 'temp')
        img_name = os.path.join(tempfolder, img_name)
        # Fetch first, so a failed request does not leave an empty file.
        response = requests.get(pic_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        os.makedirs(tempfolder, exist_ok=True)
        with open(img_name, 'wb') as f:
            f.write(response.content)
        return img_name
    except Exception as e:
        print('pic_down Error:',e.__str__())
    return ''
##################################
# 下载文件
def downloadFile(urlfile,savefile, timeout=30):
    """Download ``urlfile`` and write the body to ``savefile``.

    The file is only written when the server answers with status 200.

    :param urlfile: URL to fetch.
    :param savefile: destination path.
    :param timeout: request timeout in seconds.
    :return: True if the file was written, False otherwise.
    """
    try:
        # One request instead of a connectivity probe followed by a second
        # full download of the same URL.
        response = requests.get(urlfile, timeout=timeout)
        if response.status_code != 200:
            return False
        with open(savefile,'wb') as f:
            f.write(response.content)
        return True
    except Exception as e:
        print("下载错误:",e)
        return False

#########################################
#获取与服务器的链接
# 监测网络
def ifServerConnected(serverweb):
    """Probe ``serverweb`` with a GET request (10 s timeout).

    :return: True only when the response status is 200; False for any other
        status or when the request raises.
    """
    try:
        return requests.get(serverweb, timeout=10).status_code == 200
    except Exception:
        return False

def get_findAll_urls(text, pattern=''):
    """Extract URL-like strings from ``text``.

    :param text: text to search.
    :param pattern: regular expression to use; when empty, a built-in
        pattern matching http(s) URLs and bare host-like strings is used.
    :return: list of matched strings. For patterns with several groups
        (including the built-in one) each match contributes its first
        non-empty group, so the result is always a flat list of strings.
    """
    if pattern == '':
        pattern = r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|([a-zA-Z]+.\w+\.+[a-zA-Z0-9\/_]+)"
    urls = []
    for match in re.findall(pattern, text):
        if isinstance(match, tuple):
            # re.findall yields one tuple per match when the pattern has
            # more than one group; pick the alternative that matched.
            match = next((group for group in match if group), '')
        if match != '':
            urls.append(match)
    return urls
#####################################################################

#################################################
# 检查是否存在记录,不存在则写入
def checkSpiderLog(logfile, url):
    """Record ``url`` in the crawl log unless it is already there.

    The log holds one URL per line and is created on first use.

    :param logfile: path of the log file.
    :param url: URL to look up / record; empty or blank values are ignored.
    :return: True if the URL was already logged; False if it was just added
        (or was empty).
    """
    if not url or not url.strip():
        return False
    target = url.strip()
    if os.path.exists(logfile):
        with open(logfile, 'r') as f:
            # Compare whole lines: a substring test would treat ".../1" as
            # already crawled once ".../12" is in the log.
            if any(line.strip() == target for line in f):
                return True
    with open(logfile, 'a') as f:
        f.write('\n' + target)
    return False


#################################################################################
def getWebdriver():
    """Create a Selenium WebDriver according to the INI settings file.

    Reads section ``浏览器参数`` of ``configFile`` (looked up in the current
    working directory): ``驱动路径`` is the driver executable, ``驱动类型``
    selects ``firefox`` / ``chrome`` (anything else means Internet Explorer)
    and ``是否显示浏览器`` set to ``false`` / ``否`` / ``no`` requests
    headless mode.

    :return: the driver instance, or None if it could not be created (the
        error is shown via ``message``).
    """
    Cfg = ConfigParser()
    Cfg.read(os.path.join(os.getcwd(), configFile))
    try:
        section = Cfg['浏览器参数']
        # The path is NOT lower-cased: file names are case-sensitive on
        # most non-Windows systems.
        driverpath = section['驱动路径'].strip()
        drivertype = section['驱动类型'].strip().lower()
        showbrower = section['是否显示浏览器'].strip().lower()
        headless = showbrower in ('false', '否', 'no')

        if drivertype == 'firefox':
            opts = webdriver.FirefoxOptions()
            if headless:
                opts.add_argument(argument="--headless")
            driver = webdriver.Firefox(executable_path=driverpath, options=opts)
        elif drivertype == 'chrome':
            opts = webdriver.ChromeOptions()
            if headless:
                opts.add_argument(argument="--headless")
            driver = webdriver.Chrome(executable_path=driverpath, options=opts)
        else:
            # Build an options object; the original instantiated
            # webdriver.Ie() here, which launches a browser instead.
            opts = webdriver.IeOptions()
            if headless:
                opts.add_argument(argument="--headless")
            driver = webdriver.Ie(executable_path=r"webdriver\IEDriverServer.exe", options=opts)
        return driver
    except Exception as e:
        msg = "[-]错误提示:getWebdriver--浏览器创建失败!" + e.__str__()
        message(msg)
        return None
################################################################################################################
def getWebdriverElement(driver, typeName, parmStr):
    """Locate a single element with the given locator type.

    :param driver: Selenium WebDriver (anything offering ``find_element``).
    :param typeName: locator type, case-insensitive: ``name``, ``id``,
        ``tag``, ``class``, ``link``, ``xpath`` or ``value``.
    :param parmStr: locator value (name, id, tag name, ...).
    :return: whatever ``driver.find_element`` returns.
    :raises ValueError: if ``typeName`` is not one of the supported types.
    """
    # Locator strategy strings understood by WebDriver.find_element(by, value).
    # Using them directly avoids the find_element_by_* helpers, which newer
    # Selenium releases no longer provide.
    strategies = {
        'name': 'name',
        'id': 'id',
        'tag': 'tag name',
        'class': 'class name',
        'link': 'link text',
        'xpath': 'xpath',
    }
    kind = typeName.lower()
    if kind == 'value':
        # NOTE(review): kept exactly as in the original, which passes
        # parmStr as the locator strategy with no value - confirm the intent.
        return driver.find_element(parmStr)
    if kind not in strategies:
        raise ValueError('unsupported locator type: {!r}'.format(typeName))
    return driver.find_element(strategies[kind], parmStr)
#######################################################################
#######################################################################################################################
#判断文件是否为DOCX文件
def isExtFile(fileext,fileName=None):
    """Tell whether ``fileName`` ends with the extension ``fileext``.

    The comparison is case-insensitive.

    :param fileext: extension to look for, e.g. ``'.docx'``.
    :param fileName: file name to test; ``None``, empty names and names
        shorter than four characters give False.
    :return: True if the name ends with the extension, else False.
    """
    if not fileext or not fileName or len(fileName) < 4:
        return False
    return str(fileName).lower().endswith(str(fileext).lower())
#######################################################################################################################
#获取目录下的docx文件列表
def getExtFileList(fileext,DirName):
    """List the entries of ``DirName`` whose names end with ``fileext``.

    :param fileext: extension to match, as understood by ``isExtFile``.
    :param DirName: directory to scan (not recursive).
    :return: list of matching entry names (without the directory part);
        empty when ``DirName`` is missing or is not a directory.
    """
    # isdir() rather than exists(): os.listdir() raises on a regular file.
    if not os.path.isdir(DirName):
        return []
    return [entry for entry in os.listdir(DirName) if isExtFile(fileext, entry)]
#######################################################################################################################
#获取目录下的docx文件列表
def getExtFileList(fileext,DirName):
    """List the entries of ``DirName`` whose names end with ``fileext``.

    :param fileext: extension to match, as understood by ``isExtFile``.
    :param DirName: directory to scan (not recursive).
    :return: list of matching entry names (without the directory part);
        empty when ``DirName`` is missing or is not a directory.
    """
    # isdir() rather than exists(): os.listdir() raises on a regular file.
    if not os.path.isdir(DirName):
        return []
    return [entry for entry in os.listdir(DirName) if isExtFile(fileext, entry)]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值