# A collection of my commonly used utility functions (常用功能函数)
import os, sys
import re
import time
import random
from datetime import datetime
from functools import wraps
import requests
from docx import Document
from configparser import ConfigParser
from docxcompose.composer import Composer
from selenium import webdriver
from bs4 import BeautifulSoup
configFile="CommonSpider.ini"
from PIL import Image
headers = {
"User-Agent":
"Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
}
logo = r"""
_______ _______ _________ ______ _______ _______
( ____ \( ____ )\__ __/( __ \ ( ____ \( ____ )
| ( \/| ( )| ) ( | ( \ )| ( \/| ( )|
| (_____ | (____)| | | | | ) || (__ | (____)|
(_____ )| _____) | | | | | || __) | __)
) || ( | | | | ) || ( | (\ (
/\____) || ) ___) (___| (__/ )| (____/\| ) \ \__
\_______)|/ \_______/(______/ (_______/|/ \__/"""
##############################################################################################################
# 显示标题头
def ShowFaceTitle():
    """Print the ASCII-art banner plus author/usage information."""
    banner = (
        ' │ ★★★ 作者: 谢海亮 ★★★ │ ',
        ' │ ----------------------------------------------│ ',
        ' │ 2022 @ All Rights Reserved │ ',
        ' │ │ ',
        ' │ 本程序用于爬取网页信息,使用前请手动配置信息! │ ',
        ' ╰─────────────────────────────╯ ',
    )
    print(logo)
    for line in banner:
        print(line)
    print('\n')
##################################################
def message(msg):
    """Write *msg* to stdout in place: a leading '\r' rewinds to the start
    of the current line so successive calls overwrite each other (progress
    style), and no newline is emitted."""
    sys.stdout.flush()
    sys.stdout.write('\r' + msg)
###############################################
def delStrfirstNum(numberStr, charlist=None):
    """Strip leading digits and separator characters from *numberStr*.

    Repeatedly removes the first character while it is numeric or appears in
    *charlist*, always keeping at least one character.

    :param numberStr: string to clean
    :param charlist: extra leading characters to strip; None/empty defaults to ['、']
    :return: the stripped string, or '' for empty input
    """
    if not numberStr:
        return ''
    # BUG FIX: was `charlist==None` — identity of None is tested with `is`;
    # also give the parameter a default so callers may omit it.
    if charlist is None or len(charlist) < 1:
        charlist = ['、']
    while len(numberStr) > 1 and (numberStr[0].isnumeric() or numberStr[0] in charlist):
        numberStr = numberStr[1:]
    return numberStr
def isImgUrl(url):
    """Return True when *url* starts with http(s) and ends in .jpg/.gif/.png
    (case-insensitive). Note: 4-character suffix check, so '.jpeg' is not matched."""
    text = str(url).lower()
    return text.startswith('http') and text.endswith(('.jpg', '.gif', '.png'))
#################################################################################
def filterFileName(filename, prestr=""):
    """Sanitise *filename* for use as a file name.

    Drops characters that are illegal or awkward in file names, removes
    non-breaking spaces and line breaks, strips leading digits/separators via
    delStrfirstNum, and prefixes the result with *prestr*. Returns '' when
    fewer than two characters survive the cleanup.
    """
    forbidden = (
        '?', '?', ' ', '╲', '/', '*', '“', '<', '>', '、', '|', ':', '。', '"', '%', '^', '$', '~', ',', ':', '.', ',',
        '(', ')')
    cleaned = ''.join(ch for ch in filename if ch not in forbidden)
    for junk in ('\xa0', '\r', '\n', '\t'):
        cleaned = cleaned.replace(junk, '')
    cleaned = cleaned.strip()
    if len(cleaned) < 2:
        return ""
    return prestr + delStrfirstNum(cleaned, ['、', ',', ','])
#截取图片
def image_cut_save(path, maskpos,mask_width,mask_height, save_path):
    """Crop a region out of an image and save it (used to cut watermarks off).

    :param path: source image path; only .png/.jpg/.jpeg files are processed
    :param maskpos: watermark position label: '左上' (top-left), '右上' (top-right),
                    '左下' (bottom-left), '右下' (bottom-right); any other value
                    keeps the full image, which the no-op check below then skips
    :param mask_width: watermark width in px — only used for the 1/5-size sanity check
    :param mask_height: watermark height in px — height of the strip kept/cut
    :param save_path: where the cropped image is written (may overwrite *path*)

    The crop box (left, upper, right, lower) must satisfy lower > upper and
    right > left. All errors are printed and swallowed.
    """
    try:
        if not (path.lower().endswith(('.png', '.jpg', '.jpeg'))): return
        img = Image.open(path)  # open the source image
        imgW, imgH = img.size
        left, upper, right, lower=0,0,imgW, imgH
        # NOTE(review): '左上'/'右上' share one crop box and '左下'/'右下' the other —
        # mask_width never affects the box, and the "top" cases keep only the
        # bottom strip of height mask_height instead of removing a top strip.
        # Looks suspicious; confirm the intended crop before relying on it.
        if maskpos=='左上':
            left , upper, right, lower=0,imgH-mask_height,imgW,imgH
        elif maskpos=='右上':
            left , upper, right, lower=0,imgH-mask_height,imgW,imgH
        elif maskpos=='左下':
            left , upper, right, lower=0,0,imgW,imgH-mask_height
        elif maskpos=='右下':
            left , upper, right, lower=0,0,imgW,imgH-mask_height
        else:left, upper, right, lower=0,0,imgW,imgH
        if mask_height>int(imgH/5) or mask_width>int(imgW/5): return # skip when the mask exceeds 1/5 of the image size
        if left>imgW or right>imgW or upper>imgH or lower>imgH: return
        if (left, upper, right, lower) == (0,0,imgW,imgH): return  # full image -> nothing to cut
        box = (left, upper, right, lower)
        roi = img.crop(box)
        # save the cropped region
        roi.save(save_path)
        del img
    except Exception as e:
        print(e.__str__())
def isMaskImgFile(img,maskpostion,maskwidth,maskheight):
    """Verify *img* is a usable image file and strip its watermark in place.

    Returns False for non-image extensions, files under 5 KB, or when the
    handle cannot be closed after a failed verification; returns True after a
    successful verify + crop. Re-raises the verification error when close()
    succeeded.

    :param img: image file path (overwritten in place by the cropped result)
    :param maskpostion: watermark position label, passed through to image_cut_save
    :param maskwidth: watermark width in px
    :param maskheight: watermark height in px
    """
    if(img.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff'))):
        fsize = os.path.getsize(img)
        fsize = fsize / float(1024)  # size in KB
        if fsize<5:
            return False  # tiny files assumed to be placeholders/broken downloads
        try:
            image = Image.open(img)  # check that the file opens at all
            image.verify()  # check file integrity
            image.close()
            # remove the watermark (overwrites the file in place)
            image_cut_save(img, maskpostion,maskwidth,maskheight, img)
        except:
            try:
                image.close()
            except:
                return False
            # NOTE(review): re-raises the original error only when close()
            # succeeded — presumably intentional, but an unusual contract; confirm.
            raise
        else:
            return True
    else:
        return False
def isUrlStr(url):
    """Return True when *url* looks like an absolute http(s):// URL with at
    least two characters after the scheme separator."""
    return bool(re.match(r'^https?:/{2}\w.+$', url))
##################################################################################
# 更新修复URL列表
def updateUrlList(urllist, prestr=''):
    """Normalise and de-duplicate a list of URLs.

    Entries that do not start with 'http' are prefixed with *prestr* (to turn
    relative links absolute); anything that still is not a valid http(s) URL
    is dropped.

    :param urllist: raw URL strings
    :param prestr: scheme/host prefix for relative URLs
    :return: de-duplicated list of valid URLs (order not guaranteed)
    """
    if len(urllist) < 1:
        return []
    fixed = []
    for item in set(urllist):  # drop exact duplicates up front
        if item[:4] != 'http':
            item = prestr + item
        if isUrlStr(item):
            fixed.append(item)
    # BUG FIX: list(tuple(x)) is a no-op, so the final "去重" never happened;
    # prefixing relative URLs can re-introduce duplicates, so dedupe again
    # (dict.fromkeys keeps first-seen order).
    return list(dict.fromkeys(fixed))
##################################################################################
# 判断是否存在敏感关键字
def isIncludeRiskWord(contents, riskwordfile):
    """Return True when *contents* contains any keyword from *riskwordfile*.

    The keyword file holds one keyword per line; blank lines are ignored.
    A missing file is created empty (so it can be configured later) and the
    function then returns False.
    """
    if not os.path.exists(riskwordfile):
        # create an empty keyword file for later configuration
        with open(riskwordfile, 'w'):
            pass
    with open(riskwordfile, 'r') as f:
        lines = f.readlines()
    for line in lines:
        # BUG FIX: readlines() keeps the trailing '\n', so 'word\n' almost
        # never matched *contents*; strip it. Also skip blank lines, because
        # '' in contents is always True and flagged everything.
        word = line.strip()
        if word and word in contents:
            return True
    return False
##################################################################################
# 将一段文档文字根据首字符标题规范分成若干列表
def spilitContents(contentlist, spilitchars=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']):
    """Split paragraphs into sections at numeric-heading prefixes.

    Each stripped, non-empty paragraph has its leading *spilitchars* removed
    (always keeping at least one character); a paragraph whose first character
    was stripped marks the start of a new section. Returns the list of
    sections, each a '\r'-prefixed run of paragraphs.
    """
    marker = '_|_'
    pieces = []
    for raw in contentlist:
        para = str(raw).strip()
        if not para:
            continue
        trimmed = para
        while trimmed[0] in spilitchars and len(trimmed) > 1:
            trimmed = trimmed[1:]
        if trimmed[0] != para[0]:
            # a heading prefix was stripped -> begin a new section here
            trimmed = marker + trimmed
        pieces.append('\r' + trimmed)
    return ''.join(pieces).split(marker)
###################################################################################
def make_dir(path):
    """Create *path* and chdir into it.

    Returns True when the directory was newly created (and is now the cwd);
    False when it already existed — callers use that as an "already crawled,
    skip this gallery" signal.
    """
    if os.path.exists(path):
        print("Folder has existed!")
        return False
    os.makedirs(path)
    print(path)
    os.chdir(path)
    return True
###################################################################################
def delete_empty_dir(dir):
    """Recursively delete empty directories under (and including) *dir*.

    Useful when a crawl was interrupted after folders were created but before
    any images were downloaded: the leftover empty folder would otherwise make
    the crawler skip that gallery on the next run.
    """
    if os.path.exists(dir):
        if os.path.isdir(dir):
            for d in os.listdir(dir):
                path = os.path.join(dir, d)  # build the child path
                if os.path.isdir(path):
                    delete_empty_dir(path)  # depth-first: remove empty children first
        if not os.listdir(dir):
            os.rmdir(dir)
            print("remove the empty dir: {}".format(dir))
    else:
        print("Please start your performance!")  # nothing to do: dir does not exist
###################################################################################
def remove_file(path):
    """Delete *path* when it is an existing regular file; otherwise do nothing."""
    if not os.path.isfile(path):
        return
    os.remove(path)
###################################################################################
def check_dir(path):
    """Ensure *path* exists as a directory.

    Returns True when it already existed; creates it (including parents) and
    returns False otherwise. Paths shorter than 3 characters are rejected:
    nothing is created and False is returned.
    """
    if len(path) < 3:
        return False
    if os.path.exists(path):
        return True
    os.makedirs(path)
    return False
###################################################################################
def retry(n=3, delay=0.5):
    """Decorator factory: retry the wrapped function up to *n* extra times.

    The function is attempted at most n+1 times; *delay* seconds are slept
    between attempts. Returns the function's result on success, or False
    once the final attempt has failed.
    """
    def deco(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, n + 2):
                try:
                    return func(*args, **kwargs)
                except Exception as err:
                    if attempt > n:
                        break  # retries exhausted
                    print('[{}]运行错误,{}s后进行第{}次重试 Err: {}'.format(func.__name__, delay, attempt, err))
                    time.sleep(delay)
            print('重试结束,[{}]运行失败'.format(func.__name__))
            return False
        return wrapper
    return deco
###################################################################################
def download(file_url, file_name=None, file_type=None, save_path="download", headers=None, timeout=15):
    """Download *file_url* to *save_path*/*file_name*.*file_type*.

    :param file_url: resource URL
    :param file_name: file name to save as; defaults to the current datetime
    :param file_type: extension; defaults to the URL's suffix or 'uknown'
    :param save_path: target directory (no trailing '/'), created if needed
    :param headers: HTTP request headers; defaults to an iPhone User-Agent
    :param timeout: request timeout in seconds
    :return: True when the file exists afterwards (or already existed),
             False when the download failed.
    """
    if file_name is None:
        file_name = str(datetime.now())
    # BUG FIX: was filter_name(), which is not defined anywhere in this module
    file_name = filterFileName(file_name)
    if file_type is None:
        file_type = file_url.split(".")[-1] if "." in file_url else "uknown"
    check_dir(save_path)
    file_name = file_name + "." + file_type
    if headers is None:
        headers = {
            "User-Agent":
                "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
        }
    # skip files that were already downloaded
    if os.path.exists(f"{save_path}/{file_name}"):
        print(f'\033[33m{file_name}已存在,不再下载!\033[0m')
        return True
    message(f"Downloading {file_name}")
    try:
        with requests.get(file_url, headers=headers, stream=True, timeout=timeout) as rep:
            # BUG FIX: check the status code before touching the response;
            # the old code read (and ignored) Content-Length first, which
            # raised KeyError whenever the header was absent.
            if rep.status_code != 200:
                message("\033[31m下载失败\033[0m")
                return False
            with open(f"{save_path}/{file_name}", "wb") as f:
                for chunk in rep.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        message(f"\033[32m{file_name}下载成功\033[0m")
        return True
    except Exception as e:
        message('下载失败: ' + e.__str__())
        remove_file(f"{save_path}/{file_name}")  # drop the partial file
        # BUG FIX: previously fell through to `return True` even on failure
        return False
###################################################################################
# 将html代码以图片list进行分割成块
def splitHtmlByImg(html, imglist):
    """Split an HTML string into parts around each image tag in *imglist*.

    Each element of *imglist* (stringified — they may be bs4 Tag objects)
    acts as a delimiter taken in order: the text before the image is
    appended when longer than one character, the tag itself is dropped, and
    the remainder after the last image is appended as the final part.
    """
    parts = []
    remainder = str(html)
    for tag in imglist:
        tag_str = str(tag)
        before = remainder.split(tag_str)[0]
        if len(before) > 1:
            parts.append(before)
        # remove the consumed prefix plus the tag for the next round
        remainder = remainder.replace(before + tag_str, '')
    parts.append(remainder)  # content after the last image
    return parts
###################################################################################
## 根据图片url保存图片,填写referer可伪装referer来源下载防盗链图片
def pic_down(referer_url, pic_url):
    """Download *pic_url* into ./temp, spoofing *referer_url* as the Referer.

    Sending the page URL as Referer defeats simple hotlink protection. A
    '.jpg' suffix is appended when the URL has no recognised image extension
    (e.g. Sina image URLs carry none, and an extensionless file breaks the
    Word document generated later).

    NOTE(review): the temp folder is built with a Windows-style '\\temp'
    suffix — not portable; consider os.path.join.

    :param referer_url: value sent as the Referer header
    :param pic_url: image URL; '@' resize decorations and query strings are stripped
    :return: the local file path, or '' on failure
    """
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
               "Accept-Encoding": "gzip",
               "Accept-Language": "zh-CN,zh;q=0.8",
               "Referer": referer_url,
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
               }
    # strip resize decorations ('@...') and query strings from the URL
    pic_url = (pic_url.split('@')[0]).split('?')[0]
    img_name = pic_url.split('/')[-1]
    if str(img_name[-4:]).lower() not in ('.jpg', '.png', '.gif', 'jpeg'): img_name = img_name + '.jpg'
    try:
        tempfolder=os.getcwd()+r'\temp'
        img_name=os.path.join(tempfolder,img_name)
        if not os.path.exists(tempfolder):os.mkdir(tempfolder)
        with open(img_name, 'wb') as f:
            response = requests.get(pic_url, headers=headers).content
            f.write(response)
            f.close()
        return img_name
    except Exception as e:
        print('pic_down Error:',e.__str__())
        return ''
##################################
# 下载文件
def downloadFile(urlfile, savefile):
    """Fetch *urlfile* and write the body to *savefile* (best effort).

    Does nothing when the server is unreachable; write errors are printed,
    not raised.
    """
    if not ifServerConnected(urlfile):
        return
    data = requests.get(urlfile).content
    try:
        with open(savefile, 'wb') as fh:
            fh.write(data)
    except Exception as err:
        print("下载错误:", err)
#########################################
#获取与服务器的链接
# 监测网络
def ifServerConnected(serverweb):
    """Return True when *serverweb* answers HTTP 200 within 10 seconds;
    any other status or network error yields False."""
    try:
        return requests.get(serverweb, timeout=10).status_code == 200
    except Exception:
        return False
def get_findAll_urls(text, pattern):
    """Extract URLs from *text*.

    :param text: text to scan
    :param pattern: regex to use; '' selects a default that matches absolute
                    http(s) URLs or bare domain-like strings
    :return: list of matched URL strings (empty matches dropped)
    """
    if pattern == '':
        pattern = r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|([a-zA-Z]+.\w+\.+[a-zA-Z0-9\/_]+)"
    urls = []
    for match in re.findall(pattern, text):
        # BUG FIX: with multiple capture groups, findall yields tuples such as
        # ('http://x', ''), so the old `x != ''` filter removed nothing and
        # callers received tuples. Keep the first non-empty group instead.
        if isinstance(match, tuple):
            match = next((g for g in match if g), '')
        if match:
            urls.append(match)
    return urls
#####################################################################
#################################################
# 检查是否存在记录,不存在则写入
def checkSpiderLog(logfile, url):
    """Check whether *url* was already logged; append it when new.

    Returns True when *url* appears (as a substring) in a line of an existing
    *logfile*; otherwise appends it (creating the file first if needed) and
    returns False. Empty URLs are never logged and yield False.
    """
    if url == '':
        return False
    existing = []
    if os.path.exists(logfile):
        with open(logfile, 'r') as fh:
            existing = fh.readlines()
    else:
        open(logfile, 'w').close()  # create an empty log
    if any(url in line for line in existing):
        return True
    with open(logfile, 'a') as fh:
        fh.write('\n' + url)
    return False
#################################################################################
def getWebdriver():
    """Build a selenium WebDriver from the [浏览器参数] section of the config file.

    Reads the driver path, the driver type ('firefox'/'chrome', anything
    else falls back to IE) and whether to show the browser window (headless
    otherwise).

    :return: a WebDriver instance, or None when creation fails.
    """
    driver = None
    Cfg = ConfigParser()
    Cfg.read(os.getcwd() + "\\" + configFile)
    try:
        driverpath = str(Cfg['浏览器参数']['驱动路径'].strip().lower())
        drivertype = str(Cfg['浏览器参数']['驱动类型'].strip().lower())
        showbrower = str(Cfg['浏览器参数']['是否显示浏览器'].strip().lower())
        if drivertype == 'firefox':
            opts = webdriver.FirefoxOptions()
            if showbrower in ('false', '否', 'no'):
                opts.add_argument(argument="--headless")  # hide the browser window
            driver = webdriver.Firefox(executable_path=driverpath, options=opts)
        elif drivertype == 'chrome':
            opts = webdriver.ChromeOptions()
            if showbrower in ('false', '否', 'no'):
                opts.add_argument(argument="--headless")
            driver = webdriver.Chrome(executable_path=driverpath, options=opts)
        else:
            # BUG FIX: was `opts = webdriver.Ie()`, which launches a browser
            # immediately and then passes a WebDriver object as the options.
            opts = webdriver.IeOptions()
            if showbrower in ('false', '否', 'no'):
                opts.add_argument(argument="--headless")  # NOTE(review): IE has no real headless mode
            # BUG FIX: raw string — '\I' in a normal string is an invalid escape
            driver = webdriver.Ie(executable_path=r"webdriver\IEDriverServer.exe", options=opts)
        return driver
    except Exception as e:
        message("[-]错误提示:getWebdriver--浏览器创建失败!" + e.__str__())
        return None
    finally:
        del Cfg
################################################################################################################
def getWebdriverElement(driver, typeName, parmStr):
    """Find a single element on *driver* by the locator strategy named in *typeName*.

    :param driver: selenium WebDriver
    :param typeName: one of name/id/tag/class/value/link/xpath (case-insensitive)
    :param parmStr: locator value
    :return: the matching WebElement, or None for an unknown *typeName*
             (the old if-chain raised UnboundLocalError in that case).
    """
    kind = typeName.lower()
    element = None  # BUG FIX: unknown typeName no longer raises UnboundLocalError
    if kind == 'name':
        element = driver.find_element_by_name(parmStr)
    elif kind == 'id':
        element = driver.find_element_by_id(parmStr)
    elif kind == 'tag':
        element = driver.find_element_by_tag_name(parmStr)
    elif kind == 'class':
        element = driver.find_element_by_class_name(parmStr)
    elif kind == 'value':
        # NOTE(review): find_element normally takes (by, value); the original
        # single-argument call is kept — confirm against the selenium version in use.
        element = driver.find_element(parmStr)
    elif kind == 'link':
        element = driver.find_element_by_link_text(parmStr)
    elif kind == 'xpath':
        element = driver.find_element_by_xpath(parmStr)
    return element
#######################################################################
#######################################################################################################################
#判断文件是否为DOCX文件
#######################################################################################################################
# NOTE(review): the original file accidentally contained a second, complete copy of
# the whole module pasted from this point on (duplicating every definition above,
# truncating this function mid-body — a SyntaxError — and defining getExtFileList
# twice with inconsistent behavior). The duplicate has been collapsed to the two
# tail helpers that only existed in the pasted copy.
# Check whether a file name carries the given extension (e.g. '.docx')
def isExtFile(fileext, fileName=None):
    """Return True when *fileName* ends with the extension *fileext*.

    :param fileext: extension including the dot, lowercase (e.g. '.docx');
                    the file name's suffix is lowercased before comparison
    :param fileName: file name to test; None or names shorter than 4 chars -> False
    """
    if fileName is None or len(fileName) < 4:
        return False
    ext = str(fileName[-len(fileext):]).lower()
    return ext == fileext
#######################################################################################################################
# List the files of a directory that carry the given extension
def getExtFileList(fileext, DirName):
    """Return the file names in *DirName* ending with *fileext*.

    :param fileext: extension including the dot, lowercase
    :param DirName: directory to scan
    :return: list of matching names, or False when *DirName* does not exist
             (False rather than [] kept for backward compatibility — one of
             the two duplicated originals returned None here, the other False).
    """
    if not os.path.exists(DirName):
        return False
    return [name for name in os.listdir(DirName) if isExtFile(fileext, name)]