# coding:utf-8
import urllib.request
import urllib.error
import ssl
import xlrd
import time
from bs4 import BeautifulSoup
import os
import screenshots
import set_proxy
import eventlet
from cfg.Config import cfg
# Set up the proxy
set_proxy.set_proxy()
ssl._create_default_https_context = ssl._create_unverified_context  # Work around "<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed" in some environments
# The paths below are read from the config file and must be defined by the user
html_resource_save_path = cfg("html_resource_save_path")[0][1]
img_save_path = cfg("img_save_path")[0][1]
url_file_path = cfg("url_file_path")[0][1]
excel_sheet_select = cfg("url_file_path")[1][1]
col = cfg("url_file_path")[2][1]
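# A minimal sketch of the assumed config layout, since cfg() is a project-local
# helper (cfg/Config.py): each cfg(key) call appears to return a list of
# (name, value) pairs, e.g.
#   cfg("url_file_path") -> [("path", "F:/urls.xls"), ("sheet", "Sheet1"), ("col", "0")]
# The exact format is an assumption here and depends on cfg/Config.py.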
def get_html(url):
    """
    :param url: target URL (bare host, without scheme)
    :return: HTML source of the page, or "" on failure
    """
    html = ""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        }
        print("url: ", ("http://" + url))
        req = urllib.request.Request(url="http://" + url + "/", headers=headers)
        # Fetch with a 30-second timeout
        response = urllib.request.urlopen(req, timeout=30)
        html = response.read().decode('utf-8', 'ignore')
    except Exception as e:
        print("Failed to crawl the site:", e)
    finally:
        return html
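# Usage sketch (hypothetical domain): the function prepends "http://" itself,
# so pass a bare host rather than a full URL, e.g.
#   html = get_html("example.com")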
def write_html(html_resource, file_name):
    """
    :param html_resource: HTML source
    :param file_name: name (without extension) of the .txt file to write
    :return:
    """
    # Use a context manager so the file is closed even if the write fails
    with open(html_resource_save_path + "%s.txt" % str(file_name), 'w', encoding='utf-8') as page_file:
        page_file.write(html_resource)
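# Usage sketch: write_html(html, 0) writes <html_resource_save_path>/0.txt;
# in the main loop below, file_name is the Excel row index of the URL.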
def read_excel(file_path, target_sheet, target_col):
    """
    :param file_path: absolute path of the Excel file
    :param target_sheet: sheet name inside the workbook
    :param target_col: zero-based index of the column to read
    :return: list of the values in that column
    """
    book = xlrd.open_workbook(file_path)
    sheet = book.sheet_by_name(target_sheet)
    target_col_list = sheet.col_values(target_col)
    return target_col_list
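# Note: xlrd 2.0+ dropped .xlsx support and only reads legacy .xls files,
# so this script assumes either xlrd < 2.0 or an .xls workbook.
# Usage sketch (hypothetical file): read_excel("urls.xls", "Sheet1", 0)
# returns column 0 of Sheet1 as a list of cell values.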
def mkdir(path):
    """
    :param path: directory path to create
    :return: True if the directory was created, False if it already existed
    """
    path = path.strip()       # strip leading/trailing whitespace
    path = path.rstrip("\\")  # strip a trailing backslash
    is_exists = os.path.exists(path)  # check whether the path already exists
    if not is_exists:
        # Create the directory (including intermediate directories)
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        # Directory already exists; nothing to create
        print(path + ' already exists')
        return False
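# Equivalent one-liner (Python 3.2+): os.makedirs(path, exist_ok=True)
# creates the whole tree and silently succeeds if it already exists,
# which would make the existence check above unnecessary.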
def get_img(url, folder_name):
    """
    :param url: target URL (bare host, without scheme)
    :param folder_name: folder in which to save the images
    :return:
    """
    # Compute these before the try block so the except handler can use them
    path = img_save_path + "/%s/" % str(folder_name)
    url_final = "http://" + url
    try:
        mkdir(path)  # create the folder if it does not exist
        # Patch the standard library so eventlet.Timeout can interrupt urllib
        eventlet.monkey_patch()
        i = 0
        with eventlet.Timeout(200, False):
            html = urllib.request.urlopen(url_final).read().decode('UTF-8', errors="replace")
            i += 1
        if i == 1:  # the fetch completed within the timeout
            soup = BeautifulSoup(html, 'lxml')
            links = soup.find_all('img')
            if len(links) == 0:
                # No <img> tags on the page: fall back to a screenshot
                screenshots.screenshots(url_final, path)
            else:
                count_num = 0
                for link in links:
                    print("=====================>", count_num)
                    png_path = link.attrs.get('src', '')
                    if png_path.split(':')[0] != "http" and png_path.split(':')[0] != "https":
                        # Relative URL: resolve it against the site root
                        with eventlet.Timeout(200, False):  # skip this image if the download exceeds the timeout
                            urllib.request.urlretrieve("http://" + url + "/" + png_path, path + '%s.jpg' % str(count_num))
                    else:
                        with eventlet.Timeout(200, False):
                            urllib.request.urlretrieve(png_path.replace(' ', ''), path + '%s.png' % str(count_num))
                    count_num += 1
                """ ========== images written to the local folder ========== """
        else:
            # The fetch timed out: fall back to a screenshot
            screenshots.screenshots(url_final, path)
    except Exception:
        """ ========== this URL could not be crawled ========== """
        screenshots.screenshots(url_final, path)
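# Note on eventlet.Timeout(seconds, False): passing False as the exception
# argument makes the timeout silent -- the with-block is simply abandoned
# after `seconds` elapse instead of raising. That is why `i == 1` is checked
# above to tell whether the page fetch actually completed.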
if __name__ == "__main__":
    print("========== reading the Excel file ==========")
    target_col_list = read_excel(url_file_path, excel_sheet_select, int(col))
    print("========== finished reading the Excel file ==========")
    # Start from row 38: a hard-coded resume offset from an earlier interrupted run
    # (the original added 38 inside the loop, which would index past the end of the list)
    for line in range(38, len(target_col_list)):
        print("========== reading HTML source #%d ==========" % line)
        print("reading html:", target_col_list[line])
        # Fetch the page source for this URL
        get_html_resource = get_html(target_col_list[line])
        print("========== finished reading HTML source #%d ==========" % line)
        print("========== writing HTML source #%d ==========" % line)
        # Write the page source to the configured .txt file
        write_html(get_html_resource, line)
        print("========== finished writing HTML source #%d ==========" % line)
        print("========== downloading images for page #%d ==========" % line)
        # Fetch all images at depth 1 under this URL
        get_img(target_col_list[line], line)
        print("========== finished downloading images for page #%d ==========" % line)