爬取代理IP:https://blog.csdn.net/weixin_36634753/article/details/100413094
用上一篇脚本爬取代理IP写入文本;
config/cfg.ini配置文件配置代理IP的地址
[File-Path]
ipFilePath = D:\CODE\pyWordSpace\ip2.txt
[TEST]
test = test
读取配置文件的工具脚本(getCfg.py):
import configparser
# Shared parser for config/cfg.ini, loaded once at import time.
# NOTE: ConfigParser.read() silently ignores a missing file, so a bad
# path only surfaces later as configparser.NoSectionError on lookup.
cf = configparser.ConfigParser()
cf.read("config/cfg.ini")
def get_cfg_value(section, option):
    """Return the string value of *option* under *section* in config/cfg.ini.

    Raises configparser.NoSectionError / NoOptionError when the key
    is absent (e.g. the config file was not found at import time).
    """
    return cf.get(section, option)
爬取壁纸的主脚本(需将上面的工具脚本保存为同级目录下的 getCfg.py):
import requests
from bs4 import BeautifulSoup
import urllib
import proxyIP #引用同级目录下的脚本
import getCfg #引用同级目录下的脚本
import random
# Path of the proxy-IP text file, taken from config/cfg.ini.
ipFilePath = getCfg.get_cfg_value("File-Path", "ipFilePath")

# Wallpaper page URL; 96621 is the picture id — lower ids are older wallpapers.
url = "http://desk.zol.com.cn/showpic/1920x1080_96621_8.html"
def get_picture(targeturl, name):
    """Scrape one wallpaper page through rotating proxy IPs and save its image.

    targeturl -- wallpaper page URL to fetch
    name      -- base file name (without extension) for the saved image

    Tries each proxy from the configured IP file in turn and stops at the
    first successful download. All per-proxy failures are swallowed so the
    caller can simply move on to the next wallpaper id.
    """
    ips = proxyIP.read(ipFilePath)  # proxy IPs scraped by the earlier script
    for ip in ips:
        headers = proxyIP.getheaders()  # randomized request headers
        proxies = {"http": "http://" + ip, "https": "https://" + ip}
        try:
            get_html(targeturl, proxies, headers, name)
            break  # first success wins — stop trying further proxies
        except Exception:
            # timeout / connection error with this proxy: try the next one
            # (narrowed from bare `except:` so Ctrl-C still interrupts)
            continue
def get_html(targeturl, proxies, headers, name, save_dir="D:\\CODE\\pyWordSpace\\"):
    """Fetch the wallpaper page, find its <img>, and save the image to disk.

    targeturl -- wallpaper page URL
    proxies   -- requests-style proxy mapping for both page and image fetch
    headers   -- request headers to send
    name      -- base file name (without extension) for the saved image
    save_dir  -- destination directory (defaults to the original hard-coded path)

    Raises on any network/parse failure so get_picture can retry with
    another proxy.
    """
    html_req = requests.get(url=targeturl, proxies=proxies, headers=headers, timeout=10)
    html = BeautifulSoup(html_req.text, "html.parser")
    img = html.find("img")  # img["src"] is the full-size image link
    if img is None or not img.get("src"):
        raise ValueError("no <img> found on page: " + targeturl)
    # Download through the same proxy/headers as the page fetch. The
    # original urllib.request.urlretrieve bypassed the proxy and relied on
    # `import urllib` pulling in urllib.request only transitively.
    img_resp = requests.get(img["src"], proxies=proxies, headers=headers, timeout=10)
    img_resp.raise_for_status()
    with open(save_dir + name + '.jpg', "wb") as f:
        f.write(img_resp.content)
if __name__ == '__main__':
    # Crawl wallpaper pages by decrementing the picture id from the newest one.
    for pic_id in range(96621, 5, -1):
        url = f"http://desk.zol.com.cn/showpic/1920x1080_{pic_id}_8.html"
        get_picture(url, str(pic_id))
        print(f"complete{pic_id}.jpg")