A simple crawler. It also contains a small compression and XOR-encryption routine, so after a full run you will see two extensionless files plus one JSON file. The encryption was originally there so the output could sit on an Alibaba Cloud server without being flagged by Yundun (the cloud shield). If you don't need that, delete the marked block in the code below and the images are saved as plain files.
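To get the images back, reverse the two steps: XOR the extensionless archive with the same key, then unzip it. A minimal sketch, assuming the key 0x9e hard-coded below; '0101_120000' is a made-up example of the timestamp-style file name the crawler produces:

# recover.py - XOR-decrypt one output file and extract the archive
import zipfile

key = 0x9e                    # must match the key hard-coded in the crawler
name = '0101_120000'          # hypothetical timestamped output file
with open(name, 'rb') as f:   # single-byte XOR is symmetric, so this decrypts
    data = bytes(b ^ key for b in f.read())
with open(name + '.zip', 'wb') as f:
    f.write(data)
with zipfile.ZipFile(name + '.zip') as z:
    z.extractall(name)        # images land in a folder named after the file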
# -*- coding: UTF-8 -*-
import threading
import requests
import re
import time
import os
import urllib.request
import zipfile
import shutil
import json
def downIMG(url, path):
    # Fetch a single image; the mobile User-Agent helps slip past simple hotlink checks.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(url, path)
def getall(page, num, site, filepath):
    # Fetch one index page, resolve the num-th thread on it, and download
    # every image that thread embeds.
    mainsite = site + '/thread0806.php?fid=16&search=&page=' + str(page)
    mhtml = requests.get(mainsite)
    mhtml.encoding = 'gbk'
    part_siteURL = re.findall(r"<h3>.+?</h3>", mhtml.text, re.S)
    urls = []
    links = []
    titles = []
    effect = []
    if page == 1:
        start = 9  # page 1 apparently opens with pinned entries; skip them
    else:
        start = 0
    for n in range(start, len(part_siteURL) - 1):
        link = re.findall(r"htm_data.+?\.html", part_siteURL[n], re.S)
        if len(link) != 0:
            link = site + '/' + link[0]  # e.g. https://www.baidu.com/htm_data/...
            effect.append(n)
            links.append(link)
            title = re.findall(r"<h3><a.+?>(<font color=.+?>)*(.+?)<.*?/a></h3>", part_siteURL[n], re.S)
            title = title[0][1]
            titles.append(title)
    website = links[num - 1]
    html = requests.get(website)  # one request is enough; the regex below works on the raw HTML
    html.encoding = 'gbk'
    print('Downloading ' + str(num) + '. ' + titles[num - 1])
    # Collect the inline image URLs. Note that '|' inside a character class is
    # a literal pipe, not alternation, so this class is looser than it looks.
    part_picURL = re.findall(r"src='([a-zA-Z0-9|/|www.|.com|:|_|\?|\.\=]+?)(\.|&)(jpg|gif|png|JPG|PNG|GIF)' type='image'>", html.text, re.S)
    sufs = []
    for each in part_picURL:
        urls.append(each[0] + each[1] + each[2])  # base + separator + extension
        sufs.append(each[2])
    length = len(urls)
    threads = []
    # Replace characters Windows forbids in file names, then build the save path.
    title = re.sub(r'[?\\/><":*|]', '~', str(titles[num - 1]))
    pathURL = filepath + '/' + title
    global G
    G = 0  # shared progress counter, bumped by the worker threads
    if length != 0:
        print("Found " + str(length) + " images.")
        if not os.path.exists(pathURL):
            os.makedirs(pathURL)
        for k in range(0, length):
            paths = pathURL + '/' + str(k) + '.' + sufs[k]
            threads.append(threading.Thread(target=downimgs, args=(urls[k], paths, k)))
        # Stagger the thread starts so the requests do not all fire at once.
        for thread in threads:
            thread.start()
            time.sleep(0.5)
        for t in threads:
            t.join(10)
        if len(os.listdir(pathURL)) == 0:
            os.rmdir(pathURL)
        # ---- begin removable block (delete through the end marker to keep plain image folders) ----
        else:
            # Zip the folder, XOR-encrypt the archive, and remove both
            # plaintext copies so only the extensionless file remains.
            tm = time.strftime("%m%d_%H%M%S", time.localtime())
            compress(pathURL, filepath + '/%s.zip' % tm)
            print('Compressed.')
            key = 0x9e
            enc(filepath + '/%s.zip' % tm, filepath + '/%s' % tm, key)
            shutil.rmtree(pathURL)
            os.remove(filepath + '/%s.zip' % tm)
            # Keep an encrypted index ('list') that maps timestamps to titles.
            if os.path.exists(filepath + '/list'):
                dec(filepath + '/list', filepath + '/list.json', key)
                with open(filepath + '/list.json', encoding='utf-8') as jsdec:
                    listdic = json.loads(jsdec.read())
                os.remove(filepath + '/list')
            else:
                listdic = {}
            listdic[tm] = title
            with open(filepath + '/list.json', 'w', encoding='utf-8') as jsdec:
                jsdec.write(json.dumps(listdic, ensure_ascii=False))
            enc(filepath + '/list.json', filepath + '/list', key)
        # ---- end removable block ----
    else:
        print("No downloadable images in this thread.")
def gettitles(page, site):
    # Print the numbered thread titles on one index page and return them.
    mainsite = site + '/thread0806.php?fid=16&search=&page=' + str(page)
    mhtml = requests.get(mainsite)
    mhtml.encoding = 'gbk'
    part_siteURL = re.findall(r"<h3>.+?</h3>", mhtml.text, re.S)
    titles = []
    effect = []
    links = []
    if page == 1:
        start = 9  # page 1 apparently opens with pinned entries; skip them
    else:
        start = 0
    for n in range(start, len(part_siteURL) - 1):
        link = re.findall(r"htm_data.+?\.html", part_siteURL[n], re.S)
        if len(link) != 0:
            link = site + '/' + link[0]  # e.g. https://www.baidu.com/htm_data/...
            effect.append(n)
            links.append(link)
            title = re.findall(r"<h3><a.+?>(<font color=.+?>)*(.+?)<.*?/a></h3>", part_siteURL[n], re.S)
            title = title[0][1]
            titles.append(title)
    for s in range(0, len(effect)):  # len(effect)-1 would silently hide the last title
        print(str(s + 1) + '. ' + titles[s])
    return titles
def getpic(page, dic, site, filepath):
    # Download every thread number listed in dic.
    for num in dic:
        getall(page, num, site, filepath)
G_lock = threading.Lock()  # guards the shared counter G across download threads

def downimgs(url, path, k):
    # Worker thread: fetch one image and report numbered progress.
    global G
    try:
        downIMG(url, path)
        with G_lock:
            G = G + 1
            print("Image %d downloaded." % G)
    except Exception:
        with G_lock:
            G = G + 1
            print("Image %d failed to download." % G)
def enc(src, dst, key):
    # XOR every byte of src with a single-byte key and write the result to dst.
    with open(src, 'rb') as fin, open(dst, 'wb') as fout:
        fout.write(bytes(b ^ key for b in fin.read()))
    print('Encrypted.')
def dec(src, dst, key):
    # A single-byte XOR is its own inverse, so decryption mirrors enc().
    with open(src, 'rb') as fin, open(dst, 'wb') as fout:
        fout.write(bytes(b ^ key for b in fin.read()))
    print('Decrypted.')
def compress(get_files_path, set_files_path):
    # Zip a folder tree; entry names are stored relative to the folder root.
    with zipfile.ZipFile(set_files_path, 'w', zipfile.ZIP_DEFLATED) as f:
        for dirpath, dirnames, filenames in os.walk(get_files_path):
            fpath = dirpath.replace(get_files_path, '')
            fpath = fpath + os.sep if fpath else ''
            for filename in filenames:
                f.write(os.path.join(dirpath, filename), fpath + filename)
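# For example (hypothetical paths): compress('pics/MyThread', 'pics/0101_120000.zip')
# stores the images as '0.jpg', '1.jpg', ... at the archive root rather than under
# a 'MyThread/' prefix, because fpath strips the source folder from each dirpath.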
#************************************************************************************#
# Driver fragment: list the titles, then parse a selection string such as
# '1,4,7-10' into thread numbers. The first five lines are assumed scaffolding
# (the fragment used 'a' and 'titles' without defining them); site and
# filepath are hypothetical placeholders, not values from the original.
site = 'http://example.com'  # hypothetical forum root; replace with the real one
filepath = '.'               # hypothetical save directory
page = 1
titles = gettitles(page, site)
a = input('Numbers to download (e.g. 1,4,7-10): ')
b = a.split(',')
lis = []
n = 0
for e in b:
    finde = re.findall(r'(^[0-9]+?$|^[0-9]+?-[0-9]+$)', e)
    if len(finde) == 1:
        if e.find('-') == -1:
            if int(e) < len(titles):
                lis.append([int(e)])
            elif n == 0:
                print("Value out of range!", end='')
                n = 1
        else:
            ch = re.findall(r'(^[0-9]+?)-([0-9]+$)', e)
            lo, hi = int(ch[0][0]), int(ch[0][1])
            # Reject selections outside 1..len(titles)-1.
            if lo < 1 or lo > len(titles) - 1 or hi < 1 or hi > len(titles) - 1:
                if n == 0:
                    print("Value out of range!", end='')
                    n = 1
            else:
                lis.append(range(lo, hi + 1))  # +1 so '7-10' includes thread 10
    else:
        print("Bad format!")
#************************************************************************************#
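The fragment above only builds lis; nothing ever starts the downloads. A minimal completion, assuming each collected entry is meant to be handed to getpic() (consistent with the function signatures above, but not present in the original source):

for dic in lis:                        # hypothetical driver loop
    getpic(page, dic, site, filepath)  # download every selected thread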