A simple crawler. It also contains a small compression and XOR-encryption routine, so after a full run you will see two extensionless files plus one JSON file. The encryption was originally there so the output could sit on an Alibaba Cloud server without being flagged by Yundun (the cloud shield). If you don't need that, delete the marked block in the code below and the images are saved as plain files.
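To get the images back, reverse the two steps: XOR the extensionless archive with the same key, then unzip it. A minimal sketch, assuming the key 0x9e hard-coded below; '0101_120000' is a made-up example of the timestamp-style file name the crawler produces:

# recover.py - XOR-decrypt one output file and extract the archive
import zipfile

key = 0x9e                    # must match the key hard-coded in the crawler
name = '0101_120000'          # hypothetical timestamped output file
with open(name, 'rb') as f:   # single-byte XOR is symmetric, so this decrypts
    data = bytes(b ^ key for b in f.read())
with open(name + '.zip', 'wb') as f:
    f.write(data)
with zipfile.ZipFile(name + '.zip') as z:
    z.extractall(name)        # images land in a folder named after the file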
# -*- coding: UTF-8 -*-
import threading
import requests
import re
import time
import os
import urllib.request
import zipfile
import shutil
import json
def downIMG(url, path):
    # Fetch a single image; the mobile User-Agent helps slip past simple hotlink checks.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(url, path)
def getall(page, num, site, filepath):
    # Fetch one index page, resolve the num-th thread on it, and download
    # every image that thread embeds.
    mainsite = site + '/thread0806.php?fid=16&search=&page=' + str(page)
    mhtml = requests.get(mainsite)
    mhtml.encoding = 'gbk'
    part_siteURL = re.findall(r"<h3>.+?</h3>", mhtml.text, re.S)
    urls = []
    links = []
    titles = []
    effect = []
    if page == 1:
        start = 9  # page 1 apparently opens with pinned entries; skip them
    else:
        start = 0
    for n in range(start, len(part_siteURL) - 1):
        link = re.findall(r"htm_data.+?\.html", part_siteURL[n], re.S)
        if len(link) != 0:
            link = site + '/' + link[0]  # e.g. https://www.baidu.com/htm_data/...
            effect.append(n)
            links.append(link)
            title = re.findall(r"<h3><a.+?>(<font color=.+?>)*(.+?)<.*?/a></h3>", part_siteURL[n], re.S)
            title = title[0][1]
            titles.append(title)
    website = links[num - 1]
    html = requests.get(website)  # one request is enough; the regex below works on the raw HTML
    html.encoding = 'gbk'
    print('Downloading ' + str(num) + '. ' + titles[num - 1])
    # Collect the inline image URLs. Note that '|' inside a character class is
    # a literal pipe, not alternation, so this class is looser than it looks.
    part_picURL = re.findall(r"src='([a-zA-Z0-9|/|www.|.com|:|_|\?|\.\=]+?)(\.|&)(jpg|gif|png|JPG|PNG|GIF)' type='image'>", html.text, re.S)
    sufs = []
    for each in part_picURL:
        urls.append(each[0] + each[1] + each[2])  # base + separator + extension
        sufs.append(each[2])
    length = len(urls)
    threads = []
    # Replace characters Windows forbids in file names, then build the save path.
    title = re.sub(r'[?\\/><":*|]', '~', str(titles[num - 1]))
    pathURL = filepath + '/' + title
    global G
    G = 0  # shared progress counter, bumped by the worker threads
    if length != 0:
        print("Found " + str(length) + " images.")
        if not os.path.exists(pathURL):
            os.makedirs(pathURL)
        for k in range(0, length):
            paths = pathURL + '/' + str(k) + '.' + sufs[k]
            threads.append(threading.Thread(target=downimgs, args=(urls[k], paths, k)))
        # Stagger the thread starts so the requests do not all fire at once.
        for thread in threads:
            thread.start()
            time.sleep(0.5)
        for t in threads:
            t.join(10)
        if len(os.listdir(pathURL)) == 0:
            os.rmdir(pathURL)
        # ---- begin removable block (delete through the end marker to keep plain image folders) ----
        else:
            # Zip the folder, XOR-encrypt the archive, and remove both
            # plaintext copies so only the extensionless file remains.
            tm = time.strftime("%m%d_%H%M%S", time.localtime())
            compress(pathURL, filepath + '/%s.zip' % tm)
            print('Compressed.')
            key = 0x9e
            enc(filepath + '/%s.zip' % tm, filepath + '/%s' % tm, key)
            shutil.rmtree(pathURL)
            os.remove(filepath + '/%s.zip' % tm)
            # Keep an encrypted index ('list') that maps timestamps to titles.
            if os.path.exists(filepath + '/list'):
                dec(filepath + '/list', filepath + '/list.json', key)
                with open(filepath + '/list.json', encoding='utf-8') as jsdec:
                    listdic = json.loads(jsdec.read())
                os.remove(filepath + '/list')
            else:
                listdic = {}
            listdic[tm] = title
            with open(filepath + '/list.json', 'w', encoding='utf-8') as jsdec:
                jsdec.write(json.dumps(listdic, ensure_ascii=False))
            enc(filepath + '/list.json', filepath + '/list', key)
        # ---- end removable block ----
    else:
        print("No downloadable images in this thread.")
def gettitles(page, site):
    # Print the numbered thread titles on one index page and return them.
    mainsite = site + '/thread0806.php?fid=16&search=&page=' + str(page)
    mhtml = requests.get(mainsite)
    mhtml.encoding = 'gbk'
    part_siteURL = re.findall(r"<h3>.+?</h3>", mhtml.text, re.S)
    titles = []
    effect = []
    links = []
    if page == 1:
        start = 9  # page 1 apparently opens with pinned entries; skip them
    else:
        start = 0
    for n in range(start, len(part_siteURL) - 1):
        link = re.findall(r"htm_data.+?\.html", part_siteURL[n], re.S)
        if len(link) != 0:
            link = site + '/' + link[0]  # e.g. https://www.baidu.com/htm_data/...
            effect.append(n)
            links.append(link)
            title = re.findall(r"<h3><a.+?>(<font color=.+?>)*(.+?)<.*?/a></h3>", part_siteURL[n], re.S)
            title = title[0][1]
            titles.append(title)
    for s in range(0, len(effect)):  # len(effect)-1 would silently hide the last title
        print(str(s + 1) + '. ' + titles[s])
    return titles
def getpic(page, dic, site, filepath):
    # Download every thread number listed in dic.
    for num in dic:
        getall(page, num, site, filepath)
G_lock = threading.Lock()  # guards the shared counter G across download threads

def downimgs(url, path, k):
    # Worker thread: fetch one image and report numbered progress.
    global G
    try:
        downIMG(url, path)
        with G_lock:
            G = G + 1
            print("Image %d downloaded." % G)
    except Exception:
        with G_lock:
            G = G + 1
            print("Image %d failed to download." % G)
def enc(src, dst, key):
    # XOR every byte of src with a single-byte key and write the result to dst.
    with open(src, 'rb') as fin, open(dst, 'wb') as fout:
        fout.write(bytes(b ^ key for b in fin.read()))
    print('Encrypted.')
def dec(src, dst, key):
    # A single-byte XOR is its own inverse, so decryption mirrors enc().
    with open(src, 'rb') as fin, open(dst, 'wb') as fout:
        fout.write(bytes(b ^ key for b in fin.read()))
    print('Decrypted.')
def compress(get_files_path, set_files_path):
    # Zip a folder tree; entry names are stored relative to the folder root.
    with zipfile.ZipFile(set_files_path, 'w', zipfile.ZIP_DEFLATED) as f:
        for dirpath, dirnames, filenames in os.walk(get_files_path):
            fpath = dirpath.replace(get_files_path, '')
            fpath = fpath + os.sep if fpath else ''
            for filename in filenames:
                f.write(os.path.join(dirpath, filename), fpath + filename)
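# For example (hypothetical paths): compress('pics/MyThread', 'pics/0101_120000.zip')
# stores the images as '0.jpg', '1.jpg', ... at the archive root rather than under
# a 'MyThread/' prefix, because fpath strips the source folder from each dirpath.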
#************************************************************************************#
# Driver fragment: list the titles, then parse a selection string such as
# '1,4,7-10' into thread numbers. The first five lines are assumed scaffolding
# (the fragment used 'a' and 'titles' without defining them); site and
# filepath are hypothetical placeholders, not values from the original.
site = 'http://example.com'  # hypothetical forum root; replace with the real one
filepath = '.'               # hypothetical save directory
page = 1
titles = gettitles(page, site)
a = input('Numbers to download (e.g. 1,4,7-10): ')
b = a.split(',')
lis = []
n = 0
for e in b:
    finde = re.findall(r'(^[0-9]+?$|^[0-9]+?-[0-9]+$)', e)
    if len(finde) == 1:
        if e.find('-') == -1:
            if int(e) < len(titles):
                lis.append([int(e)])
            elif n == 0:
                print("Value out of range!", end='')
                n = 1
        else:
            ch = re.findall(r'(^[0-9]+?)-([0-9]+$)', e)
            lo, hi = int(ch[0][0]), int(ch[0][1])
            # Reject selections outside 1..len(titles)-1.
            if lo < 1 or lo > len(titles) - 1 or hi < 1 or hi > len(titles) - 1:
                if n == 0:
                    print("Value out of range!", end='')
                    n = 1
            else:
                lis.append(range(lo, hi + 1))  # +1 so '7-10' includes thread 10
    else:
        print("Bad format!")
#************************************************************************************#
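The fragment above only builds lis; nothing ever starts the downloads. A minimal completion, assuming each collected entry is meant to be handed to getpic() (consistent with the function signatures above, but not present in the original source):

for dic in lis:                        # hypothetical driver loop
    getpic(page, dic, site, filepath)  # download every selected thread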