#!/usr/bin/env python#-*- coding: UTF-8 -*
importosimporttimeimportrefrom urlparse importurljoinimportrequests
ss=requests.Session()
ss.headers.update({'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'})from PIL importImage#https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001431918785710e86a1a120ce04925bae155012c7fc71e000#和StringIO类似,可以用一个bytes初始化BytesIO,然后,像读文件一样读取:
from io importBytesIOfrom string importascii_letters, digitsimportnumpy as np#ip_port_type_tuple_list = []
classMimvp():def __init__(self, num_width=None, feature_vectors=None, white_before_black=2, threshhold=100, max_nums=None, filepath=None, page=None):
self.ip_port_type_tuple_list=[]#fluent p189
if feature_vectors isNone:
self.feature_vectors=[]else:
self.feature_vectors=list(feature_vectors)
self.num_width=num_width
self.white_before_black=white_before_black
self.threshhold=threshhold
self.max_nums=max_nums
self.filepath=filepathif page isNone:
self.url= 'http://proxy.%s.com/free.php?proxy=in_hp'%'mimvp'
else:
self.url= 'http://proxy.%s.com/free.php?proxy=in_hp&sort=&page=%s' %('mimvp', page)defget_mimvp(self):#预处理提取特征组需要取得 self.port_src_list
if self.feature_vectors ==[]:
self.extract_features()
self.load_mimvp()
self.get_port_list()
self.merge_result()returnself.ip_port_type_tuple_listdefload_mimvp(self):
resp=ss.get(self.url)
self.ip_list= re.findall(r"class='tbl-proxy-ip'.*?>(.*?)
self.port_src_list= re.findall(r"class='tbl-proxy-port'.*?src=(.*?)\s*/>", resp.text) #图片链接
self.type_list = re.findall(r"class='tbl-proxy-type'.*?>(.*?)
self.port_list=[]for src inself.port_src_list:
port=self.get_port(src)
self.port_list.append(port)defget_port(self, src):
img=self.load_image_from_src(src)
split_imgs=self.split_image(img)
port= ''
for split_img insplit_imgs:
vector=self.build_vector(split_img)
compare_results=[]for t inself.feature_vectors:
cos=self.cos_similarity(vector, t.values()[0])
compare_results.append((cos, t.keys()[0]))#print sorted(compare_results, reverse=True)
port += sorted(compare_results, reverse=True)[0][1]printportreturnportdefload_image_from_src(self, src):
src=urljoin(self.url, src)printsrc,
resp=ss.get(src)
fp=BytesIO(resp.content)
img=Image.open(fp)returnimgdefsplit_image(self, img):
gray= img.convert('L')if self.num_width isNone:
img.show()printgray.getcolors()
self.num_width= int(raw_input('num_width:'))
self.white_before_black= int(raw_input('white_before_black:'))
self.threshhold= int(raw_input('BLACK < (threshhold) < WHITE:'))
gray_array=np.array(gray)
bilevel_array= np.where(gray_array
left_list=[]#从左到右按列求和
vertical =bilevel_array.sum(0)#print vertical
#从左到右按列扫描,2白1黑确定为数字左边缘
for i,c in enumerate(vertical[:-self.white_before_black]):if self.white_before_black == 1:if vertical[i] == 0 and vertical[i+1] !=0:
left_list.append(i+1)else:if vertical[i] == 0 and vertical[i+1] == 0 and vertical[i+2] !=0:
left_list.append(i+2)if len(left_list) ==self.max_nums:break
#分割可见图片
#bilevel = Image.fromarray(bilevel_array) #0/1 手工提取特征 show显示黑块 还没保存gif
bilevel = Image.fromarray(np.where(gray_array
split_imgs = [bilevel.crop((each_left, 0, each_left+self.num_width, img.height)) for each_left inleft_list]returnsplit_imgsdefbuild_vector(self, img):#img = Image.open(img)
img_array =np.array(img)#先遍历w,再遍历h,总共w+h维度,不需要/255,标记黑点个数等多余处理
return list(img_array.sum(0)) + list(img_array.sum(1))defcos_similarity(self, a, b):
A=np.array(a)
B=np.array(b)
dot_product= float(np.dot(A, B)) #A*(B.T) 达不到目的
magnitude_product = np.linalg.norm(A) *np.linalg.norm(B)
cos= dot_product /magnitude_productreturncosdefmerge_result(self):for ip, port, _type inzip(self.ip_list, self.port_list, self.type_list):if '/' in_type:
self.ip_port_type_tuple_list.append((ip, port,'both'))elif _type == 'HTTPS':
self.ip_port_type_tuple_list.append((ip, port,'HTTPS'))else:
self.ip_port_type_tuple_list.append((ip, port,'HTTP'))defextract_features(self):if self.filepath is notNone:
img_list=self.load_images_from_filepath()else:
self.load_mimvp()
img_list=self.load_images_from_src_list()for img inimg_list:
split_imgs=self.split_image(img)for split_img insplit_imgs:
split_img.show()printsplit_img.getcolors()
input= raw_input('input:')
vector=self.build_vector(split_img)
item={input: vector}if item not inself.feature_vectors:printitem
self.feature_vectors.append(item)for i insorted(self.feature_vectors):print i,','
defload_images_from_filepath(self):
img_list=[]
postfix= ['jpg', 'png', 'gif', 'bmp']for filename in [i for i in os.listdir(self.filepath) if i[-3:] inpostfix]:
file=os.path.join(self.filepath, filename)
img_list.append(Image.open(file))returnimg_listdefload_images_from_src_list(self):
img_list=[]for src inself.port_src_list:
img=self.load_image_from_src(src)
img_list.append(img)returnimg_listif __name__ == '__main__':
feature_vectors=[
{'0': [4845, 5865, 5865, 5865, 5865, 4845, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'1': [5865, 5865, 3825, 6120, 6120, 6375, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1275, 1020, 1020, 1275, 1275, 1275, 1275, 1275, 1275, 255, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'2': [5100, 5610, 5610, 5610, 5610, 5355, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1275, 1020, 1275, 1275, 1275, 1275, 0, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'3': [5355, 5865, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1275, 765, 1275, 1275, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'4': [5610, 5865, 5865, 5865, 3825, 6120, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1275, 1020, 1020, 1020, 1020, 1020, 0, 1275, 1275, 1275, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'5': [4845, 5610, 5610, 5610, 5610, 5100, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 0, 1275, 1275, 1275, 255, 1275, 1275, 1275, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'6': [4590, 5610, 5610, 5610, 5610, 5355, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 765, 1275, 1275, 1275, 255, 1020, 1020, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'7': [6120, 6120, 6120, 5100, 5355, 5610, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 0, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'8': [4590, 5610, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1020, 510, 1020, 1020, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
{'9': [5610, 5610, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1020, 255, 1275, 1275, 1275, 1275, 765, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
]#def __init__(self, feature_vectors=None, filepath=None, page=None):
obj= Mimvp(num_width=6, feature_vectors=feature_vectors)#obj = Mimvp()
#obj = Mimvp(filepath='temp/')
ip_port_type_tuple_list=obj.get_mimvp()from pprint importpprint
pprint(ip_port_type_tuple_list)