python如何提取图片特征向量_python之验证码识别 特征向量提取和余弦相似性比较...

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from __future__ import print_function

import os
import re
import time
from io import BytesIO
from string import ascii_letters, digits

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

import numpy as np
import requests
from PIL import Image

# Shared HTTP session so every request carries the same browser-like headers.
ss = requests.Session()
ss.headers.update({'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'})

# BytesIO background:
# https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001431918785710e86a1a120ce04925bae155012c7fc71e000
# Like StringIO, a BytesIO can be initialised from bytes and then read like a file.

# ip_port_type_tuple_list = []

try:
    _read_line = raw_input  # Python 2: raw_input returns the typed text unevaluated
except NameError:
    _read_line = input  # Python 3


# Reference vectors for the digits 0-9, one single-entry dict per glyph.
# Each vector is the column sums followed by the row sums of a 6-pixel-wide
# glyph whose background pixels are 255 and ink pixels are 0.
DIGIT_FEATURE_VECTORS = [
    {'0': [4845, 5865, 5865, 5865, 5865, 4845, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'1': [5865, 5865, 3825, 6120, 6120, 6375, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1275, 1020, 1020, 1275, 1275, 1275, 1275, 1275, 1275, 255, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'2': [5100, 5610, 5610, 5610, 5610, 5355, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1275, 1020, 1275, 1275, 1275, 1275, 0, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'3': [5355, 5865, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1275, 765, 1275, 1275, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'4': [5610, 5865, 5865, 5865, 3825, 6120, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1275, 1020, 1020, 1020, 1020, 1020, 0, 1275, 1275, 1275, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'5': [4845, 5610, 5610, 5610, 5610, 5100, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 0, 1275, 1275, 1275, 255, 1275, 1275, 1275, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'6': [4590, 5610, 5610, 5610, 5610, 5355, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 765, 1275, 1275, 1275, 255, 1020, 1020, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'7': [6120, 6120, 6120, 5100, 5355, 5610, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 0, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'8': [4590, 5610, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1020, 510, 1020, 1020, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
    {'9': [5610, 5610, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1020, 255, 1275, 1275, 1275, 1275, 765, 1530, 1530, 1530, 1530, 1530, 1530, 1530]},
]


class Mimvp(object):
    """Scrape a proxy list whose port numbers are rendered as small images.

    Each port image is split into fixed-width glyphs; every glyph is turned
    into a projection vector (column sums + row sums) and matched against
    the known ``feature_vectors`` by cosine similarity.

    Args:
        num_width: Width in pixels of one glyph. When ``None`` the first call
            to :meth:`split_image` shows the image and prompts for
            ``num_width``, ``white_before_black`` and ``threshhold``.
        feature_vectors: Iterable of single-entry dicts ``{label: vector}``.
            When empty/``None``, :meth:`get_mimvp` first runs the interactive
            :meth:`extract_features`.
        white_before_black: Number of blank columns (1, otherwise treated as
            2) that must precede an inked column to mark a glyph's left edge.
        threshhold: Gray level separating ink from background.
        max_nums: Stop scanning once this many glyphs were found
            (``None`` means no limit).
        filepath: Directory of sample images used by :meth:`extract_features`
            instead of downloading them.
        page: Result page to request; ``None`` requests the first page.
    """

    BASE_URL = 'http://proxy.mimvp.com/free.php?proxy=in_hp'
    # Seconds to wait for the server; without a timeout a stalled
    # connection would block the scraper forever.
    REQUEST_TIMEOUT = 10
    IMAGE_SUFFIXES = ('jpg', 'png', 'gif', 'bmp')

    def __init__(self, num_width=None, feature_vectors=None, white_before_black=2, threshhold=100, max_nums=None, filepath=None, page=None):
        self.ip_port_type_tuple_list = []
        # Copy so that the caller's list is never mutated by extract_features.
        self.feature_vectors = [] if feature_vectors is None else list(feature_vectors)
        self.num_width = num_width
        self.white_before_black = white_before_black
        self.threshhold = threshhold
        self.max_nums = max_nums
        self.filepath = filepath
        if page is None:
            self.url = self.BASE_URL
        else:
            self.url = self.BASE_URL + '&sort=&page=%s' % (page,)

    def get_mimvp(self):
        """Run the whole pipeline and return ``[(ip, port, type), ...]``."""
        # Without reference vectors nothing can be recognised, so collect
        # them interactively first.
        if not self.feature_vectors:
            self.extract_features()
        self.load_mimvp()
        self.get_port_list()
        self.merge_result()
        return self.ip_port_type_tuple_list

    def load_mimvp(self):
        """Download the listing page and extract IPs, port-image URLs and types."""
        resp = ss.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # NOTE(review): the tails of the ip/type patterns were lost when this
        # code was published (text after '<' was stripped); "</td>" is a
        # reconstruction - confirm against the live page markup.
        self.ip_list = re.findall(r"class='tbl-proxy-ip'.*?>(.*?)</td>", resp.text)
        # Source URLs of the images that show the port numbers.
        self.port_src_list = re.findall(r"class='tbl-proxy-port'.*?src=(.*?)\s*/>", resp.text)
        self.type_list = re.findall(r"class='tbl-proxy-type'.*?>(.*?)</td>", resp.text)

    def get_port_list(self):
        """Recognise every port image found by :meth:`load_mimvp`."""
        self.port_list = [self.get_port(src) for src in self.port_src_list]

    def get_port(self, src):
        """Download one port image and return the recognised digits as a string."""
        img = self.load_image_from_src(src)
        port = ''.join(
            self._classify(self.build_vector(glyph))
            for glyph in self.split_image(img)
        )
        print(port)
        return port

    def _classify(self, vector):
        """Return the label of the reference vector most similar to ``vector``."""
        scored = []
        for template in self.feature_vectors:
            # Every template is a single-entry dict {label: reference_vector}.
            label, reference = next(iter(template.items()))
            scored.append((self.cos_similarity(vector, reference), label))
        if not scored:
            raise ValueError('no feature vectors available for classification')
        return max(scored)[1]

    def load_image_from_src(self, src):
        """Fetch the image at ``src`` (resolved against the page URL)."""
        src = urljoin(self.url, src)
        # No newline: get_port prints the recognised port on the same line.
        print(src, end=' ')
        resp = ss.get(src, timeout=self.REQUEST_TIMEOUT)
        return Image.open(BytesIO(resp.content))

    def split_image(self, img):
        """Cut ``img`` into one black-on-white image per glyph.

        Returns:
            A list of images, each ``num_width`` pixels wide and as tall as
            ``img``, with ink pixels 0 and background pixels 255.
        """
        gray = img.convert('L')
        if self.num_width is None:
            # Interactive calibration: show the image and its gray levels,
            # then let the operator choose the segmentation parameters.
            img.show()
            print(gray.getcolors())
            self.num_width = int(_read_line('num_width:'))
            self.white_before_black = int(_read_line('white_before_black:'))
            self.threshhold = int(_read_line('BLACK < (threshhold) < WHITE:'))
        gray_array = np.array(gray)
        # NOTE(review): the threshold expression was truncated in the
        # published code. "darker than threshhold == ink" is reconstructed
        # from the edge scan (blank columns must sum to 0) and from the
        # reference vectors (background contributes 255) - confirm.
        is_ink = gray_array < self.threshhold
        # Ink pixels per column, scanned from left to right.
        vertical = is_ink.sum(0)
        left_list = self._find_left_edges(vertical)
        # uint8 keeps Image.fromarray independent of the platform's default
        # integer width (it cannot build an image from int64 data).
        bilevel = Image.fromarray(np.where(is_ink, 0, 255).astype(np.uint8))
        return [
            bilevel.crop((each_left, 0, each_left + self.num_width, img.height))
            for each_left in left_list
        ]

    def _find_left_edges(self, vertical):
        """Return the column index where each glyph starts.

        A glyph starts at the first inked column that follows
        ``white_before_black`` blank columns (1 blank column when the setting
        is 1, 2 blank columns for any other value).
        """
        gap = self.white_before_black
        left_list = []
        # Slicing off the tail keeps the look-ahead below inside the array.
        for i in range(len(vertical[:-gap])):
            if gap == 1:
                if vertical[i] == 0 and vertical[i + 1] != 0:
                    left_list.append(i + 1)
            elif vertical[i] == 0 and vertical[i + 1] == 0 and vertical[i + 2] != 0:
                left_list.append(i + 2)
            if len(left_list) == self.max_nums:
                break
        return left_list

    def build_vector(self, img):
        """Return the column sums followed by the row sums of ``img``.

        The result has width + height entries; pixel values are used as they
        are, without normalisation.
        """
        img_array = np.array(img)
        return list(img_array.sum(0)) + list(img_array.sum(1))

    def cos_similarity(self, a, b):
        """Return the cosine of the angle between vectors ``a`` and ``b``.

        Returns 0.0 when either vector has zero length, instead of dividing
        by zero.
        """
        # Floats avoid integer overflow in the dot product of large sums.
        vec_a = np.asarray(a, dtype=float)
        vec_b = np.asarray(b, dtype=float)
        magnitude_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        if magnitude_product == 0:
            return 0.0
        return float(np.dot(vec_a, vec_b) / magnitude_product)

    def merge_result(self):
        """Append ``(ip, port, type)`` tuples built from the three parsed lists.

        The type is normalised to ``'both'`` (value contains '/'),
        ``'HTTPS'`` or ``'HTTP'`` (anything else).
        """
        for ip, port, _type in zip(self.ip_list, self.port_list, self.type_list):
            if '/' in _type:
                kind = 'both'
            elif _type == 'HTTPS':
                kind = 'HTTPS'
            else:
                kind = 'HTTP'
            self.ip_port_type_tuple_list.append((ip, port, kind))

    def extract_features(self):
        """Interactively label sample glyphs to build ``feature_vectors``.

        Samples come from ``filepath`` when it is set, otherwise from the
        port images of the listing page. Every glyph is shown and the
        operator types its label; new ``{label: vector}`` entries are stored
        and finally printed, sorted by label, ready to paste into source.
        """
        if self.filepath is not None:
            img_list = self.load_images_from_filepath()
        else:
            self.load_mimvp()
            img_list = self.load_images_from_src_list()
        for img in img_list:
            for split_img in self.split_image(img):
                split_img.show()
                print(split_img.getcolors())
                label = _read_line('input:')
                item = {label: self.build_vector(split_img)}
                if item not in self.feature_vectors:
                    print(item)
                    self.feature_vectors.append(item)
        # Dicts are not orderable on Python 3, so sort by their labels.
        for item in sorted(self.feature_vectors, key=lambda entry: sorted(entry)):
            print(item, ',')

    def load_images_from_filepath(self):
        """Open every jpg/png/gif/bmp file found directly in ``filepath``."""
        img_list = []
        # Sorted so the labelling prompts appear in a reproducible order.
        for filename in sorted(os.listdir(self.filepath)):
            if filename[-3:].lower() in self.IMAGE_SUFFIXES:
                img_list.append(Image.open(os.path.join(self.filepath, filename)))
        return img_list

    def load_images_from_src_list(self):
        """Download every port image found by :meth:`load_mimvp`."""
        return [self.load_image_from_src(src) for src in self.port_src_list]


def main():
    """Recognise the first listing page with the built-in digit vectors.

    Alternatives: ``Mimvp()`` collects the vectors interactively from the
    live page, ``Mimvp(filepath='temp/')`` collects them from local samples.
    """
    from pprint import pprint

    obj = Mimvp(num_width=6, feature_vectors=DIGIT_FEATURE_VECTORS)
    ip_port_type_tuple_list = obj.get_mimvp()
    pprint(ip_port_type_tuple_list)


if __name__ == '__main__':
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值