前言
前日看到大佬发的机器学习识别XSS的项目代码
于是搞了一把,记录一下遇到的问题
思路
大佬代码中给出的流程:
数据集:GET/POST query
泛化、分词:将数字串、链接替换为固定词,然后用正则表达式分词
特征(word2vec)
神经网络(MLP/RNN/CNN)
我实践的流程:
泛化、分词流程不变
特征(分别尝试了word2vec、doc2vec、统计特征)
SVM
数据预处理
以下是在本地做数据预处理时的 jupyter notebook 代码。
import graphlab as gl
1 取数据
normal = gl.SFrame.read_csv('data/normal_examples.csv')
evil = gl.SFrame.read_csv('data/xssed.csv')
Parsing completed. Parsed 40637 lines in 0.091727 secs.
normal = gl.SFrame(normal)
evil = gl.SFrame(evil)
normal
param
_%3D1498591621808
code%3Dzs_000001%2Czs_399
001%2Czs_399006%26cb% ...
_%3D1498591951848%26list%
3Dml_sh600030 ...
6053%26ri%3Dzb6-00f%7E-
04gUry-01h- ...
b1498592370545%3D1
v%3D13111002
COLLCC%3D3442798258%26
t%3Dcheck%26rec%3Dstratus
%26etyp%3Dconnect%26z ...
cn_600022%2Ccn_600516%2Cc
n_000002%2Ccn_600519% ...
_%3D1498179095094%26list%
3Dsh600030 ...
[200129 rows x 1 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
2 画数据
normal.show()
Canvas is accessible via web browser at the URL: http://localhost:54528/index.html
Opening Canvas in default web browser.
evil.show()
Canvas is updated and available in a tab in the default browser.
evil.head()
param
Search%3D%3C/script%3E%3C
img/%2A%00/src%3D%22w ...
symbol%3D%3Ch1%3E%3Cscrip
t%3Ealert%28/hacked/% ...
query%3D%3CIMG%2B%22%22%2
2%3E%3CSCRIPT%3Ealert ...
ReturnUrl%3Dhttp%3A//www.
elle.fr/recherche ...
_lang%3D%22%3E%3Cscript%3
Ealert%28document.coo ...
language%3D%22%3E%3C/scri
pt%3E%22%3E%27%3E%3Cs ...
q%3Dbentley%26stylesheet%
3D%22%3E%3Cscript%3Ea ...
option%3Dcom_wdshop%26vie
w%3Duserinfo%26ajax_j ...
CT_ORIG_URL%3D/arena/%22%
3E%3Cscript%3Ealert%2 ...
query%3DSearch...%26Produ
ct%3D%27%22--%3E%3C/s ...
[10 rows x 1 columns]
3 泛化、分词
import nltk
import re
from urllib import unquote
def GeneSeg(payload):
    """Normalise one request row and split it into tokens.

    Args:
        payload: A row mapping whose ``'param'`` entry holds the raw
            (possibly URL-encoded) query string.

    Returns:
        The list of tokens produced by ``nltk.regexp_tokenize``.
    """
    text = payload['param'].lower()
    # Decode twice so that double-encoded input (%253C -> %3C -> <) is exposed.
    text = unquote(unquote(text))
    # Generalise every run of digits to the single token "0".
    text = re.sub(r'\d+', "0", text)
    # Collapse any URL to the fixed placeholder "http://u".
    text = re.sub(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?]+', "http://u", text)
    # Tokenising pattern (verbose mode, alternatives tried in order).
    # The inline (?x) flag sits at the very start of the pattern because
    # Python 3.11+ rejects global flags placed anywhere else.
    # NOTE(review): in the published listing the three tag alternatives
    # (</\w+>, <\w+>, <\w+) appeared as `\w+>` followed by two empty
    # branches, apparently because "<...>" text was stripped when the page
    # was rendered. An empty branch would match before `\w+=` is ever tried,
    # which contradicts the recorded output (tokens such as "search="), so
    # the tag patterns are restored here; confirm against the original
    # notebook.
    token_pattern = r'''(?x)
        [\w\.]+?\(
        |\)
        |"\w+?"
        |'\w+?'
        |http://\w
        |</\w+>
        |<\w+>
        |<\w+
        |\w+=
        |>
        |[\w\.]+
    '''
    return nltk.regexp_tokenize(text, token_pattern)
# Tokenise both data sets, storing the result in a new 'parsed' column.
for frame in (normal, evil):
    frame['parsed'] = frame.apply(GeneSeg)
normal.head()
param
parsed
_%3D1498591621808
[_=, 0]
code%3Dzs_000001%2Czs_399
001%2Czs_399006%26cb% ...
[code=, zs_0, zs_0, zs_0,
cb=, fortune_hq_cn, _=, ...
_%3D1498591951848%26list%
3Dml_sh600030 ...
[_=, 0, list=, ml_sh0]
6053%26ri%3Dzb6-00f%7E-
04gUry-01h- ...
[0, ri=, zb0, 0f, 0gury,
0h, 0rc, tn=, 0, en=, ...
b1498592370545%3D1
[b0=, 0]
v%3D13111002
[v=, 0]
COLLCC%3D3442798258%26
[collcc=, 0]
t%3Dcheck%26rec%3Dstratus
%26etyp%3Dconnect%26z ...
[t=, check, rec=,
stratus, etyp=, connect, ...
cn_600022%2Ccn_600516%2Cc
n_000002%2Ccn_600519% ...
[cn_0, cn_0, cn_0, cn_0,
cn_0, cn_0, cn_0, cn_0, ...
_%3D1498179095094%26list%
3Dsh600030 ...
[_=, 0, list=, sh0]
[10 rows x 2 columns]
evil.head()
param
parsed
Search%3D%3C/script%3E%3C
img/%2A%00/src%3D%22w ...
[search=, ,
symbol%3D%3Ch1%3E%3Cscrip
t%3Ealert%28/hacked/% ...
[symbol=, ,
alert(, hacked, ), ...
query%3D%3CIMG%2B%22%22%2
2%3E%3CSCRIPT%3Ealert ...
[query=, ,
ReturnUrl%3Dhttp%3A//www.
elle.fr/recherche ...
[returnurl=, http://u,
globale, searchtext, ), ...
_lang%3D%22%3E%3Cscript%3
Ealert%28document.coo ...
[_lang=, >,
alert(, document.cookie, ...
language%3D%22%3E%3C/scri
pt%3E%22%3E%27%3E%3Cs ...
[language=, >, ,
>, >,
q%3Dbentley%26stylesheet%
3D%22%3E%3Cscript%3Ea ...
[q=, bentley,
stylesheet=, >,
option%3Dcom_wdshop%26vie
w%3Duserinfo%26ajax_j ...
[option=, com_wdshop,
view=, userinfo, ...
CT_ORIG_URL%3D/arena/%22%
3E%3Cscript%3Ealert%2 ...
[ct_orig_url=, arena, >,
query%3DSearch...%26Produ
ct%3D%27%22--%3E%3C/s ...
[query=, search...,
product=, >, , ...
[10 rows x 2 columns]
4 词表
# Collect every token of the malicious samples into one flat list, which is
# later turned into a new SFrame.
# The column is iterated directly instead of appending from inside
# SArray.apply(): that call is only used for its side effect, and in the
# recorded run it collected just 1595 tokens from 40637 rows.
ans = [token for tokens in evil['parsed'] for token in tokens]
dtype: list
Rows: 40637
[[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, 
None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None], [None, None, None, None, None], [None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None], [None, None], [None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, 
None], [None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None], ... ]
evil_word_frame = gl.SFrame(data=ans)
evil_word_frame
X1
search=
src=
worksinchrome
colon
prompt
x0
0
x0
[1595 rows x 1 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
# Count how often each token occurs, keep one row per token and sort by
# descending frequency. A single groupby pass replaces the previous
# per-row filter_by scan (quadratic in the number of tokens) followed by
# unique(); the result has the same two columns, 'X1' and 'count'.
evil_word_frame = evil_word_frame.groupby(
    'X1', {'count': gl.aggregate.COUNT()}
).sort('count', ascending=False)
evil_word_frame
X1
count
0
491
)
155
>
120
alert(
113
99
83
string.fromcharcode(
37
http://u
15
document.cookie
15
xss
10
[330 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
# 取前300个词作为词表
wordlist_len = 300
evil_word_frame = evil_word_frame[0:wordlist_len]
evil_word_frame
X1
count
0
491
)
155
>
120
alert(
113
99
83
string.fromcharcode(
37
http://u
15
document.cookie
15
xss
10
[300 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
evil_word_frame.save('file/evil_word_list.csv', format='csv')
5 词向量
# Replace every token that is not in the vocabulary with the placeholder
# 'WORD' and store the result in the 'words' column.
# The vocabulary is materialised once as a set so that each membership test
# is a hash lookup rather than an SFrame.filter_by scan per token.
vocabulary = set(evil_word_frame['X1'])
evil['words'] = evil.apply(lambda row: [tok if tok in vocabulary else 'WORD' for tok in row['parsed']])
evil
param
parsed
words
Search%3D%3C/script%3E%3C
img/%2A%00/src%3D%22w ...
[search=, ,
[search=, ,
symbol%3D%3Ch1%3E%3Cscrip
t%3Ealert%28/hacked/% ...
[symbol=, ,
alert(, hacked, ), ...
[WORD, ,
alert(, hacked, ), ...
query%3D%3CIMG%2B%22%22%2
2%3E%3CSCRIPT%3Ealert ...
[query=, ,
[query=, ,
ReturnUrl%3Dhttp%3A//www.
elle.fr/recherche ...
[returnurl=, http://u,
globale, searchtext, ), ...
[returnurl=, http://u,
globale, searchtext, ), ...
_lang%3D%22%3E%3Cscript%3
Ealert%28document.coo ...
[_lang=, >,
alert(, document.cookie, ...
[_lang=, >,
alert(, document.cookie, ...
language%3D%22%3E%3C/scri
pt%3E%22%3E%27%3E%3Cs ...
[language=, >, ,
>, >,
[language=, >, ,
>, >,
q%3Dbentley%26stylesheet%
3D%22%3E%3Cscript%3Ea ...
[q=, bentley,
stylesheet=, >,
[q=, bentley,
stylesheet=, >,
option%3Dcom_wdshop%26vie
w%3Duserinfo%26ajax_j ...
[option=, com_wdshop,
view=, userinfo, ...
[WORD, com_wdshop, view=,
userinfo, ajax_json=, ...
CT_ORIG_URL%3D/arena/%22%
3E%3Cscript%3Ealert%2 ...
[ct_orig_url=, arena, >,
[ct_orig_url=, arena, >,
query%3DSearch...%26Produ
ct%3D%27%22--%3E%3C/s ...
[query=, search...,
product=, >, , ...
[query=, search...,
product=, >, , ...
[40637 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
# Same replacement for the normal samples: tokens outside the vocabulary
# built from the malicious samples become 'WORD'. The vocabulary is turned
# into a set once so that each lookup is a hash test instead of an
# SFrame.filter_by scan per token.
vocabulary = set(evil_word_frame['X1'])
normal['words'] = normal.apply(lambda row: [tok if tok in vocabulary else 'WORD' for tok in row['parsed']])
normal
param
parsed
words
_%3D1498591621808
[_=, 0]
[WORD, 0]
code%3Dzs_000001%2Czs_399
001%2Czs_399006%26cb% ...
[code=, zs_0, zs_0, zs_0,
cb=, fortune_hq_cn, _=, ...
[WORD, WORD, WORD, WORD,
WORD, WORD, WORD, 0] ...
_%3D1498591951848%26list%
3Dml_sh600030 ...
[_=, 0, list=, ml_sh0]
[WORD, 0, WORD, WORD]
6053%26ri%3Dzb6-00f%7E-
04gUry-01h- ...
[0, ri=, zb0, 0f, 0gury,
0h, 0rc, tn=, 0, en=, ...
[0, WORD, WORD, WORD,
WORD, WORD, WORD, WORD, ...
b1498592370545%3D1
[b0=, 0]
[WORD, 0]
v%3D13111002
[v=, 0]
[v=, 0]
COLLCC%3D3442798258%26
[collcc=, 0]
[WORD, 0]
t%3Dcheck%26rec%3Dstratus
%26etyp%3Dconnect%26z ...
[t=, check, rec=,
stratus, etyp=, connect, ...
[WORD, WORD, WORD, WORD,
WORD, WORD, WORD, WORD, ...
cn_600022%2Ccn_600516%2Cc
n_000002%2Ccn_600519% ...
[cn_0, cn_0, cn_0, cn_0,
cn_0, cn_0, cn_0, cn_0, ...
[WORD, WORD, WORD, WORD,
WORD, WORD, WORD, WORD, ...
_%3D1498179095094%26list%
3Dsh600030 ...
[_=, 0, list=, sh0]
[WORD, 0, WORD, WORD]
[200129 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
5.1 word2vec
from gensim.models.word2vec import Word2Vec
Using TensorFlow backend.
# Hyper-parameters handed to gensim's Word2Vec constructor below.
embedding_size=128  # passed as `size`
skip_window=5  # passed as `window`
num_sampled=64  # passed as `negative`
num_iter=100  # passed as `iter`
# Training corpus: the token lists of the malicious samples.
data_set = evil['words']
data_set[0:10]
dtype: list
Rows: 10
[['search=', '', ''], ['WORD', '', '', '', 'id=', '0'], ['query=', '', '', '>', 'WORD', 'true', 'ct=', 'null', 'autobounce=', 'true'], ['returnurl=', 'http://u', 'globale', 'searchtext', ')', '>', ''], ['_lang=', '>', ''], ['language=', '>', '', '>', '>', ''], ['q=', 'bentley', 'stylesheet=', '>', '', 'collections=', 'libcms'], ['WORD', 'com_wdshop', 'view=', 'userinfo', 'ajax_json=', 'ajax_fill_city_state', 'format=', 'WORD', 'zip=', '>', '', '', ''], ['ct_orig_url=', 'arena', '>', '', '', ''], ['query=', 'search...', 'product=', '>', '', '', '', 'page=', '0']]
model=Word2Vec(data_set,size=embedding_size,window=skip_window,negative=num_sampled,iter=num_iter)
#model=Word2Vec(data_set)
5.2 模型存取
model.save('model_word2vec_auto')
# import pickle
# with open('model_word2vec_pickle','wb') as f:
# pickle.dump(model,f)
model_new = Word2Vec.load('model_word2vec_auto')
# import pickle
# model_new = pickle.load('model_word2vec_pickle')
5.3 数据存取
# evil.save('evil_data')
# normal.save('normal_data')
# evil = graphlab.SFrame('evil_data')
# normal = graphlab.SFrame('normal_data')
5.4 测试模型
embeddings=model_new.wv
embeddings.similar_by_word("",5)
[('keyword=', 0.4176161289215088),
('searchfor=', 0.39741984009742737),
('page=', 0.38440901041030884),
('id=', 0.3829260766506195),
('q=', 0.3615732789039612)]
embeddings.similar_by_word("alert(",5)
[(')', 0.3255831003189087),
('', 0.3212870955467224),
('', 0.3093520998954773),
('', 0.30848926305770874),
('', 0.2897389233112335)]
5.5 添加向量和标签
# Vocabulary lookups in both directions: word -> index and index -> word.
dictionary = {
    token: position
    for position, token in enumerate(embeddings.index2word)
}
reverse_dictionary = {
    position: token for token, position in dictionary.items()
}
# Bundle the lookups together with the embedding vectors.
word2vec = {
    "dictionary": dictionary,
    "embeddings": embeddings,
    "reverse_dictionary": reverse_dictionary,
}
from graphlab import SArray
def generate_vec(words, size=128):
    """Sum the embedding vectors of the known words in a token list.

    Args:
        words: Iterable of tokens.
        size: Length of the zero vector the sum starts from. It has to
            match the length of the vectors stored in ``embeddings``; the
            default of 128 is the value this function previously
            hard-coded.

    Returns:
        An ``SArray`` with the element-wise sum of the vectors of all
        tokens found in ``dictionary``; all zeros when none is found.
    """
    total = SArray([0.0] * size)
    for word in words:
        # Tokens missing from the trained vocabulary contribute nothing.
        if word in dictionary:
            total += SArray(embeddings[word])
    return total
# Build the labelled data sets. Each SFrame gets two columns: the vector
# returned by generate_vec for the row's 'words', and a constant label:
# 1 for the malicious samples (black), 0 for the normal samples (white).
black = gl.SFrame([evil.apply(lambda x:generate_vec(x['words'])),evil.apply(lambda x:1)])
white = gl.SFrame([normal.apply(lambda x:generate_vec(x['words'])),normal.apply(lambda x:0)])
black.show()
Canvas is accessible via web browser at the URL: http://localhost:59854/index.html
Opening Canvas in default web browser.
#black.save('fuck_black')
#white.save('fuck_white')
len(white)
200129
二分类
之后的工作即划分数据集之后训练SVM,这部分工作导出到机器学习平台完成,可能涉敏就不展开了。
反思
从结果来看,尽管验证集给出了良好的预测效果,但模型的泛化能力很差。如果想让机器在大量标签和函数的组合中识别出是否为恶意,对数据集的要求会更高。
同时,仅根据GET/POST参数判定XSS攻击是不完整的,需关联response的返回内容才能判断是否攻击成功,即便这样也仅能检出反射型XSS。该模型面对真实场景中复杂的输入和触发方式可能会略显鸡肋,需结合业务