在进行Windows系统下恶意软件分析时,没有公开的良性软件数据集,需要自己手动收集一些良性软件用于分析,由于分析时需要的数据量比较多,故而使用爬虫技术来帮助我们快速收集大量的数据,下面的代码是用于收集【系统之家】网站中的良性软件:
import os
import re
import subprocess
import time
from urllib.request import urlopen as uOpen

from bs4 import BeautifulSoup as soup
# Site root for the "系统之家" (xitongzhijia.net) software catalogue.
BASE_URL = "https://www.xitongzhijia.net"


def _fetch_soup(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    The response object is always closed, even if parsing fails.
    """
    client = uOpen(url)
    try:
        return soup(client.read(), "html.parser")
    finally:
        client.close()


def _page_links(page_soup):
    """Return the per-software detail-page hrefs on one listing page.

    Each listing row is a ``<td class="td_2">`` cell whose first anchor
    points at the software's detail page (a site-relative href).
    """
    return [cell.findAll("a")[0]["href"]
            for cell in page_soup.findAll("td", {"class": "td_2"})]


def _download_link(detail_soup):
    """Return the direct download URL from a detail page, or None.

    The site stores the real URL in the ``o_href`` attribute of the
    first ``<a class="local_download">`` anchor; pages without one
    (e.g. removed software) yield None.
    """
    anchors = detail_soup.findAll("a", {"class": "local_download"})
    return anchors[0]["o_href"] if anchors else None


def _crawl_links(links, count, max_bytes, dest_dir, download):
    """Visit each detail page in *links*; optionally fetch the installer.

    Parameters:
        links: site-relative detail-page hrefs from one listing page.
        count: running 1-based counter across all pages (for log output).
        max_bytes: skip files whose Content-Length exceeds this size.
        dest_dir: directory passed to wget's ``-O`` when downloading.
        download: when False, only print what would be downloaded (dry run).

    Returns the updated counter.
    """
    for href in links:
        detail_soup = _fetch_soup(BASE_URL + href)
        link = _download_link(detail_soup)
        if link is None:
            # Polite back-off when a detail page has no download anchor,
            # mirroring the original crawler's behaviour.
            time.sleep(5)
            continue
        print("第" + str(count) + "个下载链接: " + link)
        try:
            probe = uOpen(link)
            try:
                size = int(probe.info()["Content-Length"])
            finally:
                probe.close()
        except (OSError, TypeError, ValueError) as err:
            # Dead link, missing/non-numeric Content-Length, etc.
            # Was a bare `except: pass`; now skips explicitly and visibly.
            print("skip " + link + ": " + str(err))
            continue
        if size <= max_bytes:
            print(str(count) + ": " + link)
            filename = link[link.rindex('/') + 1:]
            print(filename)
            if download:
                # List-form subprocess avoids shell injection from the
                # scraped URL/filename (os.system built a shell string).
                subprocess.run(
                    ["sudo", "wget", "-O", os.path.join(dest_dir, filename),
                     link, "--read-timeout=1"],
                    check=False)
            count += 1
    return count


def benign(start_page=135, end_page=144,
           max_bytes=300 * 1024 * 1024,
           dest_dir="/home/liang/Desktop/benignware-1"):
    """Crawl xitongzhijia.net's "new software" listing and download installers.

    Page 1 is always processed as a dry run (links are printed, nothing is
    fetched); pages ``start_page``..``end_page`` inclusive are downloaded
    via wget into *dest_dir*. Defaults reproduce the original hard-coded
    ``range(135, 145)`` behaviour and the 300 MiB size cap.

    Parameters:
        start_page: first listing page to download from.
        end_page: last listing page (inclusive); None means "use the last
            page number parsed from the site's pagination widget".
        max_bytes: maximum installer size accepted (from Content-Length).
        dest_dir: local directory for downloaded installers.
    """
    list_url = BASE_URL + "/new/"
    first_soup = _fetch_soup(list_url)

    # Last page number from the "ml20 last" pagination span; defaults to 1
    # when the widget is absent.
    max_page = 1
    for span in first_soup.findAll("span", {"class": "ml20 last"}):
        max_page = int(re.search(r"\d+", span.text).group(0))
    if end_page is None:
        end_page = max_page

    count = 1
    links = _page_links(first_soup)
    print("第1页有" + str(len(links)) + "个软件")
    # Dry run on page 1 — the original kept its wget call commented out.
    count = _crawl_links(links, count, max_bytes, dest_dir, download=False)

    for page_num in range(start_page, end_page + 1):
        page_url = list_url + "list_" + str(page_num) + ".html"
        print("\n\n" + page_url)
        page_soup = _fetch_soup(page_url)
        links = _page_links(page_soup)
        print("第" + str(page_num) + "页有" + str(len(links)) + "个软件")
        count = _crawl_links(links, count, max_bytes, dest_dir, download=True)
benign()
在使用上面的Python代码收集良性软件之后,使用VirusTotal在线检测对样本进行标记,以保证良性软件数据集的纯净性;再将标记好的良性软件与公开数据集中的恶意软件混合,即可进行后续的分析或检测等操作。希望可以帮助到大家。