背景:
之前遇到一个需求,需要查看anyrun近一年的新样本。我们库里的样本很多,只要有md5,大部分都可以下载到,所以只要把anyrun上的md5爬下来就可以了。个人反爬虫技术有限,没想到更好的方法去爬anyrun,就用了相对笨的办法:Selenium大法。在这里简单分享一下,希望对大家有所帮助!
代码如下 :
import re
import sys
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
# --- Crawl settings ---------------------------------------------------------

# Page the browser is pointed at when the script starts.
url = "https://app.any.run/submissions"

# Year threshold: the crawl loop compares the year parsed from the first
# timestamp on each page against this value and stops once it is lower.
end_year_time = "2019"

# Credentials typed into the login form (replace the placeholders).
user_name, pass_word = "your username", "your password"

# Output files; the crawl loop opens both in append mode.
md5_value_path = "2019_current_file_virus_md5.txt"  # one MD5 per line
run_log_path = "run_log.log"                        # per-page status lines
# Start a Firefox session and load the target page.
browser = webdriver.Firefox()
# Explicit-wait helper bound to this session (10 s timeout). Nothing in this
# section uses it; the steps below rely on fixed two-second sleeps instead.
wait = WebDriverWait(browser, 10)
browser.get(url)
time.sleep(2)
browser.maximize_window()

# Elements are located with find_element(By.<strategy>, value): the
# find_element_by_* shortcuts were removed in Selenium 4.3, while this form
# works on both Selenium 3 and Selenium 4.

# Click the first item of the second list in the navigation bar
# (presumably the sign-in entry -- the login fields are filled in right after).
browser.find_element(By.XPATH, "/html/body/nav/ul[2]/li[1]").click()
time.sleep(2)

# Fill in the credentials; each field is looked up once and then reused.
username_field = browser.find_element(By.ID, "at-field-username_and_email")
username_field.click()
username_field.clear()
username_field.send_keys(user_name)

password_field = browser.find_element(By.ID, "at-field-password")
password_field.click()
password_field.clear()
password_field.send_keys(pass_word)

browser.find_element(By.ID, "at-btn").submit()
time.sleep(2)

# Open the history filter, expand the first drop-down of the filter form,
# choose its second option, and run the search.
# NOTE(review): what that second option selects is not visible from the
# XPath alone -- confirm against the live page.
browser.find_element(By.ID, "history-filterBtn").click()
browser.find_element(By.XPATH, "//*[@id='historyMenu']/div/div/div[1]/div/form/div[1]").click()
browser.find_element(By.XPATH, "//*[@id='historyMenu']/div/div/div[1]/div/form/div[1]/div/ul/li[2]").click()
browser.find_element(By.XPATH, "//*[@id='historySearchBtn']").click()
time.sleep(2)

# Patterns applied to the raw page HTML by the crawl loop:
#   time_pattern -- text that follows an `os__time` attribute/closing tag
#   md5_pattern  -- everything between "md5:" and the next "aria-hidden"
#                   (re.S lets the match span line breaks)
time_pattern = re.compile('os__time"?>(.*?)<')
md5_pattern = re.compile('md5:(.*?)aria-hidden',re.S)

# The footer text is split on " OF "; the right-hand part is taken as the
# total number of result pages.
max_page_content = int(
    browser.find_element(By.CSS_SELECTOR, ".history-table--footer__current-page").text.split(" OF ")[1]
)
# Walk the result pages one by one: extract the MD5 values from the current
# page, append them to the output file, log the outcome, then click "next".
# The loop only ends through the "Time Over" exit below.
for i in range(0, sys.maxsize):
    page_no = i + 1
    print("=============={0}================".format(page_no))
    try:
        pageSource = browser.page_source

        # First timestamp on the page: commas are dropped and the third
        # space-separated token is compared, as an integer year, with
        # end_year_time.
        first_time = re.findall(time_pattern, pageSource)[0].replace(",", "").split(" ")[2]
        md5_list = re.findall(md5_pattern, pageSource)

        if int(first_time) < int(end_year_time):
            print("Time Over")
            # SystemExit is not an Exception subclass, so the handler below
            # lets it propagate and the script really stops here.
            sys.exit()

        # Only fragments carrying a data-clipboard-text attribute hold an
        # MD5; the value is the last double-quoted string in the fragment.
        md5_values = [
            md5.split('"')[-2] for md5 in md5_list if "data-clipboard-text" in md5
        ]
        count = len(md5_values)
        if md5_values:
            # One open per page instead of one per hash.
            with open(md5_value_path, "a") as fa:
                fa.writelines(md5_value + "\n" for md5_value in md5_values)

        success_line = "{0} Page Get Md5 Success! Count:{1} \n".format(page_no, count)
        print(success_line)
        with open(run_log_path, "a") as f_log_a:
            f_log_a.write(success_line)

        browser.find_element(By.CLASS_NAME, "history-table--footer__next").click()
        time.sleep(2)
    except Exception as exc:
        # Best-effort crawl: record the failure and carry on. Catching only
        # Exception (not a bare except) keeps sys.exit() and Ctrl-C working.
        # NOTE(review): if the failure happened before the "next" click, the
        # following iteration re-reads the same page.
        print("{0} page error ".format(page_no))
        print(repr(exc), file=sys.stderr)
        with open(run_log_path, "a") as f_log_a:
            f_log_a.write("{0} page error \n".format(page_no))
        time.sleep(2)