Python+Selenium实现爬取anyrun样本md5

背景:

之前遇到一个需求:需要查看 anyrun 近一年的新样本。我们库里样本很多,只要有 md5,大部分样本基本都可以下载到,所以就爬一下 anyrun 的样本 md5 吧!个人的反爬虫对抗经验有限,没有想到更好的办法去爬 anyrun,于是用了相对笨的办法:Selenium 大法。在这里简单分享一下,希望对大家有所帮助!

代码如下 :

import time
import re
import sys
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
# Page the browser is pointed at before logging in.
url = "https://app.any.run/submissions"

# Account credentials typed into the login form (placeholders).
user_name, pass_word = "your username", "your password"

# Year threshold; converted with int() and compared against the year parsed
# from the first timestamp on each result page.
end_year_time = "2019"

# Output files: one MD5 per line, and a per-page progress/error log.
md5_value_path = "2019_current_file_virus_md5.txt"
run_log_path = "run_log.log"

# Start Firefox, sign in, and apply the history filter so that the result
# table is on screen for the scraping code that follows.
browser = webdriver.Firefox()
# Explicit wait (10 s timeout): wait.until() keeps retrying a lookup while it
# raises NoSuchElementException, so slow page loads do not abort the script.
wait = WebDriverWait(browser, 10)

browser.get(url)
time.sleep(2)
browser.maximize_window()
# Presumably the navigation entry that opens the sign-in form -- confirm
# against the live page.
wait.until(
    lambda d: d.find_element_by_xpath("/html/body/nav/ul[2]/li[1]")
).click()

time.sleep(2)
# Look each form field up once and reuse the element for click/clear/type.
username_field = wait.until(
    lambda d: d.find_element_by_id("at-field-username_and_email")
)
username_field.click()
username_field.clear()
username_field.send_keys(user_name)

password_field = wait.until(
    lambda d: d.find_element_by_id("at-field-password")
)
password_field.click()
password_field.clear()
password_field.send_keys(pass_word)

browser.find_element_by_id("at-btn").submit()
# Fixed settle time kept from the original flow so the post-login page can
# finish rendering before the filter controls are used.
time.sleep(2)

# Open the history filter, pick the second entry of the first dropdown
# (NOTE(review): which filter this selects is not visible here -- confirm),
# then run the search.
wait.until(lambda d: d.find_element_by_id("history-filterBtn")).click()
wait.until(
    lambda d: d.find_element_by_xpath(
        "//*[@id='historyMenu']/div/div/div[1]/div/form/div[1]"
    )
).click()
wait.until(
    lambda d: d.find_element_by_xpath(
        "//*[@id='historyMenu']/div/div/div[1]/div/form/div[1]/div/ul/li[2]"
    )
).click()
wait.until(
    lambda d: d.find_element_by_xpath("//*[@id='historySearchBtn']")
).click()
time.sleep(2)
# The page source is read fresh on every iteration of the scraping loop, so
# no snapshot is taken here (the former GBK-encoded copy was never used).

# Walk the result pages, appending every MD5 found to md5_value_path and a
# progress line to run_log_path, until a page older than end_year_time is
# reached or the last page has been processed.

# Captures the text of each "os__time" element (the submission timestamp).
time_pattern = re.compile(r'os__time"?>(.*?)<')
# Captures the markup between "md5:" and the next "aria-hidden"; re.S lets
# the match span line breaks.
md5_pattern = re.compile(r'md5:(.*?)aria-hidden', re.S)
# Footer text is split on " OF "; the part after it is the total page count.
max_page_content = int(
    browser.find_element_by_css_selector(".history-table--footer__current-page")
    .text.split(" OF ")[1]
)

page_no = 1
# True once the current page's MD5s are on disk, so that a retry caused by a
# failed "next" click does not write the same values a second time.
page_saved = False
try:
    while page_no <= max_page_content:
        print("=============={0}================".format(page_no))
        try:
            if not page_saved:
                page_html = browser.page_source
                # Third space-separated token of the first timestamp, commas
                # removed -- presumably the year (e.g. "12 March 2019, 10:00");
                # confirm against the live page.
                first_time = (
                    re.findall(time_pattern, page_html)[0]
                    .replace(",", "")
                    .split(" ")[2]
                )
                if int(first_time) < int(end_year_time):
                    print("Time Over")
                    # SystemExit is not an Exception subclass, so the handler
                    # below lets it through and the script really stops.
                    sys.exit()

                # Only chunks carrying a clipboard attribute hold an MD5; the
                # value is the last double-quoted string in the chunk.
                md5_values = [
                    chunk.split('"')[-2]
                    for chunk in re.findall(md5_pattern, page_html)
                    if "data-clipboard-text" in chunk
                ]
                with open(md5_value_path, "a") as fa:
                    fa.writelines(value + "\n" for value in md5_values)

                success_msg = "{0} Page Get Md5 Success! Count:{1} \n".format(
                    page_no, len(md5_values)
                )
                print(success_msg)
                with open(run_log_path, "a") as f_log_a:
                    f_log_a.write(success_msg)
                page_saved = True

            # Nothing to click after the final page.
            if page_no < max_page_content:
                browser.find_element_by_class_name(
                    "history-table--footer__next"
                ).click()
                time.sleep(2)
        except Exception as err:
            # Best effort: log the failure, pause, and retry the same page.
            print("{0} page error ".format(page_no))
            print(repr(err))
            with open(run_log_path, "a") as f_log_a:
                f_log_a.write("{0} page error \n".format(page_no))
            time.sleep(2)
            continue

        page_no += 1
        page_saved = False
finally:
    # Always release the Firefox/driver processes, including on "Time Over"
    # and Ctrl+C.
    browser.quit()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 6
    评论
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值