PhantomJS + Selenium + Python crawler
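
The script below logs in to an admin site, drives PhantomJS through Selenium to crawl the sealed-domain records behind the login, and writes the results to a database. As a warm-up, here is a minimal sketch of the core technique (it assumes Python 2 and an old Selenium 2/3 release that still ships PhantomJS support; the executable path, cookie value, and URL are placeholders):

from selenium import webdriver

# copy the PhantomJS capability dict so the library-wide default is not mutated
cap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
cap['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Linux; Android 6.0) AppleWebKit/537.36'
cap['phantomjs.page.customHeaders.Cookie'] = 'JSESSIONID=placeholder'  # hypothetical session cookie
driver = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe',
                             desired_capabilities=cap)
driver.get('http://example.com/')  # placeholder URL
print driver.find_element_by_xpath('//body').text  # text of the rendered page
driver.quit()

The full script: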

#!/usr/bin/python
# -*- coding:utf-8 -*-



import sys


sys.path.append('..')
import urllib, datetime, urllib2, threading, time, requests, cookielib, json


from selenium import webdriver
import sqldb.sqldb_test
import urlparse, socket
from snapshot import SnapShot
from apscheduler.schedulers.blocking import BlockingScheduler
from iploc import IPLoc
from phinfo import PhInfo
from detectPhishing.example import fishing


loginurl = 'http://183.207.215.20:7080/login'
getcookiesidurl = 'http://183.207.215.20:7080'
getdomainurl = 'http://183.207.215.20:7080/view/fpss/domainsealed'


reload(sys)
sys.setdefaultencoding('utf-8')




# Simulated login
class Login(object):
    def __init__(self):
        self.m_sqldb = sqldb.sqldb_test.SQLdb()  # database helper from the local sqldb package
        self.lock = threading.Lock()
        self.tasksched = BlockingScheduler()
        self.iploc = IPLoc()
        self.sshot = SnapShot()
        self.m_phi = PhInfo()


        self.name = ''
        self.pwd = ''
        self.domain = ''
        self.max_threadings = 50  # default thread limit; can be overridden via setThreadNum()


        self.cj = cookielib.LWPCookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        urllib2.install_opener(self.opener)


    # def getContent(self, maxPage):
    #     for index in range(1, maxPage + 1):
    #         self.login(index)


    # Number of worker threads
    def setThreadNum(self, nums=50):
        self.max_threadings = nums


    def setLoginInfo(self, username, password, domain):
        '''Set the user's login information.'''
        self.name = username
        self.pwd = password
        self.domain = domain


    def login(self):
        '''Log in to the site.'''


        loginparams = {'username': self.name, 'password': self.pwd}
        r = requests.get(getcookiesidurl)  # fetch a session cookie first
        cookie = [c.name + '=' + c.value for c in r.cookies]
        if cookie:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2896.3 Mobile Safari/537.36',
                'Cookie': cookie[0]}
            req = urllib2.Request(loginurl, urllib.urlencode(loginparams), headers=headers)
            response = urllib2.urlopen(req)
            self.operate = self.opener.open(req)
            thePage = response.read()
            jsonstr = json.loads(thePage)
            today = datetime.date.today()
            yesterday = today - datetime.timedelta(days=1)  # query window: the whole of yesterday
            if jsonstr['opSucc'] == True:  # opSucc (true/false) indicates whether the login succeeded


                cap = webdriver.DesiredCapabilities.PHANTOMJS
                cap["phantomjs.page.settings.resourceTimeout"] = 1000
                cap["phantomjs.page.settings.loadImages"] = False
                cap["phantomjs.page.settings.disk-cache"] = True
                cap["phantomjs.page.customHeaders.Cookie"] = cookie[0]
                driver = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe',
                                             desired_capabilities=cap)  # carry the cookie to crawl pages behind the login
                driver.set_window_size(800, 600)  # browser window width and height
                getpage_total = 'http://183.207.215.20:7080/data/fpss/domainsealed?currentpage=1&domain=&url=&realurl=&datafrom=&reason=&operator=&pagesize=' + '&starttime=' + str(yesterday) + '+00:00:00' + '&endtime=' + str(yesterday) + '+23:59:59'


                driver.get(getpage_total)
                # time.sleep(2)
                # html = driver.execute_script("return document.documentElement.outerHTML").encode('gb18030')
                get_page_total = driver.find_elements_by_xpath("//body/pre")
                operate_time_result = []
                for q in get_page_total:
                    get_page_totall = q.text
                    json_data = json.loads(get_page_totall)
                    totalcount = json_data['data']['totalcount']


                    if totalcount != 0:
                        result_url = 'http://183.207.215.20:7080/data/fpss/domainsealed?currentpage=1&domain=&url=&realurl=&datafrom=&reason=&operator=' + '&pagesize=' + str(
                            totalcount) + '&starttime=' + str(yesterday) + '+00:00:00' + '&endtime=' + str(yesterday) + '+23:59:59'
                        driver.get(result_url)
                        # time.sleep(2)
                        operate_time = driver.find_elements_by_xpath("//body/pre")
                        for i in operate_time:
                            operate_time = i.text
                        json_data = json.loads(operate_time)
                        result_data = json_data['data']['reportlist']


                        for w in result_data:
                            data = {
                                'operate_time_result': w['operatetime'],
                                'domain_name_result': w['domain'],
                                'source_url_result': w['url'],
                                'true_url_result': w['realurl'],
                            }
                            # print operate_time_result
                            try:
                                self.insert(w['operatetime'], w['domain'],w['url'],w['realurl'])
                            except Exception,e:
                                raise
                            fishing_url_result=self.fishing_domain().testfishingsite(w['url'])
                            phish_type = fishing_url_result[1]
                            #print phish_type,222
                            if fishing_url_result[0]:
                                try:
                                    self.processwork(w['domain'], w['realurl'], w['operatetime'])
                                    self.updateDeadDb(w['url'], phish_type)
                                except Exception, e:
                                    raise
                            # keep the per-record result so callers (mainTask) can iterate it
                            operate_time_result.append(data)

                driver.quit()
                return operate_time_result




        else:
            print u'Network error: failed to obtain a cookie'


    # Fetch records from the malicious_domain table
    def getmaliciousdb(self):
        strSql = "SELECT DISTINCT(domain),src_url,actual_url,time FROM malicious_domain"
        self.lock.acquire()
        result = self.m_sqldb.fechdb(strSql)
        self.lock.release()
        return result


    # Update the malicious_domain table
    def updateDeadDb(self, dom, phis_type):
        print 'updateDeadDb:', dom
        strSql = "UPDATE malicious_domain SET malicious_type='phishing',phishing_type='%s',is_phishing='1' WHERE src_url='%s'" % (
            phis_type, dom)
        self.lock.acquire()
        lstUrl = self.m_sqldb.updatedb(strSql)
        self.lock.release()
        return lstUrl


    def __jointhreads(self, blockflag=False):
        '''
        Join all worker threads except the main thread.
        '''
        main_thread = threading.currentThread()
        for t in threading.enumerate():
            if t is main_thread:
                continue
            if blockflag:
                t.join()
            else:
                t.join(0)  # non-blocking join


    def fishing_domain(self):
        self.fishing_url = fishing.MainInterface()
        return self.fishing_url


    # Thread worker: re-check records already stored in the malicious_domain table
    def main_function(self):
        # getmaliciousdb() returns (domain, src_url, actual_url, time) tuples
        log = self.getmaliciousdb()
        for i in log:
            print i[1], i[0]
            domain_name_result = i[0]
            source_url_result = i[1]
            true_url_result = i[2]
            operate_time_result = i[3]
            fishing_url_result = self.fishing_domain().testfishingsite(source_url_result)
            phish_type = fishing_url_result[1]
            if fishing_url_result[0]:
                try:
                    self.processwork(domain_name_result, true_url_result, operate_time_result)
                    self.updateDeadDb(source_url_result, phish_type)
                except Exception, e:
                    raise




    def mainTask(self):
        '''
        Main task: fetch the domains that have not yet been checked, run the
        detection on each, and update the corresponding database tables.
        '''
        # login() returns a list of dicts describing yesterday's sealed-domain records
        log = self.login() or []
        for i in log:
            domain_name_result = i['domain_name_result']
            operate_time_result = i['operate_time_result']
            true_url_result = i['true_url_result']
            while True:
                self.__jointhreads()
                # print 'current number of threads: %d' % len(threading.enumerate())
                if len(threading.enumerate()) < self.max_threadings:
                    try:
                        # main_function() takes no per-record arguments, so the per-record
                        # work (IP lookup, snapshot, phishing_info insert) is delegated to
                        # processwork(); leave the loop once the thread has been created
                        newthread = threading.Thread(target=self.processwork,
                                                     args=(domain_name_result, true_url_result, operate_time_result))
                        break
                    except:
                        # thread creation failed; retry
                        continue

            # mark the thread as a daemon and start it
            newthread.setDaemon(True)
            newthread.start()
        # wait for all worker threads to finish
        self.__jointhreads(True)


    # Insert a record into the malicious_domain table
    def insert(self, operate_time_result, domain_name_result, source_url_result, true_url_result):
        into_str = 'domain,src_url,actual_url,time'
        value_str = "'%s', '%s','%s','%s'" % (
            domain_name_result, source_url_result, true_url_result, operate_time_result)
        sql = "insert into malicious_domain ( %s ) values ( %s )" % (into_str, value_str)
        self.lock.acquire()
        insertsql = self.m_sqldb.insertdb(sql)
        self.lock.release()
        return insertsql
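
    # Note: the string-formatted SQL above (and in insertphisinfo below) is fragile and
    # open to SQL injection / quoting errors. If the sqldb helper exposes a DB-API cursor
    # (an assumption -- its interface is not shown here), a parameterized form would be safer:
    #
    #     sql = ("insert into malicious_domain (domain, src_url, actual_url, time) "
    #            "values (%s, %s, %s, %s)")
    #     cursor.execute(sql, (domain_name_result, source_url_result,
    #                          true_url_result, operate_time_result))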


    # Insert a record into the phishing_info table
    def insertphisinfo(self, phishing_ip, phishing_ip_loc, snapshot_path, domain_name_result, true_url_result,
                       operate_time_result):
        into_str = 'phishing_site,phishing_source,phishing_ip,phishing_ip_loc,snapshot_path,phishing_url,found_time'
        value_str = "'%s', '%s','%s','%s','%s','%s','%s'" % (
            domain_name_result, 'jiangsu', phishing_ip, phishing_ip_loc, snapshot_path, true_url_result,
            operate_time_result)
        sql = "insert into phishing_info ( %s ) values ( %s )" % (into_str, value_str)
        self.lock.acquire()
        insertsql = self.m_sqldb.insertdb(sql)
        self.lock.release()
        return insertsql


    # Resolve a URL's host name to an IP address
    def urlToIp(self, url):
        '''
        Return the IP address of the host in `url`, or '' on failure.
        '''
        try:
            parse = urlparse.urlparse(url)
            loc = parse.netloc
        except:
            loc = url
        try:
            result = socket.getaddrinfo(loc, None)
            return result[0][4][0]
        except:
            return ''
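
    # Example (illustrative): urlToIp('http://www.example.com/path') extracts the netloc
    # with urlparse, resolves it via socket.getaddrinfo, and returns the first resolved
    # address as a string, or '' if resolution fails.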


    def processwork(self, domain_site, actual_url, found_time):
        phishing_ip_loc = ''
        snapshot_path = ''
        # resolve the IP inside the worker thread
        if domain_site.startswith('http://') or domain_site.startswith('https://'):
            phis_ip = domain_site
        else:
            phis_ip = 'http://' + domain_site

        phishing_ip = self.urlToIp(phis_ip)
        #print phishing_ip,111
        if phishing_ip:
            phishing_ip_loc = self.iploc.find_ip_loc(phishing_ip)
            snapshot_path = self.sshot.get_snapshot(domain_site, phishing_ip)
            # print snapshot_path, 3333
            if snapshot_path is None:
                snapshot_path = ''
        try:
            self.insertphisinfo(phishing_ip, phishing_ip_loc, snapshot_path, domain_site, actual_url, found_time)
        except Exception,e:
            raise
        return phishing_ip, phishing_ip_loc


    # Fetch the recorded phishing sites' URLs and IPs
    def getAllFishDb(self):
        # strSql = "select httpdump from fishingdump order by dump_time desc limit 0, 50"
        strSql = "SELECT DISTINCT(phishing_site),phishing_ip FROM phishing_info"
        lstFishSites = self.m_sqldb.fechdb(strSql)
        return lstFishSites


    def inserpis(self, phis, ip):
        all_data = self.getAllFishDb()
        print 'existing phishing_info records:', all_data
        old_phis_site = [i[0] for i in all_data]
        if phis in old_phis_site:
            # print 2222
            if ip == '':
                return
            else:
                try:
                    strSql = "UPDATE phishing_info SET phishing_ip='%s' WHERE phishing_site='%s'" % (
                        ip, phis)
                    self.lock.acquire()
                    lstUrl = self.m_sqldb.updatedb(strSql)
                    self.lock.release()
                except Exception, e:
                    raise
                return lstUrl
        else:
            try:
                into_str = 'phishing_site,phishing_source,phishing_ip,phishing_ip_loc,snapshot_path,phishing_url,found_time'
                value_str = "'%s', '%s','%s','%s','%s','%s','%s'" % (
                    phis, 'jiangsu', ip, 'sss', '', '', datetime.datetime.now())
                sql = "insert into phishing_info ( %s ) values ( %s )" % (into_str, value_str)
                self.lock.acquire()
                insertsql = self.m_sqldb.insertdb(sql)
                self.lock.release()
            except Exception, e:
                raise
            return insertsql


    # Scheduled task
    def start_client(self):
        '''
        --interval--
            Parameters:
                weeks (int) – number of weeks to wait
                days (int) – number of days to wait
                hours (int) – number of hours to wait
                minutes (int) – number of minutes to wait
                seconds (int) – number of seconds to wait
                start_date (datetime|str) – starting point for the interval calculation
                end_date (datetime|str) – latest possible date/time to trigger on
                timezone (datetime.tzinfo|str) – time zone to use for the date/time calculations
        --cron--
            Parameters:
                year (int|str) – 4-digit year
                month (int|str) – month (1-12)
                day (int|str) – day of the month (1-31)
                week (int|str) – ISO week (1-53)
                day_of_week (int|str) – number or name of weekday (0-6 or mon,tue,wed,thu,fri,sat,sun)
                hour (int|str) – hour (0-23)
                minute (int|str) – minute (0-59)
                second (int|str) – second (0-59)
                start_date (datetime|str) – earliest possible date/time to trigger on (inclusive)
                end_date (datetime|str) – latest possible date/time to trigger on (inclusive)
                timezone (datetime.tzinfo|str) – time zone to use for the date/time calculations (defaults to scheduler timezone)


        add_job:
            Parameters:
                func – callable (or a textual reference to one) to run at the given time
                trigger (str|apscheduler.triggers.base.BaseTrigger) – trigger that determines when func is called
                args (list|tuple) – list of positional arguments to call func with
                kwargs (dict) – dict of keyword arguments to call func with
                id (str|unicode) – explicit identifier for the job (for modifying it later)
                name (str|unicode) – textual description of the job
                misfire_grace_time (int) – seconds after the designated run time that the job is still allowed to be run
                coalesce (bool) – run once instead of many times if the scheduler determines that the job should be run more than once in succession
                max_instances (int) – maximum number of concurrently running instances allowed for this job
                next_run_time (datetime) – when to first run the job, regardless of the trigger (pass None to add the job as paused)
                jobstore (str|unicode) – alias of the job store to store the job in
                executor (str|unicode) – alias of the executor to run the job with
                replace_existing (bool) – True to replace an existing job with the same id (but retain the number of runs from the existing one)
        '''


        # schedule the job as needed
        self.tasksched.add_job(self.mainTask, 'cron', year='*', month='*', day='*', hour=1, minute=0)
        # self.tasksched.add_job(self.get_newconfig, 'interval', minutes = 30,max_instances = 10)
        self.tasksched.start()
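
    # A minimal standalone APScheduler sketch (illustration only; `job_function` is a
    # hypothetical callable) showing the two trigger styles documented above:
    #
    #     from apscheduler.schedulers.blocking import BlockingScheduler
    #     sched = BlockingScheduler()
    #     sched.add_job(job_function, 'interval', minutes=30, max_instances=10)  # every 30 minutes
    #     sched.add_job(job_function, 'cron', hour=1, minute=0)                  # daily at 01:00
    #     sched.start()                                                          # blocks until shut down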




if __name__ == '__main__':
    userlogin = Login()
    username = 'jtyjy'
    password = 'jtyjy!@#$'
    domain = getcookiesidurl
    userlogin.setLoginInfo(username, password, domain)
    userlogin.login()
    # userlogin.inserpis(('wap.ccbmdgww.cc',''))
    # userlogin.main_function()
    # userlogin.mainTask()
    # userlogin.processwork('www.bankcomm.com/')




    # userlogin.start_client()