PhantomJS + Selenium + Python crawler
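
The script below logs in to an admin site, drives PhantomJS through Selenium to crawl the sealed-domain records behind the login, and writes the results to a database. As a warm-up, here is a minimal sketch of the core technique (it assumes Python 2 and an old Selenium 2/3 release that still ships PhantomJS support; the executable path, cookie value, and URL are placeholders):

from selenium import webdriver

# copy the PhantomJS capability dict so the library-wide default is not mutated
cap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
cap['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Linux; Android 6.0) AppleWebKit/537.36'
cap['phantomjs.page.customHeaders.Cookie'] = 'JSESSIONID=placeholder'  # hypothetical session cookie
driver = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe',
                             desired_capabilities=cap)
driver.get('http://example.com/')  # placeholder URL
print driver.find_element_by_xpath('//body').text  # text of the rendered page
driver.quit()

The full script: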

#!/usr/bin/python
# -*- coding:utf-8 -*-



import sys


sys.path.append('..')
import urllib, datetime, urllib2, threading, time, requests, cookielib, json


from selenium import webdriver
import sqldb.sqldb_test
import urlparse, socket
from snapshot import SnapShot
from apscheduler.schedulers.blocking import BlockingScheduler
from iploc import IPLoc
from phinfo import PhInfo
from detectPhishing.example import fishing


loginurl = 'http://183.207.215.20:7080/login'
getcookiesidurl = 'http://183.207.215.20:7080'
getdomainurl = 'http://183.207.215.20:7080/view/fpss/domainsealed'


reload(sys)
sys.setdefaultencoding('utf-8')




# Simulated login
class Login(object):
    def __init__(self):
        self.m_sqldb = sqldb.sqldb_test.SQLdb()  # database helper from the local sqldb package
        self.lock = threading.Lock()
        self.tasksched = BlockingScheduler()
        self.iploc = IPLoc()
        self.sshot = SnapShot()
        self.m_phi = PhInfo()


        self.name = ''
        self.pwd = ''
        self.domain = ''
        self.max_threadings = 50  # default thread limit; can be overridden via setThreadNum()


        self.cj = cookielib.LWPCookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        urllib2.install_opener(self.opener)


    # def getContent(self, maxPage):
    #     for index in range(1, maxPage + 1):
    #         self.login(index)


    # Number of worker threads
    def setThreadNum(self, nums=50):
        self.max_threadings = nums


    def setLoginInfo(self, username, password, domain):
        '''Set the user's login information.'''
        self.name = username
        self.pwd = password
        self.domain = domain


    def login(self):
        '''Log in to the site.'''


        loginparams = {'username': self.name, 'password': self.pwd}
        r = requests.get(getcookiesidurl)  # fetch a session cookie first
        cookie = [c.name + '=' + c.value for c in r.cookies]
        if cookie:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2896.3 Mobile Safari/537.36',
                'Cookie': cookie[0]}
            req = urllib2.Request(loginurl, urllib.urlencode(loginparams), headers=headers)
            response = urllib2.urlopen(req)
            self.operate = self.opener.open(req)
            thePage = response.read()
            jsonstr = json.loads(thePage)
            today = datetime.date.today()
            yesterday = today - datetime.timedelta(days=1)  # query window: the whole of yesterday
            if jsonstr['opSucc'] == True:  # opSucc (true/false) indicates whether the login succeeded


                cap = webdriver.DesiredCapabilities.PHANTOMJS
                cap["phantomjs.page.settings.resourceTimeout"] = 1000
                cap["phantomjs.page.settings.loadImages"] = False
                cap["phantomjs.page.settings.disk-cache"] = True
                cap["phantomjs.page.customHeaders.Cookie"] = cookie[0]
                driver = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe',
                                             desired_capabilities=cap)  # carry the cookie to crawl pages behind the login
                driver.set_window_size(800, 600)  # browser window width and height
                getpage_total = 'http://183.207.215.20:7080/data/fpss/domainsealed?currentpage=1&domain=&url=&realurl=&datafrom=&reason=&operator=&pagesize=' + '&starttime=' + str(yesterday) + '+00:00:00' + '&endtime=' + str(yesterday) + '+23:59:59'


                driver.get(getpage_total)
                # time.sleep(2)
                # html = driver.execute_script("return document.documentElement.outerHTML").encode('gb18030')
                get_page_total = driver.find_elements_by_xpath("//body/pre")
                operate_time_result = []
                for q in get_page_total:
                    get_page_totall = q.text
                    json_data = json.loads(get_page_totall)
                    totalcount = json_data['data']['totalcount']


                    if totalcount != 0:
                        result_url = 'http://183.207.215.20:7080/data/fpss/domainsealed?currentpage=1&domain=&url=&realurl=&datafrom=&reason=&operator=' + '&pagesize=' + str(
                            totalcount) + '&starttime=' + str(yesterday) + '+00:00:00' + '&endtime=' + str(yesterday) + '+23:59:59'
                        driver.get(result_url)
                        # time.sleep(2)
                        operate_time = driver.find_elements_by_xpath("//body/pre")
                        for i in operate_time:
                            operate_time = i.text
                        json_data = json.loads(operate_time)
                        result_data = json_data['data']['reportlist']


                        for w in result_data:
                            data = {
                                'operate_time_result': w['operatetime'],
                                'domain_name_result': w['domain'],
                                'source_url_result': w['url'],
                                'true_url_result': w['realurl'],
                            }
                            # print operate_time_result
                            try:
                                self.insert(w['operatetime'], w['domain'],w['url'],w['realurl'])
                            except Exception,e:
                                raise
                            fishing_url_result=self.fishing_domain().testfishingsite(w['url'])
                            phish_type = fishing_url_result[1]
                            #print phish_type,222
                            if fishing_url_result[0]:
                                try:
                                    self.processwork(w['domain'], w['realurl'], w['operatetime'])
                                    self.updateDeadDb(w['url'], phish_type)
                                except Exception, e:
                                    raise
                            # keep the per-record result so callers (mainTask) can iterate it
                            operate_time_result.append(data)

                driver.quit()
                return operate_time_result




        else:
            print u'Network error: failed to obtain a cookie'


    # Fetch records from the malicious_domain table
    def getmaliciousdb(self):
        strSql = "SELECT DISTINCT(domain),src_url,actual_url,time FROM malicious_domain"
        self.lock.acquire()
        result = self.m_sqldb.fechdb(strSql)
        self.lock.release()
        return result


    # Update the malicious_domain table
    def updateDeadDb(self, dom, phis_type):
        print 'updateDeadDb:', dom
        strSql = "UPDATE malicious_domain SET malicious_type='phishing',phishing_type='%s',is_phishing='1' WHERE src_url='%s'" % (
            phis_type, dom)
        self.lock.acquire()
        lstUrl = self.m_sqldb.updatedb(strSql)
        self.lock.release()
        return lstUrl


    def __jointhreads(self, blockflag=False):
        '''
        Join all worker threads except the main thread.
        '''
        main_thread = threading.currentThread()
        for t in threading.enumerate():
            if t is main_thread:
                continue
            if blockflag:
                t.join()
            else:
                t.join(0)  # non-blocking join


    def fishing_domain(self):
        self.fishing_url = fishing.MainInterface()
        return self.fishing_url


    # Thread worker: re-check records already stored in the malicious_domain table
    def main_function(self):
        # getmaliciousdb() returns (domain, src_url, actual_url, time) tuples
        log = self.getmaliciousdb()
        for i in log:
            print i[1], i[0]
            domain_name_result = i[0]
            source_url_result = i[1]
            true_url_result = i[2]
            operate_time_result = i[3]
            fishing_url_result = self.fishing_domain().testfishingsite(source_url_result)
            phish_type = fishing_url_result[1]
            if fishing_url_result[0]:
                try:
                    self.processwork(domain_name_result, true_url_result, operate_time_result)
                    self.updateDeadDb(source_url_result, phish_type)
                except Exception, e:
                    raise




    def mainTask(self):
        '''
        Main task: fetch the domains that have not yet been checked, run the
        detection on each, and update the corresponding database tables.
        '''
        # login() returns a list of dicts describing yesterday's sealed-domain records
        log = self.login() or []
        for i in log:
            domain_name_result = i['domain_name_result']
            operate_time_result = i['operate_time_result']
            true_url_result = i['true_url_result']
            while True:
                self.__jointhreads()
                # print 'current number of threads: %d' % len(threading.enumerate())
                if len(threading.enumerate()) < self.max_threadings:
                    try:
                        # main_function() takes no per-record arguments, so the per-record
                        # work (IP lookup, snapshot, phishing_info insert) is delegated to
                        # processwork(); leave the loop once the thread has been created
                        newthread = threading.Thread(target=self.processwork,
                                                     args=(domain_name_result, true_url_result, operate_time_result))
                        break
                    except:
                        # thread creation failed; retry
                        continue

            # mark the thread as a daemon and start it
            newthread.setDaemon(True)
            newthread.start()
        # wait for all worker threads to finish
        self.__jointhreads(True)


    # Insert a record into the malicious_domain table
    def insert(self, operate_time_result, domain_name_result, source_url_result, true_url_result):
        into_str = 'domain,src_url,actual_url,time'
        value_str = "'%s', '%s','%s','%s'" % (
            domain_name_result, source_url_result, true_url_result, operate_time_result)
        sql = "insert into malicious_domain ( %s ) values ( %s )" % (into_str, value_str)
        self.lock.acquire()
        insertsql = self.m_sqldb.insertdb(sql)
        self.lock.release()
        return insertsql
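
    # Note: the string-formatted SQL above (and in insertphisinfo below) is fragile and
    # open to SQL injection / quoting errors. If the sqldb helper exposes a DB-API cursor
    # (an assumption -- its interface is not shown here), a parameterized form would be safer:
    #
    #     sql = ("insert into malicious_domain (domain, src_url, actual_url, time) "
    #            "values (%s, %s, %s, %s)")
    #     cursor.execute(sql, (domain_name_result, source_url_result,
    #                          true_url_result, operate_time_result))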


    # Insert a record into the phishing_info table
    def insertphisinfo(self, phishing_ip, phishing_ip_loc, snapshot_path, domain_name_result, true_url_result,
                       operate_time_result):
        into_str = 'phishing_site,phishing_source,phishing_ip,phishing_ip_loc,snapshot_path,phishing_url,found_time'
        value_str = "'%s', '%s','%s','%s','%s','%s','%s'" % (
            domain_name_result, 'jiangsu', phishing_ip, phishing_ip_loc, snapshot_path, true_url_result,
            operate_time_result)
        sql = "insert into phishing_info ( %s ) values ( %s )" % (into_str, value_str)
        self.lock.acquire()
        insertsql = self.m_sqldb.insertdb(sql)
        self.lock.release()
        return insertsql


    # Resolve a URL's host name to an IP address
    def urlToIp(self, url):
        '''
        Return the IP address of the host in `url`, or '' on failure.
        '''
        try:
            parse = urlparse.urlparse(url)
            loc = parse.netloc
        except:
            loc = url
        try:
            result = socket.getaddrinfo(loc, None)
            return result[0][4][0]
        except:
            return ''
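
    # Example (illustrative): urlToIp('http://www.example.com/path') extracts the netloc
    # with urlparse, resolves it via socket.getaddrinfo, and returns the first resolved
    # address as a string, or '' if resolution fails.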


    def processwork(self, domain_site, actual_url, found_time):
        phishing_ip_loc = ''
        snapshot_path = ''
        # resolve the IP inside the worker thread
        if domain_site.startswith('http://') or domain_site.startswith('https://'):
            phis_ip = domain_site
        else:
            phis_ip = 'http://' + domain_site

        phishing_ip = self.urlToIp(phis_ip)
        #print phishing_ip,111
        if phishing_ip:
            phishing_ip_loc = self.iploc.find_ip_loc(phishing_ip)
            snapshot_path = self.sshot.get_snapshot(domain_site, phishing_ip)
            # print snapshot_path, 3333
            if snapshot_path is None:
                snapshot_path = ''
        try:
            self.insertphisinfo(phishing_ip, phishing_ip_loc, snapshot_path, domain_site, actual_url, found_time)
        except Exception,e:
            raise
        return phishing_ip, phishing_ip_loc


    # Fetch the recorded phishing sites' URLs and IPs
    def getAllFishDb(self):
        # strSql = "select httpdump from fishingdump order by dump_time desc limit 0, 50"
        strSql = "SELECT DISTINCT(phishing_site),phishing_ip FROM phishing_info"
        lstFishSites = self.m_sqldb.fechdb(strSql)
        return lstFishSites


    def inserpis(self, phis, ip):
        all_data = self.getAllFishDb()
        print 'existing phishing_info records:', all_data
        old_phis_site = [i[0] for i in all_data]
        if phis in old_phis_site:
            # print 2222
            if ip == '':
                return
            else:
                try:
                    strSql = "UPDATE phishing_info SET phishing_ip='%s' WHERE phishing_site='%s'" % (
                        ip, phis)
                    self.lock.acquire()
                    lstUrl = self.m_sqldb.updatedb(strSql)
                    self.lock.release()
                except Exception, e:
                    raise
                return lstUrl
        else:
            try:
                into_str = 'phishing_site,phishing_source,phishing_ip,phishing_ip_loc,snapshot_path,phishing_url,found_time'
                value_str = "'%s', '%s','%s','%s','%s','%s','%s'" % (
                    phis, 'jiangsu', ip, 'sss', '', '', datetime.datetime.now())
                sql = "insert into phishing_info ( %s ) values ( %s )" % (into_str, value_str)
                self.lock.acquire()
                insertsql = self.m_sqldb.insertdb(sql)
                self.lock.release()
            except Exception, e:
                raise
            return insertsql


    # Scheduled task
    def start_client(self):
        '''
        --interval--
            Parameters:
                weeks (int) – number of weeks to wait
                days (int) – number of days to wait
                hours (int) – number of hours to wait
                minutes (int) – number of minutes to wait
                seconds (int) – number of seconds to wait
                start_date (datetime|str) – starting point for the interval calculation
                end_date (datetime|str) – latest possible date/time to trigger on
                timezone (datetime.tzinfo|str) – time zone to use for the date/time calculations
        --cron--
            Parameters:
                year (int|str) – 4-digit year
                month (int|str) – month (1-12)
                day (int|str) – day of the month (1-31)
                week (int|str) – ISO week (1-53)
                day_of_week (int|str) – number or name of weekday (0-6 or mon,tue,wed,thu,fri,sat,sun)
                hour (int|str) – hour (0-23)
                minute (int|str) – minute (0-59)
                second (int|str) – second (0-59)
                start_date (datetime|str) – earliest possible date/time to trigger on (inclusive)
                end_date (datetime|str) – latest possible date/time to trigger on (inclusive)
                timezone (datetime.tzinfo|str) – time zone to use for the date/time calculations (defaults to scheduler timezone)


        add_job:
            Parameters:
                func – callable (or a textual reference to one) to run at the given time
                trigger (str|apscheduler.triggers.base.BaseTrigger) – trigger that determines when func is called
                args (list|tuple) – list of positional arguments to call func with
                kwargs (dict) – dict of keyword arguments to call func with
                id (str|unicode) – explicit identifier for the job (for modifying it later)
                name (str|unicode) – textual description of the job
                misfire_grace_time (int) – seconds after the designated run time that the job is still allowed to be run
                coalesce (bool) – run once instead of many times if the scheduler determines that the job should be run more than once in succession
                max_instances (int) – maximum number of concurrently running instances allowed for this job
                next_run_time (datetime) – when to first run the job, regardless of the trigger (pass None to add the job as paused)
                jobstore (str|unicode) – alias of the job store to store the job in
                executor (str|unicode) – alias of the executor to run the job with
                replace_existing (bool) – True to replace an existing job with the same id (but retain the number of runs from the existing one)
        '''


        # schedule the job as needed
        self.tasksched.add_job(self.mainTask, 'cron', year='*', month='*', day='*', hour=1, minute=0)
        # self.tasksched.add_job(self.get_newconfig, 'interval', minutes = 30,max_instances = 10)
        self.tasksched.start()
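
    # A minimal standalone APScheduler sketch (illustration only; `job_function` is a
    # hypothetical callable) showing the two trigger styles documented above:
    #
    #     from apscheduler.schedulers.blocking import BlockingScheduler
    #     sched = BlockingScheduler()
    #     sched.add_job(job_function, 'interval', minutes=30, max_instances=10)  # every 30 minutes
    #     sched.add_job(job_function, 'cron', hour=1, minute=0)                  # daily at 01:00
    #     sched.start()                                                          # blocks until shut down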




if __name__ == '__main__':
    userlogin = Login()
    username = 'jtyjy'
    password = 'jtyjy!@#$'
    domain = getcookiesidurl
    userlogin.setLoginInfo(username, password, domain)
    userlogin.login()
    # userlogin.inserpis(('wap.ccbmdgww.cc',''))
    # userlogin.main_function()
    # userlogin.mainTask()
    # userlogin.processwork('www.bankcomm.com/')




    # userlogin.start_client()