android 敏感词过滤器,网站安检过滤敏感词汇并爬取暗链,把含有敏感词汇的网页下载,利用爬虫原理...

# -*- coding: UTF-8 -*-

import sys

import urllib2

import re

import uniout

import os

from datetime import datetime

import time

# Force the default string encoding to UTF-8 so the Chinese keyword list
# and page contents can be mixed freely (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf-8')

# Target page to crawl.
url = "https://www.hao123.com/?tn=88093251_74_hao_pg"

# Only links whose URL contains this scheme prefix are followed.
http = "http"

# FIX: the original passed "headers=Headers" as the second positional
# argument of urllib2.Request, which is `data` -- that silently turned the
# request into a POST with a meaningless body. A plain GET is what is wanted.
request = urllib2.Request(url)

# URL fragments/extensions that mark a link as not worth fetching
# (binary media and office documents, plus in-page anchors).
nodes = {".bmp", ".jpg", ".png", ".jpeg", ".xls", ".xlsx", ".doc", ".docx",
         ".wav", ".rmvb", ".mp3", ".gif", "#", ".mp4"}

# Keyword file, one sensitive word per line, located next to this script.
filename = 'word.txt'

pos = []    # sensitive-word list, loaded below
count = 0   # number of URLs visited so far

with open(filename, 'r') as file_to_read:
    pos = file_to_read.readlines()

# 链接网址

def getResponse():

try:

response = urllib2.urlopen(request)

except urllib2.HTTPError, he:

print he.code

except urllib2.URLError, ue:

print ue.reason

else:

return response.read().decode('utf-8')

# 获取链接

def getUrl():
    """Extract candidate link URLs from the start page's HTML.

    Returns a de-duplicated list of URL strings harvested from <link>,
    <script>, <a>, <img> and openQuestion(...) onclick attributes, with
    links pointing at blacklisted file types (see `nodes`) filtered out.

    NOTE(review): the original pattern literals were truncated by the
    HTML extraction this file came from ("patterncss = '" etc.) and
    `patternimg` was referenced but never defined. The regexes below are
    a plausible reconstruction of the intended attribute scrapers --
    confirm against the original article if available.
    """
    html = getResponse()
    patterncss = '<link[^>]*?href="(.*?)"'
    patternjs = '<script[^>]*?src="(.*?)"'
    patternpage = '<a[^>]*?href="(.*?)"'
    patternimg = '<img[^>]*?src="(.*?)"'
    patternonclick = "openQuestion.*?'(.*?)'"

    # Harvest URLs from each tag/attribute pattern.
    href = re.compile(patterncss, re.S).findall(html)
    # Image sources are usually pointless for keyword scanning; comment
    # this line out to skip them.
    href += re.compile(patternimg, re.S).findall(html)
    href += re.compile(patternpage, re.S).findall(html)
    href += re.compile(patternjs, re.S).findall(html)
    href += re.compile(patternonclick, re.S).findall(html)

    # FIX: the original inner loop appended a link as soon as ANY single
    # blacklist entry was absent from it, which accepts almost every link.
    # The intent (per the original comment) is to DROP a link when it
    # contains any blacklisted fragment, i.e. keep it only when none match.
    hrefs = []
    for h in href:
        if not any(n.upper() in h.upper() for n in nodes):
            hrefs.append(h)

    return list(set(hrefs))  # de-duplicate

def reasonCode():
    """Walk every harvested link that looks like an http(s) URL and scan it.

    Links without the `http` scheme substring (relative paths, javascript:
    handlers, anchors) are skipped; the rest are handed to getUrlMsg().
    """
    for link in getUrl():
        if http not in link:
            continue
        getUrlMsg(link)

# 爬虫内容对比关键字

def getUrlMsg(aa):

global count

ss = []

try:

response = urllib2.urlopen(urllib2.Request(aa, "headers=Headers"))

bb = response.read()

for po in pos:

postr = ''.join(po.split())

if postr in bb and postr is not '':

ss.append(postr)

if len(ss) > 0:

print '敏感网址', aa, str(ss).decode('string_escape')

greatMirk('敏感网址' + str(count), bb)

else:

print '该网址合格:', aa

count += 1

except Exception as e:

count += 1

print e, ' ', aa

pass

# 创建文件下载html

def greatMirk(name, msg):
    """Save a flagged page's raw HTML under d:/html/<name>.html.

    An existing file with the same name is overwritten.

    :param name: base file name (utf-8 byte string)
    :param msg: raw HTML bytes to write
    """
    # FIX: the original if/else had two byte-identical branches (the
    # os.path.exists check changed nothing); 'wb' overwrites either way.
    # Also create the target directory up front so the open() cannot
    # fail on a missing d:/html.
    target_dir = 'd:/html'
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    filename = target_dir + '/' + name.decode('utf-8') + '.html'
    with open(filename, 'wb') as ff:
        ff.write(msg)

starTime = datetime.now() # 获得当前时间

reasonCode()

endTime = datetime.now() # 获取当前时间

durn = (endTime - starTime).seconds # 两个时间差,并以秒显示出来

print '在', durn, '秒内爬取了', count, '个网址'

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值