python html解析对比_python htmlparse页面解析示例

最新推荐文章于 2021-02-03 04:51:11 发布

weixin_39888082

最新推荐文章于 2021-02-03 04:51:11 发布

阅读量154

点赞数

文章标签： python html解析对比

本文链接：https://blog.csdn.net/weixin_39888082/article/details/111525719

版权

11月

2013

0x01 今天写了个示例程序，用 Python 解析网页：以 HTMLParser 为核心，配合 urllib2，解析页面中的某些特定标签。

0x02 代码如下，用于爬取页面信息：

#coding=utf-8

'''

Created on 2013-11-5

@author: lenovo

'''

from HTMLParser import HTMLParser

import time

import urllib2

import urllib

import time

from urllib2 import urlopen

# Shared state written by MyParser and read by the driver functions.

# Values of the "pwd[]" checkbox inputs seen while parsing.
loginPass = []

# 1 while the parser is inside a table row of interest, otherwise 0.
check = 0

# One list per collected link: [link, value, ...].
webshell = []

# Length of ``webshell`` right after the most recent link was appended.
temp = 0

# Path of the output file written by saveWebshell().
savefile = ''

# Highest page index requested by autoSearch().
pagenum = 0

class MyParser(HTMLParser):
    """A small HTMLParser subclass that collects values from table rows.

    Findings are written to module-level globals rather than to the instance:

    * ``webshell``  -- one list per matching ``<a>`` tag, ``[link, value, ...]``
    * ``loginPass`` -- values of ``pwd[]`` checkbox inputs
    * ``check``     -- 1 while inside a ``<tr>`` of interest, otherwise 0
    * ``temp``      -- length of ``webshell`` after the most recent link

    Attributes are matched by position, so the parser depends on the exact
    attribute order used by the page being parsed.
    """

    # NOTE(review): the original row test read
    #     tag=='tr' and self.rawdata.find("""...
    # but the searched literal and the rest of that condition were lost when
    # the listing was extracted from the article. Set ROW_MARKER to the
    # original markup to restore the filter; while it is None every <tr>
    # counts as a row of interest. TODO confirm against the original script.
    ROW_MARKER = None

    def _is_row_of_interest(self):
        """Return True when the parser buffer contains ROW_MARKER (or no marker is set)."""
        if self.ROW_MARKER is None:
            return True
        return self.rawdata.find(self.ROW_MARKER) != -1

    def handle_decl(self, decl):
        """Handle a declaration; defers to the base class."""
        HTMLParser.handle_decl(self, decl)

    def handle_starttag(self, tag, attrs):
        """Handle a start tag and record links, row values and password values.

        * ``<tr>``: sets ``check`` to 1 when the row test passes.
        * ``<a>`` inside a row: if the value of its second attribute ends in
          ``php``, starts a new ``webshell`` entry with that value.
        * ``<input>`` inside a row whose second attribute is ``style``:
          appends its ``value`` (4th attribute, or 3rd attribute of a
          ``text`` input) to the most recent ``webshell`` entry.
        * ``<input type="checkbox" name="pwd[]" ...>`` anywhere: appends the
          value of its third attribute to ``loginPass``.
        """
        global check, temp
        HTMLParser.handle_starttag(self, tag, attrs)

        if tag == 'tr' and self._is_row_of_interest():
            check = 1

        if tag == 'a' and check == 1 and len(attrs) > 1:
            link = attrs[1][1]
            # A valueless attribute is reported as None; treat it as no match
            # instead of failing on the slice.
            if link is not None and link[-3:] == 'php':
                webshell.append([link])
                temp = len(webshell)

        if tag != 'input':
            return

        # Row inputs are attached to the latest link. Skip them when no link
        # has been collected yet; the original indexed an empty list here.
        if check == 1 and webshell and len(attrs) >= 3 and attrs[1][0] == 'style':
            if len(attrs) > 3 and attrs[3][0] == 'value':
                webshell[-1].append(attrs[3][1])
            elif len(attrs) == 3 and attrs[2][0] == 'value' and attrs[0][1] == 'text':
                webshell[-1].append(attrs[2][1])

        # The length check keeps inputs with fewer than three attributes from
        # raising IndexError.
        if (len(attrs) >= 3
                and attrs[0][0] == 'type'
                and attrs[0][1] == 'checkbox'
                and attrs[1][1] == 'pwd[]'):
            loginPass.append(attrs[2][1])

    def handle_data(self, data):
        """Handle text content; defers to the base class."""
        HTMLParser.handle_data(self, data)

    def handle_endtag(self, tag):
        """Handle an end tag; leaving a ``</tr>`` clears the row flag."""
        # Without this declaration the assignment below creates a local and
        # the module-level flag is never reset.
        global check
        HTMLParser.handle_endtag(self, tag)
        if tag == 'tr':
            check = 0

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag; defers to the base class."""
        HTMLParser.handle_startendtag(self, tag, attrs)

    def handle_comment(self, data):
        """Handle an HTML comment; defers to the base class."""
        HTMLParser.handle_comment(self, data)

    def close(self):
        """Finish parsing; defers to the base class."""
        HTMLParser.close(self)

def Post(url,s):
    """Send ``s`` as the body of an HTTP POST to ``url`` and return the response body.

    ``s`` is percent-quoted first with ``=`` and ``&`` left untouched, so an
    already assembled ``key=value&key=value`` string keeps its structure.

    Returns the raw response body, or ``""`` if any step fails. The failure
    is deliberately swallowed: callers feed the result straight to a parser.
    """
    try:
        body = urllib.quote(s, "=&")
        req = urllib2.Request(url, body)
        resp = urllib2.urlopen(req, timeout=10)
        try:
            return resp.read()
        finally:
            # Release the connection even when read() raises.
            resp.close()
    except Exception:
        return ""

def Get(url):
    """Fetch ``url`` with an HTTP GET (60 second timeout) and return the response body.

    Returns the raw response body, or ``""`` if any step fails. The failure
    is deliberately swallowed: callers feed the result straight to a parser.
    """
    try:
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, timeout=60)
        try:
            return resp.read()
        finally:
            # Release the connection even when read() raises.
            resp.close()
    except Exception:
        return ""

def saveWebshell():
    """Write the collected ``webshell`` entries to ``savefile``, one per line.

    Each line is ``<first item>,<second item>`` followed by a newline. An
    entry that has only one item is written with an empty second field
    instead of raising IndexError. Every entry is also echoed to stdout.
    """
    # ``with`` closes the file even if a write fails part-way through.
    with open(savefile, 'w') as out:
        for entry in webshell:
            print(entry)
            second = entry[1] if len(entry) > 1 else ''
            # The original terminator was the literal letter 'n'; the
            # backslash of '\n' had been lost.
            out.write(entry[0] + ',' + second + '\n')
    print('******* Save SuccessFul ********')

#print 'Save SuccessFul'

#def getInfo(page):

def getPage(url,num):
    """Fetch page ``num`` of ``url`` and run the response through MyParser.

    For page 0, if ``loginPass`` holds enough collected values, a POST
    carrying two ``pwd[]`` fields (entries 2 and 4 of ``loginPass``) is sent
    to ``url`` first and its response is parsed as well. The page itself is
    then requested as ``url + '&p=' + str(num)``.

    Returns None; results accumulate in the globals that MyParser fills.
    """
    # Entries 2 and 4 are read below, so at least five values are required.
    # The original only checked for a non-empty list and raised IndexError
    # when it held one to four values.
    if num == 0 and len(loginPass) > 4:
        print('******* Login ********')
        passWd = 'pwd[]=' + loginPass[2] + '&pwd[]=' + loginPass[4]
        demo = MyParser()
        demo.feed(Post(url, passWd))
        demo.close()

    print('******* Get Page ' + str(num) + ' ********')
    urlNew = url + '&p=' + str(num)
    demo = MyParser()
    demo.feed(Get(urlNew))
    demo.close()

#def getTotal(url):

def login(url):
    """Download ``url`` and feed the response to a fresh MyParser."""
    print('*******Get Password********')
    page_source = Get(url)
    parser = MyParser()
    parser.feed(page_source)
    parser.close()

def autoSearch(url):
    """Process pages 0 through ``pagenum`` of ``url``, then save the results.

    Each page is handled by getPage(); once all pages are done the collected
    entries are written out by saveWebshell().
    """
    last_page = pagenum
    for page_no in xrange(last_page + 1):
        getPage(url, page_no)
    print('******* Scan Over ********')
    saveWebshell()

if __name__ == '__main__':
    # Script entry point: configure the run, then process pages 0..pagenum.
    pagenum = 9
    # NOTE(review): path reproduced verbatim; it looks like the directory
    # separators were lost when the listing was published -- confirm.
    savefile = r'E:work profitprofitjavaicetoolstoolsWebShellphp168.txt'
    autoSearch('http://www.XXXX.com/core/centerxxxxx.php?s=php168')

3,464 次访问过

weixin_39888082

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫