很多时候,用正则表达式或 XPath 直接提取网页中的数据是很好的方式;但当网页的内容结构不确定时,可以先用 XPath 提取范围更大的 div,再去除其中的所有标签,从而得到文本数据。
import re


def filter_tags(htmlstr):
    """Strip markup from an HTML string and return the remaining text.

    Processing order: JavaScript-commented CDATA wrappers, <script> and
    <style> elements (with their content) and HTML comments are deleted;
    <br> becomes a newline; every remaining tag is deleted; runs of newlines
    are collapsed to one; finally character entities are decoded with
    replaceCharEntity().

    Args:
        htmlstr: HTML text to clean.

    Returns:
        The text with tags removed and known entities decoded.
    """
    # Raw strings keep the patterns identical while avoiding invalid-escape
    # warnings for sequences such as "\[" and "\s".
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
    # Non-greedy DOTALL bodies: script/style/comment content may legitimately
    # contain '<' or '>' (e.g. "if (a<b)"), which a "[^<]*" body cannot span.
    re_script = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
    re_style = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
    re_comment = re.compile(r'<!--.*?-->', re.S)
    # Case-insensitive so that <BR> is turned into a newline instead of being
    # silently dropped by the generic tag pattern below.
    re_br = re.compile(r'<br\s*/?>', re.I)
    re_h = re.compile(r'</?\w+[^>]*>')
    blank_line = re.compile(r'\n+')

    s = re_cdata.sub('', htmlstr)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    # Comments go before generic tag stripping so a '>' inside a comment
    # cannot leave part of the comment behind.
    s = re_comment.sub('', s)
    s = re_br.sub('\n', s)
    s = re_h.sub('', s)
    s = blank_line.sub('\n', s)
    return replaceCharEntity(s)


def replaceCharEntity(htmlstr):
    """Decode a small fixed set of HTML character entities.

    Recognised entities (named or by decimal code): nbsp/160, lt/60, gt/62,
    amp/38 and quot/34. Any other "&name;" or "&#name;" sequence is replaced
    by the empty string.

    The text is decoded in a single pass, so the output of one replacement is
    never decoded again ("&amp;lt;" yields "&lt;", not "<").

    Args:
        htmlstr: Text that may contain character entities.

    Returns:
        The text with entities replaced.
    """
    CHAR_ENTITIES = {
        'nbsp': ' ', '160': ' ',
        'lt': '<', '60': '<',
        'gt': '>', '62': '>',
        'amp': '&', '38': '&',
        'quot': '"', '34': '"',
    }
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')

    def _decode(match):
        # Unknown entities are dropped (replaced by an empty string).
        return CHAR_ENTITIES.get(match.group('name'), '')

    return re_charEntity.sub(_decode, htmlstr)


if __name__ == '__main__':
    # Demo: clean index.html from the current directory and print the text.
    # open() inside "with" closes the file; print(...) with one argument
    # works on both Python 2 and Python 3.
    with open('index.html') as f:
        s = f.read()
    news = filter_tags(s)
    print(news)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
import
re
def filter_tags(htmlstr):
    """Strip markup from an HTML string and return the remaining text.

    Processing order: JavaScript-commented CDATA wrappers, <script> and
    <style> elements (with their content) and HTML comments are deleted;
    <br> becomes a newline; every remaining tag is deleted; runs of newlines
    are collapsed to one; finally character entities are decoded with
    replaceCharEntity().

    Args:
        htmlstr: HTML text to clean.

    Returns:
        The text with tags removed and entities decoded by
        replaceCharEntity().
    """
    # Raw strings keep the patterns identical while avoiding invalid-escape
    # warnings for sequences such as "\[" and "\s".
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
    # Non-greedy DOTALL bodies: script/style/comment content may legitimately
    # contain '<' or '>' (e.g. "if (a<b)"), which a "[^<]*" body cannot span.
    re_script = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
    re_style = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
    re_comment = re.compile(r'<!--.*?-->', re.S)
    # Case-insensitive so that <BR> is turned into a newline instead of being
    # silently dropped by the generic tag pattern below.
    re_br = re.compile(r'<br\s*/?>', re.I)
    re_h = re.compile(r'</?\w+[^>]*>')
    blank_line = re.compile(r'\n+')

    s = re_cdata.sub('', htmlstr)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    # Comments go before generic tag stripping so a '>' inside a comment
    # cannot leave part of the comment behind.
    s = re_comment.sub('', s)
    s = re_br.sub('\n', s)
    s = re_h.sub('', s)
    s = blank_line.sub('\n', s)
    return replaceCharEntity(s)
def replaceCharEntity(htmlstr):
    """Decode a small fixed set of HTML character entities.

    Recognised entities (named or by decimal code): nbsp/160, lt/60, gt/62,
    amp/38 and quot/34. Any other "&name;" or "&#name;" sequence is replaced
    by the empty string.

    The text is decoded in a single pass, so the output of one replacement is
    never decoded again ("&amp;lt;" yields "&lt;", not "<").

    Args:
        htmlstr: Text that may contain character entities.

    Returns:
        The text with entities replaced.
    """
    CHAR_ENTITIES = {
        'nbsp': ' ', '160': ' ',
        'lt': '<', '60': '<',
        'gt': '>', '62': '>',
        'amp': '&', '38': '&',
        'quot': '"', '34': '"',
    }
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')

    def _decode(match):
        # Unknown entities are dropped (replaced by an empty string).
        return CHAR_ENTITIES.get(match.group('name'), '')

    return re_charEntity.sub(_decode, htmlstr)
if __name__ == '__main__':
    # Demo: clean index.html from the current directory and print the text.
    # open() inside "with" closes the file (the Python-2-only file() builtin
    # left it open); print(...) with one argument works on Python 2 and 3.
    with open('index.html') as f:
        s = f.read()
    news = filter_tags(s)
    print(news)
|
自定义要过滤的标签:
import re


def clean_tags(page, tag):
    """Remove every <tag ...>...</tag> element together with its content.

    Matching is case-insensitive. ``tag`` is interpolated into the pattern as
    a regular-expression fragment (it is not escaped).

    Args:
        page: HTML text.
        tag: Tag name, e.g. "script".

    Returns:
        ``page`` with the matching elements removed.
    """
    # The lookahead demands that the tag name ends here, so "a" no longer
    # matches "<abbr>"; the non-capturing group keeps alternations intact.
    reTRIM = r'<(?:{0})(?=[\s/>])[^<>]*>[\s\S]*?</(?:{0})\s*>'
    return re.sub(reTRIM.format(tag), "", page, flags=re.I)


def clean_tags_hasprop(page, tag, prop):
    """Remove <tag ...>...</tag> elements whose opening tag contains ``prop``.

    Matching is case-insensitive. ``tag`` and ``prop`` are interpolated into
    the pattern as regular-expression fragments (they are not escaped).

    Args:
        page: HTML text.
        tag: Tag name, e.g. "div".
        prop: Text to look for inside the opening tag, e.g. 'class="ad"'.

    Returns:
        ``page`` with the matching elements (and their content) removed.
    """
    # "\s" ends the tag name (no prefix matches such as "a" -> "<abbr");
    # "[^<>]*" instead of ".*?" lets the opening tag span several lines.
    reTRIM = r'<(?:{0})\s[^<>]*?(?:{1})[^<>]*>[\s\S]*?</(?:{0})\s*>'
    return re.sub(reTRIM.format(tag, prop), "", page, flags=re.I)


def clean_tags_only(page, tag):
    """Remove opening and closing ``tag`` tags but keep their content.

    Matching is case-insensitive and attributes on the tag are allowed.
    ``tag`` is interpolated as a regular-expression fragment (not escaped).

    Args:
        page: HTML text.
        tag: Tag name, e.g. "a".

    Returns:
        ``page`` with the tags themselves removed.
    """
    # The lookahead stops "b" from also stripping "<br>" or "<body>".
    reTRIM = r'</?(?:{0})(?=[\s/>])[^<>]*>'
    return re.sub(reTRIM.format(tag), "", page, flags=re.I)


def clean_tags_exactly(page, tag):
    """Remove attribute-less ``<tag>`` and ``</tag>`` tags, keeping content.

    Only tags written exactly as "<tag>" or "</tag>" (case-insensitive) are
    removed; tags carrying attributes are left alone. ``tag`` is interpolated
    as a regular-expression fragment (not escaped).

    Args:
        page: HTML text.
        tag: Tag name, e.g. "b".

    Returns:
        ``page`` with the bare tags removed.
    """
    # Grouping keeps a fragment such as "b|i" bound between "<" and ">".
    reTRIM = r'</?(?:{0})>'
    return re.sub(reTRIM.format(tag), "", page, flags=re.I)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
import
re
def clean_tags(page, tag):
    """Remove every <tag ...>...</tag> element together with its content.

    Matching is case-insensitive. ``tag`` is interpolated into the pattern as
    a regular-expression fragment (it is not escaped).

    Args:
        page: HTML text.
        tag: Tag name, e.g. "script".

    Returns:
        ``page`` with the matching elements removed.
    """
    # The lookahead demands that the tag name ends here, so "a" no longer
    # matches "<abbr>"; the non-capturing group keeps alternations intact.
    reTRIM = r'<(?:{0})(?=[\s/>])[^<>]*>[\s\S]*?</(?:{0})\s*>'
    return re.sub(reTRIM.format(tag), "", page, flags=re.I)
def clean_tags_hasprop(page, tag, prop):
    """Remove <tag ...>...</tag> elements whose opening tag contains ``prop``.

    Matching is case-insensitive. ``tag`` and ``prop`` are interpolated into
    the pattern as regular-expression fragments (they are not escaped).

    Args:
        page: HTML text.
        tag: Tag name, e.g. "div".
        prop: Text to look for inside the opening tag, e.g. 'class="ad"'.

    Returns:
        ``page`` with the matching elements (and their content) removed.
    """
    # "\s" ends the tag name (no prefix matches such as "a" -> "<abbr");
    # "[^<>]*" instead of ".*?" lets the opening tag span several lines.
    reTRIM = r'<(?:{0})\s[^<>]*?(?:{1})[^<>]*>[\s\S]*?</(?:{0})\s*>'
    return re.sub(reTRIM.format(tag, prop), "", page, flags=re.I)
def clean_tags_only(page, tag):
    """Remove opening and closing ``tag`` tags but keep their content.

    Matching is case-insensitive and attributes on the tag are allowed.
    ``tag`` is interpolated as a regular-expression fragment (not escaped).

    Args:
        page: HTML text.
        tag: Tag name, e.g. "a".

    Returns:
        ``page`` with the tags themselves removed.
    """
    # The lookahead stops "b" from also stripping "<br>" or "<body>".
    reTRIM = r'</?(?:{0})(?=[\s/>])[^<>]*>'
    return re.sub(reTRIM.format(tag), "", page, flags=re.I)
def clean_tags_exactly(page, tag):
    """Remove attribute-less ``<tag>`` and ``</tag>`` tags, keeping content.

    Only tags written exactly as "<tag>" or "</tag>" (case-insensitive) are
    removed; tags carrying attributes are left alone. ``tag`` is interpolated
    as a regular-expression fragment (not escaped).

    Args:
        page: HTML text.
        tag: Tag name, e.g. "b".

    Returns:
        ``page`` with the bare tags removed.
    """
    # Grouping keeps a fragment such as "b|i" bound between "<" and ">";
    # ungrouped, the alternation would split the pattern into "</?b" and "i>".
    reTRIM = r'</?(?:{0})>'
    return re.sub(reTRIM.format(tag), "", page, flags=re.I)
|