#! /usr/bin/python
# #encoding:utf-8
import re
#!/usr/bin/python
#encoding:utf-8
import re
def foo(xpath, content):
    """Resolve a slash-separated tag path against markup text.

    Each path segment ``name`` is matched with the regex
    ``<name>(.*?)</name>`` (DOTALL, non-greedy), so only opening tags
    without attributes are recognised, and a segment may match at any
    depth inside the current text, not only among direct children.

    Args:
        xpath: Path such as ``'/html/body/span'``; it must start with '/'.
        content: Markup text to search.

    Returns:
        The stripped inner text of the first element that satisfies the
        whole path, or None when the path does not start with '/' or no
        element satisfies it.
    """
    print(xpath)  # trace of the path still to be resolved
    if not xpath.startswith('/'):
        return None

    parts = xpath[1:].split('/', 1)
    # Escape the segment so regex metacharacters in it are taken literally.
    node = re.escape(parts[0])
    pattern = re.compile(r"\<%s\>(.*?)\<\/%s\>" % (node, node), re.S)
    matches = pattern.findall(content)

    if len(parts) == 1:
        # Last segment: the first match is the answer, if there is one.
        return matches[0].strip() if matches else None

    rest = '/' + parts[1]
    # Try every candidate in document order: the first match of this
    # segment does not necessarily contain the rest of the path.
    for inner in matches:
        found = foo(rest, inner.strip())
        # Compare against None explicitly; '' is a valid (empty) result.
        if found is not None:
            return found
    return None
# Sample document used to exercise foo().
s = '''<html>
<header><title>hello world</title></header>
<body>
<div>
<h1>Hello World</h1>
</div>
<div>
<span>test</span>
</div>
</body>
</html>'''

# Path to resolve against the sample document.
xpath = '/html/body/span'

# Resolve the path and print whatever foo() returns.
print(foo(xpath, s))
# Scrape the best film reviews from Douban.
import re
import codecs
def foo(xpath, content):
    """Print the non-Latin text found at the end of a tag path.

    The path is walked one segment at a time. Every element matching an
    intermediate segment is explored; for the final segment only the first
    matching element is inspected. Opening tags may carry attributes; the
    text captured for an element starts right after the tag name, so it
    includes the attributes and the closing '>' of the opening tag.

    Branches that lack the final tag, or whose first final-tag element has
    no run of characters outside \\x00-\\xff, are skipped silently.

    Args:
        xpath: Path such as ``'/header/h3/a'``; the first character is
            dropped without being checked.
        content: Markup text to search.

    Returns:
        None. Matching text is written to standard output.
    """
    parts = xpath[1:].split('/', 1)
    # Escape the segment so regex metacharacters in it are taken literally.
    node = re.escape(parts[0])
    # The lookahead requires the tag name to end here, so that a segment
    # 'a' does not also match tags such as <abbr>.
    tag_pattern = re.compile(r"<%s(?=[\s/>])(.*?)</%s>" % (node, node), re.S)
    matches = tag_pattern.findall(content)

    if len(parts) > 1:
        rest = '/' + parts[1]
        for inner in matches:
            foo(rest, inner)
        return

    if not matches:
        # This branch does not contain the final tag.
        return
    # At least two characters outside \x00-\xff, possibly with other
    # characters between them.
    cn_pattern = re.compile('[^\x00-\xff]+.*?[^\x00-\xff]+')
    found = cn_pattern.search(matches[0].strip())
    if found is not None:
        print(found.group())
# Read the locally saved HTML file, decoded as UTF-8. The with-block
# closes the file as soon as its contents have been read.
with codecs.open('D:/doubanhtml/douban0.html', 'r', 'utf-8') as f:
    content = f.read()

# Tag path handed to foo().
xpath = '/header/h3/a'
foo(xpath, content)