学习 Python 一个礼拜,
闲得无聊写的,
很粗糙,
权当用来熟悉 Python。
#!/usr/bin/env python
# -*- coding: gbk -*-
import urllib, re
from sgmllib import SGMLParser
import sys
reload(sys)
sys.setdefaultencoding('gbk')
class URLLister(SGMLParser):
    """SGML parser that collects the ``href`` value of every ``<a>`` tag.

    After feeding markup with ``feed()``, the collected links are available
    in ``self.urls`` in the order the tags were encountered.
    """

    def reset(self):
        """Reset the base parser state and start with an empty URL list."""
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Record the ``href`` attribute(s) of an opening ``<a>`` tag.

        ``attrs`` is iterated as ``(name, value)`` pairs; anchors without an
        ``href`` contribute nothing.
        """
        self.urls.extend(value for name, value in attrs if name == 'href')
# Blog index page to crawl; every article linked from it is saved locally.
homeurl = "http://hi.baidu.com/shenjianyz/blog"
# The index URL with its last two path segments removed; article links on
# the index page are site-relative and get appended to this.
urlbase = homeurl.rsplit("/", 2)[0]

# Site-relative links ending in ".html" are treated as article pages.
# (The dot is escaped so that e.g. "/fooXhtml" no longer matches.)
modpattern = r'^/\S+\.html$'

# NOTE(review): every HTML-looking regex literal in the recovered source was
# empty or truncated (the "<...>" parts had been stripped).  The patterns
# below are reconstructions -- confirm them against the live page markup.
#
# Presumably the article body lives in a <div id="blog_text">; TODO confirm.
contentpattern = r'<div[^>]*\bid="blog_text"[^>]*>(.*?)</div>'
re_content = re.compile(contentpattern, re.S)

# Compiled once here instead of on every loop iteration.
re_br = re.compile(r'<br\s*/?>', re.I)         # line breaks
re_div = re.compile(r'</?div[^>]*>', re.I)     # div tags
re_h = re.compile(r'</?\w+[^>]*>')             # any other HTML tag
re_comment = re.compile(r'<!--.*?-->', re.S)   # HTML comments
re_td = re.compile(r'</?td[^>]*>', re.I)       # table cells
re_nbsp = re.compile(r'&nbsp\s*;|&lt\s*;|&mdash\s*;')

# Characters that must not appear in a file name built from page content.
re_unsafe = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')


def select_article_paths(urls):
    """Return the unique links in *urls* that match ``modpattern``, sorted.

    Sorting makes the crawl order deterministic (plain set order is not).
    """
    return sorted(set(u for u in urls if re.search(modpattern, u)))


def find_title(index_html, path):
    """Return the text of the anchor in *index_html* that links to *path*.

    Returns None when no such anchor exists.  The lookahead after the path
    rejects longer hrefs that merely start with *path* (e.g. "...html#c").

    NOTE(review): the original title pattern was lost; this assumes the
    article title is the text of the first anchor whose href equals the
    article path -- verify against the index page.
    """
    pattern = (r'<a\s[^>]*?href\s*=\s*["\']?' + re.escape(path) +
               r'(?=["\'\s>])[^>]*>(.*?)</a>')
    match = re.search(pattern, index_html, re.S | re.I)
    if match is None:
        return None
    return match.group(1)


def strip_html_tags(html):
    """Turn an HTML fragment into plain text.

    ``<br>`` becomes a newline; every other tag, comment, and the entities
    listed in ``re_nbsp`` become a single space.
    """
    text = re_br.sub('\n', html)
    # Same substitution order as the original script.
    for pattern in (re_div, re_h, re_comment, re_td, re_nbsp):
        text = pattern.sub(' ', text)
    return text


def safe_filename(title):
    """Replace path separators and other unsafe characters in *title*.

    The title comes from a remote page, so it must not be able to steer the
    output path.  Returns the cleaned, whitespace-trimmed name (possibly
    empty).
    """
    return re_unsafe.sub('_', title).strip()


def fetch(url):
    """Download *url* and return the raw bytes, always closing the socket."""
    sock = urllib.urlopen(url)
    try:
        return sock.read()
    finally:
        sock.close()


def main():
    """Save every article linked from ``homeurl`` as ``<title>.txt``."""
    index_raw = fetch(homeurl)
    parser = URLLister()
    parser.feed(index_raw)
    parser.close()
    index_html = unicode(index_raw, "gbk")

    for path in select_article_paths(parser.urls):
        title = find_title(index_html, path)
        if title is None:
            continue
        filename = safe_filename(title)
        if not filename:
            continue

        page = unicode(fetch(urlbase + path), "gbk")
        # An empty or unrecognised page only skips this article; the
        # original `break` silently abandoned all remaining articles.
        match = re_content.search(page)
        if match is None:
            continue
        body = strip_html_tags(match.group(1))

        # Open the output file only once there is something to write, so a
        # skipped article no longer leaves an empty, unclosed file behind.
        out = open(filename + '.txt', 'w')
        try:
            out.write(urlbase + path + '\n')
            # Encode explicitly rather than relying on the process-wide
            # default encoding.
            out.write(body.encode("gbk"))
        finally:
            out.close()


if __name__ == "__main__":
    main()
Link URL: http://hyyuanqiang.blog.163.com/blog/static/59415137200942744616488
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/12165911/viewspace-604321/,如需转载,请注明出处,否则将追究法律责任。
<%=items[i].content%>
<%if(items[i].items.items.length) { %><%=items[i].items.items[j].username%> 回复 <%=items[i].items.items[j].tousername%>: <%=items[i].items.items[j].content%>
转载于:http://blog.itpub.net/12165911/viewspace-604321/