用 Python 抓取百度空间（Baidu blog）文章的例子

最新推荐文章于 2024-09-11 18:01:31 发布

hyyuanqiang

最新推荐文章于 2024-09-11 18:01:31 发布

阅读量530

点赞数

文章标签： python blog class div fp url

本文链接：https://blog.csdn.net/hyyuanqiang/article/details/4261484

版权

学习python 一个礼拜

闲得无聊写的

很粗糙

权当用来熟悉python

#!/usr/bin/env python
# -*- coding: gbk -*-

import urllib, re

from sgmllib import SGMLParser

import sys
# Python 2 only: make GBK the interpreter-wide default codec used for
# implicit str <-> unicode conversions.  `site` removes
# sys.setdefaultencoding during start-up, so the module has to be reloaded
# to get the function back before it can be called.
reload(sys)
sys.setdefaultencoding('gbk')

class URLLister(SGMLParser):
    """Collect the ``href`` value of every ``<a>`` start tag that is parsed.

    After the document has been passed to ``feed()`` the link targets are
    available in ``self.urls``, in the order they were encountered and
    including duplicates.
    """

    def reset(self):
        """Reset the parser state and start over with an empty URL list."""
        # Let the base class clear its own state first, then (re)create the
        # accumulator that start_a() appends to.
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Record the ``href`` attribute(s) of an opening ``<a>`` tag.

        ``attrs`` is the list of ``(name, value)`` pairs of the tag's
        attributes; anchors without an ``href`` contribute nothing.
        """
        href = [value for name, value in attrs if name == 'href']
        if href:
            self.urls.extend(href)

# ---------------------------------------------------------------------------
# Download every post linked from a Baidu blog index page and save each one
# as "<post title>.txt" in the current directory: the first line of the file
# is the post URL, the rest is the post body with its HTML markup removed.
# ---------------------------------------------------------------------------

# Index page of the blog to download (no trailing whitespace in the URL).
homeurl = "http://hi.baidu.com/shenjianyz/blog"
# Drop the last two path components to get the site root.  The post links
# selected below are site-absolute paths, so they are appended to this.
urlbase = homeurl[:homeurl.rfind("/")]
urlbase = urlbase[:urlbase.rfind("/")]

# Encoding used both to decode the downloaded pages and to write the files.
PAGE_ENCODING = "gbk"

# A post link is a site-absolute path that ends in ".html".
POST_LINK_RE = re.compile(r'^/\S+\.html$')
# Everything between the opening of the post container and the options bar.
# The match is greedy, i.e. it runs up to the LAST '<div class="opt">'.
CONTENT_RE = re.compile(
    r'<div id="m_blog" class="modbox">(.*)<div class="opt">', re.S)

# Patterns used to turn the post markup into plain text.
RE_BR = re.compile(r'<br\s*?/?>')                       # line breaks
RE_DIV = re.compile(
    r'</?\s*div\s*(class\s*=\s*"\w+"|id\s*=\s*"\w+"|\s)*>')  # div tags
RE_TAG = re.compile(r'</?\w+[^>]*>')                    # any other HTML tag
RE_COMMENT = re.compile(r'<!--.*?-->', re.S)            # HTML comments
RE_TD = re.compile(r'<\s*/?\s*td\s*>')                  # loosely written td
RE_ENTITY = re.compile(r'&nbsp\s*;|&lt\s*;|&mdash\s*;')  # a few entities

# Characters that must not end up in a file name built from a page title.
UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|]')


def fetch(url):
    """Return the raw bytes served at ``url``; the connection is always closed."""
    sock = urllib.urlopen(url)
    try:
        return sock.read()
    finally:
        sock.close()


def select_post_links(hrefs):
    """Return the distinct post links among ``hrefs``, sorted for determinism."""
    return sorted(set(href for href in hrefs if POST_LINK_RE.search(href)))


def find_title(index_html, href):
    """Return the title shown for ``href`` on the index page, or None.

    The title is the text of the ``<a>`` element that points at ``href``
    inside a ``<div class="tit">``.
    """
    # href comes from the page itself: escape it so that '.', '?', '+' ...
    # are matched literally instead of being interpreted as regex syntax.
    pattern = ('<div class="tit"><a href="' + re.escape(href) +
               r'" target="_blank">([^<]+)</a>')
    match = re.search(pattern, index_html)
    if match is None:
        return None
    return match.group(1).strip() or None


def extract_content(page_html):
    """Return the markup of the post body in ``page_html``, or None if absent."""
    match = CONTENT_RE.search(page_html)
    if match is None:
        return None
    return match.group(1)


def strip_html(text):
    """Return ``text`` with ``<br>`` turned into newlines and markup blanked.

    Tags, comments and the entities ``&nbsp;``, ``&lt;`` and ``&mdash;`` are
    each replaced by a single space.
    """
    text = RE_BR.sub('\n', text)
    for pattern in (RE_DIV, RE_TAG, RE_COMMENT, RE_TD, RE_ENTITY):
        text = pattern.sub(' ', text)
    return text


def safe_filename(title):
    """Return the output file name for a post called ``title``."""
    # The title is untrusted page content: neutralise path separators and
    # other characters that are illegal in file names.
    return UNSAFE_FILENAME_RE.sub('_', title) + '.txt'


def main():
    """Download the blog index and write one text file per post."""
    raw_index = fetch(homeurl)
    parser = URLLister()
    parser.feed(raw_index)
    parser.close()

    index_html = unicode(raw_index, PAGE_ENCODING, "replace")
    if not index_html:
        return

    for href in select_post_links(parser.urls):
        title = find_title(index_html, href)
        if title is None:
            # Linked from somewhere other than the post list: not a post.
            continue

        post_url = urlbase + href
        page_html = unicode(fetch(post_url), PAGE_ENCODING, "replace")
        body = extract_content(page_html)
        if body is None:
            # Empty or unexpected page: skip this post, keep going with the rest.
            continue

        # Open the file only once there is something to write, so that
        # skipped posts leave neither empty files nor open handles behind.
        out = open(safe_filename(title), 'w')
        try:
            out.write(post_url + '\n')
            out.write(strip_html(body).encode(PAGE_ENCODING, "replace"))
        finally:
            out.close()


if __name__ == "__main__":
    main()

我的blog:hyyuanqiang.blog.163.com