# 最近看天涯上的连载小说, 感到有些地方很不方便。比如: 作者写了一篇后, 后面很多人回复, 然后作者再写, 别人再回复, 导致想看一篇完整的文章时, 可能得翻很多页才能看完。于是编写了这个 python 脚本: 只要用户输入帖子首页的网址和作者名(如果不输入, 就认为是第一篇文章的作者), 就能将该帖子中该作者写的内容全部提取出来。
# -*- coding: mbcs -*-
# import re,urllib, urllib2, cookielib,datetime,sys
import urllib2,sys,os
def getUrlContent(url):
    """Fetch *url* and return the raw response body as a string."""
    response = urllib2.urlopen(url)
    body = response.read()
    response.close()
    return body
def getNextPageUrl(cont):
    """Return the URL of the '下一页' (next page) link in *cont*, or None.

    Locates the paging table, then takes the href of the last <a> tag
    that appears before the 'next page' anchor.

    NOTE(review): the original search literals were padded with stray
    spaces (e.g. " <table border=0> "), apparently a copy/paste artifact,
    and could never match real markup; the padding has been removed.
    The original's while-loop and result list were dead scaffolding — it
    returned unconditionally on the first successful match.
    """
    pos = cont.find("<table border=0>")
    if pos < 0:
        return None
    pos = cont.find('<font color=black>共', pos)
    if pos < 0:
        return None
    end = cont.find('><font color=#246cae>下一页', pos)
    if end < 0:
        return None
    marker = '<a href='
    start = cont.rfind(marker, pos, end)
    if start < 0:
        return None
    return cont[start + len(marker):end]
def getAuthor(cont):
    """Extract the author name of the first post in *cont*, or None.

    Walks forward through the expected markers of the post-header table:
    <TABLE ... >作者:<a ... vwriter= ... target=_blank>NAME</a>.

    NOTE(review): the original search literals were padded with stray
    spaces (copy/paste artifact) and could never match real markup;
    the padding has been removed.
    """
    p = cont.find("<TABLE")
    if p < 0:
        return None
    p = cont.find('>作者:<a', p)
    if p < 0:
        return None
    p = cont.find("vwriter=", p)
    if p < 0:
        return None
    open_tag = "target=_blank>"
    p = cont.find(open_tag, p)
    if p < 0:
        return None
    close_tag = "</a>"
    q = cont.find(close_tag, p)
    if q < 0:
        return None
    return cont[p + len(open_tag):q]
def getTitle(cont):
    """Return the text of the page's <TITLE> element, or None.

    NOTE(review): the original search literals were padded with stray
    spaces (' <TITLE> '), a copy/paste artifact, and could never match
    real markup; the padding has been removed.
    """
    open_tag = '<TITLE>'
    start = cont.find(open_tag)
    if start < 0:
        return None
    end = cont.find('</TITLE>', start)
    if end < 0:
        return None
    return cont[start + len(open_tag):end]
def getByAuthor(cont, author):
    """Collect every post body in *cont* written by *author*.

    Each post's text sits between the </table> closing its header table
    and the next <TABLE> (or, for the last post on a page, the trailing
    '<!-- google_ad_section_end -->' marker).  Returns a list of raw
    HTML fragments, in page order.

    NOTE(review): the original search literals were padded with stray
    spaces (copy/paste artifact) and could never match real markup; the
    magic offset '+ 8' was len("</table>"); and the bare assert on the
    ad marker (stripped under -O) is now a graceful break that keeps
    the fragments found so far instead of crashing.
    """
    table_close = "</table>"
    author_tag = ">" + author + "</a>"
    pos = 0
    res = []
    while 1:
        p1 = cont.find("<TABLE", pos)
        if p1 < 0:
            break
        p2 = cont.find("vwriter=", p1)
        if p2 < 0:
            break
        p2 = cont.find(author_tag, p2)
        if p2 < 0:
            break
        p3 = cont.find(table_close, p2)
        if p3 < 0:
            break
        body_start = p3 + len(table_close)
        p4 = cont.find("<TABLE", p3)
        if p4 < 0:
            # Last post on the page: runs up to the ad-section marker.
            p4 = cont.find("<!-- google_ad_section_end -->", p3)
            if p4 >= 0:
                res.append(cont[body_start:p4])
            break
        res.append(cont[body_start:p4])
        pos = p4
    return res
# url=sys.argv[1]
def mainProg(url):
fp = None
author = ""
while 1 :
print url
cont = getUrlContent(url)
print ' down OK '
if len(author) == 0:
author = getAuthor(cont)
title = getTitle(cont)
print ' author: ' ,author, ' title: ' ,title
title = title.replace( ' / ' , ' x ' ).replace( ' \\ ' , ' x ' ).replace( ' : ' , ' x ' ).replace( ' * ' , ' x ' ).replace( ' ? ' , ' x ' )
file = title + ' .htm '
if os.path.isfile(file):
print " File already exists! "
return
fp = open(file, ' w ' )
res = getByAuthor(cont,author)
print ' parse ok ' ,len(res)
fp.writelines([url + ' <br>\n ' , ' <br>--------<br> ' .join(res)])
url = getNextPageUrl(cont)
if url is None:
break
# Interactive driver: prompt for a thread's first-page URL and crawl it,
# forever.  The loop has no exit condition — terminate with Ctrl-C.
while 1 :
    url = raw_input( ' input url: ' )
    mainProg(url)
# NOTE(review): the file originally contained a second, byte-for-byte
# duplicate of everything above (the imports, all five functions, and the
# interactive input loop), apparently pasted in by accident.  It was
# unreachable dead code — the 'while 1' input loop above never returns,
# so execution never reached this point — and it has been removed.