# The four files below are, in order: get.py, makenew.py, pro.py and out.py
#!/usr/bin/env python
from sgmllib import SGMLParser
import urllib
class URLLister(SGMLParser):
    """SGML parser that collects the href target of every <a> tag into self.urls."""

    def reset(self):
        # Reset inherited parser state and start with an empty URL list.
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # attrs is a list of (name, value) pairs; SGMLParser lower-cases
        # attribute names, so the key to compare against is exactly 'href'.
        # BUG FIX: the original compared against ' href ' (space-padded),
        # which can never match, so no URL was ever collected.
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
from sgmllib import SGMLParser
import urllib
class URLLister(SGMLParser):
    """SGML parser that collects the href target of every <a> tag into self.urls."""

    def reset(self):
        # Reset inherited parser state and start with an empty URL list.
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # attrs is a list of (name, value) pairs; SGMLParser lower-cases
        # attribute names, so the key to compare against is exactly 'href'.
        # BUG FIX: the original compared against ' href ' (space-padded),
        # which can never match, so no URL was ever collected.
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
#!/usr/bin/env python
# encoding=utf-8
import urllib,get
# ---start:to correct the input, if lack of thr Protocal Name-------------
d = ' http://www.bitunion.org '
def panduan_http(d):
global req
req = ''
if d.startswith( ' http:// ' ):
req = d
else :
req = ' http:// ' + d
return req
# ---end------------------------------------------------------------------
# Fetch the page and parse out every <a href=...> it contains.
sock = urllib.urlopen(panduan_http(d))
sword = get.URLLister()
sword.feed(sock.read())
sword.close()

bbscan = []
# sword.urls is the list of ALL URLs found in the page; keep only the
# relative 'redirect' links and turn each into an absolute URL.
for url in sword.urls:
    # BUG FIX: the original tested startswith(' redirect ') and joined with
    # ' / ' (both space-padded), which never matched / built broken URLs.
    if url.startswith('redirect'):
        bbsinfo = req + '/' + url
        bbscan.append(bbsinfo)

bbsold = bbscan

class nail():
    # Holds the collected board URLs for import by pro.py.
    kee = bbsold
# encoding=utf-8
import urllib,get
# ---start:to correct the input, if lack of thr Protocal Name-------------
d = ' http://www.bitunion.org '
def panduan_http(d):
global req
req = ''
if d.startswith( ' http:// ' ):
req = d
else :
req = ' http:// ' + d
return req
# ---end------------------------------------------------------------------
# Fetch the page and parse out every <a href=...> it contains.
sock = urllib.urlopen(panduan_http(d))
sword = get.URLLister()
sword.feed(sock.read())
sword.close()

bbscan = []
# sword.urls is the list of ALL URLs found in the page; keep only the
# relative 'redirect' links and turn each into an absolute URL.
for url in sword.urls:
    # BUG FIX: the original tested startswith(' redirect ') and joined with
    # ' / ' (both space-padded), which never matched / built broken URLs.
    if url.startswith('redirect'):
        bbsinfo = req + '/' + url
        bbscan.append(bbsinfo)

bbsold = bbscan

class nail():
    # Holds the collected board URLs for import by pro.py.
    kee = bbsold
#!/usr/bin/env python
# encoding=utf-8
from sgmllib import SGMLParser
import makenew,re
# ----start:the module is to seperate thr CH-strings----------------------
class Parse(SGMLParser):
    """Print the text captured between dashes inside a page's <title>.

    Thread pages presumably have titles like 'Board-Thread Title-Site';
    handle_data() prints the part between the first and last dash.
    """

    def reset(self):
        # Depth counter: > 0 while we are inside a <title> element.
        self.found_title = 0
        SGMLParser.reset(self)

    def start_title(self, attrs):
        self.found_title += 1

    def end_title(self):
        self.found_title -= 1

    def handle_data(self, text):
        if self.found_title > 0:
            # BUG FIX: the original pattern ' -(?P<data>.*)- ' was
            # space-padded by extraction and would rarely match; use a raw
            # string without the padding.
            aa = re.findall(r'-(?P<data>.*)-', text)
            for a in aa:
                print("%s" % a)
# -----end----------------------------------------------------------------
# Pull the board URLs collected by makenew.py.
guai = makenew.nail()
bbss = guai.kee

newurls = []
for hard in bbss:
    # Extract the thread id from URLs like '...tid=12345&goto...'.
    # BUG FIX: the original regex literal was space-padded
    # (' tid=(?P<data>.*)&goto ') and never matched; also dropped the
    # redundant identity list comprehension around re.findall().
    ids = re.findall(r'tid=(?P<data>.*)&goto', hard)
    for j in ids:
        # Rebuild the canonical thread (ZhuTiTie) URL from the id.
        newurl = 'http://www.bitunion.org/thread-' + j + '-1-1.html'
        newurls.append(newurl)

link = newurls
# encoding=utf-8
from sgmllib import SGMLParser
import makenew,re
# ----start:the module is to seperate thr CH-strings----------------------
class Parse(SGMLParser):
    """Print the text captured between dashes inside a page's <title>.

    Thread pages presumably have titles like 'Board-Thread Title-Site';
    handle_data() prints the part between the first and last dash.
    """

    def reset(self):
        # Depth counter: > 0 while we are inside a <title> element.
        self.found_title = 0
        SGMLParser.reset(self)

    def start_title(self, attrs):
        self.found_title += 1

    def end_title(self):
        self.found_title -= 1

    def handle_data(self, text):
        if self.found_title > 0:
            # BUG FIX: the original pattern ' -(?P<data>.*)- ' was
            # space-padded by extraction and would rarely match; use a raw
            # string without the padding.
            aa = re.findall(r'-(?P<data>.*)-', text)
            for a in aa:
                print("%s" % a)
# -----end----------------------------------------------------------------
# Pull the board URLs collected by makenew.py.
guai = makenew.nail()
bbss = guai.kee

newurls = []
for hard in bbss:
    # Extract the thread id from URLs like '...tid=12345&goto...'.
    # BUG FIX: the original regex literal was space-padded
    # (' tid=(?P<data>.*)&goto ') and never matched; also dropped the
    # redundant identity list comprehension around re.findall().
    ids = re.findall(r'tid=(?P<data>.*)&goto', hard)
    for j in ids:
        # Rebuild the canonical thread (ZhuTiTie) URL from the id.
        newurl = 'http://www.bitunion.org/thread-' + j + '-1-1.html'
        newurls.append(newurl)

link = newurls
#!/usr/bin/env python
# encoding=utf-8
import pro,urllib
# Fetch every thread URL produced by pro.py, decode it and print the titles.
print('-' * 70)
for newurl in pro.link:
    print(newurl)
    sock = urllib.urlopen(newurl)
    html = sock.read()
    sock.close()
    # BUG FIX: the codec name was written as " gbk " (space-padded), which
    # raises LookupError; the forum pages are GBK-encoded.
    html = unicode(html, "gbk")
    p = pro.Parse()
    p.feed(html)
print('-' * 70)
# completed by freefis at 7:19 AM, Dec 22, '07
# version 0.12B
# encoding=utf-8
import pro,urllib
# Fetch every thread URL produced by pro.py, decode it and print the titles.
print('-' * 70)
for newurl in pro.link:
    print(newurl)
    sock = urllib.urlopen(newurl)
    html = sock.read()
    sock.close()
    # BUG FIX: the codec name was written as " gbk " (space-padded), which
    # raises LookupError; the forum pages are GBK-encoded.
    html = unicode(html, "gbk")
    p = pro.Parse()
    p.feed(html)
print('-' * 70)
# completed by freefis at 7:19 AM, Dec 22, '07
# version 0.12B