基于 MIT SCIgen 假论文生成器的爬虫,以及对抓取结果的简单预处理
# Endpoint that returns one freshly generated SCIgen paper as an HTML page.
_SCIGEN_URL = ('http://scigen.csail.mit.edu/cgi-bin/scigen.cgi'
               '?author=6&author=&author=&author=&author=')

# Text chunk after which the kept content starts; everything up to and
# including this chunk is discarded.
_HOMEPAGE_MARKER = 'Back to the SCIgen homepage.'

# Whitespace-only text chunks that should not be collected.
_BLANK_CHUNKS = ('\n', '\n\n', '\n ')


def fetch(start, end):
    """Download generated SCIgen papers and save each one as flat text.

    For every index ``t`` in ``[start, end)`` one page is requested from the
    SCIgen CGI endpoint, its text chunks are extracted, everything up to and
    including the "Back to the SCIgen homepage." chunk is dropped, and the
    remainder is written as a single line (newlines replaced by spaces) to
    ``fakepaper/paper<t>.txt``. The index is printed after each file has
    been written.

    The ``fakepaper`` directory must already exist. If the marker chunk is
    not present in a page, an empty file is written for that index.

    Args:
        start: First paper index (inclusive).
        end: Paper index at which to stop (exclusive).
    """
    # Defined inside the function (but outside the loop) so that the
    # HTMLParser module is only looked up when fetch() is actually called.
    class _TextCollector(HTMLParser.HTMLParser):
        """Collect the non-blank text chunks of an HTML document."""

        def __init__(self):
            # Explicit base-class call rather than super(): HTMLParser is an
            # old-style class on Python 2.
            HTMLParser.HTMLParser.__init__(self)
            self.chunks = []

        def handle_data(self, data):
            # The original test chained `!=` with `or`, which is always
            # true; a membership test expresses the intended filter.
            if data not in _BLANK_CHUNKS:
                self.chunks.append(data)

    for t in xrange(start, end):
        response = urllib2.urlopen(_SCIGEN_URL)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        collector = _TextCollector()
        collector.feed(html)
        chunks = collector.chunks

        # Keep only what follows the marker; no marker means nothing is kept.
        try:
            marker_at = chunks.index(_HOMEPAGE_MARKER)
        except ValueError:
            body = []
        else:
            body = chunks[marker_at + 1:]

        # Flatten to one line: chunk boundaries and embedded newlines both
        # become single spaces.
        text = '\n'.join(body).replace('\n', ' ')

        with open('fakepaper/paper' + str(t) + '.txt', 'w') as out:
            out.write(text)

        # Progress output; the parenthesised form prints the same value
        # under the Python 2 print statement.
        print(t)