class MyThread(threading.Thread):
    """Thread that runs a single zero-argument callable.

    Args:
        func: Callable invoked with no arguments when the thread runs.
    """

    def __init__(self, func):
        threading.Thread.__init__(self)
        self.func = func

    def run(self):
        """Invoke the stored callable; its return value is discarded."""
        # A direct call replaces the ``apply`` builtin, which has been
        # deprecated since Python 2.3 and no longer exists in Python 3.
        self.func()
class Spider_Model(object):
    """Console reader for the qiushibaike.com "hot" listing.

    ``LoadPage`` (run on a background thread by ``Start``) keeps up to two
    parsed pages buffered in ``self.pages``; ``Start`` pops them and hands
    each one to ``ShowPage``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        self.page = 1        # number of the next page LoadPage will fetch
        self.pages = []      # buffer of parsed pages awaiting display
        self.enable = False  # run flag shared by the loader and display loops

    def _ParseItems(self, html):
        """Return ``[title, text]`` pairs found in decoded page HTML.

        Newlines are removed from both captured strings.
        """
        matches = re.findall(
            '<div.*?class="content".*?title="(.*?)">(.*?)</div>', html, re.S)
        return [[title.replace("\n", ""), text.replace("\n", "")]
                for title, text in matches]

    def GetPage(self, page):
        """Download one listing page and return its parsed items.

        Args:
            page: Page number appended to the listing URL.

        Returns:
            A list of ``[title, text]`` lists (see ``_ParseItems``).

        Any exception raised while opening, reading or decoding the page
        propagates to the caller.
        """
        myUrl = "http://www.qiushibaike.com/hot/page/" + str(page)
        headers = {
            'User-Agent': 'Mozilla 4.0 (compatible; MSIE 5.5; Windows NT)',
        }
        req = urllib2.Request(myUrl, headers=headers)
        myResponse = urllib2.urlopen(req)
        try:
            myPage = myResponse.read()
        finally:
            # Release the connection even when the read fails.
            myResponse.close()
        return self._ParseItems(myPage.decode('utf-8'))

    def LoadPage(self):
        """Keep ``self.pages`` topped up to two pages while ``enable`` is set.

        Intended to run on a background thread. Fetch failures are reported
        on stdout and retried; they never terminate the loop.
        """
        while self.enable:
            if len(self.pages) >= 2:
                # Buffer is full; wait for the display loop to consume one.
                time.sleep(1)
                continue
            try:
                myPage = self.GetPage(self.page)
            except Exception as e:
                print('can not link the target %s' % (e,))
                # Pause before retrying so a dead link is not hammered.
                time.sleep(1)
            else:
                self.page += 1
                self.pages.append(myPage)

    def ShowPage(self, nowPage, page):
        """Print the items of one page, prompting after each.

        Args:
            nowPage: Iterable of ``[title, text]`` pairs; assumed to hold
                unicode strings as produced by ``GetPage``.
            page: Page number shown in front of every item.

        Entering ``quit`` at the prompt clears ``self.enable`` and stops
        showing the remaining items.
        """
        for items in nowPage:
            # Single formatted string keeps the space-separated output while
            # remaining valid syntax on both Python 2 and Python 3.
            print(u'di %d ye %s %s' % (page, items[0], items[1]))
            myInput = raw_input('please input')
            if myInput == 'quit':
                self.enable = False
                break

    def Start(self):
        """Start the background loader and display pages until the user quits."""
        self.enable = True
        page = self.page
        print(u'jia zai..')
        t = MyThread(self.LoadPage)
        t.start()
        try:
            while self.enable:
                if self.pages:
                    nowPage = self.pages.pop(0)
                    self.ShowPage(nowPage, page)
                    # Advance the label so each page shows its own number.
                    page += 1
                else:
                    # Nothing buffered yet; yield instead of spinning.
                    time.sleep(0.1)
        finally:
            # Make sure the loader thread stops even if the display loop
            # exits through an exception (e.g. EOF on the prompt).
            self.enable = False
# Script entry: announce, then build the spider and hand control to its
# interactive loop (Start blocks until the user quits).
print(u'please enter to scan the content')
myModel = Spider_Model()
myModel.Start()