#!/usr/bin/python
# spider version 1
# goal: pass -u url -d deep -thread number
import urllib
import argparse
import threading
import Queue
import bs4
import sys
import time
import re
#import this
# #-----------------------------------------------
# # bs4 test pass
# #-----------------------------------------------
# soup = bs4.BeautifulSoup("<p>some<b>bad<i>HTML")
# print soup.prettify()
# #-----------------------------------------------
# # argparse test pass
# #-----------------------------------------------
# Command-line interface: spider.py <filename> [-u URL] [-d DEEP] [-thread N]
parser = argparse.ArgumentParser(description = "spider command line test!!!")
parser.add_argument("filename", help="spider filename")
parser.add_argument("-u", "--url", help="input an origin url", default="http://www.sina.com")
parser.add_argument("-d", "--deep", help="search deep, only one parameter", type=int, default=2)
parser.add_argument("-thread", "--thread_number", help="thread number in threadpool", type=int, default=300)
class CMD_ARG():
    """Empty namespace object; parse_args fills it with the parsed attributes."""
    pass
args = CMD_ARG()
# BUG FIX: must not pass sys.argv[0] (the program name) to parse_args --
# with the full sys.argv the script path consumed the positional "filename"
# slot and the real filename raised an "unrecognized arguments" error.
parser.parse_args(sys.argv[1:], namespace=args)
#print args.filename, args.url, args.deep, args.thread_number
#time.sleep(10)
# #-----------------------------------------------
# # threading test pass
# #-----------------------------------------------
# class x(threading.Thread):
# def __init__(self):ls
# threading.Thread.__init__(self)
# def run(self):
# th = x()
# th.acquire()
# th.release()
# th.wait()
# th.notify()
# th.start()
# th.join()
# th.stop()
# #-----------------------------------------------
# # Queue test pass
# #-----------------------------------------------
# queue = Queue.Queue()
# queue.put(x)
# queue.get()
# #-----------------------------------------------
# # main code
# #-----------------------------------------------
# Shared crawl state driven by args.url / args.deep (read by Spider and work()).
# Work queue: URLs waiting to be fetched.
queue = Queue.Queue()
# Record of every URL dict already dequeued for crawling.
visited = Queue.Queue()
# All Spider threads ever started, kept so work() can join them at the end.
thpool = []
# Upper bound on concurrently active threads (from -thread on the CLI).
thnum = args.thread_number
# Each queue element is a dict of the form:
# {
#     "url": "http://www.xxx.com",
#     "deep": x          # crawl depth, 0 for the seed URL
# }
# Seed the queue with the starting URL at depth 0.
queue.put({'url':args.url, 'deep':0})
# Pre-compiled pattern for absolute http:// links that appear inside double
# quotes in the page source; the captured group excludes the quotes themselves.
pat = re.compile(r'"(http://.+?)"')
# RLock acquired nowhere at present -- Queue is already thread-safe, so this
# is only kept in case future code needs explicit locking.
mylock = threading.RLock()
# Set True by work() once a dequeued entry reaches the required depth.
flag_done = False
# class Spider
# class Spider: one worker thread that fetches a single page and enqueues
# every quoted absolute http:// link found in it, one level deeper.
class Spider(threading.Thread):
    def __init__(self, url_dict):
        """url_dict: {'url': <page to fetch>, 'deep': <its crawl depth>}."""
        threading.Thread.__init__(self)
        # A malformed dict from the producer is reported, and the thread is
        # left in a harmless state (search() then aborts on the None url).
        try:
            self.url = url_dict['url']
            self.deep = url_dict['deep']
        except KeyError:
            print("init error\n")
            self.url = None
            self.deep = 0
    def run(self):
        # Announce which worker is running, then do the actual crawl.
        print("%s\n" % (self.getName()))
        try:
            self.search()
        except Exception:
            # Boundary handler: a worker must never kill the process.
            print("search error\n")
    def search(self):
        """Fetch self.url, extract quoted http:// links, enqueue them deeper."""
        global pat
        global queue
        try:
            content = urllib.urlopen(self.url).read()
        except Exception:
            # BUG FIX: the original fell through with 'content' undefined and
            # crashed with a NameError; abort this page instead.
            print("open content error\n")
            return
        # findall on a string cannot fail here; the old "match error" branch
        # only ever fired because of the NameError above.
        matched = pat.findall(content)
        for link in matched:
            queue.put({'url': link, 'deep': self.deep + 1})
def work():
    """Dispatcher loop: pull URL dicts off the work queue and hand each one
    to a new Spider thread, until an entry at the required depth is seen.
    Then join all spawned workers (bounded wait) and print a summary."""
    global flag_done
    global queue
    global visited
    global thpool
    global thnum
    # flag_done becomes True once a dequeued entry reaches args.deep.
    while not flag_done:
        # Respect the thread-pool cap; sleep briefly instead of busy-spinning
        # (the original bare 'continue' pegged a CPU core while waiting).
        if threading.active_count() >= thnum:
            time.sleep(0.01)
            continue
        if queue.empty():
            # Nothing to crawl right now; yield the CPU and poll again.
            time.sleep(0.01)
            continue
        val = queue.get()
        # Record every URL we have taken off the work queue.
        visited.put(val)
        # '>=' instead of '==' so an entry beyond the target depth can
        # never slip past the termination check.
        if val['deep'] >= args.deep:
            flag_done = True
            # Bounded wait for every spawned worker to finish.
            for t in thpool:
                print("waiting %s stop\n" % (t.getName()))
                t.join(20)
            print("Spider done in deep: %d, qsize: %d, visited: %d\n" % (args.deep, queue.qsize(), visited.qsize()))
            break
        # Spawn a worker for this URL and track it for the final join.
        th = Spider(val)
        thpool.append(th)
        th.start()
    return
# Entry point: start the dispatcher only when executed as a script, so that
# importing this module (e.g. for testing) does not kick off a crawl.
if __name__ == "__main__":
    work()