#!/usr/bin/python
# vim: set fileencoding=utf-8 :
import sys
import re
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup
import ConfigParser
import MySQLdb as mdb


class Db_Connector:
    def __init__(self, config_file_path):
        cf = ConfigParser.ConfigParser()
        cf.read(config_file_path)
        db_host = cf.get("mysql_db", "host")
        db_port = cf.getint("mysql_db", "port")
        db_user = cf.get("mysql_db", "username")
        db_pwd = cf.get("mysql_db", "password")
        db_data = cf.get("mysql_db", "db_name")
        try:
            # pass the configured port too; the original read it but never used it
            self.con = mdb.connect(host=db_host, port=db_port, user=db_user,
                                   passwd=db_pwd, db=db_data)
            self.cur = self.con.cursor()
        except mdb.Error:
            print "[*] DB Connect Error"
    def find_all(self, sql_script):
        try:
            self.cur.execute(sql_script)
            return self.cur.fetchall()
        except mdb.Error:
            print "[*] DB FindAll Error"

    def find_item(self, sql_script):
        try:
            self.cur.execute(sql_script)
            return self.cur.fetchone()
        except mdb.Error:
            print "[*] DB FindItem Error"

    def insert_item(self, sql_script):
        try:
            self.cur.execute(sql_script)
            self.con.commit()
            return True
        except mdb.Error:
            print "[*] DB Insert Into Error"

    def update_item(self, sql_script):
        try:
            self.cur.execute(sql_script)
            self.con.commit()
            return True
        except mdb.Error:
            print "[*] DB Update Error"
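
# The script expects a sprider.ini file next to it. Below is a minimal sketch
# of its layout, assuming only the section and key names that Db_Connector
# reads above; the values are placeholders, not from the original source:
#
#   [mysql_db]
#   host = 127.0.0.1
#   port = 3306
#   username = root
#   password = your_password
#   db_name = sprider
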
class SpriderUrl:
    # initialization
    def __init__(self, url):
        self.url = url
        self.con = Db_Connector('sprider.ini')

    # collect the first batch of URLs from the target page
    def get_self(self):
        urls = []
        try:
            body_text = urllib2.urlopen(self.url).read()
        except:
            print "[*] Web Get Error: checking the Url"
            sys.exit(0)
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            # we have the target's href, but it still needs filtering
            _url = link.get('href')
            # skip None values and hrefs that start with meaningless tokens,
            # and skip static/binary file suffixes that are not pages
            # (the None test must come first, or re.match raises TypeError)
            if _url is None or re.match('^(javascript|:;|#)', _url) \
                    or re.search(r'\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
                continue
            # for absolute http|https links, keep only those on this site;
            # the spider never crawls beyond the start domain
            if re.match('^(http|https)', _url):
                if not _url.startswith(self.url):
                    continue
                else:
                    urls.append(_url)
            else:
                # resolve relative links against the start URL
                urls.append(urlparse.urljoin(self.url, _url))
        rst = list(set(urls))
        for rurl in rst:
            # skip URLs already recorded for this domain
            if self.con.find_item("select * from url_sprider where url='" + rurl + "' and domain='" + self.url + "'"):
                continue
            else:
                try:
                    self.con.insert_item("insert into url_sprider(url,tag,domain) values('" + rurl + "',0,'" + self.url + "')")
                except:
                    print "[*] insert into is Error!"
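
    # The queries above assume a url_sprider table with url, tag and domain
    # columns. A minimal sketch of a schema that would fit them (column types
    # and sizes are assumptions, not from the original source):
    #
    #   CREATE TABLE url_sprider (
    #       url    VARCHAR(512) NOT NULL,
    #       tag    INT NOT NULL DEFAULT 0,  -- 0 = unvisited, 1 = visited
    #       domain VARCHAR(256) NOT NULL
    #   );
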
    # crawl one stored page and record any new same-site links
    def sprider_self_all(self, domain):
        urls = []
        try:
            body_text = urllib2.urlopen(domain).read()
        except:
            print "[*] Web Get Error: checking the Url"
            sys.exit(0)
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            _url = link.get('href')
            # same filtering as get_self; checking None first also removes the
            # TypeError the original tried to catch around str(_url)
            if _url is None or re.match('^(javascript|:;|#)', _url) \
                    or re.search(r'\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
                continue
            # keep only links on this site; do not crawl beyond it
            if re.match('^(http|https)', _url):
                if not _url.startswith(self.url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(urlparse.urljoin(self.url, _url))
        res = list(set(urls))
        for rurl in res:
            if self.con.find_item("select * from url_sprider where url='" + rurl + "' and domain='" + self.url + "'"):
                continue
            else:
                try:
                    self.con.insert_item("insert into url_sprider(url,tag,domain) values('" + rurl + "',0,'" + self.url + "')")
                except:
                    print "[*] insert into is Error!"
    # keep processing unvisited (tag=0) URLs until none are left
    def sprider_self(self):
        while True:
            wat_list = self.con.find_all("select url from url_sprider where domain='" + self.url + "' and tag=0")
            if len(wat_list) > 0:
                for url in wat_list:
                    # mark the URL visited before crawling it
                    try:
                        self.con.update_item("update url_sprider set tag=1 where url='" + url[0] + "'")
                    except:
                        print "[*] DB update Error!"
                        continue
                    try:
                        self.sprider_self_all(url[0])
                    except:
                        print "[*] Sprider Error!"
                        continue
            else:
                print "[*] Sprider is Finish!"
                break


spi = "http://www.baidu.com/"
t = SpriderUrl(spi)
# first capture
t.get_self()
# then start the deep crawl
t.sprider_self()
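
# The insert/update calls above build SQL by string concatenation, which breaks
# on URLs containing quote characters. A minimal sketch of a parameterized
# alternative using MySQLdb's %s placeholders; the helper name insert_url is
# hypothetical, not part of the original script:
def insert_url(con, url, domain):
    # MySQLdb escapes and quotes the bound values itself
    con.cur.execute("insert into url_sprider(url,tag,domain) values(%s,%s,%s)",
                    (url, 0, domain))
    con.con.commit()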