A Python Whole-Site Crawler (Demo, MySQL-Backed)
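This demo implements a small single-site crawler for Python 2: urllib2 fetches pages, BeautifulSoup 3 (hence the "from BeautifulSoup import BeautifulSoup" import) extracts the links, and MySQLdb stores every discovered URL. The tag column of the url_sprider table doubles as the crawl queue: tag=0 means queued, tag=1 means already fetched. The full script: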

#!/usr/bin/python
# vim: set fileencoding=utf-8 :

import sys
import urllib2
import re
from BeautifulSoup import BeautifulSoup
import ConfigParser
import MySQLdb as mdb

# Thin MySQL wrapper; connection settings are read from an INI file
class Db_Connector:
    def __init__(self, config_file_path):
        cf = ConfigParser.ConfigParser()
        cf.read(config_file_path)
        db_host = cf.get("mysql_db", "host")
        db_port = cf.getint("mysql_db", "port")
        db_user = cf.get("mysql_db", "username")
        db_pwd = cf.get("mysql_db", "password")
        db_data = cf.get("mysql_db", "db_name")
        try:
            self.con = mdb.connect(host=db_host, port=db_port, user=db_user, passwd=db_pwd, db=db_data)
            self.cur = self.con.cursor()
        except:
            print "[*] DB Connect Error"

    def find_all(self, sql_script):
        try:
            self.cur.execute(sql_script)
            return self.cur.fetchall()
        except:
            print "[*] DB FindAll Error"

    def find_item(self, sql_script):
        try:
            self.cur.execute(sql_script)
            return self.cur.fetchone()
        except:
            print "[*] DB FindItem Error"

    def insert_item(self, sql_script):
        try:
            self.cur.execute(sql_script)
            self.con.commit()
            return True
        except Exception, e:
            print "[*] DB Insert Into Error"

    def update_item(self, sql_script):
        try:
            self.cur.execute(sql_script)
            self.con.commit()
            return True
        except Exception, e:
            print "[*] DB Update Error"
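Db_Connector reads its settings from an INI file (the crawler below passes 'sprider.ini') and assumes a url_sprider table already exists; neither ships with the post. A minimal setup sketch follows. The [mysql_db] section name, its keys, and the url/tag/domain columns come from the code itself; the credentials and column types are placeholder assumptions.

# setup_sprider.py -- one-off setup sketch; all concrete values are assumptions
import ConfigParser
import MySQLdb as mdb

cf = ConfigParser.ConfigParser()
cf.add_section('mysql_db')
cf.set('mysql_db', 'host', '127.0.0.1')   # assumed local MySQL instance
cf.set('mysql_db', 'port', '3306')
cf.set('mysql_db', 'username', 'root')
cf.set('mysql_db', 'password', 'secret')
cf.set('mysql_db', 'db_name', 'sprider')
with open('sprider.ini', 'w') as f:
    cf.write(f)

con = mdb.connect(host='127.0.0.1', port=3306, user='root', passwd='secret', db='sprider')
cur = con.cursor()
# Column names are taken from the INSERT statements in the crawler; types are guesses.
cur.execute("""create table if not exists url_sprider (
                   url varchar(1024) not null,
                   tag tinyint not null default 0,
                   domain varchar(256) not null
               )""")
con.commit()
con.close()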

class SpriderUrl:
    # Initialization
    def __init__(self, url):
        self.url = url
        self.con = Db_Connector('sprider.ini')

    # First pass: harvest the initial URL list from the target page itself
    def get_self(self):
        urls = []
        try:
            body_text = urllib2.urlopen(self.url).read()
        except:
            print "[*] Web Get Error: checking the Url"
            sys.exit(0)
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            # We have a candidate URL, but it still needs vetting
            _url = link.get('href')
            # Skip None values and links that start with meaningless characters,
            # and skip suffixes that are not page content (do not crawl them)
            if _url is None or re.match('^(javascript|:;|#)', _url) or re.search(r'\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
                continue
            # For absolute http/https links, keep only those on the target site;
            # this spider never crawls beyond the start domain
            if re.match('^(http|https)', _url):
                if not re.match('^' + re.escape(self.url), _url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(self.url + _url)
        rst = list(set(urls))
        for rurl in rst:
            # Deduplicate against the database, then queue new URLs with tag=0
            if self.con.find_item("select * from url_sprider where url='" + rurl + "' and domain='" + self.url + "'"):
                continue
            else:
                try:
                    self.con.insert_item("insert into url_sprider(url,tag,domain) values('" + rurl + "',0,'" + self.url + "')")
                except:
                    print "[*] insert into is Error!"
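One caveat worth flagging: the queries above are built by string concatenation, so any crawled URL containing a single quote breaks the statement, and it is a textbook SQL injection vector. MySQLdb supports parameter binding via the second argument to execute(); a safer sketch of the dedup-then-queue step (save_url is a hypothetical helper, not part of the original class):

# Hypothetical helper: same dedup-then-queue logic, with bound parameters
def save_url(cur, con, rurl, domain):
    cur.execute("select 1 from url_sprider where url=%s and domain=%s", (rurl, domain))
    if cur.fetchone() is None:
        cur.execute("insert into url_sprider(url,tag,domain) values(%s,0,%s)", (rurl, domain))
        con.commit()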

    # Deep pass: fetch one queued page and harvest its links the same way
    def sprider_self_all(self, domain):
        urls = []
        try:
            body_text = urllib2.urlopen(domain).read()
        except:
            print "[*] Web Get Error: checking the Url"
            sys.exit(0)
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            _url = link.get('href')
            # Same vetting as get_self(); guard against non-string hrefs
            try:
                if _url is None or re.match('^(javascript|:;|#)', str(_url)) or re.search(r'\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', str(_url)):
                    continue
            except TypeError:
                print "[*] Type is Error! :" + str(_url)
                continue
            if re.match('^(http|https)', _url):
                if not re.match('^' + re.escape(self.url), _url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(self.url + _url)
        res = list(set(urls))
        for rurl in res:
            if self.con.find_item("select * from url_sprider where url='" + rurl + "' and domain='" + self.url + "'"):
                continue
            else:
                try:
                    self.con.insert_item("insert into url_sprider(url,tag,domain) values('" + rurl + "',0,'" + self.url + "')")
                except:
                    print "[*] insert into is Error!"
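sprider_self_all() repeats the vetting rules from get_self() almost verbatim, which invites the two copies drifting apart. As a refactor sketch, the rules fit in a single helper (keep_link is a hypothetical name) that returns the normalized same-site URL, or None when the link should be skipped:

import re

def keep_link(base, _url):
    # Mirrors the vetting above: drop None/javascript anchors and
    # binary-looking suffixes, keep only links on the start domain.
    if _url is None or re.match('^(javascript|:;|#)', _url):
        return None
    if re.search(r'\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
        return None
    if re.match('^(http|https)', _url):
        return _url if re.match('^' + re.escape(base), _url) else None
    return base + _url

print keep_link('http://www.baidu.com/', 'duty/')                 # http://www.baidu.com/duty/
print keep_link('http://www.baidu.com/', 'img/logo.png')          # None: filtered suffix
print keep_link('http://www.baidu.com/', 'http://other.example/') # None: off-site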

    # Crawl loop: take tag=0 URLs, mark them tag=1, crawl each one,
    # and stop once the queue is empty
    def sprider_self(self):
        while True:
            wat_list = self.con.find_all("select url from url_sprider where domain='" + self.url + "' and tag=0")
            if len(wat_list) > 0:
                for url in wat_list:
                    try:
                        self.con.update_item("update url_sprider set tag=1 where url='" + url[0] + "'")
                    except:
                        print "[*] DB update Error!"
                        continue
                    try:
                        self.sprider_self_all(url[0])
                    except:
                        print "[*] Sprider Error!"
                        continue
            else:
                print "[*] Sprider is Finish!"
                break

spi = "http://www.baidu.com/"
t = SpriderUrl(spi)
# First capture
t.get_self()
# Start the deep crawl
t.sprider_self()
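Because tag=0 rows are the queue and tag=1 rows are already-fetched pages, crawl progress can be watched straight from the table; a quick sketch reusing Db_Connector (assuming the same sprider.ini):

c = Db_Connector('sprider.ini')
for tag, cnt in c.find_all("select tag, count(*) from url_sprider group by tag"):
    print "tag=%d -> %d urls" % (tag, cnt)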
