A Website Crawler Based on Depth-First Traversal (Python)

#encoding=utf-8
from bs4 import BeautifulSoup
import socket
import urllib2
import re
import zlib
import requests

urls = ["http://www.baidu.com"]  # seed URLs to crawl (customize as needed)
depth = 10                       # maximum crawl depth (customize as needed)
flag = "1234"                    # string to search for in each page (customize as needed)

class MyCrawler:
    def __init__(self, seeds):
        # current crawl depth
        self.current_deepth = 1
        # seed the URL queue
        self.linkQuence = linkQuence()
        if isinstance(seeds, str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds, list):
            for i in seeds:
                self.linkQuence.addUnvisitedUrl(i)

    # main crawl loop
    def crawling(self, seeds, crawl_deepth, static_url):
        # keep going while the crawl depth does not exceed crawl_deepth
        while self.current_deepth <= crawl_deepth:
            # keep going while there are unvisited URLs
            while not self.linkQuence.unVisitedUrlsEnmpy():
                # pop the next URL from the unvisited queue
                visitUrl = self.linkQuence.unVisitedUrlDeQuence()
                if visitUrl is None or visitUrl == "":
                    continue
                # extract the hyperlinks on that page
                links = self.getHyperLinks(visitUrl, static_url)
                # mark the URL as visited
                self.linkQuence.addVisitedUrl(visitUrl)
                # enqueue the newly found URLs
                for link in links:
                    self.linkQuence.addUnvisitedUrl(link)
            self.current_deepth += 1

    # extract the hyperlinks from a page's source
    def getHyperLinks(self, url, static_url):
        result = []
        r = requests.get(url)
        data = r.text
        lines = data.split("\n")
        for i in lines:
            if flag in i:
                print url + " " + i
        # collect every href value with a regular expression
        link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
        for i in link_list:
            if "http" not in i:
                result.append(static_url + i)
            else:
                result.append(i)
        return result

    # fetch a page's raw source
    def getPageSource(self, url, timeout=100, coding=None):
        try:
            socket.setdefaulttimeout(timeout)
            req = urllib2.Request(url)
            req.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
            response = urllib2.urlopen(req)
            page = response.read()
            # transparently decompress gzip-encoded responses
            if response.headers.get('Content-Encoding') == 'gzip':
                page = zlib.decompress(page, 16 + zlib.MAX_WBITS)
            # fall back to the charset declared by the site
            if coding is None:
                coding = response.headers.getparam("charset")
            # if a charset is known, convert the page to UTF-8;
            # otherwise return the raw bytes unchanged
            if coding is not None:
                page = page.decode(coding).encode('utf-8')
            return ["200", page]
        except Exception, e:
            print str(e)
            return [str(e), None]

class linkQuence:
    def __init__(self):
        # URLs that have already been visited
        self.visted = []
        # URLs waiting to be visited
        self.unVisited = []

    # return the visited URL queue
    def getVisitedUrl(self):
        return self.visted

    # return the unvisited URL queue
    def getUnvisitedUrl(self):
        return self.unVisited

    # add a URL to the visited queue
    def addVisitedUrl(self, url):
        self.visted.append(url)

    # remove a URL from the visited queue
    def removeVisitedUrl(self, url):
        self.visted.remove(url)

    # dequeue the next unvisited URL
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except:
            return None

    # enqueue a URL, ensuring each URL is visited only once
    def addUnvisitedUrl(self, url):
        if url != "" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0, url)

    # number of visited URLs
    def getVisitedUrlCount(self):
        return len(self.visted)

    # number of unvisited URLs
    def getUnvistedUrlCount(self):
        return len(self.unVisited)

    # whether the unvisited queue is empty
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited) == 0

def main(seeds, crawl_deepth, static_url):
    craw = MyCrawler(seeds)
    craw.crawling(seeds, crawl_deepth, static_url)

if __name__ == "__main__":
    for i in urls:
        main([i], depth, i)
    print "Done!"
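
The listing above targets Python 2 (urllib2, print statements, "except Exception, e"). For readers on Python 3, a minimal sketch of the same idea, a depth-limited crawl that flags pages containing a search string, can be written with just requests and BeautifulSoup (both already imported by the original script). The function name dfs_crawl and its default arguments are illustrative, not part of the original code; it uses an explicit stack for the depth-first order promised by the title.

# Python 3 sketch, assuming requests and beautifulsoup4 are installed
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def dfs_crawl(seed, max_depth=2, flag="1234"):
    visited = set()
    stack = [(seed, 0)]                # each entry is (url, depth)
    while stack:
        url, depth = stack.pop()       # LIFO pop gives depth-first order
        if url in visited or depth > max_depth:
            continue
        visited.add(url)
        try:
            resp = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            print(url, "failed:", exc)
            continue
        # report pages whose source contains the search string
        if flag in resp.text:
            print("hit:", url)
        # resolve every href against the current page and push it
        soup = BeautifulSoup(resp.text, "html.parser")
        for a in soup.find_all("a", href=True):
            link = urljoin(url, a["href"])
            if link.startswith("http") and link not in visited:
                stack.append((link, depth + 1))
    return visited

if __name__ == "__main__":
    dfs_crawl("http://www.baidu.com", max_depth=2)

Using a set for visited URLs avoids the linear membership scans that the list-based linkQuence performs in addUnvisitedUrl, and urljoin handles relative links more robustly than prefixing static_url.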
