Validate the URL the user enters: if it cannot be reached, prompt the user to enter it again; if the request succeeds, return the URL itself. The imports and the request header shared by the whole script are declared here as well.
import requests
import re

# Request header shared by every request below.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}

def url_get():
    url = input("Enter the start-page URL to crawl: ")
    try:
        # A quick reachability probe; the timeout keeps a dead host from hanging the prompt.
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
        return url
    except requests.RequestException:
        print("URL cannot be reached")
        return url_get()
Fetch the page at the given URL and return every link found on it as a list.
def spiderpage(url):
    r = requests.get(url, headers=HEADERS, timeout=10)
    # Decode with the encoding requests detects from the page body.
    r.encoding = r.apparent_encoding
    pagetext = r.text
    # Pull out the value of every href="..." or href='...' attribute.
    pagelinks = re.findall(r'(?<=<a href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
    return pagelinks
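Regular expressions over raw HTML are fragile: they miss unquoted attributes and happily match links inside comments or scripts. As a hedged alternative, here is a minimal sketch of the same extraction using the standard-library html.parser module; the names LinkCollector and extract_links are my own, not part of the original script.
from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collect the href attribute of every <a> tag."""
    def __init__(self):
        super().__init__()
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def extract_links(pagetext):
    parser = LinkCollector()
    parser.feed(pagetext)
    return parser.links
A real parser handles whitespace, attribute order, and single versus double quotes uniformly, which the lookbehind regex above does not.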
Filter and de-duplicate all the crawled URLs as needed. The parameter is a list of URLs; the return value is the list of URLs that pass the filter.
def url_filtrate(pagelinks):
    # Keep only CSDN article links, skipping the author's own posts and login pages.
    same_target_url = []
    for link in pagelinks:
        if (re.search(r'blog\.csdn\.net/\w+/article/details/\d+', link)
                and not re.search(r'blockchain_lemon', link)
                and not re.search(r'passport', link)):
            same_target_url.append(link)
    # Drop duplicates while preserving order.
    unique_urls = []
    for link in same_target_url:
        if link not in unique_urls:
            unique_urls.append(link)
    return unique_urls
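As an aside, the de-duplication loop can be collapsed into a one-liner: on Python 3.7+ dict.fromkeys preserves insertion order, so the following is equivalent.
# Order-preserving de-duplication in one line (Python 3.7+).
unique_urls = list(dict.fromkeys(same_target_url))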
A queue that splits the URL set into unvisited and visited lists, and lets the crawl loop stop once the unvisited list is empty.
class linkQuence:
    def __init__(self):
        self.visited = []    # URLs already crawled
        self.unvisited = []  # URLs waiting to be crawled
    def getvisitedurl(self):
        return self.visited
    def getunvisitedurl(self):
        return self.unvisited
    def addvisitedurl(self, url):
        return self.visited.append(url)
    def removevisitedurl(self, url):
        return self.visited.remove(url)
    def unvisitedurldequence(self):
        try:
            return self.unvisited.pop()
        except IndexError:
            return None
    def addunvisitedurl(self, url):
        # Only enqueue a URL that is not already in either list.
        if url != '' and url not in self.visited and url not in self.unvisited:
            return self.unvisited.insert(0, url)
    def getvisitedurlcount(self):
        return len(self.visited)
    def getunvisitedurlcount(self):
        return len(self.unvisited)
    def unvisitedurlsempty(self):
        return len(self.unvisited) == 0
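Because addunvisitedurl inserts at the front while unvisitedurldequence pops from the back, the queue is FIFO. A quick check, assuming the class above and hypothetical example.com URLs:
q = linkQuence()
q.addunvisitedurl('https://example.com/a')
q.addunvisitedurl('https://example.com/b')
q.addunvisitedurl('https://example.com/a')  # duplicate, silently ignored
print(q.getunvisitedurlcount())             # 2
url = q.unvisitedurldequence()              # pops the oldest entry: '.../a'
q.addvisitedurl(url)
print(q.unvisitedurlsempty())               # False: '.../b' is still queued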
The crawler itself. Using the functions and the queue above, it crawls every link on the input page, then keeps dequeuing child pages in a loop so all child links get crawled. The URL set is returned as a list.
class Spider:
    def __init__(self, url):
        self.linkQuence = linkQuence()        # the URL queue
        self.linkQuence.addunvisitedurl(url)  # seed it with the start page
    def crawler(self, urlcount):
        x = 1
        while x <= urlcount:
            visitedurl = self.linkQuence.unvisitedurldequence()
            if visitedurl is None or visitedurl == '':
                break  # queue exhausted: stop early instead of spinning forever
            print(f"Crawling URL #{x}: {visitedurl}")
            try:
                initial_links = spiderpage(visitedurl)
            except requests.RequestException:
                print(f"Skipping unreachable URL: {visitedurl}")
                continue
            right_links = url_filtrate(initial_links)
            self.linkQuence.addvisitedurl(visitedurl)
            for link in right_links:
                self.linkQuence.addunvisitedurl(link)
            x += 1
        print(f"Finally done crawling, {x - 1} URLs in total")
        return self.linkQuence.visited
Write all the crawled child links to a local file.
def writetofile(urllist):
    # Skip the first entry (the start page itself) and append each child link on its own line.
    with open('F://demo/urls.txt', 'a', encoding='utf8') as file:
        for url in urllist[1:]:
            file.write(f'{url}\n')
    print(f'Writing finished, {len(urllist) - 1} child links in total')
Start the crawler.
if __name__ == '__main__':
    url = url_get()
    spider = Spider(url)
    urllist = spider.crawler(100)
    writetofile(urllist)