Got to the classroom at 8:16 today. Today's class apparently covers the functions of the basic data types and how to use them. Let's go!!!
PS: the finished crawler code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Editor: jokersyc
import os
from bs4 import BeautifulSoup

# Start from a fixed base directory and crawl the pages under it.
base = "C:\\Users\\Administrator\\Desktop\\python_课件+实验指导书\\lab\\website"
saveFile = []      # file paths collected so far
saveDir = []       # sub-directory paths collected so far
haveGetFile = []   # files that have already been processed

# crawler holds the crawl strategy: process the collected files,
# then descend into the collected sub-directories.
def crawler(seedURL, targetDir, maxDepth):
    if maxDepth < 0:
        print("Crawl depth limit reached")
        return
    print(seedURL)
    # Iterate over a snapshot so entries can be removed from the list.
    for a in list(seedURL):
        if a in haveGetFile:
            continue
        haveGetFile.append(a)
        print(haveGetFile)
        # If this level holds an .html file, parse it with get().
        if a.endswith(".html"):
            get(a)
        elif a.endswith(".txt"):
            with open(a, mode='r', encoding='utf-8') as txtFile:
                h = txtFile.read()
        else:
            seedURL.remove(a)
    # Go one level deeper into each collected sub-directory.
    for b in list(targetDir):
        all(b)
        targetDir.remove(b)
        crawler(seedURL, targetDir, maxDepth - 1)
# If seedURL is a directory, walk its first-level entries and record
# file paths in saveFile and sub-directory paths in saveDir.
def all(seedURL):
    if not os.path.exists(seedURL):
        print("The configured base directory does not exist")
        return
    for name in os.listdir(seedURL):
        fileR = os.path.join(seedURL, name)
        if os.path.isfile(fileR):
            if fileR not in saveFile:
                saveFile.append(fileR)
        elif os.path.isdir(fileR):
            if fileR not in saveDir:
                saveDir.append(fileR)
# Read an .html file and print every <a> tag it contains.
def get(sds):
    with open(sds, mode='r', encoding='utf-8') as htmlFile:
        h = htmlFile.read()
    # h turned out to be empty... (the target page itself is actually empty)
    print(h)
    dd = BeautifulSoup(h, "html.parser")
    print(dd.find_all("a"))
if __name__ == "__main__":
    maxDepth = int(input("Set the crawl depth: "))
    all(base)
    crawler(saveFile, saveDir, maxDepth)
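
Since the target page happened to be empty, find_all("a") only ever printed []. As a sanity check, here is a toy inline page (my own made-up example, not one of the lab files) showing what BeautifulSoup returns once there are real links:

from bs4 import BeautifulSoup

# A made-up page, just to see find_all("a") return something non-empty.
html = '<html><body><a href="index.html">home</a> <a href="lab.html">lab</a></body></html>'
soup = BeautifulSoup(html, "html.parser")
print(soup.find_all("a"))
# [<a href="index.html">home</a>, <a href="lab.html">lab</a>]
print([a["href"] for a in soup.find_all("a")])
# ['index.html', 'lab.html']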
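
As an aside, the same local traversal could probably be written with os.walk, which descends into sub-directories by itself. A minimal sketch, assuming a depth limit of 2 and a placeholder base path (both are my picks, not values from the lab):

import os
from bs4 import BeautifulSoup

base = "website"  # placeholder; the real lab path is the one set above

for root, dirs, files in os.walk(base):
    # Depth = how many levels this directory sits below base.
    rel = os.path.relpath(root, base)
    depth = 0 if rel == "." else rel.count(os.sep) + 1
    if depth >= 2:
        dirs.clear()  # emptying dirs in place stops os.walk from descending here
        continue
    for name in files:
        if name.endswith(".html"):
            path = os.path.join(root, name)
            with open(path, encoding='utf-8') as f:
                soup = BeautifulSoup(f.read(), "html.parser")
            print(path, soup.find_all("a"))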