# Notes:
# 1. re module: regular expressions
# 2. urllib.request module: HTTP fetching (crawler)
# 3. collections.deque: double-ended queue (mentioned, not used below)
# 4. extend(): append all items of another sequence to the end of a list
# 5. findall(): return all non-overlapping matches as a list
# 6. re.compile(): compile a pattern string into a regex pattern object
# 7. pop(): a list used as a stack is last-in-first-out; pop() removes and returns the last element
import urllib.request
import urllib
import re
#http://bbs.tianya.cn/m/post-140-393974-4.shtml
#http://bbs.tianya.cn
#<a class="u-btn pre-btn" href="/m/post-140-393974-4.shtml"></a>
# Depth-first traversal uses a stack.
def geteveryurl(data):
    """Extract every URL reachable from an HTML page.

    data: HTML source as a string.
    Returns a list containing all absolute http:// URLs found in the
    page, followed by relative href="..." targets resolved against the
    host of the first absolute URL. May contain duplicates; returns []
    when the page has no absolute URL to derive a host from.
    """
    alllist = []
    abslist = getallhttp(data)
    if abslist:
        alllist.extend(abslist)
        # Use the first absolute URL only to recover the host name for
        # resolving the page's relative links.
        alllist.extend(getabsurl(abslist[0], data))
    return alllist
#<a class="u-btn pre-btn" href="/m/post-140-393974-4.shtml"></a>
def getabsurl(url, data):
    """Turn the relative href="..." targets in *data* into absolute URLs.

    url:  any absolute URL from the same page (used only for its host).
    data: HTML source as a string.
    Returns a list of URLs (host-prefixed when a host can be derived,
    otherwise the raw relative targets), or [] on error.

    The original implementation removed items from the list while
    walking a copy; an href containing both "http://" and "javascript"
    triggered a second remove() that raised ValueError, which the bare
    except turned into a silent empty result. Filtering with a
    comprehension avoids both problems.
    """
    try:
        href_regex = re.compile("href=\"(.*?)\"", re.IGNORECASE)
        # Keep only relative targets: drop absolute links and javascript: hrefs.
        relative = [
            href for href in href_regex.findall(data)
            if "http://" not in href and "javascript" not in href
        ]
        hostname = gethostname(url)
        if hostname is None:
            return relative
        return [hostname + href for href in relative]
    except Exception:
        # Best-effort: any unexpected failure yields an empty result so
        # the crawler keeps running.
        return []
#http://bbs.tianya.cn/post-140-393974-1.shtml'
#http://bbs.tianya.cn
def gethostname(httpstr):
    """Return the "http://host" prefix of *httpstr*, or None if absent.

    e.g. "http://bbs.tianya.cn/post-140-393974-1.shtml"
         -> "http://bbs.tianya.cn"

    A match requires a '/' after the host part; "http://host" with no
    path yields None.
    """
    try:
        host_regex = re.compile(r"(http://\S*?)/", re.IGNORECASE)
        matches = host_regex.findall(httpstr)
        # findall never raises for str input; TypeError covers the only
        # realistic failure (non-string argument), replacing the
        # original bare except.
        return matches[0] if matches else None
    except TypeError:
        return None
def getallhttp(data):
    """Find every absolute http:// URL in *data*.

    A URL is taken to end at the first '"', '|', '>' or ')' after it
    (the '|' inside the character class is a literal terminator, kept
    byte-identical to preserve matching behavior).
    Returns the list of matches, or [] on non-string input.
    """
    try:
        url_regex = re.compile(r"(http://\S*?)[\"|>|)]", re.IGNORECASE)
        return url_regex.findall(data)
    except TypeError:
        # Replaces the original bare except: findall only fails here
        # when data is not str/bytes.
        return []
def getallemail(data):
    """Extract e-mail addresses from *data*.

    Matches user@domain.tld with a 2-4 letter TLD, case-insensitively.
    Returns the list of addresses found, or [] on non-string input.
    """
    try:
        mail_regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        return mail_regex.findall(data)
    except TypeError:
        # Replaces the original bare except: findall only fails here
        # when data is not str/bytes.
        return []
def getdata(url):
    """Download *url* and return its body decoded as UTF-8.

    Returns "" on any failure (network error, bad status, non-UTF-8
    body) — deliberately best-effort so the crawler keeps running.
    """
    try:
        # Context manager closes the connection even when read/decode fails.
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8")
    except Exception:
        return ""
def DFS(urlstr):
    """Depth-first crawl starting at *urlstr*.

    Prints every URL as it is popped and every e-mail address found on
    the pages it fetches. There is no depth or page limit: on a real
    site this effectively runs until interrupted.
    """
    visited = set()  # set gives O(1) membership; the original list was O(n) per check
    stack = [urlstr]  # list used as a LIFO stack -> depth-first order
    while stack:
        url = stack.pop()
        print(url)
        if url in visited:
            continue
        pagedata = getdata(url)  # "" on fetch failure; extractors then find nothing
        for email in getallemail(pagedata):
            print(email)
        # Push newly discovered links; checking only the stack (as the
        # original did) — already-visited URLs may be re-pushed but are
        # skipped above.
        for link in geteveryurl(pagedata):
            if link not in stack:
                stack.append(link)
        visited.add(url)
# Entry point: start a depth-first crawl from the Baidu front page.
# (Alternative Tianya start URL kept for reference.)
# DFS("http://bbs.tianya.cn/m/post-140-393974-5.shtml")
if __name__ == "__main__":
    # Guard prevents the crawl from starting on mere import of this module.
    DFS("http://www.baidu.com/")