1. 获取 http://m.sohu.com 页面中所有的 <a href='...'> 标签。
2. 递归调用 getInlink 函数,获取指定 url 中的所有 <a> 标签。
3. 在 urllib.urlopen(url) 处,我们可以用 try ... except 来捕获异常,找到异常页面。
递归获取一个页面url:
# coding: utf-8
import urllib

import requests
from bs4 import BeautifulSoup
语法:string.startswith(str, beg=0, end=len(string))
或string[beg:end].startswith(str)
参数说明:
string: 被检测的字符串
str: 指定的字符或者子字符串。(可以使用元组,会逐一匹配)
beg: 设置字符串检测的起始位置(可选)
end: 设置字符串检测的结束位置(可选)
如果存在参数 beg 和 end,则在指定范围内检查,否则在整个字符串中检查。
返回值
如果 string 以 str 开头,则返回 True,否则返回 False。当 str 为空字符串时始终返回 True。
函数解析:如果字符串string是以str开始,则返回True,否则返回False
# Site-relative links (hrefs that start with '/').
page0 = set()
# Absolute links (hrefs that start with 'http').
page1 = set()
# URLs under http://m.sohu.com/; myurlopen also stores URLs it failed to open.
page2 = set()
def basegetlink():
    """Seed the link sets from the m.sohu.com front page.

    Fetches http://m.sohu.com and walks every <a> tag that carries an href.
    Targets starting with 'http' are stored in ``page1``; targets starting
    with '/' are stored in ``page0``.  Any other href is ignored.
    """
    response = urllib.urlopen('http://m.sohu.com')
    try:
        # Name the parser explicitly: without it bs4 warns and falls back to
        # whichever parser happens to be installed, so results can differ
        # from one machine to another.
        bsobj = BeautifulSoup(response, 'html.parser')
    finally:
        # The soup has consumed the body by now; release the connection.
        response.close()

    for link in bsobj.find_all('a'):
        newpage = link.attrs.get('href')
        if newpage is None:
            continue
        if newpage.startswith('http'):
            page1.add(newpage)
        elif newpage.startswith('/'):
            page0.add(newpage)
def checkExlink(url):
    """Record *url* in ``page1`` when it is an absolute 'http...' link.

    Returns True after storing the URL, or False (leaving ``page1``
    untouched) when *url* does not start with 'http'.
    """
    is_absolute = url.startswith('http')
    if is_absolute:
        page1.add(url)
        print('add a http: url to page1 %s' % url)
    return is_absolute
def checkInlink(url):
    """Record *url* in ``page2`` when it starts with 'http://m.sohu.com'.

    Returns True after storing the URL, or False (leaving ``page2``
    untouched) for any other URL.
    """
    if not url.startswith('http://m.sohu.com'):
        return False
    page2.add(url)
    # The message used to say page1 although the URL is stored in page2.
    print('add a http: url to page2 %s' % url)
    # Report success explicitly; falling off the end returned None, which a
    # caller testing the result would read as "not an internal link".
    return True
def myurlopen(url):
    """Open *url* and return the response object, or None on failure.

    A *url* that does not start with 'http' is treated as site-relative and
    prefixed with 'http://m.sohu.com' before opening.  When anything goes
    wrong the URL is reported on stdout and remembered in ``page2``.

    NOTE(review): the original layout was flattened; recording into
    ``page2`` is assumed to happen only on failure -- confirm.
    """
    try:
        if not url.startswith('http'):
            url = 'http://m.sohu.com' + url
        html = urllib.urlopen(url)
    except Exception:
        # Best-effort by design, but no longer a bare ``except:`` so that
        # Ctrl-C (KeyboardInterrupt) and SystemExit still stop the crawl.
        html = None
        # One formatted string: print(a, b) under Python 2 shows a tuple repr.
        print('Error1..we want.......get err: %s' % (url,))
        page2.add(url)
    return html
# Recursively collect the links reachable from one page, without duplicates.
def getInlink(url):
    """Crawl *url* and every not-yet-seen site-relative link found on it.

    The page is opened through ``myurlopen``; if that returns None the call
    does nothing.  For each <a href> on the page:

    * an href starting with '/' that is not yet in ``page0`` is added to
      ``page0`` and crawled immediately (recursive call);
    * an href starting with 'http' that is not yet in ``page1`` is added to
      ``page1`` but not followed.

    A line of '*' is printed before the links of a page are processed and a
    line of '+' afterwards.

    NOTE: each newly found internal link nests one more call, so a long
    chain of pages can exhaust the interpreter's recursion limit.
    """
    html = myurlopen(url)
    if html is None:
        return
    try:
        # Explicit parser keeps results independent of what is installed.
        bsobj = BeautifulSoup(html, 'html.parser')
    finally:
        html.close()

    print('*' * 20)
    for link in bsobj.find_all('a'):
        newpage = link.attrs.get('href')
        if not newpage:
            # Missing or empty href: nothing to follow.  This replaces the
            # old ``newpage[0]`` probe and its IndexError handler.
            continue
        # startswith() compares by value; ``newpage[0] is '/'`` only worked
        # because CPython happens to share one-character strings.
        if newpage.startswith('/') and newpage not in page0:
            page0.add(newpage)
            print('get a new inpage: %s cur page0: %d' % (newpage, len(page0)))
            getInlink(newpage)
        elif newpage.startswith('http') and newpage not in page1:
            page1.add(newpage)
            print('get a new http: %s cur page1: %d' % (newpage, len(page1)))
    print('+' * 20)
def showpage(page, flag=True):
    """Print the size of the URL collection *page*, optionally with entries.

    A count line is printed first and last.  When *flag* is false, every
    entry is printed in between, prefixed with 'http://m.sohu.com'.
    """
    banner = '%s %d %s' % ('-' * 10, len(page), '-' * 10)
    print(banner)
    # With flag set there is nothing to list, so iterate over an empty tuple.
    paths = () if flag else page
    for path in paths:
        print('http://m.sohu.com' + path)
    print(banner)
def show(*args):
    """Print the count-only summary of every URL collection passed in."""
    for url_set in args:
        showpage(url_set, flag=True)
def getlinks():
    """Crawl outward from one of the site-relative links held in ``page0``.

    Removes an arbitrary entry from ``page0`` (``set.pop`` gives no ordering
    guarantee), prints it, and crawls 'http://m.sohu.com' + that entry with
    ``getInlink``.  The sizes of ``page0`` and ``page1`` are printed before
    and after the crawl.  Does nothing when ``page0`` is empty.
    """
    if not page0:
        # set.pop() raises KeyError on an empty set; there is no start page.
        print('no internal links collected, nothing to crawl')
        return
    tmplink = page0.pop()
    print(tmplink)

    show(page0, page1)
    getInlink('http://m.sohu.com' + tmplink)
    show(page0, page1)
def main():
    """Collect the links on the front page, then crawl from one of them."""
    basegetlink()
    getlinks()


# The double underscores were lost: ``name == 'main'`` raises NameError.
if __name__ == '__main__':
    main()