Get all the links on a page
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Print the href attribute of every <a> tag on the page
def getAllLink(url):
    html = urlopen(url)
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a'):
        if 'href' in link.attrs:  # skip anchors that have no href
            print(link.attrs['href'])
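A minimal usage sketch (the URL here is just an example):

getAllLink('https://www.baidu.com')  # prints every href found on the page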
Get all related links on a Baidu Baike page
import re  # urlopen and BeautifulSoup are imported as in the snippet above

# Within the div with class "content", find <a> tags that open in a new tab
# and whose href points at another Baike entry (paths starting with /item/)
def getAllBaikeLink(url):
    html = urlopen(url)
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find('div', {'class': 'content'}).find_all(
            'a', {'target': '_blank'}, href=re.compile('^/item/')):
        if 'href' in link.attrs:
            print(link.attrs['href'])
----------------------------------------equivalent----------------------------------------
def getAllBaikeLink(url):
    html = urlopen(url)
    bs = BeautifulSoup(html, 'html.parser')
    # Same query, with href folded into the attribute dict instead of a keyword
    for link in bs.find('div', {'class': 'content'}).find_all(
            'a', {'target': '_blank', 'href': re.compile('^/item/')}):
        if 'href' in link.attrs:
            print(link.attrs['href'])
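Both forms behave the same: find_all() accepts attribute filters either through the attrs dict or as keyword arguments, so the choice is stylistic. One caveat with the keyword form: class is a Python reserved word, so it must be written class_. A usage sketch (the entry URL is just an example):

getAllBaikeLink('https://baike.baidu.com/item/Python')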
Traverse all internal links of a site (deduplicated)
pages = set()

# Recursively follow internal links, using the global `pages` set so that no
# page is visited twice. The base URL and the href regex are placeholders:
# fill them in for the site being crawled.
def getAllLinks(pagePath):
    global pages
    html = urlopen('https://example.com{}'.format(pagePath))  # placeholder base URL
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^/')):  # placeholder pattern for internal links
        if link.attrs['href'] not in pages:
            # A page we have not seen yet
            newPage = link.attrs['href']
            print(newPage)
            pages.add(newPage)
            getAllLinks(newPage)
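To start the crawl, call the function with the empty path, which together with the placeholder base URL above points at the front page:

getAllLinks('')

Since every unseen link deepens the recursion, a large site can hit Python's default recursion limit (about 1000 frames); an explicit queue of pending links avoids that.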
Traverse all internal links of a site and print page data (deduplicated)
pages = set()

# Same traversal, but also print some data from each page before following
# its links; id='xxx' stands in for a real element id on the target site.
def getAllLinksData(pagePath):
    global pages
    html = urlopen('https://example.com{}'.format(pagePath))  # placeholder base URL
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id='xxx').find('li').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing some attributes')
    for link in bs.find_all('a', href=re.compile('^/')):  # placeholder pattern for internal links
        if link.attrs['href'] not in pages:
            # A page we have not seen yet
            newPage = link.attrs['href']
            print(newPage)
            pages.add(newPage)
            getAllLinksData(newPage)
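The entry call is the same as before, a sketch under the same placeholder assumptions:

getAllLinksData('')

It may also be worth catching urllib.error.HTTPError around urlopen(), since a single dead link would otherwise abort the whole crawl.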