在 HTML 中,<a href='xx'> 表示超链接,所以要提取页面中的 URL,就是提取 href 属性的值 'xx'。
方法一:find_all
import urllib
import requests
from urllib.parse import urlparse
from urllib import request, parse
from bs4 import BeautifulSoup
# Build the Baidu search URL for the keyword; pn is Baidu's paging offset.
word = '周杰伦'
encoded_word = urllib.parse.quote(word)
url = 'http://www.baidu.com.cn/s?wd=' + encoded_word + '&pn=0'
print(url)
# Pull the host name (domain) out of the URL.
res = urlparse(url)
domain = res.netloc
print(domain)
print('- - ' * 30)
# Fetch the search-results page and parse it for links.
# timeout added so the script cannot hang forever on a stalled connection;
# the 'with' block closes the HTTP response instead of leaking it.
with request.urlopen(url, timeout=30) as response:
    page = response.read()
soup = BeautifulSoup(page, 'lxml')
tagh3 = soup.find_all('a')  # every <a> tag on the page, returned as a list
# NOTE(review): 'all' shadows the builtin of the same name, but later code
# in this file relies on this variable, so the name is kept for compatibility.
# UTF-8 is specified explicitly: the platform default encoding on Windows
# (e.g. GBK) can raise UnicodeEncodeError when writing scraped text.
all = open(r'F:\security\web\output\report\test.txt', 'w+', encoding='utf-8')
hrefs = []
for h3 in tagh3:
# href = h3.find('a').get('href')
try:
href = h3