#coding=utf8
from bs4 import BeautifulSoup
import requests
#下载网页内容
def html_download(url):
    """Download the page at *url* and return its HTML text.

    :param url: page URL to fetch; ``None`` is tolerated.
    :return: the page's HTML as a UTF-8 decoded string, ``None`` when
        *url* is ``None``, or the sentinel string ``"--获取页面异常--"``
        on any network/HTTP failure (kept for backward compatibility
        with existing callers).
    """
    if url is None:
        return None
    try:
        # Some sites reject requests without a browser-like User-Agent.
        headers = {'User-Agent': 'Mozilla/5.0'}
        # timeout=30: raise if the server does not respond within 30 s.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raise HTTPError for non-2xx status codes
        r.encoding = 'utf-8'  # force UTF-8 decoding of the response body
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so that non-network errors
        # (KeyboardInterrupt, programming bugs) are no longer swallowed.
        return "--获取页面异常--"
#根据关键词定位到词条的链接
def search(keyword):
    """Search Baidu Baike for *keyword* and return the first result's link.

    :param keyword: search term (Chinese text is fine; requests encodes it).
    :return: the ``href`` of the first ``a.result-title`` hit, or ``None``
        when the results page contains no such link (previously this
        raised ``TypeError`` on a missing result).
    """
    url = 'https://baike.baidu.com/search?word=' + keyword + '&pn=0&rn=10&enc=utf8'
    soup = BeautifulSoup(html_download(url), 'html.parser')
    # Take the first search hit as the best match.
    first = soup.find('a', class_='result-title')
    if first is None:
        return None
    return first['href']
#返回content内容
def return_content(responds):
    """Extract the lemma-summary paragraphs from a Baike page.

    :param responds: HTML text of a lemma page (output of
        ``html_download``).
    :return: list of paragraph texts; empty list when the summary block
        is missing (previously this raised ``AttributeError``, e.g. when
        the download failed or the page layout changed).
    """
    soup = BeautifulSoup(responds, 'html.parser')
    # The summary lives in <div class="lemma-summary">, split into
    # <div class="para"> paragraphs.
    find_data = soup.find('div', class_='lemma-summary')
    if find_data is None:
        return []
    return [p.get_text() for p in find_data.find_all('div', class_='para')]
#返回百科属性介绍
def return_introduce(responds):
    """Extract the basic-info box (attribute name -> value) from a Baike page.

    :param responds: HTML text of a lemma page (output of
        ``html_download``).
    :return: dict mapping each ``dt`` attribute name to its ``dd`` value;
        empty dict when the info box is missing (previously this raised
        ``AttributeError``) or when names and values do not pair up.
    """
    introduce = {}
    soup = BeautifulSoup(responds, 'html.parser')
    # The info box is <div class="basic-info cmn-clearfix"> with paired
    # <dt>/<dd> entries.
    find_data = soup.find('div', class_='basic-info cmn-clearfix')
    if find_data is None:
        return introduce
    dts = find_data.find_all('dt', class_='basicInfo-item name')
    dds = find_data.find_all('dd', class_='basicInfo-item value')
    if len(dts) == len(dds):
        for dt, dd in zip(dts, dds):
            introduce[dt.get_text()] = dd.get_text()
    else:
        print("介绍数据名称和个数不对应!")
    return introduce
if __name__ == "__main__":
    # Demo: look up one entry and print its summary and info box.
    query = '班布里奇号巡洋舰'
    entry_url = search(query)
    page_html = html_download(entry_url)
    print(return_content(page_html))
    print(return_introduce(page_html))
# 如果大家发现错误或者有问题可以评论,我会一段时间统一回复大家的问题,欢迎一起交流