import requests
from bs4 import BeautifulSoup
def get_website_content(url, timeout=10):
    """Fetch a webpage and return its HTML with scripts/styles removed.

    Parameters:
        url     -- the URL of the website to fetch
        timeout -- seconds to wait for the server before giving up
                   (default 10; prevents the request from hanging forever)

    Returns:
        The cleaned, pretty-printed HTML of the page, or an error-message
        string if the page could not be fetched.
    """
    try:
        # Send an HTTP GET request to the given URL.
        # A timeout is essential: without one, requests.get can block
        # indefinitely on an unresponsive server.
        response = requests.get(url, timeout=timeout)
        # Only process successful responses.
        if response.status_code == 200:
            # Decode the raw bytes, trying the common encodings in order.
            try:
                # First try UTF-8 (the usual case).
                content = response.content.decode('utf-8')
            except UnicodeDecodeError:
                # Fall back to GBK (common for Chinese sites).
                # errors="replace" guarantees we never raise an uncaught
                # UnicodeDecodeError for pages in some third encoding.
                content = response.content.decode('gbk', errors='replace')
            # Parse the HTML so we can strip noise tags.
            soup = BeautifulSoup(content, 'html.parser')
            # Remove tags whose content is not page text, e.g. <script>/<style>.
            for script_or_style in soup(["script", "style"]):
                script_or_style.decompose()
            # Return the cleaned, re-indented HTML.
            return soup.prettify()
        else:
            return f"Error: Unable to fetch the webpage. Status code: {response.status_code}"
    except requests.RequestException as e:
        # Network-level failures (DNS, connection, timeout, ...) end up here.
        return f"Error: An error occurred while fetching the webpage. Details: {e}"
# Run the demo fetch only when executed as a script, not on import.
if __name__ == "__main__":
    # Replace this URL with the website you want to scrape.
    url = "https://www.baidu.com/"
    content = get_website_content(url)
    # Print the fetched (cleaned) website content.
    print(content)
# 结果演示 (Result demonstration):