需求分析:
- 对360搜索页面分析,删去不必要的参数信息,可得出其搜索URL为:
https://www.so.com/s?q=搜索内容
- 根据搜索关键字,返回对应的完整搜索结果页面
主要流程:
- 下载页面内容,即利用requests模块获取页面,并返回页面信息(二进制)
- 将获得的页面信息保存至本地 html 文件中;由于页面内容是二进制(bytes)数据,必须以二进制模式("wb")打开文件写入
from urllib.error import HTTPError
from fake_useragent import UserAgent
from colorama import Fore
import requests
def download_page(url, params=None, timeout=10):
    """
    Fetch a page with ``requests`` and return its raw body.

    A random User-Agent string from ``fake_useragent`` is sent with the request.

    :param url: address to request
    :param params: optional mapping of query-string parameters, e.g. ``{'q': 'python'}``
    :param timeout: seconds to wait for the server before giving up; without
        it ``requests`` may block indefinitely on an unresponsive host
    :return: the response body as bytes, or None when the request failed
    """
    headers = {"User-Agent": UserAgent().random}  # random browser identity
    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        print("URL:", response.url)
        # requests does not raise on 4xx/5xx by itself; turn those statuses
        # into an exception so they are reported as failures below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # requests raises its own exception hierarchy (connection errors,
        # timeouts, HTTP errors), never urllib's HTTPError, and these
        # exceptions have no ``reason`` attribute, so format the exception itself.
        print(Fore.RED + '[-] 爬取网站%s失败:%s' % (url, e))
        return None
    else:
        return response.content  # raw bytes of the page
def download_file(content=b"", filename="res.html"):
    """
    Save downloaded page data to a local file.

    :param content: data to write; must be bytes because the file is opened
        in binary mode
    :param filename: path of the output file (truncated if it already exists)
    :return: None
    """
    with open(filename, mode="wb") as out_file:
        out_file.write(content)
    message = "[+] 写入文件%s成功" % filename
    print(Fore.GREEN + message)
if __name__ == '__main__':
    # Example: targeted download of a JD.com product page
    # content = download_page("https://item.jd.com/100012015170.html")
    # download_file(content=content)

    # Search 360 (so.com) for the keyword given in the 'q' query parameter.
    url = 'https://www.so.com/s'
    params = {
        'q': 'python'
    }
    content = download_page(url, params)
    # download_page returns None on failure; writing None would truncate the
    # output file and then raise TypeError, so only save a real result.
    if content is not None:
        download_file(content)
执行结果: