1.搜狗主页
分四步
- 获取url
- 发送请求
- 获取响应数据
- 持久化存储
代码如下
#导入requests模块
import requests
if __name__ == "__main__":
    # Fetch the Sogou home page and save the returned HTML to a local file.

    # 1. Target URL (the stray leading space inside the literal was removed).
    url = 'https://www.sogou.com/'

    # 2. Send the GET request. A timeout is passed explicitly because
    #    requests does not time out on its own when the server stays silent.
    response = requests.get(url=url, timeout=10)
    # Fail loudly on HTTP 4xx/5xx instead of saving an error page as success.
    response.raise_for_status()

    # 3. Read the response body; .text returns it decoded as a str.
    page_text = response.text
    print(page_text)

    # 4. Persist the page to disk.
    with open('./sougou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print("爬取成功!!")
页面效果如下
2.携带参数的搜狗页面
UA伪装:全称User-Agent,将爬虫对应的载体请求身份标识伪装成某一款浏览器
代码如下
import requests
if __name__ == '__main__':
    # Query Sogou web search for a user-supplied keyword and save the
    # returned result page as "<keyword>.html".

    # Only the part of the search URL before "?" is written here; the
    # variable part (the "query" parameter) is passed separately as a dict.
    url = 'https://www.sogou.com/web'

    # UA spoofing: send a browser User-Agent header with the request.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }

    # Query-string parameters carried by the URL, collected in a dict.
    content = input('enter a word:')
    param = {
        'query': content
    }

    # url, params and headers are the three arguments of interest; timeout is
    # added because requests does not time out unless one is given.
    response = requests.get(url=url, params=param, headers=header, timeout=10)
    # Fail loudly on HTTP 4xx/5xx instead of saving an error page as success.
    response.raise_for_status()
    page_text = response.text

    file_name = content + '.html'
    with open(file_name, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(file_name,'保存成功!!')
页面效果如下
3.百度翻译-post请求
import requests
import json
#爬取失败的原因是因为反爬机制对返回数据进行了加密!!!
if __name__ == '__main__':
    # POST a user-supplied word to the Baidu Translate endpoint below and
    # store the JSON reply as "<word>.json".
    #
    # NOTE(review): the endpoint and the 'aldtype' payload key are kept
    # exactly as written. The accompanying notes report that this crawl does
    # not succeed; verify the URL and form field names against the live site.
    post_url = 'https://fanyi.baidu.com/langdetect'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }

    word = input('enter a word:')
    data = {
        'aldtype': word
    }

    # Unlike GET, the form fields of a POST request go in `data`, not `params`.
    # timeout is passed because requests does not time out unless one is given.
    response = requests.post(url=post_url, data=data, headers=headers, timeout=10)
    # Fail loudly on HTTP 4xx/5xx before trying to parse the body.
    response.raise_for_status()

    # .json() is a method call (mind the parentheses); it parses the body as
    # JSON, so it is only appropriate when the endpoint actually returns JSON.
    word_mean = response.json()

    file_name = word + '.json'
    # The context manager closes (and flushes) the file; the original left
    # the handle open.
    with open(file_name, 'w', encoding='utf-8') as fp:
        # ensure_ascii=False writes non-ASCII characters (e.g. Chinese)
        # as-is instead of \uXXXX escapes.
        json.dump(word_mean, fp=fp, ensure_ascii=False)
    print('翻译成功!!')
爬取失败的原因是因为反爬机制对返回数据进行了加密!!!
4.肯德基餐厅查询-post请求
注意:post请求的url要写全!!!get请求写到 ?之前
#post请求的url要写全,get请求的url只用写到?之前
import requests
import json
if __name__ == '__main__':
    # Look up KFC restaurants by keyword via a POST request and store the
    # JSON reply as "<keyword>肯德基餐厅.json".

    # For this POST request the URL is written in full, including the
    # "?op=keyword" query string.
    post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }

    keyword = input('enter an adress:')
    # Form fields sent in the request body; only 'keyword' comes from the
    # user, the page index and page size are fixed to the first 10 results.
    data = {
        'cname': '',
        'pid': '',
        'keyword': keyword,
        'pageIndex': '1',
        'pageSize': '10'
    }

    # timeout is passed because requests does not time out unless one is given.
    response = requests.post(url=post_url, data=data, headers=headers, timeout=10)
    # Fail loudly on HTTP 4xx/5xx before trying to parse the body.
    response.raise_for_status()
    list_data = response.json()

    file_name = keyword + '肯德基餐厅.json'
    # The context manager closes (and flushes) the file; the original left
    # the handle open.
    with open(file_name, 'w', encoding='utf-8') as fp:
        # ensure_ascii=False keeps Chinese text readable in the output file.
        json.dump(list_data, fp=fp, ensure_ascii=False)
    print('over!!')
页面效果如下