1.爬取搜狗页面源码信息并保存为html
import requests

if __name__ == "__main__":
    # Target URL: Sogou homepage.
    url = 'https://www.sogou.com/'
    # Send the GET request and read the response body as text.
    response = requests.get(url=url)
    page_text = response.text
    print(page_text)
    # Persist the page source to a local HTML file
    # (context manager guarantees the file is closed).
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('爬取数据结束!')
2.爬取指定关键字的搜狗检索页面信息并保存为html
问号前面的存在url里
问号后面的存在字典里(问号后面包含哪些信息可以在页面检查中找到,此例中为'query')
import requests

if __name__ == "__main__":
    # Base URL: everything before the '?' of the search URL;
    # the query string is supplied via `params` instead.
    url = 'https://www.sogou.com/web'
    kw = input('enter a word:')
    # 'query' is the parameter name the Sogou search page uses
    # (found by inspecting the request in the browser dev tools).
    param = {
        'query': kw
    }
    response = requests.get(url=url, params=param)
    page_text = response.text
    # Save the result page under '<keyword>.html'.
    fileName = kw + '.html'
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(fileName, '保存成功')
添加UA伪装(User-Agent伪装,伪装成浏览器发出的请求,而不是爬虫程序):
import requests

if __name__ == "__main__":
    # UA spoofing: present a real browser User-Agent so the request
    # is not rejected as an obvious bot. (Copy it from any request in
    # the browser's Network panel.)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
    }
    # Base URL (part before the '?'); the query goes in `params`.
    url = 'https://www.sogou.com/web'
    kw = input('enter a word:')
    param = {
        'query': kw
    }
    response = requests.get(url=url, params=param, headers=headers)
    page_text = response.text
    # Save the result page under '<keyword>.html'.
    fileName = kw + '.html'
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(fileName, '保存成功')
3.破解百度翻译
翻译后与翻译前的url不变,代表是一个ajax请求,所以要在XHR下面找数据包。
返回来的是json数据
requests.get与requests.post的区别:Get是用来从服务器上获得数据,而Post是用来向服务器上传递数据(可以携带参数)。本次是post请求。
response返回的是字典
import requests
import json

if __name__ == "__main__":
    # Baidu translate 'sug' endpoint is an Ajax POST request
    # (URL does not change in the browser; packet found under XHR).
    post_url = 'https://fanyi.baidu.com/sug'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
    }
    word = input('enter a word:')
    # POST form payload; 'kw' is the field name the endpoint expects.
    data = {
        'kw': word
    }
    response = requests.post(url=post_url, data=data, headers=headers)
    # Response is JSON, so .json() deserializes it to a dict.
    dic_obj = response.json()
    fileName = word + '.json'
    # `with` fixes the original's unclosed file handle;
    # ensure_ascii=False keeps Chinese characters readable in the file.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print('over!')
确定类型是json才能使用response.json。
4.爬取豆瓣电影Top250
参数:
response最外边一层返回的是列表
import requests
import json

if __name__ == "__main__":
    # Base URL only (part before the '?'); query goes in `params`.
    url = 'https://movie.douban.com/j/chart/top_list'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
    }
    # start=0, limit=20 -> movies ranked 1 through 20.
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '0',
        'limit': '20'
    }
    response = requests.get(url=url, params=param, headers=headers)
    # The endpoint returns a JSON array, so this is a Python list.
    list_data = response.json()
    # `with` fixes the original's unclosed file handle;
    # ensure_ascii=False keeps Chinese titles readable in the file.
    with open('./douban.json', 'w', encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)
    print('over')
保存后的json格式可以用json在线解析查看