简单地爬取一个网页页面信息然后存储。
import requests  # fix: this snippet used requests without importing it

if __name__ == '__main__':
    # Fetch the Sogou homepage and persist the raw HTML for offline inspection.
    url = "https://www.sogou.com/"
    # Plain GET request; response.text decodes the body using the detected charset.
    response = requests.get(url=url)
    page_text = response.text
    print(page_text)
    # `with` guarantees the file handle is closed even if the write fails.
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print("pachu")
requests模块:功能强大、简单便捷、效率极高
如何使用:模仿浏览器发起请求
—指定url
—发起请求
—获取响应数据
—持久化存储数据
在PyCharm端输入信息:实现动态访问信息。与此同时伪装User-Agent信息
并保存信息
import requests

if __name__ == '__main__':
    # Query Sogou web search for a user-supplied keyword while spoofing the
    # User-Agent header (UA masquerade), then save the result page
    # as <keyword>.html.
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    keyword = input('enter a word:')
    # The query string is supplied via `params`, so requests encodes it safely.
    resp = requests.get(
        url='https://www.sogou.com/web?',
        params={'query': keyword},
        headers=ua_headers,
    )
    out_name = keyword + '.html'
    with open(out_name, 'w', encoding='utf-8') as fp:
        fp.write(resp.text)
    print(out_name, '保存成功!!!')
爬虫,爬取翻译
import requests
import json

if __name__ == '__main__':
    # Translate a user-supplied word via Baidu Fanyi's suggestion endpoint
    # and persist the JSON response to <word>.json.
    # 1. Target URL.
    post_url = 'https://fanyi.baidu.com/sug'
    # 2. Spoof the User-Agent so the request looks like a real browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    # 3. POST payload (handled the same way as GET query params).
    word = input('杜金明的中英文转换器:')
    data = {
        'kw': word
    }
    # 4. Send the request.
    response = requests.post(url=post_url, data=data, headers=headers)
    # 5. response.json() parses the body into a Python object; only valid
    #    because this endpoint is known to return JSON.
    dic_obj = response.json()
    print(dic_obj)
    # 6. Persist. Bug fix: the original passed encoding='utf-' (an invalid
    #    codec name that raises LookupError) and never closed the file.
    fileName = word + '.json'
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print("over!!!")
import requests
import json

if __name__ == '__main__':
    # Fetch a page of Douban's movie chart JSON API and save it to disk.
    url = 'https://movie.douban.com/j/chart/top_list?'
    # Query parameters: genre type 24, rating interval 100:90,
    # pagination offset 220 with a page size of 20.
    params = {
        'type': '24',
        'interval_id': '100:90',
        'action': ' ',
        'start': '220',
        'limit': '20'
    }
    # Spoof the User-Agent so the request looks like a real browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    response = requests.get(url=url, params=params, headers=headers)
    list_data = response.json()
    # Bug fix: the original opened the file without ever closing it;
    # `with` guarantees the handle is released.
    with open('./douban.json', 'w', encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)
    print('over!!!')