The requests module
Introduction
- Python web crawlers generally send network requests with one of two modules: urllib or requests. urllib is the older of the two and is comparatively verbose and awkward to use. Once requests appeared it quickly displaced urllib, so this course recommends requests.
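To make the contrast concrete, here is a minimal sketch fetching the same page with both modules (the Sogou home page is used purely as an example URL):

import urllib.request
import requests

url = 'https://www.sogou.com/'

# urllib: open the request and decode the raw bytes yourself
with urllib.request.urlopen(url) as resp:
    html_via_urllib = resp.read().decode('utf-8')

# requests: one call, decoding is handled for you
html_via_requests = requests.get(url).text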
The requests workflow
- Specify the URL
- Initiate the request with requests (GET/POST)
- Extract the data from the response object
- Persist the data (the skeleton right after this list maps the four steps onto code)
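A minimal skeleton of these four steps, assuming a placeholder URL that you would swap for the real target:

import requests

# 1. specify the URL (placeholder only)
url = 'https://example.com/'

# 2. initiate the request (GET here; requests.post works the same way)
response = requests.get(url=url)

# 3. extract the data from the response object
page_text = response.text

# 4. persist it
with open('page.html', 'w', encoding='utf-8') as f:
    f.write(page_text)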
Crawling the entire Sogou home page
import requests

url = 'https://www.sogou.com/'
# verify=False skips TLS certificate verification (handy behind some proxies, but it triggers a warning)
response = requests.get(url=url, verify=False)
page_text = response.text
print(page_text)
with open('sougou.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
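response.text decodes the body with the encoding requests inferred from the response headers. If the Chinese characters come back garbled, one common fix is to re-decode using the encoding detected from the body itself:

response.encoding = response.apparent_encoding  # use the charset detected from the content
page_text = response.text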
Custom keyword searches through the Sogou engine
This requires UA (User-Agent) spoofing: requests' default User-Agent is easily flagged as a crawler, so we send a browser-style UA header instead.
import requests

# pretend to be a regular browser (UA spoofing)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
url = 'https://www.sogou.com/web?'
kw = input('Enter the term to search for >>> ')
# query parameters are passed as a dict and URL-encoded by requests
param = {
    'query': kw
}
response = requests.get(url=url, params=param, headers=headers, verify=False)
page_text = response.text
print(page_text)
with open(f'{kw}.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
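requests URL-encodes the params dict and appends it to the URL for you; printing response.url is a quick way to confirm the final request URL:

print(response.url)  # e.g. https://www.sogou.com/web?query=%E7%88%AC%E8%99%AB for the query 爬虫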
Scraping Baidu Translate
import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
post_url = 'https://fanyi.baidu.com/sug'
word = input('Enter the word to translate >>>: ')
# POST form data goes in the data dict
data = {
    'kw': word
}
response = requests.post(url=post_url, data=data, headers=headers, verify=False)
# the endpoint returns JSON, so response.json() gives a dict directly
dic_obj = response.json()
print(dic_obj, type(dic_obj))
with open(f'{word}.json', 'w', encoding='utf-8') as f:
    json.dump(dic_obj, f, ensure_ascii=False)
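ensure_ascii=False is what keeps the Chinese text readable in the saved file; with the default setting every non-ASCII character is escaped:

import json

print(json.dumps({'kw': '狗'}))                      # {"kw": "\u72d7"}
print(json.dumps({'kw': '狗'}, ensure_ascii=False))  # {"kw": "狗"}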
Douban movie chart details
import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
url = 'https://movie.douban.com/j/chart/top_list?'
params = {
    'type': 24,              # chart category
    'interval_id': '100:90',
    'action': '',
    'start': '0',            # offset of the first record
    'limit': 20              # number of records per request
}
response = requests.get(url=url, params=params, headers=headers, verify=False)
list_data = response.json()
print(list_data)
with open('douban.json', 'w', encoding='utf-8') as f:
    json.dump(list_data, f, ensure_ascii=False)
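The start/limit parameters look like ordinary offset pagination, so more of the chart can be collected by moving start forward in steps of limit. A hedged sketch (the 200-record cap is an arbitrary assumption, not something the API documents):

all_movies = []
for start in range(0, 200, 20):     # assumed cap of 200 records, purely illustrative
    params['start'] = str(start)
    resp = requests.get(url=url, params=params, headers=headers, verify=False)
    batch = resp.json()
    if not batch:                    # an empty list means we ran past the end of the chart
        break
    all_movies.extend(batch)
print(len(all_movies))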
KFC restaurant locations
import requests
import json
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
store = input('Enter a keyword for the store you are looking for >>>: ')
city = input('Enter the city to search in (e.g. 上海) >>>: ')
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
data = {
    'cname': '',
    'pid': '',
    'keyword': store,
    'pageIndex': 1,
    'pageSize': 10,
}
response = requests.post(url=url, data=data, headers=headers, verify=False)
# the body is JSON text; Table1 holds the individual store records
n = json.loads(response.text)
print(n.keys())
stores = n['Table1']
print(stores)

# keep only the stores whose province/city field matches the requested city
store_list = []
address_list = []
provinceName_list = []
for j in stores:
    if city in j['provinceName']:
        store_list.append(j['storeName'])
        address_list.append(j['addressDetail'])
        provinceName_list.append(j['provinceName'])

# build the DataFrame from the filtered columns
df = pd.DataFrame({
    'storeName': store_list,
    'addressDetail': address_list,
    'provinceName': provinceName_list,
})
if df.empty:
    print('Sorry, no matching records were found')
else:
    print(df)
df.to_excel(f'{city}-{store}-kfc-stores.xlsx', index=False, freeze_panes=(1, 0))
NMPA (National Medical Products Administration) cosmetics production licence information
The data here is loaded dynamically, so it has to be fetched from the Ajax endpoints rather than from the page HTML, as sketched below.
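Before writing the crawler it is worth confirming that the records really are Ajax-loaded. A hedged sketch (the portal URL is an assumption derived from the Ajax endpoints used below, and you would substitute a company name actually visible in the browser):

import requests

portal = 'http://scxk.nmpa.gov.cn:81/xk/'   # assumed portal page URL
html = requests.get(portal, verify=False).text
# a name you can see in the browser will not be found in the raw HTML,
# because the table is rendered by JavaScript from Ajax responses
print('<company name seen in the browser>' in html)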
import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
# step 1: the list endpoint returns one page of companies at a time; collect every company ID
post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
id_list = []
for page in range(1, 15):
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': 15,
        'productName': '',
        'conditionType': 1,
        'applyname': '',
        'applysn': '',
    }
    response = requests.post(url=post_url, data=data, headers=headers, verify=False)
    json_id = response.json()
    for i in json_id['list']:
        print(i['ID'])
        id_list.append(i['ID'])

# step 2: the detail endpoint returns the full licence record for one ID
all_data_lst = []
post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
for id in id_list:
    data = {
        'id': id
    }
    response = requests.post(url=post_url, data=data, headers=headers, verify=False)
    detail_json = response.json()
    all_data_lst.append(detail_json)

# step 3: persist the fields we care about as tab-separated lines
with open('cosmetics_licence_info.txt', 'a', encoding='utf-8') as f:
    for i in all_data_lst:
        content = i['epsName'] + '\t' + i['epsProductAddress'] + '\t' + i['legalPerson'] + '\n'
        f.write(content)