最近在跟着网上的视频复习爬虫,记录一下
一、简易网页采集器
爬取搜狗搜索中对应词条的搜索结果页面
# -*-coding:utf-8-*-
import requests
# UA spoofing: portal sites inspect the User-Agent of each request to
# filter out non-browser clients, so we send a browser-like header.
if __name__ == '__main__':
    # BUG FIX: the original built this dict but never passed it to
    # requests.get, and the UA string itself was mangled by the paste
    # (spaces inserted inside tokens, e.g. "Mozilla / 5.0(X11;Linuxx86_64)").
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    url = "https://www.sogou.com/web"
    # Pack the query-string parameters into a dict; requests encodes them.
    kw = input('enter a key word:')
    param = {
        'query': kw
    }
    # GET the results page, sending both the query params and the UA header.
    response = requests.get(url, params=param, headers=header)
    page_txt = response.text
    filename = kw + ".html"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(page_txt)
    print(filename, "save successful!!!")
二、爬取百度翻译
# -*-coding:utf-8-*-
import requests
import json
# The /sug endpoint answers the page's ajax (partial-refresh) POSTs.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    url = 'https://fanyi.baidu.com/sug'
    kw_v = input("输入待翻译的词:")
    # Form field expected by the endpoint.
    data = {
        'kw': kw_v
    }
    # BUG FIX: requests.post(url, data, headers) binds `headers` to the
    # third positional parameter, which is `json=`, so the UA header was
    # never sent (and the headers dict was posted as a JSON body instead).
    response = requests.post(url, data=data, headers=headers)
    # The response body is JSON text; .json() parses it into a Python object.
    print(response.text)
    print(response.json())
    dict_obj = response.json()
    # with-block closes the file (the original leaked the handle).
    # ensure_ascii=False keeps the Chinese text human-readable in the file.
    with open(kw_v + ".json", 'w', encoding='utf-8') as f:
        json.dump(dict_obj, f, ensure_ascii=False)
    print("success!")
三、爬取豆瓣电影分类排行榜中电影详情数据
# -*-coding:utf-8-*-
import requests
import json
if __name__ == '__main__':
    # BUG FIX: the original 'User-Agent' value was a pasted request URL,
    # not a browser identifier — the server would see an obviously fake UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    url = "https://movie.douban.com/j/chart/top_list"
    # Query parameters: category type 26, rating interval 100:90,
    # starting at item 0, 20 items per page.
    param = {
        'type': '26',
        'interval_id': '100:90',
        'action': '',
        'start': '0',
        'limit': '20',
    }
    response = requests.get(url, params=param, headers=headers)
    list_data = response.json()
    # with-block guarantees the file is flushed and closed
    # (the original never closed it).
    with open('./douban_.json', 'w', encoding='utf-8') as f:
        json.dump(list_data, f, ensure_ascii=False)
    print("success!!")
四、肯德基餐厅查询中指定地点的餐厅信息
# -*-coding:utf-8-*-
import requests
import json
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
    ke_v = input("输入要搜索的地址keyword:")
    # Form fields expected by the store-list endpoint: keyword search,
    # first page, 10 stores per page.
    param = {
        'cname': '',
        'pid': '',
        'keyword': ke_v,
        'pageIndex': '1',
        'pageSize': '10'
    }
    # BUG FIX: the third positional argument of requests.post is `json=`,
    # not headers — pass both data and headers by keyword.
    response = requests.post(url, data=param, headers=headers)
    # BUG FIX: the original passed response.text (a str) to json.dump,
    # which double-encodes and writes one quoted JSON string; parse the
    # body first so the file contains the actual JSON object.
    dict_data = response.json()
    # with-block closes the file (the original leaked the handle).
    with open(ke_v + '.json', 'w', encoding='utf-8') as f:
        json.dump(dict_data, f, ensure_ascii=False)
    print('success!!!!!')
五、爬取国家药品监督管理总局中华人民共和国化妆品生产许可证相关数据
# -*-coding:utf-8-*-
import requests
import json
import os
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"
    # Form fields for the list endpoint: first page, 15 items, no filters.
    param = {
        'on': 'true',
        'page': '1',
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    path = "./化妆品"
    # exist_ok makes the check-then-create race-free.
    os.makedirs(path, exist_ok=True)
    # BUG FIX: requests.post(url, param, headers) bound `headers` to the
    # `json=` positional parameter on both POSTs — pass both by keyword.
    list_resp = requests.post(url, data=param, headers=headers)
    # Detail endpoint; hoisted out of the loop since it never changes.
    url_new = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
    # One detail request per company in the list page.
    for item in list_resp.json()['list']:
        param_new = {
            "id": item['ID']
        }
        response_new = requests.post(url_new, data=param_new, headers=headers)
        dict_data = response_new.json()
        # with-block closes each file; the original leaked one open
        # handle per loop iteration.
        with open(os.path.join(path, str(item['EPS_NAME']) + ".json"), 'w', encoding='utf-8') as f:
            json.dump(dict_data, f, ensure_ascii=False)
        print('{} success'.format(item['EPS_NAME']))