Q&A
Possibly the requested URL returns plain text rather than JSON, so it cannot be parsed as JSON; print res.text to check what actually came back (see the sketch after these answers).
Anti-crawling mechanism: User-Agent (UA) spoofing.
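A minimal sketch of the first answer above: check the Content-Type before calling .json(), and fall back to printing res.text when the body is not JSON (the helper name and the 200-character preview are my own choices, not part of the exercises):

import requests

def fetch_json_or_text(url, **kwargs):
    # Return parsed JSON only when the server actually declares JSON;
    # otherwise print a preview of the raw body and return it as text.
    res = requests.get(url, **kwargs)
    if 'application/json' in res.headers.get('Content-Type', ''):
        return res.json()
    print(res.text[:200])  # inspect what the server really sent
    return res.text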
1. First crawler exercise: caching a Baidu keyword search page
Key point: the anti-crawling mechanism and UA spoofing.
# First crawler exercise: crawl a web page and save it as an HTML file
# 1. Key point: the requests GET method
# 2. Key point: anti-crawling mechanism, UA spoofing
import requests


def request_baidu():
    url = 'http://www.baidu.com/s'
    keyword = input('Enter the keyword to search for: ')
    param = {
        'wd': keyword
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    res = requests.get(url=url, params=param, headers=header)
    print(res.text)  # the response is HTML, not JSON, so res.json() would fail here
    with open(f'{keyword}.html', 'wt+', encoding='utf-8', errors='ignore') as f:
        f.write(res.text)


if __name__ == '__main__':
    request_baidu()
2. Baidu Translate: using POST
1. requests POST passes parameters through the data argument
2. When the response body can be parsed as JSON, use the .json() method to decode it
3. json.dumps returns a string that can be assigned to a variable; json.dump writes straight to a file object via its fp argument (a short sketch follows this list)
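A small sketch contrasting point 3 (the sample dictionary and file names are made up):

import json

data = {'kw': 'dog', 'results': ['dog', 'puppy']}

# json.dumps returns a string, which can be held in a variable and written manually
text = json.dumps(data, ensure_ascii=False)
with open('dumps_example.json', 'wt', encoding='utf-8') as f:
    f.write(text)

# json.dump serializes straight into a file object passed as fp
with open('dump_example.json', 'wt', encoding='utf-8') as f:
    json.dump(data, fp=f, ensure_ascii=False)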
# Second crawler exercise: inspect the (JSON) result returned by Baidu Translate and save it
# 1. Key point: the requests POST method
# 2. Key point: JSON parsing and string encoding
import requests
import json


def request_baidu_translation():
    url = 'https://fanyi.baidu.com/sug'
    keyword = input('Enter the keyword to translate: ')
    param = {
        'kw': keyword
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    res = requests.post(url=url, data=param, headers=header)
    with open(f'{keyword}.json', 'wt+', encoding='utf-8', errors='ignore') as f:
        a = json.dumps(res.json(), ensure_ascii=False)
        f.write(a)


if __name__ == '__main__':
    request_baidu_translation()
3. Crawling the Douban chart: analyzing its dynamic Ajax request
Passing multiple query parameters with GET (a paging sketch follows the code below)
# Third crawler exercise: crawl the Douban chart through its dynamic Ajax endpoint
# 1. Key point: analyzing a request with multiple parameters
import requests
import json


def request_douban_chart():
    url = 'https://movie.douban.com/j/chart/top_list'
    param = {
        'type': 24,
        'interval_id': '100:90',
        'action': '',
        'start': 0,
        'limit': 100
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    res = requests.get(url=url, params=param, headers=header)
    print(res.json())
    return res.json()


def write_json(res_json):
    filename = input('Enter a file name to save to: ')
    with open(f'{filename}.json', 'wt+', encoding='utf-8', errors='ignore') as f:
        a = json.dumps(res_json, ensure_ascii=False)
        f.write(a)


if __name__ == '__main__':
    res_json = request_douban_chart()
    write_json(res_json=res_json)
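Because the endpoint takes start and limit, the chart can also be fetched in slices rather than in one large request. A hedged sketch, assuming the endpoint keeps returning a JSON array for higher start offsets (the page size of 20 and the page count are my own choices):

import requests

def crawl_douban_chart_paged(pages=3, page_size=20):
    # Walk the Douban top list by moving the `start` offset one slice at a time.
    url = 'https://movie.douban.com/j/chart/top_list'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    movies = []
    for page in range(pages):
        param = {
            'type': 24,
            'interval_id': '100:90',
            'action': '',
            'start': page * page_size,  # offset grows with each request
            'limit': page_size
        }
        res = requests.get(url=url, params=param, headers=header)
        movies.extend(res.json())  # assumption: each response body is a JSON list of movies
    return movies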
4. Crawling the KFC store list: reviewing what we've learned
# Fourth crawler exercise: query the KFC store-list API (JSON result) and save it
# 1. Key point: reviewing and consolidating the techniques above
import requests
import json


def request_kfc_stores():
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    filename = input('File name to save to: ')
    param = {
        'cname': '',
        'pid': '',
        'keyword': '郫',  # hard-coded search keyword; the input above is only the output file name
        'pageIndex': 1,
        'pageSize': 10
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    res = requests.post(url=url, data=param, headers=header)
    with open(f'{filename}.json', 'wt+', encoding='utf-8', errors='ignore') as f:
        a = json.dumps(res.json(), ensure_ascii=False)
        f.write(a)


if __name__ == '__main__':
    request_kfc_stores()
5. NMPA (drug administration) practice: data parsing and storage
# Fifth crawler exercise: crawl the NMPA licence list, fetch the detail JSON for each entry, and save it
# 1. Key point: combining the techniques from the exercises above
import requests
import json
import os


def request_nmpa_list():
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    data_list = []
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    for i in range(1, 3):
        param = {
            'on': 'true',
            'page': i,
            'pageSize': 15,
            'conditionType': 1
        }
        res_all_id = requests.post(url=url, data=param, headers=header)
        data_list.append(res_all_id.json())
    print(data_list)
    if not os.path.exists('Drug_Administration_Data'):
        os.mkdir('Drug_Administration_Data')
    else:
        print("Dir exists")
    for page_no, page in enumerate(data_list):
        if not os.path.exists(f'./Drug_Administration_Data/Page_{page_no}'):
            os.mkdir(f'./Drug_Administration_Data/Page_{page_no}')
        else:
            print("Dir exists")
        for item in page['list']:  # iterate over the entries actually returned instead of assuming 15
            with open(f"./Drug_Administration_Data/Page_{page_no}/{item['EPS_NAME']}.json", 'wt+',
                      encoding='utf-8', errors='ignore') as f:
                json.dump(detail_crawl(item['ID']), fp=f, ensure_ascii=False)


def detail_crawl(ID_EPS: str):
    url_detail = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    param = {
        'id': ID_EPS,
    }
    res_all_id = requests.post(url=url_detail, data=param, headers=header)
    return res_all_id.json()


if __name__ == '__main__':
    request_nmpa_list()
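One caveat about the storage step above: EPS_NAME is a company name and may contain characters that are not legal in file names. A hedged sketch of a helper the name could be run through before building the path; the replacement rule is my own choice, not part of the original exercise:

import re

def safe_filename(name: str) -> str:
    # Replace characters that Windows/Linux reject in file names with underscores.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

# e.g. f"./Drug_Administration_Data/Page_{page_no}/{safe_filename(item['EPS_NAME'])}.json"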
Notion notes:
https://elastic-scion-778.notion.site/55b704dcb84b4279933cba3b14e6525b
Project source:
https://www.bilibili.com/video/BV1Yh411o7Sz?p=11