数据提取方法
json
- 数据交换格式,看起来像python类型(列表、字典)的字符串
- 使用json之前,需要导入
import json
哪里会返回json的数据
- 浏览器切换到手机版
- 抓包app
json.loads
把json字符串转化为python类型
json.loads(json字符串)
# Baidu Translate example: POST a Chinese phrase to the translation endpoint
# and print the English text found in the JSON reply.
import json

import requests

url = "http://fanyi.baidu.com/basetrans"
# Mobile-browser User-Agent string sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36"
}

# Ask the user for the phrase to translate (Chinese -> English).
query = input("请输入中文:")
data = {"query": query, "from": "zh", "to": "en"}

response = requests.post(url, headers=headers, data=data)
txt = response.content.decode()

# json.loads turns the JSON text into Python dicts/lists.
txtj = json.loads(txt)

# NOTE(review): assumes the reply has the shape {"trans": [{"dst": ...}, ...]}
# and that the first entry is the wanted translation -- confirm against a
# live response.
print(txtj["trans"][0]["dst"])
json.dumps
把python类型转化为json字符串
json.dumps({"a": "a", "b": "b"})
写入文件时使用 json.dumps(python数据, ensure_ascii=False, indent=2)
ensure_ascii=False:让中文原样输出(默认为 True 时,中文会被转义成 \uXXXX 形式)
indent:每一层嵌套相对上一层缩进的空格数(用于换行并美化输出)
# Fetch one page of a Douban list API, print the parsed JSON, and save a
# pretty-printed copy to douban.txt.
import json

import requests

page_url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_movie_sci-fi_hot/items?os=ios&for_mobile=1&start=0&count=18&loc_id=108288&_=0"
# Alternative URL kept for reference; it differs only by `callback=jsonp1`
# (presumably the JSONP variant of the same endpoint -- confirm):
# page_url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_movie_sci-fi_hot/items?os=ios&for_mobile=1&callback=jsonp1&start=0&count=18&loc_id=108288&_=0"

request_headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36",
    "Referer": "https://music.douban.com/tag/%E6%AC%A7%E7%BE%8E",
}

reply = requests.get(page_url, headers=request_headers)
body_text = reply.content.decode()

# Parse the JSON text into Python objects and show them.
payload = json.loads(body_text)
print(payload)

# Convert back to a string before writing; ensure_ascii=False and indent=2
# are passed so the saved text is not ASCII-escaped and is indented.
with open("douban.txt", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(payload, ensure_ascii=False, indent=2))
豆瓣电影爬虫案例
# Douban list crawler: walk the paged JSON API and append every returned item
# to douban.json, one JSON object per line.
import json

# URL template for one page; `start` is the item offset, `count` the page size.
PAGE_URL_TEMPLATE = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_movie_sci-fi_hot/items?os=ios&for_mobile=1&start={start}&count={count}&loc_id=108288&_=0"
PAGE_SIZE = 18        # items requested per page; also the step between offsets
STOP_OFFSET = 118     # crawl offsets 0, 18, ... while offset < STOP_OFFSET
REQUEST_TIMEOUT = 10  # seconds to wait before giving up on a stalled request


def geturl(num):
    """Return the list-API URL for the page starting at item offset *num*.

    Also prints a short progress message.
    """
    url = PAGE_URL_TEMPLATE.format(start=num, count=PAGE_SIZE)
    print("获取RUL完毕")
    return url


def getjson(url):
    """Fetch *url* and return the ``subject_collection_items`` entry of its JSON reply.

    Raises:
        KeyError: if the decoded reply has no ``subject_collection_items`` key.
        ValueError: if the response body is not valid JSON.
        requests.RequestException: on network failure or timeout.
    """
    # Imported here so geturl/writejson stay importable without the HTTP
    # dependency installed.
    import requests

    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36",
        "Referer": "https://music.douban.com/tag/%E6%AC%A7%E7%BE%8E",
    }
    # Without a timeout a stalled connection would block the crawl forever.
    douban_get = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    douban_content = douban_get.content.decode()
    douban_json = json.loads(douban_content)
    return douban_json["subject_collection_items"]


def writejson(json_content):
    """Append each element of *json_content* to douban.json as one JSON line.

    The file is opened in append mode, so repeated calls accumulate lines.
    """
    with open("douban.json", "a", encoding="utf-8") as f:
        for item in json_content:
            f.write(json.dumps(item, ensure_ascii=False))
            f.write("\n")


def main():
    """Crawl every page offset below STOP_OFFSET and save the items."""
    for num in range(0, STOP_OFFSET, PAGE_SIZE):
        # 1. Build the page URL.
        url = geturl(num)
        print(url)
        # 2. Download and parse the page's items.
        getjson_content = getjson(url)
        # 3. Append them to the output file.
        writejson(getjson_content)


# Guarded so importing this snippet does not start a crawl.
if __name__ == "__main__":
    main()