JSON
urllib.request:发送request和获取request结果
urllib.request.urlopen():构造http请求,模拟浏览器发送请求过程
json.loads():用于解析有效的JSON字符串并将其转换为Python字典。
json.loads() 实例
```python
import json

# A JSON document held as an ordinary Python string.
employee = '{"id":"09", "name":"Nitin", "department":"Finance"}'

# json.loads() parses valid JSON text and returns the matching Python
# object — here a dict with string keys and string values.
employee_dict = json.loads(employee)
print(employee_dict)
print(employee_dict['name'])
# Output (dicts preserve insertion order since Python 3.7):
# {'id': '09', 'name': 'Nitin', 'department': 'Finance'}
# Nitin
```
例:
爬取“https://movie.douban.com/”:
1.F12打开控制台–>网络,刷新,打开右侧XHR,依次查看链接找到需要的页面
如下:
2.运行代码
import urllib.request
import json

# Douban JSON API: 50 popular ("热门") movies starting at offset 0.
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0'

# Douban answers HTTP 418 when no browser-like User-Agent is sent
# (anti-crawler check), so build the request with spoofed headers.
# BUG FIX: the original set the User-Agent to a URL, not a UA string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0 Safari/537.36'}
req = urllib.request.Request(url, headers=headers)

# Send the request; the context manager closes the connection afterwards.
with urllib.request.urlopen(req) as response:
    # Parse the JSON body into a Python dict.
    hjson = json.loads(response.read())

# "subjects" holds one dict per movie.
for item in hjson["subjects"]:
    # Print each movie's rating, title and detail-page URL.
    print(item["rate"], item["title"], item['url'])
改进版:
可自己选择需爬取页数,爬取多页,将爬取内容写入csv文件
待解决问题:
1.csv文件曾用Excel打开出现乱码;采用utf_8_sig编码(带BOM)后Excel通常能正确识别UTF-8,若仍乱码可用记事本或在程序中打开
2.当存在某项无想输出内容时直接报错,还未进行判断
```python
import urllib.request
import json
import random
import time
import requests
import pandas as pd
def urls(num):
    """Fetch page *num* of the site's JSON feed, append it to a CSV file
    and return the decoded payload.

    Parameters
    ----------
    num : int
        Zero-based page index passed to the paging API.

    Returns
    -------
    The JSON payload decoded into Python objects (a list of post dicts).
    """
    # JSON endpoint for the requested page.
    url = f'https://yoopu.me/api/posts/new?page={num}'
    # Pool of browser User-Agent strings; one is chosen at random per
    # request so the site's anti-crawler check (HTTP 418 when no UA is
    # sent) never sees a fixed client signature.
    # BUG FIX: the original list was missing the comma after the first
    # entry, which silently concatenated the first two strings into one
    # invalid User-Agent.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50",
        "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
        "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12 "
    ]
    # Proxy pool to spread requests across IPs and avoid a ban; only the
    # (commented-out) requests-based call below can use it.
    IP_AGENTS = [
        {'HTTP': '220.168.52.245:53548'},
        {'HTTP': '14.215.212.37:9168'},
    ]
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    # Build the urllib request with the spoofed headers.
    req = urllib.request.Request(url, headers=headers)
    # requests-based alternative that routes through the proxy pool:
    # res = requests.get(url, headers=headers, proxies=random.choice(IP_AGENTS))
    # Sleep a random interval so pages are not hammered back-to-back.
    time.sleep(random.uniform(0.5, 1.5))
    # Send the request and decode the JSON body into Python objects;
    # the context manager closes the connection when done.
    with urllib.request.urlopen(req) as resp:
        payload = json.loads(resp.read())
    # Append this page to the CSV file.  utf_8_sig writes a BOM so Excel
    # can auto-detect UTF-8.  BUG FIX: write the column header only for
    # the first page, otherwise mode='a' repeats it on every append.
    df = pd.DataFrame(payload)
    df.to_csv(r'F:\anaconda_code\pachong.csv', mode='a',
              encoding='utf_8_sig', header=(num == 0))
    return payload
if __name__ == '__main__':
    # Crawl the first five pages and print the title of every post found.
    for page in range(5):
        posts = urls(page)
        print("第", page, '页')
        for post in posts:
            print(post['title'])
```
借鉴:
爬虫代码 https://zhuanlan.zhihu.com/p/265602471
json.loads() https://vimsky.com/examples/usage/json-loads-in-python.html