jsonpath解析

jsonpath在爬虫中专门用于解析json数据,是xpath在json中的应用

jsonpath基本语法

相关链接
jsonpath和xpath的区别:

  • jsonpath只能解析本地数据,xpath可以解析网页数据和本地数据
  • 基本语法相似,但又有所不同
    jsonpath基本语法:
    在这里插入图片描述

实例

json文件如下:

{ "store": {
    "book": [
      { "category": "reference",
        "author": "Nigel Rees",
        "title": "Sayings of the Century",
        "price": 8.95
      },
      { "category": "fiction",
        "author": "Evelyn Waugh",
        "title": "Sword of Honour",
        "price": 12.99
      },
      { "category": "fiction",
        "author": "Herman Melville",
        "title": "Moby Dick",
        "isbn": "0-553-21311-3",
        "price": 8.99
      },
      { "category": "fiction",
        "author": "J. R. R. Tolkien",
        "title": "The Lord of the Rings",
        "isbn": "0-395-19395-8",
        "price": 22.99
      }
    ],
    "bicycle": {
      "color": "red",
      "price": 19.95,
      "author": "老妈"
    }
  }
}

jsonpath解析:

import jsonpath
import json

# Load the sample document. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open until interpreter exit.
with open('爬虫_解析_jsonpath.json', 'r', encoding='utf-8') as fp:
    obj = json.load(fp)
print(obj)

# Authors of every book in the store
author_list = jsonpath.jsonpath(obj, '$.store.book[*].author')
print(author_list)

# Every author anywhere in the document (".." searches at any depth)
author2_list = jsonpath.jsonpath(obj, '$..author')
print(author2_list)

# Every direct child of store
tag_list = jsonpath.jsonpath(obj, '$.store.*')
print(tag_list)

# Every price anywhere under store
price_list = jsonpath.jsonpath(obj, '$.store..price')
print(price_list)

# The third book (indexes are zero-based)
book = jsonpath.jsonpath(obj, '$..book[2]')
print(book)

# The last book
book = jsonpath.jsonpath(obj, '$..book[(@.length-1)]')
print(book)

# The first two books, once as an index list and once as a slice
book_list = jsonpath.jsonpath(obj, '$..book[0,1]')
print(book_list)
book_list2 = jsonpath.jsonpath(obj, '$..book[:2]')
print(book_list2)

# A filter expression is written as ?( ... ) inside the brackets.
# Books that have an isbn field
book_isbn_list = jsonpath.jsonpath(obj, '$..book[?(@.isbn)]')
print(book_isbn_list)

# Books whose price is above 10
book_10_list = jsonpath.jsonpath(obj, '$..book[?(@.price > 10)]')
print(book_10_list)

解析淘票票

打开淘票票官网
在这里插入图片描述

获取headers反爬数据

打开url到新的页面,检查url是否起作用,发现复制url之后的页面内容是这个:
jsonp80({
"returnCode": "0",
"returnValue": {

}
});
之前的地区信息完全没有了,那么这个网站肯定采取了某些反爬措施,我们的headers当中应该将这些信息都带上。
复制request headers中的内容到editplus这个软件,通过正则表达式,将其进行规范化:
并且删除键名以 : 开头的部分(例如:':authority':' dianying.taobao.com',),以及 'accept-encoding':' gzip, deflate, br'。那么得到的就是我们需要的headers数据。

分割json数据

我们得到的数据格式为:jsonp80({
"returnCode": "0",
"returnValue": {
# 数据内容
}
});
而我们需要的json数据应该是去掉外层 jsonp80( 和 ); 之后的部分,这时候就要用到split方法:
content = content.split('(')[1]
content = content.split(')')[0]
复制json数据到json解析器(网址:json.cn),可以看到:
在这里插入图片描述

代码段:

import urllib.request

# City-list endpoint on dianying.taobao.com. The query string carries
# jsoncallback=jsonp80, which matches the jsonp80(...) wrapper seen around
# the response body.
url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1645840455706_79&jsoncallback=jsonp80&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'

# Request headers copied from the browser's captured request. Every value
# still starts with the space that followed the ':' in the capture.
# NOTE(review): the cookie looks like a real logged-in session - confirm it
# has been invalidated before this file is shared or committed.
headers = {
    # The four ':'-prefixed entries from the capture are disabled.
    #':authority':' dianying.taobao.com',
    #':method':' GET',
    #':path':' /cityAction.json?activityId&_ksTS=1645840455706_79&jsoncallback=jsonp80&action=cityAction&n_s=new&event_submit_doGetAllRegion=true',
    #':scheme':' https',
    'accept':' */*',
    # accept-encoding is disabled - presumably so the body arrives
    # uncompressed and can be decoded as UTF-8 directly; TODO confirm.
    # 'accept-encoding':' gzip, deflate, br',
    'accept-language':' en,zh-CN;q=0.9,zh;q=0.8',
    'cookie':' _samesite_flag_=true; cookie2=1b4195933b8cf4f805cab4cfe2bd1714; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=KMp%2F3JZt%2FBcO6%2Fk7WJ6a6aDWbk2Jm%2BSJtziPkqwSuie%2FPl%2B9Ta3Yzzj2GNIRvfz9dzPX%2FtD8bCJI4nRyeZy9nQ%3D%3D; cna=kop+GD8HKwMCAXa3kzvYDAKq; cancelledSubSites=empty; dnk=tb27867112; tracknick=tb27867112; v=0; miid=2652906292127496881; oa2=7b662fb6c7453edff55b993db866411d; t=e30d4d7a3566ee69f912ececf122d887; lgc=tb27867112; UM_distinctid=17ea3c1d10b3f4-0f608cf2eb031e-f791539-144000-17ea3c1d10c25b; _tb_token_=333e33e131eee; _m_h5_tk=b98e4a7777282769655ef10a17379a74_1645787762939; _m_h5_tk_enc=3617c942aed58ce25e82c2abb3725657; sgcookie=E100ypcoQUIEwvXHDnhcIYblY9Fknx1edHRR9xAFP8BtT07FtiFLBsp8JTf3Zz1nw4WpPPZG58fJ3q%2FFKTmHlHzQZexCPzXB%2B67%2BDdwdoGZtYcF91FR2cBtR2ExXirABy%2FY9; uc3=id2=UNGWOUzprbj0rQ%3D%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&vt3=F8dCvUCROMNhP9Drqmc%3D&nk2=F5RHprSDi2bCLw%3D%3D; csg=6d11ebe2; skt=c87c37092346272b; existShop=MTY0NTc3NzUyMw%3D%3D; uc4=nk4=0%40FY4MtLWKKjdD9DZjSlaLr5LWqwf%2F&id4=0%40UgbuMoUnOLqHO8OnDx6aHPOyz7ja; _cc_=UIHiLt3xSw%3D%3D; xlly_s=1; mt=ci=-1_0; uc1=cookie15=W5iHLLyFOGW7aA%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&cookie21=URm48syIYBrb0wDboXk1&cookie14=UoewBVSDtQ%2FeeA%3D%3D&pas=0&existShop=false; tfstk=ceXVB7Tzg-e2lV9_9K9alx0uM4OAawYMRY-eisBYOPoQnXAX8sArW3vrd3-Z4VAc.; l=eBa79O3uLOlLr3OhBO5aFurza77tzIRb4sPzaNbMiInca1zVtFObANCnCtOwSdtj_t1cUetzQ4is6RLHR3A0hc0c0xb0hl0jnxf..; isg=BD4-R1_U0PBM9ATPbxX7S4wWj1SAfwL5syywOuhHcgF8i95lUA6CCS1tA19Hs_oR',
    'dnt':' 1',
    'referer': 'https://www.taobao.com/',
    'sec-ch-ua':' " Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
    'sec-ch-ua-mobile':' ?0',
    'sec-ch-ua-platform':' "Windows"',
    'sec-fetch-dest':' script',
    'sec-fetch-mode':' no-cors',
    'sec-fetch-site':' same-site',
    'user-agent':' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}

# Build the request with the browser-copied headers attached.
request = urllib.request.Request(url=url, headers=headers)

# The context manager closes the HTTP connection once the body is read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# Strip the JSONP wrapper "jsonp80( ... );" with split: keep what lies
# between the FIRST '(' and the LAST ')'. Limiting each split to a single
# cut keeps any parentheses inside the JSON payload itself intact.
content = content.split('(', 1)[1]
content = content.rsplit(')', 1)[0]

# Save the bare JSON text for the parsing step.
with open('爬虫_解析_解析淘票票.json', 'w', encoding='utf-8') as fp:
    fp.write(content)

import json
import jsonpath

# Parse the saved payload; the context manager closes the file handle
# instead of leaving it open for the rest of the run.
with open('爬虫_解析_解析淘票票.json', 'r', encoding='utf-8') as fp:
    obj = json.load(fp)

# Collect every regionName value at any depth of the document.
city_list = jsonpath.jsonpath(obj, '$..regionName')
print(city_list)

得到的.json文件可通过ctrl + alt + l 进行快速排版

解析某宝

相关链接1
相关链接2

代码段

import urllib.request

def updateFile(file, old_str, new_str):
    """
    Replace every occurrence of a substring in a UTF-8 text file, in place.

    Matching is done one line at a time, so an ``old_str`` that straddles
    two lines is not replaced.

    :param file: path of the file to rewrite
    :param old_str: substring to look for
    :param new_str: text written in its place
    :return: None
    """
    # Read everything first; the file is reopened for writing afterwards.
    with open(file, "r", encoding="utf-8") as src:
        rewritten = "".join(row.replace(old_str, new_str) for row in src)

    with open(file, "w", encoding="utf-8") as dst:
        dst.write(rewritten)



# Taobao search endpoint. The query string carries callback=jsonp743, which
# matches the 'jsonp743(' wrapper stripped from the response further down.
url = 'https://s.taobao.com/search?data-key=s&data-value=44&ajax=true&_ksTS=1645866049212_742&callback=jsonp743&q=%E5%A4%A7%E8%A1%A3&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=1&ntoffset=7&p4ppushleft=2%2C48'

# Request headers copied from the browser's captured request. The
# ':'-prefixed entries (:authority, :method, :path, :scheme) and
# accept-encoding from the capture are intentionally left out. Every value
# still starts with the space that followed the ':' in the capture.
# NOTE(review): the cookie looks like a real logged-in session - confirm it
# has been invalidated before this file is shared or committed.
headers = {
    'accept':' */*',
    'accept-language':' en,zh-CN;q=0.9,zh;q=0.8',
    'cookie':' _samesite_flag_=true; cookie2=1b4195933b8cf4f805cab4cfe2bd1714; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=KMp%2F3JZt%2FBcO6%2Fk7WJ6a6aDWbk2Jm%2BSJtziPkqwSuie%2FPl%2B9Ta3Yzzj2GNIRvfz9dzPX%2FtD8bCJI4nRyeZy9nQ%3D%3D; cna=kop+GD8HKwMCAXa3kzvYDAKq; cancelledSubSites=empty; dnk=tb27867112; tracknick=tb27867112; v=0; miid=2652906292127496881; oa2=7b662fb6c7453edff55b993db866411d; t=e30d4d7a3566ee69f912ececf122d887; lgc=tb27867112; alitrackid=ssl.dianzhentan.com; UM_distinctid=17ea3c1d10b3f4-0f608cf2eb031e-f791539-144000-17ea3c1d10c25b; _tb_token_=333e33e131eee; lastalitrackid=www.taobao.com; sgcookie=E100ypcoQUIEwvXHDnhcIYblY9Fknx1edHRR9xAFP8BtT07FtiFLBsp8JTf3Zz1nw4WpPPZG58fJ3q%2FFKTmHlHzQZexCPzXB%2B67%2BDdwdoGZtYcF91FR2cBtR2ExXirABy%2FY9; uc3=id2=UNGWOUzprbj0rQ%3D%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&vt3=F8dCvUCROMNhP9Drqmc%3D&nk2=F5RHprSDi2bCLw%3D%3D; csg=6d11ebe2; skt=c87c37092346272b; existShop=MTY0NTc3NzUyMw%3D%3D; uc4=nk4=0%40FY4MtLWKKjdD9DZjSlaLr5LWqwf%2F&id4=0%40UgbuMoUnOLqHO8OnDx6aHPOyz7ja; _cc_=UIHiLt3xSw%3D%3D; xlly_s=1; mt=ci=-1_0; _m_h5_tk=34d19076dd41003f05af8940de874f84_1645872989982; _m_h5_tk_enc=1ed017aeafc4b30b6e1d637c703264e4; CNZZDATA1277450732=1969566905-1643423147-https%253A%252F%252Fssl.dianzhentan.com%252F%7C1645855512; uc1=pas=0&cookie14=UoewBVSBXCF0bw%3D%3D&cookie16=UtASsssmPlP%2Ff1IHDsDaPRu%2BPw%3D%3D&cookie21=URm48syIYBrb0wDboXk1&cookie15=V32FPkk%2Fw0dUvg%3D%3D&existShop=false; JSESSIONID=AE3904898EA7D325367FE30A3E750BFE; tfstk=cg3RBQcCDKvuMsPx_0KmdZgnua3Ra_f8K_wdJIcSNcVNYBbG7sf-SVLUT3NUQ8dA.; l=eBa79O3uLOlLrTkXBOfwlurza77ODIRfguPzaNbMiOB1to1UFd3Z9HZ4So_9Q3QQKtCcPetzQ4is6R3MJ9aKg2HvCbKrCyCuOxJO.; isg=BOPj1UQNhdNN3Un8ctIOyIlpcieN2HcalgO9qRVAQMJ6VAJ2najDarpCTizadM8S',
    'dnt':' 1',
    'referer': 'https://s.taobao.com/search?q=%E5%A4%A7%E8%A1%A3&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306',
    'sec-ch-ua':' " Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
    'sec-ch-ua-mobile':' ?0',
    'sec-ch-ua-platform':' "Windows"',
    'sec-fetch-dest':' script',
    'sec-fetch-mode':' no-cors',
    'sec-fetch-site':' same-origin',
    'user-agent':' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}

# Build the request with the browser-copied headers attached.
request = urllib.request.Request(url=url, headers=headers)

# The context manager closes the HTTP connection once the body is read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# Strip the JSONP wrapper "jsonp743( ... );" once, in memory, before saving.
# Only the leading callback name and the LAST ')' are removed, so ");" or
# parentheses that occur inside the JSON payload are left untouched.
jsonp_prefix = 'jsonp743('
content = content.strip()
if content.startswith(jsonp_prefix):
    content = content[len(jsonp_prefix):].rsplit(')', 1)[0]

# Save the bare JSON text for the parsing step (a single write).
with open('爬虫_解析_解析淘宝.json', 'w', encoding='utf-8') as fp:
    fp.write(content)

import json
import jsonpath

# Parse the saved payload; the context manager closes the file handle
# instead of leaving it open for the rest of the run.
with open('爬虫_解析_解析淘宝.json', 'r', encoding='utf-8') as fp:
    obj = json.load(fp)

# Collect every title value at any depth of the document.
item_list = jsonpath.jsonpath(obj, '$..title')
print(item_list)



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值