项目中经常需要抓取数据,有些接口返回的是 html,我使用 NSoup 进行解析。
但大部分接口返回的是 Json。
下面我针对 json 编写了三种方法,都是用于提取属性结果的
方法1.使用的是JsonPath,配置规则详见:
newtonsoft官网文档
https://www.newtonsoft.com/json/help/html/QueryJsonSelectToken.htm
List<string> resLS = NSoupHelper.GetJsonListStrBySelector(jsonStr, "$..rows[*].newsTitle");
方法2.使用的是正则表达式提取,遍历后返回命中结果
按照json路径传参即可
List<string> regLS = NSoupHelper.GetJsonListStrByRegex(jsonStr, "ROOT rows newsTitle");
方法3.使用的是JToken判断,遍历返回命中结果
按照json路径传参即可
List<string> resLS2 = NSoupHelper.GetJsonListStrBySelector2(jsonStr, "ROOT rows newsTitle");
三种结果都是一致的:
源码
#region Json
/// <summary>
/// Evaluates a Json.NET JSONPath expression against a JSON string and returns
/// the string form of every token the expression selects.
/// </summary>
/// <param name="_JsonResult">JSON text to query. The root may be an object or an array.</param>
/// <param name="_getSelector">JSONPath expression, e.g. $..Products[?(@.Price >= 50)].Name or $..rows[*].newsTitle</param>
/// <returns>
/// The <c>ToString()</c> of each selected token, in selection order. An empty list is
/// returned when either argument is null/empty, when the text cannot be parsed, or when
/// the JSONPath expression is invalid.
/// </returns>
public static List<string> GetJsonListStrBySelector(string _JsonResult, string _getSelector)
{
    List<string> _listS = new List<string>();
    if (string.IsNullOrEmpty(_JsonResult) || string.IsNullOrEmpty(_getSelector)) { return _listS; }
    try
    {
        // Deserialize to JToken rather than JObject so that a document whose root is an
        // array can be queried as well instead of failing during deserialization.
        JToken root = JsonConvert.DeserializeObject<JToken>(_JsonResult);
        if (root == null) { return _listS; }

        // JSONPath syntax: https://www.newtonsoft.com/json/help/html/QueryJsonSelectToken.htm
        foreach (JToken item in root.SelectTokens(_getSelector))
        {
            _listS.Add(item.ToString());
        }
    }
    catch (JsonException)
    {
        // Best-effort extraction: malformed JSON or an invalid JSONPath expression is
        // reported to the caller as "whatever was collected so far" (usually nothing).
    }
    return _listS;
}
/// <summary>
/// Extracts values from JSON text by walking a space-separated key path with regular
/// expressions instead of a JSON object model.
/// </summary>
/// <remarks>
/// For every path segment, each text fragment found by the previous segment (initially the
/// whole document) is searched for <c>"key"</c> followed by a delimited value. Four delimiter
/// pairs are tried in order: <c>[ ]</c>, <c>{ }</c>, <c>" "</c> and <c>:</c> / <c>,}</c>.
/// The first pair that yields at least one match for a fragment is the only one used for
/// that fragment. The captured text between the delimiters becomes the input of the next
/// segment, or the result when the segment is the last one.
/// </remarks>
/// <param name="_JsonResult">JSON text. Text containing single quotes is re-serialized with double quotes first when it can be parsed.</param>
/// <param name="_getSelector">Key names from the outermost level inward, separated by spaces, e.g. ROOT rows newsTitle</param>
/// <returns>The captured fragments for the last path segment; empty when nothing matches or an argument is null/empty.</returns>
public static List<string> GetJsonListStrByRegex(string _JsonResult, string _getSelector)
{
    List<string> _listS = new List<string>();
    if (string.IsNullOrEmpty(_JsonResult) || string.IsNullOrEmpty(_getSelector)) { return _listS; }

    // Ignore empty segments so that repeated spaces do not produce a search for the key "".
    string[] jPaths = _getSelector.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);
    if (jPaths.Length == 0) { return _listS; }

    // {0} = opening delimiter, {1} = closing delimiter, {2} = regex-escaped key name.
    // The balancing group 'Open' keeps nested delimiter pairs inside one capture.
    const string fragmentPattern = "\"{2}\"[^\\{0}\\{1},]*\\{0}(?<json>[^\\{0}\\{1}]*(((?'Open'\\{0})[^\\{0}\\{1}]*)+((?'-Open'\\{1})[^\\{0}\\{1}]*)+)*?(?(Open)(?!))*[^\\{0}\\{1}]*)[\\{1}]";
    string[] openers = { "[", "{", "\"", ":" };
    string[] closers = { "]", "}", "\"", ",}" };

    string json = NormalizeSingleQuotedJson(_JsonResult);
    try
    {
        List<string> hitStrs = new List<string> { json };
        foreach (string jp in jPaths)
        {
            // The key is escaped so that characters such as '.' or '(' are matched literally.
            string escapedKey = Regex.Escape(jp);
            // One regex per delimiter pair, created on first use and shared by all fragments of this level.
            Regex[] regexCache = new Regex[openers.Length];
            List<string> nextHits = new List<string>();

            foreach (string fragment in hitStrs)
            {
                for (int d = 0; d < openers.Length; d++)
                {
                    Regex reg = regexCache[d];
                    if (reg == null)
                    {
                        reg = new Regex(string.Format(fragmentPattern, openers[d], closers[d], escapedKey));
                        regexCache[d] = reg;
                    }

                    MatchCollection ms = reg.Matches(fragment);
                    foreach (Match m in ms)
                    {
                        nextHits.Add(m.Groups["json"].Value);
                    }
                    // The first delimiter pair that matches decides the value shape for this fragment.
                    if (ms.Count > 0)
                    {
                        break;
                    }
                }
            }
            hitStrs = nextHits;
        }
        _listS = hitStrs;
    }
    catch (RegexMatchTimeoutException)
    {
        // Best-effort extraction: if a regex timeout is configured and hit, report no results.
    }
    return _listS;
}

/// <summary>
/// Rewrites JSON text that contains single quotes into double-quoted JSON so the
/// double-quote based patterns can match it; returns the text unchanged when it has no
/// single quote or cannot be parsed.
/// </summary>
private static string NormalizeSingleQuotedJson(string json)
{
    if (json.IndexOf('\'') < 0) { return json; }
    try
    {
        // JToken (not JObject) so that a top-level array is accepted as well.
        JToken parsed = JsonConvert.DeserializeObject<JToken>(json);
        return parsed == null ? json : JsonConvert.SerializeObject(parsed);
    }
    catch (JsonException)
    {
        // The regex search does not need valid JSON, so fall back to the raw text.
        return json;
    }
}
/// <summary>
/// Extracts values from a JSON string by walking a space-separated key path level by
/// level over Json.NET tokens.
/// </summary>
/// <remarks>
/// At every level each current token that is a JSON object is asked for the next key.
/// When the value is an array its elements become the tokens of the next level, otherwise
/// the value itself does. Tokens that are not objects, and keys that are missing or hold
/// JSON null, are skipped so that one incomplete branch does not discard the results of
/// its siblings. Arrays nested directly inside arrays (e.g. [[1,2,3],[4,5,6]]) are not
/// traversed. A single-segment selector returns the value(s) of that key on the root.
/// </remarks>
/// <param name="_JsonResult">JSON text whose root is an object. Single-quoted strings are accepted by the Json.NET reader.</param>
/// <param name="_getSelector">Key names from the outermost level inward, separated by spaces, e.g. ROOT rows newsTitle</param>
/// <returns>The <c>ToString()</c> of every token reached by the full path; empty when nothing is reached, the text cannot be parsed, or an argument is null/empty.</returns>
public static List<string> GetJsonListStrBySelector2(string _JsonResult, string _getSelector)
{
    List<string> _listS = new List<string>();
    if (string.IsNullOrEmpty(_JsonResult) || string.IsNullOrEmpty(_getSelector)) { return _listS; }

    // Ignore empty segments produced by repeated spaces.
    string[] jPaths = _getSelector.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);
    if (jPaths.Length == 0) { return _listS; }

    JToken root;
    try
    {
        // A single parse is enough: no serialize/deserialize round trip is needed to
        // cope with single-quoted input.
        root = JsonConvert.DeserializeObject<JToken>(_JsonResult);
    }
    catch (JsonException)
    {
        // Best-effort extraction: unparseable text yields no results.
        return _listS;
    }
    if (root == null) { return _listS; }

    // Tokens matched so far; the root is the only candidate before the first key is applied.
    List<JToken> hitTokenList = new List<JToken> { root };
    foreach (string key in jPaths)
    {
        List<JToken> nextLevel = new List<JToken>();
        foreach (JToken current in hitTokenList)
        {
            // Only objects can be indexed by property name; arrays and plain values are skipped.
            if (current.Type != JTokenType.Object) { continue; }

            JToken child = current[key];
            if (child == null || child.Type == JTokenType.Null) { continue; }

            if (child.Type == JTokenType.Array)
            {
                // Flatten one array level so the next key is applied to every element.
                foreach (JToken element in (JArray)child)
                {
                    nextLevel.Add(element);
                }
            }
            else
            {
                nextLevel.Add(child);
            }
        }

        if (nextLevel.Count == 0) { return _listS; }
        hitTokenList = nextLevel;
    }

    foreach (JToken hit in hitTokenList)
    {
        _listS.Add(hit.ToString());
    }
    return _listS;
}
#endregion
参考json
// Sample payload passed as jsonStr to the three helper calls shown earlier in this file.
// Shape: a root object whose ROOT property is an array of two entries; each entry has
// TOKEN / SERVICE / DATAPARAM fields plus a rows array holding three news records.
// Property names and string values are single-quoted, so this is not strict JSON.
string jsonStr = @"{ 'ROOT':[{
'TOKEN': 'aa1',
'SERVICE': 'bb',
'DATAPARAM': 'cc',
'rows': [
{
'searchValue': null,
'createBy': '用户N',
'createTime': '2021-02-25 11:06:03',
'updateBy': null,
'updateTime': null,
'remark': null,
'params': {},
'newsId': 241,
'newsTitle': 'IPASON × GUNDAM攀升高达联名独角兽电竞主机达抵达战场',
'newsSubtitle': '#光芒闪耀,攀升AMD 3A配置独角兽登场# IPASON × GUNDAM攀升高达联名独角兽电竞主机达抵达战场!2月24日 20:00,预售开启。搭载AMD 锐龙 5000系列处理器和AMD Radeon RX 6000系列显卡,旗舰硬件核心让你体验热血电竞与高达情怀的双重满足!上天猫、京东搜索高达攀升,了解#高达主机# 一起去唤醒并激发心中少年的勇气,男人的浪漫。',
'newsUrl': null,
'releaseTime': '2021-02-24',
'newsSource': '官方',
'showOrder': '43',
'showStatus': '0',
'newsImg': 'http://ipasoncnwebsite.oss-cn-shanghai.aliyuncs.com/images/79049291-16f3-4407-a34f-80fbc3193920.jpg',
'newsCategory': 'category_qiyexinwen'
},
{
'searchValue': null,
'createBy': '用户N',
'createTime': '2021-01-19 16:55:27',
'updateBy': null,
'updateTime': null,
'remark': null,
'params': {},
'newsId': 239,
'newsTitle': '高性能定制电脑攀升',
'newsSubtitle': '高性能定制电脑攀升',
'newsUrl': null,
'releaseTime': '2021-02-24',
'newsSource': '官方',
'showOrder': '40',
'showStatus': '0',
'newsImg': 'http://ipasoncnwebsite.oss-cn-shanghai.aliyuncs.com/images/6b96c7ee-5f15-490c-a140-727da068a62b.png',
'newsCategory': 'category_qiyexinwen'
},
{
'searchValue': null,
'createBy': '用户N',
'createTime': '2021-01-15 10:33:47',
'updateBy': null,
'updateTime': null,
'remark': null,
'params': {},
'newsId': 237,
'newsTitle': '攀升电脑2020年度荣誉时刻',
'newsSubtitle': '攀升电脑2020年度荣誉时刻',
'newsUrl': 'https://tieba.baidu.com/p/7190803603',
'releaseTime': '2021-01-15',
'newsSource': '官方',
'showOrder': '37',
'showStatus': '0',
'newsImg': 'http://ipasoncnwebsite.oss-cn-shanghai.aliyuncs.com/images/01b99cf5-e51f-43a1-8478-72ebfb38de9c.jpg',
'newsCategory': 'category_qiyexinwen'
}
]
}, {
'TOKEN': 'c2',
'SERVICE': 'bb',
'DATAPARAM': 'cc',
'rows': [
{
'searchValue': null,
'createBy': '用户N',
'createTime': '2021-02-25 11:06:03',
'updateBy': null,
'updateTime': null,
'remark': null,
'params': {},
'newsId': 241,
'newsTitle': 'IPASON × GUNDAM攀升高达联名独角兽电竞主机达抵达战场',
'newsSubtitle': '#光芒闪耀,攀升AMD 3A配置独角兽登场# IPASON × GUNDAM攀升高达联名独角兽电竞主机达抵达战场!2月24日 20:00,预售开启。搭载AMD 锐龙 5000系列处理器和AMD Radeon RX 6000系列显卡,旗舰硬件核心让你体验热血电竞与高达情怀的双重满足!上天猫、京东搜索高达攀升,了解#高达主机# 一起去唤醒并激发心中少年的勇气,男人的浪漫。',
'newsUrl': null,
'releaseTime': '2021-02-24',
'newsSource': '官方',
'showOrder': '43',
'showStatus': '0',
'newsImg': 'http://ipasoncnwebsite.oss-cn-shanghai.aliyuncs.com/images/79049291-16f3-4407-a34f-80fbc3193920.jpg',
'newsCategory': 'category_qiyexinwen'
},
{
'searchValue': null,
'createBy': '用户N',
'createTime': '2021-01-19 16:55:27',
'updateBy': null,
'updateTime': null,
'remark': null,
'params': {},
'newsId': 239,
'newsTitle': '高性能制电脑攀升',
'newsSubtitle': '高性能定制电脑攀升',
'newsUrl': null,
'releaseTime': '2021-02-24',
'newsSource': '官方',
'showOrder': '40',
'showStatus': '0',
'newsImg': 'http://ipasoncnwebsite.oss-cn-shanghai.aliyuncs.com/images/6b96c7ee-5f15-490c-a140-727da068a62b.png',
'newsCategory': 'category_qiyexinwen'
},
{
'searchValue': null,
'createBy': '用户N',
'createTime': '2021-01-15 10:33:47',
'updateBy': null,
'updateTime': null,
'remark': null,
'params': {},
'newsId': 237,
'newsTitle': '攀升电脑2020年度荣誉时刻',
'newsSubtitle': '攀升电脑2020年度荣誉时刻',
'newsUrl': 'https://tieba.baidu.com/p/7190803603',
'releaseTime': '2021-01-15',
'newsSource': '官方',
'showOrder': '37',
'showStatus': '0',
'newsImg': 'http://ipasoncnwebsite.oss-cn-shanghai.aliyuncs.com/images/01b99cf5-e51f-43a1-8478-72ebfb38de9c.jpg',
'newsCategory': 'category_qiyexinwen'
}
]
}]
}";