提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
目录
前言
前面发了一篇批量获取tradeinfo信息的文章,没想到居然有这么多人看。本着能爬的都爬完的想法,我写这篇文章把获取招标公告信息的方法也说一下
如果想看之前的文章,可以点击:批量爬取福建省公共资源交易电子公共服务平台数据-CSDN博客
一、先展示一下爬取的结果
二、操作步骤
1.招标公告内容来源分析
随机点开一个公告:
检查源代码:
好家伙啥也没有
观察发现,这个网页所用的资源(js等)和之前交易中心的几乎一样
1.1找到content
打开开发者工具,点击网络->xhr/fetch,刷新页面,捕获到tradeinfoContent等好几个json
我测试过了,就是这个tradeinfoContent的data封装了招标公告的信息
这个data也是加密过的,但是解密方式和上一篇文章还有何老师讲的一模一样,大家可以自己去解析,我就不贴出来了
1.2观察请求头和负载
这个请求头有一个参数是不是很眼熟,诶,就是portal-sign,
负载也有一个参数很眼熟呀,ts
1.3扣代码
结合我刚刚提到的,该网站资源前后是一样的,所以说这个portal-sign和ts的获取和之前是一模一样的,那我就不再贴出代码了,但是传入的参数t我们不知道呀,我这里就直接贴出来了,不用大家去找这玩意了(查找方法和之前的文章一致,但是需要注意,生成招标公告网页时刚刚看到的那几个json都会用到生成portal-sign的函数,只有生成tradeinfoContent请求json的参数t是我们需要的)
// Request config for the /Trade/TradeInfoContent call. get_portal_sign reads
// `method`, `data`, `params` and `headers` from it and writes the signature
// and timestamp back into it.
var t = {
    "url": "/Trade/TradeInfoContent",
    "method": "post",
    "data": {
        // m_id is a placeholder; get_portal_sign overwrites it per request
        "type": 1, "m_id": null
    },
    "headers": {
        "common": {
            "Accept": "application/json, text/plain, */*"
        }, "delete": {}, "get": {}, "head": {}, "post": {
            "Content-Type": "application/x-www-form-urlencoded"
        }, "put": {
            "Content-Type": "application/x-www-form-urlencoded"
        }, "patch": {
            "Content-Type": "application/x-www-form-urlencoded"
        }, "content-type": "application/json;charset=UTF-8"
    },
    "baseURL": "/FwPortalApi",
    "transformRequest": [null],
    "transformResponse": [null],
    "timeout": 0,
    "xsrfCookieName": "XSRF-TOKEN",
    "xsrfHeaderName": "X-XSRF-TOKEN",
    "maxContentLength": -1
}
t参数里面有一个东西比较重要,那就是m_id,这个参数和之前请求的tradeinfo里的公告信息里的M_ID参数是对应的,我这里贴一个给大家看看:
2.代码处理流程
2.1获取tradeinfo的json并保存到一个文件夹中
2.2解析tradeinfo内容,获取M_ID和NAME
# At this point `tradeinfo` is the file name of one saved TradeInfo JSON file.
with open(f'./TradeInfo/{tradeinfo}', 'r', encoding='utf-8') as f:
    # The name is rebound from the file name to the parsed JSON document.
    tradeinfo = json.loads(f.read())
# Collect the M_ID and NAME of every entry in the Table array.
cids = jsonpath.jsonpath(tradeinfo, '$.Table[*].M_ID')
names = jsonpath.jsonpath(tradeinfo, '$.Table[*].NAME')
这里要导入jsonpath包
其中tradeinfo是保存好的文件名,形如n_tradeinfo.json
2.3生成对应的payload和headers
async def create_ctx_payload(portal_sign2, cid, name):
    """Build the signed headers and JSON payload for one announcement request.

    Args:
        portal_sign2: JavaScript source that defines ``get_portal_sign``.
        cid: Announcement M_ID handed to ``get_portal_sign``.
        name: Announcement name; returned unchanged.

    Returns:
        A ``(headers, payload, name)`` tuple. ``headers`` carries the
        ``portal-sign`` value and ``payload`` is the ``data`` object, both
        taken from the result of the JavaScript call.
    """
    js_runtime = execjs.compile(portal_sign2)
    signed = js_runtime.call('get_portal_sign', cid)
    portal_sign = signed['headers']['portal-sign']

    # No Cookie header is set here; get_content passes cookies to the request separately.
    request_headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://ggzyfw.fj.gov.cn',
        'Referer': 'https://ggzyfw.fj.gov.cn/business/detail?cid=250842&type=GCJS',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
        'portal-sign': portal_sign,
        'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Microsoft Edge";v="122"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    return request_headers, signed['data'], name
2.4获取每个M_ID和NAME的信息
async def get_content(url, cookies, my_decrypt, cid, headers, payload, name, session):
    """Download one announcement, decrypt it and save it under ``./content``.

    The file is stored as ``./content/{cid}_{name}.html`` with ``/``, ``\\``
    and ``?`` stripped from ``name``. Nothing is requested when that file
    already exists.

    Args:
        url: Endpoint to POST to.
        cookies: Cookies sent with the request.
        my_decrypt: JavaScript source defining the decrypt function ``b``.
        cid: Announcement M_ID, used in the file name.
        headers: Request headers, including ``portal-sign``.
        payload: JSON body of the request.
        name: Announcement name, used in the file name.
        session: Object whose ``post(...)`` is used as an async context
            manager (an ``aiohttp.ClientSession`` in this file).

    Returns:
        None.
    """
    # Sanitise BEFORE the existence check so the check looks at the same path
    # the file is written to; otherwise names containing '/', '\\' or '?'
    # are never recognised as already downloaded.
    name = name.replace('/', '').replace('\\', '').replace('?', '')
    target = rf'./content/{cid}_{name}.html'
    if os.path.exists(target):
        print(rf'{cid}_{name}.html下载过了')
        return

    async with session.post(url, cookies=cookies, headers=headers, json=payload) as response:
        if response.status != 200:
            # Report the failure instead of dropping it silently
            print(f'{cid}_{name}.html下载失败,HTTP状态码{response.status}')
            return
        rt = await response.json()
        # rt['Data'] is the encrypted body; the JS function `b` returns a JSON string
        decrypted = execjs.compile(my_decrypt).call('b', rt['Data'])
        contents = json.loads(decrypted)['Contents']
        # Make sure the output directory exists on a first run
        os.makedirs('./content', exist_ok=True)
        async with aiofiles.open(target, 'w', encoding='utf-8') as af:
            await af.write(contents)
        print(f'{cid}_{name}.html下载完毕')
async def main(url, cookies, dic):
    """Download every announcement in ``dic`` concurrently over one session.

    Args:
        url: Endpoint passed to ``get_content``.
        cookies: Cookies passed to ``get_content``.
        dic: Maps each cid to an object whose ``result()`` returns the
            ``(headers, payload, name)`` sequence for that cid.
    """
    with open('my_decrypt.js', 'r', encoding='utf-8') as f:
        my_decrypt = f.read()

    async with aiohttp.ClientSession() as session:
        pending = []
        for cid in dic:
            prepared = dic[cid]
            pending.append(
                get_content(
                    url,
                    cookies,
                    my_decrypt,
                    cid,
                    prepared.result()[0],
                    prepared.result()[1],
                    prepared.result()[2],
                    session,
                )
            )
        await asyncio.gather(*pending)
3.整篇代码附上
// Request config for the /Trade/TradeInfoContent call. get_portal_sign reads
// `method`, `data`, `params` and `headers` from it and writes the signature
// and timestamp back into it.
var t = {
    "url": "/Trade/TradeInfoContent",
    "method": "post",
    "data": {
        // m_id is a placeholder; get_portal_sign overwrites it per request
        "type": 1, "m_id": null
    },
    "headers": {
        "common": {
            "Accept": "application/json, text/plain, */*"
        }, "delete": {}, "get": {}, "head": {}, "post": {
            "Content-Type": "application/x-www-form-urlencoded"
        }, "put": {
            "Content-Type": "application/x-www-form-urlencoded"
        }, "patch": {
            "Content-Type": "application/x-www-form-urlencoded"
        }, "content-type": "application/json;charset=UTF-8"
    },
    "baseURL": "/FwPortalApi",
    "transformRequest": [null],
    "transformResponse": [null],
    "timeout": 0,
    "xsrfCookieName": "XSRF-TOKEN",
    "xsrfHeaderName": "X-XSRF-TOKEN",
    "maxContentLength": -1
}
/**
 * Case-insensitive comparator for Array.prototype.sort.
 * Both arguments are stringified and upper-cased before comparing.
 *
 * @returns {number} 1 if t sorts after e, 0 if they are equal, otherwise -1.
 */
function s(t, e) {
    var left = t.toString().toUpperCase();
    var right = e.toString().toUpperCase();
    if (left > right) {
        return 1;
    }
    if (left === right) {
        return 0;
    }
    return -1;
}
/**
 * Serialise an object into one string of key/value pairs for signing.
 * Keys are ordered with the comparator `s`; keys whose value is undefined
 * are skipped; object and array values are JSON-stringified, everything
 * else is concatenated as-is.
 *
 * @param {object} t  Parameters to serialise.
 * @returns {string} The concatenated "key + value" string.
 */
function l(t) {
    var sortedKeys = Object.keys(t).sort(s);
    var out = "";
    sortedKeys.forEach(function (key) {
        var value = t[key];
        if (value === undefined) {
            return;
        }
        var isObjectLike = (value && value instanceof Object) || value instanceof Array;
        out += key + (isObjectLike ? JSON.stringify(value) : value);
    });
    return out;
}
/**
 * Compute the portal-sign value for a parameter object.
 * Keys whose value is "" or undefined are deleted from `t` in place, then a
 * fixed key string plus the serialised parameters is hashed with md5.
 *
 * @param {object} t  Parameters to sign (mutated: empty keys are removed).
 * @returns {string} Lower-case hex digest.
 */
function d(t) {
    for (var key in t) {
        if (t[key] === "" || t[key] === void 0) {
            delete t[key];
        }
    }
    var plain = 'B3978D054A72A7002063637CCDF6B2E5' + l(t);
    // Prints the exact string that gets hashed
    console.log(plain);
    var digest = md5(plain, 32);
    return digest.toLowerCase();
}
function md5(string, bit) {
function md5_RotateLeft(lValue, iShiftBits) {
return (lValue << iShiftBits) | (lValue >>> (32 - iShiftBits));
}
function md5_AddUnsigned(lX, lY) {
var lX4, lY4, lX8, lY8, lResult;
lX8 = (lX & 0x80000000);
lY8 = (lY & 0x80000000);
lX4 = (lX & 0x40000000);
lY4 = (lY & 0x40000000);
lResult = (lX & 0x3FFFFFFF) + (lY & 0x3FFFFFFF);
if (lX4 & lY4) {
return (lResult ^ 0x80000000 ^ lX8 ^ lY8);
}
if (lX4 | lY4) {
if (lResult & 0x40000000) {
return (lResult ^ 0xC0000000 ^ lX8 ^ lY8);
} else {
return (lResult ^ 0x40000000 ^ lX8 ^ lY8);
}
} else {
return (lResult ^ lX8 ^ lY8);
}
}
function md5_F(x, y, z) {
return (x & y) | ((~x) & z);
}
function md5_G(x, y, z) {
return (x & z) | (y & (~z));
}
function md5_H(x, y, z) {
return (x ^ y ^ z);
}
function md5_I(x, y, z) {
return (y ^ (x | (~z)));
}
function md5_FF(a, b, c, d, x, s, ac) {
a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_F(b, c, d), x), ac));
return md5_AddUnsigned(md5_RotateLeft(a, s), b);
};
function md5_GG(a, b, c, d, x, s, ac) {
a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_G(b, c, d), x), ac));
return md5_AddUnsigned(md5_RotateLeft(a, s), b);
};
function md5_HH(a, b, c, d, x, s, ac) {
a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_H(b, c, d), x), ac));
return md5_AddUnsigned(md5_RotateLeft(a, s), b);
};
function md5_II(a, b, c, d, x, s, ac) {
a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_I(b, c, d), x), ac));
return md5_AddUnsigned(md5_RotateLeft(a, s), b);
};
function md5_ConvertToWordArray(string) {
var lWordCount;
var lMessageLength = string.length;
var lNumberOfWords_temp1 = lMessageLength + 8;
var lNumberOfWords_temp2 = (lNumberOfWords_temp1 - (lNumberOfWords_temp1 % 64)) / 64;
var lNumberOfWords = (lNumberOfWords_temp2 + 1) * 16;
var lWordArray = Array(lNumberOfWords - 1);
var lBytePosition = 0;
var lByteCount = 0;
while (lByteCount < lMessageLength) {
lWordCount = (lByteCount - (lByteCount % 4)) / 4;
lBytePosition = (lByteCount % 4) * 8;
lWordArray[lWordCount] = (lWordArray[lWordCount] | (string.charCodeAt(lByteCount) << lBytePosition));
lByteCount++;
}
lWordCount = (lByteCount - (lByteCount % 4)) / 4;
lBytePosition = (lByteCount % 4) * 8;
lWordArray[lWordCount] = lWordArray[lWordCount] | (0x80 << lBytePosition);
lWordArray[lNumberOfWords - 2] = lMessageLength << 3;
lWordArray[lNumberOfWords - 1] = lMessageLength >>> 29;
return lWordArray;
};
function md5_WordToHex(lValue) {
var WordToHexValue = "", WordToHexValue_temp = "", lByte, lCount;
for (lCount = 0; lCount <= 3; lCount++) {
lByte = (lValue >>> (lCount * 8)) & 255;
WordToHexValue_temp = "0" + lByte.toString(16);
WordToHexValue = WordToHexValue + WordToHexValue_temp.substr(WordToHexValue_temp.length - 2, 2);
}
return WordToHexValue;
};
function md5_Utf8Encode(string) {
string = string.replace(/\r\n/g, "\n");
var utftext = "";
for (var n = 0; n < string.length; n++) {
var c = string.charCodeAt(n);
if (c < 128) {
utftext += String.fromCharCode(c);
} else if ((c > 127) && (c < 2048)) {
utftext += String.fromCharCode((c >> 6) | 192);
utftext += String.fromCharCode((c & 63) | 128);
} else {
utftext += String.fromCharCode((c >> 12) | 224);
utftext += String.fromCharCode(((c >> 6) & 63) | 128);
utftext += String.fromCharCode((c & 63) | 128);
}
}
return utftext;
};var x = Array();
var k, AA, BB, CC, DD, a, b, c, d;
var S11 = 7, S12 = 12, S13 = 17, S14 = 22;
var S21 = 5, S22 = 9, S23 = 14, S24 = 20;
var S31 = 4, S32 = 11, S33 = 16, S34 = 23;
var S41 = 6, S42 = 10, S43 = 15, S44 = 21;
string = md5_Utf8Encode(string);
x = md5_ConvertToWordArray(string);
a = 0x67452301;
b = 0xEFCDAB89;
c = 0x98BADCFE;
d = 0x10325476;
for (k = 0; k < x.length; k += 16) {
AA = a;
BB = b;
CC = c;
DD = d;
a = md5_FF(a, b, c, d, x[k + 0], S11, 0xD76AA478);
d = md5_FF(d, a, b, c, x[k + 1], S12, 0xE8C7B756);
c = md5_FF(c, d, a, b, x[k + 2], S13, 0x242070DB);
b = md5_FF(b, c, d, a, x[k + 3], S14, 0xC1BDCEEE);
a = md5_FF(a, b, c, d, x[k + 4], S11, 0xF57C0FAF);
d = md5_FF(d, a, b, c, x[k + 5], S12, 0x4787C62A);
c = md5_FF(c, d, a, b, x[k + 6], S13, 0xA8304613);
b = md5_FF(b, c, d, a, x[k + 7], S14, 0xFD469501);
a = md5_FF(a, b, c, d, x[k + 8], S11, 0x698098D8);
d = md5_FF(d, a, b, c, x[k + 9], S12, 0x8B44F7AF);
c = md5_FF(c, d, a, b, x[k + 10], S13, 0xFFFF5BB1);
b = md5_FF(b, c, d, a, x[k + 11], S14, 0x895CD7BE);
a = md5_FF(a, b, c, d, x[k + 12], S11, 0x6B901122);
d = md5_FF(d, a, b, c, x[k + 13], S12, 0xFD987193);
c = md5_FF(c, d, a, b, x[k + 14], S13, 0xA679438E);
b = md5_FF(b, c, d, a, x[k + 15], S14, 0x49B40821);
a = md5_GG(a, b, c, d, x[k + 1], S21, 0xF61E2562);
d = md5_GG(d, a, b, c, x[k + 6], S22, 0xC040B340);
c = md5_GG(c, d, a, b, x[k + 11], S23, 0x265E5A51);
b = md5_GG(b, c, d, a, x[k + 0], S24, 0xE9B6C7AA);
a = md5_GG(a, b, c, d, x[k + 5], S21, 0xD62F105D);
d = md5_GG(d, a, b, c, x[k + 10], S22, 0x2441453);
c = md5_GG(c, d, a, b, x[k + 15], S23, 0xD8A1E681);
b = md5_GG(b, c, d, a, x[k + 4], S24, 0xE7D3FBC8);
a = md5_GG(a, b, c, d, x[k + 9], S21, 0x21E1CDE6);
d = md5_GG(d, a, b, c, x[k + 14], S22, 0xC33707D6);
c = md5_GG(c, d, a, b, x[k + 3], S23, 0xF4D50D87);
b = md5_GG(b, c, d, a, x[k + 8], S24, 0x455A14ED);
a = md5_GG(a, b, c, d, x[k + 13], S21, 0xA9E3E905);
d = md5_GG(d, a, b, c, x[k + 2], S22, 0xFCEFA3F8);
c = md5_GG(c, d, a, b, x[k + 7], S23, 0x676F02D9);
b = md5_GG(b, c, d, a, x[k + 12], S24, 0x8D2A4C8A);
a = md5_HH(a, b, c, d, x[k + 5], S31, 0xFFFA3942);
d = md5_HH(d, a, b, c, x[k + 8], S32, 0x8771F681);
c = md5_HH(c, d, a, b, x[k + 11], S33, 0x6D9D6122);
b = md5_HH(b, c, d, a, x[k + 14], S34, 0xFDE5380C);
a = md5_HH(a, b, c, d, x[k + 1], S31, 0xA4BEEA44);
d = md5_HH(d, a, b, c, x[k + 4], S32, 0x4BDECFA9);
c = md5_HH(c, d, a, b, x[k + 7], S33, 0xF6BB4B60);
b = md5_HH(b, c, d, a, x[k + 10], S34, 0xBEBFBC70);
a = md5_HH(a, b, c, d, x[k + 13], S31, 0x289B7EC6);
d = md5_HH(d, a, b, c, x[k + 0], S32, 0xEAA127FA);
c = md5_HH(c, d, a, b, x[k + 3], S33, 0xD4EF3085);
b = md5_HH(b, c, d, a, x[k + 6], S34, 0x4881D05);
a = md5_HH(a, b, c, d, x[k + 9], S31, 0xD9D4D039);
d = md5_HH(d, a, b, c, x[k + 12], S32, 0xE6DB99E5);
c = md5_HH(c, d, a, b, x[k + 15], S33, 0x1FA27CF8);
b = md5_HH(b, c, d, a, x[k + 2], S34, 0xC4AC5665);
a = md5_II(a, b, c, d, x[k + 0], S41, 0xF4292244);
d = md5_II(d, a, b, c, x[k + 7], S42, 0x432AFF97);
c = md5_II(c, d, a, b, x[k + 14], S43, 0xAB9423A7);
b = md5_II(b, c, d, a, x[k + 5], S44, 0xFC93A039);
a = md5_II(a, b, c, d, x[k + 12], S41, 0x655B59C3);
d = md5_II(d, a, b, c, x[k + 3], S42, 0x8F0CCC92);
c = md5_II(c, d, a, b, x[k + 10], S43, 0xFFEFF47D);
b = md5_II(b, c, d, a, x[k + 1], S44, 0x85845DD1);
a = md5_II(a, b, c, d, x[k + 8], S41, 0x6FA87E4F);
d = md5_II(d, a, b, c, x[k + 15], S42, 0xFE2CE6E0);
c = md5_II(c, d, a, b, x[k + 6], S43, 0xA3014314);
b = md5_II(b, c, d, a, x[k + 13], S44, 0x4E0811A1);
a = md5_II(a, b, c, d, x[k + 4], S41, 0xF7537E82);
d = md5_II(d, a, b, c, x[k + 11], S42, 0xBD3AF235);
c = md5_II(c, d, a, b, x[k + 2], S43, 0x2AD7D2BB);
b = md5_II(b, c, d, a, x[k + 9], S44, 0xEB86D391);
a = md5_AddUnsigned(a, AA);
b = md5_AddUnsigned(b, BB);
c = md5_AddUnsigned(c, CC);
d = md5_AddUnsigned(d, DD);
}
if (bit == 32) {
return (md5_WordToHex(a) + md5_WordToHex(b) + md5_WordToHex(c) + md5_WordToHex(d)).toLowerCase();
}
return (md5_WordToHex(b) + md5_WordToHex(c)).toLowerCase();
}
/**
 * List the own keys of an object: enumerable string keys followed by its
 * symbol keys (where Object.getOwnPropertySymbols is available).
 *
 * @param {object} t   Object to inspect.
 * @param {boolean} e  When truthy, only enumerable symbols are included.
 * @returns {Array} The collected keys.
 */
function v(t, e) {
    var keys = Object.keys(t);
    if (!Object.getOwnPropertySymbols) {
        return keys;
    }
    var symbols = Object.getOwnPropertySymbols(t);
    if (e) {
        symbols = symbols.filter(function (sym) {
            return Object.getOwnPropertyDescriptor(t, sym).enumerable;
        });
    }
    keys.push.apply(keys, symbols);
    return keys;
}
/**
 * Set property `e` of `t` to `r` and return `t`.
 * If the key already exists (own or inherited), it is (re)defined as an
 * enumerable, configurable, writable data property; otherwise it is
 * assigned normally.
 */
function aa(t, e, r) {
    if (e in t) {
        Object.defineProperty(t, e, {
            value: r,
            enumerable: true,
            configurable: true,
            writable: true
        });
    } else {
        t[e] = r;
    }
    return t;
}
/**
 * Merge the properties of every following argument into `t` and return `t`.
 * Odd-positioned sources are copied by value through `aa`; even-positioned
 * sources are copied by property descriptor. null/undefined sources are
 * treated as empty objects.
 */
function g(t) {
    // Copy own keys (including enumerable symbols) by value
    var copyByValue = function (source) {
        v(Object(source), true).forEach(function (key) {
            aa(t, key, source[key]);
        });
    };
    // Copy own properties together with their descriptors
    var copyByDescriptor = function (source) {
        if (Object.getOwnPropertyDescriptors) {
            Object.defineProperties(t, Object.getOwnPropertyDescriptors(source));
            return;
        }
        v(Object(source)).forEach(function (key) {
            Object.defineProperty(t, key, Object.getOwnPropertyDescriptor(source, key));
        });
    };
    for (var idx = 1; idx < arguments.length; idx++) {
        var current = arguments[idx] == null ? {} : arguments[idx];
        if (idx % 2) {
            copyByValue(current);
        } else {
            copyByDescriptor(current);
        }
    }
    return t;
}
/**
 * Sign a TradeInfoContent request for the given announcement id.
 *
 * Mutates and returns the module-level request config `t`: sets
 * `t.data.m_id`, stores the signature in `t.headers["portal-sign"]`, and adds
 * the `ts` timestamp used for signing to `t.data` (POST) or `t.params` (GET).
 *
 * @param {number|string} cid  Announcement M_ID; sent as a string.
 * @returns {object} The updated config `t`.
 */
function get_portal_sign(cid) {
    // Parse a JSON-string body first so the m_id assignment below is not lost
    if ("string" === typeof t.data) {
        t.data = JSON.parse(t.data);
    }
    t["data"]["m_id"] = cid.toString();
    t.headers.baseURL && (t.baseURL = t.headers.baseURL);

    var e = Object.assign({}, t.params);
    if ("post" === t.method && t.data) {
        Object.assign(e, t.data);
    }
    // Stamp ts AFTER merging the body: `t` is shared between calls and still
    // holds the ts written by the previous call, which would otherwise
    // overwrite the fresh timestamp and be signed and sent again.
    e["ts"] = (new Date).getTime();

    t.headers["portal-sign"] = d(e);
    if ("post" === t.method) {
        t.data = g(g({}, t.data), {}, {ts: e["ts"]});
    } else if ("get" === t.method) {
        t.params = g(g({}, t.params), {}, {ts: e["ts"]});
    }
    return t;
}
// Smoke test: sign a sample id and print the resulting config.
// NOTE(review): this runs every time the script is evaluated, presumably
// including when it is loaded from Python — confirm it should stay.
console.log(get_portal_sign(250812))
import requests
import execjs
import json
import jsonpath
import asyncio
import aiofiles
import aiohttp
import asyncio
import os
from concurrent.futures import ThreadPoolExecutor
# Load the JavaScript signing source once at import time; it is passed to
# create_ctx_payload, which compiles it with execjs.
with open('portal_sign2.js', 'r', encoding='utf-8') as f:
    portal_sign2 = f.read()
async def create_ctx_payload(portal_sign2, cid, name):
    """Build the signed headers and JSON payload for one announcement request.

    Args:
        portal_sign2: JavaScript source that defines ``get_portal_sign``.
        cid: Announcement M_ID handed to ``get_portal_sign``.
        name: Announcement name; returned unchanged.

    Returns:
        A ``(headers, payload, name)`` tuple. ``headers`` carries the
        ``portal-sign`` value and ``payload`` is the ``data`` object, both
        taken from the result of the JavaScript call.
    """
    js_runtime = execjs.compile(portal_sign2)
    signed = js_runtime.call('get_portal_sign', cid)
    portal_sign = signed['headers']['portal-sign']

    # No Cookie header is set here; get_content passes cookies to the request separately.
    request_headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://ggzyfw.fj.gov.cn',
        'Referer': 'https://ggzyfw.fj.gov.cn/business/detail?cid=250842&type=GCJS',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
        'portal-sign': portal_sign,
        'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Microsoft Edge";v="122"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    return request_headers, signed['data'], name
async def get_content(url, cookies, my_decrypt, cid, headers, payload, name, session):
    """Download one announcement, decrypt it and save it under ``./content``.

    The file is stored as ``./content/{cid}_{name}.html`` with ``/``, ``\\``
    and ``?`` stripped from ``name``. Nothing is requested when that file
    already exists.

    Args:
        url: Endpoint to POST to.
        cookies: Cookies sent with the request.
        my_decrypt: JavaScript source defining the decrypt function ``b``.
        cid: Announcement M_ID, used in the file name.
        headers: Request headers, including ``portal-sign``.
        payload: JSON body of the request.
        name: Announcement name, used in the file name.
        session: Object whose ``post(...)`` is used as an async context
            manager (an ``aiohttp.ClientSession`` in this file).

    Returns:
        None.
    """
    # Sanitise BEFORE the existence check so the check looks at the same path
    # the file is written to; otherwise names containing '/', '\\' or '?'
    # are never recognised as already downloaded.
    name = name.replace('/', '').replace('\\', '').replace('?', '')
    target = rf'./content/{cid}_{name}.html'
    if os.path.exists(target):
        print(rf'{cid}_{name}.html下载过了')
        return

    async with session.post(url, cookies=cookies, headers=headers, json=payload) as response:
        if response.status != 200:
            # Report the failure instead of dropping it silently
            print(f'{cid}_{name}.html下载失败,HTTP状态码{response.status}')
            return
        rt = await response.json()
        # rt['Data'] is the encrypted body; the JS function `b` returns a JSON string
        decrypted = execjs.compile(my_decrypt).call('b', rt['Data'])
        contents = json.loads(decrypted)['Contents']
        # Make sure the output directory exists on a first run
        os.makedirs('./content', exist_ok=True)
        async with aiofiles.open(target, 'w', encoding='utf-8') as af:
            await af.write(contents)
        print(f'{cid}_{name}.html下载完毕')
async def main(url, cookies, dic):
    """Download every announcement in ``dic`` concurrently over one session.

    Args:
        url: Endpoint passed to ``get_content``.
        cookies: Cookies passed to ``get_content``.
        dic: Maps each cid to an object whose ``result()`` returns the
            ``(headers, payload, name)`` sequence for that cid.
    """
    with open('my_decrypt.js', 'r', encoding='utf-8') as f:
        my_decrypt = f.read()

    async with aiohttp.ClientSession() as session:
        pending = []
        for cid in dic:
            prepared = dic[cid]
            pending.append(
                get_content(
                    url,
                    cookies,
                    my_decrypt,
                    cid,
                    prepared.result()[0],
                    prepared.result()[1],
                    prepared.result()[2],
                    session,
                )
            )
        await asyncio.gather(*pending)
def m(tradeinfo):
    """Download every announcement listed in one saved TradeInfo file.

    Reads ``./TradeInfo/{tradeinfo}``, extracts each entry's ``M_ID`` and
    ``NAME``, builds the signed request for each one and hands them to
    ``main`` for download.

    Args:
        tradeinfo: File name (not a path) of a JSON file inside ``./TradeInfo/``.
    """
    with open(f'./TradeInfo/{tradeinfo}', 'r', encoding='utf-8') as f:
        trade_data = json.load(f)
    cids = jsonpath.jsonpath(trade_data, '$.Table[*].M_ID')
    names = jsonpath.jsonpath(trade_data, '$.Table[*].NAME')
    # The jsonpath package returns False (not an empty list) when nothing
    # matches, which would make zip() raise TypeError below.
    if not cids or not names:
        print(f'{tradeinfo}中没有找到M_ID或NAME,已跳过')
        return

    url = 'https://ggzyfw.fj.gov.cn/FwPortalApi/Trade/TradeInfoContent'
    cookies = {
        'ASP.NET_SessionId': 'e1tkxmtb1lusrun4kzlcmf0m',
    }

    async def _run():
        # Sign the requests one by one, keeping the finished tasks because
        # main() reads each entry through .result().
        dic = {}
        for cid, name in zip(cids, names):
            task = asyncio.ensure_future(create_ctx_payload(portal_sign2, cid, name))
            await task
            dic[cid] = task
        await main(url, cookies, dic)

    # asyncio.run replaces get_event_loop()/run_until_complete(), which is
    # deprecated when no loop is running and fails on newer Python versions.
    asyncio.run(_run())
if __name__ == '__main__':
    # Process every saved TradeInfo file in turn, printing its name first
    for tradeinfo in os.listdir('./TradeInfo/'):
        print(tradeinfo)
        m(tradeinfo)
注意事项
在套用我的代码前,一定要先获取tradeinfo.json文件,然后将文件保存,之后修改函数m里面的
# Adjust the './TradeInfo/' directory below to wherever the tradeinfo JSON files were saved.
with open(f'./TradeInfo/{tradeinfo}', 'r', encoding='utf-8') as f:
    tradeinfo = json.loads(f.read())
的文件路径,不修改文件路径找不到正确的文件会出bug的
我这里附上我的项目里面用到的文件,你们可以参考来制作
content装招标公告的内容
tradeinfo装tradeinfo.json内容
my_decrypt.js装解密函数
portal_sign1.js装tradeinfo内容获取的portal-sign的函数和t变量
portal_sign2.js装content内容获取的portal-sign的函数和t变量
获取前100页所有TradeInfo.py装完整的获取tradeinfo内容的代码
获取一篇内容.py装完整的批量获取content内容的代码