# -*- coding: utf-8 -*-
"""Download investor Q&A records from irm.cninfo.com.cn for a list of codes.

Codes are read from ``./code.txt`` (one per line). For each code the script
looks up the site's ``secid``, pages through the search results and appends
every answered question to ``./hudongyi_3.txt`` as one tab-separated line.
Codes whose search reports no pages are appended to ``./notFindTxt_3.txt``.
"""
# NOTE: sys, random, datetime, randint and etree are not used by the code
# below; they are kept so that nothing relying on them elsewhere breaks.
import datetime
import json
import random
import sys
import time
from random import randint

import jsonpath
import requests
from lxml import etree

headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    # Optional headers, left disabled:
    # "Cookie": "ctx=/ircs; route=aa693231308cb485ea2c8ceae74e533b; JSESSIONID=992c806f-57a6-46ec-9987-e9d9df48c88c",
    "Host": "irm.cninfo.com.cn",
    # "Referer": "http://irm.cninfo.com.cn/ircs/search?keyword=000001",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4021.2 Safari/537.36"
}

# Seconds to wait for the server before giving up on a single request.
# Without a timeout, one stalled connection would block the whole run.
REQUEST_TIMEOUT = 30

# Translation table that deletes tabs and line breaks, so a text field can
# never split the tab-separated, one-record-per-line output format.
_STRIP_CONTROL = str.maketrans('', '', '\t\r\n')


def _load_json(text):
    """Decode *text* as JSON; return None for empty or malformed input."""
    if not text:
        return None
    try:
        return json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None


def _clean_text(text):
    """Return *text* without tabs/CR/LF; None is treated as an empty string."""
    return (text or '').translate(_STRIP_CONTROL)


def _format_millis(value):
    """Format a millisecond timestamp as local 'YYYY-MM-DD HH:MM:SS'.

    Returns an empty string when *value* is missing or not numeric.
    """
    try:
        seconds = int(value) / 1000
    except (TypeError, ValueError):
        return ''
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(seconds))


def __request(url):
    """GET *url* with the shared headers and return the response body text.

    Returns an empty string when the request raises (connection error,
    timeout, ...) or when the HTTP status is not 200.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('请求异常:{}'.format(exc))
        return ''
    if response.status_code != 200:
        print('请求失败,状态码:{}'.format(response.status_code))
        return ''
    print('页面请求成功!')
    return response.text


def parseHtml(code, response):
    """Turn one page of search results into tab-separated output lines.

    Args:
        code: the code being processed; written as the first column.
        response: JSON text of a search-result page (may be empty).

    Returns:
        A list of strings, one per entry that has a non-empty
        ``attachedContent``, each formatted as
        ``code \\t pubDate \\t attachedPubDate \\t mainContent \\t attachedContent \\n``.
        Empty or malformed input yields an empty list.
    """
    resultList = []
    payload = _load_json(response)
    if payload is None:
        return resultList
    # jsonpath.jsonpath returns False (not an empty list) when nothing matches.
    results = jsonpath.jsonpath(payload, "$.results[*]") or []
    for result in results:
        attachedContent = result.get('attachedContent')
        if not attachedContent:
            # Entries without attached content are not exported.
            continue
        fields = [
            code,
            _format_millis(result.get('pubDate')),
            _format_millis(result.get('attachedPubDate')),
            _clean_text(result.get('mainContent')),
            _clean_text(attachedContent),
        ]
        resultStr = '\t'.join(fields) + '\n'
        resultList.append(resultStr)
        print(resultStr)
    return resultList


def writeTotxt(resultList, path):
    """Append every string in *resultList* to the UTF-8 text file at *path*."""
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines(resultList)


def readTxt():
    """Read ``./code.txt`` and return its non-blank lines, stripped."""
    with open(r'./code.txt', encoding='utf-8') as f:
        stripped = (row.strip() for row in f)
        # Blank lines would otherwise become empty codes and waste a lookup.
        return [row for row in stripped if row]


def getPageNum(response):
    """Return the ``totalPage`` value from a search-result JSON text.

    Returns 0 when the text is empty, is not valid JSON, or has no
    ``totalPage`` key, so callers can treat the result as "no pages".
    """
    payload = _load_json(response)
    if payload is None:
        return 0
    matches = jsonpath.jsonpath(payload, '$.totalPage')
    return matches[0] if matches else 0


def getSecid(code):
    """Look up the site's ``secid`` for *code*.

    Returns the first ``secid`` reported by the keyboard-info endpoint, or
    an empty string when the request fails or no ``secid`` is present.
    """
    url = 'http://irm.cninfo.com.cn/ircs/index/queryKeyboardInfo'
    data = {'keyWord': code}
    try:
        # The keyword travels in the query string of the POST, as before.
        response = requests.post(url, headers=headers, params=data,
                                 timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('CODE: {}, 获取secid请求失败'.format(code))
        return ''
    if response.status_code != 200:
        print('CODE: {}, 获取secid请求失败'.format(code))
        return ''
    payload = _load_json(response.text)
    pre_secid = jsonpath.jsonpath(payload, '$.data[*].secid') if payload is not None else False
    if pre_secid:
        return pre_secid[0]
    print('CODE: {}, secid未找到'.format(code))
    return ''


def run():
    """Process every code from ``./code.txt`` and write the output files."""
    codeList = readTxt()
    notFindList = []
    # The search window (2015-01-01 .. 2021-06-19) is fixed in the URL.
    base_url = 'http://irm.cninfo.com.cn/ircs/search/searchResult?stockCodes={secid}_{code}&keywords=&infoTypes=1%2C11&startDate=2015-01-01+00%3A00%3A00&endDate=2021-06-19+23%3A59%3A59&onlyAttentionCompany=2&pageNum={pageNum}&pageSize=10'
    for code in codeList:
        print('-----------CODE:{}-----------'.format(code))
        secid = getSecid(code)
        if not secid:
            continue
        first_url = base_url.format(secid=secid, code=code, pageNum='1')
        first_response = __request(first_url)
        pageNum = getPageNum(first_response)
        print('该代码共有{}页数据'.format(pageNum))
        if pageNum:
            for page in range(1, int(pageNum) + 1):
                print('CODE:{}, 第{}页数据'.format(code, str(page)))
                if page == 1:
                    # Page 1 was already downloaded to learn the page count.
                    response = first_response
                else:
                    url = base_url.format(secid=secid, code=code, pageNum=str(page))
                    response = __request(url)
                resultList = parseHtml(code, response)
                writeTotxt(resultList, r'./hudongyi_3.txt')
                time.sleep(2)  # pause between pages
        else:
            notFindList.append(code + '\n')
            print('该代码未搜索到:{}'.format(code))
        time.sleep(3)  # pause between codes
    writeTotxt(notFindList, r'./notFindTxt_3.txt')
    print('搜索完毕')


if __name__ == '__main__':
    run()
# hudongyi
# (source page footer) Latest recommended article published 2024-11-16 19:19:32