# 本想直接用 json.loads() 解析,但网页返回的 json 试了几种方法都无法转换格式,原因暂不清楚。另外,得到的数据目前也不太会存储,等找到更好的方法后再更新。
# -*- coding: utf-8 -*-
from time import sleep
from pyquery import PyQuery as pq
import re,requests,argparse,csv
import pandas as pd
import numpy as np
# Request headers sent with every page fetch.
# NOTE(review): the Cookie value looks like it was captured from a browser
# session and may expire — confirm before relying on it.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Cookie': 'UM_distinctid=170b8aa056f144-074c1185d12ebb-4313f6a-e1000-170b8aa05703b4; ASL=18329,0000z,2404b15f; ADVC=38432d7145211e; vjuids=9096406ec.170b8aa0ade.0.1eb47072712f5; vjlast=1583646117.1583646117.30; hexunATC=hexun,1676,19791,19947,1583646189348,https%3A%2F%2Fh01hxsame.hexun.com%2Fs%3Fz%3Dhexun%26c%3D1676%26op%3D1; cn_1263247791_dplus=%7B%22distinct_id%22%3A%20%22170b8aa056f144-074c1185d12ebb-4313f6a-e1000-170b8aa05703b4%22%2C%22userFirstDate%22%3A%20%2220200308%22%2C%22userID%22%3A%20%22%22%2C%22userName%22%3A%20%22%22%2C%22userType%22%3A%20%22nologinuser%22%2C%22userLoginDate%22%3A%20%2220200308%22%7D; __jsluid_h=642d73b511ae9253ac0724519b48614d; Hm_lvt_cb1b8b99a89c43761f616e8565c9107f=1583646202; HexunTrack=SID=20200308134154013aab19d532e4540798c07f2d877a151a7&CITY=34&TOWN=340100',
}

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the whole crawl indefinitely.
_REQUEST_TIMEOUT = 30

# (CSV file stem, pattern) pairs, in the order the files are written.
# Each pattern captures the text between two adjacent keys of the response.
# The column meanings below come from the original author's notes; the
# mapping from response keys to columns has not been verified here.
_FIELD_PATTERNS = (
    # stock name / code
    ('name', re.compile(r'industry:"(.*?)",stockNumber')),
    # total score
    ('gen_score', re.compile(r'industryrate:"(.*?)",Pricelimit')),
    # shareholder responsibility
    ('Shareholder_liability', re.compile(r'stockNumber:"(.*?)",industryrate')),
    # grade
    ('rink', re.compile(r'Pricelimit:"(.*?)",lootingchips')),
    # employee responsibility
    ('Employees_liability', re.compile(r'lootingchips:"(.*?)",Scramble')),
    # supplier, customer and consumer rights responsibility
    ('Consumer_rights_liability', re.compile(r'Scramble:"(.*?)",rscramble')),
    # environmental responsibility
    ('environment_liability', re.compile(r'rscramble:"(.*?)",Strongstock')),
    # social responsibility
    ('social_liability', re.compile(r'Strongstock:"(.*?)",Hstock')),
)


def _append_column(stem, values):
    """Append every value, one per line, to ``<stem>.csv`` in the working directory.

    The file is created if it does not exist (even when ``values`` is empty)
    and is closed by the ``with`` block even if a write fails.
    """
    with open('{}.csv'.format(stem), 'a+', encoding='UTF-8', newline='') as out:
        out.writelines('{}\n'.format(value) for value in values)


def getdate(url):
    """Fetch one result page and append each extracted column to its own CSV.

    Args:
        url: Address of the page to download.

    Returns:
        None. The results are appended to ``name.csv``, ``gen_score.csv``,
        ``Shareholder_liability.csv``, ``rink.csv``, ``Employees_liability.csv``,
        ``Consumer_rights_liability.csv``, ``environment_liability.csv`` and
        ``social_liability.csv``; the generated files are meant to be opened
        and merged in Excel afterwards.

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            answer within ``_REQUEST_TIMEOUT`` seconds.
    """
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    # Normalise single quotes to double quotes so the patterns, which are
    # written against double-quoted values, match the response text.
    text = response.text.replace("'", '"')
    for stem, pattern in _FIELD_PATTERNS:
        _append_column(stem, pattern.findall(text))
def main(year=2018, first_page=1, last_page=186, delay=1):
    """Download a range of result pages for one report year.

    Called without arguments it fetches pages 1 through 186 of the report
    dated 2018-12-31, pausing one second after each page.

    Args:
        year: Report year; the request asks for the ``<year>-12-31`` report.
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
        delay: Seconds to sleep after each page.
    """
    url_template = (
        "http://stockdata.stock.hexun.com/zrbg/data/zrbList.aspx"
        "?date={year}-12-31&count=20&pname=20&titType=null"
        "&page={page}&callback=hxbase_json11583922219938"
    )
    for page in range(first_page, last_page + 1):
        getdate(url_template.format(year=year, page=page))
        sleep(delay)
# Run the crawl only when executed as a script, so that importing this
# module does not start downloading pages as a side effect.
if __name__ == "__main__":
    main()