GitHub地址:TEST/HttpWebRequest at master · yangwohenmai/TEST · GitHub
爬取港交所数据最大的问题是如何获取港交所页面的Token,有了Token之后就可以从港交所接口请求数据了。
下面这段C#代码首先请求并解析港交所页面,从页面中获取港交所Token值,然后带着Token请求数据接口。接口返回的数据格式类似于Json(外面包了一层JSONP回调,形如 0({...})),需要先去掉这层包装,就可以用Json解析了。
完整的json数据如下(最外层的 {{ 和 }} 是调试器显示JObject时附带的,实际的JSON只有一层花括号):
{{
"data": {
"responsecode": "000",
"responsemsg": "",
"quote": {
"hi": "74.350",
"rs_stock_flag": false,
"fiscal_year_end": "31 Dec 2018",
"hist_closedate": "30 May 2019",
"replication_method": null,
"amt_os": "3,856,240,500",
"primaryexch": "HKEX",
"ric": "0001.HK",
"product_subtype": null,
"db_updatetime": "31 May 2019 09:36",
"mkt_cap_u": "B",
"am_u": "M",
"ew_sub_right": "",
"secondary_listing": false,
"ew_amt_os_cur": null,
"ccy": "HKD",
"management_fee": "",
"ew_underlying_code": null,
"trdstatus": "N",
"nav": "",
"original_offer_price": "",
"issue": "",
"asset_class": null,
"eps": 10.1109,
"inline_upper_strike_price": "",
"sedol": "BW9P816",
"am": "697.27",
"iv": "",
"ew_strike": "",
"as": "74.100",
"geographic_focus": null,
"incorpin": "Cayman Islands",
"etp_baseCur": null,
"ew_amt_os": "",
"bd": "74.050",
"registrar": "Computershare Hong Kong Investor Services Ltd.",
"depositary": null,
"exotic_type": null,
"callput_indicator": null,
"primary_market": null,
"underlying_index": null,
"lot": "500",
"lo52": "72.800",
"shares_issued_date": "30 Apr 2019",
"premium": "",
"strike_price_ccy": null,
"yield": "",
"vo_u": "M",
"base_currency": null,
"coupon": "",
"expiry_date": "",
"chairman": "Li Tzar Kuoi Victor",
"underlying_ric": "0001.HK",
"hi52": "92.500",
"issuer_name": "CK Hutchison Holdings Ltd.",
"h_share_flag": false,
"ew_sub_per_from": "",
"div_yield": "4.28",
"interest_payment_date": "-",
"updatetime": "31 May 2019 16:08",
"aum_date": "",
"lo": "73.050",
"mkt_cap": "285.55",
"f_aum_hkd": null,
"ew_sub_per_to": "",
"ls": "74.050",
"nav_date": "",
"csic_classification": null,
"floating_flag": false,
"issued_shares_note": null,
"eff_gear": "",
"board_lot_nominal": "",
"hsic_ind_classification": "Conglomerates - Conglomerates",
"ew_desc": null,
"inception_date": "",
"nc": "+1.050",
"aum": "",
"vo": "9.41",
"secondary_listing_flag": false,
"listing_date": "1 Nov 1972",
"as_at_label": "as at",
"ew_amt_os_dat": "",
"nm": "CK Hutchison Holdings Ltd.",
"nm_s": "CKH HOLDINGS",
"sym": "1",
"inline_lower_strike_price": "",
"listing_category": "Primary Listing",
"ew_strike_cur": null,
"exotic_warrant_indicator": null,
"investment_focus": null,
"call_price": "",
"tck": "0.050",
"strike_price": "",
"summary": "CK Hutchison Holdings Limited is an investment holding company mainly engaged in the retail business. Along with subsidiaries, the Company operates its business through five segments: the Retail segment, the Telecommunications segment, the Infrastructure segment, the Ports and Related Services segment, and the Husky Energy segment. The Retail segment is involved in the manufacturing and sale of health and beauty products, as well as consumer electronics and electrical appliances. It also operates supermarkets, as well as manufactures and distributes bottled water and beverage products. The Telecommunications segment provides mobile telecommunications and data services by 3 Group Europe, Hutchison Telecommunications Hong Kong Holdings, and Hutchison Asia Telecommunications. The Infrastructure segment is involved in the energy infrastructure, transportation infrastructure, water infrastructure, waste management, waste-to-energy and infrastructure related businesses.",
"op": "73.050",
"aum_u": "",
"nav_ccy": null,
"os": "",
"wnt_gear": "",
"transfer_of_listing_date": "",
"hsic_sub_sector_classification": "Conglomerates",
"amt_ccy": null,
"domicile_country": null,
"entitlement_ratio": "",
"product_type": "EQTY",
"office_address": "48th Floor<br/>Cheung Kong Center<br/>2 Queen's Road Central<br/>Hong Kong",
"pc": "+1.44",
"days_to_expiry": null,
"underlying_code": null,
"pe": "7.32",
"eps_ccy": "HKD",
"hdr": false,
"launch_date": "",
"hc": "73.000",
"isin": "KYG217651051",
"moneyness": ""
}
},
"qid": "NULL"
}}
程序里我随便挑了几个字段输出出来
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Text;
namespace HttpWebRequestTest
{
/// <summary>
/// Demo entry point: downloads the HKEX landing page, extracts the access token embedded in it,
/// calls the equity-quote widget endpoint for stock code 1 and prints a few fields of the result.
/// </summary>
class Program
{
    // Text the token starts with in the landing page markup, as observed by the author.
    // NOTE(review): this is scraped from third-party HTML and may change without notice.
    const string TokenMarker = "evLtsLs";

    static void Main(string[] args)
    {
        var hp = new HttpRequestClient();

        // Fetch the landing page. httpGet returns the exception message instead of throwing
        // when the request fails, so the result must be validated before it is sliced.
        string reslut = hp.httpGet("https://www.hkex.com.hk/?sc_lang=EN", HttpRequestClient.defaultHeaders);

        string Token = ExtractToken(reslut);
        if (Token == null)
        {
            Console.WriteLine("Could not locate the HKEX token in the page response.");
            Console.ReadLine();
            return;
        }

        // Build the quote URL. The token is inserted as-is, exactly like the original code did.
        string link = string.Format("https://www1.hkex.com.hk/hkexwidget/data/getequityquote?sym=1&token={0}&lang=eng&qid=NULL&callback=0", Token);

        // Request the quote data from the HKEX widget endpoint.
        string data = hp.httpGet(link, HttpRequestClient.defaultHeaders);

        // The endpoint answers with JSONP ("0({...})"); strip the callback wrapper first.
        string json = UnwrapJsonp(data);
        if (json == null)
        {
            Console.WriteLine("Unexpected response from the HKEX quote endpoint: " + data);
            Console.ReadLine();
            return;
        }

        JObject JsonData;
        try
        {
            JsonData = JsonConvert.DeserializeObject<JObject>(json);
        }
        catch (JsonException ex)
        {
            Console.WriteLine("Could not parse the HKEX quote response: " + ex.Message);
            Console.ReadLine();
            return;
        }

        // "data.quote" is absent when the endpoint reports an error, so look it up defensively.
        JToken quote = JsonData == null ? null : JsonData.SelectToken("data.quote");
        if (quote == null || quote.Type != JTokenType.Object)
        {
            Console.WriteLine("The HKEX quote response contains no quote object.");
            Console.ReadLine();
            return;
        }

        Console.WriteLine("hi:" + quote["hi"]);
        Console.WriteLine("fiscal_year_end:" + quote["fiscal_year_end"]);
        Console.WriteLine("amt_os:" + quote["amt_os"]);
        Console.WriteLine("primaryexch:" + quote["primaryexch"]);
        Console.WriteLine("db_updatetime:" + quote["db_updatetime"]);
        Console.WriteLine("ric:" + quote["ric"]);
        Console.WriteLine("eps:" + quote["eps"]);
        Console.ReadLine();
    }

    /// <summary>
    /// Returns the text that starts at <see cref="TokenMarker"/> and ends just before the next
    /// double quote, or null when either delimiter is missing.
    /// </summary>
    static string ExtractToken(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }
        int head = html.IndexOf(TokenMarker, StringComparison.Ordinal);
        if (head < 0)
        {
            return null;
        }
        int tail = html.IndexOf('"', head);
        if (tail < 0)
        {
            return null;
        }
        return html.Substring(head, tail - head);
    }

    /// <summary>
    /// Returns the text between the first '(' and the last ')' of a JSONP payload,
    /// or null when the payload is not wrapped that way.
    /// </summary>
    static string UnwrapJsonp(string payload)
    {
        if (string.IsNullOrEmpty(payload))
        {
            return null;
        }
        int open = payload.IndexOf('(');
        int close = payload.LastIndexOf(')');
        if (open < 0 || close <= open)
        {
            return null;
        }
        return payload.Substring(open + 1, close - open - 1);
    }
}
// zetee
/// <summary>
/// Small synchronous HTTP helper built on <see cref="HttpWebRequest"/>. Request headers are passed
/// as a raw text block with one "Name:Value" pair per line (for example copied from a browser).
/// </summary>
/// <remarks>
/// Host, Connection, User-Agent, Referer, Range, Content-Type, Content-Length, Expect,
/// Proxy-Connection and If-Modified-Since cannot be added through the Headers collection;
/// they have to be set through the corresponding HttpWebRequest properties instead.
/// </remarks>
public class HttpRequestClient
{
    // Header names that must not go through request.Headers.Add. HTTP header names are
    // case-insensitive, so the lookup is too.
    static readonly HashSet<String> UNCHANGEHEADS = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        "Host",
        "Connection",
        "User-Agent",
        "Referer",
        "Range",
        "Content-Type",
        "Content-Length",
        "Expect",
        "Proxy-Connection",
        "If-Modified-Since",
        "Keep-alive",
        "Accept"
    };

    static HttpRequestClient()
    {
        ServicePointManager.DefaultConnectionLimit = 1000; // maximum number of concurrent connections
    }

    /// <summary>
    /// Default request headers.
    /// </summary>
    public static string defaultHeaders = @"Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip, deflate, sdch
Accept-Language:zh-CN,zh;q=0.8
Cache-Control:no-cache
Connection:keep-alive
Pragma:no-cache
Upgrade-Insecure-Requests:1
User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";

    /// <summary>
    /// Whether cookies are remembered between requests.
    /// </summary>
    readonly bool isTrackCookies = false;

    /// <summary>
    /// Remembered cookies, keyed by cookie name.
    /// </summary>
    readonly Dictionary<String, Cookie> cookieDic = new Dictionary<string, Cookie>();

    /// <summary>
    /// Backing field for <see cref="AvgResponseMilliseconds"/>; -1 means no sample has been recorded yet.
    /// </summary>
    long avgResponseMilliseconds = -1;

    /// <summary>
    /// Running average response time in milliseconds. Assigning a value records a new sample:
    /// the first sample is stored as-is, later samples are averaged with the current value.
    /// </summary>
    public long AvgResponseMilliseconds
    {
        get
        {
            return avgResponseMilliseconds;
        }
        set
        {
            if (avgResponseMilliseconds != -1)
            {
                // Parenthesised on purpose: "value + avg / 2" would only halve the old average.
                avgResponseMilliseconds = (value + avgResponseMilliseconds) / 2;
            }
            else
            {
                avgResponseMilliseconds = value;
            }
        }
    }

    /// <summary>
    /// Creates a client.
    /// </summary>
    /// <param name="isTrackCookies">true to remember cookies and send them on later requests.</param>
    public HttpRequestClient(bool isTrackCookies = false)
    {
        this.isTrackCookies = isTrackCookies;
    }

    /// <summary>
    /// Sends an HTTP request and returns the response body as text.
    /// </summary>
    /// <param name="url">Request URL.</param>
    /// <param name="method">HTTP method, e.g. POST or GET.</param>
    /// <param name="headers">Raw header block, one "Name:Value" per line.</param>
    /// <param name="content">Request body; every key and value must already be URL-encoded.</param>
    /// <param name="contentEncode">Encoding of the request body; UTF-8 when null.</param>
    /// <param name="proxyUrl">Proxy address, or null/blank for none.</param>
    /// <returns>
    /// The response body. When obtaining the response throws, the exception message is returned
    /// instead of a body (existing contract kept for callers).
    /// </returns>
    public string http(string url, string method, string headers, string content, Encoding contentEncode, string proxyUrl)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = method;
        if (method.Equals("GET", StringComparison.InvariantCultureIgnoreCase))
        {
            request.MaximumAutomaticRedirections = 100;
            request.AllowAutoRedirect = false;
        }
        fillHeaders(request, headers);
        fillProxy(request, proxyUrl);

        #region request body
        if (contentEncode == null)
        {
            contentEncode = Encoding.UTF8;
        }
        if (!string.IsNullOrWhiteSpace(content))
        {
            byte[] data = contentEncode.GetBytes(content);
            request.ContentLength = data.Length;
            using (Stream reqStream = request.GetRequestStream())
            {
                reqStream.Write(data, 0, data.Length);
            }
        }
        #endregion

        HttpWebResponse response = null;
        System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
        try
        {
            sw.Start();
            response = (HttpWebResponse)request.GetResponse();
            sw.Stop();
            AvgResponseMilliseconds = sw.ElapsedMilliseconds;
        }
        catch (Exception ex)
        {
            sw.Stop();
            AvgResponseMilliseconds = sw.ElapsedMilliseconds;
            // An error status surfaces as a WebException that still owns a response; release it.
            WebException webEx = ex as WebException;
            if (webEx != null && webEx.Response != null)
            {
                webEx.Response.Close();
            }
            return ex.Message;
        }

        try
        {
            // Cookie parsing happens outside the catch above so that a malformed cookie
            // can no longer replace a successfully received body with an error message.
            trackCookies(parseCookies(response.Headers[HttpResponseHeader.SetCookie], true));
            return getResponseBody(response);
        }
        finally
        {
            response.Close();
        }
    }

    /// <summary>
    /// Sends a POST request.
    /// </summary>
    /// <param name="url">Request URL.</param>
    /// <param name="headers">Raw header block, one "Name:Value" per line.</param>
    /// <param name="content">Request body.</param>
    /// <param name="contentEncode">Encoding of the request body; UTF-8 when null.</param>
    /// <param name="proxyUrl">Proxy address, or null for none.</param>
    /// <returns>See <see cref="http"/>.</returns>
    public string httpPost(string url, string headers, string content, Encoding contentEncode, string proxyUrl = null)
    {
        return http(url, "POST", headers, content, contentEncode, proxyUrl);
    }

    /// <summary>
    /// Sends a GET request.
    /// </summary>
    /// <param name="url">Request URL.</param>
    /// <param name="headers">Raw header block, one "Name:Value" per line.</param>
    /// <param name="content">Not used: no body is sent with a GET request.</param>
    /// <param name="proxyUrl">Proxy address, or null for none.</param>
    /// <returns>See <see cref="http"/>.</returns>
    public string httpGet(string url, string headers, string content = null, string proxyUrl = null)
    {
        return http(url, "GET", headers, null, null, proxyUrl);
    }

    /// <summary>
    /// Attaches a proxy to the request when a proxy address is given.
    /// </summary>
    /// <param name="request">Request to configure.</param>
    /// <param name="proxyUri">Proxy address, or null/blank for none.</param>
    private void fillProxy(HttpWebRequest request, string proxyUri)
    {
        if (!string.IsNullOrWhiteSpace(proxyUri))
        {
            WebProxy proxy = new WebProxy();
            proxy.Address = new Uri(proxyUri);
            request.Proxy = proxy;
        }
    }

    /// <summary>
    /// Splits a "name=value; name=value" string into cookies. Segments that do not contain
    /// exactly one '=' are ignored, as are names the Cookie class rejects.
    /// </summary>
    /// <param name="cookieString">Raw Cookie / Set-Cookie header value; may be null.</param>
    /// <param name="sanitizeNames">
    /// true to replace ',' in names with "|*|" (multiple Set-Cookie headers arrive joined by
    /// commas, and a comma is not allowed in a cookie name).
    /// </param>
    private static CookieCollection parseCookies(string cookieString, bool sanitizeNames)
    {
        CookieCollection parsed = new CookieCollection();
        if (string.IsNullOrWhiteSpace(cookieString))
        {
            return parsed;
        }
        foreach (string item in cookieString.Split(';'))
        {
            string[] kv = item.Split('=');
            if (kv.Length != 2)
            {
                continue;
            }
            string name = kv[0].Trim();
            if (sanitizeNames)
            {
                name = name.Replace(",", "|*|");
            }
            try
            {
                parsed.Add(new Cookie(name, kv[1].Trim()));
            }
            catch (CookieException)
            {
                // Best effort: one unusable cookie name must not abort the whole request.
            }
        }
        return parsed;
    }

    /// <summary>
    /// Remembers the given cookies (later values replace earlier ones with the same name).
    /// Does nothing when cookie tracking is off.
    /// </summary>
    /// <param name="cookies">Cookies to remember; may be null.</param>
    private void trackCookies(CookieCollection cookies)
    {
        if (!isTrackCookies)
        {
            return;
        }
        if (cookies == null)
        {
            return;
        }
        foreach (Cookie c in cookies)
        {
            cookieDic[c.Name] = c;
        }
    }

    /// <summary>
    /// Formats all remembered, non-expired cookies as a Cookie header value ("a=1; b=2").
    /// </summary>
    private string getCookieStr()
    {
        StringBuilder sb = new StringBuilder();
        foreach (KeyValuePair<string, Cookie> item in cookieDic)
        {
            if (item.Value.Expired)
            {
                continue;
            }
            if (sb.Length > 0)
            {
                sb.Append("; ");
            }
            // No spaces around '=': every pair is written the same way as the first one.
            sb.Append(item.Key).Append("=").Append(item.Value.Value);
        }
        return sb.ToString();
    }

    /// <summary>
    /// Applies a raw header block to the request. Restricted headers are routed to the matching
    /// HttpWebRequest property; restricted headers without a handled property (Range,
    /// Proxy-Connection, Keep-alive) are dropped. Finally the Cookie header is rewritten from
    /// the tracked cookies.
    /// </summary>
    /// <param name="request">Request to configure.</param>
    /// <param name="headers">Raw header block, one "Name:Value" per line.</param>
    /// <param name="isPrint">true to write the resulting request headers to the console.</param>
    private void fillHeaders(HttpWebRequest request, string headers, bool isPrint = false)
    {
        if (request == null)
        {
            return;
        }
        if (string.IsNullOrWhiteSpace(headers))
        {
            return;
        }

        // Accept CRLF as well as bare LF/CR: a verbatim string literal carries whatever
        // line endings the source file was saved with.
        string[] hsplit = headers.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string item in hsplit)
        {
            // Only the first ':' separates name and value; values such as URLs contain more.
            int colon = item.IndexOf(':');
            string key = (colon < 0 ? item : item.Substring(0, colon)).Trim();
            string value = colon < 0 ? string.Empty : item.Substring(colon + 1).Trim();
            if (key.Length == 0)
            {
                continue;
            }
            if (!UNCHANGEHEADS.Contains(key))
            {
                request.Headers.Add(key, value);
                continue;
            }

            #region restricted headers
            switch (key.ToLowerInvariant())
            {
                case "accept":
                    request.Accept = value;
                    break;
                case "host":
                    request.Host = value;
                    break;
                case "connection":
                    // Anything other than keep-alive turns persistent connections off.
                    request.KeepAlive = string.Equals(value, "keep-alive", StringComparison.OrdinalIgnoreCase);
                    break;
                case "content-type":
                    request.ContentType = value;
                    break;
                case "user-agent":
                    request.UserAgent = value;
                    break;
                case "referer":
                    request.Referer = value;
                    break;
                case "content-length":
                    request.ContentLength = Convert.ToInt64(value, System.Globalization.CultureInfo.InvariantCulture);
                    break;
                case "expect":
                    request.Expect = value;
                    break;
                case "if-modified-since":
                    // HTTP dates use English names; parse them independently of the OS culture.
                    request.IfModifiedSince = Convert.ToDateTime(value, System.Globalization.CultureInfo.InvariantCulture);
                    break;
                default:
                    break;
            }
            #endregion
        }

        // Fold any cookies supplied in the header block into the tracked set, then rewrite
        // the Cookie header from that set.
        trackCookies(parseCookies(request.Headers[HttpRequestHeader.Cookie], false));
        if (!isTrackCookies)
        {
            // NOTE(review): this clears cookies supplied by the caller when tracking is off;
            // kept as in the original - confirm that this is intended.
            request.Headers[HttpRequestHeader.Cookie] = "";
        }
        else
        {
            request.Headers[HttpRequestHeader.Cookie] = getCookieStr();
        }

        #region print headers
        if (isPrint)
        {
            foreach (string name in request.Headers.AllKeys)
            {
                System.Console.WriteLine(name + ":" + request.Headers[name]);
            }
        }
        #endregion
    }

    /// <summary>
    /// Writes all response headers to the console.
    /// </summary>
    /// <param name="response">Response to print; ignored when null.</param>
    private void printResponseHeaders(HttpWebResponse response)
    {
        if (response == null)
        {
            return;
        }
        foreach (string name in response.Headers.AllKeys)
        {
            System.Console.WriteLine(name + ":" + response.Headers[name]);
        }
    }

    /// <summary>
    /// Reads the response body as text, undoing gzip/deflate content encoding and using a
    /// Chinese code page when the Content-Type mentions one (UTF-8 otherwise).
    /// </summary>
    /// <param name="response">Response to read.</param>
    /// <returns>The decoded body text.</returns>
    private string getResponseBody(HttpWebResponse response)
    {
        Encoding bodyEncoding = Encoding.UTF8;
        string contentType = response.ContentType;
        if (contentType != null)
        {
            string loweredType = contentType.ToLowerInvariant();
            if (loweredType.Contains("gb2312"))
            {
                bodyEncoding = Encoding.GetEncoding("gb2312");
            }
            else if (loweredType.Contains("gbk"))
            {
                bodyEncoding = Encoding.GetEncoding("gbk");
            }
            else if (loweredType.Contains("zh-cn"))
            {
                // "zh-cn" is a language tag, not an encoding name; treat it as simplified Chinese.
                bodyEncoding = Encoding.GetEncoding("gb2312");
            }
        }

        string contentEncoding = (response.ContentEncoding ?? string.Empty).ToLowerInvariant();
        Stream body = response.GetResponseStream();
        if (contentEncoding.Contains("gzip"))
        {
            body = new GZipStream(body, CompressionMode.Decompress);
        }
        else if (contentEncoding.Contains("deflate"))
        {
            body = new DeflateStream(body, CompressionMode.Decompress);
        }

        // Every branch decodes with the detected encoding (the gzip path previously did not).
        using (body)
        {
            using (StreamReader reader = new StreamReader(body, bodyEncoding))
            {
                return reader.ReadToEnd();
            }
        }
    }

    /// <summary>
    /// URL-encodes a string after trimming tabs and surrounding whitespace.
    /// </summary>
    /// <param name="item">Text to encode.</param>
    /// <param name="code">Encoding used for the bytes; gb2312 when null.</param>
    public static string UrlEncode(string item, Encoding code)
    {
        // Falling back to gb2312 keeps the result unchanged for callers that passed null
        // while the parameter was still being ignored.
        Encoding effective = code ?? Encoding.GetEncoding("gb2312");
        return System.Web.HttpUtility.UrlEncode(item.Trim('\t').Trim(), effective);
    }

    /// <summary>
    /// URL-encodes a string using the gb2312 code page.
    /// </summary>
    public static string UrlEncodeByGB2312(string item)
    {
        return UrlEncode(item, Encoding.GetEncoding("gb2312"));
    }

    /// <summary>
    /// URL-encodes a string using UTF-8.
    /// </summary>
    public static string UrlEncodeByUTF8(string item)
    {
        return UrlEncode(item, Encoding.UTF8);
    }

    /// <summary>
    /// HTML-decodes a string after trimming tabs and surrounding whitespace.
    /// </summary>
    public static string HtmlDecode(string item)
    {
        return WebUtility.HtmlDecode(item.Trim('\t').Trim());
    }
}
}
输出结果如下:
hi:74.350
fiscal_year_end:31 Dec 2018
amt_os:3,856,240,500
primaryexch:HKEX
db_updatetime:31 May 2019 09:36
ric:0001.HK
eps:10.1109
这中间有个问题要说一下,文中我只是拿00001这个代码做了个例子,所以向接口发出一次请求,只返回了一个代码的数据。如果你想每天批量爬取港股所有的行情数据,首先你要建立一个港股所有股票的码表,通过遍历这个码表,把每个股票代码对应的数据取出来。
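取数据的核心请求链接是:https://www1.hkex.com.hk/hkexwidget/data/getequityquote?sym=1&token={Token}&lang=eng&qid=NULL&callback=0 (其中{Token}替换为从页面中取到的Token值)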
其中链接里sym=1这个地方就是对应的股票代码,这里股票代码是00001,在链接里要把前面的0都去掉。同理,如果你想获取00002这个股票代码的数据,那么链接里就要写sym=2
每次替换sym后面对应的数字,就能获取相应股票的行情数据。
python版代码: