C# 关于爬取网站数据遇到csrf-token的分析与解决

需求
某航空公司物流单信息查询,是一个post请求。通过后台模拟POST HTTP请求发现无法获取页面数据,通过查看航空公司网站后,发现网站使用了防御CSRF攻击的机制,未携带正确token的请求会直接返回40X错误。

关于CSRF
读者自行百度

网站HTTP请求分析
Headers

Form Data

在head里包含了cookie 与 x-csrf-token,formdata 里包含了_csrf (与head里的x-csrf-token值是一样的)。

这里通过查看该网站的JS源代码发现_csrf 来自于网页的head标签里

猜测cookie与 x-csrf-token是有一定的有效期,并且他们共同作用来防御CSRF攻击。

解决方案
1,首先请求一下该航空公司的网站,获取cookie与_csrf

2,然后C# 模拟http分别在head和formdata里加入如上参数,发起请求

代码

复制代码
/// <summary>
/// Requests a page once and captures what is needed to replay a CSRF-protected
/// request against the same site: the cookie value handed back by the helper and
/// the <c>content</c> value of every <c>&lt;meta name="_csrf..."&gt;</c> tag in the HTML.
/// </summary>
/// <remarks>
/// Construction is best-effort: if the URL is blank or the page cannot be fetched or
/// parsed, the instance is still created, the token list is empty and the failure is
/// written to <see cref="System.Diagnostics.Trace"/>.
/// </remarks>
public class CSRFToken
{
    // Matches exactly one <meta name="_csrf..." content="..."/> tag and captures its
    // content value. The character classes stop at the closing quote so that several
    // meta tags on the same line are matched one by one instead of being swallowed by
    // a single greedy match.
    private static readonly Regex CsrfMetaRegex =
        new Regex(@"<meta\s+name=""_csrf[^""]*""\s+content=""([^""]*)""\s*/?>", RegexOptions.None);

    // Cookie value to send back to the site on subsequent requests.
    string cookie;

    // Content values of the _csrf* meta tags, in document order. Never null.
    List<string> csrfs = new List<string>();

    /// <summary>
    /// Fetches <paramref name="url"/> and extracts the cookie and CSRF meta values.
    /// </summary>
    /// <param name="url">Address of the page that carries the CSRF meta tags. A null,
    /// empty or whitespace-only value results in an instance with no cookie and no tokens.</param>
    public CSRFToken(string url)
    {
        // Nothing to fetch without a usable URL.
        if (string.IsNullOrWhiteSpace(url))
        {
            return;
        }

        try
        {
            // NOTE(review): HttpHelper is declared elsewhere; CreateGetHttpResponseForPC is
            // presumably a GET that returns the page HTML and the Set-Cookie value - confirm.
            var http = new HttpHelper(url);
            string responseCookie;
            string html = http.CreateGetHttpResponseForPC(out responseCookie);
            this.cookie = responseCookie;

            // The helper may hand back no HTML at all; there is nothing to scan in that case.
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            foreach (Match tag in CsrfMetaRegex.Matches(html))
            {
                csrfs.Add(tag.Groups[1].Value);
            }
        }
        catch (Exception e)
        {
            // Stay best-effort (callers get an empty token list) but leave a trace
            // instead of hiding the failure completely.
            System.Diagnostics.Trace.TraceWarning("CSRFToken: failed to load CSRF data from {0}: {1}", url, e);
        }
    }

    /// <summary>Returns the cookie value captured from the page request; may be null.</summary>
    public String getCookie()
    {
        return cookie;
    }

    /// <summary>Overrides the stored cookie value.</summary>
    /// <param name="cookie">Cookie value to store.</param>
    public void setCookie(String cookie)
    {
        this.cookie = cookie;
    }

    /// <summary>Returns the extracted CSRF meta values; empty when none were found.</summary>
    public List<string> getCsrf_token()
    {
        return csrfs;
    }
}

复制代码
httpHelper

复制代码
/// <summary>
/// Sends an HTTP POST with a form-encoded body to <c>_baseIPAddress</c> and returns
/// the response body decoded as UTF-8.
/// </summary>
/// <param name="headers">Extra request headers (for example <c>X-CSRF-TOKEN</c> and
/// <c>Cookie</c>); may be null or empty.</param>
/// <param name="parameters">Form fields to send as the request body; may be null or
/// empty, in which case no body is written. Keys and values are URL-encoded here, so
/// callers must pass them unencoded.</param>
/// <returns>The response body, or null when obtaining the response raises a
/// <see cref="WebException"/>.</returns>
/// <remarks>
/// A <see cref="WebException"/> raised while writing the request body is not caught here.
/// </remarks>
public string CreatePostHttpResponse(IDictionary<string, string> headers, IDictionary<string, string> parameters)
{
    // Process-wide TLS settings are applied before the request is created so that
    // they are in place for the connection this request opens.
    ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11;

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(_baseIPAddress);
    request.ProtocolVersion = HttpVersion.Version10;
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.UserAgent = DefaultUserAgent;

    // Site-specific headers such as X-CSRF-TOKEN and Cookie are supplied by the caller.
    if (headers != null && headers.Count > 0)
    {
        foreach (KeyValuePair<string, string> header in headers)
        {
            request.Headers.Add(header.Key, header.Value);
        }
    }

    // Write the form body only when there is something to post.
    if (parameters != null && parameters.Count > 0)
    {
        StringBuilder form = new StringBuilder();
        foreach (KeyValuePair<string, string> field in parameters)
        {
            if (form.Length > 0)
            {
                form.Append('&');
            }

            // Encode both sides so that '&', '=', spaces and non-ASCII text in a
            // key or value cannot corrupt the form body.
            form.Append(WebUtility.UrlEncode(field.Key));
            form.Append('=');
            form.Append(WebUtility.UrlEncode(field.Value));
        }

        byte[] data = Encoding.UTF8.GetBytes(form.ToString());
        using (Stream stream = request.GetRequestStream())
        {
            stream.Write(data, 0, data.Length);
        }
    }

    try
    {
        // The using blocks release the connection even if reading the body fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
    catch (WebException ex)
    {
        // Error responses (4xx/5xx) still hold a connection; close it before giving up.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
        return null;
    }
}

/// <summary>
/// Sends an HTTP GET to <c>_baseIPAddress</c> and returns the response body decoded
/// as UTF-8 together with the raw <c>Set-Cookie</c> response header.
/// </summary>
/// <param name="cookie">Receives the value of the <c>Set-Cookie</c> response header
/// (null when the server sent none), or an empty string when the request raises a
/// <see cref="WebException"/>.</param>
/// <returns>The response body, or null when the request raises a
/// <see cref="WebException"/>.</returns>
public string CreateGetHttpResponse(out string cookie)
{
    // Process-wide TLS settings are applied before the request is created so that
    // they are in place for the connection this request opens.
    ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11;

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(_baseIPAddress);
    request.ProtocolVersion = HttpVersion.Version10;
    request.Method = "GET";
    request.ContentType = "application/x-www-form-urlencoded";
    request.UserAgent = DefaultUserAgent;

    try
    {
        // The using blocks release the connection even if reading the body fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // Capture the cookie header before the body is consumed.
            cookie = response.Headers["Set-Cookie"];

            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (WebException ex)
    {
        // Error responses (4xx/5xx) still hold a connection; close it before giving up.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
        cookie = "";
        return null;
    }
}

复制代码
爬取程序

爬取结果

浏览器结果

注意事项与结论
1,不同的网站,获取csrf的方式不一样,无论怎么做,只要信息传到前台我们都可以有相应的方法来获取。

2,请求时候的http验证可能不一样,测试某航空公司物流信息的时候,http请求的安全协议是TLS 1.2。

ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11; 还有其他参数比如UserAgent后台可能也会验证

3,基于如上航空公司,发现它的cookie和csrf_token一定时间内不会改变,那么当实际爬取的时候可以考虑缓存cookie以及csrf_token,只有当请求失败的时候,才重新获取
深圳网站建设 https://www.sz886.com/

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值