C#自动登录网页浏览页面 抓取数据

需求:客户的数据同时存在于另外一个不可控的系统中,需要和当前系统保持同步。

思路:自动登录另外一个系统,然后抓取数据,同步到本系统中。

技术点:模拟用户登录;保存登录状态;抓取数据

 

程序非常简单

     /// <summary>

        /// visit the target url

        /// </summary>

        /// <param name="targetURL"></param>

        /// <param name="cc"> this is for keeping cookies and sessions </param>

        /// <param name="param"> this is the data need post inside form </param>

        /// <returns> html page </returns>

        public static string PostAndGetHTML(string targetURL,CookieContainer cc, Hashtable param)

        {

            //prepare the submit data

            string formData = "" ;

            foreach (DictionaryEntry de in param)

            {

                formData += de.Key.ToString() + "=" + de.Value.ToString()+"&" ;

            }

            if (formData.Length>0)

               formData = formData.Substring(0, formData.Length - 1); //remove last '&'

 

            ASCIIEncoding encoding = new ASCIIEncoding ();

            byte [] data = encoding.GetBytes(formData);

 

            HttpWebRequest request = (HttpWebRequest )WebRequest .Create(targetURL);

            request.Method = "POST" ;    //post

             request.ContentType = "application/x-www-form-urlencoded" ;

            request.ContentLength = data.Length;

            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.1124)" ;

            

            Stream newStream = request.GetRequestStream();

            newStream.Write(data, 0, data.Length);

 

            newStream.Close();

 

            request.CookieContainer = cc;

            HttpWebResponse response = (HttpWebResponse )request.GetResponse();

             cc.Add(response.Cookies);

            Stream stream = response.GetResponseStream();

            string result = new StreamReader (stream, System.Text.Encoding .Default).ReadToEnd();

            return result;

        }

下面是一个调用的例子:先登录,再查询。实际应用中这个流程可能包含很多步骤。

       private void button2_Click(object sender, EventArgs e)

        {

            CookieContainer cc = new CookieContainer ();//this is for keep the Session and Cookie

            Hashtable param = new Hashtable ();//this is for keep post data.

 

             string urlLogin = "http://demo.server//login.asp" ;

             //do find the elementId that needed. check the source of login page can get this information

            param.Add("User" , "xxx" );

            param.Add("Password" , "xxxx" );

            string result = PostAndGetHTML(urlLogin, cc, param);

            //check result, whether login success

          

            //if login success, goto the target url, and input some value.

            string url2 = " http://demo.server/query.asp?id=1" ;// need change. special logic

            param.Clear();

            //param.Add("SearchAreaId","JobId")

            result = PostAndGetHTML(url2, cc, new Hashtable ());

            //ConvertToDT the html or do something others

 

 

        }
      private DataTable ConvertToDT(DataTable dt, string tableHTML)

        {

 

            int lastTD = tableHTML.ToLower().LastIndexOf("</td>" );

            int firstRow = tableHTML.ToLower().IndexOf("<tr" ) + 3;//after ""<tr

            int index = tableHTML.ToLower().IndexOf("<tr" , firstRow) + 3;//after ""<tr

            while (index < lastTD)

            {

                DataRow dr = dt.NewRow();

                for (int i = 0; i < dt.Columns.Count; i++)

                 {

                    string value = "" ;

                    int startTD = tableHTML.ToLower().IndexOf("<td" , index) + 3;//after "<td"

                    int endTD = tableHTML.ToLower().IndexOf("</td>" , startTD);

                    if (endTD < 0)

                        break ;

                    string tdStr = tableHTML.Substring(startTD, endTD - startTD);

                   

                    //remove <> and others

                    tdStr = tdStr.Replace("&nbsp;" , "" ).Replace("/t" , "" ).Replace("/r" , "" );

                    string [] v = tdStr.Split('<' , '>' );

                    for (int j = 0; j < v.Length; j++)

                    {

                        j++;

                        if (v[j].Trim() != "" )

                        {

                            value = v[j].Trim();

                            break ;

                        }  

                    }

                    //

                    dr[i] = value;

                    index = endTD;

                }

                 dt.Rows.Add(dr);

 

            }

            return dt;

        }

注:对于登录时需要输入验证码的系统,此方法无效。(如果该系统把验证码存放在 cookie 中则例外,这种情况比较容易破解。)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值