网页分析及XML存储与显示

 为在校内网工作的女友写了一个工作量分析软件,花了一天的时间,终于从混乱的网页中提取到了最关键的数据部分.我自己感觉这跟BAIDU在做的工作有相似之处,都是网页分析,挺有意思的.原理应该和网络外挂差不多,其中被屏蔽的部分可以跨进程访问浏览器.为了方便自己统计,最终没有用这种方式,而是采用内嵌WebBrowser的方式,这样可以很方便地实现自动换页.有几点自我感觉值得记下:

1.再次认识了mshtml和webbrowser的Document的强大功能

2.实现了类到XML的直接转化,极大地方便了存储.(柯达一年的实习上机考试就有类的快速存储)

3.为泛型List<>实现不定长类数组的储存,值得以后借鉴.

public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
            webBrowser1 = new WebBrowser();
            List<Person> list = new List<Person>();
        }
        public static mshtml.IHTMLDocument2 doc;
        List<Person> list;
        private void button1_Click(object sender, EventArgs e)
        {
            richTextBox1.Clear();
            webBrowser1.Navigate("******");//为保密,略去网址
            webBrowser1.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(button2_Click);
           
           
            SHDocVw.ShellWindows shellWindows = new SHDocVw.ShellWindowsClass();
            string filename;
            foreach (SHDocVw.InternetExplorer ie in shellWindows)
            {
                filename = Path.GetFileNameWithoutExtension(ie.FullName).ToLower();
                if (filename.Equals("maxthon"))
                {

                    richTextBox1.AppendText( ie.LocationURL+"/n");
                    doc = ie.Document as mshtml.IHTMLDocument2;
                   
                    if (doc != null)
                        richTextBox1.AppendText(doc.body.outerHTML);
                    else
                        richTextBox1.AppendText("Error/r/n");

                }
            }
  

        }

        private void button2_Click(object sender, EventArgs e)
        {
            Analysis(FindKeydata(webBrowser1.Document.Body.OuterHtml));
            //DisplayXML("ccc.xml");

        }
        private string[] FindKeydata(string context)
        {
            string[] str = new string[100];
            int plen=0;//记录字符串数组中有用元素的个数
            for (int i = 0; i < context.Length; )
            {
                string keystr = "<P class=image><A href=/"";
                int begin = context.IndexOf(keystr, i, context.Length-i);
                if (begin == -1)
                {
                    str[plen] = "000";
                    break;
                }
                string valuedata = context.Substring(begin, 400);
                str[plen] = valuedata;
                plen++;
                i = begin+100;//每次推进100bytes,可更大,待测试
            }
            str[plen] = "000";//以000结尾
            return str;
        }
        private void Analysis(string[] context)
        {
            if (context.Length <= 0)
            {

                MessageBox.Show("空数据");
                return;
            }
            //==================================================
            //说明:
            //24为主页地址的起始位,长度为end-24
           
            //==================================================
            int count = 0;

            foreach (string xx in context)
            {
               
                if (xx == "000")
                {
                    break;
                }
                int end=xx.IndexOf("/"", 24);
                string Url = xx.Substring(24, end - 24);//找到URI

                int namekey = xx.IndexOf("<IMG alt=");
                end = xx.IndexOf("src=");
                string name = xx.Substring(namekey + 9, end-namekey-9);//找到名字

                int end1=xx.IndexOf("/"><",end);
                string picpath = xx.Substring(end + 5, end1 - end - 5);//找到头像地址

                end = xx.IndexOf("<TD>",end1);
                end1 = xx.IndexOf("<",end+3);
                string sex = xx.Substring(end + 4, end1 - end - 4);//得到性别


               
                Person person = new Person();
                person = new Person();
                person.Url = Url;
                person.name = name;
                person.sex = sex;
                person.picpath = picpath;
                list.Add(person);
                //写入XML
              
                count++;
              
            }
            // Stream s = new FileStream("ccc.xml", FileMode.Append, FileAccess.Write);
            //StreamWriter writer = new StreamWriter(s);

            StreamWriter writer = new StreamWriter("ccc.xml");
            XmlSerializer serializer = new XmlSerializer(typeof(List<Person>));
            serializer.Serialize(writer, list);
            writer.Close();
            MessageBox.Show("本页采集完成,共" + count.ToString() + "个人物数据");

        }
        private void DisplayXML(string path)
        {
           
            Stream streamout = new FileStream(path, FileMode.Open, FileAccess.Read);
            XmlSerializer deserializer = new XmlSerializer(typeof(List<Person>));
            List<Person> dperson = (List<Person>)deserializer.Deserialize(streamout);
            streamout.Close();
            foreach (Person p in dperson)
            {
                richTextBox1.AppendText(p.Url);
            }
        }
      

    }
   
    /// <summary>
    /// One scraped person record. Instances are stored in a List&lt;Person&gt; that is
    /// written to and read from XML with XmlSerializer, so the members are public.
    /// </summary>
    public class Person
    {
        // Address of the person's profile page (taken from the entry's href).
        public string Url;
        // Display name (taken from the image's alt attribute).
        public string name;
        // Sex, as the text shown in the page's table cell.
        public string sex;
        // Address of the avatar image (taken from the image's src attribute).
        public string picpath;

    }

生成的XML部分内容

<?xml version="1.0" encoding="utf-8" ?>

<ArrayOfPerson xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <Person>
    <Url>http://xiaonei.com/getuser.do?id=225789377</Url>
    <name>"張翔 おうひ"</name>
    <sex>男生</sex>
    <picpath>http://hd22.xiaonei.com/photos/hd22/20071208/08/54/head_500h169.jpg</picpath>
  </Person>
  <Person>
    <Url>http://xiaonei.com/getuser.do?id=230359702</Url>
    <name>"张凡 @@ 作男"</name>
    <sex>男生</sex>
    <picpath>http://hd14.xiaonei.com/photos/hd14/20080303/16/11/head_3527d107.jpg</picpath>
  </Person>
  <Person>
    <Url>http://xiaonei.com/getuser.do?id=233299041</Url>
    <name>张齐瀚/张弓</name>
    <sex>男生</sex>
    ……(以下内容省略)
 
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值