网页分析及XML存储与显示

最新推荐文章于 2021-04-26 16:51:50 发布

blessyou312

最新推荐文章于 2021-04-26 16:51:50 发布

阅读量1.1k

点赞数

文章标签： xml 存储 webbrowser string url button

本文链接：https://blog.csdn.net/blessyou312/article/details/2247750

版权

为在校内网工作的女友写一个工作量分析软件,花了一天的时间,终于从混乱的网页上提取到了最关键的数据部分.我自己感觉跟BAIDU在做的工作有相似之处,都是网页分析,挺有意思的.原理应该就是网络外挂吧,其中被屏蔽的部分可以跨进程访问浏览器.为了方便自己统计,没有用这种方式,而是内嵌Webbrowser的方式,这样可以很方便实现自动换页.有几点自我感觉值得记下:

1.再次认识了mshtml和webbrowser的Document的强大功能

2.实现了类到XML直接转化,极大的方便了存储.(柯达一年的实习上机考试就有类的快速存储)

3.为泛型List<>实现不定长类数组的储存,值得以后借鉴.

/// <summary>
/// Main form of the page-scraping tool. It loads a page in an embedded
/// <see cref="WebBrowser"/>, cuts out the HTML fragments that describe people,
/// parses each fragment into a <see cref="Person"/> and serializes the
/// accumulated list to <c>ccc.xml</c>.
/// </summary>
public partial class Form1 : Form
    {
        /// <summary>
        /// Marker that starts every person entry in the page HTML. Its length (24)
        /// is also the offset at which the profile URL begins inside a fragment.
        /// </summary>
        private const string EntryMarker = "<P class=image><A href=\"";

        /// <summary>Maximum number of characters copied for one person fragment.</summary>
        private const int FragmentLength = 400;

        /// <summary>How far the scan advances past a match before searching again.</summary>
        private const int ScanStep = 100;

        /// <summary>File that receives the serialized person list.</summary>
        private const string OutputFile = "ccc.xml";

        /// <summary>Document of the last external "maxthon" browser window that was inspected.</summary>
        public static mshtml.IHTMLDocument2 doc;

        // Initialized here so the field is never null; the original constructor
        // assigned a local variable of the same name and left this field unset.
        List<Person> list = new List<Person>();

        /// <summary>Creates the form and the embedded browser used for scraping.</summary>
        public Form1()
        {
            InitializeComponent();
            webBrowser1 = new WebBrowser();
        }

        /// <summary>
        /// Starts navigation in the embedded browser and, in addition, dumps the URL
        /// and HTML of every shell window whose host executable is named "maxthon".
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            richTextBox1.Clear();

            // Remove-then-add keeps exactly one subscription no matter how often the
            // button is clicked, and the handler is attached before navigation starts.
            webBrowser1.DocumentCompleted -= new WebBrowserDocumentCompletedEventHandler(button2_Click);
            webBrowser1.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(button2_Click);
            webBrowser1.Navigate("******"); // URL withheld for confidentiality

            SHDocVw.ShellWindows shellWindows = new SHDocVw.ShellWindowsClass();
            foreach (SHDocVw.InternetExplorer ie in shellWindows)
            {
                string filename = Path.GetFileNameWithoutExtension(ie.FullName);
                if (!string.Equals(filename, "maxthon", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                richTextBox1.AppendText(ie.LocationURL + "\n");
                doc = ie.Document as mshtml.IHTMLDocument2;

                if (doc != null && doc.body != null)
                {
                    richTextBox1.AppendText(doc.body.outerHTML);
                }
                else
                {
                    richTextBox1.AppendText("Error\r\n");
                }
            }
        }

        /// <summary>
        /// Analyses the page currently loaded in the embedded browser. Also used as
        /// the <c>DocumentCompleted</c> handler of <c>webBrowser1</c>.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            HtmlDocument document = webBrowser1.Document;
            if (document == null || document.Body == null)
            {
                // Nothing has been loaded yet (e.g. the button was clicked before navigating).
                return;
            }

            Analysis(FindKeydata(document.Body.OuterHtml ?? string.Empty));
            // DisplayXML(OutputFile) can be called here to echo the saved URLs.
        }

        /// <summary>
        /// Extracts the HTML fragments that describe people from a page.
        /// </summary>
        /// <param name="context">Full HTML of the page body.</param>
        /// <returns>
        /// One fragment per occurrence of <see cref="EntryMarker"/>, each at most
        /// <see cref="FragmentLength"/> characters long; empty when nothing matches.
        /// </returns>
        private string[] FindKeydata(string context)
        {
            List<string> fragments = new List<string>();
            if (string.IsNullOrEmpty(context))
            {
                return fragments.ToArray();
            }

            int position = 0;
            while (position < context.Length)
            {
                int begin = context.IndexOf(EntryMarker, position, StringComparison.Ordinal);
                if (begin == -1)
                {
                    break;
                }

                // Clamp so an entry close to the end of the page does not overrun the string.
                int length = Math.Min(FragmentLength, context.Length - begin);
                fragments.Add(context.Substring(begin, length));

                position = begin + ScanStep;
            }

            return fragments.ToArray();
        }

        /// <summary>
        /// Parses every fragment into a <see cref="Person"/>, appends the results to
        /// the accumulated list and rewrites <c>ccc.xml</c> with the whole list.
        /// Fragments that lack an expected marker are skipped.
        /// </summary>
        /// <param name="context">Fragments produced by <see cref="FindKeydata"/>.</param>
        private void Analysis(string[] context)
        {
            if (context == null || context.Length <= 0)
            {
                MessageBox.Show("空数据");
                return;
            }

            int count = 0;
            foreach (string fragment in context)
            {
                Person person;
                if (!TryParsePerson(fragment, out person))
                {
                    continue;
                }

                list.Add(person);
                count++;
            }

            // The file is overwritten with the full list, which keeps growing across pages.
            using (StreamWriter writer = new StreamWriter(OutputFile))
            {
                XmlSerializer serializer = new XmlSerializer(typeof(List<Person>));
                serializer.Serialize(writer, list);
            }

            MessageBox.Show("本页采集完成，共" + count.ToString() + "个人物数据");
        }

        /// <summary>
        /// Reads the profile URL, name, avatar path and sex out of one fragment.
        /// </summary>
        /// <param name="fragment">A fragment that starts with <see cref="EntryMarker"/>.</param>
        /// <param name="person">The parsed person, or <c>null</c> when parsing fails.</param>
        /// <returns><c>true</c> when every expected marker was found.</returns>
        private static bool TryParsePerson(string fragment, out Person person)
        {
            person = null;
            if (string.IsNullOrEmpty(fragment) || fragment.Length <= EntryMarker.Length)
            {
                return false;
            }

            // The profile URL runs from the end of the marker to the closing quote.
            int urlStart = EntryMarker.Length;
            int urlEnd = fragment.IndexOf("\"", urlStart, StringComparison.Ordinal);
            if (urlEnd < 0)
            {
                return false;
            }

            // The name is everything between "<IMG alt=" and "src=". It is stored
            // verbatim, so any quotes around the alt value stay part of the name.
            const string altKey = "<IMG alt=";
            int altStart = fragment.IndexOf(altKey, StringComparison.Ordinal);
            if (altStart < 0)
            {
                return false;
            }
            int nameStart = altStart + altKey.Length;
            int srcStart = fragment.IndexOf("src=", nameStart, StringComparison.Ordinal);
            if (srcStart < 0)
            {
                return false;
            }

            // The avatar path follows src=" (5 characters) and ends at the next "><.
            int pictureStart = srcStart + 5;
            int pictureEnd = fragment.IndexOf("\"><", srcStart, StringComparison.Ordinal);
            if (pictureEnd < pictureStart)
            {
                return false;
            }

            // The sex is the text of the first <TD> cell after the avatar.
            int cellStart = fragment.IndexOf("<TD>", pictureEnd, StringComparison.Ordinal);
            if (cellStart < 0)
            {
                return false;
            }
            int sexStart = cellStart + 4;
            int sexEnd = fragment.IndexOf("<", sexStart, StringComparison.Ordinal);
            if (sexEnd < 0)
            {
                return false;
            }

            person = new Person();
            person.Url = fragment.Substring(urlStart, urlEnd - urlStart);
            person.name = fragment.Substring(nameStart, srcStart - nameStart);
            person.picpath = fragment.Substring(pictureStart, pictureEnd - pictureStart);
            person.sex = fragment.Substring(sexStart, sexEnd - sexStart);
            return true;
        }

        /// <summary>
        /// Loads a person list saved by <see cref="Analysis"/> and appends each
        /// person's URL to the rich text box.
        /// </summary>
        /// <param name="path">Path of the XML file to read.</param>
        private void DisplayXML(string path)
        {
            List<Person> people;
            using (Stream input = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                XmlSerializer deserializer = new XmlSerializer(typeof(List<Person>));
                people = (List<Person>)deserializer.Deserialize(input);
            }

            foreach (Person p in people)
            {
                richTextBox1.AppendText(p.Url);
            }
        }

    }

    /// <summary>
    /// One scraped person record. Instances are serialized with XmlSerializer as
    /// part of a List&lt;Person&gt;; the public fields below become the XML elements,
    /// so their names and order should not be changed without migrating saved files.
    /// </summary>
    public class Person
    {
        /// <summary>Address of the person's profile page.</summary>
        public string Url;
        /// <summary>Display name, stored exactly as extracted from the page.</summary>
        public string name;
        /// <summary>Sex as shown on the page.</summary>
        public string sex;
        /// <summary>Address of the person's avatar image.</summary>
        public string picpath;

}

生成的XML部分内容

<?xml version="1.0" encoding="utf-8"?>
<ArrayOfPerson xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <Person>
    <Url>http://xiaonei.com/getuser.do?id=225789377</Url>
    <name>"張翔おうひ"</name>
    <sex>男生</sex>
    <picpath>http://hd22.xiaonei.com/photos/hd22/20071208/08/54/head_500h169.jpg</picpath>
  </Person>
  <Person>
    <Url>http://xiaonei.com/getuser.do?id=230359702</Url>
    <name>"张凡 @@ 作男"</name>
    <sex>男生</sex>
    <picpath>http://hd14.xiaonei.com/photos/hd14/20080303/16/11/head_3527d107.jpg</picpath>
  </Person>
  <Person>
    <Url>http://xiaonei.com/getuser.do?id=233299041</Url>
    <name>张齐瀚/张弓</name>
    <sex>男生</sex>
    ……(以下省略)

blessyou312

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
网页分析及XML存储与显示

为在校内网工作的女友写一个工作量分析软件,花了一天的时间,终于从混乱的网页上提取到了最关键的数据部分.自己我感觉跟BAIDU在做的工作有相似之处,都是网页分析,挺有意思的.原理应该就是网络外挂吧,其中被屏蔽的部分可以跨进程访问浏览器.为了方便自己统计,没有用这种方式,而是内嵌Webbrowser的方式,这样可以很方便实现自动换页.有几点自我感觉值得记下1.再次认识了mshtml和webbro
复制链接

扫一扫