为在校内网工作的女友写一个工作量分析软件,花了一天的时间,终于从混乱的网页上提取到了最关键的数据部分.我自己感觉这跟BAIDU在做的工作有相似之处,都是网页分析,挺有意思的.原理应该和网络外挂差不多:代码中被屏蔽的部分可以跨进程访问浏览器.为了方便自己统计,最终没有用这种方式,而是采用内嵌WebBrowser的方式,这样可以很方便地实现自动换页.有几点自我感觉值得记下:
1.再次认识了mshtml和webbrowser的Document的强大功能
2.实现了类到XML的直接转化,极大地方便了存储.(柯达一年的实习上机考试就有类的快速存储)
3.为泛型List<>实现不定长类数组的储存,值得以后借鉴.
/// <summary>
/// Main form of the page-scraping tool. It loads a page in an embedded
/// <see cref="WebBrowser"/>, cuts out the HTML fragments that describe people,
/// parses each fragment into a <see cref="Person"/> and saves the accumulated
/// list to an XML file.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>
    /// HTML text that precedes every person entry. The profile URL starts
    /// immediately after it (offset 24 inside a fragment).
    /// </summary>
    private const string KeyMarker = "<P class=image><A href=\"";

    /// <summary>Text that precedes the person's name inside a fragment.</summary>
    private const string ImgAltMarker = "<IMG alt=";

    /// <summary>Text that precedes the avatar address inside a fragment.</summary>
    private const string SrcMarker = "src=\"";

    /// <summary>Tag that precedes the sex value inside a fragment.</summary>
    private const string CellMarker = "<TD>";

    /// <summary>Sentinel string that terminates the array returned by <see cref="FindKeydata"/>.</summary>
    private const string EndMarker = "000";

    /// <summary>Maximum number of characters copied for one fragment.</summary>
    private const int SnippetLength = 400;

    /// <summary>Distance the scan position advances after each match.</summary>
    private const int ScanStep = 100;

    /// <summary>File that receives the serialized person list.</summary>
    private const string OutputFile = "ccc.xml";

    /// <summary>Document of the last external "maxthon" browser window inspected by <see cref="button1_Click"/>.</summary>
    public static mshtml.IHTMLDocument2 doc;

    /// <summary>People collected so far; grows with every analysed page.</summary>
    private List<Person> list;

    /// <summary>
    /// Creates the form, the embedded browser and the (initially empty) result list.
    /// </summary>
    public Form1()
    {
        InitializeComponent();

        // NOTE(review): this replaces whatever instance the designer may have
        // assigned to webBrowser1 - confirm that is intended.
        webBrowser1 = new WebBrowser();

        // Assign the FIELD. The original declared a local variable of the same
        // name here, which left the field null and made Analysis() crash.
        list = new List<Person>();
    }

    /// <summary>
    /// Starts navigation of the embedded browser and dumps the URL and HTML of
    /// every open shell window whose host executable is named "maxthon".
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        richTextBox1.Clear();

        // Attach the completion handler exactly once and BEFORE navigating.
        // Removing first makes repeated clicks idempotent; previously every
        // click added another copy, so each page was analysed several times.
        WebBrowserDocumentCompletedEventHandler onCompleted =
            new WebBrowserDocumentCompletedEventHandler(button2_Click);
        webBrowser1.DocumentCompleted -= onCompleted;
        webBrowser1.DocumentCompleted += onCompleted;

        webBrowser1.Navigate("******"); // URL omitted for confidentiality

        SHDocVw.ShellWindows shellWindows = new SHDocVw.ShellWindowsClass();
        foreach (SHDocVw.InternetExplorer ie in shellWindows)
        {
            if (ie == null)
            {
                continue;
            }

            string filename = Path.GetFileNameWithoutExtension(ie.FullName);
            if (!string.Equals(filename, "maxthon", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            richTextBox1.AppendText(ie.LocationURL + "\n");

            doc = ie.Document as mshtml.IHTMLDocument2;
            if (doc != null && doc.body != null)
            {
                richTextBox1.AppendText(doc.body.outerHTML);
            }
            else
            {
                richTextBox1.AppendText("Error\r\n");
            }
        }
    }

    /// <summary>
    /// Analyses the page currently loaded in the embedded browser. Also used as
    /// the browser's DocumentCompleted handler.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button2_Click(object sender, EventArgs e)
    {
        HtmlDocument document = webBrowser1.Document;
        if (document == null || document.Body == null)
        {
            // Nothing has been loaded yet, so there is nothing to analyse.
            return;
        }

        Analysis(FindKeydata(document.Body.OuterHtml));

        // To inspect what was saved, call DisplayXML("ccc.xml") here.
    }

    /// <summary>
    /// Extracts every fragment of <paramref name="context"/> that starts with
    /// the person-entry marker.
    /// </summary>
    /// <param name="context">Page HTML; may be null or empty.</param>
    /// <returns>
    /// The fragments in page order (each at most 400 characters long),
    /// followed by the terminating element "000".
    /// </returns>
    private string[] FindKeydata(string context)
    {
        // A growable list instead of a fixed 100-slot array: pages with 100 or
        // more entries no longer overflow.
        List<string> snippets = new List<string>();

        if (!string.IsNullOrEmpty(context))
        {
            int position = 0;
            while (position < context.Length)
            {
                int begin = context.IndexOf(KeyMarker, position, StringComparison.Ordinal);
                if (begin == -1)
                {
                    break;
                }

                // Clamp so a match near the end of the page cannot run past it.
                int length = Math.Min(SnippetLength, context.Length - begin);
                snippets.Add(context.Substring(begin, length));

                position = begin + ScanStep;
            }
        }

        snippets.Add(EndMarker);
        return snippets.ToArray();
    }

    /// <summary>
    /// Parses the fragments produced by <see cref="FindKeydata"/>, appends the
    /// resulting people to the result list, rewrites the XML file with the whole
    /// list and reports how many people were found on this page.
    /// </summary>
    /// <param name="context">Fragments terminated by the element "000".</param>
    private void Analysis(string[] context)
    {
        if (context == null || context.Length <= 0)
        {
            MessageBox.Show("空数据");
            return;
        }

        int count = 0;
        foreach (string snippet in context)
        {
            if (snippet == null || snippet == EndMarker)
            {
                break;
            }

            Person person = ParsePerson(snippet);
            if (person == null)
            {
                // Malformed or truncated fragment: skip it instead of crashing.
                continue;
            }

            list.Add(person);
            count++;
        }

        // 'using' guarantees the file is closed even if serialization throws.
        using (StreamWriter writer = new StreamWriter(OutputFile))
        {
            XmlSerializer serializer = new XmlSerializer(typeof(List<Person>));
            serializer.Serialize(writer, list);
        }

        MessageBox.Show("本页采集完成,共" + count.ToString() + "个人物数据");
    }

    /// <summary>
    /// Parses one HTML fragment into a <see cref="Person"/>.
    /// </summary>
    /// <param name="snippet">Fragment that starts with the person-entry marker.</param>
    /// <returns>The parsed person, or null when any expected marker is missing.</returns>
    private static Person ParsePerson(string snippet)
    {
        // The profile URL starts right after the marker and ends at the next quote.
        int urlStart = KeyMarker.Length;
        if (snippet.Length < urlStart)
        {
            return null;
        }

        int urlEnd = snippet.IndexOf("\"", urlStart, StringComparison.Ordinal);
        if (urlEnd < 0)
        {
            return null;
        }

        // The name is the text between "<IMG alt=" and "src=".
        int nameKey = snippet.IndexOf(ImgAltMarker, StringComparison.Ordinal);
        int srcKey = snippet.IndexOf("src=", StringComparison.Ordinal);
        if (nameKey < 0)
        {
            return null;
        }

        int nameStart = nameKey + ImgAltMarker.Length;
        if (srcKey < nameStart)
        {
            return null;
        }

        // The avatar address is the text between src=" and the closing "><.
        int picStart = srcKey + SrcMarker.Length;
        int picEnd = snippet.IndexOf("\"><", srcKey, StringComparison.Ordinal);
        if (picEnd < picStart)
        {
            return null;
        }

        // The sex is the text of the first <TD> cell after the image.
        int cellKey = snippet.IndexOf(CellMarker, picEnd, StringComparison.Ordinal);
        if (cellKey < 0)
        {
            return null;
        }

        int sexStart = cellKey + CellMarker.Length;
        int sexEnd = snippet.IndexOf("<", sexStart - 1, StringComparison.Ordinal);
        if (sexEnd < sexStart)
        {
            return null;
        }

        Person person = new Person();
        person.Url = snippet.Substring(urlStart, urlEnd - urlStart);
        person.name = snippet.Substring(nameStart, srcKey - nameStart);
        person.picpath = snippet.Substring(picStart, picEnd - picStart);
        person.sex = snippet.Substring(sexStart, sexEnd - sexStart);
        return person;
    }

    /// <summary>
    /// Loads a person list from an XML file and appends each person's URL to
    /// the text box.
    /// </summary>
    /// <param name="path">Path of the XML file to read.</param>
    private void DisplayXML(string path)
    {
        List<Person> people;
        using (Stream stream = new FileStream(path, FileMode.Open, FileAccess.Read))
        {
            XmlSerializer deserializer = new XmlSerializer(typeof(List<Person>));
            people = (List<Person>)deserializer.Deserialize(stream);
        }

        if (people == null)
        {
            return;
        }

        foreach (Person p in people)
        {
            richTextBox1.AppendText(p.Url);
        }
    }
}
/// <summary>
/// One person record extracted from the page. Instances are collected in a
/// List&lt;Person&gt; and written to / read from XML with XmlSerializer, so the
/// public field names below are also the element names in the saved file.
/// </summary>
public class Person
{
/// <summary>Profile address taken from the entry's link (href value).</summary>
public string Url;
/// <summary>Text found between "&lt;IMG alt=" and "src=" in the entry.</summary>
public string name;
/// <summary>Text of the first &lt;TD&gt; cell that follows the image.</summary>
public string sex;
/// <summary>Avatar image address taken from the image's src value.</summary>
public string picpath;
}
生成的XML部分内容
<?xml version="1.0" encoding="utf-8" ?>