在研究搜索引擎的开发中,对HTML网页的处理是核心环节之一。网上有很多开源代码;对于Java来说,HTMLParser是比较著名并且得到广泛应用的开源项目之一。
1.第一步,随便搜了一下,把HTMLParser下载下来。编译出错,发现缺了个ICSharpCode.SharpZipLib.dll,再下载这个dll,就OK了。
2.第二步,看官网首页的那个教程,写得不错,一步一步照着做,基本上就差不多了。虽说我用的是C#,教程是Java的,其实用法是一样的。
还碰到两个问题:
1.发现原始的数据不管用,我删了好几大段类似下面这样的脚本之后才正常:
<script type="text/javascript">
//<![CDATA[
</script>
// Strip every match of regexstring from htmlcode.
// The original guarded this with Regex.IsMatch, which scans the string a
// second time for no benefit: Regex.Replace already returns the input
// unchanged when there is no match, so the guard is redundant.
htmlcode = Regex.Replace(htmlcode, regexstring, "");
/// <summary>
/// Reads a local HTML file, echoes its raw content to the response, then
/// uses HTMLParser to find every node carrying class="tdhead" and writes
/// several textual views of each matching node back to the client.
/// </summary>
/// <param name="sender">Standard ASP.NET page event source.</param>
/// <param name="e">Standard ASP.NET event arguments.</param>
protected void Page_Load(object sender, EventArgs e)
{
    // NOTE(review): hard-coded developer path — should come from
    // Server.MapPath or configuration before this ships.
    string path = "C:\\Users\\weichao\\Desktop\\2.htm";

    // Accumulate the file in a StringBuilder instead of repeated string
    // concatenation (the original loop was O(n^2) in file size).
    // ReadLine() strips line breaks, so lines are joined with no separator
    // — this matches the original behavior exactly.
    StringBuilder buffer = new StringBuilder();
    using (StreamReader sr = new StreamReader(path, Encoding.Default))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            buffer.Append(line);
        }
    }
    string htmlcode = buffer.ToString();

    // Echo the raw HTML back to the client (debugging aid).
    Response.Write(htmlcode);

    #region parse
    // "GBK" because the source page is a Chinese GB-encoded document;
    // switch to "UTF-8" for UTF-8 pages.
    Parser parser = Parser.CreateParser(htmlcode, "GBK");

    // Other extraction approaches HTMLParser offers (not used here):
    // TextExtractingVisitor via parser.VisitAllNodesWith, iterating
    // parser.Elements() node by node, or a TagNameFilter("DIV"/"TD").

    // Keep only nodes whose class attribute equals "tdhead".
    NodeFilter filter = new HasAttributeFilter("class", "tdhead");
    NodeList nodes = parser.ExtractAllNodesThatMatch(filter);
    if (nodes != null)
    {
        for (int i = 0; i < nodes.Size(); i++)
        {
            INode node = (INode)nodes.ElementAt(i);
            // Show the different textual views HTMLParser provides:
            // raw tag text, plain text content, serialized HTML, ToString.
            Response.Write("getText:" + node.GetText() + "<br/>");
            Response.Write("getPlainText:" + node.ToPlainTextString() + "<br/>");
            Response.Write("toHtml:" + node.ToHtml() + "<br/>");
            Response.Write("toString:" + node.ToString() + "<br/>");
            Response.Write("=================================================" + "<br/>");
        }
    }
    #endregion
}