// ![Image from the original blog post](https://i-blog.csdnimg.cn/blog_migrate/91d6fa37a44eb4e7477e4dbea954fd6a.png)
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Text.RegularExpressions;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Filters;
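// The program first uses HtmlParser to parse a bookmarks file exported from a browser and locates
// the HTML lines that belong to duplicate URLs; it then searches the raw HTML for those lines and deletes them.
// Until the search-and-delete step is refined it may delete too much or too little: the browser records the
// date each bookmark was added, so a duplicated page can show up as HTML lines that all share one date
// or as lines with two or more different dates.
// Any one of those HTML lines may itself occur once, or two or more times.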
namespace WindowsFormsApp2
{
/// <summary>
/// Main window: loads an exported bookmarks HTML file (or a web page), shows its
/// folder/link structure in a tree, collects duplicate links while parsing, and can
/// strip the duplicate lines from the HTML and save the result back to the source file.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Matches the gap left behind by a removed line (a line feed, optional whitespace, a carriage return).</summary>
    private static readonly Regex BlankLineRegex = new Regex(@"\n\s*\r");

    /// <summary>Address or path of the most recently loaded document; also the save target.</summary>
    private string url;

    /// <summary>Raw HTML of the most recently loaded document.</summary>
    private string html = "";

    /// <summary>Display strings of the links seen so far during the current parse.</summary>
    private readonly List<string> nodeList = new List<string>();

    /// <summary>First HTML line of the parent element of every duplicate link found.</summary>
    private readonly List<string> duplicateList = new List<string>();

    /// <summary>HREF value of every duplicate link found.</summary>
    private readonly List<string> dupList = new List<string>();

    /// <summary>Creates the form and fills the address combo box with its preset entries.</summary>
    public Form1()
    {
        InitializeComponent();
        AddUrl();
    }

    /// <summary>
    /// Downloads the selected address, shows its HTML in the text box and rebuilds the tree view.
    /// On a download failure the error is shown and the previously loaded state is left untouched.
    /// </summary>
    private void btnParser_Click(object sender, EventArgs e)
    {
        string target;
        string downloaded;
        try
        {
            // Fall back to the typed text so that a missing selection does not end in a NullReferenceException.
            object selected = CBUrl.SelectedItem;
            target = (selected != null ? selected.ToString() : CBUrl.Text).Trim();

            using (System.Net.WebClient client = new System.Net.WebClient())
            {
                client.Encoding = System.Text.Encoding.UTF8;
                downloaded = client.DownloadString(target);
            }
        }
        catch (Exception ex)
        {
            // Top-level UI handler: report the failure and stop. Continuing would re-parse the
            // stale HTML of the previous document and pair it with the wrong save target.
            MessageBox.Show(ex.Message);
            return;
        }

        // Commit the new document only after the download succeeded.
        url = target;
        html = downloaded;
        txtHtmlWhole.Text = html;

        // Results of a previous parse must not leak into this one; otherwise every link
        // would already be "seen" and would be reported as a duplicate.
        nodeList.Clear();
        duplicateList.Clear();
        dupList.Clear();

        Lexer lexer = new Lexer(html);
        Parser parser = new Parser(lexer);
        NodeList htmlNodes = parser.Parse(null);

        this.treeView1.BeginUpdate();
        try
        {
            this.treeView1.Nodes.Clear();
            TreeNode treeRoot = this.treeView1.Nodes.Add("root");
            for (int i = 0; i < htmlNodes.Count; i++)
            {
                // Top-level nodes are enumerated here, so sibling traversal is not requested.
                this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
            }
        }
        finally
        {
            this.treeView1.EndUpdate();
        }

        // Total number of tree nodes, including nested ones.
        this.label1.Text = this.treeView1.GetNodeCount(true).ToString();
    }

    /// <summary>
    /// Adds <paramref name="htmlNode"/> (and its descendants) to the tree below
    /// <paramref name="treeNode"/>. Only H3, A, DT and DL start tags produce tree nodes.
    /// A link whose display string was already seen is recorded as a duplicate instead of being added.
    /// </summary>
    /// <param name="treeNode">Tree node that receives the new child nodes.</param>
    /// <param name="htmlNode">HTML node to process.</param>
    /// <param name="siblingRequired">
    /// When true, the following siblings of <paramref name="htmlNode"/> are processed as well.
    /// </param>
    private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
    {
        if (htmlNode == null || treeNode == null)
        {
            return;
        }

        TreeNode current = treeNode;
        ITag tag = htmlNode as ITag;
        if (tag != null && !tag.IsEndTag())
        {
            string nodeString = tag.TagName;
            if (tag.Attributes != null && tag.Attributes.Count > 0)
            {
                if (tag.TagName == "H3") // bookmark folder
                {
                    nodeString = " { name=\"" + tag.ToPlainTextString() + "\" }";
                }

                object href = tag.Attributes["HREF"];
                if (href != null) // bookmark link
                {
                    string hrefValue = href.ToString();
                    nodeString = tag.ToPlainTextString() + " { href=\"" + hrefValue + "\" }";
                    if (nodeList.Contains(nodeString))
                    {
                        // Remember the first line of the enclosing element so it can later be
                        // located in the raw HTML; a link without a parent contributes its own HTML.
                        INode parent = tag.Parent;
                        string parentHtml = parent != null ? parent.ToHtml() : tag.ToHtml();
                        duplicateList.Add(FirstLine(parentHtml));
                        dupList.Add(hrefValue);

                        // Drop the tree entry that was created for the enclosing element,
                        // but never the root node that the rest of the tree hangs off.
                        if (treeNode.Parent != null)
                        {
                            this.treeView1.Nodes.Remove(treeNode);
                        }
                        return;
                    }

                    nodeList.Add(nodeString);
                }
            }

            if (tag.TagName == "H3" || tag.TagName == "A" || tag.TagName == "DT" || tag.TagName == "DL")
            {
                current = new TreeNode(nodeString);
                treeNode.Nodes.Add(current);
            }
        }

        // Descend into the children; the first child is asked to walk its own siblings.
        if (htmlNode.Children != null && htmlNode.Children.Count > 0)
        {
            this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
        }

        if (siblingRequired)
        {
            INode sibling = htmlNode.NextSibling;
            while (sibling != null)
            {
                this.RecursionHtmlNode(treeNode, sibling, false);
                sibling = sibling.NextSibling;
            }
        }
    }

    /// <summary>
    /// Returns the text before the first line break of <paramref name="text"/>.
    /// CRLF is preferred; a bare LF is used when no CRLF exists, and the whole
    /// string is returned when there is no line break at all.
    /// </summary>
    private static string FirstLine(string text)
    {
        int end = text.IndexOf("\r\n", StringComparison.Ordinal);
        if (end < 0)
        {
            end = text.IndexOf('\n');
        }
        return end < 0 ? text : text.Substring(0, end);
    }

    /// <summary>Fills the address combo box with the preset web addresses and local bookmark files.</summary>
    private void AddUrl()
    {
        CBUrl.Items.Add("http://www.hao123.com");
        CBUrl.Items.Add("http://www.sina.com");
        CBUrl.Items.Add("http://www.heuet.edu.cn");
        CBUrl.Items.Add(@"F:\reid\上位机\bookmarks_19_2_22.html");
        CBUrl.Items.Add(@"D:\收藏20190505-092056.html");
    }

    /// <summary>
    /// Removes the recorded duplicate lines from the HTML in the text box, deletes the blank
    /// lines left behind and overwrites the source file with the result.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        // Work on a local copy; reading and writing TextBox.Text inside the loops is needlessly expensive.
        string text = txtHtmlWhole.Text;
        foreach (string item in duplicateList)
        {
            foreach (string dup in dupList)
            {
                // Only pair an HTML line with the address it actually contains.
                if (item.IndexOf(dup, StringComparison.Ordinal) > 0)
                {
                    text = RemoveDuplicateLine(text, item, dup);
                }
            }
        }

        text = BlankLineRegex.Replace(text, "");
        txtHtmlWhole.Text = text;

        try
        {
            // FileMode.Create truncates an existing file. The cleaned HTML is shorter than the
            // original, so merely opening the file would leave stale bytes of the old content at its end.
            using (FileStream fsWrite = new FileStream(url, FileMode.Create, FileAccess.Write))
            {
                byte[] buffer = Encoding.UTF8.GetBytes(text);
                fsWrite.Write(buffer, 0, buffer.Length);
            }
        }
        catch (Exception ex)
        {
            // Nothing loaded yet, a web address as target, or an I/O failure: report it instead of crashing.
            MessageBox.Show(ex.Message);
            return;
        }

        MessageBox.Show("覆盖保存完成");
    }

    /// <summary>
    /// Removes every occurrence of the line <paramref name="item"/> from <paramref name="text"/>,
    /// provided the address <paramref name="dup"/> occurs again after its first line. If that
    /// removal wipes out every occurrence of the address (all copies were identical lines),
    /// one copy is put back where the first one used to be.
    /// </summary>
    /// <remarks>
    /// Known limitation: the address is matched as a plain substring, so a longer address
    /// that merely starts with <paramref name="dup"/> also counts as an occurrence.
    /// </remarks>
    private static string RemoveDuplicateLine(string text, string item, string dup)
    {
        int first = text.IndexOf(dup, StringComparison.Ordinal);
        if (first < 0)
        {
            return text; // already removed by an earlier pass
        }

        // Skip roughly one line past the first hit before looking for a second one.
        int searchFrom = first + item.Length;
        if (searchFrom > text.Length)
        {
            return text;
        }
        if (text.IndexOf(dup, searchFrom, StringComparison.Ordinal) < 0)
        {
            return text; // the address occurs only once
        }

        int itemStart = text.IndexOf(item, StringComparison.Ordinal);
        string stripped = text.Replace(item, "");
        if (stripped.IndexOf(dup, StringComparison.Ordinal) < 0)
        {
            // The address vanished completely, so every occurrence was inside a copy of item and
            // itemStart is valid; the text in front of it is unchanged by the replacement.
            stripped = stripped.Insert(itemStart, item + "\r\n");
        }
        return stripped;
    }

    /// <summary>Resizes the text box and the tree view along with the window.</summary>
    private void Form1_SizeChanged(object sender, EventArgs e)
    {
        int formWidth = this.Size.Width;
        int formHeight = this.Size.Height;
        int textLeft = txtHtmlWhole.Location.X;
        int textTop = txtHtmlWhole.Location.Y;
        int treeTop = treeView1.Location.Y;

        // Text box: left half of the window, 90% of the height below its top edge.
        txtHtmlWhole.SetBounds(textLeft, textTop, (int)((formWidth - textLeft) * 0.5), (int)((formHeight - textTop) * 0.9));
        // Tree view: starts just right of the middle of the window.
        treeView1.SetBounds(formWidth / 2 + 10, treeTop, (int)((formWidth - textLeft - 100) * 0.5), (int)((formHeight - treeTop) * 0.9));
    }

    /// <summary>
    /// Unfinished feature: replaces the tree content with the sorted duplicate HTML lines.
    /// Planned: group duplicates by folder and let the user tick the ones to delete.
    /// </summary>
    private void button2_Click(object sender, EventArgs e)
    {
        this.treeView1.Nodes.Clear();
        duplicateList.Sort();
        foreach (string item in duplicateList)
        {
            this.treeView1.Nodes.Add(item);
        }
    }
}
}