// NOTE(review): this looks like a usage example rather than part of the class file
// (`file` is not defined anywhere in view). C# expects using directives to come before
// any statement, so confirm whether this line belongs in this file at all.
ZhihuArticle zhihuArticle = new ZhihuArticle(file);
using MariGold.HtmlParser;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace www.zhihu.com_equation_tex
{
/// <summary>
/// Reads an HTML file, walks its node tree and writes the extracted text and formulas
/// line by line through <see cref="System.Diagnostics.Debug"/>.
/// Text is collected per paragraph, formulas come from the "data-formula" attribute of
/// img elements and are wrapped in "$$", and lines containing certain numbering markers
/// get a "# " or "## " prefix.
/// </summary>
/// <remarks>
/// All output goes through Debug.WriteLine, which is only compiled into builds that
/// define the DEBUG symbol.
/// </remarks>
public class ZhihuArticle
{
    // Formulas whose raw attribute value is shorter than this are appended to the
    // running paragraph; longer ones are written on a line of their own.
    private const int InlineFormulaLengthLimit = 10;

    // A line containing any of these markers is emitted with a "# " prefix.
    private static readonly string[] s_headingMarkers =
    {
        "一、", "二、", "三、", "四、", "五、", "六、", "七、", "八、", "九、", "十、", "十一、",
    };

    // A formula-free line containing any of these markers is emitted with a "## " prefix.
    private static readonly string[] s_subHeadingMarkers =
    {
        "1.", "2.", "3.", "4.", "5.", "6.", "7.", "8.", "9.", "10.", "11.",
    };

    /// <summary>Path that was passed to the constructor.</summary>
    public string file_name { get; set; }

    /// <summary>
    /// Loads <paramref name="filename"/>, parses it as HTML and writes the converted
    /// content. Does nothing beyond recording the path when the file does not exist.
    /// </summary>
    /// <param name="filename">Path of the HTML file to convert.</param>
    public ZhihuArticle(string filename)
    {
        file_name = filename;
        if (!System.IO.File.Exists(filename))
        {
            return;
        }

        string html = System.IO.File.ReadAllText(filename);
        MariGold.HtmlParser.HtmlTextParser htmlTextParser = new MariGold.HtmlParser.HtmlTextParser(html);
        htmlTextParser.Parse();

        string paragraph = "";
        var node = htmlTextParser.Current;

        // Visit every root-level sibling, including the last one, and cope with an
        // empty document where there is no current node at all.
        while (node != null)
        {
            int layer = 0;
            forview(node, ref layer, ref paragraph);
            node = node.Next;
        }

        // Text that was not closed off by a p element would otherwise be lost.
        FlushParagraph(ref paragraph);
    }

    /// <summary>
    /// Recursively processes the children of <paramref name="nodeParent"/> depth-first.
    /// Text nodes are appended to <paramref name="paragraph"/>, a p element flushes the
    /// collected paragraph, and an img element contributes its "data-formula" attribute.
    /// Only the children are inspected; the tag of <paramref name="nodeParent"/> itself
    /// is not examined by this call.
    /// </summary>
    /// <param name="nodeParent">Node whose children are processed; null is ignored.</param>
    /// <param name="layer">
    /// Nesting depth counter. It is incremented on entry and decremented on exit, so it
    /// holds its original value when the call returns.
    /// </param>
    /// <param name="paragraph">Text collected so far for the paragraph being built.</param>
    public void forview(IHtmlNode nodeParent, ref int layer, ref string paragraph)
    {
        if (nodeParent == null)
        {
            return;
        }

        layer++;
        if (nodeParent.HasChildren)
        {
            foreach (var child in nodeParent.Children)
            {
                // Descendants add their text before the child's own tag is examined,
                // so a p element is flushed only after its content has been collected.
                forview(child, ref layer, ref paragraph);

                if (child.Tag == "p")
                {
                    FlushParagraph(ref paragraph);
                }

                if (child.Tag == "img")
                {
                    foreach (var attribute in child.Attributes)
                    {
                        if (attribute.Key == "data-formula")
                        {
                            AppendFormula(attribute.Value, ref paragraph);
                        }
                    }
                }

                if (child.IsText)
                {
                    paragraph += child.InnerHtml;
                }
            }
        }

        layer--;
    }

    /// <summary>
    /// Splits <paramref name="paragraph"/> on "\r\n" and passes every piece to
    /// <see cref="WriteLineFor"/>. A null or empty paragraph produces no output.
    /// </summary>
    /// <param name="paragraph">Text to write, possibly spanning several lines.</param>
    public void WriteLines(string paragraph)
    {
        if (string.IsNullOrEmpty(paragraph))
        {
            return;
        }

        // Split returns the whole string as a single element when no separator is present.
        foreach (var line in paragraph.Split(new string[] { "\r\n" }, StringSplitOptions.None))
        {
            WriteLineFor(line);
        }
    }

    /// <summary>
    /// Writes a single line. A line containing a marker such as "一、" gets a "# " prefix.
    /// A line containing "$$" is written with its inner "$$" removed and the whole line
    /// wrapped in "$$ ... $$". Any other line gets a "## " prefix when it contains a
    /// marker such as "1." and is then written as is. Null or empty input is ignored.
    /// </summary>
    /// <remarks>
    /// Markers are matched anywhere in the line, not only at its start.
    /// </remarks>
    /// <param name="paragraph">The line to write.</param>
    public void WriteLineFor(string paragraph)
    {
        if (string.IsNullOrEmpty(paragraph))
        {
            return;
        }

        if (ContainsAny(paragraph, s_headingMarkers))
        {
            paragraph = $"# {paragraph}";
        }

        if (paragraph.Contains("$$"))
        {
            System.Diagnostics.Debug.WriteLine($"$${paragraph.Replace("$$", "")} $$");
            return;
        }

        if (ContainsAny(paragraph, s_subHeadingMarkers))
        {
            paragraph = $"## {paragraph}";
        }

        System.Diagnostics.Debug.WriteLine($"{paragraph}");
    }

    // Writes the collected paragraph, if any, and resets it to the empty string.
    private void FlushParagraph(ref string paragraph)
    {
        if (!string.IsNullOrEmpty(paragraph))
        {
            WriteLines(paragraph);
        }

        paragraph = "";
    }

    // Handles one "data-formula" value: short formulas are appended to the running
    // paragraph, longer ones flush the paragraph and are written on their own line.
    private void AppendFormula(string rawFormula, ref string paragraph)
    {
        if (string.IsNullOrEmpty(rawFormula))
        {
            // Nothing to render; an empty "$$  $$" would only turn the line into math.
            return;
        }

        string formula = NormalizeFormula(rawFormula);

        // The threshold is applied to the raw attribute value, before any rewriting.
        if (rawFormula.Length < InlineFormulaLengthLimit)
        {
            paragraph += $"$$ {formula} $$";
            return;
        }

        FlushParagraph(ref paragraph);

        // A formula ending in a LaTeX line break gets a space before the closing "$$".
        string closing = formula.EndsWith(@"\\", StringComparison.Ordinal) ? " $$" : "$$";
        System.Diagnostics.Debug.WriteLine("$$" + formula + closing);
    }

    // Rewrites a raw formula: restores escaped ampersands, maps the align, align* and
    // array environments to aligned, and switches the colour back to black after the
    // specific red tokens listed below.
    private static string NormalizeFormula(string rawFormula)
    {
        return rawFormula
            // The attribute may still carry the HTML entity; LaTeX needs the bare '&'.
            .Replace("&amp;", "&")
            .Replace(@"\begin{align}", @"\begin{aligned}")
            .Replace(@"\begin{array}", @"\begin{aligned}")
            .Replace(@"\end{align}", @"\end{aligned}")
            .Replace(@"\end{array}", @"\end{aligned}")
            .Replace(@"\begin{align*}", @"\begin{aligned}")
            .Replace(@"\end{align*}", @"\end{aligned}")
            .Replace(@"\color{red}x", @"\color{red}x\color{black}")
            .Replace(@"\color{red}y", @"\color{red}y\color{black}")
            .Replace(@"\color{red}z", @"\color{red}z\color{black}")
            .Replace(@"\color{red}或", @"\color{red}或\color{black}");
    }

    // True when the text contains at least one of the given markers (ordinal match).
    private static bool ContainsAny(string text, string[] markers)
    {
        return markers.Any(marker => text.Contains(marker));
    }
}
}