C# HTML to text: How can I convert HTML to text in C#?

Just a note about the HtmlAgilityPack for posterity. The project contains an example of converting HTML to text which, as noted by the OP, does not handle whitespace at all like anyone writing HTML would expect. There are full text-rendering solutions out there, mentioned by others in answers to this question; this is not one of them (it cannot even handle tables in its current form), but it is lightweight and fast, which is all I wanted for creating a simple text version of HTML emails.

using System.IO;

using System.Text.RegularExpressions;

using HtmlAgilityPack;

//small but important modification to class https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs

/// <summary>
/// Converts an HTML document into a simple plain-text approximation:
/// block elements become line breaks, list items get a bullet or a number,
/// images are represented by their alt text and source, and links get their
/// target appended when the link text does not already show it.
/// Tables and nested-list indentation are not handled.
/// </summary>
public static class HtmlToText
{
    // Runs of two or more whitespace characters are collapsed to a single space.
    // Kept as a shared instance so the pattern is not re-resolved for every text node.
    private static readonly Regex WhitespaceRun = new Regex(@"\s{2,}");

    /// <summary>Loads the HTML file at <paramref name="path"/> and returns its text rendering.</summary>
    /// <param name="path">Path of the HTML file, passed to <c>HtmlDocument.Load</c>.</param>
    /// <returns>The plain-text rendering of the document.</returns>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>Parses <paramref name="html"/> and returns its text rendering.</summary>
    /// <param name="html">HTML markup, passed to <c>HtmlDocument.LoadHtml</c>.</param>
    /// <returns>The plain-text rendering of the markup.</returns>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Returns the text rendering of an already-loaded document.</summary>
    /// <param name="doc">The document whose <c>DocumentNode</c> is rendered.</param>
    /// <returns>The plain-text rendering of the document.</returns>
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Renders every child of <paramref name="node"/>, in order, using the given text state.</summary>
    internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    /// <summary>Renders <paramref name="node"/> and its descendants to <paramref name="outText"/> with a fresh text state.</summary>
    /// <param name="node">The node to render.</param>
    /// <param name="outText">The writer that receives the text.</param>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    /// <summary>
    /// Renders <paramref name="node"/> according to its node type: comments are dropped,
    /// documents are traversed, text is whitespace-normalised and written, and elements
    /// are handled per tag name.
    /// </summary>
    /// <param name="node">The node to render.</param>
    /// <param name="outText">The writer that receives the text.</param>
    /// <param name="textInfo">Whitespace and list state carried over from the preceding output.</param>
    internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                WriteText(node, outText, textInfo);
                break;

            case HtmlNodeType.Element:
                WriteElement(node, outText, textInfo);
                break;
        }
    }

    // Writes one text node, trimming and collapsing whitespace based on what was written before it.
    private static void WriteText(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        // script and style must not be output; a detached text node has no parent to check
        HtmlNode parent = node.ParentNode;
        if (parent != null && (parent.Name == "script" || parent.Name == "style"))
        {
            return;
        }

        string html = ((HtmlTextNode)node).Text;

        // is it in fact a special closing node output as text?
        if (HtmlNode.IsOverlappedClosingElement(html))
        {
            return;
        }

        if (html.Length == 0)
        {
            return;
        }

        // Leading whitespace is dropped at the start of a block/line and after a trailing space,
        // so consecutive nodes never produce doubled spaces.
        if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
        {
            html = html.TrimStart();
            if (html.Length == 0)
            {
                return;
            }

            textInfo.IsFirstTextOfDocWritten.Value = true;
            textInfo.WritePrecedingWhiteSpace = true;
        }

        outText.Write(HtmlEntity.DeEntitize(WhitespaceRun.Replace(html.TrimEnd(), " ")));

        // Trailing whitespace is reduced to exactly one space and remembered for the next node.
        textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
        if (textInfo.LastCharWasSpace)
        {
            outText.Write(' ');
        }
    }

    // Writes the prefix for an element, renders its children, then writes any suffix.
    private static void WriteElement(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string endElementString = null;
        bool isInline;
        bool skip = false;
        int listIndex = 0;

        switch (node.Name)
        {
            case "nav":
                skip = true;
                isInline = false;
                break;

            case "body":
            case "section":
            case "article":
            case "aside":
            case "h1":
            case "h2":
            case "header":
            case "footer":
            case "address":
            case "main":
            case "div":
            case "p": // stylistic - adjust as you tend to use
                // No leading line break before the very first text of the document.
                if (textInfo.IsFirstTextOfDocWritten)
                {
                    outText.Write("\r\n");
                }

                endElementString = "\r\n";
                isInline = false;
                break;

            case "br":
                outText.Write("\r\n");
                skip = true;
                // Text following the break starts a new line, so its leading whitespace is trimmed.
                textInfo.WritePrecedingWhiteSpace = false;
                isInline = true;
                break;

            case "a":
                if (node.Attributes.Contains("href"))
                {
                    string href = node.Attributes["href"].Value.Trim();

                    // Append the target after the link text unless the text already shows it.
                    if (node.InnerText.IndexOf(href, System.StringComparison.InvariantCultureIgnoreCase) == -1)
                    {
                        endElementString = "<" + href + ">";
                    }
                }

                isInline = true;
                break;

            case "li":
                // A positive ListIndex means the enclosing list is an <ol>; number and advance.
                if (textInfo.ListIndex > 0)
                {
                    outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                }
                else
                {
                    outText.Write("\r\n*\t"); // using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                }

                isInline = false;
                break;

            case "ol":
                listIndex = 1;
                goto case "ul";

            case "ul": // not handling nested lists any differently at this stage - that is getting close to rendering problems
                endElementString = "\r\n";
                isInline = false;
                break;

            case "img": // inline-block in reality
                if (node.Attributes.Contains("alt"))
                {
                    outText.Write('[' + node.Attributes["alt"].Value);
                    endElementString = "]";
                }

                if (node.Attributes.Contains("src"))
                {
                    outText.Write('<' + node.Attributes["src"].Value + '>');
                }

                isInline = true;
                break;

            default:
                isInline = true;
                break;
        }

        if (!skip && node.HasChildNodes)
        {
            // Inline elements continue the current text state; block elements start a fresh one
            // that shares only the "first text written" flag and carries the list counter.
            PreceedingDomTextInfo childInfo = isInline
                ? textInfo
                : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex };

            ConvertContentTo(node, outText, childInfo);
        }

        if (endElementString != null)
        {
            outText.Write(endElementString);
        }
    }
}

/// <summary>
/// Carries the state of previously written text while the DOM is traversed,
/// so the converter can decide how to treat whitespace and list items.
/// </summary>
internal class PreceedingDomTextInfo
{
    /// <summary>
    /// Shared flag set once the first text of the document has been written.
    /// It is a reference type so that instances created for nested block
    /// elements observe the same value.
    /// </summary>
    public readonly BoolWrapper IsFirstTextOfDocWritten;

    /// <summary>Creates a text state that uses the given "first text written" flag.</summary>
    /// <param name="isFirstTextOfDocWritten">The flag instance to store.</param>
    public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
    {
        this.IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
    }

    /// <summary>
    /// Counter used for numbering list items: a value greater than zero is
    /// written as the item number and then incremented; zero means a bullet is used.
    /// </summary>
    public int ListIndex { get; set; }

    /// <summary>True when the most recently written text node ended in whitespace.</summary>
    public bool LastCharWasSpace { get; set; }

    /// <summary>
    /// While false, leading whitespace of the next text node is trimmed;
    /// set to true once text has been written in this context.
    /// </summary>
    public bool WritePrecedingWhiteSpace { get; set; }
}

/// <summary>
/// Mutable reference-type holder for a <see cref="bool"/>, allowing one flag
/// to be shared and updated through several references. Converts implicitly
/// to and from <see cref="bool"/>.
/// </summary>
internal class BoolWrapper
{
    /// <summary>Creates a wrapper whose <see cref="Value"/> is false.</summary>
    public BoolWrapper()
    {
    }

    /// <summary>The wrapped flag.</summary>
    public bool Value { get; set; }

    /// <summary>Wraps <paramref name="flag"/> in a new <see cref="BoolWrapper"/> instance.</summary>
    public static implicit operator BoolWrapper(bool flag)
    {
        BoolWrapper wrapped = new BoolWrapper();
        wrapped.Value = flag;
        return wrapped;
    }

    /// <summary>Reads the wrapped flag out of <paramref name="wrapper"/>.</summary>
    public static implicit operator bool(BoolWrapper wrapper)
    {
        return wrapper.Value;
    }
}

As an example, the following HTML code...

Whatever Inc.

Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things:

  1. Please confirm this is your email by replying.

  2. Then perform this step.

Please solve this complex equation. Then, in any order, could you please:

Sincerely,

The whatever.com team

Ph: 000 000 000

mail: whatever st

...will be transformed into:

Whatever Inc.

Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things:

1. Please confirm this is your email by replying.

2. Then perform this step.

Please solve this [complex equation]. Then, in any order, could you please:

* a point.

* another point, with a hyperlink.

Sincerely,

The whatever.com team

Ph: 000 000 000

mail: whatever st

...as opposed to:

Whatever Inc.

Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things:

Please confirm this is your email by replying.

Then perform this step.

Please solve this . Then, in any order, could you please:

a point.

another point, with a hyperlink.

Sincerely,

The whatever.com team

Ph: 000 000 000

mail: whatever st

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值