最近忙着面试,遇到挺多有意思的面试题,拿出来跟大家分享一下,其中有个题是这样:
如果我想从内容中取100个字符出来(注:内容中有HTML代码,因为想保留基本的p、div、img、font、b、i),如何取?取的时候应该考虑哪些问题?
<div><b>快活</b><font color=red>林顾心</font>怿得分可 &;倒萨飞机佛挡杀佛1阿飞 2 <span>粉底</span>阿飞<script>alert("df");</script>第三发嗲发iedfias发的死阿飞第四阿夫顿大佛寺<br>阿飞大夫额3脎放大发得啊<范德萨范德萨>分大福大佛挡杀佛倒萨范德萨妇撒佛挡杀佛嗒89飒8哒89783721849372148<img src="1.jpg" alt="图片标示" title="图片标示">978978范地挲范德萨范德萨风刀霜剑艾迪斯入洞房教科书粉底霜噢批</div>
1. 对上面文字进行字符串截取,截取100个字符(注:1个汉字等于2个字符)。
2. 保留P,DIV,IMG,FONT,B,I标签。
3. 需要注意的地方有:举一个例子,如 &nbsp; 是空格,算1个字符。
下面给出我写的代码,还有很多不完善的地方,有待商榷。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace shangyun2
{
/// <summary>
/// Truncates an HTML fragment to a fixed number of visible characters while
/// keeping the resulting markup balanced.
/// </summary>
class cutHtml
{
    // Elements that never take a closing tag; they are emitted in self-closed form
    // and are never pushed on the open-element stack.
    private static readonly HashSet<string> VoidElements = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr"
    };

    // Every pattern starts with \G so that Match(input, startat) only succeeds
    // when the token begins exactly at the scan position.
    private static readonly Regex ScriptBlock = new Regex(
        @"\G<script\b[^>]*>.*?</script\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex Comment = new Regex(
        @"\G<!--.*?-->",
        RegexOptions.Singleline);

    // A tag name must start with an ASCII letter, so text such as "<范德萨>" or "a < b"
    // is treated as ordinary characters. Quoted attribute values may contain '>'.
    private static readonly Regex Tag = new Regex(
        @"\G<(?<close>/?)(?<name>[A-Za-z][A-Za-z0-9:_-]*)(?:[^>""']|""[^""]*""|'[^']*')*>");

    // Named, decimal and hexadecimal character references (&nbsp; &#160; &#xA0;).
    private static readonly Regex Entity = new Regex(
        @"\G&(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);");

    /// <summary>
    /// Cuts <paramref name="html"/> down to at most <paramref name="count"/> visible characters.
    /// Tags, comments and kept script blocks are copied without consuming the budget, a character
    /// reference such as <c>&amp;nbsp;</c> counts as one character, and every element that is still
    /// open at the cut point is closed in reverse order.
    /// </summary>
    /// <remarks>
    /// Every visible character costs one unit regardless of script (a CJK character is not
    /// counted as two). A surrogate pair is kept together and costs one unit.
    /// Void elements such as <c>br</c> and <c>img</c> are written in self-closed form.
    /// </remarks>
    /// <param name="html">The source HTML fragment.</param>
    /// <param name="count">The number of visible characters to keep.</param>
    /// <param name="script">
    /// <c>true</c> to keep <c>&lt;script&gt;</c> blocks (copied verbatim and not counted);
    /// <c>false</c> to drop them together with their content.
    /// </param>
    /// <param name="tags">
    /// Names of tags whose markup is removed (case-insensitive, with or without angle brackets).
    /// The text inside such elements is kept. Listing <c>script</c> drops script blocks entirely.
    /// </param>
    /// <returns>The truncated, balanced HTML fragment.</returns>
    /// <exception cref="ArgumentException"><paramref name="html"/> is null or empty.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is zero or negative.</exception>
    public string CutStringWithHtmlTag(string html, int count, bool script, params string[] tags)
    {
        if (string.IsNullOrEmpty(html))
        {
            throw new ArgumentException("The HTML to cut must not be null or empty.", "html");
        }
        if (count <= 0)
        {
            // ArgumentOutOfRangeException derives from ArgumentException, so callers that
            // catch ArgumentException keep working.
            throw new ArgumentOutOfRangeException("count", count, "The number of characters to keep must be positive.");
        }

        HashSet<string> removed = BuildRemovalSet(tags);
        bool dropScripts = !script || removed.Contains("script");

        StringBuilder output = new StringBuilder(html.Length);
        Stack<string> open = new Stack<string>();
        int remaining = count;
        int pos = 0;

        // The bounds check on pos is what keeps short inputs from running off the end.
        while (pos < html.Length && remaining > 0)
        {
            char current = html[pos];

            if (current == '<')
            {
                Match block = ScriptBlock.Match(html, pos);
                if (block.Success)
                {
                    // Script source is never visible text: copy or drop it as a unit.
                    if (!dropScripts)
                    {
                        output.Append(block.Value);
                    }
                    pos += block.Length;
                    continue;
                }

                block = Comment.Match(html, pos);
                if (block.Success)
                {
                    output.Append(block.Value);
                    pos += block.Length;
                    continue;
                }

                Match tag = Tag.Match(html, pos);
                if (tag.Success)
                {
                    AppendTag(tag, removed, open, output);
                    pos += tag.Length;
                    continue;
                }
                // Not real markup: fall through and count '<' as a visible character.
            }
            else if (current == '&')
            {
                Match entity = Entity.Match(html, pos);
                if (entity.Success)
                {
                    output.Append(entity.Value);
                    pos += entity.Length;
                    remaining--;
                    continue;
                }
            }

            // Keep a surrogate pair together so the cut never produces half a code point.
            bool isPair = char.IsHighSurrogate(current)
                && pos + 1 < html.Length
                && char.IsLowSurrogate(html[pos + 1]);
            int width = isPair ? 2 : 1;
            output.Append(html, pos, width);
            pos += width;
            remaining--;
        }

        // Close whatever is still open, innermost element first.
        while (open.Count > 0)
        {
            output.Append("</").Append(open.Pop()).Append('>');
        }

        return output.ToString();
    }

    /// <summary>Normalizes the caller-supplied tag names into a case-insensitive lookup set.</summary>
    private static HashSet<string> BuildRemovalSet(string[] tags)
    {
        HashSet<string> names = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        if (tags == null)
        {
            return names;
        }

        foreach (string tag in tags)
        {
            if (tag == null)
            {
                continue;
            }

            // Accept "span", "<span>" and "</span>" alike.
            string name = tag.Trim(' ', '<', '>', '/');
            if (name.Length > 0)
            {
                names.Add(name);
            }
        }

        return names;
    }

    /// <summary>Writes one matched tag to the output and updates the open-element stack.</summary>
    private static void AppendTag(Match tag, HashSet<string> removed, Stack<string> open, StringBuilder output)
    {
        string name = tag.Groups["name"].Value;
        if (removed.Contains(name))
        {
            return;
        }

        if (tag.Groups["close"].Length > 0)
        {
            CloseElement(name, open, output);
        }
        else if (VoidElements.Contains(name) || tag.Value.EndsWith("/>", StringComparison.Ordinal))
        {
            output.Append(AsSelfClosing(tag.Value));
        }
        else
        {
            open.Push(name);
            output.Append(tag.Value);
        }
    }

    /// <summary>
    /// Handles a closing tag: closes every element opened after the matching one, then the
    /// matching one itself. A closing tag with no matching open element is dropped.
    /// </summary>
    private static void CloseElement(string name, Stack<string> open, StringBuilder output)
    {
        if (!open.Contains(name, StringComparer.OrdinalIgnoreCase))
        {
            return;
        }

        string closed;
        do
        {
            closed = open.Pop();
            output.Append("</").Append(closed).Append('>');
        }
        while (!string.Equals(closed, name, StringComparison.OrdinalIgnoreCase));
    }

    /// <summary>Rewrites a tag such as <c>&lt;br&gt;</c> into its self-closed form <c>&lt;br /&gt;</c>.</summary>
    private static string AsSelfClosing(string tagText)
    {
        if (tagText.EndsWith("/>", StringComparison.Ordinal))
        {
            return tagText;
        }

        // The space keeps the slash from being read as part of an unquoted attribute value.
        return tagText.Substring(0, tagText.Length - 1).TrimEnd() + " />";
    }
}
}