这里有两种方式,可能是因为两类文件本身的编码方式不一样,所以解析方法也不一样。只是做一个笔记,不一定用得着。
1、获取word文档解析的网页
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded as UTF-8.
/// </summary>
/// <param name="url">Absolute HTTP(S) address of the page to fetch.</param>
/// <returns>
/// The page text with every line terminated by "\r\n" and every '?' character removed,
/// or an empty string if the request or the read fails for any reason.
/// </returns>
public string GetHtml(string url)
{
    // Example URLs this was written against:
    //http://center.file.odxd.com/2016/12/19/10067/69f1ac86-d92e-448f-905e-2403642389a9.docx.html
    //http://center.file.odxd.com/2017/9/12/10155/eb279af3-03d6-4817-8804-fd563b573c44/00e681ab-0016-4883-a4d1-2b6d1f5d8c8a.txt.html
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 19600; // milliseconds

        // using-blocks release the connection even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            StringBuilder sb = new StringBuilder();
            string line;
            // ReadLine() returns null only at the real end of the stream. Peek() may
            // return -1 earlier on a network stream, which would truncate the page.
            while ((line = reader.ReadLine()) != null)
            {
                sb.Append(line).Append("\r\n");
            }

            // Strips ALL question marks from the result.
            // NOTE(review): presumably meant to drop mis-decoded characters - confirm.
            return sb.ToString().Replace("?", "");
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: callers receive an empty string on any failure.
        return "";
    }
}
2、获取txt解析的网页
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its text with
/// "\r\n" line breaks replaced by "&lt;br&gt;".
/// </summary>
/// <remarks>
/// The bytes are decoded as UTF-8 first. If IsMessyCode reports the result as garbled,
/// the same bytes are decoded again with <see cref="Encoding.Default"/>.
/// </remarks>
/// <param name="url">Address of the text resource to fetch.</param>
/// <returns>The converted text, or an empty string if the download or decoding fails.</returns>
public string GetTxtHtml(string url)
{
    try
    {
        byte[] buffer;
        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            buffer = client.DownloadData(url);
        }

        string text;
        using (StreamReader utf8Reader = new StreamReader(new MemoryStream(buffer), Encoding.UTF8))
        {
            text = utf8Reader.ReadToEnd();
        }

        if (IsMessyCode(text))
        {
            // The UTF-8 decode produced replacement characters, so retry with Encoding.Default.
            // NOTE(review): Encoding.Default is platform dependent - confirm it is the intended fallback.
            using (StreamReader fallbackReader = new StreamReader(new MemoryStream(buffer), Encoding.Default))
            {
                text = fallbackReader.ReadToEnd();
            }
        }

        return text.Replace("\r\n", "<br>");
    }
    catch (Exception)
    {
        // Deliberate best-effort: callers receive an empty string on any failure.
        return "";
    }
}
/// <summary>
/// Determines whether the text looks garbled, i.e. whether its UTF-8 encoding contains
/// the byte sequence EF BF BD (the encoding of U+FFFD, the Unicode replacement character).
/// </summary>
/// <param name="txt">Text to inspect. Null or empty text is treated as not garbled.</param>
/// <returns>
/// <c>true</c> if the sequence occurs anywhere in the text, including at its very end;
/// otherwise <c>false</c>.
/// </returns>
private static bool IsMessyCode(string txt)
{
    if (string.IsNullOrEmpty(txt))
    {
        return false;
    }

    byte[] bytes = Encoding.UTF8.GetBytes(txt);

    // The last valid start index is Length - 3, because i + 2 must stay in range.
    // The previous bound stopped one position early and missed a trailing sequence.
    for (int i = 0; i + 2 < bytes.Length; i++)
    {
        if (bytes[i] == 0xef && bytes[i + 1] == 0xbf && bytes[i + 2] == 0xbd)
        {
            return true;
        }
    }

    return false;
}