下面贴的代码是我一个构想的实现(代码太长,没有贴全):把别人的网站下载下来,替换成自己的内容,简单地说就是一个网站下载程序
这样,我就可以快速建站,我准备先拿一个具体站点做实验,然后做成通用的
在下载别的站点网页的时候,发现以下问题,大家能否解答下?
1、发现现在有很多网址不规范,识别时要针对很多种情况分别处理,仍然无法全部正确识别,例如abc.aspx?id=1&&c=1,这还算好识别的,还有/do?abc=0这种URL,基本就没什么规律了
2、正则表达式效率很高,但是编写实在复杂,花很久才能调试对一个正则表达式,有什么抓取内容的替代方案?
3、程序运行效率不高,处理一个实例需要不少时间,我用.net内存分析工具发现内存增长很快,不知道是我电脑不行还是其他原因,运行1小时左右偶尔会内存溢出
且有大量string字符串在托管堆上无法回收?
大家分析下代码?
/// <summary>
/// Sets DownloadDone to 1 on the SiteInfo row whose ID equals <paramref name="urlID"/>.
/// </summary>
/// <param name="urlID">Value compared against the ID column of SiteInfo.</param>
public static void RecURLDone(int urlID)
{
// The statement runs on the database that EnvConfig returns for SystemDataBaseEnum.Oracle.
DataBaseFactory factory = new DataBaseFactory();
IDataBase database = factory.MakeDataBase(EnvConfig.getSystemDataBase(SystemDataBaseEnum.Oracle));
string markDoneSql = "update SiteInfo set DownloadDone=1 where ID=" + urlID;
database.OperateDB2(markDoneSql);
}
/// <summary>
/// Converts every entry of <paramref name="UrlArry"/> into an absolute URL, using the
/// site root of <paramref name="CurrUrl"/> as the base for relative links.
/// </summary>
/// <remarks>
/// The array is rewritten in place and the same instance is returned.
/// Entries that are null, empty, whitespace-only or a bare "#" become "".
/// Relative links are attached to the site root, not to the directory of CurrUrl.
/// NOTE(review): links such as "//host/x", "javascript:..." or "mailto:..." are still
/// treated as relative paths - confirm whether callers filter them out beforehand.
/// </remarks>
/// <param name="UrlArry">Links to normalise; modified in place.</param>
/// <param name="CurrUrl">URL of the page the links were collected from.</param>
/// <returns>The array instance that was passed in.</returns>
public string[] predoURL(string[] UrlArry, string CurrUrl)
{
// Resolved lazily and at most once: the root is the same for every entry.
string siteRoot = null;
for (int i = 0; i < UrlArry.Length; i++)
{
string link = UrlArry[i];
if (link != null)
{
link = link.Trim();
}
// Nothing to download: normalise to the empty string instead of crashing on Substring.
if (link == null || link.Length == 0 || link == "#")
{
UrlArry[i] = "";
continue;
}
// Only a leading scheme makes a link absolute; "http://" inside a query string does not.
if (link.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
|| link.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
{
UrlArry[i] = link;
continue;
}
if (siteRoot == null)
{
siteRoot = getCurrUrlRoot(CurrUrl);
}
if (link.StartsWith("/", System.StringComparison.Ordinal))
{
UrlArry[i] = siteRoot + link;
}
else
{
UrlArry[i] = siteRoot + "/" + link;
}
}
return UrlArry;
}
/// <summary>
/// Converts a single link into an absolute URL, using the site root of
/// <paramref name="CurrUrl"/> as the base for relative links.
/// </summary>
/// <remarks>
/// A link that is null, empty, whitespace-only or a bare "#" yields "".
/// Relative links are attached to the site root, not to the directory of CurrUrl.
/// NOTE(review): links such as "//host/x", "javascript:..." or "mailto:..." are still
/// treated as relative paths - confirm whether callers filter them out beforehand.
/// </remarks>
/// <param name="UrlArry">The link to normalise (a single URL despite the name).</param>
/// <param name="CurrUrl">URL of the page the link was taken from.</param>
/// <returns>The absolute URL, or "" when there is nothing to download.</returns>
public string predoURL(string UrlArry, string CurrUrl)
{
if (UrlArry == null)
{
return "";
}
string link = UrlArry.Trim();
if (link.Length == 0 || link == "#")
{
return "";
}
// Only a leading scheme makes a link absolute; "http://" inside a query string does not.
if (link.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
|| link.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
{
return link;
}
string siteRoot = getCurrUrlRoot(CurrUrl);
if (link.StartsWith("/", System.StringComparison.Ordinal))
{
return siteRoot + link;
}
return siteRoot + "/" + link;
}
/// <summary>
/// Returns the scheme-and-host part of a URL, e.g. "http://www.a.com" for
/// "http://www.a.com/dir/page.htm".
/// </summary>
/// <remarks>
/// A leading "http://" or "https://" is recognised case-insensitively and kept
/// (lower-cased) in the result; input without a scheme is given "http://".
/// The host ends at the first '/', '?' or '#'.
/// </remarks>
/// <param name="_url">URL to extract the root from; surrounding whitespace is ignored.</param>
/// <returns>The scheme followed by everything up to the first '/', '?' or '#'.</returns>
private string getCurrUrlRoot(string _url)
{
string rest = _url.Trim();
string scheme = "http://";
// Strip only a leading scheme; Replace() would also eat "http://" inside a query string
// and would miss "https://" and upper-case schemes entirely.
if (rest.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
{
scheme = "https://";
rest = rest.Substring("https://".Length);
}
else if (rest.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase))
{
rest = rest.Substring("http://".Length);
}
int hostEnd = rest.IndexOfAny(new char[] { '/', '?', '#' });
if (hostEnd >= 0)
{
rest = rest.Substring(0, hostEnd);
}
return scheme + rest;
}
public string[] analysisURL(string _url,string currURL)
{
string AppRoot = EnvConfig.AppPath;
string url = _url;
url=url.Trim().ToLower();
//url=url;
url = predoURL(url, currURL);
//url = url.Replace("http://", "");
int tmp_position = 0;
int tmp_length = 0;
string tmp_str = "";
string url_front="";
string url_back="";
string root_site="";
string saveFileName = "";
string newUrl ="";
int siteType = 2000;
int sitePos = 2000;
url = url.Replace("http://", "");
if (url.Contains("/"))
{
int tt = url.IndexOf("/");
url_front = url.Substring(0, tt);
url_back = url.Substring(tt);
}
else
{
url_front = url;
}
//tmp_position = url.IndexOf(EnvConfig.SiteTypeList[siteType]);
//tmp_length = EnvConfig.SiteTypeList[siteType].Length;
// url_front=url.Substring(0,tmp_position+tmp_length);
// url_back=url.Substring(tmp_position+tmp_length);
root_site = url_front.Replace("http://", "");
//root_site=url_front.Replace("")
string[] temp_dirs=null;
string t_dirs = "/";//构造存储目录变量
if (url_back.Length > 2 && url_back.Contains("/"))
{
temp_dirs = Regex.Split(url_back,"/");
//url_back.Split(
}
//如果URL中没有文件名,保存为 index.htm
bool _findFileName = true;//发现URL包含文件名标志
bool _findQuestionMark = false;
if (_url.Contains("?"))
{
_findQuestionMark = true;
}
if ((temp_dirs != null) && (temp_dirs[temp_dirs.Length - 1]!=""))
{
saveFileName = temp_dirs[temp_dirs.Length-1];
}
else
{ saveFileName = "index.htm"; _findFileName = false; }//如果未发现文件名,则文件名为空
bool chg=false;
for (int i = 0; i < EnvConfig.downLoadFileTypeList.Length; i++)
{
if (saveFileName.Contains(EnvConfig.downLoadFileTypeList[i])) chg = true;
}
if ((!chg)&&(_findQuestionMark==false))
{
tmp_position = saveFileName.LastIndexOf(".");
if (tmp_position >= 0)
{
saveFileName = saveFileName.Substring(0, tmp_position) + ".htm";
}
else
{
saveFileName += ".htm";
}
}
if ((!chg) && (_findQuestionMark == true) && url_front.Contains(EnvConfig.MinhttpURL))
{
string _res = "";
string[] _fileName ;
_fileName = url_back.Split('?');
string _frontFileName = _fileName[0];
string _backFileName = _fileName[1];
int p1 = _frontFileName.IndexOf(".");
if ((_frontFileName != "")&&(p1>0))
{
_res = _frontFileName.Substring(0, p1);
}
string[] tag;
if (_backFileName.Contains("&"))
{
tag = Regex.Split(_backFileName, "&");
for (int i = 0; i < tag.Length; i++)
{
string[] tmp;
tmp = Regex.Split(tag[i], "=");
//HttpUtility.UrlEncode()
if (tmp.Length >= 2)
{
_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);
}
else
{ _res += tag[i]; }
}
}
else
{
string[] tmp;
if (_backFileName.Contains("="))
{
tmp = Regex.Split(_backFileName, "=");
_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);
}
else { _res += _backFileName; }
}
_res += ".htm";
saveFileName = _res;
}
int nCount = 0;
if (temp_dirs != null)
{
for (int i = 0; i < temp_dirs.Length; i++)
{
if (temp_dirs[i] != "" && i != (temp_dirs.Length - 1))
{
nCount++;
t_dirs = t_dirs + "/" + temp_dirs[i] + "/";
}
}
}
t_dirs = root_site + t_dirs;
//控制最小符合URL,防止范围过大
//if (url_front.Contains(EnvConfig.MinhttpURL))
//{ chg = true; }
string desKey=@"南京快餐,南京快餐外送,江宁快餐,江宁快餐配送,南京食堂承包,南京职工食堂承包,http://www.njyhkc.com";
//替换用户关键字
if (EnvConfig.replace_DIR != null)
{
int ccc=EnvConfig.replace_DIR.Length/ EnvConfig.replace_DIR.Rank;
string[,] aaa = EnvConfig.replace_DIR;
for (int i = 0; i < ccc; i++)
{
t_dirs = t_dirs.Replace(aaa[i,0],aaa[i,1]);
}
}
if (EnvConfig.replace_URL != null)
{
int ccc = EnvConfig.replace_URL.Length / EnvConfig.replace_URL.Rank;
string[,] aaa = EnvConfig.replace_URL;
for (int i = 0; i < ccc; i++)
{
saveFileName = saveFileName.Replace(aaa[i, 0], aaa[i, 1]);
}
}
这样,我就可以快速建站,我准备先拿一个具体站点做实验,然后做成通用的
在下载别的站点网页的时候,发现以下问题,大家能否解答下?
1、发现现在有很多网址不规范,识别时要针对很多种情况分别处理,仍然无法全部正确识别,例如abc.aspx?id=1&&c=1,这还算好识别的,还有/do?abc=0这种URL,基本就没什么规律了
2、正则表达式效率很高,但是编写实在复杂,花很久才能调试对一个正则表达式,有什么抓取内容的替代方案?
3、程序运行效率不高,处理一个实例需要不少时间,我用.net内存分析工具发现内存增长很快,不知道是我电脑不行还是其他原因,运行1小时左右偶尔会内存溢出
且有大量string字符串在托管堆上无法回收?
大家分析下代码?
/// <summary>
/// Sets DownloadDone to 1 on the SiteInfo row whose ID equals <paramref name="urlID"/>.
/// </summary>
/// <param name="urlID">Value compared against the ID column of SiteInfo.</param>
public static void RecURLDone(int urlID)
{
// The statement runs on the database that EnvConfig returns for SystemDataBaseEnum.Oracle.
DataBaseFactory factory = new DataBaseFactory();
IDataBase database = factory.MakeDataBase(EnvConfig.getSystemDataBase(SystemDataBaseEnum.Oracle));
string markDoneSql = "update SiteInfo set DownloadDone=1 where ID=" + urlID;
database.OperateDB2(markDoneSql);
}
/// <summary>
/// Converts every entry of <paramref name="UrlArry"/> into an absolute URL, using the
/// site root of <paramref name="CurrUrl"/> as the base for relative links.
/// </summary>
/// <remarks>
/// The array is rewritten in place and the same instance is returned.
/// Entries that are null, empty, whitespace-only or a bare "#" become "".
/// Relative links are attached to the site root, not to the directory of CurrUrl.
/// NOTE(review): links such as "//host/x", "javascript:..." or "mailto:..." are still
/// treated as relative paths - confirm whether callers filter them out beforehand.
/// </remarks>
/// <param name="UrlArry">Links to normalise; modified in place.</param>
/// <param name="CurrUrl">URL of the page the links were collected from.</param>
/// <returns>The array instance that was passed in.</returns>
public string[] predoURL(string[] UrlArry, string CurrUrl)
{
// Resolved lazily and at most once: the root is the same for every entry.
string siteRoot = null;
for (int i = 0; i < UrlArry.Length; i++)
{
string link = UrlArry[i];
if (link != null)
{
link = link.Trim();
}
// Nothing to download: normalise to the empty string instead of crashing on Substring.
if (link == null || link.Length == 0 || link == "#")
{
UrlArry[i] = "";
continue;
}
// Only a leading scheme makes a link absolute; "http://" inside a query string does not.
if (link.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
|| link.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
{
UrlArry[i] = link;
continue;
}
if (siteRoot == null)
{
siteRoot = getCurrUrlRoot(CurrUrl);
}
if (link.StartsWith("/", System.StringComparison.Ordinal))
{
UrlArry[i] = siteRoot + link;
}
else
{
UrlArry[i] = siteRoot + "/" + link;
}
}
return UrlArry;
}
/// <summary>
/// Converts a single link into an absolute URL, using the site root of
/// <paramref name="CurrUrl"/> as the base for relative links.
/// </summary>
/// <remarks>
/// A link that is null, empty, whitespace-only or a bare "#" yields "".
/// Relative links are attached to the site root, not to the directory of CurrUrl.
/// NOTE(review): links such as "//host/x", "javascript:..." or "mailto:..." are still
/// treated as relative paths - confirm whether callers filter them out beforehand.
/// </remarks>
/// <param name="UrlArry">The link to normalise (a single URL despite the name).</param>
/// <param name="CurrUrl">URL of the page the link was taken from.</param>
/// <returns>The absolute URL, or "" when there is nothing to download.</returns>
public string predoURL(string UrlArry, string CurrUrl)
{
if (UrlArry == null)
{
return "";
}
string link = UrlArry.Trim();
if (link.Length == 0 || link == "#")
{
return "";
}
// Only a leading scheme makes a link absolute; "http://" inside a query string does not.
if (link.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
|| link.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
{
return link;
}
string siteRoot = getCurrUrlRoot(CurrUrl);
if (link.StartsWith("/", System.StringComparison.Ordinal))
{
return siteRoot + link;
}
return siteRoot + "/" + link;
}
/// <summary>
/// Returns the scheme-and-host part of a URL, e.g. "http://www.a.com" for
/// "http://www.a.com/dir/page.htm".
/// </summary>
/// <remarks>
/// A leading "http://" or "https://" is recognised case-insensitively and kept
/// (lower-cased) in the result; input without a scheme is given "http://".
/// The host ends at the first '/', '?' or '#'.
/// </remarks>
/// <param name="_url">URL to extract the root from; surrounding whitespace is ignored.</param>
/// <returns>The scheme followed by everything up to the first '/', '?' or '#'.</returns>
private string getCurrUrlRoot(string _url)
{
string rest = _url.Trim();
string scheme = "http://";
// Strip only a leading scheme; Replace() would also eat "http://" inside a query string
// and would miss "https://" and upper-case schemes entirely.
if (rest.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
{
scheme = "https://";
rest = rest.Substring("https://".Length);
}
else if (rest.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase))
{
rest = rest.Substring("http://".Length);
}
int hostEnd = rest.IndexOfAny(new char[] { '/', '?', '#' });
if (hostEnd >= 0)
{
rest = rest.Substring(0, hostEnd);
}
return scheme + rest;
}
public string[] analysisURL(string _url,string currURL)
{
string AppRoot = EnvConfig.AppPath;
string url = _url;
url=url.Trim().ToLower();
//url=url;
url = predoURL(url, currURL);
//url = url.Replace("http://", "");
int tmp_position = 0;
int tmp_length = 0;
string tmp_str = "";
string url_front="";
string url_back="";
string root_site="";
string saveFileName = "";
string newUrl ="";
int siteType = 2000;
int sitePos = 2000;
url = url.Replace("http://", "");
if (url.Contains("/"))
{
int tt = url.IndexOf("/");
url_front = url.Substring(0, tt);
url_back = url.Substring(tt);
}
else
{
url_front = url;
}
//tmp_position = url.IndexOf(EnvConfig.SiteTypeList[siteType]);
//tmp_length = EnvConfig.SiteTypeList[siteType].Length;
// url_front=url.Substring(0,tmp_position+tmp_length);
// url_back=url.Substring(tmp_position+tmp_length);
root_site = url_front.Replace("http://", "");
//root_site=url_front.Replace("")
string[] temp_dirs=null;
string t_dirs = "/";//构造存储目录变量
if (url_back.Length > 2 && url_back.Contains("/"))
{
temp_dirs = Regex.Split(url_back,"/");
//url_back.Split(
}
//如果URL中没有文件名,保存为 index.htm
bool _findFileName = true;//发现URL包含文件名标志
bool _findQuestionMark = false;
if (_url.Contains("?"))
{
_findQuestionMark = true;
}
if ((temp_dirs != null) && (temp_dirs[temp_dirs.Length - 1]!=""))
{
saveFileName = temp_dirs[temp_dirs.Length-1];
}
else
{ saveFileName = "index.htm"; _findFileName = false; }//如果未发现文件名,则文件名为空
bool chg=false;
for (int i = 0; i < EnvConfig.downLoadFileTypeList.Length; i++)
{
if (saveFileName.Contains(EnvConfig.downLoadFileTypeList[i])) chg = true;
}
if ((!chg)&&(_findQuestionMark==false))
{
tmp_position = saveFileName.LastIndexOf(".");
if (tmp_position >= 0)
{
saveFileName = saveFileName.Substring(0, tmp_position) + ".htm";
}
else
{
saveFileName += ".htm";
}
}
if ((!chg) && (_findQuestionMark == true) && url_front.Contains(EnvConfig.MinhttpURL))
{
string _res = "";
string[] _fileName ;
_fileName = url_back.Split('?');
string _frontFileName = _fileName[0];
string _backFileName = _fileName[1];
int p1 = _frontFileName.IndexOf(".");
if ((_frontFileName != "")&&(p1>0))
{
_res = _frontFileName.Substring(0, p1);
}
string[] tag;
if (_backFileName.Contains("&"))
{
tag = Regex.Split(_backFileName, "&");
for (int i = 0; i < tag.Length; i++)
{
string[] tmp;
tmp = Regex.Split(tag[i], "=");
//HttpUtility.UrlEncode()
if (tmp.Length >= 2)
{
_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);
}
else
{ _res += tag[i]; }
}
}
else
{
string[] tmp;
if (_backFileName.Contains("="))
{
tmp = Regex.Split(_backFileName, "=");
_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);
}
else { _res += _backFileName; }
}
_res += ".htm";
saveFileName = _res;
}
int nCount = 0;
if (temp_dirs != null)
{
for (int i = 0; i < temp_dirs.Length; i++)
{
if (temp_dirs[i] != "" && i != (temp_dirs.Length - 1))
{
nCount++;
t_dirs = t_dirs + "/" + temp_dirs[i] + "/";
}
}
}
t_dirs = root_site + t_dirs;
//控制最小符合URL,防止范围过大
//if (url_front.Contains(EnvConfig.MinhttpURL))
//{ chg = true; }
string desKey=@"南京快餐,南京快餐外送,江宁快餐,江宁快餐配送,南京食堂承包,南京职工食堂承包,http://www.njyhkc.com";
//替换用户关键字
if (EnvConfig.replace_DIR != null)
{
int ccc=EnvConfig.replace_DIR.Length/ EnvConfig.replace_DIR.Rank;
string[,] aaa = EnvConfig.replace_DIR;
for (int i = 0; i < ccc; i++)
{
t_dirs = t_dirs.Replace(aaa[i,0],aaa[i,1]);
}
}
if (EnvConfig.replace_URL != null)
{
int ccc = EnvConfig.replace_URL.Length / EnvConfig.replace_URL.Rank;
string[,] aaa = EnvConfig.replace_URL;
for (int i = 0; i < ccc; i++)
{
saveFileName = saveFileName.Replace(aaa[i, 0], aaa[i, 1]);
}
}