今天做个批量查询百度和google收录的小工具
不知道怎么做 想了下就去抓取查询页面然后获取收录总数
效果如下图
导入txt文本 文本里面每行一个网址
/// <summary>
/// Imports a text file chosen by the user into the grid, one URL per line.
/// Each non-blank line becomes a new row whose first cell holds the line text;
/// the remaining two cells are left empty for the query results.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button4_Click(object sender, EventArgs e)
{
    openFileDialog1.Filter = "超级文本(*.txt)|*.txt|(All Files)|*.*|文档|*.rtf ";
    if (openFileDialog1.ShowDialog() != DialogResult.OK)
    {
        return;
    }

    // openFileDialog1 is a form-level component that is reused on every click,
    // so it must not be disposed here; only the reader is owned by this method.
    using (StreamReader sr = new StreamReader(openFileDialog1.FileName))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // Blank lines carry no URL; skip them instead of adding empty rows.
            if (line.Trim().Length == 0)
            {
                continue;
            }

            // Three values: the URL plus two empty result cells.
            dataGridView1.Rows.Add(new object[] { line, null, null });
        }
    }
}
查询代码
//public delegate string MethodCaller(string name);//定义个代理
/// <summary>
/// Starts the queries: for every grid row with a non-empty URL in the first cell,
/// launches one worker thread running <c>getBaiduCount</c> and one running
/// <c>getGoogleCount</c>.
/// </summary>
/// <remarks>
/// Each worker receives a single string payload of the form
/// "&lt;url&gt;_&lt;rowIndex&gt;_&lt;tag&gt;", where the tag is "1" for the
/// Baidu worker and "2" for the Google worker.
/// </remarks>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    for (int i = 0; i < dataGridView1.RowCount; i++)
    {
        object cell = dataGridView1.Rows[i].Cells[0].Value;
        if (cell == null)
        {
            continue;
        }

        string url = cell.ToString().Trim();
        if (url == "")
        {
            continue;
        }

        // Background threads: an in-flight HTTP request must not keep the
        // process alive after the user closes the form.
        Thread baiduThread = new Thread(new ParameterizedThreadStart(getBaiduCount));
        baiduThread.IsBackground = true;
        baiduThread.Start(url + "_" + i.ToString() + "_1");

        Thread googleThread = new Thread(new ParameterizedThreadStart(getGoogleCount));
        googleThread.IsBackground = true;
        googleThread.Start(url + "_" + i.ToString() + "_2");
    }
}
/// <summary>
/// Worker-thread entry point: fetches the Google "site:" result page for one
/// site and, when the result counter is found in the HTML, writes the count
/// into the third cell (index 2) of the matching grid row.
/// </summary>
/// <remarks>
/// When the counter marker is not present in the returned HTML (including the
/// case where the download failed and an empty string came back) the cell is
/// left untouched.
/// </remarks>
/// <param name="url">
/// Payload of the form "&lt;site&gt;_&lt;rowIndex&gt;_&lt;tag&gt;". The site
/// itself may contain underscores; only the last two separators are significant.
/// </param>
/// <exception cref="ArgumentException">The payload does not contain two '_' separators.</exception>
public void getGoogleCount(object url)
{
    // Parse from the right so that underscores inside the site are preserved.
    string payload = url.ToString();
    int tagSep = payload.LastIndexOf('_');
    int indexSep = tagSep > 0 ? payload.LastIndexOf('_', tagSep - 1) : -1;
    if (indexSep < 0)
    {
        throw new ArgumentException("Expected a payload of the form \"<site>_<rowIndex>_<tag>\".", "url");
    }
    string site = payload.Substring(0, indexSep);
    int rowIndex = int.Parse(payload.Substring(indexSep + 1, tagSep - indexSep - 1));

    // The site is user input placed in a query string, so it is percent-encoded.
    string ss = WebFunc.GetHtmlEx("http://www.google.com.hk/search?hl=zh-CN&newwindow=1&safe=strict&biw=1440&bih=506&q=site%3A" + Uri.EscapeDataString(site) + "&btnG=Google+%E6%90%9C%E7%B4%A2&aq=f&aqi=&aql=&oq=");
    string start = "<div id=resultStats>找到约 ";
    if (!ss.Contains(start))
    {
        return;
    }

    // Text between the marker and "条结果" is the count; strip thousands separators.
    string result = StringSplit(StringSplit(ss, start)[1], "条结果")[0].Replace(",", "");

    // This method runs on a worker thread; the grid may only be touched on the
    // thread that created it, so the write is marshalled through Invoke.
    if (dataGridView1.IsDisposed || !dataGridView1.IsHandleCreated)
    {
        return;
    }
    dataGridView1.Invoke(new MethodInvoker(delegate
    {
        // The row may have been removed while the request was in flight.
        if (rowIndex >= 0 && rowIndex < dataGridView1.Rows.Count)
        {
            dataGridView1.Rows[rowIndex].Cells[2].Value = result;
        }
    }));
}
/// <summary>
/// Worker-thread entry point: fetches the Baidu "site:" result page for one
/// site and writes the reported result count into the second cell (index 1)
/// of the matching grid row. When the counter marker is not found in the
/// returned HTML, "0" is written instead.
/// </summary>
/// <param name="url">
/// Payload of the form "&lt;site&gt;_&lt;rowIndex&gt;_&lt;tag&gt;". The site
/// itself may contain underscores; only the last two separators are significant.
/// </param>
/// <exception cref="ArgumentException">The payload does not contain two '_' separators.</exception>
public void getBaiduCount(object url)
{
    // Parse from the right so that underscores inside the site are preserved.
    string payload = url.ToString();
    int tagSep = payload.LastIndexOf('_');
    int indexSep = tagSep > 0 ? payload.LastIndexOf('_', tagSep - 1) : -1;
    if (indexSep < 0)
    {
        throw new ArgumentException("Expected a payload of the form \"<site>_<rowIndex>_<tag>\".", "url");
    }
    string site = payload.Substring(0, indexSep);
    int rowIndex = int.Parse(payload.Substring(indexSep + 1, tagSep - indexSep - 1));

    // The site is user input placed in a query string, so it is percent-encoded.
    string query;
    if (site.StartsWith("www.", StringComparison.Ordinal))
    {
        query = "http://www.baidu.com/s?wd=site%3A" + Uri.EscapeDataString(site);
    }
    else
    {
        // Swap only the FIRST label for "www" (e.g. bbs.example.com -> www.example.com);
        // a plain string Replace would also rewrite later occurrences of that label.
        int firstDot = site.IndexOf('.');
        string wwwHost = firstDot < 0 ? "www" : "www" + site.Substring(firstDot);
        query = "http://www.baidu.com/s?bs=site%3A" + Uri.EscapeDataString(wwwHost) + "&f=8&wd=site%3A" + Uri.EscapeDataString(site);
    }

    string ss = WebFunc.GetHtmlEx(query);
    string start = "<span class=\"nums\" style=\"margin-left:120px\">找到相关结果";
    string result = "0";
    if (ss.Contains(start))
    {
        // Text between the marker and "个" is the count; strip separators and the "约" prefix.
        result = StringSplit(StringSplit(ss, start)[1], "个")[0].Replace(",", "").Replace("约", "");
    }

    // This method runs on a worker thread; the grid may only be touched on the
    // thread that created it, so the write is marshalled through Invoke.
    if (dataGridView1.IsDisposed || !dataGridView1.IsHandleCreated)
    {
        return;
    }
    dataGridView1.Invoke(new MethodInvoker(delegate
    {
        // The row may have been removed while the request was in flight.
        if (rowIndex >= 0 && rowIndex < dataGridView1.Rows.Count)
        {
            dataGridView1.Rows[rowIndex].Cells[1].Value = result;
        }
    }));
}
/// <summary>
/// Splits a string into an array of pieces around every occurrence of a
/// separator string. Empty pieces are kept, so the result always has
/// (number of separator occurrences + 1) elements.
/// </summary>
/// <remarks>
/// Matching is ordinal (character-for-character). An empty separator never
/// matches, so the whole source is returned as the single element.
/// </remarks>
/// <param name="strSource">The text to split.</param>
/// <param name="strSplit">The separator text.</param>
/// <returns>The pieces of <paramref name="strSource"/> in order of appearance.</returns>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public static string[] StringSplit(string strSource, string strSplit)
{
    if (strSource == null)
    {
        throw new ArgumentNullException("strSource");
    }
    if (strSplit == null)
    {
        throw new ArgumentNullException("strSplit");
    }

    // An empty separator would match at every position; treat it as "no match".
    if (strSplit.Length == 0)
    {
        return new string[] { strSource };
    }

    return strSource.Split(new string[] { strSplit }, StringSplitOptions.None);
}
/// <summary>
/// Splits a string around every occurrence of a separator string and appends
/// the pieces to a copy of an existing array.
/// </summary>
/// <remarks>
/// Matching is ordinal (character-for-character) and empty pieces are kept.
/// An empty separator never matches, so the whole source is appended as one
/// element. <paramref name="attachArray"/> itself is not modified.
/// </remarks>
/// <param name="strSource">The text to split.</param>
/// <param name="strSplit">The separator text.</param>
/// <param name="attachArray">Elements that precede the new pieces in the result.</param>
/// <returns>
/// A new array holding the elements of <paramref name="attachArray"/> followed
/// by the pieces of <paramref name="strSource"/>.
/// </returns>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public static string[] StringSplit(string strSource, string strSplit, string[] attachArray)
{
    if (strSource == null)
    {
        throw new ArgumentNullException("strSource");
    }
    if (strSplit == null)
    {
        throw new ArgumentNullException("strSplit");
    }
    if (attachArray == null)
    {
        throw new ArgumentNullException("attachArray");
    }

    // An empty separator would match at every position; treat it as "no match".
    string[] pieces = strSplit.Length == 0
        ? new string[] { strSource }
        : strSource.Split(new string[] { strSplit }, StringSplitOptions.None);

    string[] combined = new string[attachArray.Length + pieces.Length];
    attachArray.CopyTo(combined, 0);
    pieces.CopyTo(combined, attachArray.Length);
    return combined;
}
导出excel代码
/// <summary>
/// Exports the grid to a file chosen by the user through a "Save As" dialog.
/// The file gets an .xls extension but its content is tab-separated text:
/// one line of column headers followed by one line per grid row.
/// </summary>
/// <remarks>
/// Nothing is written when the user cancels the dialog. Any error while
/// opening or writing the file is shown in a message box.
/// </remarks>
private void SaveAs()
{
    using (SaveFileDialog saveFileDialog = new SaveFileDialog())
    {
        saveFileDialog.Filter = "Execl files (*.xls)|*.xls";
        saveFileDialog.FilterIndex = 1; // filter indexes are 1-based
        saveFileDialog.RestoreDirectory = true;
        saveFileDialog.CreatePrompt = true;
        saveFileDialog.Title = "Export Excel File To";

        // Without this check a cancelled dialog leaves FileName empty and OpenFile throws.
        if (saveFileDialog.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        try
        {
            // Encoding.Default is what the original GetEncoding(0) resolved to:
            // the system's default code page.
            using (Stream myStream = saveFileDialog.OpenFile())
            using (StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.Default))
            {
                // Header line.
                for (int i = 0; i < dataGridView1.ColumnCount; i++)
                {
                    if (i > 0)
                    {
                        sw.Write("\t");
                    }
                    sw.Write(dataGridView1.Columns[i].HeaderText);
                }
                sw.WriteLine();

                // One line per row; null cells are written as empty fields.
                for (int j = 0; j < dataGridView1.Rows.Count; j++)
                {
                    for (int k = 0; k < dataGridView1.Columns.Count; k++)
                    {
                        if (k > 0)
                        {
                            sw.Write("\t");
                        }
                        object value = dataGridView1.Rows[j].Cells[k].Value;
                        if (value != null)
                        {
                            sw.Write(value.ToString());
                        }
                    }
                    sw.WriteLine();
                }
            }
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.ToString());
        }
    }
}
根据网址获取源代码
/// <summary>
/// Minimal page downloader: issues HTTP GET requests with browser-like headers
/// and a cookie container shared by all requests.
/// </summary>
static class WebFunc
{
    private static readonly CookieContainer cookie = new CookieContainer();
    private static readonly string contentType = "application/x-www-form-urlencoded";
    private static readonly string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
    private static readonly string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

    // Captures the charset token of a header value such as
    // "text/html; charset=utf-8" or "text/html; charset="gb2312"; q=1",
    // stopping at whitespace, ';' or a quote.
    private static readonly Regex charsetPattern = new Regex("charset\\s*=\\s*[\"']?([^\\s;\"']+)", RegexOptions.IgnoreCase);

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as text.
    /// </summary>
    /// <param name="url">Absolute HTTP URL to fetch.</param>
    /// <returns>
    /// The response body decoded with the charset announced in the response
    /// headers (UTF-8 when none is announced or it is unknown), or an empty
    /// string when the request fails for any reason.
    /// </returns>
    public static string GetHtmlEx(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = userAgent;
            request.ContentType = contentType;
            request.CookieContainer = cookie;
            request.Accept = accept;
            request.Method = "GET";

            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, DetectEncoding(response)))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: callers only test the returned HTML for a
            // marker, so every failure (bad URL, HTTP error, timeout) maps to "".
            return "";
        }
    }

    /// <summary>
    /// Picks the text encoding from the first response header that carries a
    /// charset; falls back to UTF-8 so a missing or unrecognised charset no
    /// longer makes the whole download fail.
    /// </summary>
    private static Encoding DetectEncoding(WebResponse response)
    {
        for (int i = 0; i < response.Headers.Count; i++)
        {
            Match m = charsetPattern.Match(response.Headers[i] ?? "");
            if (!m.Success)
            {
                continue;
            }

            try
            {
                return Encoding.GetEncoding(m.Groups[1].Value);
            }
            catch (ArgumentException)
            {
                // Charset name not known to this runtime; use the fallback.
                break;
            }
        }
        return Encoding.UTF8;
    }
}
不过查得太频繁的话，Google 会返回 503 错误的，呵呵