最近,女友的妹妹要在网上搜集房产中介(经纪人)的信息用于招聘,自己一个一个去找太慢。女友知道我是搞 IT 的,就问我有没有办法帮她快速找到电话号码,于是我想到了爬虫程序。但普通的爬虫代码有局限,刚好自己在做自动化测试,就想到用 WebDriver 自动化测试工具编写一个爬虫工具,抓取赶集网上的经纪人信息。
自己一直用 Java 编写自动化测试脚本,但对于不会编程的人来说,没有界面操作很不方便,于是我想到编写一个桌面程序。但是 Java 的 GUI 不够美观,最后想到了微软的 C#,于是自学 VS2010 和 .NET,用 C# 结合 WebDriver 的 C# 版本编写了一个小工具。
支持使用火狐浏览器或 PhantomJS 无界面(内存)浏览器进行抓取。
将数据查询出放在界面上
支持导出excel文件
基本的实现原理是:先计算出总页数,然后循环逐页抓取数据;抓取到的数据先临时存储在 List 中,再填充到界面上的数据控件;点击导出 Excel 时,把数据控件中的数据转换成 List,再导出到 Excel。
界面如下:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using OpenQA.Selenium;
using OpenQA.Selenium.Firefox;
using OpenQA.Selenium.Interactions;
using OpenQA.Selenium.Interactions.Internal;
using OpenQA.Selenium.Support;
using System.Threading;
using Microsoft.Office;
using Excel;
using System.Drawing;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.IE;
using OpenQA.Selenium.Remote;
using System.Timers;
namespace tel_search
{
public partial class Form1 : Form
{
// Crawl stopwatch state shared between button1_Click (which creates, stops and reads it)
// and OnTimedEvent (which increments it).
static int time_js = 0;// tick counter; button1_Click divides it by 10 to get seconds (timer interval is 100 ms)
static System.Timers.Timer timer;
// Creates the form. InitializeComponent is not defined in this part of the partial class;
// presumably it is the designer-generated method that builds the controls — verify in Form1.Designer.cs.
public Form1()
{
InitializeComponent();
}
// Event-handler stub with an empty body; presumably wired to label1's Click event
// in the designer file (not visible here) — confirm before removing.
private void label1_Click(object sender, EventArgs e)
{
}
// Event-handler stub with an empty body; presumably wired to label2's Click event
// in the designer file (not visible here) — confirm before removing.
private void label2_Click(object sender, EventArgs e)
{
}
// Event-handler stub with an empty body; presumably wired to label3's Click event
// in the designer file (not visible here) — confirm before removing.
private void label3_Click(object sender, EventArgs e)
{
}
// Pre-selects the radio button that makes button1_Click crawl with PhantomJSDriver,
// so a browser choice is already checked when the form is shown.
private void Form1_Load(object sender, EventArgs e)
{
radioButton_phatom.Checked = true;
}
/// <summary>
/// Maps the district names offered in <c>listBox1</c> to the URL path segment used by
/// the ganji.com agent listing. Names that are not listed map to the city-wide listing.
/// </summary>
private static readonly Dictionary<string, string> DistrictSlugs = new Dictionary<string, string>
{
    { "全深圳", "" },
    { "福田", "futian" },
    { "罗湖", "luohu" },
    { "南山", "nanshan" },
    { "宝安", "baoan" },
    { "龙岗", "longgang" },
    { "盐田", "yantian" },
    { "龙华新区", "longhuaxinqu" },
    { "光明新区", "guangmingxinqu" },
    { "坪山新区", "pingshanxinqu" },
    { "大鹏新区", "dapengxinqu" },
    { "深圳周边", "shenzhenzhoubian" }
};

// XPath of the numbered pagination links (everything except the "next"/"prev" links).
private const string PageLinkXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[not(@class='next'or@class='prev')]";
// XPath of the "next page" button; it is absent on the last page.
private const string NextPageXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[@class='next']/span";
// XPath of one agent business card in the result list.
private const string CardXPath = "//div[@class='listBox']/ul/li/div[@class='list-mod2']";
// Number of leading characters (the field caption) stripped from each detail paragraph.
private const int LabelPrefixLength = 6;
// Highest pagination-link index that is clicked; later pages are always reached through this slot.
private const int MaxPageLinkIndex = 5;

/// <summary>
/// Crawls the agent listing of the district selected in <c>listBox1</c>, appends the
/// scraped business cards to <c>listView1</c> and reports the elapsed time.
/// </summary>
/// <remarks>
/// The crawl runs synchronously inside this click handler. The browser and the stopwatch
/// timer are always released in a <c>finally</c> block, including when an exception
/// propagates or the method returns early.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    if (listBox1.SelectedItems.Count == 0)
    {
        MessageBox.Show("请选择地区", "温馨提示", MessageBoxButtons.OK);
        listBox1.Focus();
        return;
    }

    MessageBox.Show("数据比较多,可能需要几分钟时间抓取,请耐心等待!运行过程中,请不要关闭浏览器窗口,否则程序会报错!", "温馨提示", MessageBoxButtons.OK);

    string districtName = listBox1.SelectedItem.ToString();
    string districtSlug;
    if (!DistrictSlugs.TryGetValue(districtName, out districtSlug))
    {
        districtSlug = "";
    }
    string listUrl = "http://sz.ganji.com/zufang/agent/" + districtSlug + "/";

    List<string[]> cards = new List<string[]>();
    int pageCount = 0;
    IWebDriver driver = null;

    StartCrawlTimer();
    try
    {
        driver = CreateSelectedDriver();
        if (driver == null)
        {
            // No browser could be started (or none is selected); the finally block stops the timer.
            return;
        }

        driver.Manage().Window.Maximize();
        try
        {
            driver.Navigate().GoToUrl(listUrl);
        }
        catch (Exception)
        {
            // Close the browser before the message box is shown, and clear the local
            // so the finally block does not quit it a second time.
            IWebDriver failedDriver = driver;
            driver = null;
            failedDriver.Quit();
            MessageBox.Show("打不开网页,您的网络可能有问题,请检查网络连接是否正常!", "温馨提示");
            return;
        }

        pageCount = CountResultPages(driver);

        // Counting the pages walked to the last page, so go back to the first one.
        driver.Navigate().GoToUrl(listUrl);
        // The implicit wait is a session-wide setting, so setting it once is enough.
        driver.Manage().Timeouts().ImplicitlyWait(TimeSpan.FromSeconds(1));

        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
        {
            // The links have to be located again after every click.
            IList<IWebElement> pageLinks = driver.FindElements(By.XPath(PageLinkXPath));
            bool paginated = pageLinks.Count > 0;
            string pageLabel = "1";
            if (paginated)
            {
                IWebElement link = pageLinks[Math.Min(pageIndex, MaxPageLinkIndex)];
                pageLabel = link.Text;
                link.Click();
            }

            CollectCards(driver, districtName, pageLabel, cards);

            if (!paginated)
            {
                // A listing without pagination links consists of the current page only.
                break;
            }
        }

        ShowCards(cards);
    }
    finally
    {
        try
        {
            if (driver != null)
            {
                driver.Quit();
            }
        }
        finally
        {
            StopCrawlTimer();
        }
    }

    // The timer ticks every 100 ms, so ten ticks make one second.
    int totalSeconds = time_js / 10;
    time_lbl.Visible = true;
    time_lbl.Text = (totalSeconds / 60) + "分" + (totalSeconds % 60) + "秒";
    MessageBox.Show("谢谢你的耐心等待" + "成功抓取" + pageCount + "页" + cards.Count + "条名片信息,用时:" + time_lbl.Text, "恭喜");
}

// Resets the tick counter and starts the 100 ms stopwatch timer for a new crawl.
private void StartCrawlTimer()
{
    time_js = 0;
    timer = new System.Timers.Timer(100);
    timer.Elapsed += new System.Timers.ElapsedEventHandler(OnTimedEvent);
    timer.AutoReset = true;
    timer.Enabled = true;
}

// Stops and disposes the stopwatch timer started by StartCrawlTimer.
private static void StopCrawlTimer()
{
    timer.Enabled = false;
    timer.Dispose();
}

// Creates the WebDriver for the checked radio button; returns null when no browser is
// selected or when Firefox could not be started (the user is then offered the download).
private IWebDriver CreateSelectedDriver()
{
    if (radioButton_firefox.Checked)
    {
        try
        {
            FirefoxProfile profile = new FirefoxProfile();
            profile.SetPreference("browser.bookmarks.restore_default_bookmarks", false);
            return new FirefoxDriver(profile);
        }
        catch (Exception)
        {
            DialogResult download = MessageBox.Show("你的电脑没有安装火狐浏览器,是否立即下载", "温馨提示", MessageBoxButtons.YesNo);
            if (download == DialogResult.Yes)
            {
                listBox1.Focus();
                System.Diagnostics.Process.Start("http://download.firefox.com.cn/releases/stub/official/zh-CN/Firefox-latest.exe");
            }
            return null;
        }
    }

    if (radioButton_phatom.Checked)
    {
        return new PhantomJSDriver();
    }

    return null;
}

// Walks to the last result page by repeatedly clicking the last numbered link and returns
// that link's number; returns 1 when the listing shows no pagination links at all.
private static int CountResultPages(IWebDriver driver)
{
    string lastLabel = null;
    bool hasNext;
    do
    {
        // An empty collection (not null) means the "next" button is absent.
        hasNext = driver.FindElements(By.XPath(NextPageXPath)).Count != 0;
        IList<IWebElement> pageLinks = driver.FindElements(By.XPath(PageLinkXPath));
        if (pageLinks.Count == 0)
        {
            break;
        }

        IWebElement lastLink = pageLinks[pageLinks.Count - 1];
        // Read the label before clicking so that it is not read from the element after the click.
        lastLabel = lastLink.Text;
        lastLink.Click();
    }
    while (hasNext);

    return lastLabel == null ? 1 : Convert.ToInt32(lastLabel);
}

// Scrapes every business card on the current page and appends one 7-field row per card:
// district, name, phone, company, service district, service estates, page label.
private static void CollectCards(IWebDriver driver, string districtName, string pageLabel, List<string[]> cards)
{
    foreach (IWebElement card in driver.FindElements(By.XPath(CardXPath)))
    {
        string name = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-name']/a")).Text;
        string phone = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-tel']")).Text;
        // Fetch the detail paragraphs once instead of once per field.
        IList<IWebElement> details = card.FindElement(By.XPath("div[@class='broker-service']")).FindElements(By.TagName("p"));

        cards.Add(new string[]
        {
            districtName,
            name,
            phone,
            DetailText(details, 0),
            DetailText(details, 1),
            DetailText(details, 2),
            pageLabel
        });
    }
}

// Returns the text of paragraph `index` without its leading caption, or an empty string
// when the paragraph is missing or holds nothing beyond the caption.
private static string DetailText(IList<IWebElement> paragraphs, int index)
{
    if (index >= paragraphs.Count)
    {
        return string.Empty;
    }

    string text = paragraphs[index].Text;
    if (text == null || text.Length <= LabelPrefixLength)
    {
        return string.Empty;
    }

    return text.Substring(LabelPrefixLength);
}

// Appends the scraped rows to listView1, numbering them from 1 in the first column.
// Rows already shown from an earlier search are kept.
private void ShowCards(List<string[]> cards)
{
    col_header1.Width = 30;
    col_header2.Width = 45;
    col_header3.Width = 100;
    col_header4.Width = 90;
    col_header5.Width = 350;
    col_header6.Width = 115;
    col_header7.Width = 465;
    col_header8.Width = 20;

    listView1.GridLines = true;
    listView1.MultiSelect = true;
    listView1.BeginUpdate();
    try
    {
        for (int i = 0; i < cards.Count; i++)
        {
            ListViewItem row = new ListViewItem((i + 1).ToString());
            row.SubItems.AddRange(cards[i]);
            listView1.Items.Add(row);
        }
    }
    finally
    {
        listView1.EndUpdate();
    }
    listView1.LabelEdit = true;
    listView1.FullRowSelect = true;
}
// Event-handler stub with an empty body; presumably wired to listView1's
// SelectedIndexChanged event in the designer file (not visible here) — confirm before removing.
private void listView1_SelectedIndexChanged(object sender, EventArgs e)
{
}
// Click handler that starts the Excel export: turntoexcel asks for a target file
// and then exports the rows of listView1.
private void button4_Click(object sender, EventArgs e)
{
turntoexcel();
}
/// <summary>
/// Asks the user for an .xls target file and, if one is confirmed, exports the rows of
/// <c>listView1</c> to it through <c>DoExport</c>.
/// </summary>
private void turntoexcel()
{
    // SaveFileDialog is a disposable component; release it as soon as the dialog is done.
    using (SaveFileDialog sfd = new SaveFileDialog())
    {
        sfd.DefaultExt = "xls";
        sfd.Filter = "Excel文件(*.xls)|*.xls";
        if (sfd.ShowDialog() == DialogResult.OK)
        {
            DoExport(listView1, sfd.FileName);
        }
    }
}
/// <summary>
/// Writes the column headers and all rows of <paramref name="listView"/> into a new Excel
/// workbook and saves it as <paramref name="strFileName"/>.
/// </summary>
/// <remarks>
/// The workbook is closed and the Excel application is quit in a <c>finally</c> block,
/// including when saving fails.
/// </remarks>
/// <param name="listView">List view whose columns and items are exported.</param>
/// <param name="strFileName">Full path of the .xls file to create; nothing is exported when it is null or empty.</param>
private void DoExport(ListView listView, string strFileName)
{
    // Check the list view that was passed in, not the form's own listView1 field.
    if (listView == null || listView.Items.Count == 0)
    {
        MessageBox.Show("没有数据,无法导出!");
        return;
    }
    if (string.IsNullOrEmpty(strFileName))
    {
        return;
    }

    Microsoft.Office.Interop.Excel.Application xlApp = null;
    Microsoft.Office.Interop.Excel.Workbook xlBook = null;
    try
    {
        try
        {
            xlApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
        }
        catch (System.Runtime.InteropServices.COMException)
        {
            // `new` never yields null; a missing Excel installation surfaces as a COM error.
            MessageBox.Show("无法创建excel对象,可能您的系统没有安装excel");
            return;
        }

        xlApp.DefaultFilePath = "";
        xlApp.DisplayAlerts = true;
        xlApp.SheetsInNewWorkbook = 1;
        xlBook = xlApp.Workbooks.Add(true);

        // Row 1: the list view's column captions.
        int rowIndex = 1;
        int columnIndex = 0;
        foreach (ColumnHeader dc in listView.Columns)
        {
            columnIndex++;
            xlApp.Cells[rowIndex, columnIndex] = dc.Text;
        }

        // Rows 2..n: one worksheet row per list view item.
        foreach (ListViewItem item in listView.Items)
        {
            rowIndex++;
            columnIndex = 0;
            foreach (ListViewItem.ListViewSubItem cell in item.SubItems)
            {
                columnIndex++;
                // The trailing "\t" keeps long numbers (e.g. phone numbers) from being shown in scientific notation.
                xlApp.Cells[rowIndex, columnIndex] = cell.Text + "\t";
            }
        }

        // xlWorkbookNormal is used instead of xlExcel9795, which fails with HRESULT 0x800A03EC
        // on Excel 2003/2007 and later.
        xlBook.SaveAs(strFileName, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
    }
    finally
    {
        // Setting the references to null does not release Excel; close and quit explicitly.
        try
        {
            if (xlBook != null)
            {
                xlBook.Close(false, Type.Missing, Type.Missing);
                System.Runtime.InteropServices.Marshal.ReleaseComObject(xlBook);
            }
        }
        finally
        {
            if (xlApp != null)
            {
                xlApp.Quit();
                System.Runtime.InteropServices.Marshal.ReleaseComObject(xlApp);
            }
        }
    }

    MessageBox.Show("恭喜导出成功!");
}
/// <summary>
/// Background-worker body: blocks the worker thread for the number of milliseconds
/// passed as the <c>int</c> argument of the work request.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Work arguments; <c>e.Argument</c> must be a boxed <c>int</c>.</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Unbox the requested duration and sleep for that long.
    Thread.Sleep((int)e.Argument);
}
// Event-handler stub with an empty body; presumably wired to textBox1's TextChanged
// event in the designer file (not visible here) — confirm before removing.
private void textBox1_TextChanged(object sender, EventArgs e)
{
}
/// <summary>
/// Stores the Firefox installer URL on the first link of <c>linkLabel1</c> and opens
/// that URL through the operating system shell.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Link-click data (unused).</param>
private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
{
    LinkLabel.Link downloadLink = linkLabel1.Links[0];
    downloadLink.LinkData = "http://download.firefox.com.cn/releases/stub/official/zh-CN/Firefox-latest.exe";
    System.Diagnostics.Process.Start(downloadLink.LinkData.ToString());
}
/// <summary>
/// Timer callback: adds one tick to <c>time_js</c> while the stopwatch timer is enabled.
/// </summary>
/// <remarks>
/// The timer is created without a <c>SynchronizingObject</c>, so this handler runs on
/// thread-pool threads and may overlap with itself or with the code that stops the timer.
/// The counter is therefore incremented atomically and the static timer reference is
/// read once into a local.
/// </remarks>
/// <param name="source">The timer that raised the event (unused).</param>
/// <param name="e">Elapsed-event data (unused).</param>
private void OnTimedEvent(Object source, ElapsedEventArgs e)
{
    System.Timers.Timer current = timer;
    if (current != null && current.Enabled)
    {
        System.Threading.Interlocked.Increment(ref time_js);
    }
}
}
}