一.使用nuget安装Selenium
在nuget中搜索Selenium 安装以下三项
二.下载ChromeDriver
在 http://chromedriver.storage.googleapis.com/index.html 或者镜像网站 http://npm.taobao.org/mirrors/chromedriver/ 手动下载与浏览器版本对应的 ChromeDriver，放入 bin/Debug 目录中。
三.初始化Driver
// Chrome start-up options. Uncomment the two AddArguments lines below to run
// the browser headless with a fixed window size.
var op = new ChromeOptions();
// op.AddArguments("--headless");               // enable the headless browser
// op.AddArguments("--window-size=1920,1080");

// The directory passed to ChromeDriver is the application's base directory.
string driverDirectory = AppDomain.CurrentDomain.BaseDirectory;
var driver = new ChromeDriver(driverDirectory, op);
四.按照关键词打开对应网页。如果爬取过于频繁会被拒绝访问（Forbidden），所以每爬取一个页面后要 sleep 一段时间，或者使用代理。
// Keywords to search for, one search-result crawl per keyword.
List<string> ketList = new List<string> { "新闻联播", "康辉", "国际锐评", "央视快评", "耕战频道", "中央广播电视总台", "中央台", "三台", "CCTV" };
foreach (string keyWord in ketList)
{
    // Percent-encode the keyword once per keyword so that characters such as
    // '&', '#', '+' or spaces cannot corrupt the query string.
    string encodedKeyWord = Uri.EscapeDataString(keyWord);

    // NOTE(review): the inclusive bound requests PageCount + 1 pages per
    // keyword — confirm that this is intended.
    for (int i = 0; i <= PageCount; i++)
    {
        // The "start" offset advances by 50 on every iteration.
        string url = $"http://www.douban.com/group/search?start={i * 50}&cat=1013&sort=time&q={encodedKeyWord}";
        driver.Navigate().GoToUrl(url);
        GetArticleDetail(driver, keyWord);

        // Pause after every page load, including the last one for a keyword,
        // so the first request of the next keyword is delayed as well.
        Thread.Sleep(sleeptime);

        // Read the page source once and stop paging for this keyword when the
        // page reports no results or removed content.
        string pageSource = driver.PageSource;
        if (pageSource.Contains("可以换个关键词试试") || pageSource.Contains("此页的内容可能已被移除"))
        {
            break;
        }
    }
}
五.使用xpath搜索信息
/// <summary>
/// Reads every <c>tr</c> element with class <c>pl</c> from the page currently
/// loaded in <paramref name="driver"/>, builds one <c>Articles</c> entry per row
/// (keyword, link URL, link title, reply count), appends it to
/// <c>SearchResult</c> and prints it to the console.
/// </summary>
/// <param name="driver">Browser session whose current page is inspected.</param>
/// <param name="keyword">Search keyword stored on every extracted entry.</param>
/// <remarks>
/// Exceptions are written to the console and never propagated. A row that
/// cannot be read or parsed is skipped so the remaining rows are still collected.
/// </remarks>
private static void GetArticleDetail(ChromeDriver driver, string keyword)
{
    try
    {
        // FindElements(By) replaces FindElementsByXPath, which no longer
        // exists in Selenium 4, and matches the By.XPath lookups used below.
        var rowElements = driver.FindElements(By.XPath(".//tr[@class='pl']"));
        foreach (var rowElement in rowElements)
        {
            try
            {
                // Look the subject link up once; both href and title come from it.
                var subjectLink = rowElement.FindElement(By.XPath("./td[@class='td-subject']/a"));
                string replyText = rowElement.FindElement(By.XPath("./td[@class='td-reply']/span")).Text;

                Articles entity = new Articles();
                entity.keyword = keyword;
                entity.url = subjectLink.GetAttribute("href");
                entity.title = subjectLink.GetAttribute("title");
                // The reply cell text carries a "回应" label next to the number.
                entity.commentnum = int.Parse(replyText.Replace("回应", ""));

                SearchResult.Add(entity);
                Console.WriteLine(entity.ToString());
            }
            catch (Exception rowError)
            {
                // Log and continue: one malformed row must not discard the rest.
                Console.WriteLine(rowError.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString());
    }
}
六.把结果导出为excel
需要安装NPOI
/// <summary>
/// Exports <paramref name="accountList"/> to a new .xlsx workbook in the current
/// working directory. The file name is the current local time formatted as
/// <c>yyyyMMddHHmmss</c>, followed by <paramref name="tableName"/> and ".xlsx".
/// </summary>
/// <remarks>
/// Row 0 is a header: column 0 is "序号" and the following columns are the public
/// property names of the entry type. Each later row holds the 1-based index in
/// column 0 followed by the <c>ToString()</c> of each property value (empty
/// string for null). An empty list produces a header-only sheet.
/// </remarks>
/// <param name="accountList">Entries to export; must not be null.</param>
/// <param name="tableName">Suffix appended to the timestamp in the file name.</param>
/// <exception cref="ArgumentNullException"><paramref name="accountList"/> is null.</exception>
public static void ExortExcel(List<Articles> accountList, string tableName)
{
    if (accountList == null)
    {
        throw new ArgumentNullException(nameof(accountList));
    }

    // Use the runtime type of the first entry when there is one; otherwise fall
    // back to the declared element type so an empty list no longer throws.
    Type entityType = accountList.Count > 0 ? accountList[0].GetType() : typeof(Articles);
    System.Reflection.PropertyInfo[] entityProperties = entityType.GetProperties();

    IWorkbook workbook = new XSSFWorkbook();
    ISheet sheet = workbook.CreateSheet("sheet");

    // Header row: index column first, then one column per property.
    IRow headerRow = sheet.CreateRow(0);
    headerRow.CreateCell(0).SetCellValue("序号");
    for (int k = 0; k < entityProperties.Length; k++)
    {
        headerRow.CreateCell(k + 1).SetCellValue(entityProperties[k].Name);
    }

    // Data rows start at sheet row 1; column 0 carries the 1-based index.
    for (int i = 0; i < accountList.Count; i++)
    {
        IRow dataRow = sheet.CreateRow(i + 1);
        dataRow.CreateCell(0).SetCellValue(i + 1);

        object entity = accountList[i];
        for (int j = 0; j < entityProperties.Length; j++)
        {
            object value = entityProperties[j].GetValue(entity);
            dataRow.CreateCell(j + 1).SetCellValue(value != null ? value.ToString() : "");
        }
    }

    string fileName = Path.Combine(
        Directory.GetCurrentDirectory(),
        DateTime.Now.ToString("yyyyMMddHHmmss") + tableName + ".xlsx");

    // FileMode.Create truncates an existing file; OpenOrCreate would leave stale
    // trailing bytes behind and could produce a corrupt workbook.
    using (FileStream ms = new FileStream(fileName, FileMode.Create))
    {
        workbook.Write(ms);
    }
}