.NET Core + Selenium + Chrome 抓取百度搜索结果的翻页数据(不处理跳转后的链接)
环境准备
nuget packages
- Selenium.RC
- Selenium.Support
- Selenium.WebDriver
chrome driver
-
查看 chrome 版本: chrome://settings/help
-
下载相应的 chromedriver https://chromedriver.chromium.org/downloads
-
把 chromedriver.exe 放进工程目录,右键该文件,点击 Properties,将 “Copy to Output Directory” 设为 “Copy if newer”
抓取数据
核心代码
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using System.Collections.Generic;
namespace Demo
{
    /// <summary>
    /// Drives a Chrome browser through Selenium to submit a search and walk every
    /// result page, collecting the link and title of each result heading.
    /// </summary>
    public static class DataScraper
    {
        // Absolute XPaths of the search box and the submit button.
        // NOTE(review): these presumably match the Baidu home page layout — confirm against the live page.
        private const string SearchBoxXPath = "/html/body/div[1]/div[1]/div[5]/div/div/form/span[1]/input";
        private const string SearchButtonXPath = "/html/body/div[1]/div[1]/div[5]/div/div/form/span[2]/input";

        // Container that holds the result list, and the result headings inside it.
        // The leading '.' makes the second expression relative to the container; a bare
        // "//" would search the whole document even when called on an element.
        private const string ResultContainerXPath = "//*[@id='content_left']";
        private const string ResultTitleXPath = ".//*[@class='t']";

        // The "next page" anchor; it is absent on the last result page.
        private const string NextPageXPath = "/html/body/div[1]/div[3]/div[2]/div/a[contains(text(),'下一页')]";

        /// <summary>
        /// Opens <paramref name="url"/>, searches for <paramref name="keyword"/> and pages
        /// through the results until no "next page" link is found.
        /// </summary>
        /// <param name="url">Address of the page that hosts the search form.</param>
        /// <param name="keyword">Text typed into the search box.</param>
        /// <remarks>
        /// The browser is always shut down, even when a Selenium call throws.
        /// NOTE(review): the collected &lt;url, title&gt; pairs are not returned or stored;
        /// the void signature is kept for compatibility with existing callers.
        /// </remarks>
        public static void ScrapeData(string url, string keyword = "vasdfassdf")
        {
            var driver = new ChromeDriver();
            try
            {
                driver.Navigate().GoToUrl(url);
                driver.FindElement(By.XPath(SearchBoxXPath)).SendKeys(keyword);
                driver.FindElement(By.XPath(SearchButtonXPath)).Click();
                System.Threading.Thread.Sleep(1000); // give the first result page time to render

                // Target data: <url, title>
                var results = new Dictionary<string, string>();

                while (true)
                {
                    CollectResults(driver, results);

                    // FindElements returns an empty collection instead of throwing,
                    // so it doubles as an existence check for the "next page" link.
                    var nextPage = driver.FindElements(By.XPath(NextPageXPath));
                    if (nextPage.Count == 0)
                    {
                        break;
                    }

                    nextPage[0].Click();
                    System.Threading.Thread.Sleep(700); // delay so the next page's elements can be found
                }
            }
            finally
            {
                // Quit closes the browser window and ends the chromedriver process.
                driver.Quit();
            }
        }

        // Adds every result heading on the current page to results, keyed by its link.
        private static void CollectResults(IWebDriver driver, Dictionary<string, string> results)
        {
            var container = driver.FindElement(By.XPath(ResultContainerXPath));
            foreach (var heading in container.FindElements(By.XPath(ResultTitleXPath)))
            {
                // A heading without a direct <a> child is skipped rather than aborting the run.
                var anchors = heading.FindElements(By.XPath("child::a"));
                if (anchors.Count == 0)
                {
                    continue;
                }

                var link = anchors[0].GetAttribute("href");
                if (string.IsNullOrEmpty(link))
                {
                    continue; // a null key would make the dictionary throw
                }

                // Keep the first title seen for a link.
                if (!results.ContainsKey(link))
                {
                    results.Add(link, heading.Text);
                }
            }
        }
    }
}
常见问题
-
如何便捷获取 xpath?
在 Chrome 开发者工具 (F12) 的 Elements 面板中右键目标元素,选择 Copy → Copy XPath(或 Copy full XPath)即可。
-
如何避免找不到元素报错?
System.Threading.Thread.Sleep(700); // 延时,避免找不到元素
-
如何检测一个元素是否存在?
使用 FindElements 查找(找不到元素时返回空集合,而 FindElement 会抛出 NoSuchElementException),再用 nextPage.Count > 0 判断元素是否存在