利用Selenium爬取CSDN文章信息
——仅供学习参考
效果图:
package parameters;
import java.util.List;
import org.junit.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
/**
 * Selenium demo that logs in to CSDN through Firefox, opens a blog channel page,
 * and prints the title, author/operation text and URL of each listed article.
 *
 * <p>For learning purposes only. The absolute XPath locators used for the login form
 * are tied to the page layout at the time of writing and will break if it changes.
 */
public class Select01 {

    /** How many times the page is scrolled to the bottom before scraping. */
    private static final int SCROLL_ROUNDS = 1;

    /**
     * Runs the whole scrape: start Firefox, log in, open the channel page, scroll,
     * and print the collected article data to standard output.
     *
     * @throws InterruptedException if the thread is interrupted during one of the fixed waits
     */
    @Test
    public void test() throws InterruptedException {
        // Tell Selenium where the Firefox (gecko) driver binary lives; the path is machine-specific.
        System.setProperty("webdriver.gecko.driver","C:\\LanQiaoTest\\project\\JavaLanqiaoTest\\driver\\geckodriver.exe");
        WebDriver driver = new FirefoxDriver();
        try {
            login(driver);
            // Go to the "test" channel; another channel URL can be substituted here.
            driver.navigate().to("https://blog.csdn.net/nav/test");
            Thread.sleep(3000);
            scrollToBottom(driver, SCROLL_ROUNDS);
            printArticles(driver);
        } finally {
            // Always shut the browser and driver process down, even when a locator
            // fails midway; otherwise every run leaks a Firefox/geckodriver instance.
            driver.quit();
        }
    }

    /** Opens the CSDN login page, switches to password login, and submits the form. */
    private static void login(WebDriver driver) throws InterruptedException {
        driver.get("https://passport.csdn.net/login?code=applets");
        Thread.sleep(2000);
        // Switch to the password-login tab.
        driver.findElement(By.xpath("/html/body/div[2]/div/div[2]/div[2]/div[2]/div[1]/div[1]/span[4]")).click();
        // Account name (left blank in this sample; fill in before running).
        driver.findElement(By.xpath("/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div[2]/div/div[1]/div/input")).sendKeys("");
        // Password (left blank in this sample; fill in before running).
        driver.findElement(By.xpath("/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div[2]/div/div[2]/div/input")).sendKeys("");
        // Tick the user-agreement checkbox.
        driver.findElement(By.xpath("/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div[2]/p/div/i")).click();
        // Submit the login form.
        driver.findElement(By.xpath("/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div[2]/div/div[4]/button")).click();
        Thread.sleep(2000);
    }

    /** Scrolls the window to the bottom of the document {@code rounds} times, pausing after each scroll. */
    private static void scrollToBottom(WebDriver driver, int rounds) throws InterruptedException {
        // The cast and the script are loop-invariant, so build them once.
        JavascriptExecutor js = (JavascriptExecutor) driver;
        String jsCode = "window.scrollTo(0, document.body.scrollHeight);";
        for (int round = 0; round < rounds; round++) {
            js.executeScript(jsCode);
            Thread.sleep(500);
        }
    }

    /** Collects title, author and link elements from the current page and prints them row by row. */
    private static void printArticles(WebDriver driver) {
        // Titles
        List<WebElement> titles = driver.findElements(By.xpath("//span[@class=\"blog-text\"]"));
        // Author / operation bar
        List<WebElement> authors = driver.findElements(By.xpath("//div[@class=\"operation-c\"]"));
        // Article links
        List<WebElement> links = driver.findElements(By.xpath("//a[@class=\"blog\"]"));
        System.out.println(titles.size());
        System.out.println(authors.size());
        System.out.println(links.size());

        // The three lists come from independent queries and may differ in length;
        // only iterate over rows present in all of them to avoid IndexOutOfBoundsException.
        int rows = Math.min(titles.size(), Math.min(authors.size(), links.size()));
        for (int i = 0; i < rows; i++) {
            System.out.println("第:"+(i + 1)+"条数据");
            System.out.println("标题:"+titles.get(i).getText());
            System.out.println(authors.get(i).getText());
            System.out.println("文章地址"+links.get(i).getAttribute("href"));
            System.out.println("-----------------------------------------------------------");
        }
    }
}