Java爬虫第三篇:使用Jsoup 抓取文章
概述
本章讲解在使用 Selenium 模拟登录 CSDN 之后,如何使用 Jsoup 解析页面并抓取文章内容。
1. Jsoup maven配置
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
2. 测试
/**
 * Demo: logs in to CSDN through a Selenium-driven Chrome browser, then uses
 * Jsoup to parse a column page and download every article listed on it.
 */
public class Test {

    /** Pause, in milliseconds, inserted between browser steps so the page can settle. */
    private static final int PAUSE_MILLIS = 1000;

    /**
     * Runs the whole login-and-scrape flow and prints each article's link,
     * title, description and content to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String username = "xxxx@163.com";
        String password = "***********";
        String url = "https://passport.csdn.net/login";

        // Path of the local chromedriver binary.
        System.setProperty("webdriver.chrome.driver", SeleniumUtil.CHROMEDRIVERPATH);
        ChromeOptions chromeOptions = new ChromeOptions();
        // Must be two ASCII hyphens; an en-dash is not recognised as a switch.
        // "--start-maximized" is another option that can be added here.
        chromeOptions.addArguments("--no-sandbox");

        WebDriver driver = new ChromeDriver(chromeOptions);
        try {
            driver.get(url);
            SeleniumUtil.sleep(PAUSE_MILLIS);

            // Switch the login form to the account/password tab.
            WebElement mainSelectE = driver.findElement(By.cssSelector(".main-select"));
            List<WebElement> loginType = mainSelectE.findElements(By.tagName("a"));
            for (WebElement aelement : loginType) {
                if (aelement.getText().contains("帐号登录")) {
                    aelement.click();
                    System.out.println("切换到帐号登录....");
                    break;
                }
            }
            System.out.println("继续操作....");
            SeleniumUtil.sleep(PAUSE_MILLIS);

            // Fill in the credentials and submit.
            WebElement formE = driver.findElement(By.cssSelector("form"));
            formE.findElement(By.id("all")).sendKeys(username);
            formE.findElement(By.id("password-number")).sendKeys(password);
            formE.findElement(By.cssSelector("button")).click();
            // Give the login request time to finish before navigating away.
            SeleniumUtil.sleep(PAUSE_MILLIS);

            driver.get("https://blog.csdn.net/forezp/column/info/15197/3");
            SeleniumUtil.sleep(PAUSE_MILLIS);

            // Grab the rendered page content from Selenium.
            WebElement body = driver.findElement(By.cssSelector("body"));
            // Hand it to Jsoup; the base URI lets "abs:href" resolve relative links.
            Document doc = Jsoup.parse(body.getAttribute("outerHTML"), driver.getCurrentUrl());
            Elements es = doc.select("ul.column_article_list>li");
            for (Element e : es) {
                String absHref = e.select("a").attr("abs:href");
                String titile = e.select("h2.title").text();
                String desc = e.select("div.column_article_desc").html();
                System.out.println("absHref:" + absHref);
                if (absHref.isEmpty()) {
                    // No usable link in this list item; Jsoup.connect would reject it.
                    continue;
                }
                // NOTE(review): this request does not carry the browser's login
                // cookies, so it only sees what an anonymous visitor sees.
                String content;
                try {
                    Document detailDoc = Jsoup.connect(absHref).get();
                    content = detailDoc.select("div#content_views").html();
                } catch (java.io.IOException ex) {
                    // One unreachable article should not abort the whole crawl.
                    System.err.println("Failed to fetch " + absHref + ": " + ex);
                    continue;
                }
                System.out.println("titile:" + titile);
                System.out.println("desc:" + desc);
                System.out.println("content:" + content);
                System.out.println();
            }
        } finally {
            // Always shut the browser and the chromedriver process down.
            driver.quit();
        }
    }
}