1.webmagic基本使用。
详情不再赘述,具体请看开发者给出的开发文档。我在此处使用的是 webmagic-selenium,因为我们爬取的页面往往是动态的,有时甚至伴随着点击事件;若是静态页面则不需要。
2. webmagic配置。
- maven依赖。
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>0.7.3</version>
</dependency>
2.去gitee上下载webmagic源码并修改部分代码(若用不到webmagic-selenium,则不用修改)。
修改webmagic-selenium模块下的WebDriverPool.java文件,修改完成后重新打包源码,并刷新依赖。
// 1.修改配置文件地址private static final String DEFAULT_CONFIG_FILE = "selenium.properties";//2.修改打开方式(原理就是java代码打开一个浏览器监听,若觉得很烦此处可进行关闭打开浏览器)
if (isUrl(driver)) {
sCaps.setBrowserName("phantomjs");
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
} else if (driver.equals(DRIVER_FIREFOX)) {
mDriver = new FirefoxDriver(sCaps);
} else if (driver.equals(DRIVER_CHROME)) {
ChromeOptions options = new ChromeOptions();
options.setHeadless(true);
options.addArguments("-headless");
mDriver = new ChromeDriver(options);
} else if (driver.equals(DRIVER_PHANTOMJS)) {
mDriver = new PhantomJSDriver(sCaps);
}
3.webmagic的使用
去下载自己浏览器对应的版本驱动
1.使用webmagic进行爬取当当网的商品分类,不需要使用selenium。
// ---------- DangDangProcessor.java ----------
import com.alibaba.fastjson.JSONObject;
import com.magic.demo.ConsolePipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * Crawls the Dangdang category page and prints its three-level category tree
 * (first-, second- and third-level titles) to standard output.
 *
 * <p>The page is static, so the default downloader is used; Selenium is not needed.
 */
public class DangDangProcessor implements PageProcessor {

    private final Site site = Site
            .me()
            // Delay between two page requests, in milliseconds (this is not a timeout).
            .setSleepTime(3000)
            // User-Agent header sent with every request.
            .setUserAgent(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36");

    /**
     * Walks every {@code div.classify_books} section of the page and prints the
     * category titles found inside it.
     *
     * @param page the downloaded category page
     */
    @Override
    public void process(Page page) {
        Selectable selectable = page.getHtml().xpath("//div[@class=\"classify_books\"]");
        List<Selectable> nodes = selectable.nodes();
        for (Selectable selectableNode : nodes) {
            // First-level title.
            String oneTitle = selectableNode.$("h3[class^=\"classify_title\"] > a", "text").toString();
            System.out.println("======" + oneTitle);

            Selectable selectable2 = selectableNode.xpath("//div[@class=\"classify_kind\"]");
            for (Selectable selectable2C : selectable2.nodes()) {
                // Second-level title.
                String twoTitle = selectable2C.xpath("/div/div/a/text()").toString();
                System.out.println("============" + twoTitle);

                // Third-level titles.
                List<String> threeTitle = selectable2C.xpath("//li[@name=\"cat_3\"]/a/text()").all();
                System.out.println("====================================" + JSONObject.toJSONString(threeTitle));
            }
        }
    }

    /**
     * @return the crawl configuration (request delay and User-Agent) used by the spider
     */
    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // For Pipeline usage see
        // http://webmagic.io/docs/zh/posts/ch6-custom-componenet/pipeline.html
        Spider.create(new DangDangProcessor())
                .addUrl("http://category.dangdang.com/?ref=www-0-C")
                .addPipeline(new ConsolePipeline())
                .run();
    }
}

// ---------- ConsolePipeline.java ----------
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.List;
import java.util.Map;

/**
 * Minimal pipeline that prints the URL of every processed request.
 */
public class ConsolePipeline implements Pipeline {

    /**
     * Prints the URL of the request that produced {@code resultItems}.
     *
     * @param resultItems the results extracted for one page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // URL that was visited.
        System.out.println("url:" + resultItems.getRequest().getUrl());
        // For Pipeline usage see
        // http://webmagic.io/docs/zh/posts/ch6-custom-componenet/pipeline.html
    }
}
效果如下
2. 当我们需要爬取登录后才能访问的网站,或者需要在动态网页中单击某个菜单之后才能获取到所需的页面信息时,使用webmagic-selenium。
import com.alibaba.fastjson.JSONObject;import com.magic.demo.ConsolePipeline;import org.openqa.selenium.By;import org.openqa.selenium.Cookie;import org.openqa.selenium.WebDriver;import org.openqa.selenium.WebElement;import org.openqa.selenium.chrome.ChromeDriver;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;import us.codecraft.webmagic.processor.PageProcessor;import java.util.Properties;import java.util.Set;public class JcoolLoginProcessor implements PageProcessor { private Set<Cookie> cookies; private Site site = Site
.me()
.setSleepTime(3000) // .setCycleRetryTimes(5)失败则会重试
.setUserAgent( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"); @Override
public void process(Page page) {
} void getCookie(){ //加载驱动
System.setProperty("webdriver.chrome.driver", "C:UsersadminAppDataLocalGoogleChromeApplication75.0.3770.100chromedriver_win32chromedriver.exe");
WebDriver driver = new ChromeDriver();
driver.manage().window().maximize(); //打开地址
driver.get("登录url"); //获取用户名 密码的标签
driver.findElement(By.xpath("//input[@name='loginName']")).sendKeys("admin");
driver.findElement(By.xpath("//input[@name='loginPass']")).sendKeys("123456"); //获取登录按钮
WebElement element = driver.findElement(By.xpath("//button[@class='el-button loadBtn el-button--primary']")); //单击登录
element.click(); //获取返回cookie
cookies = driver.manage().getCookies();
driver.close();
} @Override
public Site getSite() { //给site增加请求头 注意需设置域名后,addCookie才可生效 site.setDomain()
for (Cookie cookie : cookies) {
site.addCookie(cookie.getName().toString(), cookie.getValue().toString());
} return site;
} public static void main(String[] args) {
JcoolLoginProcessor jcoolLoginProcessor = new JcoolLoginProcessor();
jcoolLoginProcessor.getCookie();
Spider.create(jcoolLoginProcessor)
.addUrl("要访问的需要登录url")
.addPipeline(new ConsolePipeline())
.setDownloader(new SeleniumDownloader("C:UsersadminAppDataLocalGoogleChromeApplication75.0.3770.100chromedriver_win32chromedriver.exe"))
.run();
}
}