作为一名小菜鸡,经常看到很多博客在说python的爬虫的优点以及好处,但是由于工作比较忙,以及暂时不想把重心转移到新的语言的学习上,去百度了java的爬虫框架.
结果找到了这一款WebMagic框架,基于Java,由国人编写,功能很完善,所以尝试一下.
本例使用IntelliJ IDEA作为开发工具,基于Maven创建了项目,引入了WebMagic的相关包以及log4j.考虑到后期数据的存储,所以直接在pom中配置了Spring+SpringMVC+MyBatis的相关包,使用MySQL 5.5数据库.
下面贴上pom.xml的代码
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>cn.com.zach.webMagic</groupId>
  <artifactId>WebMagic</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>war</packaging>
  <name>WebMagic Maven Webapp</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Java 8 is required: Spring 5 needs it and the crawler uses java.util.function -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <!-- Spring version -->
    <spring.version>5.0.5.RELEASE</spring.version>
    <!-- MyBatis version -->
    <mybatis.version>3.2.6</mybatis.version>
    <!-- Logging library versions -->
    <slf4j.version>1.7.7</slf4j.version>
    <log4j.version>1.2.17</log4j.version>
    <!-- WebMagic version, shared by all webmagic-* artifacts -->
    <webmagic.version>0.6.1</webmagic.version>
  </properties>

  <dependencies>
    <!-- Spring core modules -->
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-core</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-web</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-oxm</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-aspects</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-tx</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-jdbc</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-webmvc</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-aop</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-context-support</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <!-- Test-only: must not be packaged into the WAR -->
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-test</artifactId>
      <version>${spring.version}</version>
      <scope>test</scope>
    </dependency>

    <!-- MyBatis core -->
    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis</artifactId>
      <version>${mybatis.version}</version>
    </dependency>
    <!-- MyBatis / Spring integration -->
    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis-spring</artifactId>
      <version>1.2.2</version>
    </dependency>

    <!-- MySQL JDBC driver -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.30</version>
    </dependency>
    <dependency>
      <groupId>c3p0</groupId>
      <artifactId>c3p0</artifactId>
      <version>0.9.1.2</version>
    </dependency>

    <!-- JSTL tag library -->
    <dependency>
      <groupId>jstl</groupId>
      <artifactId>jstl</artifactId>
      <version>1.2</version>
    </dependency>

    <!-- Logging -->
    <!-- log start -->
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>${log4j.version}</version>
    </dependency>
    <!-- Object formatting, handy for log output -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.1.41</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
    <!-- log end -->

    <!-- JSON support -->
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-mapper-asl</artifactId>
      <version>1.9.13</version>
    </dependency>

    <!-- File upload components -->
    <dependency>
      <groupId>commons-fileupload</groupId>
      <artifactId>commons-fileupload</artifactId>
      <version>1.3.1</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>commons-codec</groupId>
      <artifactId>commons-codec</artifactId>
      <version>1.9</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-dbcp/commons-dbcp -->
    <dependency>
      <groupId>commons-dbcp</groupId>
      <artifactId>commons-dbcp</artifactId>
      <version>1.4</version>
    </dependency>

    <!-- Servlet API: supplied by the container at runtime, so it is "provided".
         Spring 5 requires Servlet 3.1 or later. -->
    <dependency>
      <groupId>javax.servlet</groupId>
      <artifactId>javax.servlet-api</artifactId>
      <version>3.1.0</version>
      <scope>provided</scope>
    </dependency>

    <!-- JUnit (test-only) -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>

    <!-- WebMagic -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-core</artifactId>
      <version>${webmagic.version}</version>
    </dependency>
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-extension</artifactId>
      <version>${webmagic.version}</version>
    </dependency>
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-selenium</artifactId>
      <version>${webmagic.version}</version>
    </dependency>
  </dependencies>

  <build>
    <finalName>WebMagic</finalName>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
        <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_war_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.7.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.20.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-war-plugin</artifactId>
          <version>3.2.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
    <resources>
      <!-- Declaring <resources> replaces Maven's default, so src/main/resources must be
           listed explicitly or files such as log4j.properties never reach the classpath. -->
      <resource>
        <directory>src/main/resources</directory>
      </resource>
      <!-- Also pick up XML files (e.g. MyBatis mappers) kept next to the Java sources -->
      <resource>
        <directory>src/main/java</directory>
        <includes>
          <include>**/*.xml</include>
        </includes>
      </resource>
    </resources>
  </build>
</project>
考虑到Demo中集成了log4j的相关包,所以需要配置一下log4j,在resources下创建log4j.properties文件(文件名必须是log4j.properties,log4j才会自动加载)
log4j.properties的配置如下:
### Root logger: DEBUG level, sent to the console and to the two file appenders ###
log4j.rootLogger = debug,stdout,D,E
### Console output ###
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = [%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
### DEBUG and above go to log/log.log under the crawler directory on drive F ###
### The non-ASCII directory name is written with \uXXXX escapes because .properties ###
### files are read as ISO-8859-1; raw Chinese characters would produce a garbled path ###
log4j.appender.D = org.apache.log4j.DailyRollingFileAppender
log4j.appender.D.File = F://webMagic\u722c\u866b/log/log.log
log4j.appender.D.Append = true
log4j.appender.D.Threshold = DEBUG
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
### ERROR and above go to log/error.log in the same directory ###
log4j.appender.E = org.apache.log4j.DailyRollingFileAppender
log4j.appender.E.File = F://webMagic\u722c\u866b/log/error.log
log4j.appender.E.Append = true
log4j.appender.E.Threshold = ERROR
log4j.appender.E.layout = org.apache.log4j.PatternLayout
log4j.appender.E.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
基本配置就这些,由于是第一次尝试,没有考虑存储数据库,仅仅试试是否可行
配置完成后,开始创建class进行爬虫的编写
WebMagic的基本使用方式十分简单:
创建一个类-->这个类实现PageProcessor接口 --> 重写process方法以及getSite方法
process方法:编写爬虫的主要逻辑
getSite方法:不需要写什么东西,把Site对象返回就好(理解可能不太对,但是初步这样写就好)
在这些之外,我们还需要了解一下我们如果想爬取网页相关信息,最少需要掌握或者知道的东西
1.网站url你得知道,并且确定自己的ip是可以访问的
2.不求html大成精通,但是你最起码得知道网站结构,不要给你一个网站你都看不懂html,无从下手
3.要对css了解,因为可能会根据它来筛选,当然,如果从事web开发或者大学毕设涉及到web开发,应该都会一些
4.正则表达式,xpath得了解,因为css筛选的局限性还是有的
现在看下代码,在类的开头,我先初始化了一点数据
// Values extracted from the crawled page, keyed by field name.
private static final HashMap<String, String> rtnMap = new HashMap<String, String>();
// The crawler, driven by this PageProcessor implementation.
private static final Spider spider = Spider.create(new GetCsdn());
// Crawl configuration handed to WebMagic through getSite().
private final Site site = Site.me()
        .setDomain("blog.csdn.net") // must match the site actually being crawled
        .setSleepTime(1131)
        .setCharset("utf-8")
        .setRetryTimes(3) // retry a failed download up to 3 times
        .setTimeOut(10000) // timeout in ms; 1000 was short enough to cause read timeouts
        .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
Spider用于初始化爬虫的相关信息
创建了一个Map用于存储查到的数据
初始化了Site对象,这个Site对象是WebMagic的配置对象
具体的配置如下图
配置完成后,来书写process逻辑方法
// Each XPath targets one element of the CSDN article layout; toString() yields the first match.
String username = page.getHtml().xpath("*[@id=\"mainBox\"]/aside/div[1]/div[1]/div[2]/p/a/text()").toString(); // account name
String pageTitle = page.getHtml().xpath("*[@id=\"mainBox\"]/main/div[1]/div[1]/div[1]/div[1]/h1/text()").toString(); // article title
String time = page.getHtml().xpath("*[@id=\"mainBox\"]/main/div[1]/div[1]/div[1]/div[2]/div[1]/span[2]/text()").toString(); // publish time
String content = page.getHtml().xpath("*[@id=\"mainBox\"]/main/div[1]/article/div[1]/div[1]/span/text()").toString(); // article content
rtnMap.put("username", username);
rtnMap.put("PageTitle", pageTitle);
rtnMap.put("time", time);
// Stored under "content": the old key "title" mislabeled the article body.
rtnMap.put("content", content);
这是根据网页的结构去获取信息,使用xpath,不多赘述
最后写main方法,执行:
public static void main(String[] args) {
    spider
            .thread(5) // keep the thread count under about 10; higher values easily lead to read timeouts
            .addUrl("https://blog.csdn.net/yinbucheng/article/details/71023037")
            .run(); // blocks until the crawl has finished
    // Print everything process() collected (Java 8 lambda).
    rtnMap.forEach((key, value) -> {
        System.out.println("**key的值为" + key);
        System.out.println("VALUE的值为" + value);
    });
}
最后的结果为:
需要注意的是,代码中使用的遍历方法为java8的lambda表达式,若报错请自行更改遍历方法
下面贴上全部代码:
package WebMagicForCSDN;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.HashMap;
import java.util.function.BiConsumer;
/**
 * Demo that extracts several pieces of information from a single page.
 *
 * <p>Uses WebMagic to download one CSDN blog article and reads the account name, article
 * title, publish time and content from it with XPath selectors, storing them in a map.
 * No links are followed and no other pages are processed.
 *
 * <p>Intended as an exercise in XPath selectors.
 */
public class GetCsdn implements PageProcessor {

    /**
     * Extracted values keyed by field name; written by {@link #process(Page)} and printed
     * by {@link #main(String[])}. A plain HashMap is not thread-safe; that is tolerable
     * here only because a single URL is queued and process() adds no further requests.
     */
    private static final HashMap<String, String> rtnMap = new HashMap<String, String>();

    /** The crawler, driven by an instance of this processor. */
    private static final Spider spider = Spider.create(new GetCsdn());

    /** Crawl configuration returned from {@link #getSite()}. */
    private final Site site = Site.me()
            .setDomain("blog.csdn.net") // must match the site actually being crawled
            .setSleepTime(1131)
            .setCharset("utf-8")
            .setRetryTimes(3) // retry a failed download up to 3 times
            .setTimeOut(10000) // timeout in ms; 1000 was short enough to cause read timeouts
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /**
     * Extracts the account name, title, publish time and content from the downloaded
     * article and stores them in the result map.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        rtnMap.put("username",
                firstMatch(page, "*[@id=\"mainBox\"]/aside/div[1]/div[1]/div[2]/p/a/text()"));
        rtnMap.put("PageTitle",
                firstMatch(page, "*[@id=\"mainBox\"]/main/div[1]/div[1]/div[1]/div[1]/h1/text()"));
        rtnMap.put("time",
                firstMatch(page, "*[@id=\"mainBox\"]/main/div[1]/div[1]/div[1]/div[2]/div[1]/span[2]/text()"));
        // Stored under "content": the old key "title" mislabeled the article body.
        rtnMap.put("content",
                firstMatch(page, "*[@id=\"mainBox\"]/main/div[1]/article/div[1]/div[1]/span/text()"));
    }

    /** Evaluates an XPath expression against the page HTML and returns the selection as a string. */
    private static String firstMatch(Page page, String xpath) {
        return page.getHtml().xpath(xpath).toString();
    }

    /**
     * Returns the crawl configuration of this processor.
     *
     * @return the Site object configured for this crawl
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Crawls the sample article and prints every extracted key/value pair.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        spider
                .thread(5) // keep the thread count under about 10; higher values easily lead to read timeouts
                .addUrl("https://blog.csdn.net/yinbucheng/article/details/71023037")
                .run(); // blocks until the crawl has finished
        rtnMap.forEach(new BiConsumer<String, String>() {
            @Override
            public void accept(String key, String value) {
                System.out.println("**key的值为" + key);
                // The value is already supplied; no second map lookup is needed.
                System.out.println("VALUE的值为" + value);
            }
        });
    }
}