SpringBoot 集成爬虫框架WebMagic

任务要求:爬取百度新闻相关信息页面。

核心pom.xml 文件配置如下:

    <parent>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-parent</artifactId>
		<version>2.1.1.RELEASE</version>
	</parent>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
		<java.version>1.8</java.version>
		<mybatis-spring-boot-starter.version>1.3.2</mybatis-spring-boot-starter.version>
		<mysql-connector-java.version>8.0.11</mysql-connector-java.version>
		<com.alibaba.druid.version>1.1.9</com.alibaba.druid.version>
		<commons-lang.version>2.6</commons-lang.version>
		<commons-codec.version>1.10</commons-codec.version>
		<commons-lang3.version>3.8.1</commons-lang3.version>
		<commons-net.version>3.6</commons-net.version>
		<commons-io.version>2.6</commons-io.version>
		<commons-collections.version>3.2.1</commons-collections.version>
		<common-fileupload.version>1.3.1</common-fileupload.version>
		<fastjson.version>1.2.48</fastjson.version>
		<jasperreports.version>6.10.0</jasperreports.version>
	</properties>


	<dependencies>
		<!-- SpringWeb模块 -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
			<!-- 移除springboot 自带日志框架log-back  -->
			<!--
			 <exclusions>
                <exclusion>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-starter-logging</artifactId>
                </exclusion>
            </exclusions>  -->
		</dependency>

		<!--springboot 集成测试框架 -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
			<scope>test</scope>
		</dependency>


		<!--lombok插件 -->
		<dependency>
			<groupId>org.projectlombok</groupId>
			<artifactId>lombok</artifactId>
			<version>${lombok.version}</version>
			<scope>provided</scope>
		</dependency>


		<!-- mysql 连接 -->
		<dependency>
			<groupId>org.mybatis.spring.boot</groupId>
			<artifactId>mybatis-spring-boot-starter</artifactId>
			<version>${mybatis-spring-boot-starter.version}</version>
		</dependency>
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>${mysql-connector-java.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>druid-spring-boot-starter</artifactId>
			<version>${com.alibaba.druid.version}</version>
		</dependency>
		<!-- 分页控件 -->
		<dependency>
			<groupId>com.github.pagehelper</groupId>
			<artifactId>pagehelper</artifactId>
			<version>4.1.6</version>
		</dependency>

		<!--common-lang 常用工具包 -->
		<dependency>
			<groupId>commons-lang</groupId>
			<artifactId>commons-lang</artifactId>
			<version>${commons-lang.version}</version>
		</dependency>
		<!--commons-lang3 工具包 -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>${commons-lang3.version}</version>
		</dependency>

		<!--commons-codec 加密工具包 -->
		<dependency>
			<groupId>commons-codec</groupId>
			<artifactId>commons-codec</artifactId>
			<version>${commons-codec.version}</version>
		</dependency>
		<!--commons-net 网络工具包 -->
		<dependency>
			<groupId>commons-net</groupId>
			<artifactId>commons-net</artifactId>
			<version>${commons-net.version}</version>
		</dependency>
		<!--common-io 工具包 -->
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>${commons-io.version}</version>
		</dependency>
		<!--common-collection 工具包 -->
		<dependency>
			<groupId>commons-collections</groupId>
			<artifactId>commons-collections</artifactId>
			<version>${commons-collections.version}</version>
		</dependency>
		<!--common-fileupload 工具包 -->
		<dependency>
			<groupId>commons-fileupload</groupId>
			<artifactId>commons-fileupload</artifactId>
			<version>${common-fileupload.version}</version>
		</dependency>

		<!-- Swagger2 -->
		<dependency>
			<groupId>io.springfox</groupId>
			<artifactId>springfox-swagger2</artifactId>
			<version>2.7.0</version>
		</dependency>
		<dependency>
			<groupId>io.springfox</groupId>
			<artifactId>springfox-swagger-ui</artifactId>
			<version>2.7.0</version>
		</dependency>

		<!-- fastjson -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>${fastjson.version}</version>
		</dependency>

        <!-- 爬虫框架集成 -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-core</artifactId>
			<version>0.7.2</version>
		</dependency>
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-extension</artifactId>
			<version>0.7.2</version>
		</dependency>
	</dependencies>

配置实例化对象:

package com.zzg.reptile.config;

import java.util.HashMap;
import java.util.Map;

import org.springframework.boot.web.servlet.FilterRegistrationBean;
import org.springframework.boot.web.servlet.ServletRegistrationBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import com.alibaba.druid.support.http.StatViewServlet;
import com.alibaba.druid.support.http.WebStatFilter;
/**
 * Druid monitoring configuration.
 *
 * <p>Registers Druid's statistics console servlet and the web-statistics filter
 * as Spring beans.
 *
 * @author zzg
 */
@Configuration
public class DruidConfig {

	/**
	 * Registers Druid's {@link StatViewServlet} under {@code /druid/*}.
	 *
	 * <p>NOTE(review): the console login is hard-coded to {@code admin}/{@code admin}
	 * and the {@code allow}/{@code deny} lists are empty; confirm against the Druid
	 * documentation what an empty list permits and consider externalising the
	 * credentials before exposing this endpoint.
	 *
	 * @return the servlet registration for the Druid console
	 */
	@Bean
	public ServletRegistrationBean<StatViewServlet> druidServletRegistrationBean() {
		ServletRegistrationBean<StatViewServlet> registration =
				new ServletRegistrationBean<StatViewServlet>();
		registration.setServlet(new StatViewServlet());
		registration.addUrlMappings("/druid/*");
		// Access lists are intentionally left empty here.
		registration.addInitParameter("allow", "");
		registration.addInitParameter("deny", "");
		// Console login credentials.
		registration.addInitParameter("loginUsername", "admin");
		registration.addInitParameter("loginPassword", "admin");
		return registration;
	}

	/**
	 * Registers Druid's {@link WebStatFilter} for all request paths.
	 *
	 * <p>Static resources and the Druid console itself are listed in the
	 * {@code exclusions} init parameter.
	 *
	 * @return the filter registration for Druid web statistics
	 */
	@Bean
	public FilterRegistrationBean<WebStatFilter> duridFilterRegistrationBean() {
		FilterRegistrationBean<WebStatFilter> registration =
				new FilterRegistrationBean<WebStatFilter>();
		registration.setFilter(new WebStatFilter());
		Map<String, String> initParams = new HashMap<String, String>();
		// Requests the filter should ignore.
		initParams.put("exclusions", "*.js,*.gif,*.jpg,*.bmp,*.png,*.css,*.ico,/druid/*");
		registration.setInitParameters(initParams);
		registration.addUrlPatterns("/*");
		return registration;
	}
}
package com.zzg.reptile.config;

import java.util.Properties;

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import com.github.pagehelper.PageHelper;

/**
 * MyBatis configuration.
 *
 * @author zzg
 */
@Configuration
public class MyBatisConfig {

	/**
	 * Creates the PageHelper pagination plugin bean.
	 *
	 * <p>The plugin is configured with {@code offsetAsPageNum},
	 * {@code rowBoundsWithCount} and {@code reasonable} set to {@code "true"},
	 * and the {@code dialect} set to {@code "mysql"}.
	 *
	 * @return the configured {@link PageHelper}
	 */
	@Bean
	public PageHelper pageHelper() {
		Properties settings = new Properties();
		settings.setProperty("dialect", "mysql");
		settings.setProperty("reasonable", "true");
		settings.setProperty("rowBoundsWithCount", "true");
		settings.setProperty("offsetAsPageNum", "true");

		PageHelper helper = new PageHelper();
		helper.setProperties(settings);
		return helper;
	}
}
package com.zzg.reptile.config;

import java.util.ArrayList;
import java.util.List;

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import io.swagger.annotations.ApiOperation;
import springfox.documentation.builders.ApiInfoBuilder;
import springfox.documentation.builders.ParameterBuilder;
import springfox.documentation.builders.PathSelectors;
import springfox.documentation.builders.RequestHandlerSelectors;
import springfox.documentation.schema.ModelRef;
import springfox.documentation.service.ApiInfo;
import springfox.documentation.service.Contact;
import springfox.documentation.service.Parameter;
import springfox.documentation.spi.DocumentationType;
import springfox.documentation.spring.web.plugins.Docket;
import springfox.documentation.swagger2.annotations.EnableSwagger2;

/**
 * Swagger 2 (springfox) configuration.
 */
@Configuration
@EnableSwagger2
public class SwaggerConfig {

	/**
	 * Builds the springfox {@link Docket}.
	 *
	 * <p>Only handler methods annotated with {@link ApiOperation} are documented,
	 * on any path. An optional {@code X-CSRF-TOKEN} header parameter is added to
	 * every operation.
	 *
	 * @return the configured docket
	 */
	@Bean
	public Docket buildDocket() {
		// Optional CSRF token header, declared globally for all operations.
		Parameter csrfHeader = new ParameterBuilder()
				.name("X-CSRF-TOKEN")
				.description("令牌")
				.modelRef(new ModelRef("string"))
				.parameterType("header")
				.required(false)
				.build();

		List<Parameter> globalParameters = new ArrayList<Parameter>();
		globalParameters.add(csrfHeader);

		Docket docket = new Docket(DocumentationType.SWAGGER_2)
				.select()
				.apis(RequestHandlerSelectors.withMethodAnnotation(ApiOperation.class))
				.paths(PathSelectors.any())
				.build();
		return docket.globalOperationParameters(globalParameters).apiInfo(describeApi());
	}

	/**
	 * Assembles the API metadata (title, description, contact, version).
	 *
	 * @return the API info shown by Swagger UI
	 */
	private ApiInfo describeApi() {
		Contact contact = new Contact("baidu", "http://www.baidu.cn/", "zhouzhiwengang@163.com");
		ApiInfoBuilder builder = new ApiInfoBuilder();
		builder.title("****");
		builder.termsOfServiceUrl("http://www.baidu.cn/");
		builder.description("API接口");
		builder.contact(contact);
		builder.version("2.0");
		return builder.build();
	}
}

业务逻辑实体对象(model、Mapper、service、serviceImpl)省略。

application.properties 

# Server port
server.port=7090
# Service name (context path)
# server.context-path=/jreport
# MyBatis XML mapper file configuration
mybatis.mapper-locations=classpath*:mapper/reptile/*Mapper.xml
mybatis.type-aliases-package=com.zzg.reptile.domain
# MyBatis / MySQL 8 data source configuration
spring.datasource.url=jdbc:mysql://192.168.1.73:3306/boot-security?serverTimezone=UTC&useSSL=false&allowPublicKeyRetrieval=true&allowMultiQueries=true&nullCatalogMeansCurrent=true
spring.datasource.username=root
spring.datasource.password=digipower
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# Druid configuration
# Number of physical connections created at initialization
spring.datasource.druid.initial-size=5
# Maximum number of connections in the pool
spring.datasource.druid.max-active=30
# Minimum number of connections in the pool
spring.datasource.druid.min-idle=5
# Maximum wait time when acquiring a connection, in milliseconds
spring.datasource.druid.max-wait=60000
# Interval between checks for idle connections that should be closed, in milliseconds
spring.datasource.druid.time-between-eviction-runs-millis=60000
# Minimum time a connection may stay idle without being evicted
spring.datasource.druid.min-evictable-idle-time-millis=300000
# SQL used to check whether a connection is valid; must be a query statement
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
# Recommended to be true: no performance impact and keeps connections safe. On borrow, if the idle time exceeds timeBetweenEvictionRunsMillis, validationQuery is run to check the connection.
spring.datasource.druid.test-while-idle=true
# Run validationQuery when a connection is borrowed; enabling this lowers performance.
spring.datasource.druid.test-on-borrow=false
# Run validationQuery when a connection is returned; enabling this lowers performance.
spring.datasource.druid.test-on-return=false
# Whether to cache preparedStatements (PSCache). PSCache greatly helps databases that support cursors, such as Oracle; disabling it is recommended for MySQL.
# NOTE(review): the value below enables it even though this file targets MySQL - confirm this is intended.
spring.datasource.druid.pool-prepared-statements=true
# To enable PSCache this must be greater than 0; when it is, poolPreparedStatements is automatically switched to true.
spring.datasource.druid.max-pool-prepared-statement-per-connection-size=50
# Filters for monitoring statistics; without them the monitoring console cannot collect SQL statistics
#spring.datasource.druid.filters=stat,wall
# Enable mergeSql and slow-SQL recording through connectProperties
spring.datasource.druid.connection-properties=druid.stat.mergeSql=true;druid.stat.slowSqlMillis=500
# Merge the monitoring data of multiple DruidDataSource instances
spring.datasource.druid.use-global-data-source-stat=true
# Druid filters in effect (only the stat filter is enabled here)
spring.datasource.druid.filters=stat
# Logging configuration file
logging.config=classpath:logback.xml

爬虫核心业务组件编写:

package com.zzg.reptile.component;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.springframework.stereotype.Component;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic {@link PageProcessor} for the Baidu News "domestic focus" hot-news list.
 *
 * <p>For each page passed to {@link #process(Page)}, the nodes selected under
 * {@code div.hotnews} are converted to JSON objects with {@code url} and
 * {@code title} keys and appended to the array returned by {@link #getArray()}.
 *
 * <p>Results accumulate in this instance and are never cleared by the processor
 * itself; a caller that reuses the same instance for several crawls must reset it
 * with {@link #setArray(JSONArray)} first.
 *
 * @author zzg
 */
@Component
public class BaiduNewPageProcessor implements PageProcessor {

	/** Matches node markup that starts with {@code <i}; such nodes are skipped. */
	private static final Pattern ICON_NODE = Pattern.compile("^<i");

	/** Matches node markup that contains a {@code <b>} tag. */
	private static final Pattern BOLD_NODE = Pattern.compile(".*(<b>).*");

	// Parsed results, one JSON object per extracted news entry.
	private JSONArray array = new JSONArray();

	/**
	 * @return the array holding every entry extracted so far
	 */
	public JSONArray getArray() {
		return array;
	}

	/**
	 * Replaces the result array, e.g. to start a new crawl with an empty one.
	 *
	 * @param array the array that subsequent {@link #process(Page)} calls append to
	 */
	public void setArray(JSONArray array) {
		this.array = array;
	}

	// Site settings
	String domain ="news.baidu.com";
	Integer sleepTime = 1000;
	Integer retryTime = 30;
	String charset ="utf-8";
	Integer timeOut = 30000;
	String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36";

	// Site instance built from the settings above (must be declared after them).
	private Site site = Site.me().setDomain(domain).setSleepTime(sleepTime)
            .setRetryTimes(retryTime).setCharset(charset).setTimeOut(timeOut).setUserAgent(userAgent);

	/**
	 * Extracts the hot-news entries from the page and appends them to the result array.
	 *
	 * <p>Nodes whose markup starts with {@code <i} are skipped. The previous
	 * implementation removed them from the list inside an index loop, which
	 * skipped the element following each removed node; filtering in a single
	 * pass checks every node.
	 *
	 * @param page the downloaded page supplied by WebMagic
	 */
	@Override
	public void process(Page page) {
		Html html = page.getHtml();
		if (html == null) {
			return;
		}

		List<Selectable> selectables = html.xpath("//div[@class='hotnews']/ul/li/strong/").nodes();
		for (Selectable selectable : selectables) {
			String markup = selectable.toString();
			// Skip nodes without markup and nodes that start with an <i> tag.
			if (markup == null || ICON_NODE.matcher(markup).find()) {
				continue;
			}

			// One JSON object per news entry.
			JSONObject jsonObject = new JSONObject();

			String url = selectable.links().toString();
			System.out.println("url is:" + url);
			jsonObject.put("url", url);

			// The title text is nested in <b> when the markup contains one.
			String titlePath = BOLD_NODE.matcher(markup).find() ? "/a/b/text()" : "a/text()";
			jsonObject.put("title", selectable.xpath(titlePath).get());

			array.add(jsonObject);
		}
	}

	/**
	 * @return the site settings used for this crawl
	 */
	@Override
	public Site getSite() {
		return site;
	}

}

爬虫功能测试:

package com.zzg.reptile.controller;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;

import com.alibaba.fastjson.JSONArray;
import com.zzg.jreport.response.JreportResponse;
import com.zzg.reptile.component.BaiduNewPageProcessor;

import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import us.codecraft.webmagic.Spider;

/**
 * REST endpoint that triggers the Baidu News crawl and returns the extracted entries.
 */
@Controller
@RequestMapping("/api/webmagic")
@Api(value = "爬虫Controlle", tags = "爬虫操作服务")
public class WebmagicController {
	@Autowired
	private BaiduNewPageProcessor processor;

	/**
	 * Crawls the Baidu News page and returns the entries collected by the processor.
	 *
	 * <p>The injected processor keeps its results in a field, so the result array
	 * is replaced with an empty one before each crawl; otherwise every call would
	 * also return the entries gathered by all previous calls.
	 *
	 * <p>NOTE(review): the processor instance is still shared between requests, so
	 * concurrent calls to this endpoint can interleave their results.
	 *
	 * @return the response wrapping the JSON array of {@code url}/{@code title} entries
	 */
	@ApiOperation(httpMethod = "POST", value = "新闻信息爬取")
	@RequestMapping(value = "/news", method = { RequestMethod.POST }, produces = "application/json;charset=UTF-8")
	@ResponseBody
	public JreportResponse news() {
		String url = "https://news.baidu.com/?cmd=1&class=civilnews&tn=rss&sub=0";

		// Start from an empty result set so earlier crawls do not leak into this response.
		processor.setArray(new JSONArray());
		// run() executes the crawl in the calling thread and returns when it is finished.
		Spider.create(processor).addUrl(url).run();

		JSONArray array = processor.getArray();
		return JreportResponse.ok(array);
	}

}

项目结构:

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值