写这个的目的是为了爬歌词,因为喜欢听歌,遇到喜欢的歌就喜欢把歌词下载下来。
WebMagic 教程地址
http://webmagic.io/docs/zh/posts/ch1-overview/
使用 IDEA 创建 maven工程
下面为工程目录结构
下面为源代码
package bean;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.sql.Timestamp;
import java.util.Date;
import java.util.List;
/**
 * Page model for a song page on kuwo.cn. The {@link ExtractBy} expressions on
 * the fields describe where each value is located in a page whose URL matches
 * the {@link TargetUrl} pattern.
 *
 * <p>The bean is also used as the parameter object of the DAO insert, so the
 * getters {@link #getLyric()} and {@link #getRecordTime()} deliberately compute
 * a usable value even when the corresponding setter was never called.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:12:34
 */
@TargetUrl("http://www.kuwo.cn/yinyue/*")
public class KuWoMusic {

    /**
     * Song title.
     */
    // Alternative CSS selector: @ExtractBy(value="div.tit em.f-ff2",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@id='lrcName']/text()")
    private String name;

    /**
     * Singer.
     */
    // Alternative CSS selector: @ExtractBy(value="p.des span a",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='artist']/span/a/text()")
    private String singer;

    /**
     * Lyric lines, one entry per matched {@code <p class="lrcItem">} element.
     */
    // Alternative CSS selector: @ExtractBy(value="div.mCSB_container p",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='lrcItem']")
    private List<String> lyrics;

    /**
     * Explicitly assigned lyric text; only used when no lyric lines are present.
     */
    private String lyric;

    /**
     * Album the song belongs to.
     */
    // Alternative CSS selector: @ExtractBy(value="p.des a",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='album']/span/a/text()")
    private String album;

    /**
     * Explicitly assigned record time; {@code null} means "use the current time".
     */
    private Timestamp recordTime;

    /*
     * Whole page body (disabled):
     */
    // @ExtractBy(value="body",type = ExtractBy.Type.Css)
    // private String body;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSinger() {
        return singer;
    }

    public void setSinger(String singer) {
        this.singer = singer;
    }

    public List<String> getLyrics() {
        return lyrics;
    }

    public void setLyrics(List<String> lyrics) {
        this.lyrics = lyrics;
    }

    /**
     * Returns the lyric text.
     *
     * @return all entries of {@code lyrics} concatenated without a separator;
     *         when {@code lyrics} is {@code null}, the value passed to
     *         {@link #setLyric(String)} (possibly {@code null})
     */
    public String getLyric() {
        if (lyrics == null) {
            // Nothing was extracted: fall back to the explicit value instead of throwing.
            return lyric;
        }
        StringBuilder joined = new StringBuilder();
        for (String line : lyrics) {
            joined.append(line);
        }
        return joined.toString();
    }

    /**
     * Stores an explicit lyric text. It is returned by {@link #getLyric()} only
     * while no lyric lines are set; extracted lines keep precedence.
     *
     * @param lyric the lyric text, may be {@code null}
     */
    public void setLyric(String lyric) {
        this.lyric = lyric;
    }

    public String getAlbum() {
        return album;
    }

    public void setAlbum(String album) {
        this.album = album;
    }

    /**
     * Returns the record time.
     *
     * @return the value passed to {@link #setRecordTime(Timestamp)}, or the
     *         current time when none was set
     */
    public Timestamp getRecordTime() {
        if (recordTime != null) {
            return recordTime;
        }
        return new Timestamp(System.currentTimeMillis());
    }

    public void setRecordTime(Timestamp recordTime) {
        this.recordTime = recordTime;
    }

    @Override
    public String toString() {
        // Use the getter so the extracted lyric lines are shown, not the usually-unset field.
        return "[name:" + name + ",singer=" + singer + ",album=" + album + ",lyric=" + getLyric() + "]";
    }
}
package dao;
import bean.KuWoMusic;
import org.apache.ibatis.annotations.Insert;
/**
 * Mapper interface for persisting crawled {@link KuWoMusic} records into the
 * {@code lyric} table.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:37:57
 */
public interface KuWoMusicDao {

    /**
     * Inserts one row into {@code lyric}. The {@code #{name}}, {@code #{lyric}},
     * {@code #{singer}}, {@code #{album}} and {@code #{recordTime}} placeholders
     * are taken from the given bean; {@code source}, {@code recorder} and
     * {@code curStatus} are fixed literals in the statement.
     *
     * @param kuWoMusic the song to store
     * @return the result of the insert statement (presumably the affected row
     *         count, per MyBatis convention)
     */
    @Insert("insert into lyric "
            + "(`title`,`content`,`source`,`singer`,`album`,`recorder`,`recordTime`,`curStatus`) "
            + "values "
            + "(#{name},#{lyric},'酷我',#{singer},#{album},'admin',#{recordTime},'2')")
    int add(KuWoMusic kuWoMusic);
}
package dao.pipeline;
import dao.KuWoMusicDao;
import bean.KuWoMusic;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
/**
 * Pipeline that prints every extracted {@link KuWoMusic} to standard output and
 * stores it through {@link KuWoMusicDao}.
 *
 * <p>Each instance loads its own Spring context from {@code root-context.xml}
 * on the classpath and looks up the {@code kuWoMusicDao} bean from it, so the
 * class can be instantiated with {@code new} outside of a Spring container.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:42:41
 */
@Component("KuWoMusicDaoPipeline")
public class KuWoMusicDaoPipeline implements PageModelPipeline<KuWoMusic> {

    /** Spring context loaded from {@code root-context.xml}. */
    final ApplicationContext context;

    /** DAO obtained from {@link #context}; used to insert each song. */
    final KuWoMusicDao kuWoMusicDao;

    // Injection alternative (disabled):
    // @Resource
    // private KuWoMusicDao kuWoMusicDao;

    /**
     * Loads the Spring context and resolves the DAO bean.
     */
    public KuWoMusicDaoPipeline() {
        ClassPathXmlApplicationContext xmlContext = new ClassPathXmlApplicationContext("root-context.xml");
        // Nobody closes this context explicitly; the shutdown hook makes sure bean
        // destroy callbacks (e.g. the data source's close) run when the JVM exits.
        xmlContext.registerShutdownHook();
        this.context = xmlContext;
        // Typed lookup instead of an unchecked cast.
        this.kuWoMusicDao = xmlContext.getBean("kuWoMusicDao", KuWoMusicDao.class);
    }

    /**
     * Prints the song and inserts it via the DAO.
     *
     * @param kuWoMusic the extracted song
     * @param task      the task that produced the song (unused)
     */
    @Override
    public void process(KuWoMusic kuWoMusic, Task task) {
        // Print the lyric information.
        System.out.println(kuWoMusic.toString());
        kuWoMusicDao.add(kuWoMusic);
    }
}
package execute;
import dao.pipeline.KuWoMusicDaoPipeline;
import bean.KuWoMusic;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
/**
 * Entry point that starts the lyric crawler.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:23:43
 */
public class LyricCrawlerExecutor {

    /** Page the crawl starts from. */
    private static final String SEED_URL = "http://www.kuwo.cn/yinyue/492211?catalog=yueku2016";

    /** Number of crawler threads. */
    private static final int THREAD_COUNT = 2;

    /**
     * Runs a spider over {@link KuWoMusic} pages with two threads and hands
     * every extracted song to a {@link KuWoMusicDaoPipeline}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Save to the database.
        Site site = Site.me();
        KuWoMusicDaoPipeline databasePipeline = new KuWoMusicDaoPipeline();
        OOSpider.create(site, databasePipeline, KuWoMusic.class)
                .addUrl(SEED_URL)
                .thread(THREAD_COUNT)
                .run();

        // Print to the console instead:
        // OOSpider.create(
        //         Site.me(),
        //         new ConsolePageModelPipeline(), KuWoMusic.class)
        //         .addUrl("http://www.kuwo.cn/yinyue/492211?catalog=yueku2016")
        //         .thread(2)
        //         .run();

        // Test fetching the bean:
        // ApplicationContext context = new ClassPathXmlApplicationContext("root-context.xml");
        // KuWoMusicDao kuWoMusicDao = (KuWoMusicDao)context.getBean("kuWoMusicDao");
        //
        // // package name (or the package's full path) / configuration file name (i.e. the xml file)
        // ClassPathXmlApplicationContext cpx=new ClassPathXmlApplicationContext ("root-context.xml");
        //
        // System.out.println(cpx.getBean("kuWoMusicDao"));
    }
}
# Root logger: level INFO, output to the "stdout" appender defined below.
# The value syntax is "<level>, <appender>, <appender>, ..."; only the first
# token is a level, so a second level name here would be read as the name of
# an appender that is never defined.
log4j.rootLogger=INFO,stdout
# Console appender with a "date level [logger] -message" line layout.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] -%m%n
# Optional per-logger debug settings (disabled).
#log4j.logger.com.ibatis=debug
#log4j.logger.com.ibatis.common.jdbc.SimpleDataSource=debug
#log4j.logger.com.ibatis.common.jdbc.ScriptRunner=debug
#log4j.logger.com.ibatis.sqlmap.engine.impl.SqlMapClientDelegate=debug
#log4j.logger.java.sql.Connection=debug
#log4j.logger.java.sql.Statement=debug
#log4j.logger.java.sql.PreparedStatement=debug,stdout
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:tx="http://www.springframework.org/schema/tx"
xmlns:task="http://www.springframework.org/schema/task"
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.2.xsd
http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.2.xsd
http://www.springframework.org/schema/task http://www.springframework.org/schema/task/spring-task.xsd ">
<!-- Spring context for the crawler: data source, MyBatis session factory, transaction manager and the DAO mapper bean. -->
<!-- Configure the DataSource. The url, username and password values below are placeholders to be filled in for the target database. -->
<bean id="dataSource" class="org.apache.commons.dbcp.BasicDataSource" destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url" value="jdbc:mysql://localhost:3306/dbname?characterEncoding=utf-8" />
<property name="username" value="" />
<property name="password" value="" />
<property name="maxActive" value="5" />
<property name="maxIdle" value="3" />
<property name="maxWait" value="1000" />
<property name="defaultAutoCommit" value="true" />
<property name="removeAbandoned" value="true" />
<property name="removeAbandonedTimeout" value="60" />
</bean>
<!-- Create the SqlSessionFactory and point it at the data source -->
<bean id="sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
<property name="dataSource" ref="dataSource" />
</bean>
<!-- Configure Spring's transaction manager -->
<bean id="transactionManager"
class="org.springframework.jdbc.datasource.DataSourceTransactionManager">
<property name="dataSource" ref="dataSource" />
</bean>
<!-- Mapper bean for the dao.KuWoMusicDao interface; looked up by the id "kuWoMusicDao" -->
<bean id="kuWoMusicDao" class="org.mybatis.spring.mapper.MapperFactoryBean">
<property name="mapperInterface" value="dao.KuWoMusicDao" />
<property name="sqlSessionFactory" ref="sqlSessionFactory" />
</bean>
<!-- Enable annotation-driven transactions using the transaction manager above -->
<tx:annotation-driven transaction-manager="transactionManager" />
<!-- Recognise the @Scheduled annotation and set the scheduler pool size to 5 -->
<task:annotation-driven scheduler="qbScheduler" mode="proxy" />
<task:scheduler id="qbScheduler" pool-size="5" />
</beans>
pom.xml文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build for the lyric crawler: WebMagic, Spring, MyBatis, MySQL driver and commons-dbcp. -->
<modelVersion>4.0.0</modelVersion>
<groupId>lyric.crawler</groupId>
<artifactId>lyric-crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- Spring version -->
<spring.version>4.2.0.RELEASE</spring.version>
<!-- MyBatis version -->
<mybatis.version>3.3.0</mybatis.version>
<!-- MySQL connector version -->
<mysql.version>5.1.29</mysql.version>
</properties>
<dependencies>
<!-- WebMagic crawler framework (core and extension modules) -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
</dependency>
<!-- Spring core packages -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-core</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-oxm</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-tx</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-jdbc</artifactId>
<version>${spring.version}</version>
</dependency>
<!-- MyBatis core package -->
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis</artifactId>
<version>${mybatis.version}</version>
</dependency>
<!-- MyBatis-Spring integration package -->
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis-spring</artifactId>
<version>1.2.3</version>
</dependency>
<!-- MySQL JDBC driver jar -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.version}</version>
</dependency>
<!-- Commons components (DBCP connection pool) -->
<dependency>
<groupId>commons-dbcp</groupId>
<artifactId>commons-dbcp</artifactId>
<version>1.4</version>
</dependency>
</dependencies>
<build>
<finalName>lyriccrawler</finalName>
<!-- Copy the listed non-Java files found under src/main/java onto the classpath. -->
<!-- NOTE(review): an explicit <resources> section replaces Maven's default src/main/resources entry; confirm that root-context.xml and log4j.properties are located under src/main/java, otherwise they will not be packaged. -->
<resources>
<resource>
<directory>src/main/java</directory>
<includes>
<include>*.xml</include>
<include>*.properties</include>
<include>*.tld</include>
<include>*.txt</include>
<include>*.cfg</include>
<include>**/**/**/*.xml</include>
<include>**/**/**/**/*.xml</include>
</includes>
</resource>
</resources>
</build>
</project>
工程源代码下载地址
https://github.com/airujingye/lyriccrawler