爬虫:url+regex+stream
准备工作:
(1)创建spider线程、MySQL数据库
(2)创建表;
create table pages(id int primary key auto_increment,url varchar(1024),loc varchar(200),keywords varchar(200))
项目总体组成:
APP执行类:
package fly.domain;
public class APP {
public static void main(String[] args) {
new spider().start();
new spider().start();
new spider().start();
new spider().start();
new spider().start();
}
}
downloader下载类
package fly.domain;
import fly.service.IPagesService;
import fly.util.RegexUtil;
import fly.util.pagesUtil;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Page downloader. Modeled as a single shared resource (like a printer),
 * so it is implemented as a lazy singleton shared by all spider threads.
 */
public class downloader {

    private static downloader instance;

    // Directory where raw HTML pages are written. TODO: make configurable.
    private String pageDir = "D:\\workforce\\fly\\src\\main\\file\\";

    /**
     * Lazy singleton accessor. Synchronized because several spider threads
     * call it concurrently at startup; the original unsynchronized
     * check-then-create could construct more than one instance.
     */
    public static synchronized downloader getInstance() {
        if (instance == null) {
            instance = new downloader();
        }
        return instance;
    }

    private downloader() {}

    /**
     * Downloads one page, saves the HTML to disk, records it in the
     * database, and queues every extracted link that has not been
     * crawled yet.
     *
     * @param url page address; ignored when null
     */
    public void downloadPage(String url) {
        if (url == null) {
            return; // nothing to download
        }
        try {
            // 1. Fetch the page and extract every <a href="..."> target.
            String pageContent = doDownload(url);
            if (pageContent == null) {
                return; // download failed; the error was already logged
            }
            Set<String> hrefs = RegexUtil.parsePage(url, pageContent);

            // 2. Persist the raw HTML; the DB row stores the local path.
            String locPath = pageDir + System.currentTimeMillis() + ".html";
            savePageToDisk(locPath, pageContent);

            // NOTE(review): pagesUtil is a Spring @Service but is created
            // with `new`, so its @Autowired pagesService field is null at
            // runtime — it should be obtained from the Spring context.
            pagesUtil pagesUtil = new pagesUtil();
            Pages pages = new Pages();
            pages.setUrl(url);
            pages.setLoc(locPath);
            pagesUtil.insert(pages);

            // 3. Queue links that are not in the database yet.
            processHrefs(hrefs);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds every not-yet-crawled URL to the shared download queue.
     * Note: pagesUtil.exist() has inverted semantics — it returns true
     * when the URL is NOT yet stored in the database.
     */
    private void processHrefs(Set<String> hrefs) throws InterruptedException {
        pagesUtil pagesUtil = new pagesUtil();
        for (String href : hrefs) {
            if (href != null && pagesUtil.exist(href)) {
                PageQueue.getInstance().addUrl(href);
            }
        }
    }

    /**
     * Writes the page content to the given local path as UTF-8.
     * try-with-resources guarantees the stream is closed even on failure.
     *
     * @param locPath     destination file path
     * @param pageContent HTML text to write
     */
    private void savePageToDisk(String locPath, String pageContent) {
        try (FileOutputStream fos = new FileOutputStream(locPath)) {
            fos.write(pageContent.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the whole page body over HTTP(S).
     *
     * @param url page address
     * @return page text decoded as UTF-8, or null on any failure
     */
    private String doDownload(String url) {
        try {
            URL u = new URL(url);
            HttpURLConnection conn = (HttpURLConnection) u.openConnection();
            // Bounded timeouts so a dead server cannot hang a spider thread.
            conn.setConnectTimeout(10_000);
            conn.setReadTimeout(10_000);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            try (InputStream in = conn.getInputStream()) {
                byte[] buf = new byte[4096];
                int len;
                while ((len = in.read(buf)) != -1) {
                    baos.write(buf, 0, len);
                }
            }
            // assumes pages are UTF-8 encoded — TODO honor Content-Type charset
            return new String(baos.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
PageQueue队列类
package fly.domain;
import java.util.LinkedList;
/**
 * Bounded, thread-safe FIFO of URLs waiting to be crawled.
 * Lazy singleton shared by all spider threads; seeded with one start URL.
 */
public class PageQueue {

    private static PageQueue instance;

    /**
     * Synchronized: multiple spider threads race here at startup; the
     * original unsynchronized check-then-create could build two queues.
     */
    public static synchronized PageQueue getInstance() {
        if (instance == null) {
            instance = new PageQueue();
        }
        return instance;
    }

    private PageQueue() {
        // Seed URL — crawling starts from here.
        queue.add("https://www.csdn.net/");
    }

    /** Maximum number of queued URLs before producers block. */
    private static final int MAX = 100000;

    private final LinkedList<String> queue = new LinkedList<>();

    /**
     * Appends a URL, blocking while the queue is full.
     * Was `size() > MAX`, which allowed one element over the bound.
     *
     * @throws InterruptedException if interrupted while waiting for space
     */
    public synchronized void addUrl(String url) throws InterruptedException {
        while (queue.size() >= MAX) {
            wait();
        }
        queue.add(url);
        notifyAll(); // wake consumers blocked in takeFirst()
    }

    /**
     * Removes and returns the oldest URL, blocking while the queue is empty.
     *
     * @throws InterruptedException if interrupted while waiting for a URL
     */
    public synchronized String takeFirst() throws InterruptedException {
        while (queue.isEmpty()) {
            wait();
        }
        String url = queue.removeFirst();
        notifyAll(); // wake producers blocked in addUrl()
        return url;
    }
}
Pages页面信息类
package fly.domain;
import lombok.Data;
import lombok.Getter;
import lombok.Setter;
@Setter@Getter@Data
public class Pages {
private Integer id;
private String url;
private String loc;
private String keywords;
}
spider爬虫类
package fly.domain;
/**
 * Download worker thread: repeatedly takes a URL from the shared queue
 * and hands it to the downloader. Stops cleanly when interrupted.
 */
public class spider extends Thread {
    @Override
    public void run() {
        while (!isInterrupted()) {
            try {
                // Blocks until a URL becomes available.
                String url = PageQueue.getInstance().takeFirst();
                if (url != null) {
                    downloader.getInstance().downloadPage(url);
                }
            } catch (InterruptedException e) {
                // Restore the interrupt flag and exit; the original
                // blanket catch swallowed this, making the thread unstoppable.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // One bad page must not kill the worker.
                e.printStackTrace();
            }
        }
    }
}
User用户类
package fly.domain;
import java.util.Date;
/**
 * Row model for the `user` table. String setters normalize their input by
 * trimming surrounding whitespace while passing null through unchanged.
 */
public class User {

    private Long id;
    private String password;
    private String phone;
    private String sex;
    private String birthday;
    private Date createtime;

    /** Trims a value, propagating null unchanged. */
    private static String trimmed(String value) {
        return value == null ? null : value.trim();
    }

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = trimmed(password);
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = trimmed(phone);
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = trimmed(sex);
    }

    public String getBirthday() {
        return birthday;
    }

    public void setBirthday(String birthday) {
        this.birthday = trimmed(birthday);
    }

    public Date getCreatetime() {
        return createtime;
    }

    public void setCreatetime(Date createtime) {
        this.createtime = createtime;
    }
}
PagesMapper页面信息接口
package fly.mapper;
import fly.domain.Pages;
import java.util.List;
/**
 * MyBatis mapper for the `pages` table; SQL lives in PagesMapper.xml.
 */
public interface PagesMapper {
// Deletes one row by primary key; returns the number of rows removed.
int deleteByPrimaryKey(Integer id);
// Inserts one page record; returns the number of rows inserted.
int insert(Pages record);
// Loads one page by primary key (null when absent).
Pages selectByPrimaryKey(Integer id);
// Loads every page record.
List<Pages> selectAll();
// Updates one row by primary key; returns the number of rows changed.
int updateByPrimaryKey(Pages record);
// Counts rows with the given url (0 means the page was never crawled).
int selectCountByUrl(String url);
}
UserMapper用户接口
package fly.mapper;
import fly.domain.User;
import java.util.List;
/**
 * MyBatis mapper for the `user` table; SQL lives in UserMapper.xml.
 */
public interface UserMapper {
// Deletes one row by primary key; returns the number of rows removed.
int deleteByPrimaryKey(Long id);
// Inserts one user record; returns the number of rows inserted.
int insert(User record);
// Loads one user by primary key (null when absent).
User selectByPrimaryKey(Long id);
// Loads every user record.
List<User> selectAll();
// Updates one row by primary key; returns the number of rows changed.
int updateByPrimaryKey(User record);
}
IPagesService接口层
package fly.service;
import fly.domain.Pages;
import java.util.List;
/**
 * Service-layer contract over PagesMapper for crawled-page records.
 */
public interface IPagesService {
// Deletes one row by primary key; returns the number of rows removed.
int deleteByPrimaryKey(Integer id);
// Inserts one page record; returns the number of rows inserted.
int insert(Pages record);
// Loads one page by primary key (null when absent).
Pages selectByPrimaryKey(Integer id);
// Loads every page record.
List<Pages> selectAll();
// Updates one row by primary key; returns the number of rows changed.
int updateByPrimaryKey(Pages record);
// Presumably true when the url is already stored (selectCountByUrl > 0) —
// TODO confirm against the implementation; note that pagesUtil.exist()
// negates this result.
boolean exist(String url);
}
IUserService用户接口层
package fly.service;
import fly.domain.User;
import java.util.List;
/**
 * Service-layer contract over UserMapper for user records.
 */
public interface IUserService {
// Deletes one row by primary key; returns the number of rows removed.
int deleteByPrimaryKey(Long id);
// Inserts one user record; returns the number of rows inserted.
int insert(User record);
// Loads one user by primary key (null when absent).
User selectByPrimaryKey(Long id);
// Loads every user record.
List<User> selectAll();
// Updates one row by primary key; returns the number of rows changed.
int updateByPrimaryKey(User record);
}
pagesUtil工具类
package fly.util;
import fly.domain.Pages;
import fly.service.IPagesService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Service;
/**
 * Thin wrapper around IPagesService used by the crawler.
 *
 * NOTE(review): downloader instantiates this class with `new`, which
 * bypasses Spring injection and leaves {@code pagesService} null at
 * runtime — obtain this bean from the Spring context instead.
 */
@Service
public class pagesUtil {

    @Autowired
    private IPagesService pagesService;

    /**
     * Returns true when the URL is NOT yet stored, i.e. it still needs to
     * be crawled. The name is misleading, but callers (downloader)
     * depend on this inverted meaning, so it is kept for compatibility.
     */
    public boolean exist(String url) {
        return !pagesService.exist(url);
    }

    /** Persists one crawled-page record. */
    public void insert(Pages page) {
        pagesService.insert(page);
    }
}
RegexUtil正则工具类
package fly.util;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regex helpers for extracting hyperlinks from downloaded HTML.
 * Patterns are compiled once and reused instead of per call.
 */
public class RegexUtil {

    /** Matches <a href="..."> and captures the link target in group 1. */
    private static final Pattern HREF_PATTERN =
            Pattern.compile("<a\\s*href=\"([\u0000-\uffff&&[^\u005c\u0022]]*)\"");

    /** Matches the scheme+host prefix of an absolute http(s) URL. */
    private static final Pattern DOMAIN_PATTERN =
            Pattern.compile("https?://[^/]+");

    /**
     * Extracts every hyperlink target from a page.
     * Absolute http/https links are kept as-is (the original dropped
     * https links entirely); root-relative links ("/path") are resolved
     * against the source page's domain.
     *
     * @param srcUrl address the page was downloaded from
     * @param page   page HTML
     * @return unique set of absolute link targets
     */
    public static Set<String> parsePage(String srcUrl, String page) {
        Set<String> urls = new HashSet<>();
        Matcher m = HREF_PATTERN.matcher(page);
        while (m.find()) {
            String href = m.group(1);
            if (href.startsWith("http://") || href.startsWith("https://")) {
                urls.add(href);
            } else if (href.startsWith("/")) {
                String domain = parseDomainname(srcUrl);
                // Guard: the original concatenated the literal "null" when
                // the domain could not be determined.
                if (domain != null) {
                    urls.add(domain + href);
                }
            }
        }
        return urls;
    }

    /**
     * Extracts the scheme+host part of a URL, without a trailing slash
     * (e.g. "http://www.runoob.com/123/kk" -> "http://www.runoob.com").
     * Inputs containing no '/' are returned unchanged, matching the
     * original behavior.
     *
     * @return the domain prefix, or null when the URL is not http(s)
     */
    public static String parseDomainname(String srcUrl) {
        if (!srcUrl.contains("/")) {
            return srcUrl;
        }
        Matcher m = DOMAIN_PATTERN.matcher(srcUrl);
        if (m.find()) {
            return m.group();
        }
        return null;
    }
}
TimeTask时间工具
package fly.util;
import fly.domain.User;
import fly.service.IUserService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
@Component
public class TimeTask {

    @Autowired
    private IUserService userService;

    /**
     * Runs every day at 23:30 and prints a reminder for each user whose
     * birthday is 1-2 days away within the current month.
     *
     * NOTE(review): assumes User.birthday is formatted
     * "yyyy-MM-dd HH:mm:ss" — any other format makes parse() throw and
     * aborts the whole run. Confirm against the stored data.
     */
    @Scheduled(cron = "0 30 23 ? * * ") // every day at 23:30
    public void updateUserActionEndTime() throws IOException, ParseException {
        // Today's components and the formatter are loop-invariant — the
        // original rebuilt them for every user.
        Calendar now = Calendar.getInstance();
        now.setTime(new Date());
        int year = now.get(Calendar.YEAR);
        int today = now.get(Calendar.DATE);
        int month = now.get(Calendar.MONTH) + 1; // Calendar months are 0-based

        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        List<User> list = userService.selectAll();
        for (User user : list) {
            if (user.getBirthday() == null) {
                continue; // no birthday on record — original threw NPE here
            }
            Date parsed = format.parse(user.getBirthday());
            // Read month/day via Calendar instead of the deprecated
            // Date.getMonth()/getDate().
            Calendar birth = Calendar.getInstance();
            birth.setTime(parsed);
            int birthMonth = birth.get(Calendar.MONTH) + 1;
            int birthDay = birth.get(Calendar.DATE);
            // Remind when the birthday is 1-2 days ahead in this month.
            if (birthMonth == month && birthDay - today < 3 && birthDay - today > 0) {
                System.out.println(user.getPhone() + "在这三天将要生日,具体时间为" + year
                        + "年" + birthMonth + "月" + birthDay + "日" + "请相关人员做好准备");
            }
        }
    }
}
PagesMapper.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<!-- MyBatis SQL mapper for the `pages` table: one row per crawled page. -->
<mapper namespace="fly.mapper.PagesMapper" >
<!-- Maps result columns onto fly.domain.Pages properties. -->
<resultMap id="BaseResultMap" type="fly.domain.Pages" >
<id column="id" property="id" jdbcType="INTEGER" />
<result column="url" property="url" jdbcType="VARCHAR" />
<result column="loc" property="loc" jdbcType="VARCHAR" />
<result column="keywords" property="keywords" jdbcType="VARCHAR" />
</resultMap>
<delete id="deleteByPrimaryKey" parameterType="java.lang.Integer" >
delete from pages
where id = #{id,jdbcType=INTEGER}
</delete>
<!-- The id column is auto-generated and written back into Pages.id. -->
<insert id="insert" parameterType="fly.domain.Pages" useGeneratedKeys="true" keyProperty="id" >
insert into pages (url, loc, keywords
)
values (#{url,jdbcType=VARCHAR}, #{loc,jdbcType=VARCHAR}, #{keywords,jdbcType=VARCHAR}
)
</insert>
<update id="updateByPrimaryKey" parameterType="fly.domain.Pages" >
update pages
set url = #{url,jdbcType=VARCHAR},
loc = #{loc,jdbcType=VARCHAR},
keywords = #{keywords,jdbcType=VARCHAR}
where id = #{id,jdbcType=INTEGER}
</update>
<select id="selectByPrimaryKey" resultMap="BaseResultMap" parameterType="java.lang.Integer" >
select id, url, loc, keywords
from pages
where id = #{id,jdbcType=INTEGER}
</select>
<select id="selectAll" resultMap="BaseResultMap" >
select id, url, loc, keywords
from pages
</select>
<!-- Counts rows with the given url; used to decide whether the page was already crawled. -->
<select id="selectCountByUrl" resultType="int" >
select count(1)
from pages where url=#{url}
</select>
</mapper>
UserMapper.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<!-- MyBatis SQL mapper for the `user` table. -->
<mapper namespace="fly.mapper.UserMapper" >
<!-- Maps result columns onto fly.domain.User properties. -->
<resultMap id="BaseResultMap" type="fly.domain.User" >
<id column="id" property="id" jdbcType="BIGINT" />
<result column="password" property="password" jdbcType="VARCHAR" />
<result column="phone" property="phone" jdbcType="VARCHAR" />
<result column="sex" property="sex" jdbcType="VARCHAR" />
<result column="birthday" property="birthday" jdbcType="VARCHAR" />
<result column="createTime" property="createtime" jdbcType="TIMESTAMP" />
</resultMap>
<delete id="deleteByPrimaryKey" parameterType="java.lang.Long" >
delete from user
where id = #{id,jdbcType=BIGINT}
</delete>
<!-- The id column is auto-generated and written back into User.id. -->
<insert id="insert" parameterType="fly.domain.User" useGeneratedKeys="true" keyProperty="id" >
insert into user (password, phone, sex,
birthday, createTime)
values (#{password,jdbcType=VARCHAR}, #{phone,jdbcType=VARCHAR}, #{sex,jdbcType=VARCHAR},
#{birthday,jdbcType=VARCHAR}, #{createtime,jdbcType=TIMESTAMP})
</insert>
<update id="updateByPrimaryKey" parameterType="fly.domain.User" >
update user
set password = #{password,jdbcType=VARCHAR},
phone = #{phone,jdbcType=VARCHAR},
sex = #{sex,jdbcType=VARCHAR},
birthday = #{birthday,jdbcType=VARCHAR},
createTime = #{createtime,jdbcType=TIMESTAMP}
where id = #{id,jdbcType=BIGINT}
</update>
<select id="selectByPrimaryKey" resultMap="BaseResultMap" parameterType="java.lang.Long" >
select id, password, phone, sex, birthday, createTime
from user
where id = #{id,jdbcType=BIGINT}
</select>
<select id="selectAll" resultMap="BaseResultMap" >
select id, password, phone, sex, birthday, createTime
from user
</select>
</mapper>
测试类:
spiderTest测试类
package spiderTest;
import fly.domain.Pages;
import fly.domain.downloader;
import fly.service.IPagesService;
import fly.util.RegexUtil;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration("classpath:application.xml")
public class spiderTest {
@Autowired
private IPagesService pagesService;
@Test
public void test(){
Pages pages=new Pages();
pages.setUrl("1");
int j= pagesService.insert(pages);
}
@Test
public void testDownload(){
String url="https://www.runoob.com/";
downloader.getInstance().downloadPage(url);
}
@Test
public void testRegex(){
String str="";
Pattern p=Pattern.compile("<a\\s*href=\"[\u0000-\uffff&&[^\u005c\u0022]]*\"");
Matcher m=p.matcher(str);
while(m.find()){
String s=m.group();
System.out.println(s);
}
}
@Test
public void testDomainname(){
String url="http://www.runoob.com/123/kk/nnx";
// downloader.getInstance().downloadPage(url);
RegexUtil.parseDomainname(url);
}
}
爬取结果如下: