爬虫小练习

爬虫:url+regex+stream

 

准备工作:

(1)创建spider线程、MySQL数据库

(2)创建表;

          create table pages(id int primary key auto_increment,url varchar(1024),loc varchar(200),keywords varchar(200))

 

 

 

项目总体组成:

APP执行类:

package fly.domain;

 

/**
 * Command-line entry point of the crawler.
 */
public class APP {

    /**
     * Launches five {@code spider} worker threads.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final int workerCount = 5;
        for (int started = 0; started < workerCount; started++) {
            spider worker = new spider();
            worker.start();
        }
    }
}

 

 

downloader下载类

package fly.domain;

 

import fly.service.IPagesService;
import fly.util.RegexUtil;
import fly.util.pagesUtil;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

 

/**

* 网页下载程序,爬虫相关知识的下载,应该是只提供一个资源,类似打印机,所以设置为单例设计模式

*/

 

public class downloader {

 

private static downloader instance;

 

private String pageDir="D:\\workforce\\fly\\src\\main\\file\\";

 

public static downloader getInstance(){

if (instance==null){

instance=new downloader();

}

return instance;

}

 

private downloader(){}

 

 

/**

* 准备步骤:

* 1.按照网页地址进行下载:

* 1)判断网页是否下载过(查询数据库)

* 2)如果没有下载过,将url 追加到下载队列(池化模式)

*/

 

/**

*

* @param url 网页连接

*/

public void downloadPage(String url){

try {

//1先下载页面,解析页面

String pageCount= doDownload(url);

 

Set<String> hrefs= RegexUtil.parsePage(url,pageCount);

//保存到磁盘中

String locPath=pageDir+System.currentTimeMillis()+".html";

 

savePageToDisk(locPath,pageCount);

 

 

if(url!=null) {

pagesUtil pagesUtil=new pagesUtil();

Pages pages = new Pages();

pages.setUrl(url);

pages.setLoc(locPath);

pagesUtil.insert(pages);

//pagesService.insert(pages);

}

 

 

//2正则表达式解析所有页面中的<a href="url"

//3判断url是否在数据库中,处理解析出来的URL集合

// if(!pagesService.exist(url)){

// //追加url到队列中

// PageQueue.getInstance().addUrl(url);

// }

 

if(url!=null)

processHrefs(hrefs);

}catch (Exception e){

e.printStackTrace();

}

}

 

private void processHrefs(Set<String> hrefs) throws InterruptedException {

pagesUtil pagesUtil=new pagesUtil();

for (String url : hrefs) {

if(url!=null){

if(pagesUtil.exist(url)){

//追加url到队列中

 

PageQueue.getInstance().addUrl(url);

}

}

}

 

}

 

/**

* 保存页面到磁盘中

* @param locPath

*/

private void savePageToDisk(String locPath,String pageCount) {

try {

FileOutputStream fos=new FileOutputStream(locPath);

 

fos.write(pageCount.getBytes());

fos.close();

}catch (Exception e){

e.printStackTrace();

}

}

 

/**

* 下载整个网页

* @param url

*/

private String doDownload(String url){

try {

URL u=new URL(url);

HttpURLConnection conn=(HttpURLConnection)u.openConnection();//建立链接

InputStream in=conn.getInputStream();

ByteArrayOutputStream baos=new ByteArrayOutputStream();

byte[] buf=new byte[1024];

int len=0;

while((len=in.read(buf))!=-1){

baos.write(buf,0,len);

}

in.close();

baos.close();

//整个网页

String pageStr=new String(baos.toByteArray());

return pageStr;

// //正则表达式解析

// Pattern p=Pattern.compile("<a\\s*href=\"[\u0000-\uffff&&[^\u005c\u0022]]*\"");

// Matcher m=p.matcher(pageStr);

// while(m.find()){

// String s=m.group();

// System.out.println(s);

// }

}catch (Exception e){

e.printStackTrace();

}

return null;

}

}

 

 

PageQueue队列类

package fly.domain;

 

import java.util.LinkedList;

 

/**
 * Bounded, blocking FIFO queue of URLs waiting to be crawled (singleton).
 *
 * <p>Producers block in {@link #addUrl(String)} while the queue is full and
 * consumers block in {@link #takeFirst()} while it is empty. All access is
 * synchronized on the queue instance.
 *
 * <p>NOTE(review): the spider threads both take from and add to this queue, so
 * if every worker blocks in {@code addUrl} on a full queue nothing drains it.
 */
public class PageQueue {

    /** Maximum number of URLs held at once. */
    private static final int MAX = 100000;

    /**
     * Holder for the lazily created instance. The JVM initialises this class
     * exactly once, on first use, so concurrent first calls to
     * {@link #getInstance()} cannot create two queues.
     */
    private static final class Holder {
        private static final PageQueue INSTANCE = new PageQueue();
    }

    /** Pending URLs, oldest first. */
    private final LinkedList<String> queue = new LinkedList<>();

    /**
     * @return the single shared queue
     */
    public static PageQueue getInstance() {
        return Holder.INSTANCE;
    }

    /** Seeds the queue with the crawl start page. */
    private PageQueue() {
        queue.add("https://www.csdn.net/");
    }

    /**
     * Appends a URL, waiting while the queue already holds {@code MAX} entries.
     *
     * @param url URL to enqueue
     * @throws InterruptedException if interrupted while waiting for space
     */
    public synchronized void addUrl(String url) throws InterruptedException {
        // ">=" so that the queue never grows beyond MAX entries.
        while (queue.size() >= MAX) {
            wait();
        }
        queue.add(url);
        notifyAll();
    }

    /**
     * Removes and returns the oldest URL, waiting while the queue is empty.
     *
     * @return the URL at the head of the queue
     * @throws InterruptedException if interrupted while waiting for a URL
     */
    public synchronized String takeFirst() throws InterruptedException {
        while (queue.isEmpty()) {
            wait();
        }
        String url = queue.removeFirst();
        notifyAll();
        return url;
    }
}

 

 

Pages页面信息类

package fly.domain;

 

import lombok.Data;

import lombok.Getter;

import lombok.Setter;

 

@Setter@Getter@Data

public class Pages {

private Integer id;

 

private String url;

 

private String loc;

 

private String keywords;

 

 

}

 

 

 

 

spider爬虫类

package fly.domain;

 

 

/**
 * Crawler worker thread: repeatedly takes the next URL from {@link PageQueue}
 * and hands it to the shared {@code downloader}.
 *
 * <p>The worker runs until it is interrupted; any other failure is reported on
 * stderr and the loop continues with the next URL.
 */
public class spider extends Thread {

    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                // Blocks until a URL is available.
                String url = PageQueue.getInstance().takeFirst();
                if (url != null) {
                    downloader.getInstance().downloadPage(url);
                }
            } catch (InterruptedException e) {
                // Interruption is the stop signal: restore the flag and leave the loop
                // instead of swallowing it and spinning forever.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}

 

 

 

 

User用户类

package fly.domain;

 

import java.util.Date;

 

/**
 * User record. String setters store a trimmed copy of their argument
 * ({@code null} stays {@code null}).
 */
public class User {

    private Long id;
    private String password;
    private String phone;
    private String sex;
    private String birthday;
    private Date createtime;

    /** Returns {@code value} without leading/trailing whitespace, or {@code null} for {@code null}. */
    private static String trimOrNull(String value) {
        if (value == null) {
            return null;
        }
        return value.trim();
    }

    /** @return the user id */
    public Long getId() {
        return this.id;
    }

    /** @param id the user id */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the stored password */
    public String getPassword() {
        return this.password;
    }

    /** @param password password; stored trimmed */
    public void setPassword(String password) {
        this.password = trimOrNull(password);
    }

    /** @return the phone number */
    public String getPhone() {
        return this.phone;
    }

    /** @param phone phone number; stored trimmed */
    public void setPhone(String phone) {
        this.phone = trimOrNull(phone);
    }

    /** @return the sex value */
    public String getSex() {
        return this.sex;
    }

    /** @param sex sex value; stored trimmed */
    public void setSex(String sex) {
        this.sex = trimOrNull(sex);
    }

    /** @return the birthday text */
    public String getBirthday() {
        return this.birthday;
    }

    /** @param birthday birthday text; stored trimmed */
    public void setBirthday(String birthday) {
        this.birthday = trimOrNull(birthday);
    }

    /** @return the creation time */
    public Date getCreatetime() {
        return this.createtime;
    }

    /** @param createtime the creation time */
    public void setCreatetime(Date createtime) {
        this.createtime = createtime;
    }
}

 

 

PagesMapper页面信息接口

 

package fly.mapper;

 

import fly.domain.Pages;

import java.util.List;

 

/**
 * MyBatis mapper for the {@code pages} table. Each method is bound by name to
 * the statement with the same id in the mapper XML whose namespace is
 * {@code fly.mapper.PagesMapper}.
 * The {@code int} results are presumably affected-row counts -- confirm against
 * the MyBatis configuration in use.
 */
public interface PagesMapper {

/** Deletes the row with the given id. */
int deleteByPrimaryKey(Integer id);

/** Inserts url, loc and keywords of {@code record}; the generated key is written to its id property. */
int insert(Pages record);

/** Loads the row with the given id. */
Pages selectByPrimaryKey(Integer id);

/** Loads every row of the table. */
List<Pages> selectAll();

/** Updates url, loc and keywords of the row whose id equals {@code record}'s id. */
int updateByPrimaryKey(Pages record);

// Counts the rows whose url equals the given address; used to decide whether the address is already in the database.

int selectCountByUrl(String url);

}

 

 

 

 

UserMapper用户信息接口(MyBatis映射接口,对应的SQL见UserMapper.xml)

package fly.mapper;

 

import fly.domain.User;

import java.util.List;

 

/**
 * MyBatis mapper for the {@code user} table. Each method is bound by name to
 * the statement with the same id in the mapper XML whose namespace is
 * {@code fly.mapper.UserMapper}.
 * The {@code int} results are presumably affected-row counts -- confirm against
 * the MyBatis configuration in use.
 */
public interface UserMapper {

/** Deletes the row with the given id. */
int deleteByPrimaryKey(Long id);

/** Inserts password, phone, sex, birthday and createTime of {@code record}; the generated key is written to its id property. */
int insert(User record);

/** Loads the row with the given id. */
User selectByPrimaryKey(Long id);

/** Loads every row of the table. */
List<User> selectAll();

/** Updates all non-key columns of the row whose id equals {@code record}'s id. */
int updateByPrimaryKey(User record);

}

 

 

 

IPagesService接口层

 

package fly.service;

 

import fly.domain.Pages;

 

import java.util.List;

 

/**
 * Service-layer operations on crawled {@link Pages} records.
 * The implementation is not shown here; the CRUD methods mirror
 * {@code PagesMapper} and presumably delegate to it -- confirm.
 */
public interface IPagesService {

/** Deletes the page with the given id. */
int deleteByPrimaryKey(Integer id);

/** Stores a new page record. */
int insert(Pages record);

/** Loads the page with the given id. */
Pages selectByPrimaryKey(Integer id);

/** Loads all stored pages. */
List<Pages> selectAll();

/** Updates the stored page identified by {@code record}'s id. */
int updateByPrimaryKey(Pages record);

/** Reports whether a page with this url is known -- presumably backed by a count-by-url query; verify in the implementation. */
boolean exist(String url);

}

 

 

IUserService用户接口层

package fly.service;

 

import fly.domain.User;

 

import java.util.List;

 

/**
 * Service-layer operations on {@link User} records.
 * The implementation is not shown here; the methods mirror {@code UserMapper}
 * and presumably delegate to it -- confirm.
 */
public interface IUserService {

/** Deletes the user with the given id. */
int deleteByPrimaryKey(Long id);

/** Stores a new user record. */
int insert(User record);

/** Loads the user with the given id. */
User selectByPrimaryKey(Long id);

/** Loads all users. */
List<User> selectAll();

/** Updates the stored user identified by {@code record}'s id. */
int updateByPrimaryKey(User record);

}

 

 

pagesUtil工具类

package fly.util;

 

import fly.domain.Pages;

import fly.service.IPagesService;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Component;

import org.springframework.stereotype.Service;

 

@Service

public class pagesUtil {

 

//如果用户没有的菜单将从列表中删除

 

 

@Autowired

private IPagesService pagesService;

 

 

public boolean exist(String url){

if(!pagesService.exist(url))

return true;

return false;

}

 

 

public void insert(Pages page){

pagesService.insert(page);

}

}

 

 

RegexUtil正则工具类

package fly.util;

 

import java.util.HashSet;

import java.util.Set;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

 

/**
 * Regex helpers for extracting hyperlinks from downloaded HTML.
 */
public class RegexUtil {

    /** Matches {@code <a href="...">}; group 1 is the raw href value (anything but a double quote). */
    private static final Pattern ANCHOR_HREF = Pattern.compile("<a\\s*href=\"([^\"]*)\"");

    /** Matches scheme plus host (and port) of an http or https URL, without a trailing slash. */
    private static final Pattern SITE_ROOT = Pattern.compile("https?://[^/?#]*");

    /**
     * Collects the absolute link targets found in a page.
     *
     * <ul>
     *   <li>{@code http://} and {@code https://} links are kept as they are;</li>
     *   <li>protocol-relative links ({@code //host/path}) get the scheme of {@code srcUrl};</li>
     *   <li>root-relative links ({@code /path}) are prefixed with the site root of
     *       {@code srcUrl}, and skipped when that root cannot be determined;</li>
     *   <li>everything else (fragments, {@code javascript:}, path-relative links) is ignored.</li>
     * </ul>
     *
     * @param srcUrl address the page was downloaded from
     * @param page   HTML content of the page; {@code null} yields an empty set
     * @return the distinct links found, never {@code null}
     */
    public static Set<String> parsePage(String srcUrl, String page) {
        Set<String> urls = new HashSet<>();
        if (page == null) {
            return urls;
        }
        // Resolved once: it is the same for every link on the page.
        String siteRoot = parseDomainname(srcUrl);
        String scheme = (srcUrl != null && srcUrl.startsWith("https://")) ? "https:" : "http:";

        Matcher m = ANCHOR_HREF.matcher(page);
        while (m.find()) {
            String href = m.group(1);
            if (href.startsWith("http://") || href.startsWith("https://")) {
                urls.add(href);
            } else if (href.startsWith("//")) {
                urls.add(scheme + href);
            } else if (href.startsWith("/") && siteRoot != null) {
                // Without the null check an unknown root produced links like "null/path".
                urls.add(siteRoot + href);
            }
        }
        return urls;
    }

    /**
     * Extracts the site root (scheme, host and optional port) of a URL, e.g.
     * {@code "https://www.csdn.net/nav"} gives {@code "https://www.csdn.net"}.
     *
     * @param srcUrl URL to analyse
     * @return the site root without trailing slash; {@code srcUrl} itself when it
     *         contains no {@code '/'}; {@code null} when {@code srcUrl} is
     *         {@code null} or contains no http/https URL
     */
    public static String parseDomainname(String srcUrl) {
        if (srcUrl == null) {
            return null;
        }
        if (!srcUrl.contains("/")) {
            return srcUrl;
        }
        Matcher m = SITE_ROOT.matcher(srcUrl);
        return m.find() ? m.group() : null;
    }
}

 

 

 

TimeTask定时任务类(每天23:30检查即将过生日的用户)

package fly.util;

 

import fly.domain.User;

import fly.service.IUserService;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.scheduling.annotation.Scheduled;

import org.springframework.stereotype.Component;

 

 

import java.io.IOException;

import java.text.ParseException;

import java.text.SimpleDateFormat;

import java.util.Calendar;

import java.util.Date;

import java.util.List;

 

@Component

public class TimeTask {

 

@Autowired

private IUserService userService;

//为没有结束时间的添加结束时间

@Scheduled(cron="0 30 23 ? * * ")//每天2330分更新一次

public void updateUserActionEndTime() throws IOException, ParseException {

List<User>list=userService.selectAll();

for (User user : list) {

Calendar cal = Calendar.getInstance();

cal.setTime(new Date());

int year=cal.get(Calendar.YEAR);

int day=cal.get(Calendar.DATE);

int month = cal.get(Calendar.MONTH)+1;

 

 

 

SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

Date date = simpleDateFormat.parse(user.getBirthday());

int mon=date.getMonth()+1;

int da=date.getDate();

 

 

if(mon==month){

 

if(da-day<3&&da-day>0){

System.out.println(user.getPhone()+"在这三天将要生日,具体时间为"+year+""+mon+""+da+""+"请相关人员做好准备");

}

}

}

}

 

 

}

 

 

 

 

 

PagesMapper.xml

<?xml version="1.0" encoding="UTF-8" ?>

<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >

<!-- SQL statements for fly.mapper.PagesMapper; each statement id matches a method of that interface. -->
<mapper namespace="fly.mapper.PagesMapper" >

<!-- Column-to-property mapping between the pages table and fly.domain.Pages. -->
<resultMap id="BaseResultMap" type="fly.domain.Pages" >
<id column="id" property="id" jdbcType="INTEGER" />
<result column="url" property="url" jdbcType="VARCHAR" />
<result column="loc" property="loc" jdbcType="VARCHAR" />
<result column="keywords" property="keywords" jdbcType="VARCHAR" />
</resultMap>

<!-- Deletes one page by primary key. -->
<delete id="deleteByPrimaryKey" parameterType="java.lang.Integer" >
delete from pages
where id = #{id,jdbcType=INTEGER}
</delete>

<!-- Inserts a page; the auto-generated key is written back to the id property. -->
<insert id="insert" parameterType="fly.domain.Pages" useGeneratedKeys="true" keyProperty="id" >
insert into pages (url, loc, keywords
)
values (#{url,jdbcType=VARCHAR}, #{loc,jdbcType=VARCHAR}, #{keywords,jdbcType=VARCHAR}
)
</insert>

<!-- Overwrites url, loc and keywords of the page with the given id. -->
<update id="updateByPrimaryKey" parameterType="fly.domain.Pages" >
update pages
set url = #{url,jdbcType=VARCHAR},
loc = #{loc,jdbcType=VARCHAR},
keywords = #{keywords,jdbcType=VARCHAR}
where id = #{id,jdbcType=INTEGER}
</update>

<!-- Loads one page by primary key. -->
<select id="selectByPrimaryKey" resultMap="BaseResultMap" parameterType="java.lang.Integer" >
select id, url, loc, keywords
from pages
where id = #{id,jdbcType=INTEGER}
</select>

<!-- Loads all pages. -->
<select id="selectAll" resultMap="BaseResultMap" >
select id, url, loc, keywords
from pages
</select>

<!-- Counts the rows with the given url, to decide whether the address is already in the database. -->
<select id="selectCountByUrl" resultType="int" >
select count(1)
from pages where url=#{url}
</select>

</mapper>

 

 

 

UserMapper.xml

<?xml version="1.0" encoding="UTF-8" ?>

<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >

<!-- SQL statements for fly.mapper.UserMapper; each statement id matches a method of that interface. -->
<mapper namespace="fly.mapper.UserMapper" >

<!-- Column-to-property mapping between the user table and fly.domain.User. -->
<resultMap id="BaseResultMap" type="fly.domain.User" >
<id column="id" property="id" jdbcType="BIGINT" />
<result column="password" property="password" jdbcType="VARCHAR" />
<result column="phone" property="phone" jdbcType="VARCHAR" />
<result column="sex" property="sex" jdbcType="VARCHAR" />
<result column="birthday" property="birthday" jdbcType="VARCHAR" />
<!-- The createTime column maps to the lower-case createtime property. -->
<result column="createTime" property="createtime" jdbcType="TIMESTAMP" />
</resultMap>

<!-- Deletes one user by primary key. -->
<delete id="deleteByPrimaryKey" parameterType="java.lang.Long" >
delete from user
where id = #{id,jdbcType=BIGINT}
</delete>

<!-- Inserts a user; the auto-generated key is written back to the id property. -->
<insert id="insert" parameterType="fly.domain.User" useGeneratedKeys="true" keyProperty="id" >
insert into user (password, phone, sex,
birthday, createTime)
values (#{password,jdbcType=VARCHAR}, #{phone,jdbcType=VARCHAR}, #{sex,jdbcType=VARCHAR},
#{birthday,jdbcType=VARCHAR}, #{createtime,jdbcType=TIMESTAMP})
</insert>

<!-- Overwrites every non-key column of the user with the given id. -->
<update id="updateByPrimaryKey" parameterType="fly.domain.User" >
update user
set password = #{password,jdbcType=VARCHAR},
phone = #{phone,jdbcType=VARCHAR},
sex = #{sex,jdbcType=VARCHAR},
birthday = #{birthday,jdbcType=VARCHAR},
createTime = #{createtime,jdbcType=TIMESTAMP}
where id = #{id,jdbcType=BIGINT}
</update>

<!-- Loads one user by primary key. -->
<select id="selectByPrimaryKey" resultMap="BaseResultMap" parameterType="java.lang.Long" >
select id, password, phone, sex, birthday, createTime
from user
where id = #{id,jdbcType=BIGINT}
</select>

<!-- Loads all users. -->
<select id="selectAll" resultMap="BaseResultMap" >
select id, password, phone, sex, birthday, createTime
from user
</select>

</mapper>

 

 

测试类:

spiderTest测试类

package spiderTest;

 

import fly.domain.Pages;

import fly.domain.downloader;

import fly.service.IPagesService;

import fly.util.RegexUtil;

import org.junit.Test;

import org.junit.runner.RunWith;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.test.context.ContextConfiguration;

import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

 

import java.util.regex.Matcher;

import java.util.regex.Pattern;

 

@RunWith(SpringJUnit4ClassRunner.class)

@ContextConfiguration("classpath:application.xml")

public class spiderTest {

@Autowired

private IPagesService pagesService;

@Test

public void test(){

Pages pages=new Pages();

pages.setUrl("1");

int j= pagesService.insert(pages);

 

 

}

 

 

@Test

public void testDownload(){

String url="https://www.runoob.com/";

downloader.getInstance().downloadPage(url);

}

 

@Test

public void testRegex(){

String str="";

Pattern p=Pattern.compile("<a\\s*href=\"[\u0000-\uffff&&[^\u005c\u0022]]*\"");

Matcher m=p.matcher(str);

while(m.find()){

String s=m.group();

System.out.println(s);

}

}

 

@Test

public void testDomainname(){

String url="http://www.runoob.com/123/kk/nnx";

// downloader.getInstance().downloadPage(url);

RegexUtil.parseDomainname(url);

}

}

 

 

爬取结果如下:

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值