java爬虫 爬取动漫之家10000部漫画信息

 

Java大作业,爬取信息并写入Excel。

项目下载:https://pan.baidu.com/s/1Z1rMWSyaAeGvZCm5CMgexQ(请用eclipse 2018打开,我的jdk版本是1.8)

附:我爬取的10000部漫画信息下载地址:https://pan.baidu.com/s/12wCasdunyxGfdRNw84nbHw

 

数据图例:

主类代码:(如果 import 报错,请先通过上面的项目链接下载工程,把其中 lib 文件夹下的 .jar 文件复制到自己的项目里,再右键这些 jar 包选择 Build Path → Add to Build Path 即可;若仍有问题,请自行搜索"eclipse 导入 jar 包"。)

import java.io.File;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.pipeline.FilePipeline;
import org.openqa.selenium.firefox.*;

public class Main implements PageProcessor {
	private Site site = Site.me().
			setRetryTimes(3).	//失败重试次数
			setSleepTime(20);	//爬取时间间隔
	
	private int Num=0;
	private String Title;
	private String Author[]=new String[2];
	private String Country;
	private String State;
	private String Popularity;
	private String Tag[]=new String[3];
	private String Type;
	private String Update;
	private String BookNum;
	private String Talk;
	private String Details;
	
	private static WritableWorkbook book;
	private static WritableSheet Sheet1;
	private static WritableSheet Sheet2;
	
    public void process(Page page) {
    	int StaticNum = 0;
    	page.addTargetRequests(
    			page.
    			getHtml().
    			links().
    			regex("http://manhua.dmzj.com/[a-z 0-9 -]+/").	//筛选网页规则
    			all()
    	);
    	Title=page.getHtml().
    			xpath("div[@class='odd_anim_title_m']//span//a//h1/text()").
    			toString();
    	if(Title!=null) {
    		StaticNum=++Num;
    	}
    	Author[0]=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[3]//td//a[1]/text()").
    			toString();
    	Author[1]=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[3]//td//a[2]/text()").
    			toString();
    	Country=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[4]//td//a/text()").
    			toString();
    	State=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[5]//td//a/text()").
    			toString();
    	/*Popularity=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[6]//td[@id='hot_hits']/text()").
    			toString();*/
    	/*
    	page.putField("img", page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[6]//td[@id='hot_hits']").
    			toString());
    	Popularity=page.getResultItems().get("img");
    	System.out.println("人气:" + Popularity);
    	*/
    	Tag[0]=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[1]/text()").
    			toString();
    	Tag[1]=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[2]/text()").
    			toString();
    	Tag[2]=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[3]/text()").
    			toString();
    	Type=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[8]//td//a/text()").
    			toString();
    	Update=page.getHtml().
    			xpath("div[@class='anim-main_list']//table//tbody//tr[9]//td//a/text()").
    			toString();
    	/*BookNum=page.getHtml().
    			xpath("span[@id='subscribe_num']/text()").
    			toString();
    	Talk=page.getHtml().
    			xpath("span[@class='comment_num']/text()").
    			toString();
    	Details=page.getHtml().
    			xpath("div[@class='odd_anim_title_m']//a/text()").
    			toString();*/
    	if(Title!=null) {	//读取到有效数据
    		try {
    			Label label=new Label(0,StaticNum,Title);	//标题
    			Sheet1.addCell(label);
    			label=new Label(1,StaticNum,Author[0]);	//作者
    			Sheet1.addCell(label);
    			if(Author[1]!=null) {
    				label=new Label(2,StaticNum,Author[1]);	//第二作者
        			Sheet1.addCell(label);
    			}
    			label=new Label(3,StaticNum,Country);	//地区
    			Sheet1.addCell(label);
    			label=new Label(4,StaticNum,State);	//状态
    			Sheet1.addCell(label);
    			label=new Label(5,StaticNum,Tag[0]);	//标签1
    			Sheet1.addCell(label);
    			if(Tag[1]!=null) {
    				label=new Label(6,StaticNum,Tag[1]);	//标签2
        			Sheet1.addCell(label);
        			if(Tag[2]!=null) {
        				label=new Label(7,StaticNum,Tag[2]);	//标签3
            			Sheet1.addCell(label);
        			}
    			}
    			label=new Label(8,StaticNum,Type);	//类型
    			Sheet1.addCell(label);
    			label=new Label(9,StaticNum,Update);	//更新
    			Sheet1.addCell(label);
    			book.write();	//写入文件
    		}
    		catch(Exception e) {
    			System.out.println(e); 
    		}
    		/*System.out.println(
    		"编号:" + Num + "\n" +
    		"作品:" + Title + "\n" +
    		"作者:" + Author[0] + "\n" +
    		"地区:" + Country + "\n" +
    		"状态:" + State + "\n" +
    		//"人气:" + Popularity + "\n" +
    		"标签:" + Tag[0] + Tag[1] + Tag[2] + "\n" +
    		"类型:" + Type + "\n" +
    		"更新:" + Update + "\n" +
    		//"订阅:" + BookNum + "\n" +
    		//"评论:" + Talk + "\n" +
    		//"详情:" + Details + "\n" 
    		);*/
    		if(StaticNum==22000) {	//抓取数据量
    			try {
    				book.write();	//写入文件
					book.close();	//关闭文件
					System.exit(0);	//退出爬虫
    			}
    			catch(Exception e) {
        			System.out.println(e); 
        		}
    		}
    		System.out.println(StaticNum);
    	}
    }
    
    public Site getSite() {
        return site;
    }
    
	public static void main(String[] args) {
    	try {	//创建Excel
			book= Workbook.createWorkbook(new File("File.xls"));
			Sheet1=book.createSheet("表1",0);	//创建两个表页
			Sheet2=book.createSheet("表2",1);
			System.out.println("创建Excel成功\n");
			
			Label label=new Label(0,0,"漫画名");	//填表头
			Sheet1.addCell(label);
			label=new Label(1,0,"作者1");	//填表头
			Sheet1.addCell(label);
			label=new Label(2,0,"作者2");	//填表头
			Sheet1.addCell(label);
			label=new Label(3,0,"地区");	//填表头
			Sheet1.addCell(label);
			label=new Label(4,0,"状态");	//填表头
			Sheet1.addCell(label);
			label=new Label(5,0,"标签1");	//填表头
			Sheet1.addCell(label);
			label=new Label(6,0,"标签2");	//填表头
			Sheet1.addCell(label);
			label=new Label(7,0,"标签3");	//填表头
			Sheet1.addCell(label);
			label=new Label(8,0,"类型");	//填表头
			Sheet1.addCell(label);
			label=new Label(9,0,"连载进度");	//填表头
			Sheet1.addCell(label);
		}
		catch(Exception e) {
			System.out.println(e);
		}
    	
    	Spider.
        create(new Main()).
        addUrl("http://manhua.dmzj.com").	//起始网页
        thread(1).	//线程数
        run();
    }
}

Excel类:

import jxl.write.Label;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
/**
 * Small factory for text cells that all belong to the same spreadsheet row.
 */
public class Excel {
	/** Row index given to every label created by this instance. */
	private int Raw;

	/**
	 * @param Raw row index used for all labels this instance creates
	 */
	public Excel(int Raw) {
		this.Raw = Raw;
	}

	/**
	 * Creates a text cell in this instance's row. The cell is only created, not added
	 * to any sheet; the caller does that.
	 *
	 * @param a column index of the new cell
	 * @param c text content of the new cell
	 * @return the newly created label
	 */
	public Label Add(int a, String c) throws RowsExceededException, WriteException {
		return new Label(a, Raw, c);
	}
}

爬动漫之家手机版网页:

import java.io.File;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class AppMain implements PageProcessor {
	private Site site = Site.me().
			setRetryTimes(5).	//失败重试次数
			setSleepTime(50);	//爬取时间间隔
	private int CodeNum=2;
	private int SuccessNum=0;
	
	private static WritableWorkbook book;
	private static WritableSheet Sheet1;
	private static WritableSheet Sheet2;
    public void process(Page page) {
    	String Title;
    	String Author[]=new String[2];
    	String Tag[]=new String[3];
    	String Type;
    	String Country;
    	String State;
    	String UpdateTime;
    	String Details;
    	for(int i=0;i<5;i++)
    		page.addTargetRequest("https://m.dmzj.com/info/" + (++CodeNum) + ".html");
    	//CodeNum+=;
    	Title=page.getHtml().
    			xpath("div[@class='BarTit']/text()").
    			toString();
    	if(Title!=null) {
    		UpdateTime=page.getHtml().
    				xpath("span[@class='date']/text()").
    				toString();
    		Author[0]=page.getHtml().
        			xpath("div[@class='sub_r']//p[1]//a[1]/text()").
        			toString();
        	Author[1]=page.getHtml().
        			xpath("div[@class='sub_r']//p[1]//a[2]/text()").
        			toString();
        	Tag[0]=page.getHtml().
        			xpath("div[@class='sub_r']//p[2]//a[1]/text()").
        			toString();
        	Tag[1]=page.getHtml().
        			xpath("div[@class='sub_r']//p[2]//a[2]/text()").
        			toString();
        	Tag[2]=page.getHtml().
        			xpath("div[@class='sub_r']//p[2]//a[3]/text()").
        			toString();
        	Type=page.getHtml().
        			xpath("div[@class='sub_r']//p[3]//a[1]/text()").
        			toString();
        	Country=page.getHtml().
        			xpath("div[@class='sub_r']//p[3]//a[2]/text()").
        			toString();
        	State=page.getHtml().
        			xpath("div[@class='sub_r']//p[3]//a[3]/text()").
        			toString();
        	Details=page.getHtml().
        			xpath("p[@class='txtDesc autoHeight']/text()").
        			toString();
    		try {
    			if(CodeNum<49020) {
    				SuccessNum++;
    				Excel a=new Excel(SuccessNum);
    				Sheet1.addCell(a.Add(0, Title));
    				Sheet1.addCell(a.Add(1, Author[0]));
    				if(Author[1]!=null) Sheet1.addCell(a.Add(2, Author[1]));
    				Sheet1.addCell(a.Add(3, Tag[0]));
    				if(Tag[1]!=null) {
    					Sheet1.addCell(a.Add(4, Tag[1]));
    					if(Tag[2]!=null) Sheet1.addCell(a.Add(5, Tag[2]));
    				}
    				Sheet1.addCell(a.Add(6, Type));
    				Sheet1.addCell(a.Add(7, Country));
    				Sheet1.addCell(a.Add(8, State));
    				Sheet1.addCell(a.Add(9, UpdateTime));
    				Sheet1.addCell(a.Add(10, Details));
    			}
    			if(CodeNum>=49020) {	//抓取数据量
        			try {
        				book.write();	//写入文件
    					book.close();	//关闭文件
    					System.exit(0);	//退出爬虫
        			}
        			catch(Exception e) {
            			System.out.println(e); 
            		}
        		}
    			System.out.println(SuccessNum);
        		//System.out.println("漫画名:" + Title);
        		/*System.out.println("作者:" + Author[0] + " " + Author[1]);
        		System.out.println("标签:" + Tag[0] + " " + Tag[1] + " " + Tag[2]);
        		System.out.println("类型:" + Type);
        		System.out.println("地区:" + Country);
        		System.out.println("状态:" + State);
        		System.out.println("最近更新时间:" + UpdateTime);
        		System.out.println(Details);*/
    		}
    		catch(Exception e) {
    			System.out.println(e);
    		}
        }
        else System.out.println(SuccessNum);
    }
    
    public Site getSite() {
        return site;
    }

	public static void main(String[] args) {
		try {	//创建Excel
			book= Workbook.createWorkbook(new File("File.xls"));
			Sheet1=book.createSheet("表1",0);	//创建两个表页
			Sheet2=book.createSheet("表2",1);
			System.out.println("创建Excel成功\n");
			Label label=new Label(0,0,"漫画名");	//填表头
			Sheet1.addCell(label);
			label=new Label(1,0,"作者1");	//填表头
			Sheet1.addCell(label);
			label=new Label(2,0,"作者2");	//填表头
			Sheet1.addCell(label);
			label=new Label(3,0,"标签1");	//填表头
			Sheet1.addCell(label);
			label=new Label(4,0,"标签2");	//填表头
			Sheet1.addCell(label);
			label=new Label(5,0,"标签3");	//填表头
			Sheet1.addCell(label);
			label=new Label(6,0,"类型");	//填表头
			Sheet1.addCell(label);
			label=new Label(7,0,"地区");	//填表头
			Sheet1.addCell(label);
			label=new Label(8,0,"状态");	//填表头
			Sheet1.addCell(label);
			label=new Label(9,0,"最近更新时间");	//填表头
			Sheet1.addCell(label);
			label=new Label(10,0,"详情");	//填表头
			Sheet1.addCell(label);
		}
		catch(Exception e) {
			System.out.println(e);
		}
    	Spider.create(new AppMain()).
    	addUrl("https://m.dmzj.com").thread(1).run();
    }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值