Java大作业,爬取信息并写入Excel。
项目下载:https://pan.baidu.com/s/1Z1rMWSyaAeGvZCm5CMgexQ(请用 Eclipse 2018 打开,我使用的 JDK 版本是 1.8)
附我爬取的10000部漫画信息下载地址:https://pan.baidu.com/s/12wCasdunyxGfdRNw84nbHw
数据图例:
主类代码:(如果 import 报错,请从上面的项目下载链接中取出 lib 文件夹下的 .jar 文件放进自己的项目,然后右键这些 jar 包,选择 Build Path → Add to Build Path 即可;若仍搞不懂请自行百度)
import java.io.File;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.pipeline.FilePipeline;
import org.openqa.selenium.firefox.*;
public class Main implements PageProcessor {
private Site site = Site.me().
setRetryTimes(3). //失败重试次数
setSleepTime(20); //爬取时间间隔
private int Num=0;
private String Title;
private String Author[]=new String[2];
private String Country;
private String State;
private String Popularity;
private String Tag[]=new String[3];
private String Type;
private String Update;
private String BookNum;
private String Talk;
private String Details;
private static WritableWorkbook book;
private static WritableSheet Sheet1;
private static WritableSheet Sheet2;
public void process(Page page) {
int StaticNum = 0;
page.addTargetRequests(
page.
getHtml().
links().
regex("http://manhua.dmzj.com/[a-z 0-9 -]+/"). //筛选网页规则
all()
);
Title=page.getHtml().
xpath("div[@class='odd_anim_title_m']//span//a//h1/text()").
toString();
if(Title!=null) {
StaticNum=++Num;
}
Author[0]=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[3]//td//a[1]/text()").
toString();
Author[1]=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[3]//td//a[2]/text()").
toString();
Country=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[4]//td//a/text()").
toString();
State=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[5]//td//a/text()").
toString();
/*Popularity=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[6]//td[@id='hot_hits']/text()").
toString();*/
/*
page.putField("img", page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[6]//td[@id='hot_hits']").
toString());
Popularity=page.getResultItems().get("img");
System.out.println("人气:" + Popularity);
*/
Tag[0]=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[1]/text()").
toString();
Tag[1]=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[2]/text()").
toString();
Tag[2]=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[3]/text()").
toString();
Type=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[8]//td//a/text()").
toString();
Update=page.getHtml().
xpath("div[@class='anim-main_list']//table//tbody//tr[9]//td//a/text()").
toString();
/*BookNum=page.getHtml().
xpath("span[@id='subscribe_num']/text()").
toString();
Talk=page.getHtml().
xpath("span[@class='comment_num']/text()").
toString();
Details=page.getHtml().
xpath("div[@class='odd_anim_title_m']//a/text()").
toString();*/
if(Title!=null) { //读取到有效数据
try {
Label label=new Label(0,StaticNum,Title); //标题
Sheet1.addCell(label);
label=new Label(1,StaticNum,Author[0]); //作者
Sheet1.addCell(label);
if(Author[1]!=null) {
label=new Label(2,StaticNum,Author[1]); //第二作者
Sheet1.addCell(label);
}
label=new Label(3,StaticNum,Country); //地区
Sheet1.addCell(label);
label=new Label(4,StaticNum,State); //状态
Sheet1.addCell(label);
label=new Label(5,StaticNum,Tag[0]); //标签1
Sheet1.addCell(label);
if(Tag[1]!=null) {
label=new Label(6,StaticNum,Tag[1]); //标签2
Sheet1.addCell(label);
if(Tag[2]!=null) {
label=new Label(7,StaticNum,Tag[2]); //标签3
Sheet1.addCell(label);
}
}
label=new Label(8,StaticNum,Type); //类型
Sheet1.addCell(label);
label=new Label(9,StaticNum,Update); //更新
Sheet1.addCell(label);
book.write(); //写入文件
}
catch(Exception e) {
System.out.println(e);
}
/*System.out.println(
"编号:" + Num + "\n" +
"作品:" + Title + "\n" +
"作者:" + Author[0] + "\n" +
"地区:" + Country + "\n" +
"状态:" + State + "\n" +
//"人气:" + Popularity + "\n" +
"标签:" + Tag[0] + Tag[1] + Tag[2] + "\n" +
"类型:" + Type + "\n" +
"更新:" + Update + "\n" +
//"订阅:" + BookNum + "\n" +
//"评论:" + Talk + "\n" +
//"详情:" + Details + "\n"
);*/
if(StaticNum==22000) { //抓取数据量
try {
book.write(); //写入文件
book.close(); //关闭文件
System.exit(0); //退出爬虫
}
catch(Exception e) {
System.out.println(e);
}
}
System.out.println(StaticNum);
}
}
public Site getSite() {
return site;
}
public static void main(String[] args) {
try { //创建Excel
book= Workbook.createWorkbook(new File("File.xls"));
Sheet1=book.createSheet("表1",0); //创建两个表页
Sheet2=book.createSheet("表2",1);
System.out.println("创建Excel成功\n");
Label label=new Label(0,0,"漫画名"); //填表头
Sheet1.addCell(label);
label=new Label(1,0,"作者1"); //填表头
Sheet1.addCell(label);
label=new Label(2,0,"作者2"); //填表头
Sheet1.addCell(label);
label=new Label(3,0,"地区"); //填表头
Sheet1.addCell(label);
label=new Label(4,0,"状态"); //填表头
Sheet1.addCell(label);
label=new Label(5,0,"标签1"); //填表头
Sheet1.addCell(label);
label=new Label(6,0,"标签2"); //填表头
Sheet1.addCell(label);
label=new Label(7,0,"标签3"); //填表头
Sheet1.addCell(label);
label=new Label(8,0,"类型"); //填表头
Sheet1.addCell(label);
label=new Label(9,0,"连载进度"); //填表头
Sheet1.addCell(label);
}
catch(Exception e) {
System.out.println(e);
}
Spider.
create(new Main()).
addUrl("http://manhua.dmzj.com"). //起始网页
thread(1). //线程数
run();
}
}
Excel类:
import jxl.write.Label;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
/**
 * Small factory for JXL {@link Label} cells that all sit in the same spreadsheet row.
 */
public class Excel {

    /** Row index given to every label created by this instance. */
    private final int rowIndex;

    /**
     * Binds the factory to one row.
     *
     * @param Raw row index used for all labels built by {@link #Add(int, String)}
     */
    public Excel(int Raw) {
        rowIndex = Raw;
    }

    /**
     * Builds a label cell in this factory's row; the cell is not added to any sheet here.
     *
     * @param a column index of the new cell
     * @param c text content of the new cell
     * @return a new label at column {@code a} of the bound row
     * @throws RowsExceededException declared for callers; not raised by the code in this method
     * @throws WriteException declared for callers; not raised by the code in this method
     */
    public Label Add(int a, String c) throws RowsExceededException, WriteException {
        return new Label(a, rowIndex, c);
    }
}
爬动漫之家手机版网页:
import java.io.File;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class AppMain implements PageProcessor {
private Site site = Site.me().
setRetryTimes(5). //失败重试次数
setSleepTime(50); //爬取时间间隔
private int CodeNum=2;
private int SuccessNum=0;
private static WritableWorkbook book;
private static WritableSheet Sheet1;
private static WritableSheet Sheet2;
public void process(Page page) {
String Title;
String Author[]=new String[2];
String Tag[]=new String[3];
String Type;
String Country;
String State;
String UpdateTime;
String Details;
for(int i=0;i<5;i++)
page.addTargetRequest("https://m.dmzj.com/info/" + (++CodeNum) + ".html");
//CodeNum+=;
Title=page.getHtml().
xpath("div[@class='BarTit']/text()").
toString();
if(Title!=null) {
UpdateTime=page.getHtml().
xpath("span[@class='date']/text()").
toString();
Author[0]=page.getHtml().
xpath("div[@class='sub_r']//p[1]//a[1]/text()").
toString();
Author[1]=page.getHtml().
xpath("div[@class='sub_r']//p[1]//a[2]/text()").
toString();
Tag[0]=page.getHtml().
xpath("div[@class='sub_r']//p[2]//a[1]/text()").
toString();
Tag[1]=page.getHtml().
xpath("div[@class='sub_r']//p[2]//a[2]/text()").
toString();
Tag[2]=page.getHtml().
xpath("div[@class='sub_r']//p[2]//a[3]/text()").
toString();
Type=page.getHtml().
xpath("div[@class='sub_r']//p[3]//a[1]/text()").
toString();
Country=page.getHtml().
xpath("div[@class='sub_r']//p[3]//a[2]/text()").
toString();
State=page.getHtml().
xpath("div[@class='sub_r']//p[3]//a[3]/text()").
toString();
Details=page.getHtml().
xpath("p[@class='txtDesc autoHeight']/text()").
toString();
try {
if(CodeNum<49020) {
SuccessNum++;
Excel a=new Excel(SuccessNum);
Sheet1.addCell(a.Add(0, Title));
Sheet1.addCell(a.Add(1, Author[0]));
if(Author[1]!=null) Sheet1.addCell(a.Add(2, Author[1]));
Sheet1.addCell(a.Add(3, Tag[0]));
if(Tag[1]!=null) {
Sheet1.addCell(a.Add(4, Tag[1]));
if(Tag[2]!=null) Sheet1.addCell(a.Add(5, Tag[2]));
}
Sheet1.addCell(a.Add(6, Type));
Sheet1.addCell(a.Add(7, Country));
Sheet1.addCell(a.Add(8, State));
Sheet1.addCell(a.Add(9, UpdateTime));
Sheet1.addCell(a.Add(10, Details));
}
if(CodeNum>=49020) { //抓取数据量
try {
book.write(); //写入文件
book.close(); //关闭文件
System.exit(0); //退出爬虫
}
catch(Exception e) {
System.out.println(e);
}
}
System.out.println(SuccessNum);
//System.out.println("漫画名:" + Title);
/*System.out.println("作者:" + Author[0] + " " + Author[1]);
System.out.println("标签:" + Tag[0] + " " + Tag[1] + " " + Tag[2]);
System.out.println("类型:" + Type);
System.out.println("地区:" + Country);
System.out.println("状态:" + State);
System.out.println("最近更新时间:" + UpdateTime);
System.out.println(Details);*/
}
catch(Exception e) {
System.out.println(e);
}
}
else System.out.println(SuccessNum);
}
public Site getSite() {
return site;
}
public static void main(String[] args) {
try { //创建Excel
book= Workbook.createWorkbook(new File("File.xls"));
Sheet1=book.createSheet("表1",0); //创建两个表页
Sheet2=book.createSheet("表2",1);
System.out.println("创建Excel成功\n");
Label label=new Label(0,0,"漫画名"); //填表头
Sheet1.addCell(label);
label=new Label(1,0,"作者1"); //填表头
Sheet1.addCell(label);
label=new Label(2,0,"作者2"); //填表头
Sheet1.addCell(label);
label=new Label(3,0,"标签1"); //填表头
Sheet1.addCell(label);
label=new Label(4,0,"标签2"); //填表头
Sheet1.addCell(label);
label=new Label(5,0,"标签3"); //填表头
Sheet1.addCell(label);
label=new Label(6,0,"类型"); //填表头
Sheet1.addCell(label);
label=new Label(7,0,"地区"); //填表头
Sheet1.addCell(label);
label=new Label(8,0,"状态"); //填表头
Sheet1.addCell(label);
label=new Label(9,0,"最近更新时间"); //填表头
Sheet1.addCell(label);
label=new Label(10,0,"详情"); //填表头
Sheet1.addCell(label);
}
catch(Exception e) {
System.out.println(e);
}
Spider.create(new AppMain()).
addUrl("https://m.dmzj.com").thread(1).run();
}
}