Note: this is a generic approach; the way data is extracted differs from site to site, so adapt the selectors below to the site you are crawling.
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Date;
import java.util.List;

public class BaseCrawling implements PageProcessor {

    // Retry a few times on failure and pause 1s between requests.
    private Site site = Site.me().setCharset("utf8").setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        System.out.println("html:" + html);
        // Download every <img> whose src contains "https".
        List<String> list = page.getHtml().$("img", "src").regex(".*https.*").all();
        for (String str : list) {
            System.out.println("str:" + str);
            downLoadFileToPath(str, "D://tupian");
        }
        // Follow every <a> link that contains "https". addTargetRequests queues the
        // URLs on this spider's scheduler (which also de-duplicates them) instead of
        // blocking the worker thread with a nested Spider per link.
        List<String> urlList = page.getHtml().$("a", "href").regex(".*https.*").all();
        for (String url : urlList) {
            System.out.println("url:" + url);
        }
        page.addTargetRequests(urlList);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Download the file at the given URL into the given directory.
     * @param urlStr   URL of the file to download
     * @param savePath local directory to save into (created if missing)
     */
    private static void downLoadFileToPath(String urlStr, String savePath) {
        InputStream inputStream = null;
        FileOutputStream fos = null;
        try {
            URL url = new URL(urlStr);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            // Connect timeout of 3 seconds.
            conn.setConnectTimeout(3 * 1000);
            // Send a browser User-Agent so sites that block crawlers do not return 403.
            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            // Read the whole response body into a byte array.
            inputStream = conn.getInputStream();
            byte[] getData = readInputStream(inputStream);
            // Make sure the target directory exists; mkdirs also creates missing parents.
            File saveDir = new File(savePath);
            if (!saveDir.exists()) {
                saveDir.mkdirs();
            }
            // Name the file with the current timestamp plus the URL's extension,
            // stripping any query string before looking for the extension.
            String base = urlStr.contains("?") ? urlStr.substring(0, urlStr.indexOf("?")) : urlStr;
            String fileName = new Date().getTime() + "." + base.substring(base.lastIndexOf(".") + 1);
            File file = new File(saveDir + File.separator + fileName);
            fos = new FileOutputStream(file);
            fos.write(getData);
            System.out.println("url:" + urlStr + " download success");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (fos != null) {
                    fos.close();
                }
                if (inputStream != null) {
                    inputStream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
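
    // Note: on Java 9+ this helper can be replaced by InputStream#readAllBytes();
    // it is written out here so the example also runs on Java 8.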
    private static byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        int len = 0;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        bos.close();
        return bos.toByteArray();
    }

    public static void main(String[] args) {
        BaseCrawling my = new BaseCrawling();
        System.out.println("crawl started");
        Spider.create(my).addUrl("https://www.tupianzj.com/meinv/").thread(5).run(); // run() blocks until the spider exits
        System.out.println("crawl finished");
    }
}
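
If you are on Java 11 or newer, the downLoadFileToPath helper above can be replaced by the standard java.net.http client, which streams the response body straight to disk for you. A minimal sketch, assuming an example image URL and the same D://tupian target directory (which must already exist, since ofFile does not create parent directories):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Path;

public class HttpClientDownload {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder(URI.create("https://example.com/a.jpg"))
                .header("User-Agent", "Mozilla/5.0") // same anti-403 trick as above
                .build();
        // BodyHandlers.ofFile writes the body to the given path and returns that path as the response body.
        HttpResponse<Path> response = client.send(request, HttpResponse.BodyHandlers.ofFile(Path.of("D://tupian/a.jpg")));
        System.out.println("saved to " + response.body() + ", status " + response.statusCode());
    }
}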