文章标题

爬虫
`
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes a single HTML page: fetches its source, extracts image links,
 * locates the "next page" link and downloads the images to local disk.
 *
 * <p>The constructor fetches the page and extracts the image links immediately,
 * so constructing an instance performs I/O.
 */
public class Spider {
    /** Matches {@code <img ... src="..." alt=...>} tags; group 1 is the src value. */
    private static final Pattern IMG_PATTERN =
            Pattern.compile("<img.*?src=\\\"([^\"]*?)\\\"\\s*alt=.*?>");

    /** Matches the anchor whose text is {@code &raquo;}; group 1 is the href value. */
    private static final Pattern NEXT_PAGE_PATTERN =
            Pattern.compile("<a\\s*href=\\\"([^\"]*?)\\\">&raquo;</a>");

    /** Base address that is prepended to every path captured from the page. */
    private final String base = "http://meinv666.com";
    /** URL of the page this instance reads; updated by {@link #getPage()}. */
    private String sourceUrl;
    /** Text of the fetched page, one {@code '\n'} after every line. */
    private StringBuilder html;
    /** Image URLs found in {@link #html}. */
    private final List<String> imgUrl = new ArrayList<>();

    /** Number of images downloaded so far, shared by all instances. */
    private static int count = 0;

    /**
     * Creates a spider for the given page, fetching it and extracting its image links.
     *
     * @param sourceUrl URL of the page to scrape
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public Spider(String sourceUrl) throws IOException {
        this.sourceUrl = sourceUrl;
        getHtml();
        getImgURL();
    }

    /**
     * Fetches the page at the current source URL and stores its text.
     *
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public void getHtml() throws IOException {
        URL url = new URL(sourceUrl);
        StringBuilder page = new StringBuilder();
        // Closing the Scanner also closes the underlying stream; both are listed so the
        // stream is released even if constructing the Scanner fails.
        try (InputStream in = url.openStream();
             Scanner scanner = new Scanner(in)) {
            while (scanner.hasNextLine()) {
                page.append(scanner.nextLine()).append('\n');
            }
            // Scanner swallows read errors and just stops; surface them to the caller.
            IOException failure = scanner.ioException();
            if (failure != null) {
                throw failure;
            }
        }
        html = page;
    }

    /**
     * Extracts the image links from the fetched page.
     *
     * <p>The list is rebuilt on every call, so calling this repeatedly does not
     * accumulate duplicate entries.
     *
     * @return the image URLs, each one the base address followed by the captured src value
     * @throws IOException declared for compatibility with existing callers
     */
    public List<String> getImgURL() throws IOException {
        imgUrl.clear();
        Matcher m = IMG_PATTERN.matcher(html);
        while (m.find()) {
            imgUrl.add(base + m.group(1));
        }
        return imgUrl;
    }

    /**
     * Finds the link to the next page and makes it the current source URL.
     *
     * @return the URL of the next page, or {@code null} when the page has no
     *         next-page link (the current source URL is left unchanged in that case)
     */
    public String getPage() {
        Matcher m = NEXT_PAGE_PATTERN.matcher(html);
        if (!m.find()) {
            return null;
        }
        sourceUrl = base + m.group(1);
        return sourceUrl;
    }

    /**
     * Downloads every extracted image, pausing two seconds after each one.
     *
     * @throws IOException if an image cannot be fetched or written
     * @throws InterruptedException if the thread is interrupted during the pause
     */
    public void downLoad() throws IOException, InterruptedException {
        for (String link : imgUrl) {
            String name = link.substring(link.lastIndexOf('/') + 1);
            File target = new File("D:\\Downloads\\pic\\" + name);
            File dir = target.getParentFile();
            if (dir != null) {
                // Best effort: if this fails, opening the output stream below reports it.
                dir.mkdirs();
            }
            // Open the remote stream first so a failed fetch does not leave an empty file.
            try (InputStream in = new URL(link).openStream();
                 OutputStream out = new FileOutputStream(target)) {
                byte[] buffer = new byte[8192];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
            }
            System.out.println("完成" + ++count + "张!");

            Thread.sleep(2000);
        }
    }
}
`

主方法
import java.io.IOException;

public class Demo {
public static void main(String[] args) throws IOException, InterruptedException {
String source = “http://meinv666.com/sex“;

    int count = 8;
    while (count-- > 0 && source != null){
        Spider spider = new Spider(source);
        spider.downLoad();
        source = spider.getPage();
    }
    System.out.println("完成惹。。");
}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值