滑稽色图片小爬虫

前几天做的小爬虫,没用什么框架,也不是什么高级货,更不是什么分布式。纯粹自娱自乐,爬点滑稽色图片玩玩(手动滑稽)。第一次搞爬虫,也没查什么资料,全凭自己的想法搞,轻喷。


大体说下实现:一个主线程开启两个小线程:一个线程负责处理URL爬取html代码;一个线程负责解析html代码获取下一个html文本的url和图片的url,并下载图片。由于网站图片页面的跳转结构,采取BFS去搜索,所以搞了两个队列(当然是线程安全的那种,用了LinkedBlockingQueue。),一个存url,一个存html文档,然后是生产者和消费者的模型啥的。。。这里本来想将图片下载分出html解析线程的,但后来发现,图片下载速度远比解析html和处理url的速度慢(毕竟滑稽色图片,服务器比较远,手动滑稽),导致爬取解析了大量html,图片还没下载几张,所以将图片下载并入html解析来拖慢html的解析速度。


下面是代码:

************************************************

package spider;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Shared crawler state inherited by the spider and analysis threads:
 * thread-safe work queues plus filesystem and base-URL configuration.
 */
public class context {
    // BFS frontier: URLs waiting to be fetched by spider_thread.
    // NOTE: field keeps the original "collecion" spelling because the
    // subclasses reference it by this exact name.
    protected static LinkedBlockingQueue<String> url_collecion = new LinkedBlockingQueue<String>();
    // File names of saved HTML pages waiting to be parsed by analysis_thread.
    protected static LinkedBlockingQueue<String> doc_collecion = new LinkedBlockingQueue<String>();
    // Directory where fetched HTML pages are written as .txt files.
    protected static String htmlpath = "D:/test/spider/webpage/html/";
    // Directory where downloaded images are stored, one subfolder per page.
    protected static String imgpath = "D:/test/spider/webpage/img/";
    // Base URL of the target site; assigned once in Main before threads start.
    protected static String webpath = "";

    /**
     * Collects every distinct match of {@code patternStr} in {@code targetStr}.
     *
     * @param targetStr  the text to search
     * @param patternStr the regular expression; the WHOLE match is kept
     *                   (callers strip prefixes themselves), not a capture group
     * @return de-duplicated set of full-pattern matches (may be empty)
     */
    protected static Set<String> RegexString(String targetStr, String patternStr) {
        // Declare as the Set interface rather than HashSet (program to interfaces).
        Set<String> strlist = new HashSet<String>();
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        while (matcher.find()) {
            strlist.add(matcher.group());
        }
        return strlist;
    }
}

*************************************

package spider;

public class Main extends context {

    /**
     * Entry point: seeds the URL queue with the start page, records the site's
     * base URL, then starts the fetcher (spider) and parser (analysis) threads.
     */
    public static void main(String[] args) {
        try {
            // put() on an unbounded LinkedBlockingQueue never actually blocks,
            // but it is declared to throw InterruptedException.
            url_collecion.put("http://马赛克/");
            webpath = "http://马赛克/";
        } catch (InterruptedException e) {
            System.out.println("初始化context失败");
            // Restore the interrupt flag (never swallow an interrupt) and abort:
            // starting workers with an unseeded queue would be pointless.
            Thread.currentThread().interrupt();
            return;
        }

        spider_thread st = new spider_thread();
        analysis_thread at = new analysis_thread();
        new Thread(st, "st").start();
        new Thread(at, "at").start();
    }

}

********************************************

package spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

public class spider_thread extends context implements Runnable {

    /**
     * Fetch loop: repeatedly takes a URL from the shared queue, saves its HTML
     * to a file (the file's existence doubles as the "visited" marker), and
     * hands the file name to the analysis thread via doc_collecion.
     */
    @Override
    public void run() {
        while (true) {
            // Take the next URL to crawl; blocks until one is available.
            String urlstr;
            try {
                urlstr = url_collecion.take();
                System.out.println(Thread.currentThread().getName() + "@读取URL:" + urlstr);
            } catch (InterruptedException e) {
                // FIX: the original swallowed the interrupt and carried on with
                // an empty URL. Restore the flag and let the worker exit.
                Thread.currentThread().interrupt();
                System.out.println(Thread.currentThread().getName() + "@读取URL异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
                return;
            }

            // Map the URL onto a legal Windows file name (':' '/' '.' are
            // replaced with full-width lookalikes). An existing file means the
            // URL was already visited — skip it (cheap de-duplication).
            String safeName = urlstr.replace(':', ':').replace('/', '#').replace('.', '。') + ".txt";
            File file = new File(htmlpath + safeName);
            if (file.exists()) {
                continue;
            }
            System.out.println(Thread.currentThread().getName() + "@创建文件:" + safeName);
            try {
                file.createNewFile();
            } catch (IOException e) {
                System.out.println(Thread.currentThread().getName() + "@创建文件异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
            }

            // Download the page and write its HTML into the file.
            try {
                URL url = new URL(urlstr);
                URLConnection connection = url.openConnection();
                connection.connect();

                // NOTE(review): InputStreamReader/FileWriter use the platform
                // charset here; the matching FileReader in analysis_thread does
                // too, so the round trip is consistent — but non-native pages
                // may still be mangled. Confirm the site's encoding.
                BufferedReader br = new BufferedReader(new InputStreamReader(connection.getInputStream()));
                try {
                    FileWriter fw = new FileWriter(file);
                    try {
                        String line;
                        while ((line = br.readLine()) != null) {
                            fw.write(line);
                            // FIX: readLine() strips the terminator; the original
                            // fused adjacent lines together in the saved file.
                            fw.write('\n');
                        }
                    } catch (Exception e) {
                        System.out.println(Thread.currentThread().getName() + "@IO异常:");
                        System.out.println(Thread.currentThread().getName() + "@" + e);
                    } finally {
                        fw.close();
                    }
                } finally {
                    br.close();
                    // Hand the (possibly partial) document to the parser — this
                    // matches the original behavior of always queueing the file.
                    doc_collecion.put(file.getName());
                    System.out.println(Thread.currentThread().getName() + "@文件写入结束。");
                }
            } catch (Exception e) {
                System.out.println(Thread.currentThread().getName() + "@请求URL异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
            }
        }
    }

}

**************************************************************************************************

package spider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Set;

public class analysis_thread extends context implements Runnable {

    /**
     * Parse loop: takes a saved HTML file name from doc_collecion, reads the
     * file, extracts page links (fed back into url_collecion for the spider)
     * and image URLs (downloaded immediately, which also throttles parsing).
     */
    @Override
    public void run() {
        while (true) {
            String pagename;
            try {
                pagename = doc_collecion.take();
            } catch (InterruptedException e) {
                // FIX: restore the interrupt flag and exit instead of looping
                // on with an empty file name.
                Thread.currentThread().interrupt();
                System.out.println(Thread.currentThread().getName() + "@读取文件异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
                return;
            }
            File file = new File(htmlpath + pagename);
            if (!file.exists()) {
                continue;
            }
            System.out.println(Thread.currentThread().getName() + "@读取文件:" + file.getName());

            // Read the whole document. FIX: StringBuilder replaces O(n^2)
            // String concatenation, and the reader is now closed (it leaked).
            StringBuilder page = new StringBuilder();
            try {
                FileReader fr = new FileReader(file);
                try {
                    char[] cbuf = new char[1024];
                    int hasread;
                    while ((hasread = fr.read(cbuf)) > 0) {
                        page.append(cbuf, 0, hasread);
                    }
                } finally {
                    fr.close();
                }
            } catch (Exception e) {
                System.out.println(Thread.currentThread().getName() + "@文件读取异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
            }

            System.out.println(Thread.currentThread().getName() + "@开始解析HTML");
            // Strip the ".txt" suffix to recover the name used for the image dir.
            analysis(page.toString(), pagename.substring(0, pagename.length() - 4));
        }
    }

    /**
     * Extracts page links and image URLs from the content section of one page.
     *
     * @param page     full HTML text of the page
     * @param pagename page identifier used as the image subdirectory name
     */
    private void analysis(String page, String pagename) {
        // Limit parsing to the region between the content and footer divs.
        // FIX: the original indexed split(...)[1] unchecked — a single page
        // without the content marker threw ArrayIndexOutOfBoundsException and
        // killed the whole analysis thread.
        String[] parts = page.split("<div class=\"content");
        if (parts.length < 2) {
            System.out.println(Thread.currentThread().getName() + "@结束解析HTML");
            return;
        }
        String html = parts[1].split("<div class=\"footer")[0];

        // Each match looks like: a href="..." — drop the 8-char prefix and the
        // trailing quote to get the raw link text.
        Set<String> urls = RegexString(html, "a href=[\"'](.+?)[\"']");
        for (String url : urls) {
            String tmp = url.substring(8, url.length() - 1);
            if (tmp.startsWith("/piclist7/")) {
                tmp = tmp.substring(10); // site-relative link: drop the list prefix
            }
            try {
                url_collecion.put(webpath + tmp); // webpath is a static field; no super. needed
            } catch (InterruptedException e) {
                System.out.println(Thread.currentThread().getName() + "@存储URL失败");
                Thread.currentThread().interrupt();
            }
            System.out.println(webpath + tmp);
        }

        // Pages with at least 5 images are treated as galleries and downloaded.
        urls = RegexString(html, "img src=[\"'](.+?)[\"']");
        if (urls.size() >= 5) {
            File dir = new File(imgpath + pagename + "/");
            dir.mkdirs();
            int id = 1;
            for (String url : urls) {
                // Match looks like: img src="..." — drop the 9-char prefix and
                // the trailing quote.
                String tmp = url.substring(9, url.length() - 1);
                download_img(imgpath + pagename + "/", tmp, id);
                id++;
            }
        }

        System.out.println(Thread.currentThread().getName() + "@结束解析HTML");
    }

    /**
     * Downloads one image into {@code dir} as {@code id}.jpg. An existing file
     * is treated as already downloaded and skipped.
     *
     * @param dir    destination directory (with trailing slash)
     * @param urlstr absolute image URL
     * @param id     sequence number used as the file name
     */
    private void download_img(String dir, String urlstr, int id) {
        try {
            File file = new File(dir + id + ".jpg");
            System.out.println(dir + id + ".jpg");

            if (file.exists()) {
                return; // already fetched on a previous run
            }
            try {
                file.createNewFile();
            } catch (IOException e) {
                System.out.println(Thread.currentThread().getName() + "@创建文件异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
            }

            URL url = new URL(urlstr);
            URLConnection con = url.openConnection();
            con.setConnectTimeout(5 * 1000);
            // Some image hosts reject requests without a browser-like UA string.
            con.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

            // FIX: nested try/finally so the input stream is closed even if
            // opening the output stream throws (the original leaked it).
            InputStream is = con.getInputStream();
            try {
                OutputStream os = new FileOutputStream(file);
                try {
                    byte[] bs = new byte[1024];
                    int len;
                    while ((len = is.read(bs)) != -1) {
                        os.write(bs, 0, len);
                    }
                } catch (Exception e) {
                    System.out.println(Thread.currentThread().getName() + "@图片写入失败");
                    System.out.println(Thread.currentThread().getName() + "@" + e);
                } finally {
                    os.close();
                }
            } finally {
                is.close();
            }
        } catch (Exception e) {
            System.out.println(Thread.currentThread().getName() + "@获取图片失败");
            System.out.println(Thread.currentThread().getName() + "@" + e);
        }
    }
}

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值