滑稽色图片小爬虫

前几天做的小爬虫,没用什么框架,也不是什么高级货,更不是什么分布式。纯粹自娱自乐,爬点滑稽色图片玩玩(手动滑稽)。第一次搞爬虫,也没查什么资料,全凭自己的想法搞,轻喷。


大体说下实现:一个主线程开启两个小线程:一个线程负责处理URL爬取html代码;一个线程负责解析html代码获取下一个html文本的url和图片的url,并下载图片。由于网站图片页面的跳转结构,采取BFS去搜索,所以搞了两个队列(当然是线程安全的那种,用了LinkedBlockingQueue。),一个存url,一个存html文档,然后是生产者和消费者的模型啥的。。。这里本来想将图片下载分出html解析线程的,但后来发现,图片下载速度远比解析html和处理url的速度慢(毕竟滑稽色图片,服务器比较远,手动滑稽),导致爬取解析了大量html,图片还没下载几张,所以将图片下载并入html解析来拖慢html的解析速度。


下面是代码:

************************************************

package spider;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Shared crawler state inherited by the spider and analysis threads:
 * thread-safe work queues plus filesystem and base-URL configuration.
 */
public class context {
    // BFS frontier: URLs waiting to be fetched by spider_thread.
    // NOTE: field keeps the original "collecion" spelling because the
    // subclasses reference it by this exact name.
    protected static LinkedBlockingQueue<String> url_collecion = new LinkedBlockingQueue<String>();
    // File names of saved HTML pages waiting to be parsed by analysis_thread.
    protected static LinkedBlockingQueue<String> doc_collecion = new LinkedBlockingQueue<String>();
    // Directory where fetched HTML pages are written as .txt files.
    protected static String htmlpath = "D:/test/spider/webpage/html/";
    // Directory where downloaded images are stored, one subfolder per page.
    protected static String imgpath = "D:/test/spider/webpage/img/";
    // Base URL of the target site; assigned once in Main before threads start.
    protected static String webpath = "";

    /**
     * Collects every distinct match of {@code patternStr} in {@code targetStr}.
     *
     * @param targetStr  the text to search
     * @param patternStr the regular expression; the WHOLE match is kept
     *                   (callers strip prefixes themselves), not a capture group
     * @return de-duplicated set of full-pattern matches (may be empty)
     */
    protected static Set<String> RegexString(String targetStr, String patternStr) {
        // Declare as the Set interface rather than HashSet (program to interfaces).
        Set<String> strlist = new HashSet<String>();
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        while (matcher.find()) {
            strlist.add(matcher.group());
        }
        return strlist;
    }
}

*************************************

package spider;

public class Main extends context {

    /**
     * Entry point: seeds the URL queue with the start page, records the site's
     * base URL, then starts the fetcher (spider) and parser (analysis) threads.
     */
    public static void main(String[] args) {
        try {
            // put() on an unbounded LinkedBlockingQueue never actually blocks,
            // but it is declared to throw InterruptedException.
            url_collecion.put("http://马赛克/");
            webpath = "http://马赛克/";
        } catch (InterruptedException e) {
            System.out.println("初始化context失败");
            // Restore the interrupt flag (never swallow an interrupt) and abort:
            // starting workers with an unseeded queue would be pointless.
            Thread.currentThread().interrupt();
            return;
        }

        spider_thread st = new spider_thread();
        analysis_thread at = new analysis_thread();
        new Thread(st, "st").start();
        new Thread(at, "at").start();
    }

}

********************************************

package spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

public class spider_thread extends context implements Runnable {

    /**
     * Fetch loop: repeatedly takes a URL from the shared queue, saves its HTML
     * to a file (the file's existence doubles as the "visited" marker), and
     * hands the file name to the analysis thread via doc_collecion.
     */
    @Override
    public void run() {
        while (true) {
            // Take the next URL to crawl; blocks until one is available.
            String urlstr;
            try {
                urlstr = url_collecion.take();
                System.out.println(Thread.currentThread().getName() + "@读取URL:" + urlstr);
            } catch (InterruptedException e) {
                // FIX: the original swallowed the interrupt and carried on with
                // an empty URL. Restore the flag and let the worker exit.
                Thread.currentThread().interrupt();
                System.out.println(Thread.currentThread().getName() + "@读取URL异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
                return;
            }

            // Map the URL onto a legal Windows file name (':' '/' '.' are
            // replaced with full-width lookalikes). An existing file means the
            // URL was already visited — skip it (cheap de-duplication).
            String safeName = urlstr.replace(':', ':').replace('/', '#').replace('.', '。') + ".txt";
            File file = new File(htmlpath + safeName);
            if (file.exists()) {
                continue;
            }
            System.out.println(Thread.currentThread().getName() + "@创建文件:" + safeName);
            try {
                file.createNewFile();
            } catch (IOException e) {
                System.out.println(Thread.currentThread().getName() + "@创建文件异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
            }

            // Download the page and write its HTML into the file.
            try {
                URL url = new URL(urlstr);
                URLConnection connection = url.openConnection();
                connection.connect();

                // NOTE(review): InputStreamReader/FileWriter use the platform
                // charset here; the matching FileReader in analysis_thread does
                // too, so the round trip is consistent — but non-native pages
                // may still be mangled. Confirm the site's encoding.
                BufferedReader br = new BufferedReader(new InputStreamReader(connection.getInputStream()));
                try {
                    FileWriter fw = new FileWriter(file);
                    try {
                        String line;
                        while ((line = br.readLine()) != null) {
                            fw.write(line);
                            // FIX: readLine() strips the terminator; the original
                            // fused adjacent lines together in the saved file.
                            fw.write('\n');
                        }
                    } catch (Exception e) {
                        System.out.println(Thread.currentThread().getName() + "@IO异常:");
                        System.out.println(Thread.currentThread().getName() + "@" + e);
                    } finally {
                        fw.close();
                    }
                } finally {
                    br.close();
                    // Hand the (possibly partial) document to the parser — this
                    // matches the original behavior of always queueing the file.
                    doc_collecion.put(file.getName());
                    System.out.println(Thread.currentThread().getName() + "@文件写入结束。");
                }
            } catch (Exception e) {
                System.out.println(Thread.currentThread().getName() + "@请求URL异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
            }
        }
    }

}

**************************************************************************************************

package spider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Set;

public class analysis_thread extends context implements Runnable {

    /**
     * Parse loop: takes a saved HTML file name from doc_collecion, reads the
     * file, extracts page links (fed back into url_collecion for the spider)
     * and image URLs (downloaded immediately, which also throttles parsing).
     */
    @Override
    public void run() {
        while (true) {
            String pagename;
            try {
                pagename = doc_collecion.take();
            } catch (InterruptedException e) {
                // FIX: restore the interrupt flag and exit instead of looping
                // on with an empty file name.
                Thread.currentThread().interrupt();
                System.out.println(Thread.currentThread().getName() + "@读取文件异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
                return;
            }
            File file = new File(htmlpath + pagename);
            if (!file.exists()) {
                continue;
            }
            System.out.println(Thread.currentThread().getName() + "@读取文件:" + file.getName());

            // Read the whole document. FIX: StringBuilder replaces O(n^2)
            // String concatenation, and the reader is now closed (it leaked).
            StringBuilder page = new StringBuilder();
            try {
                FileReader fr = new FileReader(file);
                try {
                    char[] cbuf = new char[1024];
                    int hasread;
                    while ((hasread = fr.read(cbuf)) > 0) {
                        page.append(cbuf, 0, hasread);
                    }
                } finally {
                    fr.close();
                }
            } catch (Exception e) {
                System.out.println(Thread.currentThread().getName() + "@文件读取异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
            }

            System.out.println(Thread.currentThread().getName() + "@开始解析HTML");
            // Strip the ".txt" suffix to recover the name used for the image dir.
            analysis(page.toString(), pagename.substring(0, pagename.length() - 4));
        }
    }

    /**
     * Extracts page links and image URLs from the content section of one page.
     *
     * @param page     full HTML text of the page
     * @param pagename page identifier used as the image subdirectory name
     */
    private void analysis(String page, String pagename) {
        // Limit parsing to the region between the content and footer divs.
        // FIX: the original indexed split(...)[1] unchecked — a single page
        // without the content marker threw ArrayIndexOutOfBoundsException and
        // killed the whole analysis thread.
        String[] parts = page.split("<div class=\"content");
        if (parts.length < 2) {
            System.out.println(Thread.currentThread().getName() + "@结束解析HTML");
            return;
        }
        String html = parts[1].split("<div class=\"footer")[0];

        // Each match looks like: a href="..." — drop the 8-char prefix and the
        // trailing quote to get the raw link text.
        Set<String> urls = RegexString(html, "a href=[\"'](.+?)[\"']");
        for (String url : urls) {
            String tmp = url.substring(8, url.length() - 1);
            if (tmp.startsWith("/piclist7/")) {
                tmp = tmp.substring(10); // site-relative link: drop the list prefix
            }
            try {
                url_collecion.put(webpath + tmp); // webpath is a static field; no super. needed
            } catch (InterruptedException e) {
                System.out.println(Thread.currentThread().getName() + "@存储URL失败");
                Thread.currentThread().interrupt();
            }
            System.out.println(webpath + tmp);
        }

        // Pages with at least 5 images are treated as galleries and downloaded.
        urls = RegexString(html, "img src=[\"'](.+?)[\"']");
        if (urls.size() >= 5) {
            File dir = new File(imgpath + pagename + "/");
            dir.mkdirs();
            int id = 1;
            for (String url : urls) {
                // Match looks like: img src="..." — drop the 9-char prefix and
                // the trailing quote.
                String tmp = url.substring(9, url.length() - 1);
                download_img(imgpath + pagename + "/", tmp, id);
                id++;
            }
        }

        System.out.println(Thread.currentThread().getName() + "@结束解析HTML");
    }

    /**
     * Downloads one image into {@code dir} as {@code id}.jpg. An existing file
     * is treated as already downloaded and skipped.
     *
     * @param dir    destination directory (with trailing slash)
     * @param urlstr absolute image URL
     * @param id     sequence number used as the file name
     */
    private void download_img(String dir, String urlstr, int id) {
        try {
            File file = new File(dir + id + ".jpg");
            System.out.println(dir + id + ".jpg");

            if (file.exists()) {
                return; // already fetched on a previous run
            }
            try {
                file.createNewFile();
            } catch (IOException e) {
                System.out.println(Thread.currentThread().getName() + "@创建文件异常:");
                System.out.println(Thread.currentThread().getName() + "@" + e);
            }

            URL url = new URL(urlstr);
            URLConnection con = url.openConnection();
            con.setConnectTimeout(5 * 1000);
            // Some image hosts reject requests without a browser-like UA string.
            con.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

            // FIX: nested try/finally so the input stream is closed even if
            // opening the output stream throws (the original leaked it).
            InputStream is = con.getInputStream();
            try {
                OutputStream os = new FileOutputStream(file);
                try {
                    byte[] bs = new byte[1024];
                    int len;
                    while ((len = is.read(bs)) != -1) {
                        os.write(bs, 0, len);
                    }
                } catch (Exception e) {
                    System.out.println(Thread.currentThread().getName() + "@图片写入失败");
                    System.out.println(Thread.currentThread().getName() + "@" + e);
                } finally {
                    os.close();
                }
            } finally {
                is.close();
            }
        } catch (Exception e) {
            System.out.println(Thread.currentThread().getName() + "@获取图片失败");
            System.out.println(Thread.currentThread().getName() + "@" + e);
        }
    }
}

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值