java爬虫下载付费html网页模板

最新推荐文章于 2024-07-31 09:04:01 发布

程序员bigsai

最新推荐文章于 2024-07-31 09:04:01 发布

阅读量4.5k

点赞数 2

分类专栏： # Java爬虫

本文链接：https://blog.csdn.net/qq_40693171/article/details/80846449

版权

Java爬虫专栏收录该内容

8 篇文章 7 订阅

订阅专栏

前言

前一段时间我们有一个网页的project小项目，要求学习bootstrap。然而自己写的模板和别人写好的东西相比，无论在美观还是手机端的兼容性上，差距都很巨大。中途我们放弃了自己写的东西，开始“偷”别人的模板。有些不会偷的同学甚至还付费下载——都什么年代了，程序员还要花钱买模板。那次结束后，我突发奇想：能不能写个程序，让它自动下载模板？经过不断努力和解决bug，最终取得了成功。

思路

大致思路为：输入模板的一个页面作为起始url，从这个链接出发遍历所有与之相关的链接并放到hashset中（采用基于队列的宽度优先遍历bfs）。是否“相关”通过字符串匹配链接开头的主域名来判断（链到站外的链接不处理，防止无限扩大）。同时，还要将各种url分类放到不同的set中。

html页面分析：抓取html链接。还要按行读取html文本分析其中可能隐藏的css文件（可能有背景图片）。获取js链接，获取image地址，css地址，（注意一定要储存绝对地址而不是相对地址）。还有的涉及到上层目录。需要处理。

css页面：按行分析。因为css中可能储存背景图片以及其他logo。
js：直接下载保存。
html：下载保存
image：下载保存

注意点：

所有下载链接或者其他网络操作都要放在try-catch中进行；出错时在catch中跳过当前步骤，继续执行后续步骤。
下载目录可在download类中自行更改（默认F:\download）
添加jsoup的jar包
有些图片藏在js文件和css文件中，所以需要去分析js文件和css文件；我这个只分析了css，没分析js。
由于精力和时间问题，项目并没有完善；由于笔者此时正则能力不足，大部分采用字符串分割查找或者contains查找，难免有疏漏
目前代码测试只针对17素材之家部分模板测试有效。其他站点未进行测试
只是小白，代码冗长低水平，大佬勿喷。
附上代码如下：

代码

启动主类getmoban

import java.io.IOException;
import java.util.Iterator;
import java.util.Scanner;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;


/**
 * Command-line entry point of the template downloader.
 *
 * <p>Reads a start URL from standard input, lets {@code geturl} crawl the
 * pages reachable from it, and then hands every collected HTML, JS, CSS and
 * image URL to a fixed pool of six worker threads running {@code download}
 * tasks. Every stylesheet is additionally passed to
 * {@code csssearch.searchimage} so that resources referenced from inside the
 * CSS are fetched as well.
 */
public class getmoban {

	/**
	 * Runs one crawl-and-download session.
	 *
	 * @param args unused
	 * @throws IOException if {@code geturl.judel()} reports an I/O failure
	 */
	public static void main(String[] args) throws IOException
	{
		ExecutorService ex=Executors.newFixedThreadPool(6);
		try {
			Scanner sc=new Scanner(System.in);
			System.out.println("请输入网址（别太大否则下载不完）");
			String url=sc.nextLine().trim();
			// The constructor stores the start URL in geturl's static state.
			new geturl(url);
			System.out.println(geturl.file);
			geturl.judel();

			submitAll(ex, geturl.htmlurlset);
			submitAll(ex, geturl.jsset);
			// Stylesheets are downloaded and also scanned for url(...) references.
			for(Object entry:geturl.cssset)
			{
				String name=(String)entry;
				submit(ex, name);
				try {
					csssearch.searchimage(name);
				}
				catch(Exception e)
				{
					// Best effort: one unreadable stylesheet must not stop the run.
					System.err.println("css scan failed: "+name+" ("+e+")");
				}
			}
			submitAll(ex, geturl.imgset);
		}
		finally {
			// Lets queued downloads finish, then allows the JVM to exit.
			ex.shutdown();
		}
	}

	/** Queues one download task for every URL in {@code urls}. */
	private static void submitAll(ExecutorService ex, Iterable<?> urls)
	{
		for(Object entry:urls)
		{
			submit(ex, (String)entry);
		}
	}

	/** Queues a single download task, reporting (not hiding) a rejected submission. */
	private static void submit(ExecutorService ex, String name)
	{
		try {
			ex.execute(new download(name));
		}
		catch(RuntimeException e)
		{
			System.err.println("could not queue download: "+name+" ("+e+")");
		}
	}
}

分析链接geturl

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Queue;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class geturl {
	
	public static String url="http://www.17sucai.com/preview/1/2014-11-28/jQuery用户注册表单验证代码/index.html";
	static String head="http";
	public geturl(String url)
	{
		this.url=url;
	}
	static String file=url;//文件路径
	{
		if(url.contains("http"))
		{
			head=file.split("//")[0];
			file=file.split("//")[1];
		}
		int last=file.lastIndexOf("/");
		file=file.substring(0, last);
	}
	static Set htmlurlset=new HashSet();//html
	static Set jsset=new HashSet();//js
	static Set imgset=new HashSet();//image
	static Set cssset=new HashSet();//css样式
	static Queue queue=new ArrayDeque();
	
//	public geturl() throws IOException 
//	{this.judel();}
	public static void judel() throws IOException 
	{
		queue.add(url);htmlurlset.add(url);
		while(!queue.isEmpty()&&queue!=null)//要防止链接无限扩大
		{
			String teamurl=queue.poll();//弹出头并且删除节点
			System.out.println(teamurl);
			
			if(!teamurl.endsWith(".com"))//有的网站短小，可能识别有错误	
			{
			if(file.indexOf("/")>0)
			{if(teamurl.contains(file.substring(0,file.indexOf("/"))))
			analyze(teamurl);}
			else
				analyze(teamurl);
			}
//			catch(Exception e) {System.out.println("cuo");}			
		}
		
	}
			
	public static void analyze(String URL)
	{
		try {
	 Document doc;
		doc = Jsoup.connect(URL).timeout(20000).header("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36").ignoreContentType(true).get();
		 Elements all=doc.select("[class]");//检查
		 Elements js=doc.getElementsByTag("script");
		 Elements html=doc.select("a[href]");
		 Elements img=doc.select("img");
		 Elements css=doc.select("link[href]");
		 for(Element e:all)
		 {
			 if(e.attr("style")!="")//找到藏在html的css的图片背景
			 { 
				 String tex=e.attr("style");
				 if(tex.contains("url"))
				 {
					 String urladress=file;
			 		String imgurl=tex.split("url")[1];
			 		imgurl=imgurl.split("\\(")[1].split("\\)")[0];//转义字符串
			 		if(imgurl.startsWith("'")||imgurl.startsWith("\""))//注意转义字符串
			 		{
			 			imgurl=imgurl.substring(1,imgurl.length()-1);
			 		} 
			 		while(imgurl.startsWith(".."))
			 		{
			 			imgurl=imgurl.substring(imgurl.indexOf("/") 1);		 			
			 			urladress=urladress.substring(0,urladress.lastIndexOf("/"));
			 		}
			 		urladress=head "//" urladress "/" imgurl;
			 		imgset.add(urladress);
				 }				 
			 }
		 }
		 for(Element htmlelement:html)
		 {		 		
			 String a=htmlelement.absUrl("href").split("#")[0];
			 
			 if(!a.equals(""))
			 {
				 if(!htmlurlset.contains(a)&&a.contains(file.substring(0,file.indexOf("/"))))//不存在继续遍历
				 { queue.add(a);htmlurlset.add(a); //System.out.println(a);
				 }			 
			 }				 
		 }
		 for(Element jselement:js)//判断JS
		 {
			 String team=jselement.absUrl("src");	
			 if(!team.equals(""))
			 jsset.add(team);//添加

		 }
		 for(Element csselement:css)
		 {
			 String team=csselement.absUrl("href");
			 if(!team.equals(""))//绝对路径
			 cssset.add(team);			
			 // System.out.println(e.attr("href"));
		 }
		 for(Element imageelement:img)
		 {
			 String team=imageelement.absUrl("src");
			 if(!team.equals(""))//绝对路径
			 imgset.add(team);
			 
			 //System.out.println(e.attr("href"));
		 }
		}
		catch(Exception e)
		{
			if(!queue.isEmpty()) {
			URL=queue.poll();
			 analyze(URL);}
		}
	} 				 
	}

分析css（css可能隐藏图片）csssearch

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class csssearch {

	public static void searchimage(String ur) throws IOException {
		if(ur.toLowerCase().contains("bootstarp")) {return;}//bootstarp.css过滤掉，肯定没图片
		Set imgset=new HashSet();
		//String ur="http://demo.cssmoban.com/cssthemes5/cpts_1019_bpi/css/style.css";
		String http="http";
		String fileurl=ur;
		if(fileurl.startsWith("http"))
		{
			http=fileurl.split("//")[0];//防止https协议
			fileurl=fileurl.split("//")[1];
		}
		fileurl=fileurl.substring(0,fileurl.lastIndexOf("/"));
		//System.out.println(fileurl);//测试
		URL url=new URL(ur);
		 URLConnection conn = url.openConnection();
	 conn.setConnectTimeout(1000);
	 conn.setReadTimeout(5000);
	 conn.connect();
	 InputStream in= conn.getInputStream();
	 InputStreamReader inp=new InputStreamReader(in);
	 BufferedReader buf=new BufferedReader(inp);
	 File file=new File("F:\\download\\" ur.split("//")[1]);
		 if(!file.exists())
		 {
		 	 file.getParentFile().mkdirs();
		 	 file.createNewFile();
		 }
		// BufferedOutputStream bufout=new BufferedOutputStream(new FileOutputStream(file));
		 String tex="";
		 while((tex=buf.readLine())!=null)
		 {
//		 	System.out.println(tex);
		 	if(tex.contains("url"))
		 	{
		 		String urladress=fileurl;
		 		String imgurl=tex.split("url")[1];
		 		imgurl=imgurl.split("\\(")[1].split("\\)")[0];//转义字符串
		 		if(imgurl.startsWith("'")||imgurl.startsWith("\""))//注意转义字符串
		 		{
		 			imgurl=imgurl.substring(1,imgurl.length()-1);
		 		}
		 		//System.out.println(imgurl);//测试
		 		while(imgurl.startsWith(".."))
		 		{
		 			imgurl=imgurl.substring(imgurl.indexOf("/") 1);		 			
		 			urladress=urladress.substring(0,urladress.lastIndexOf("/"));
		 		}
		 		urladress=http "//" urladress "/" imgurl;
		 		//System.out.println(urladress);
		 		//down.download(urladress);
		 		imgset.add(urladress);
		 	}
		 }
	//	 bufout.close();
		 buf.close();
		 inp.close();
		 in.close();
		 Iterator it=imgset.iterator();
		 while(it.hasNext())
		 {		 
		 	String team=it.next();
		 	
		 	try {
		 		download down=new download(team);
		 		Thread t1=new Thread(down);
		 		t1.start();System.out.println(team "下载成功");}
		 	catch(Exception e) {System.out.println("下载失败：" team);}
		 }
		 
	}
}

download(线程池下载)

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads one URL into the local download directory, mirroring the URL's
 * host and path as sub-directories. Usable directly through
 * {@link #download(String)} or as a {@link Runnable} task.
 */
public class download implements Runnable{

	/** Root directory of all downloads; change it here to store files elsewhere. */
	private static final String DOWNLOAD_ROOT="F:\\download\\";

	/** URL fetched by {@link #run()}. */
	public String ur;
	public download() {}
	public download(String ur)
	{
		this.ur=ur;
	}

	/**
	 * Fetches {@code ur} and writes the response body to
	 * {@code F:\download\<host>/<path>}, creating parent directories as needed.
	 *
	 * @param ur absolute URL to fetch
	 * @throws IOException if the URL is malformed, the connection fails or times out,
	 *         or the local file cannot be written
	 */
	public static void download(String ur) throws IOException
	{
		File file=new File(DOWNLOAD_ROOT+localPath(ur));
		URL url=new URL(ur);
		URLConnection conn=url.openConnection();
		conn.setConnectTimeout(4000);
		conn.setReadTimeout(5000);
		conn.connect();
		// The local file is created only after the remote stream is open, so a
		// failed request leaves no empty file behind.
		try (InputStream in=new BufferedInputStream(conn.getInputStream())) {
			File parent=file.getParentFile();
			if(parent!=null)
			{
				parent.mkdirs();
			}
			try (BufferedOutputStream bufout=new BufferedOutputStream(new FileOutputStream(file))) {
				byte[] b=new byte[1024];
				int n;
				while((n=in.read(b))!=-1)
				{
					bufout.write(b, 0, n);
				}
			}
		}
	}

	/**
	 * Maps a URL to a path below the download root: the scheme, query string
	 * and fragment are removed, and a URL that names a directory or a bare host
	 * is stored as {@code index.html} inside it so that it cannot occupy the
	 * name of a directory other files need.
	 */
	private static String localPath(String ur)
	{
		String path=ur;
		int scheme=path.indexOf("//");
		if(path.startsWith("http")&&scheme>=0)
		{
			path=path.substring(scheme+2);
		}
		int cut=path.indexOf('?');
		int hash=path.indexOf('#');
		if(hash>=0&&(cut<0||hash<cut))
		{
			cut=hash;
		}
		if(cut>=0)
		{
			path=path.substring(0,cut);
		}
		if(path.endsWith("/"))
		{
			path=path+"index.html";
		}
		else if(path.indexOf('/')<0)
		{
			path=path+"/index.html";
		}
		return path;
	}

	/** Downloads {@link #ur}, printing a one-line result; failures are reported, never thrown. */
	@Override
	public void run() {
		if(ur==null)
		{
			System.err.println("download skipped: no url set");
			return;
		}
		try {
			download(ur);
			System.out.println(Thread.currentThread().getName()+" 下载"+ur+"成功");
		} catch (IOException e) {
			System.err.println("download failed: "+ur+" ("+e+")");
		}
	}

}