* 一个用于获得Request请求的接口
* 实现getMethod方法获取Get方法
*/
public interface RequestSet {
    /**
     * Builds the GET request that will be issued for the given URL.
     *
     * @param url address the request should target
     * @return the {@code HttpGet} to execute for {@code url}
     */
    HttpGet getMethod(String url);
}
SaveUtil接口用于自定义保存方式,需要实现save方法
package SaveUtil;
/**
 * Storage hook for crawled data. Implementations must provide
 * {@link #save(String, String)}.
 */
public interface SaveUtil {
    /**
     * Stores one page.
     *
     * @param url  URL the page was fetched from
     * @param html content that was downloaded for {@code url}
     */
    void save(String url, String html);
}
Spider类,有五种构造方法,可以实现多种自定义操作,其中实现了上述自定义接口的默认实现类
package Spider;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import ChoosePackage.MyResourseChooser;
import ChoosePackage.ResourseChooser;
import DownloadPackage.HtmlDownloader;
import DownloadPackage.RequestSet;
import DownloadPackage.UrlGet;
import SaveUtil.MySaveUtil;
import SaveUtil.SaveUtil;
/*
* 用于爬取资源的类
*/
public class Spider{
public static void main(String[] args) {
new Spider(“http://www.bilibili.net”).spiderstart();
}
/** Seed URL the crawl starts from. */
String seed = null;
/** Strategy used to persist downloaded pages; supplied by the caller or defaulted. */
private SaveUtil saveutil = null;
/** Downloads the HTML of a page. */
private HtmlDownloader downloader = null;
/** Extracts the URLs contained in downloaded HTML. */
private UrlGet urldownloader = null;
/** Resource selection strategy. */
private ResourseChooser resoursechooser = null;
/** Pages queued for download; typed as String so polled entries need no cast. */
LinkedList<String> unvisited = new LinkedList<String>();
/** Pages that have already been downloaded. */
HashSet<String> visited = new HashSet<String>();
/**
 * Creates a spider with a custom storage strategy, request factory and
 * resource chooser.
 *
 * @param saveutil        storage strategy for fetched pages
 * @param request         request factory handed to the {@code HtmlDownloader}
 * @param resoursechooser resource selection strategy
 * @param seed            starting URL; also placed on the unvisited queue
 */
public Spider(SaveUtil saveutil, RequestSet request, ResourseChooser resoursechooser, String seed) {
    this.seed = seed;
    unvisited.add(seed);

    this.saveutil = saveutil;
    this.resoursechooser = resoursechooser;
    this.downloader = new HtmlDownloader(request);
    this.urldownloader = new UrlGet();
}
/**
 * Creates a spider with a custom storage strategy and resource chooser; requests
 * are built by the default {@code getRequest}.
 *
 * @param saveutil        storage strategy for fetched pages
 * @param resoursechooser resource selection strategy
 * @param seed            starting URL; also placed on the unvisited queue
 */
public Spider(SaveUtil saveutil, ResourseChooser resoursechooser, String seed) {
    this.seed = seed;
    unvisited.add(seed);

    this.saveutil = saveutil;
    this.resoursechooser = resoursechooser;
    this.downloader = new HtmlDownloader(new getRequest());
    this.urldownloader = new UrlGet();
}
/**
 * Creates a spider with a custom storage strategy and request factory; links are
 * filtered by the default {@code MyResourseChooser}.
 *
 * @param saveutil   storage strategy for fetched pages
 * @param requestset request factory handed to the {@code HtmlDownloader}
 * @param seed       starting URL; also placed on the unvisited queue
 */
public Spider(SaveUtil saveutil, RequestSet requestset, String seed) {
    this.seed = seed;
    unvisited.add(seed);

    this.saveutil = saveutil;
    this.downloader = new HtmlDownloader(requestset);
    this.resoursechooser = new MyResourseChooser();
    this.urldownloader = new UrlGet();
}
/**
 * Creates a spider with a custom storage strategy; the request factory and
 * resource chooser are the defaults ({@code getRequest}, {@code MyResourseChooser}).
 *
 * @param saveutil storage strategy for fetched pages
 * @param seed     starting URL; also placed on the unvisited queue
 */
public Spider(SaveUtil saveutil, String seed) {
    this.seed = seed;
    unvisited.add(seed);

    this.saveutil = saveutil;
    this.downloader = new HtmlDownloader(new getRequest());
    this.resoursechooser = new MyResourseChooser();
    this.urldownloader = new UrlGet();
}
/**
 * Creates a spider that uses the default storage strategy, request factory and
 * resource chooser.
 *
 * @param seed starting URL; also placed on the unvisited queue
 */
public Spider(String seed) {
    this.seed = seed;
    unvisited.add(seed);

    this.saveutil = new MySaveUtil();
    this.downloader = new HtmlDownloader(new getRequest());
    this.resoursechooser = new MyResourseChooser();
    this.urldownloader = new UrlGet();
}
//开始爬取的方法
private void spiderstart(){
String html = null;
while(!unvisited.isEmpty()){
String url = unvisited.poll();
System.out.println(“开始获取”+url);
if(resoursechooser.isNeed(url)){
try{
html = downloader.downloadhtml(url);
}catch(RuntimeException e){
System.out.println(url+“连接获取失败”);
continue;
}
visited.add(url);
LinkedList urls = new LinkedList();
try{
urls = urldownloader.geturls(html);
}catch(RuntimeException e){
System.out.println(url+“的html页面为空”);
continue;
}
Iterator it = urls.iterator();
while(it.hasNext()){
String newurl = it.next();
if(resoursechooser.isNeed(newurl)&&!visited.contains(newurl)&&!unvisited.contains(newurl)){
newurl = resoursechooser.process(newurl);
unvisited.add(newurl);
System.out.println(newurl+“加入页面”);
}
}
System.out.println(“获取了”+url+“上的所有url”);
if(resoursechooser.isResourse(url)){
saveutil.save(url,html);
}
}
}
}
//默认资源筛选类
private class MyResourseChooser implements ResourseChooser{
@Override
public Boolean isNeed(String url) {
// TODO Auto-generated method stub
if(!url.startsWith(“/”)&&!url.startsWith(“http”)){
return false;
}
return true;
}
@Override
public Boolean isResourse(String url) {
// TODO Auto-generated method stub
return true;
}
@Override
public String process(String url) {
// TODO Auto-generated method stub
if(!url.startsWith(“http”)){
url = seed+url;
}
return url;
}
}
public class getRequest implements RequestSet{
public HttpGet getMethod(String url) {
// TODO Auto-generated method stub
//创建一个get请求方法
HttpGet getmethod = new HttpGet(url);
//HttpHost proxy = new HttpHost(“124.88.67.81”,80);这里不设置代理IP
//设置请求超时时间等
RequestConfig responseconfig = RequestConfig.custom().setConnectionRequestTimeout(10000).setConnectTimeout(10000).setSocketTimeout(10000).build();
//设置请求头,主要是user-agent
getmethod.addHeader(“User-Agent”,“Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36”);
//设置请求参数
getmethod.setConfig(responseconfig);
return getmethod;
}
}
//默认的存储类
public class MySaveUtil implements SaveUtil{
@Override
public void save(String url, String html) {
// TODO Auto-generated method stub
String filename = getfilename(url);
BufferedWriter writer = null;
try{
writer = new BufferedWriter(new FileWriter(filename));
writer.write(html);
writer.flush();
System.out.println(“文件写入成功”);
}catch(IOException e){
一、Python所有方向的学习路线
Python所有方向的技术点做的整理,形成各个领域的知识点汇总,它的用处就在于,你可以按照下面的知识点去找对应的学习资源,保证自己学得较为全面。
二、Python必备开发工具
工具都帮大家整理好了,安装就可直接上手!
三、最新Python学习笔记
当我学到一定基础,有自己的理解能力的时候,会去阅读一些前辈整理的书籍或者手写的笔记资料,这些笔记详细记载了他们对一些技术点的理解,这些理解是比较独到,可以学到不一样的思路。
四、Python视频合集
观看全面零基础学习视频,看视频学习是最快捷也是最有效果的方式,跟着视频中老师的思路,从基础到深入,还是很容易入门的。
五、实战案例
纸上得来终觉浅,要学会跟着视频一起敲,要动手实操,才能将自己的所学运用到实际当中去,这时候可以搞点实战案例来学习。
六、面试宝典
简历模板![在这里插入图片描述](https://img-blog.csdnimg.cn/646863996ac44da8af500c049bb72fbd.png#pic_center)
网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。
一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!