1.获取URL
2.下载资源
3.分析资源
4.处理
先爬取一个不需要修改 User-Agent 的网站
在选择源处使用了缓冲流、转换流,以及 try-with-resources 语句(自动释放资源)
链接博客
package 爬虫原理;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import 网络编程开篇.urltest;
/**
 * Minimal crawler: downloads one page through {@link URL#openStream()} (no custom
 * request headers), echoes it to standard output and saves it to a text file.
 */
public class spider {

    /**
     * Fetches the page line by line, printing each line and appending it to
     * the output file {@code 爬虫.txt} in the working directory.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or any network/file I/O fails
     */
    public static void main(String[] args) throws Exception {
        // java.net.URL needs an explicit scheme; a bare "www.js.com" throws
        // MalformedURLException ("no protocol") before anything is downloaded.
        // NOTE(review): http is assumed here - switch to https if the site requires it.
        URL url = new URL("http://www.js.com");

        try (
            // Opened inside the resource list so the stream is closed even when
            // creating the reader or the file writer below fails.
            InputStream in = url.openStream();
            // Buffered reader over a byte-to-char converting stream (page decoded as UTF-8).
            BufferedReader read = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            BufferedWriter write = new BufferedWriter(new FileWriter(new File("爬虫.txt")))
        ) {
            String str;
            while ((str = read.readLine()) != null) {
                System.out.println(str);
                // Persist the line as well; previously the writer was opened but
                // never used, so the output file was always left empty.
                write.write(str);
                write.newLine();
            }
            // No explicit flush: closing the BufferedWriter flushes it.
        }
    }
}
改进版:
package 爬虫原理;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import javax.net.ssl.HttpsURLConnection;
/**
 * Improved crawler: issues an HTTPS GET with a browser-like User-Agent header,
 * echoes the response body to standard output and saves it to a text file.
 */
public class 改进版 {

    /**
     * Downloads the page line by line, printing each line and writing it to
     * {@code 谷歌翻译code.txt} in the working directory.
     *
     * @param args unused
     * @throws Exception if the URL is malformed, the connection fails or times
     *                   out, or the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        File file = new File("谷歌翻译code.txt");
        // NOTE(review): everything after '#' is a URL fragment, which is normally
        // resolved client-side and not sent in the HTTP request - confirm that the
        // page returned is really the one intended.
        URL url = new URL("https://translate.google.cn/#view=home&op=translate&sl=auto&tl=zh-CN&text=Software%20caused%20connection%20abort%3A%20recv%20failed");

        HttpsURLConnection connect = (HttpsURLConnection) url.openConnection();
        connect.setRequestMethod("GET");
        // Present a regular browser User-Agent instead of Java's default one.
        connect.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36");
        // The default timeouts are infinite; bound them so a stalled server
        // cannot hang the program forever.
        connect.setConnectTimeout(10_000);
        connect.setReadTimeout(30_000);

        try (
            BufferedReader read = new BufferedReader(new InputStreamReader(connect.getInputStream(), "UTF-8"));
            BufferedWriter write = new BufferedWriter(new FileWriter(file))
        ) {
            String str;
            while ((str = read.readLine()) != null) {
                System.out.println(str);
                write.write(str);
                write.newLine();
                // No per-line flush: it defeats the buffering, and closing the
                // writer at the end of the try block flushes everything.
            }
        } finally {
            // Runs after the reader/writer are closed; releases the connection.
            connect.disconnect();
        }
    }
}
改进点
HttpsURLConnection connect =(HttpsURLConnection)url.openConnection();
connect.setRequestMethod("GET");//请求方式
connect.setRequestProperty("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36");//设置属性
资源的获取也由 openStream() 变为 getInputStream()